sched/loadavg: Avoid loadavg spikes caused by delayed NO_HZ accounting

[pandora-kernel.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 7d98d58..4b3e12e 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2084,19 +2084,6 @@ EXPORT_SYMBOL_GPL(account_system_vtime);
  
  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
  
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-       rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
-       rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       rq->prev_steal_time_rq = 0;
-#endif
-}
-
  #ifdef CONFIG_PARAVIRT
  static inline u64 steal_ticks(u64 steal)
  {
@@ -2846,6 +2833,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         success = 1; /* we're going to change ->state */
         cpu = task_cpu(p);
  
+       /*
+        * Ensure we load p->on_rq _after_ p->state, otherwise it would
+        * be possible to, falsely, observe p->on_rq == 0 and get stuck
+        * in smp_cond_load_acquire() below.
+        *
+        * sched_ttwu_pending()                 try_to_wake_up()
+        *   [S] p->on_rq = 1;                  [L] p->state
+        *       UNLOCK rq->lock  -----.
+        *                              \
+        *                               +---   RMB
+        * schedule()                   /
+        *       LOCK rq->lock    -----'
+        *       UNLOCK rq->lock
+        *
+        * [task p]
+        *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+        *
+        * Pairs with the UNLOCK+LOCK on rq->lock from the
+        * last wakeup of our task and the schedule that got our task
+        * current.
+        */
+       smp_rmb();
         if (p->on_rq && ttwu_remote(p, wake_flags))
                 goto stat;
  
@@ -3608,8 +3617,9 @@ void calc_load_exit_idle(void)
         struct rq *this_rq = this_rq();
  
         /*
-        * If we're still before the sample window, we're done.
+        * If we're still before the pending sample window, we're done.
          */
+       this_rq->calc_load_update = calc_load_update;
         if (time_before(jiffies, this_rq->calc_load_update))
                 return;
  
@@ -3618,7 +3628,6 @@ void calc_load_exit_idle(void)
          * accounted through the nohz accounting, so skip the entire deal and
          * sync up for the next window.
          */
-       this_rq->calc_load_update = calc_load_update;
         if (time_before(jiffies, this_rq->calc_load_update + 10))
                 this_rq->calc_load_update += LOAD_FREQ;
  }
@@ -6260,14 +6269,16 @@ void show_state_filter(unsigned long state_filter)
                 /*
                  * reset the NMI-timeout, listing all files on a slow
                  * console might take a lot of time:
+                * Also, reset softlockup watchdogs on all CPUs, because
+                * another CPU might be blocked waiting for us to process
+                * an IPI.
                  */
                 touch_nmi_watchdog();
+               touch_all_softlockup_watchdogs();
                 if (!state_filter || (p->state & state_filter))
                         sched_show_task(p);
         } while_each_thread(g, p);
  
-       touch_all_softlockup_watchdogs();
-
  #ifdef CONFIG_SCHED_DEBUG
         sysrq_sched_debug_show();
  #endif
@@ -6867,7 +6878,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  
         case CPU_UP_PREPARE:
                 rq->calc_load_update = calc_load_update;
-               account_reset_rq(rq);
                 break;
  
         case CPU_ONLINE: