Merge branch 'linus' into sched/core

[pandora-kernel.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index f8b8996..16f3f77 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
  #include <asm/irq_regs.h>
  
  #include "sched_cpupri.h"
+#include "workqueue_sched.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@ -306,52 +307,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
   */
  struct task_group init_task_group;
  
-/* return group to which a task belongs */
-static inline struct task_group *task_group(struct task_struct *p)
-{
-       struct task_group *tg;
-
-#ifdef CONFIG_CGROUP_SCHED
-       tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
-                               struct task_group, css);
-#else
-       tg = &init_task_group;
-#endif
-       return tg;
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
-       /*
-        * Strictly speaking this rcu_read_lock() is not needed since the
-        * task_group is tied to the cgroup, which in turn can never go away
-        * as long as there are tasks attached to it.
-        *
-        * However since task_group() uses task_subsys_state() which is an
-        * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
-        */
-       rcu_read_lock();
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
-       p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
-       p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
-       rcu_read_unlock();
-}
-
-#else
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
-       return NULL;
-}
-
  #endif /* CONFIG_CGROUP_SCHED */
  
  /* CFS-related fields in a runqueue */
@@ -502,9 +457,10 @@ struct rq {
         unsigned long nr_running;
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+       unsigned long last_load_update_tick;
  #ifdef CONFIG_NO_HZ
         u64 nohz_stamp;
-       unsigned char in_nohz_recently;
+       unsigned char nohz_balance_kick;
  #endif
         unsigned int skip_clock_update;
  
@@ -644,6 +600,49 @@ static inline int cpu_of(struct rq *rq)
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
  #define raw_rq()               (&__raw_get_cpu_var(runqueues))
  
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this task belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification
+ * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * holds that lock for each task it moves into the cgroup. Therefore
+ * by holding that lock, we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+       struct cgroup_subsys_state *css;
+
+       css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+                       lockdep_is_held(&task_rq(p)->lock));
+       return container_of(css, struct task_group, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+       p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
+       p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+       return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
  inline void update_rq_clock(struct rq *rq)
  {
         if (!rq->skip_clock_update)
@@ -1195,6 +1194,27 @@ static void resched_cpu(int cpu)
  }
  
  #ifdef CONFIG_NO_HZ
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+       int cpu = smp_processor_id();
+       int i;
+       struct sched_domain *sd;
+
+       for_each_domain(cpu, sd) {
+               for_each_cpu(i, sched_domain_span(sd))
+                       if (!idle_cpu(i))
+                               return i;
+       }
+       return cpu;
+}
  /*
   * When add_timer_on() enqueues a timer into the timer wheel of an
   * idle CPU then this timer might expire before the next timer event
@@ -1257,6 +1277,12 @@ static void sched_avg_update(struct rq *rq)
         s64 period = sched_avg_period();
  
         while ((s64)(rq->clock - rq->age_stamp) > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (rq->age_stamp));
                 rq->age_stamp += period;
                 rq->rt_avg /= 2;
         }
@@ -1649,7 +1675,7 @@ static void update_shares(struct sched_domain *sd)
         if (root_task_group_empty())
                 return;
  
-       now = cpu_clock(raw_smp_processor_id());
+       now = local_clock();
         elapsed = now - sd->last_update;
  
         if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1660,9 +1686,6 @@ static void update_shares(struct sched_domain *sd)
  
  static void update_h_load(long cpu)
  {
-       if (root_task_group_empty())
-               return;
-
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
@@ -1805,6 +1828,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
  static void calc_load_account_idle(struct rq *this_rq);
  static void update_sysctl(void);
  static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);
  
  static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  {
@@ -2267,11 +2291,55 @@ static void update_avg(u64 *avg, u64 sample)
  }
  #endif
  
-/***
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+                                bool is_sync, bool is_migrate, bool is_local,
+                                unsigned long en_flags)
+{
+       schedstat_inc(p, se.statistics.nr_wakeups);
+       if (is_sync)
+               schedstat_inc(p, se.statistics.nr_wakeups_sync);
+       if (is_migrate)
+               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+       if (is_local)
+               schedstat_inc(p, se.statistics.nr_wakeups_local);
+       else
+               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+
+       activate_task(rq, p, en_flags);
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+                                       int wake_flags, bool success)
+{
+       trace_sched_wakeup(p, success);
+       check_preempt_curr(rq, p, wake_flags);
+
+       p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+       if (p->sched_class->task_woken)
+               p->sched_class->task_woken(rq, p);
+
+       if (unlikely(rq->idle_stamp)) {
+               u64 delta = rq->clock - rq->idle_stamp;
+               u64 max = 2*sysctl_sched_migration_cost;
+
+               if (delta > max)
+                       rq->avg_idle = max;
+               else
+                       update_avg(&rq->avg_idle, delta);
+               rq->idle_stamp = 0;
+       }
+#endif
+       /* if a worker is waking up, notify workqueue */
+       if ((p->flags & PF_WQ_WORKER) && success)
+               wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/**
   * try_to_wake_up - wake up a thread
- * @p: the to-be-woken-up thread
+ * @p: the thread to be awakened
   * @state: the mask of task states that can be woken
- * @sync: do a synchronous wakeup?
+ * @wake_flags: wake modifier flags (WF_*)
   *
   * Put it on the run-queue if it's not already there. The "current"
   * thread is always on the run-queue (except when the actual
@@ -2279,7 +2347,8 @@ static void update_avg(u64 *avg, u64 sample)
   * the simpler "current->state = TASK_RUNNING" to mark yourself
   * runnable without the overhead of this.
   *
- * returns failure only if the task is already active.
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
   */
  static int try_to_wake_up(struct task_struct *p, unsigned int state,
                           int wake_flags)
@@ -2359,38 +2428,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
  
  out_activate:
  #endif /* CONFIG_SMP */
-       schedstat_inc(p, se.statistics.nr_wakeups);
-       if (wake_flags & WF_SYNC)
-               schedstat_inc(p, se.statistics.nr_wakeups_sync);
-       if (orig_cpu != cpu)
-               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-       if (cpu == this_cpu)
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
-       else
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
-       activate_task(rq, p, en_flags);
+       ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
+                     cpu == this_cpu, en_flags);
         success = 1;
-
  out_running:
-       trace_sched_wakeup(p, success);
-       check_preempt_curr(rq, p, wake_flags);
-
-       p->state = TASK_RUNNING;
-#ifdef CONFIG_SMP
-       if (p->sched_class->task_woken)
-               p->sched_class->task_woken(rq, p);
-
-       if (unlikely(rq->idle_stamp)) {
-               u64 delta = rq->clock - rq->idle_stamp;
-               u64 max = 2*sysctl_sched_migration_cost;
-
-               if (delta > max)
-                       rq->avg_idle = max;
-               else
-                       update_avg(&rq->avg_idle, delta);
-               rq->idle_stamp = 0;
-       }
-#endif
+       ttwu_post_activation(p, rq, wake_flags, success);
  out:
         task_rq_unlock(rq, &flags);
         put_cpu();
@@ -2398,6 +2440,37 @@ out:
         return success;
  }
  
+/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there.  The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task.  this_rq() stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+       struct rq *rq = task_rq(p);
+       bool success = false;
+
+       BUG_ON(rq != this_rq());
+       BUG_ON(p == current);
+       lockdep_assert_held(&rq->lock);
+
+       if (!(p->state & TASK_NORMAL))
+               return;
+
+       if (!p->se.on_rq) {
+               if (likely(!task_running(rq, p))) {
+                       schedstat_inc(rq, ttwu_count);
+                       schedstat_inc(rq, ttwu_local);
+               }
+               ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+               success = true;
+       }
+       ttwu_post_activation(p, rq, 0, success);
+}
+
  /**
   * wake_up_process - Wake up a specific process
   * @p: The process to be woken up.
@@ -2494,7 +2567,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
  
+       /*
+        * The child is not yet in the pid-hash so no cgroup attach races,
+        * and the cgroup is pinned to this child because cgroup_fork()
+        * is run before sched_fork().
+        *
+        * Silence PROVE_RCU.
+        */
+       rcu_read_lock();
         set_task_cpu(p, cpu);
+       rcu_read_unlock();
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (likely(sched_info_on()))
@@ -2864,9 +2946,9 @@ unsigned long nr_iowait(void)
         return sum;
  }
  
-unsigned long nr_iowait_cpu(void)
+unsigned long nr_iowait_cpu(int cpu)
  {
-       struct rq *this = this_rq();
+       struct rq *this = cpu_rq(cpu);
         return atomic_read(&this->nr_iowait);
  }
  
@@ -3002,24 +3084,103 @@ static void calc_load_account_active(struct rq *this_rq)
         this_rq->calc_load_update += LOAD_FREQ;
  }
  
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT          7
+static const unsigned char
+               degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+               degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+                                       {0, 0, 0, 0, 0, 0, 0, 0},
+                                       {64, 32, 8, 0, 0, 0, 0, 0},
+                                       {96, 72, 40, 12, 1, 0, 0},
+                                       {112, 98, 75, 43, 15, 1, 0},
+                                       {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+       int j = 0;
+
+       if (!missed_updates)
+               return load;
+
+       if (missed_updates >= degrade_zero_ticks[idx])
+               return 0;
+
+       if (idx == 1)
+               return load >> missed_updates;
+
+       while (missed_updates) {
+               if (missed_updates % 2)
+                       load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+               missed_updates >>= 1;
+               j++;
+       }
+       return load;
+}
+
  /*
   * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
   */
  static void update_cpu_load(struct rq *this_rq)
  {
         unsigned long this_load = this_rq->load.weight;
+       unsigned long curr_jiffies = jiffies;
+       unsigned long pending_updates;
         int i, scale;
  
         this_rq->nr_load_updates++;
  
+       /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+       if (curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       this_rq->last_load_update_tick = curr_jiffies;
+
         /* Update our load: */
-       for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                 unsigned long old_load, new_load;
  
                 /* scale is effectively 1 << i now, and >> i divides by scale */
  
                 old_load = this_rq->cpu_load[i];
+               old_load = decay_load_missed(old_load, pending_updates - 1, i);
                 new_load = this_load;
                 /*
                  * Round up the averaging division if load is increasing. This
@@ -3027,9 +3188,15 @@ static void update_cpu_load(struct rq *this_rq)
                  * example.
                  */
                 if (new_load > old_load)
-                       new_load += scale-1;
-               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+                       new_load += scale - 1;
+
+               this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
         }
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+       update_cpu_load(this_rq);
  
         calc_load_account_active(this_rq);
  }
@@ -3417,7 +3584,7 @@ void scheduler_tick(void)
  
         raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
-       update_cpu_load(rq);
+       update_cpu_load_active(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
  
@@ -3589,7 +3756,6 @@ need_resched:
         rq = cpu_rq(cpu);
         rcu_note_context_switch(cpu);
         prev = rq->curr;
-       switch_count = &prev->nivcsw;
  
         release_kernel_lock(prev);
  need_resched_nonpreemptible:
@@ -3602,11 +3768,26 @@ need_resched_nonpreemptible:
         raw_spin_lock_irq(&rq->lock);
         clear_tsk_need_resched(prev);
  
+       switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-               if (unlikely(signal_pending_state(prev->state, prev)))
+               if (unlikely(signal_pending_state(prev->state, prev))) {
                         prev->state = TASK_RUNNING;
-               else
+               } else {
+                       /*
+                        * If a worker is going to sleep, notify and
+                        * ask workqueue whether it wants to wake up a
+                        * task to maintain concurrency.  If so, wake
+                        * up the task.
+                        */
+                       if (prev->flags & PF_WQ_WORKER) {
+                               struct task_struct *to_wakeup;
+
+                               to_wakeup = wq_worker_sleeping(prev, cpu);
+                               if (to_wakeup)
+                                       try_to_wake_up_local(to_wakeup);
+                       }
                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
+               }
                 switch_count = &prev->nvcsw;
         }
  
@@ -3628,8 +3809,10 @@ need_resched_nonpreemptible:
  
                 context_switch(rq, prev, next); /* unlocks the rq */
                 /*
-                * the context switch might have flipped the stack from under
-                * us, hence refresh the local variables.
+                * The context switch has flipped the stack from under us
+                * and restored the local variables which were saved when
+                * this task called schedule() in the past. prev == current
+                * is still correct, but it can be moved to another cpu/rq.
                  */
                 cpu = smp_processor_id();
                 rq = cpu_rq(cpu);
@@ -3638,11 +3821,8 @@ need_resched_nonpreemptible:
  
         post_schedule(rq);
  
-       if (unlikely(reacquire_kernel_lock(current) < 0)) {
-               prev = rq->curr;
-               switch_count = &prev->nivcsw;
+       if (unlikely(reacquire_kernel_lock(prev)))
                 goto need_resched_nonpreemptible;
-       }
  
         preempt_enable_no_resched();
         if (need_resched())
@@ -4432,12 +4612,8 @@ recheck:
          */
         if (user && !capable(CAP_SYS_NICE)) {
                 if (rt_policy(policy)) {
-                       unsigned long rlim_rtprio;
-
-                       if (!lock_task_sighand(p, &flags))
-                               return -ESRCH;
-                       rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
-                       unlock_task_sighand(p, &flags);
+                       unsigned long rlim_rtprio =
+                                       task_rlimit(p, RLIMIT_RTPRIO);
  
                         /* can't set/change the rt policy */
                         if (policy != p->policy && !rlim_rtprio)
@@ -4465,16 +4641,6 @@ recheck:
         }
  
         if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
-               /*
-                * Do not allow realtime tasks into groups that have no runtime
-                * assigned.
-                */
-               if (rt_bandwidth_enabled() && rt_policy(policy) &&
-                               task_group(p)->rt_bandwidth.rt_runtime == 0)
-                       return -EPERM;
-#endif
-
                 retval = security_task_setscheduler(p, policy, param);
                 if (retval)
                         return retval;
@@ -4490,6 +4656,22 @@ recheck:
          * runqueue lock must be held.
          */
         rq = __task_rq_lock(p);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (user) {
+               /*
+                * Do not allow realtime tasks into groups that have no runtime
+                * assigned.
+                */
+               if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->rt_bandwidth.rt_runtime == 0) {
+                       __task_rq_unlock(rq);
+                       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+                       return -EPERM;
+               }
+       }
+#endif
+
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
@@ -5801,20 +5983,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
   */
  static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
-       .priority = 10
+       .priority = CPU_PRI_MIGRATION,
  };
  
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+                                     unsigned long action, void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_ONLINE:
+       case CPU_DOWN_FAILED:
+               set_cpu_active((long)hcpu, true);
+               return NOTIFY_OK;
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+                                       unsigned long action, void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               set_cpu_active((long)hcpu, false);
+               return NOTIFY_OK;
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
  static int __init migration_init(void)
  {
         void *cpu = (void *)(long)smp_processor_id();
         int err;
  
-       /* Start one for the boot CPU: */
+       /* Initialize migration for the boot CPU */
         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
         BUG_ON(err == NOTIFY_BAD);
         migration_call(&migration_notifier, CPU_ONLINE, cpu);
         register_cpu_notifier(&migration_notifier);
  
+       /* Register cpu active notifiers */
+       cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+       cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
         return 0;
  }
  early_initcall(migration_init);
@@ -6049,23 +6260,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                 free_rootdomain(old_rd);
  }
  
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd)
  {
-       gfp_t gfp = GFP_KERNEL;
-
         memset(rd, 0, sizeof(*rd));
  
-       if (bootmem)
-               gfp = GFP_NOWAIT;
-
-       if (!alloc_cpumask_var(&rd->span, gfp))
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
                 goto out;
-       if (!alloc_cpumask_var(&rd->online, gfp))
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                 goto free_span;
-       if (!alloc_cpumask_var(&rd->rto_mask, gfp))
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                 goto free_online;
  
-       if (cpupri_init(&rd->cpupri, bootmem) != 0)
+       if (cpupri_init(&rd->cpupri) != 0)
                 goto free_rto_mask;
         return 0;
  
@@ -6081,7 +6287,7 @@ out:
  
  static void init_defrootdomain(void)
  {
-       init_rootdomain(&def_root_domain, true);
+       init_rootdomain(&def_root_domain);
  
         atomic_set(&def_root_domain.refcount, 1);
  }
@@ -6094,7 +6300,7 @@ static struct root_domain *alloc_rootdomain(void)
         if (!rd)
                 return NULL;
  
-       if (init_rootdomain(rd, false) != 0) {
+       if (init_rootdomain(rd) != 0) {
                 kfree(rd);
                 return NULL;
         }
@@ -7273,29 +7479,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
  }
  #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
-#ifndef CONFIG_CPUSETS
  /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
   */
-static int update_sched_domains(struct notifier_block *nfb,
-                               unsigned long action, void *hcpu)
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+                            void *hcpu)
  {
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
         case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
-               partition_sched_domains(1, NULL, NULL);
+               cpuset_update_active_cpus();
                 return NOTIFY_OK;
+       default:
+               return NOTIFY_DONE;
+       }
+}
  
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+                              void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               cpuset_update_active_cpus();
+               return NOTIFY_OK;
         default:
                 return NOTIFY_DONE;
         }
  }
-#endif
  
  static int update_runtime(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
@@ -7341,10 +7553,8 @@ void __init sched_init_smp(void)
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
  
-#ifndef CONFIG_CPUSETS
-       /* XXX: Theoretical race here - CPU may be hotplugged now */
-       hotcpu_notifier(update_sched_domains, 0);
-#endif
+       hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+       hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
  
         /* RT runtime code needs to handle some hotplug events */
         hotcpu_notifier(update_runtime, 0);
@@ -7589,6 +7799,9 @@ void __init sched_init(void)
  
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                         rq->cpu_load[j] = 0;
+
+               rq->last_load_update_tick = jiffies;
+
  #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
@@ -7602,6 +7815,10 @@ void __init sched_init(void)
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
                 rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+               rq->nohz_balance_kick = 0;
+               init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
  #endif
                 init_rq_hrtick(rq);
                 atomic_set(&rq->nr_iowait, 0);
@@ -7646,8 +7863,11 @@ void __init sched_init(void)
         zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
  #ifdef CONFIG_SMP
  #ifdef CONFIG_NO_HZ
-       zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-       alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+       zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+       alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+       atomic_set(&nohz.load_balancer, nr_cpu_ids);
+       atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+       atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
  #endif
         /* May be allocated at isolcpus cmdline parse time */
         if (cpu_isolated_map == NULL)