Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[pandora-kernel.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 297d1a0..18d38e4 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
+#include <asm/mutex.h>
  
  #include "sched_cpupri.h"
  #include "workqueue_sched.h"
+#include "sched_autogroup.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
         unsigned long shares;
+
+       atomic_t load_weight;
  #endif
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
         struct task_group *parent;
         struct list_head siblings;
         struct list_head children;
-};
  
-#define root_task_group init_task_group
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
+};
  
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-       return list_empty(&root_task_group.children);
-}
-#endif
-
-# define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
+# define ROOT_TASK_GROUP_LOAD  NICE_0_LOAD
  
  /*
   * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
  #define MIN_SHARES     2
  #define MAX_SHARES     (1UL << 18)
  
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
  #endif
  
  /* Default task group.
   *     Every task in system belong to this group at bootup.
   */
-struct task_group init_task_group;
+struct task_group root_task_group;
  
  #endif /* CONFIG_CGROUP_SCHED */
  
@@ -342,6 +339,7 @@ struct cfs_rq {
          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
          * list is used during load balance.
          */
+       int on_list;
         struct list_head leaf_cfs_rq_list;
         struct task_group *tg;  /* group that "owns" this runqueue */
  
@@ -360,14 +358,17 @@ struct cfs_rq {
         unsigned long h_load;
  
         /*
-        * this cpu's part of tg->shares
+        * Maintaining per-cpu shares distribution for group scheduling
+        *
+        * load_stamp is the last time we updated the load average
+        * load_last is the last time we updated the load average and saw load
+        * load_unacc_exec_time is currently unaccounted execution time
          */
-       unsigned long shares;
+       u64 load_avg;
+       u64 load_period;
+       u64 load_stamp, load_last, load_unacc_exec_time;
  
-       /*
-        * load.weight at the time we set shares
-        */
-       unsigned long rq_weight;
+       unsigned long load_contribution;
  #endif
  #endif
  };
@@ -552,9 +553,6 @@ struct rq {
         /* try_to_wake_up() stats */
         unsigned int ttwu_count;
         unsigned int ttwu_local;
-
-       /* BKL stats */
-       unsigned int bkl_count;
  #endif
  };
  
@@ -605,11 +603,17 @@ static inline int cpu_of(struct rq *rq)
   */
  static inline struct task_group *task_group(struct task_struct *p)
  {
+       struct task_group *tg;
         struct cgroup_subsys_state *css;
  
+       if (p->flags & PF_EXITING)
+               return &root_task_group;
+
         css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
                         lockdep_is_held(&task_rq(p)->lock));
-       return container_of(css, struct task_group, css);
+       tg = container_of(css, struct task_group, css);
+
+       return autogroup_task_group(p, tg);
  }
  
  /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -737,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
         buf[cnt] = 0;
         cmp = strstrip(buf);
  
-       if (strncmp(buf, "NO_", 3) == 0) {
+       if (strncmp(cmp, "NO_", 3) == 0) {
                 neg = 1;
                 cmp += 3;
         }
@@ -792,20 +796,6 @@ late_initcall(sched_init_debug);
   */
  const_debug unsigned int sysctl_sched_nr_migrate = 32;
  
-/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
  /*
   * period over which we average the RT time consumption, measured
   * in ms.
@@ -1355,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
         lw->inv_weight = 0;
  }
  
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+       lw->weight = w;
+       lw->inv_weight = 0;
+}
+
  /*
   * To aid in avoiding the subversion of "niceness" due to uneven distribution
   * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-                                   unsigned long sd_shares,
-                                   unsigned long sd_rq_weight,
-                                   unsigned long *usd_rq_weight)
-{
-       unsigned long shares, rq_weight;
-       int boost = 0;
-
-       rq_weight = usd_rq_weight[cpu];
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       /*
-        *             \Sum_j shares_j * rq_weight_i
-        * shares_i =  -----------------------------
-        *                  \Sum_j rq_weight_j
-        */
-       shares = (sd_shares * rq_weight) / sd_rq_weight;
-       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-       if (abs(shares - tg->se[cpu]->load.weight) >
-                       sysctl_sched_shares_thresh) {
-               struct rq *rq = cpu_rq(cpu);
-               unsigned long flags;
-
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               __set_se_shares(tg->se[cpu], shares);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-       }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
-       unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-       unsigned long *usd_rq_weight;
-       struct sched_domain *sd = data;
-       unsigned long flags;
-       int i;
-
-       if (!tg->se[0])
-               return 0;
-
-       local_irq_save(flags);
-       usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-       for_each_cpu(i, sched_domain_span(sd)) {
-               weight = tg->cfs_rq[i]->load.weight;
-               usd_rq_weight[i] = weight;
-
-               rq_weight += weight;
-               /*
-                * If there are currently no tasks on the cpu pretend there
-                * is one of average load so that when a new task gets to
-                * run here it will not get delayed by group starvation.
-                */
-               if (!weight)
-                       weight = NICE_0_LOAD;
-
-               sum_weight += weight;
-               shares += tg->cfs_rq[i]->shares;
-       }
-
-       if (!rq_weight)
-               rq_weight = sum_weight;
-
-       if ((!shares && rq_weight) || shares > tg->shares)
-               shares = tg->shares;
-
-       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-               shares = tg->shares;
-
-       for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
-       local_irq_restore(flags);
-
-       return 0;
-}
-
  /*
   * Compute the cpu's hierarchical load factor for each task group.
   * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
                 load = cpu_rq(cpu)->load.weight;
         } else {
                 load = tg->parent->cfs_rq[cpu]->h_load;
-               load *= tg->cfs_rq[cpu]->shares;
+               load *= tg->se[cpu]->load.weight;
                 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
         }
  
@@ -1661,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
         return 0;
  }
  
-static void update_shares(struct sched_domain *sd)
-{
-       s64 elapsed;
-       u64 now;
-
-       if (root_task_group_empty())
-               return;
-
-       now = local_clock();
-       elapsed = now - sd->last_update;
-
-       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-               sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, sd);
-       }
-}
-
  static void update_h_load(long cpu)
  {
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
  #endif
  
  #ifdef CONFIG_PREEMPT
@@ -1810,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
  
  #endif
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-       cfs_rq->shares = shares;
-#endif
-}
-#endif
-
  static void calc_load_account_idle(struct rq *this_rq);
  static void update_sysctl(void);
  static int get_update_sysctl_factor(void);
@@ -2063,6 +1932,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
  #include "sched_idletask.c"
  #include "sched_fair.c"
  #include "sched_rt.c"
+#include "sched_autogroup.c"
  #include "sched_stoptask.c"
  #ifdef CONFIG_SCHED_DEBUG
  # include "sched_debug.c"
@@ -2255,10 +2125,8 @@ static int migration_cpu_stop(void *data);
   * The task's runqueue lock must be held.
   * Returns true if you have to wait for migration thread.
   */
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
  {
-       struct rq *rq = task_rq(p);
-
         /*
          * If the task is not on a runqueue (and not running), then
          * the next wake-up will properly place the task.
@@ -2438,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                 return dest_cpu;
  
         /* No more Mr. Nice Guy. */
-       if (unlikely(dest_cpu >= nr_cpu_ids)) {
-               dest_cpu = cpuset_cpus_allowed_fallback(p);
-               /*
-                * Don't tell them about moving exiting tasks or
-                * kernel threads (both mm NULL), since they never
-                * leave kernel.
-                */
-               if (p->mm && printk_ratelimit()) {
-                       printk(KERN_INFO "process %d (%s) no "
-                              "longer affine to cpu%d\n",
-                              task_pid_nr(p), p->comm, cpu);
-               }
+       dest_cpu = cpuset_cpus_allowed_fallback(p);
+       /*
+        * Don't tell them about moving exiting tasks or
+        * kernel threads (both mm NULL), since they never
+        * leave kernel.
+        */
+       if (p->mm && printk_ratelimit()) {
+               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+                               task_pid_nr(p), p->comm, cpu);
         }
  
         return dest_cpu;
@@ -2640,7 +2505,7 @@ out:
   * try_to_wake_up_local - try to wake up a local task with rq lock held
   * @p: the thread to be awakened
   *
- * Put @p on the run-queue if it's not alredy there.  The caller must
+ * Put @p on the run-queue if it's not already there.  The caller must
   * ensure that this_rq() is locked, @p is bound to this_rq() and not
   * the current task.  this_rq() stays locked over invocation.
   */
@@ -2785,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
  #endif
+#ifdef CONFIG_SMP
         plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
  
         put_cpu();
  }
@@ -3549,7 +3416,7 @@ void sched_exec(void)
          * select_task_rq() can race against ->cpus_allowed
          */
         if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-           likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+           likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
                 struct migration_arg arg = { p, dest_cpu };
  
                 task_rq_unlock(rq, &flags);
@@ -4020,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
         schedstat_inc(this_rq(), sched_count);
  #ifdef CONFIG_SCHEDSTATS
         if (unlikely(prev->lock_depth >= 0)) {
-               schedstat_inc(this_rq(), bkl_count);
+               schedstat_inc(this_rq(), rq_sched_info.bkl_count);
                 schedstat_inc(prev, sched_info.bkl_count);
         }
  #endif
@@ -4214,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
                 if (task_thread_info(rq->curr) != owner || need_resched())
                         return 0;
  
-               cpu_relax();
+               arch_mutex_cpu_relax();
         }
  
         return 1;
@@ -4526,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
   * This waits for either a completion of a specific task to be signaled or for a
   * specified timeout to expire. It is interruptible. The timeout is in jiffies.
   */
-unsigned long __sched
+long __sched
  wait_for_completion_interruptible_timeout(struct completion *x,
                                           unsigned long timeout)
  {
@@ -4559,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
   * signaled or for a specified timeout to expire. It can be
   * interrupted by a kill signal. The timeout is in jiffies.
   */
-unsigned long __sched
+long __sched
  wait_for_completion_killable_timeout(struct completion *x,
                                      unsigned long timeout)
  {
@@ -4901,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
  }
  
  static int __sched_setscheduler(struct task_struct *p, int policy,
-                               struct sched_param *param, bool user)
+                               const struct sched_param *param, bool user)
  {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         unsigned long flags;
@@ -5004,7 +4871,8 @@ recheck:
                  * assigned.
                  */
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
-                               task_group(p)->rt_bandwidth.rt_runtime == 0) {
+                               task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+                               !task_group_is_autogroup(task_group(p))) {
                         __task_rq_unlock(rq);
                         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                         return -EPERM;
@@ -5056,7 +4924,7 @@ recheck:
   * NOTE that the task may be already dead.
   */
  int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+                      const struct sched_param *param)
  {
         return __sched_setscheduler(p, policy, param, true);
  }
@@ -5074,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
   * but our caller might not have that capability.
   */
  int sched_setscheduler_nocheck(struct task_struct *p, int policy,
-                              struct sched_param *param)
+                              const struct sched_param *param)
  {
         return __sched_setscheduler(p, policy, param, false);
  }
@@ -5590,7 +5458,7 @@ void sched_show_task(struct task_struct *p)
         unsigned state;
  
         state = p->state ? __ffs(p->state) + 1 : 0;
-       printk(KERN_INFO "%-13.13s %c", p->comm,
+       printk(KERN_INFO "%-15.15s %c", p->comm,
                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
  #if BITS_PER_LONG == 32
         if (state == TASK_RUNNING)
@@ -5754,7 +5622,6 @@ static void update_sysctl(void)
         SET_SYSCTL(sched_min_granularity);
         SET_SYSCTL(sched_latency);
         SET_SYSCTL(sched_wakeup_granularity);
-       SET_SYSCTL(sched_shares_ratelimit);
  #undef SET_SYSCTL
  }
  
@@ -5830,7 +5697,7 @@ again:
                 goto out;
  
         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (migrate_task(p, dest_cpu)) {
+       if (migrate_task(p, rq)) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
                 task_rq_unlock(rq, &flags);
@@ -5912,29 +5779,20 @@ static int migration_cpu_stop(void *data)
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
+
  /*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
   */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
  {
-       struct rq *rq = cpu_rq(dead_cpu);
-       int needs_cpu, uninitialized_var(dest_cpu);
-       unsigned long flags;
+       struct mm_struct *mm = current->active_mm;
  
-       local_irq_save(flags);
+       BUG_ON(cpu_online(smp_processor_id()));
  
-       raw_spin_lock(&rq->lock);
-       needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-       if (needs_cpu)
-               dest_cpu = select_fallback_rq(dead_cpu, p);
-       raw_spin_unlock(&rq->lock);
-       /*
-        * It can only fail if we race with set_cpus_allowed(),
-        * in the racer should migrate the task anyway.
-        */
-       if (needs_cpu)
-               __migrate_task(p, dead_cpu, dest_cpu);
-       local_irq_restore(flags);
+       if (mm != &init_mm)
+               switch_mm(mm, &init_mm, current);
+       mmdrop(mm);
  }
  
  /*
@@ -5947,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  static void migrate_nr_uninterruptible(struct rq *rq_src)
  {
         struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-       unsigned long flags;
  
-       local_irq_save(flags);
-       double_rq_lock(rq_src, rq_dest);
         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
         rq_src->nr_uninterruptible = 0;
-       double_rq_unlock(rq_src, rq_dest);
-       local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-       struct task_struct *p, *t;
-
-       read_lock(&tasklist_lock);
-
-       do_each_thread(t, p) {
-               if (p == current)
-                       continue;
-
-               if (task_cpu(p) == src_cpu)
-                       move_task_off_dead_cpu(src_cpu, p);
-       } while_each_thread(t, p);
-
-       read_unlock(&tasklist_lock);
  }
  
  /*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
   */
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
  {
-       int this_cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(this_cpu);
-       struct task_struct *p = rq->idle;
-       unsigned long flags;
-
-       /* cpu has to be offline */
-       BUG_ON(cpu_online(this_cpu));
-
-       /*
-        * Strictly not necessary since rest of the CPUs are stopped by now
-        * and interrupts disabled on the current cpu.
-        */
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-       activate_task(rq, p, 0);
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
  }
  
  /*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we're in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
   */
-void idle_task_exit(void)
-{
-       struct mm_struct *mm = current->active_mm;
-
-       BUG_ON(cpu_online(smp_processor_id()));
-
-       if (mm != &init_mm)
-               switch_mm(mm, &init_mm, current);
-       mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
  {
         struct rq *rq = cpu_rq(dead_cpu);
-
-       /* Must be exiting, otherwise would be on tasklist. */
-       BUG_ON(!p->exit_state);
-
-       /* Cannot have done final schedule yet: would have vanished. */
-       BUG_ON(p->state == TASK_DEAD);
-
-       get_task_struct(p);
+       struct task_struct *next, *stop = rq->stop;
+       int dest_cpu;
  
         /*
-        * Drop lock around migration; if someone else moves it,
-        * that's OK. No task can be added to this CPU, so iteration is
-        * fine.
+        * Fudge the rq selection such that the below task selection loop
+        * doesn't get stuck on the currently eligible stop task.
+        *
+        * We're currently inside stop_machine() and the rq is either stuck
+        * in the stop_machine_cpu_stop() loop, or we're executing this code,
+        * either way we should never end up calling schedule() until we're
+        * done here.
          */
-       raw_spin_unlock_irq(&rq->lock);
-       move_task_off_dead_cpu(dead_cpu, p);
-       raw_spin_lock_irq(&rq->lock);
-
-       put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-       struct rq *rq = cpu_rq(dead_cpu);
-       struct task_struct *next;
+       rq->stop = NULL;
  
         for ( ; ; ) {
-               if (!rq->nr_running)
+               /*
+                * There's this thread running, bail when that's the only
+                * remaining thread.
+                */
+               if (rq->nr_running == 1)
                         break;
+
                 next = pick_next_task(rq);
-               if (!next)
-                       break;
+               BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
-               migrate_dead(dead_cpu, next);
  
+               /* Find suitable destination for @next, with force if needed. */
+               dest_cpu = select_fallback_rq(dead_cpu, next);
+               raw_spin_unlock(&rq->lock);
+
+               __migrate_task(next, dead_cpu, dest_cpu);
+
+               raw_spin_lock(&rq->lock);
         }
-}
  
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-       rq->calc_load_active = 0;
+       rq->stop = stop;
  }
+
  #endif /* CONFIG_HOTPLUG_CPU */
  
  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         unsigned long flags;
         struct rq *rq = cpu_rq(cpu);
  
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
  
         case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
                 rq->calc_load_update = calc_load_update;
                 break;
  
         case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
@@ -6298,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 break;
  
  #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               migrate_live_tasks(cpu);
-               /* Idle task back to normal (off runqueue, low prio) */
-               raw_spin_lock_irq(&rq->lock);
-               deactivate_task(rq, rq->idle, 0);
-               __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-               rq->idle->sched_class = &idle_sched_class;
-               migrate_dead_tasks(cpu);
-               raw_spin_unlock_irq(&rq->lock);
-               migrate_nr_uninterruptible(rq);
-               BUG_ON(rq->nr_running != 0);
-               calc_global_load_remove(rq);
-               break;
-
         case CPU_DYING:
-       case CPU_DYING_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
                 }
+               migrate_tasks(cpu);
+               BUG_ON(rq->nr_running != 1); /* the migration thread */
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+               migrate_nr_uninterruptible(rq);
+               calc_global_load_remove(rq);
                 break;
  #endif
         }
@@ -8052,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-                               struct sched_entity *se, int cpu, int add,
+                               struct sched_entity *se, int cpu,
                                 struct sched_entity *parent)
  {
         struct rq *rq = cpu_rq(cpu);
         tg->cfs_rq[cpu] = cfs_rq;
         init_cfs_rq(cfs_rq, rq);
         cfs_rq->tg = tg;
-       if (add)
-               list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
  
         tg->se[cpu] = se;
-       /* se could be NULL for init_task_group */
+       /* se could be NULL for root_task_group */
         if (!se)
                 return;
  
@@ -8073,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                 se->cfs_rq = parent->my_q;
  
         se->my_q = cfs_rq;
-       se->load.weight = tg->shares;
-       se->load.inv_weight = 0;
+       update_load_set(&se->load, 0);
         se->parent = parent;
  }
  #endif
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu, int add,
+               struct sched_rt_entity *rt_se, int cpu,
                 struct sched_rt_entity *parent)
  {
         struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
         init_rt_rq(rt_rq, rq);
         rt_rq->tg = tg;
         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-       if (add)
-               list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
  
         tg->rt_se[cpu] = rt_se;
         if (!rt_se)
@@ -8126,18 +7907,18 @@ void __init sched_init(void)
                 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-               init_task_group.se = (struct sched_entity **)ptr;
+               root_task_group.se = (struct sched_entity **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
  
-               init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+               root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
  
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  #ifdef CONFIG_RT_GROUP_SCHED
-               init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
  
-               init_task_group.rt_rq = (struct rt_rq **)ptr;
+               root_task_group.rt_rq = (struct rt_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
  
  #endif /* CONFIG_RT_GROUP_SCHED */
@@ -8157,20 +7938,16 @@ void __init sched_init(void)
                         global_rt_period(), global_rt_runtime());
  
  #ifdef CONFIG_RT_GROUP_SCHED
-       init_rt_bandwidth(&init_task_group.rt_bandwidth,
+       init_rt_bandwidth(&root_task_group.rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
  #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_CGROUP_SCHED
-       list_add(&init_task_group.list, &task_groups);
-       INIT_LIST_HEAD(&init_task_group.children);
-
+       list_add(&root_task_group.list, &task_groups);
+       INIT_LIST_HEAD(&root_task_group.children);
+       autogroup_init(&init_task);
  #endif /* CONFIG_CGROUP_SCHED */
  
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-       update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-                                           __alignof__(unsigned long));
-#endif
         for_each_possible_cpu(i) {
                 struct rq *rq;
  
@@ -8182,38 +7959,34 @@ void __init sched_init(void)
                 init_cfs_rq(&rq->cfs, rq);
                 init_rt_rq(&rq->rt, rq);
  #ifdef CONFIG_FAIR_GROUP_SCHED
-               init_task_group.shares = init_task_group_load;
+               root_task_group.shares = root_task_group_load;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
                 /*
-                * How much cpu bandwidth does init_task_group get?
+                * How much cpu bandwidth does root_task_group get?
                  *
                  * In case of task-groups formed thr' the cgroup filesystem, it
                  * gets 100% of the cpu resources in the system. This overall
                  * system cpu resource is divided among the tasks of
-                * init_task_group and its child task-groups in a fair manner,
+                * root_task_group and its child task-groups in a fair manner,
                  * based on each entity's (task or task-group's) weight
                  * (se->load.weight).
                  *
-                * In other words, if init_task_group has 10 tasks of weight
+                * In other words, if root_task_group has 10 tasks of weight
                  * 1024) and two child groups A0 and A1 (of weight 1024 each),
                  * then A0's share of the cpu resource is:
                  *
                  *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                  *
-                * We achieve this by letting init_task_group's tasks sit
-                * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+                * We achieve this by letting root_task_group's tasks sit
+                * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                  */
-               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+               init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
  #ifdef CONFIG_RT_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
-               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+               init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
  #endif
  
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8293,8 +8066,6 @@ void __init sched_init(void)
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
  #endif /* SMP */
  
-       perf_event_init();
-
         scheduler_running = 1;
  }
  
@@ -8488,7 +8259,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 if (!se)
                         goto err_free_rq;
  
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
         }
  
         return 1;
@@ -8499,15 +8270,21 @@ err:
         return 0;
  }
  
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
-                       &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
-       list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /*
+        * Only empty task groups can be destroyed; so we can speculatively
+        * check on_list without danger of it being re-added.
+        */
+       if (!tg->cfs_rq[cpu]->on_list)
+               return;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  #else /* !CONFG_FAIR_GROUP_SCHED */
  static inline void free_fair_sched_group(struct task_group *tg)
@@ -8520,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 1;
  }
  
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
  }
@@ -8578,7 +8351,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                 if (!rt_se)
                         goto err_free_rq;
  
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
         }
  
         return 1;
@@ -8588,17 +8361,6 @@ err_free_rq:
  err:
         return 0;
  }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
-                       &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
  #else /* !CONFIG_RT_GROUP_SCHED */
  static inline void free_rt_sched_group(struct task_group *tg)
  {
@@ -8609,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
         return 1;
  }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
  #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_CGROUP_SCHED
@@ -8624,6 +8378,7 @@ static void free_sched_group(struct task_group *tg)
  {
         free_fair_sched_group(tg);
         free_rt_sched_group(tg);
+       autogroup_free(tg);
         kfree(tg);
  }
  
@@ -8632,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
  {
         struct task_group *tg;
         unsigned long flags;
-       int i;
  
         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
         if (!tg)
@@ -8645,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
                 goto err;
  
         spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
-               register_fair_sched_group(tg, i);
-               register_rt_sched_group(tg, i);
-       }
         list_add_rcu(&tg->list, &task_groups);
  
         WARN_ON(!parent); /* root should already exist */
@@ -8678,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
         unsigned long flags;
         int i;
  
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
+       /* end participation in shares distribution */
+       for_each_possible_cpu(i)
                 unregister_fair_sched_group(tg, i);
-               unregister_rt_sched_group(tg, i);
-       }
+
+       spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8729,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
  #endif /* CONFIG_CGROUP_SCHED */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       int on_rq;
-
-       on_rq = se->on_rq;
-       if (on_rq)
-               dequeue_entity(cfs_rq, se, 0);
-
-       se->load.weight = shares;
-       se->load.inv_weight = 0;
-
-       if (on_rq)
-               enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __set_se_shares(se, shares);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
  static DEFINE_MUTEX(shares_mutex);
  
  int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8778,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         if (tg->shares == shares)
                 goto done;
  
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
-       list_del_rcu(&tg->siblings);
-       spin_unlock_irqrestore(&task_group_lock, flags);
-
-       /* wait for any ongoing reference to this group to finish */
-       synchronize_sched();
-
-       /*
-        * Now we are free to modify the group's share on each cpu
-        * w/o tripping rebalance_share or load_balance_fair.
-        */
         tg->shares = shares;
         for_each_possible_cpu(i) {
-               /*
-                * force a rebalance
-                */
-               cfs_rq_set_shares(tg->cfs_rq[i], 0);
-               set_se_shares(tg->se[i], shares);
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se;
+
+               se = tg->se[i];
+               /* Propagate contribution to hierarchy */
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               for_each_sched_entity(se)
+                       update_cfs_shares(group_cfs_rq(se), 0);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
         }
  
-       /*
-        * Enable load balance activity on this group, by inserting it back on
-        * each cpu's rq->leaf_cfs_rq_list.
-        */
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               register_fair_sched_group(tg, i);
-       list_add_rcu(&tg->siblings, &tg->parent->children);
-       spin_unlock_irqrestore(&task_group_lock, flags);
  done:
         mutex_unlock(&shares_mutex);
         return 0;
@@ -9107,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
  
         if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
-               return &init_task_group.css;
+               return &root_task_group.css;
         }
  
         parent = cgroup_tg(cgrp->parent);
@@ -9178,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
         }
  }
  
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+{
+       /*
+        * cgroup_exit() is called in the copy_process() failure path.
+        * Ignore this case since the task hasn't run yet; this avoids
+        * trying to poke a half freed task state from generic code.
+        */
+       if (!(task->flags & PF_EXITING))
+               return;
+
+       sched_move_task(task);
+}
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
                                 u64 shareval)
@@ -9250,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
         .destroy        = cpu_cgroup_destroy,
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
+       .exit           = cpu_cgroup_exit,
         .populate       = cpu_cgroup_populate,
         .subsys_id      = cpu_cgroup_subsys_id,
         .early_init     = 1,
@@ -9534,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = {
  };
  #endif /* CONFIG_CGROUP_CPUACCT */
  
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
-       barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
-       /*
-        * There must be a full memory barrier on each affected CPU
-        * between the time that try_stop_cpus() is called and the
-        * time that it returns.
-        *
-        * In the current initial implementation of cpu_stop, the
-        * above condition is already met when the control reaches
-        * this point and the following smp_mb() is not strictly
-        * necessary.  Do smp_mb() anyway for documentation and
-        * robustness against future implementation changes.
-        */
-       smp_mb(); /* See above comment block. */
-       return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly.  This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier.  Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
-       int snap, trycount = 0;
-
-       smp_mb();  /* ensure prior mod happens before capturing snap. */
-       snap = atomic_read(&synchronize_sched_expedited_count) + 1;
-       get_online_cpus();
-       while (try_stop_cpus(cpu_online_mask,
-                            synchronize_sched_expedited_cpu_stop,
-                            NULL) == -EAGAIN) {
-               put_online_cpus();
-               if (trycount++ < 10)
-                       udelay(trycount * num_online_cpus());
-               else {
-                       synchronize_sched();
-                       return;
-               }
-               if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
-                       smp_mb(); /* ensure test happens before caller kfree */
-                       return;
-               }
-               get_online_cpus();
-       }
-       atomic_inc(&synchronize_sched_expedited_count);
-       smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
-       put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */