Merge branch 'unlikely/sched' of git://git.kernel.org/pub/scm/linux/kernel/git/rosted...

author Ingo Molnar <mingo@elte.hu>
Fri, 3 Jun 2011 08:27:47 +0000 (10:27 +0200)
committer Ingo Molnar <mingo@elte.hu>
Fri, 3 Jun 2011 08:27:47 +0000 (10:27 +0200)
diff --combined kernel/sched.c

index 2fe98ed,6d24b2e..fd18f39
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -32,6 -32,7 +32,6 @@@
   #include <linux/init.h>
   #include <linux/uaccess.h>
   #include <linux/highmem.h>
- -#include <linux/smp_lock.h>
   #include <asm/mmu_context.h>
   #include <linux/interrupt.h>
   #include <linux/capability.h>
@@@ -74,11 -75,9 +74,11 @@@
   
   #include <asm/tlb.h>
   #include <asm/irq_regs.h>
+ +#include <asm/mutex.h>
   
   #include "sched_cpupri.h"
   #include "workqueue_sched.h"
+ +#include "sched_autogroup.h"
   
   #define CREATE_TRACE_POINTS
   #include <trace/events/sched.h>
@@@ -124,7 -123,7 +124,7 @@@
   
   static inline int rt_policy(int policy)
   {
-       if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+       if (policy == SCHED_FIFO || policy == SCHED_RR)
                 return 1;
         return 0;
   }
@@@ -231,7 -230,7 +231,7 @@@ static void destroy_rt_bandwidth(struc
   #endif
   
   /*
- - * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ + * sched_domains_mutex serializes calls to init_sched_domains,
    * detach_destroy_domains and partition_sched_domains.
    */
   static DEFINE_MUTEX(sched_domains_mutex);
@@@ -254,8 -253,6 +254,8 @@@ struct task_group 
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
         unsigned long shares;
+ +
+ +      atomic_t load_weight;
   #endif
   
   #ifdef CONFIG_RT_GROUP_SCHED
@@@ -271,18 -268,25 +271,18 @@@
         struct task_group *parent;
         struct list_head siblings;
         struct list_head children;
- -};
   
- -#define root_task_group init_task_group
+ +#ifdef CONFIG_SCHED_AUTOGROUP
+ +      struct autogroup *autogroup;
+ +#endif
+ +};
   
- -/* task_group_lock serializes add/remove of task groups and also changes to
- - * a task group's cpu shares.
- - */
+ +/* task_group_lock serializes the addition/removal of task groups */
   static DEFINE_SPINLOCK(task_group_lock);
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   
- -#ifdef CONFIG_SMP
- -static int root_task_group_empty(void)
- -{
- -      return list_empty(&root_task_group.children);
- -}
- -#endif
- -
- -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
+ +# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
   
   /*
    * A weight of 0 or 1 can cause arithmetics problems.
@@@ -293,15 -297,15 +293,15 @@@
    *  limitation from this.)
    */
   #define MIN_SHARES    2
- -#define MAX_SHARES    (1UL << 18)
+ +#define MAX_SHARES    (1UL << (18 + SCHED_LOAD_RESOLUTION))
   
- -static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+ +static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
   #endif
   
   /* Default task group.
    *    Every task in system belong to this group at bootup.
    */
- -struct task_group init_task_group;
+ +struct task_group root_task_group;
   
   #endif        /* CONFIG_CGROUP_SCHED */
   
@@@ -312,9 -316,6 +312,9 @@@ struct cfs_rq 
   
         u64 exec_clock;
         u64 min_vruntime;
+ +#ifndef CONFIG_64BIT
+ +      u64 min_vruntime_copy;
+ +#endif
   
         struct rb_root tasks_timeline;
         struct rb_node *rb_leftmost;
@@@ -326,11 -327,9 +326,11 @@@
          * 'curr' points to currently running entity on this cfs_rq.
          * It is set to NULL otherwise (i.e when none are currently running).
          */
- -      struct sched_entity *curr, *next, *last;
+ +      struct sched_entity *curr, *next, *last, *skip;
   
+ +#ifdef        CONFIG_SCHED_DEBUG
         unsigned int nr_spread_over;
+ +#endif
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
@@@ -343,7 -342,6 +343,7 @@@
          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
          * list is used during load balance.
          */
+ +      int on_list;
         struct list_head leaf_cfs_rq_list;
         struct task_group *tg;  /* group that "owns" this runqueue */
   
@@@ -362,17 -360,14 +362,17 @@@
         unsigned long h_load;
   
         /*
- -       * this cpu's part of tg->shares
+ +       * Maintaining per-cpu shares distribution for group scheduling
+ +       *
+ +       * load_stamp is the last time we updated the load average
+ +       * load_last is the last time we updated the load average and saw load
+ +       * load_unacc_exec_time is currently unaccounted execution time
          */
- -      unsigned long shares;
+ +      u64 load_avg;
+ +      u64 load_period;
+ +      u64 load_stamp, load_last, load_unacc_exec_time;
   
- -      /*
- -       * load.weight at the time we set shares
- -       */
- -      unsigned long rq_weight;
+ +      unsigned long load_contribution;
   #endif
   #endif
   };
@@@ -422,7 -417,6 +422,7 @@@ struct rt_rq 
    */
   struct root_domain {
         atomic_t refcount;
+ +      struct rcu_head rcu;
         cpumask_var_t span;
         cpumask_var_t online;
   
@@@ -466,7 -460,7 +466,7 @@@ struct rq 
         u64 nohz_stamp;
         unsigned char nohz_balance_kick;
   #endif
- -      unsigned int skip_clock_update;
+ +      int skip_clock_update;
   
         /* capture load from *all* tasks on this cpu: */
         struct load_weight load;
@@@ -558,10 -552,9 +558,10 @@@
         /* try_to_wake_up() stats */
         unsigned int ttwu_count;
         unsigned int ttwu_local;
+ +#endif
   
- -      /* BKL stats */
- -      unsigned int bkl_count;
+ +#ifdef CONFIG_SMP
+ +      struct task_struct *wake_list;
   #endif
   };
   
@@@ -581,7 -574,7 +581,7 @@@ static inline int cpu_of(struct rq *rq
   
   #define rcu_dereference_check_sched_domain(p) \
         rcu_dereference_check((p), \
- -                            rcu_read_lock_sched_held() || \
+ +                            rcu_read_lock_held() || \
                               lockdep_is_held(&sched_domains_mutex))
   
   /*
@@@ -606,20 -599,17 +606,20 @@@
    * Return the group to which this tasks belongs.
    *
    * We use task_subsys_state_check() and extend the RCU verification
- - * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ + * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
    * holds that lock for each task it moves into the cgroup. Therefore
    * by holding that lock, we pin the task to the current cgroup.
    */
   static inline struct task_group *task_group(struct task_struct *p)
   {
+ +      struct task_group *tg;
         struct cgroup_subsys_state *css;
   
         css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- -                      lockdep_is_held(&task_rq(p)->lock));
- -      return container_of(css, struct task_group, css);
+ +                      lockdep_is_held(&p->pi_lock));
+ +      tg = container_of(css, struct task_group, css);
+ +
+ +      return autogroup_task_group(p, tg);
   }
   
   /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@@ -646,18 -636,22 +646,18 @@@ static inline struct task_group *task_g
   
   #endif /* CONFIG_CGROUP_SCHED */
   
- -static u64 irq_time_cpu(int cpu);
- -static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+ +static void update_rq_clock_task(struct rq *rq, s64 delta);
   
- -inline void update_rq_clock(struct rq *rq)
+ +static void update_rq_clock(struct rq *rq)
   {
- -      if (!rq->skip_clock_update) {
- -              int cpu = cpu_of(rq);
- -              u64 irq_time;
+ +      s64 delta;
   
- -              rq->clock = sched_clock_cpu(cpu);
- -              irq_time = irq_time_cpu(cpu);
- -              if (rq->clock - irq_time > rq->clock_task)
- -                      rq->clock_task = rq->clock - irq_time;
+ +      if (rq->skip_clock_update > 0)
+ +              return;
   
- -              sched_irq_time_avg_update(rq, irq_time);
- -      }
+ +      delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ +      rq->clock += delta;
+ +      update_rq_clock_task(rq, delta);
   }
   
   /*
@@@ -670,9 -664,10 +670,9 @@@
   #endif
   
   /**
- - * runqueue_is_locked
+ + * runqueue_is_locked - Returns true if the current cpu runqueue is locked
    * @cpu: the processor in question.
    *
- - * Returns true if the current cpu runqueue is locked.
    * This interface allows printk to be called with the runqueue lock
    * held and know whether or not it is OK to wake up the klogd.
    */
@@@ -746,7 -741,7 +746,7 @@@ sched_feat_write(struct file *filp, con
         buf[cnt] = 0;
         cmp = strstrip(buf);
   
- -      if (strncmp(buf, "NO_", 3) == 0) {
+ +      if (strncmp(cmp, "NO_", 3) == 0) {
                 neg = 1;
                 cmp += 3;
         }
@@@ -801,6 -796,20 +801,6 @@@ late_initcall(sched_init_debug)
    */
   const_debug unsigned int sysctl_sched_nr_migrate = 32;
   
- -/*
- - * ratelimit for updating the group shares.
- - * default: 0.25ms
- - */
- -unsigned int sysctl_sched_shares_ratelimit = 250000;
- -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
- -
- -/*
- - * Inject some fuzzyness into changing the per-cpu group shares
- - * this avoids remote rq-locks at the expense of fairness.
- - * default: 4
- - */
- -unsigned int sysctl_sched_shares_thresh = 4;
- -
   /*
    * period over which we average the RT time consumption, measured
    * in ms.
@@@ -848,39 -857,18 +848,39 @@@ static inline int task_current(struct r
         return rq->curr == p;
   }
   
- -#ifndef __ARCH_WANT_UNLOCKED_CTXSW
   static inline int task_running(struct rq *rq, struct task_struct *p)
   {
+ +#ifdef CONFIG_SMP
+ +      return p->on_cpu;
+ +#else
         return task_current(rq, p);
+ +#endif
   }
   
+ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
   static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
   {
+ +#ifdef CONFIG_SMP
+ +      /*
+ +       * We can optimise this out completely for !SMP, because the
+ +       * SMP rebalancing from interrupt is the only thing that cares
+ +       * here.
+ +       */
+ +      next->on_cpu = 1;
+ +#endif
   }
   
   static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
   {
+ +#ifdef CONFIG_SMP
+ +      /*
+ +       * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ +       * We must ensure this doesn't happen until the switch is completely
+ +       * finished.
+ +       */
+ +      smp_wmb();
+ +      prev->on_cpu = 0;
+ +#endif
   #ifdef CONFIG_DEBUG_SPINLOCK
         /* this is a valid case when another task releases the spinlock */
         rq->lock.owner = current;
@@@ -896,6 -884,15 +896,6 @@@
   }
   
   #else /* __ARCH_WANT_UNLOCKED_CTXSW */
- -static inline int task_running(struct rq *rq, struct task_struct *p)
- -{
- -#ifdef CONFIG_SMP
- -      return p->oncpu;
- -#else
- -      return task_current(rq, p);
- -#endif
- -}
- -
   static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
   {
   #ifdef CONFIG_SMP
@@@ -904,7 -901,7 +904,7 @@@
          * SMP rebalancing from interrupt is the only thing that cares
          * here.
          */
- -      next->oncpu = 1;
+ +      next->on_cpu = 1;
   #endif
   #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
         raw_spin_unlock_irq(&rq->lock);
@@@ -917,12 -914,12 +917,12 @@@ static inline void finish_lock_switch(s
   {
   #ifdef CONFIG_SMP
         /*
- -       * After ->oncpu is cleared, the task can be moved to a different CPU.
+ +       * After ->on_cpu is cleared, the task can be moved to a different CPU.
          * We must ensure this doesn't happen until the switch is completely
          * finished.
          */
         smp_wmb();
- -      prev->oncpu = 0;
+ +      prev->on_cpu = 0;
   #endif
   #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
         local_irq_enable();
@@@ -931,15 -928,23 +931,15 @@@
   #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
   
   /*
- - * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- - * against ttwu().
- - */
- -static inline int task_is_waking(struct task_struct *p)
- -{
- -      return unlikely(p->state == TASK_WAKING);
- -}
- -
- -/*
- - * __task_rq_lock - lock the runqueue a given task resides on.
- - * Must be called interrupts disabled.
+ + * __task_rq_lock - lock the rq @p resides on.
    */
   static inline struct rq *__task_rq_lock(struct task_struct *p)
         __acquires(rq->lock)
   {
         struct rq *rq;
   
+ +      lockdep_assert_held(&p->pi_lock);
+ +
         for (;;) {
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
@@@ -950,22 -955,22 +950,22 @@@
   }
   
   /*
- - * task_rq_lock - lock the runqueue a given task resides on and disable
- - * interrupts. Note the ordering: we can safely lookup the task_rq without
- - * explicitly disabling preemption.
+ + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
    */
   static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ +      __acquires(p->pi_lock)
         __acquires(rq->lock)
   {
         struct rq *rq;
   
         for (;;) {
- -              local_irq_save(*flags);
+ +              raw_spin_lock_irqsave(&p->pi_lock, *flags);
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
                 if (likely(rq == task_rq(p)))
                         return rq;
- -              raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ +              raw_spin_unlock(&rq->lock);
+ +              raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
         }
   }
   
@@@ -975,13 -980,10 +975,13 @@@ static void __task_rq_unlock(struct rq 
         raw_spin_unlock(&rq->lock);
   }
   
- -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+ +static inline void
+ +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
         __releases(rq->lock)
+ +      __releases(p->pi_lock)
   {
- -      raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ +      raw_spin_unlock(&rq->lock);
+ +      raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
   }
   
   /*
@@@ -1210,17 -1212,11 +1210,17 @@@ int get_nohz_timer_target(void
         int i;
         struct sched_domain *sd;
   
+ +      rcu_read_lock();
         for_each_domain(cpu, sd) {
- -              for_each_cpu(i, sched_domain_span(sd))
- -                      if (!idle_cpu(i))
- -                              return i;
+ +              for_each_cpu(i, sched_domain_span(sd)) {
+ +                      if (!idle_cpu(i)) {
+ +                              cpu = i;
+ +                              goto unlock;
+ +                      }
+ +              }
         }
+ +unlock:
+ +      rcu_read_unlock();
         return cpu;
   }
   /*
@@@ -1330,27 -1326,15 +1330,27 @@@ calc_delta_mine(unsigned long delta_exe
   {
         u64 tmp;
   
+ +      /*
+ +       * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+ +       * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+ +       * 2^SCHED_LOAD_RESOLUTION.
+ +       */
+ +      if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+ +              tmp = (u64)delta_exec * scale_load_down(weight);
+ +      else
+ +              tmp = (u64)delta_exec;
+ +
         if (!lw->inv_weight) {
- -              if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ +              unsigned long w = scale_load_down(lw->weight);
+ +
+ +              if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
                         lw->inv_weight = 1;
+ +              else if (unlikely(!w))
+ +                      lw->inv_weight = WMULT_CONST;
                 else
- -                      lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
- -                              / (lw->weight+1);
+ +                      lw->inv_weight = WMULT_CONST / w;
         }
   
- -      tmp = (u64)delta_exec * weight;
         /*
          * Check whether we'd overflow the 64-bit multiplication:
          */
@@@ -1375,12 -1359,6 +1375,12 @@@ static inline void update_load_sub(stru
         lw->inv_weight = 0;
   }
   
+ +static inline void update_load_set(struct load_weight *lw, unsigned long w)
+ +{
+ +      lw->weight = w;
+ +      lw->inv_weight = 0;
+ +}
+ +
   /*
    * To aid in avoiding the subversion of "niceness" due to uneven distribution
    * of tasks with abnormal "nice" values across CPUs the contribution that
@@@ -1569,6 -1547,101 +1569,6 @@@ static unsigned long cpu_avg_load_per_t
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   
- -static __read_mostly unsigned long __percpu *update_shares_data;
- -
- -static void __set_se_shares(struct sched_entity *se, unsigned long shares);
- -
- -/*
- - * Calculate and set the cpu's group shares.
- - */
- -static void update_group_shares_cpu(struct task_group *tg, int cpu,
- -                                  unsigned long sd_shares,
- -                                  unsigned long sd_rq_weight,
- -                                  unsigned long *usd_rq_weight)
- -{
- -      unsigned long shares, rq_weight;
- -      int boost = 0;
- -
- -      rq_weight = usd_rq_weight[cpu];
- -      if (!rq_weight) {
- -              boost = 1;
- -              rq_weight = NICE_0_LOAD;
- -      }
- -
- -      /*
- -       *             \Sum_j shares_j * rq_weight_i
- -       * shares_i =  -----------------------------
- -       *                  \Sum_j rq_weight_j
- -       */
- -      shares = (sd_shares * rq_weight) / sd_rq_weight;
- -      shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
- -
- -      if (abs(shares - tg->se[cpu]->load.weight) >
- -                      sysctl_sched_shares_thresh) {
- -              struct rq *rq = cpu_rq(cpu);
- -              unsigned long flags;
- -
- -              raw_spin_lock_irqsave(&rq->lock, flags);
- -              tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
- -              tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- -              __set_se_shares(tg->se[cpu], shares);
- -              raw_spin_unlock_irqrestore(&rq->lock, flags);
- -      }
- -}
- -
- -/*
- - * Re-compute the task group their per cpu shares over the given domain.
- - * This needs to be done in a bottom-up fashion because the rq weight of a
- - * parent group depends on the shares of its child groups.
- - */
- -static int tg_shares_up(struct task_group *tg, void *data)
- -{
- -      unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
- -      unsigned long *usd_rq_weight;
- -      struct sched_domain *sd = data;
- -      unsigned long flags;
- -      int i;
- -
- -      if (!tg->se[0])
- -              return 0;
- -
- -      local_irq_save(flags);
- -      usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
- -
- -      for_each_cpu(i, sched_domain_span(sd)) {
- -              weight = tg->cfs_rq[i]->load.weight;
- -              usd_rq_weight[i] = weight;
- -
- -              rq_weight += weight;
- -              /*
- -               * If there are currently no tasks on the cpu pretend there
- -               * is one of average load so that when a new task gets to
- -               * run here it will not get delayed by group starvation.
- -               */
- -              if (!weight)
- -                      weight = NICE_0_LOAD;
- -
- -              sum_weight += weight;
- -              shares += tg->cfs_rq[i]->shares;
- -      }
- -
- -      if (!rq_weight)
- -              rq_weight = sum_weight;
- -
- -      if ((!shares && rq_weight) || shares > tg->shares)
- -              shares = tg->shares;
- -
- -      if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
- -              shares = tg->shares;
- -
- -      for_each_cpu(i, sched_domain_span(sd))
- -              update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
- -
- -      local_irq_restore(flags);
- -
- -      return 0;
- -}
- -
   /*
    * Compute the cpu's hierarchical load factor for each task group.
    * This needs to be done in a top-down fashion because the load of a child
@@@ -1583,7 -1656,7 +1583,7 @@@ static int tg_load_down(struct task_gro
                 load = cpu_rq(cpu)->load.weight;
         } else {
                 load = tg->parent->cfs_rq[cpu]->h_load;
- -              load *= tg->cfs_rq[cpu]->shares;
+ +              load *= tg->se[cpu]->load.weight;
                 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
         }
   
@@@ -1592,11 -1665,34 +1592,11 @@@
         return 0;
   }
   
- -static void update_shares(struct sched_domain *sd)
- -{
- -      s64 elapsed;
- -      u64 now;
- -
- -      if (root_task_group_empty())
- -              return;
- -
- -      now = local_clock();
- -      elapsed = now - sd->last_update;
- -
- -      if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
- -              sd->last_update = now;
- -              walk_tg_tree(tg_nop, tg_shares_up, sd);
- -      }
- -}
- -
   static void update_h_load(long cpu)
   {
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
   }
   
- -#else
- -
- -static inline void update_shares(struct sched_domain *sd)
- -{
- -}
- -
   #endif
   
   #ifdef CONFIG_PREEMPT
@@@ -1716,39 -1812,15 +1716,39 @@@ static void double_rq_unlock(struct rq 
                 __release(rq2->lock);
   }
   
- -#endif
+ +#else /* CONFIG_SMP */
   
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+ +/*
+ + * double_rq_lock - safely lock two runqueues
+ + *
+ + * Note this does not disable interrupts like task_rq_lock,
+ + * you need to do so manually before calling.
+ + */
+ +static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ +      __acquires(rq1->lock)
+ +      __acquires(rq2->lock)
   {
- -#ifdef CONFIG_SMP
- -      cfs_rq->shares = shares;
- -#endif
+ +      BUG_ON(!irqs_disabled());
+ +      BUG_ON(rq1 != rq2);
+ +      raw_spin_lock(&rq1->lock);
+ +      __acquire(rq2->lock);   /* Fake it out ;) */
+ +}
+ +
+ +/*
+ + * double_rq_unlock - safely unlock two runqueues
+ + *
+ + * Note this does not restore interrupts like task_rq_unlock,
+ + * you need to do so manually after calling.
+ + */
+ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ +      __releases(rq1->lock)
+ +      __releases(rq2->lock)
+ +{
+ +      BUG_ON(rq1 != rq2);
+ +      raw_spin_unlock(&rq1->lock);
+ +      __release(rq2->lock);
   }
+ +
   #endif
   
   static void calc_load_account_idle(struct rq *this_rq);
@@@ -1790,20 -1862,17 +1790,20 @@@ static void dec_nr_running(struct rq *r
   
   static void set_load_weight(struct task_struct *p)
   {
+ +      int prio = p->static_prio - MAX_RT_PRIO;
+ +      struct load_weight *load = &p->se.load;
+ +
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
         if (p->policy == SCHED_IDLE) {
- -              p->se.load.weight = WEIGHT_IDLEPRIO;
- -              p->se.load.inv_weight = WMULT_IDLEPRIO;
+ +              load->weight = scale_load(WEIGHT_IDLEPRIO);
+ +              load->inv_weight = WMULT_IDLEPRIO;
                 return;
         }
   
- -      p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
- -      p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+ +      load->weight = scale_load(prio_to_weight[prio]);
+ +      load->inv_weight = prio_to_wmult[prio];
   }
   
   static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@@ -1811,6 -1880,7 +1811,6 @@@
         update_rq_clock(rq);
         sched_info_queued(p);
         p->sched_class->enqueue_task(rq, p, flags);
- -      p->se.on_rq = 1;
   }
   
   static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@@ -1818,6 -1888,7 +1818,6 @@@
         update_rq_clock(rq);
         sched_info_dequeued(p);
         p->sched_class->dequeue_task(rq, p, flags);
- -      p->se.on_rq = 0;
   }
   
   /*
@@@ -1853,9 -1924,10 +1853,9 @@@ static void deactivate_task(struct rq *
    * They are read and saved off onto struct rq in update_rq_clock().
    * This may result in other CPU reading this CPU's irq time and can
    * race with irq/account_system_vtime on this CPU. We would either get old
- - * or new value (or semi updated value on 32 bit) with a side effect of
- - * accounting a slice of irq time to wrong task when irq is in progress
- - * while we read rq->clock. That is a worthy compromise in place of having
- - * locks on each irq in account_system_time.
+ + * or new value with a side effect of accounting a slice of irq time to wrong
+ + * task when irq is in progress while we read rq->clock. That is a worthy
+ + * compromise in place of having locks on each irq in account_system_time.
    */
   static DEFINE_PER_CPU(u64, cpu_hardirq_time);
   static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@@ -1873,58 -1945,19 +1873,58 @@@ void disable_sched_clock_irqtime(void
         sched_clock_irqtime = 0;
   }
   
- -static u64 irq_time_cpu(int cpu)
+ +#ifndef CONFIG_64BIT
+ +static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+ +
+ +static inline void irq_time_write_begin(void)
+ +{
+ +      __this_cpu_inc(irq_time_seq.sequence);
+ +      smp_wmb();
+ +}
+ +
+ +static inline void irq_time_write_end(void)
   {
- -      if (!sched_clock_irqtime)
- -              return 0;
+ +      smp_wmb();
+ +      __this_cpu_inc(irq_time_seq.sequence);
+ +}
+ +
+ +static inline u64 irq_time_read(int cpu)
+ +{
+ +      u64 irq_time;
+ +      unsigned seq;
+ +
+ +      do {
+ +              seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ +              irq_time = per_cpu(cpu_softirq_time, cpu) +
+ +                         per_cpu(cpu_hardirq_time, cpu);
+ +      } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+ +
+ +      return irq_time;
+ +}
+ +#else /* CONFIG_64BIT */
+ +static inline void irq_time_write_begin(void)
+ +{
+ +}
+ +
+ +static inline void irq_time_write_end(void)
+ +{
+ +}
   
+ +static inline u64 irq_time_read(int cpu)
+ +{
         return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
   }
+ +#endif /* CONFIG_64BIT */
   
+ +/*
+ + * Called before incrementing preempt_count on {soft,}irq_enter
+ + * and before decrementing preempt_count on {soft,}irq_exit.
+ + */
   void account_system_vtime(struct task_struct *curr)
   {
         unsigned long flags;
+ +      s64 delta;
         int cpu;
- -      u64 now, delta;
   
         if (!sched_clock_irqtime)
                 return;
@@@ -1932,10 -1965,9 +1932,10 @@@
         local_irq_save(flags);
   
         cpu = smp_processor_id();
- -      now = sched_clock_cpu(cpu);
- -      delta = now - per_cpu(irq_start_time, cpu);
- -      per_cpu(irq_start_time, cpu) = now;
+ +      delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ +      __this_cpu_add(irq_start_time, delta);
+ +
+ +      irq_time_write_begin();
         /*
          * We do not account for softirq time from ksoftirqd here.
          * We want to continue accounting softirq time to ksoftirqd thread
@@@ -1943,92 -1975,37 +1943,92 @@@
          * that do not consume any time, but still wants to run.
          */
         if (hardirq_count())
- -              per_cpu(cpu_hardirq_time, cpu) += delta;
- -      else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
- -              per_cpu(cpu_softirq_time, cpu) += delta;
+ +              __this_cpu_add(cpu_hardirq_time, delta);
+ +      else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ +              __this_cpu_add(cpu_softirq_time, delta);
   
+ +      irq_time_write_end();
         local_irq_restore(flags);
   }
   EXPORT_SYMBOL_GPL(account_system_vtime);
   
- -static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+ +static void update_rq_clock_task(struct rq *rq, s64 delta)
   {
- -      if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
- -              u64 delta_irq = curr_irq_time - rq->prev_irq_time;
- -              rq->prev_irq_time = curr_irq_time;
- -              sched_rt_avg_update(rq, delta_irq);
- -      }
+ +      s64 irq_delta;
+ +
+ +      irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ +
+ +      /*
+ +       * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ +       * this case when a previous update_rq_clock() happened inside a
+ +       * {soft,}irq region.
+ +       *
+ +       * When this happens, we stop ->clock_task and only update the
+ +       * prev_irq_time stamp to account for the part that fit, so that a next
+ +       * update will consume the rest. This ensures ->clock_task is
+ +       * monotonic.
+ +       *
+ +       * It does however cause some slight miss-attribution of {soft,}irq
+ +       * time, a more accurate solution would be to update the irq_time using
+ +       * the current rq->clock timestamp, except that would require using
+ +       * atomic ops.
+ +       */
+ +      if (irq_delta > delta)
+ +              irq_delta = delta;
+ +
+ +      rq->prev_irq_time += irq_delta;
+ +      delta -= irq_delta;
+ +      rq->clock_task += delta;
+ +
+ +      if (irq_delta && sched_feat(NONIRQ_POWER))
+ +              sched_rt_avg_update(rq, irq_delta);
   }
   
- -#else
+ +static int irqtime_account_hi_update(void)
+ +{
+ +      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ +      unsigned long flags;
+ +      u64 latest_ns;
+ +      int ret = 0;
+ +
+ +      local_irq_save(flags);
+ +      latest_ns = this_cpu_read(cpu_hardirq_time);
+ +      if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ +              ret = 1;
+ +      local_irq_restore(flags);
+ +      return ret;
+ +}
   
- -static u64 irq_time_cpu(int cpu)
+ +static int irqtime_account_si_update(void)
   {
- -      return 0;
+ +      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ +      unsigned long flags;
+ +      u64 latest_ns;
+ +      int ret = 0;
+ +
+ +      local_irq_save(flags);
+ +      latest_ns = this_cpu_read(cpu_softirq_time);
+ +      if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ +              ret = 1;
+ +      local_irq_restore(flags);
+ +      return ret;
   }
   
- -static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+ +#else /* CONFIG_IRQ_TIME_ACCOUNTING */
   
- -#endif
+ +#define sched_clock_irqtime   (0)
+ +
+ +static void update_rq_clock_task(struct rq *rq, s64 delta)
+ +{
+ +      rq->clock_task += delta;
+ +}
+ +
+ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
   
   #include "sched_idletask.c"
   #include "sched_fair.c"
   #include "sched_rt.c"
+ +#include "sched_autogroup.c"
   #include "sched_stoptask.c"
   #ifdef CONFIG_SCHED_DEBUG
   # include "sched_debug.c"
@@@ -2121,14 -2098,14 +2121,14 @@@ inline int task_curr(const struct task_
   
   static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                        const struct sched_class *prev_class,
- -                                     int oldprio, int running)
+ +                                     int oldprio)
   {
         if (prev_class != p->sched_class) {
                 if (prev_class->switched_from)
- -                      prev_class->switched_from(rq, p, running);
- -              p->sched_class->switched_to(rq, p, running);
- -      } else
- -              p->sched_class->prio_changed(rq, p, oldprio, running);
+ +                      prev_class->switched_from(rq, p);
+ +              p->sched_class->switched_to(rq, p);
+ +      } else if (oldprio != p->prio)
+ +              p->sched_class->prio_changed(rq, p, oldprio);
   }
   
   static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@@ -2152,7 -2129,7 +2152,7 @@@
          * A queue event has occurred, and we're going to schedule.  In
          * this case, we can save a useless back to back clock update.
          */
- -      if (test_tsk_need_resched(rq->curr))
+ +      if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
                 rq->skip_clock_update = 1;
   }
   
@@@ -2198,11 -2175,6 +2198,11 @@@ void set_task_cpu(struct task_struct *p
          */
         WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
                         !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+ +
+ +#ifdef CONFIG_LOCKDEP
+ +      WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ +                                    lockdep_is_held(&task_rq(p)->lock)));
+ +#endif
   #endif
   
         trace_sched_migrate_task(p, new_cpu);
@@@ -2222,6 -2194,21 +2222,6 @@@ struct migration_arg 
   
   static int migration_cpu_stop(void *data);
   
- -/*
- - * The task's runqueue lock must be held.
- - * Returns true if you have to wait for migration thread.
- - */
- -static bool migrate_task(struct task_struct *p, int dest_cpu)
- -{
- -      struct rq *rq = task_rq(p);
- -
- -      /*
- -       * If the task is not on a runqueue (and not running), then
- -       * the next wake-up will properly place the task.
- -       */
- -      return p->se.on_rq || task_running(rq, p);
- -}
- -
   /*
    * wait_task_inactive - wait for a thread to unschedule.
    *
@@@ -2279,11 -2266,11 +2279,11 @@@ unsigned long wait_task_inactive(struc
                 rq = task_rq_lock(p, &flags);
                 trace_sched_wait_task(p);
                 running = task_running(rq, p);
- -              on_rq = p->se.on_rq;
+ +              on_rq = p->on_rq;
                 ncsw = 0;
                 if (!match_state || p->state == match_state)
                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- -              task_rq_unlock(rq, &flags);
+ +              task_rq_unlock(rq, p, &flags);
   
                 /*
                  * If it changed from the expected state, bail out now.
@@@ -2312,10 -2299,7 +2312,10 @@@
                  * yield - it could be a while.
                  */
                 if (unlikely(on_rq)) {
- -                      schedule_timeout_uninterruptible(1);
+ +                      ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+ +
+ +                      set_current_state(TASK_UNINTERRUPTIBLE);
+ +                      schedule_hrtimeout(&to, HRTIMER_MODE_REL);
                         continue;
                 }
   
@@@ -2337,7 -2321,7 +2337,7 @@@
    * Cause a process which is running on another CPU to enter
    * kernel-mode, without any delay. (to get signals handled.)
    *
- - * NOTE: this function doesnt have to take the runqueue lock,
+ + * NOTE: this function doesn't have to take the runqueue lock,
    * because all it wants to ensure is that the remote task enters
    * the kernel. If the IPI races and the task has been migrated
    * to another CPU then no harm is done and the purpose has been
@@@ -2356,9 -2340,30 +2356,9 @@@ void kick_process(struct task_struct *p
   EXPORT_SYMBOL_GPL(kick_process);
   #endif /* CONFIG_SMP */
   
- -/**
- - * task_oncpu_function_call - call a function on the cpu on which a task runs
- - * @p:                the task to evaluate
- - * @func:     the function to be called
- - * @info:     the function call argument
- - *
- - * Calls the function @func when the task is currently running. This might
- - * be on the current CPU, which just calls the function directly
- - */
- -void task_oncpu_function_call(struct task_struct *p,
- -                            void (*func) (void *info), void *info)
- -{
- -      int cpu;
- -
- -      preempt_disable();
- -      cpu = task_cpu(p);
- -      if (task_curr(p))
- -              smp_call_function_single(cpu, func, info, 1);
- -      preempt_enable();
- -}
- -
   #ifdef CONFIG_SMP
   /*
- - * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ + * ->cpus_allowed is protected by both rq->lock and p->pi_lock
    */
   static int select_fallback_rq(int cpu, struct task_struct *p)
   {
@@@ -2376,27 -2381,30 +2376,27 @@@
                 return dest_cpu;
   
         /* No more Mr. Nice Guy. */
- -      if (unlikely(dest_cpu >= nr_cpu_ids)) {
- -              dest_cpu = cpuset_cpus_allowed_fallback(p);
- -              /*
- -               * Don't tell them about moving exiting tasks or
- -               * kernel threads (both mm NULL), since they never
- -               * leave kernel.
- -               */
- -              if (p->mm && printk_ratelimit()) {
- -                      printk(KERN_INFO "process %d (%s) no "
- -                             "longer affine to cpu%d\n",
- -                             task_pid_nr(p), p->comm, cpu);
- -              }
+ +      dest_cpu = cpuset_cpus_allowed_fallback(p);
+ +      /*
+ +       * Don't tell them about moving exiting tasks or
+ +       * kernel threads (both mm NULL), since they never
+ +       * leave kernel.
+ +       */
+ +      if (p->mm && printk_ratelimit()) {
+ +              printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+ +                              task_pid_nr(p), p->comm, cpu);
         }
   
         return dest_cpu;
   }
   
   /*
- - * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
    */
   static inline
- -int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+ +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
   {
- -      int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+ +      int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
   
         /*
          * In order not to call set_task_cpu() on a blocking task we need
@@@ -2422,63 -2430,27 +2422,63 @@@ static void update_avg(u64 *avg, u64 sa
   }
   #endif
   
- -static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
- -                               bool is_sync, bool is_migrate, bool is_local,
- -                               unsigned long en_flags)
+ +static void
+ +ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
   {
- -      schedstat_inc(p, se.statistics.nr_wakeups);
- -      if (is_sync)
- -              schedstat_inc(p, se.statistics.nr_wakeups_sync);
- -      if (is_migrate)
- -              schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- -      if (is_local)
+ +#ifdef CONFIG_SCHEDSTATS
+ +      struct rq *rq = this_rq();
+ +
+ +#ifdef CONFIG_SMP
+ +      int this_cpu = smp_processor_id();
+ +
+ +      if (cpu == this_cpu) {
+ +              schedstat_inc(rq, ttwu_local);
                 schedstat_inc(p, se.statistics.nr_wakeups_local);
- -      else
+ +      } else {
+ +              struct sched_domain *sd;
+ +
                 schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ +              rcu_read_lock();
+ +              for_each_domain(this_cpu, sd) {
+ +                      if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ +                              schedstat_inc(sd, ttwu_wake_remote);
+ +                              break;
+ +                      }
+ +              }
+ +              rcu_read_unlock();
+ +      }
+ +
+ +      if (wake_flags & WF_MIGRATED)
+ +              schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+ +
+ +#endif /* CONFIG_SMP */
+ +
+ +      schedstat_inc(rq, ttwu_count);
+ +      schedstat_inc(p, se.statistics.nr_wakeups);
+ +
+ +      if (wake_flags & WF_SYNC)
+ +              schedstat_inc(p, se.statistics.nr_wakeups_sync);
   
+ +#endif /* CONFIG_SCHEDSTATS */
+ +}
+ +
+ +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+ +{
         activate_task(rq, p, en_flags);
+ +      p->on_rq = 1;
+ +
+ +      /* if a worker is waking up, notify workqueue */
+ +      if (p->flags & PF_WQ_WORKER)
+ +              wq_worker_waking_up(p, cpu_of(rq));
   }
   
- -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
- -                                      int wake_flags, bool success)
+ +/*
+ + * Mark the task runnable and perform wakeup-preemption.
+ + */
+ +static void
+ +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
   {
- -      trace_sched_wakeup(p, success);
+ +      trace_sched_wakeup(p, true);
         check_preempt_curr(rq, p, wake_flags);
   
         p->state = TASK_RUNNING;
@@@ -2486,7 -2458,7 +2486,7 @@@
         if (p->sched_class->task_woken)
                 p->sched_class->task_woken(rq, p);
   
-       if (unlikely(rq->idle_stamp)) {
+       if (rq->idle_stamp) {
                 u64 delta = rq->clock - rq->idle_stamp;
                 u64 max = 2*sysctl_sched_migration_cost;
   
@@@ -2497,119 -2469,9 +2497,119 @@@
                 rq->idle_stamp = 0;
         }
   #endif
- -      /* if a worker is waking up, notify workqueue */
- -      if ((p->flags & PF_WQ_WORKER) && success)
- -              wq_worker_waking_up(p, cpu_of(rq));
+ +}
+ +
+ +static void
+ +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ +{
+ +#ifdef CONFIG_SMP
+ +      if (p->sched_contributes_to_load)
+ +              rq->nr_uninterruptible--;
+ +#endif
+ +
+ +      ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ +      ttwu_do_wakeup(rq, p, wake_flags);
+ +}
+ +
+ +/*
+ + * Called in case the task @p isn't fully descheduled from its runqueue,
+ + * in this case we must do a remote wakeup. It's a 'light' wakeup though,
+ + * since all we need to do is flip p->state to TASK_RUNNING, since
+ + * the task is still ->on_rq.
+ + */
+ +static int ttwu_remote(struct task_struct *p, int wake_flags)
+ +{
+ +      struct rq *rq;
+ +      int ret = 0;
+ +
+ +      rq = __task_rq_lock(p);
+ +      if (p->on_rq) {
+ +              ttwu_do_wakeup(rq, p, wake_flags);
+ +              ret = 1;
+ +      }
+ +      __task_rq_unlock(rq);
+ +
+ +      return ret;
+ +}
+ +
+ +#ifdef CONFIG_SMP
+ +static void sched_ttwu_pending(void)
+ +{
+ +      struct rq *rq = this_rq();
+ +      struct task_struct *list = xchg(&rq->wake_list, NULL);
+ +
+ +      if (!list)
+ +              return;
+ +
+ +      raw_spin_lock(&rq->lock);
+ +
+ +      while (list) {
+ +              struct task_struct *p = list;
+ +              list = list->wake_entry;
+ +              ttwu_do_activate(rq, p, 0);
+ +      }
+ +
+ +      raw_spin_unlock(&rq->lock);
+ +}
+ +
+ +void scheduler_ipi(void)
+ +{
+ +      sched_ttwu_pending();
+ +}
+ +
+ +static void ttwu_queue_remote(struct task_struct *p, int cpu)
+ +{
+ +      struct rq *rq = cpu_rq(cpu);
+ +      struct task_struct *next = rq->wake_list;
+ +
+ +      for (;;) {
+ +              struct task_struct *old = next;
+ +
+ +              p->wake_entry = next;
+ +              next = cmpxchg(&rq->wake_list, old, p);
+ +              if (next == old)
+ +                      break;
+ +      }
+ +
+ +      if (!next)
+ +              smp_send_reschedule(cpu);
+ +}
+ +
+ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ +static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+ +{
+ +      struct rq *rq;
+ +      int ret = 0;
+ +
+ +      rq = __task_rq_lock(p);
+ +      if (p->on_cpu) {
+ +              ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ +              ttwu_do_wakeup(rq, p, wake_flags);
+ +              ret = 1;
+ +      }
+ +      __task_rq_unlock(rq);
+ +
+ +      return ret;
+ +
+ +}
+ +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+ +#endif /* CONFIG_SMP */
+ +
+ +static void ttwu_queue(struct task_struct *p, int cpu)
+ +{
+ +      struct rq *rq = cpu_rq(cpu);
+ +
+ +#if defined(CONFIG_SMP)
+ +      if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ +              sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ +              ttwu_queue_remote(p, cpu);
+ +              return;
+ +      }
+ +#endif
+ +
+ +      raw_spin_lock(&rq->lock);
+ +      ttwu_do_activate(rq, p, 0);
+ +      raw_spin_unlock(&rq->lock);
   }
   
   /**
@@@ -2627,66 -2489,92 +2627,66 @@@
    * Returns %true if @p was woken up, %false if it was already running
    * or @state didn't match @p's state.
    */
- -static int try_to_wake_up(struct task_struct *p, unsigned int state,
- -                        int wake_flags)
+ +static int
+ +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
   {
- -      int cpu, orig_cpu, this_cpu, success = 0;
         unsigned long flags;
- -      unsigned long en_flags = ENQUEUE_WAKEUP;
- -      struct rq *rq;
- -
- -      this_cpu = get_cpu();
+ +      int cpu, success = 0;
   
         smp_wmb();
- -      rq = task_rq_lock(p, &flags);
+ +      raw_spin_lock_irqsave(&p->pi_lock, flags);
         if (!(p->state & state))
                 goto out;
   
- -      if (p->se.on_rq)
- -              goto out_running;
- -
+ +      success = 1; /* we're going to change ->state */
         cpu = task_cpu(p);
- -      orig_cpu = cpu;
   
- -#ifdef CONFIG_SMP
- -      if (unlikely(task_running(rq, p)))
- -              goto out_activate;
+ +      if (p->on_rq && ttwu_remote(p, wake_flags))
+ +              goto stat;
   
+ +#ifdef CONFIG_SMP
         /*
- -       * In order to handle concurrent wakeups and release the rq->lock
- -       * we put the task in TASK_WAKING state.
- -       *
- -       * First fix up the nr_uninterruptible count:
+ +       * If the owning (remote) cpu is still in the middle of schedule() with
+ +       * this task as prev, wait until it's done referencing the task.
          */
- -      if (task_contributes_to_load(p)) {
- -              if (likely(cpu_online(orig_cpu)))
- -                      rq->nr_uninterruptible--;
- -              else
- -                      this_rq()->nr_uninterruptible--;
+ +      while (p->on_cpu) {
+ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ +              /*
+ +               * In case the architecture enables interrupts in
+ +               * context_switch(), we cannot busy wait, since that
+ +               * would lead to deadlocks when an interrupt hits and
+ +               * tries to wake up @prev. So bail and do a complete
+ +               * remote wakeup.
+ +               */
+ +              if (ttwu_activate_remote(p, wake_flags))
+ +                      goto stat;
+ +#else
+ +              cpu_relax();
+ +#endif
         }
+ +      /*
+ +       * Pairs with the smp_wmb() in finish_lock_switch().
+ +       */
+ +      smp_rmb();
+ +
+ +      p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
   
- -      if (p->sched_class->task_waking) {
- -              p->sched_class->task_waking(rq, p);
- -              en_flags |= ENQUEUE_WAKING;
- -      }
+ +      if (p->sched_class->task_waking)
+ +              p->sched_class->task_waking(p);
   
- -      cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
- -      if (cpu != orig_cpu)
+ +      cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+ +      if (task_cpu(p) != cpu) {
+ +              wake_flags |= WF_MIGRATED;
                 set_task_cpu(p, cpu);
- -      __task_rq_unlock(rq);
- -
- -      rq = cpu_rq(cpu);
- -      raw_spin_lock(&rq->lock);
- -
- -      /*
- -       * We migrated the task without holding either rq->lock, however
- -       * since the task is not on the task list itself, nobody else
- -       * will try and migrate the task, hence the rq should match the
- -       * cpu we just moved it to.
- -       */
- -      WARN_ON(task_cpu(p) != cpu);
- -      WARN_ON(p->state != TASK_WAKING);
- -
- -#ifdef CONFIG_SCHEDSTATS
- -      schedstat_inc(rq, ttwu_count);
- -      if (cpu == this_cpu)
- -              schedstat_inc(rq, ttwu_local);
- -      else {
- -              struct sched_domain *sd;
- -              for_each_domain(this_cpu, sd) {
- -                      if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- -                              schedstat_inc(sd, ttwu_wake_remote);
- -                              break;
- -                      }
- -              }
         }
- -#endif /* CONFIG_SCHEDSTATS */
- -
- -out_activate:
   #endif /* CONFIG_SMP */
- -      ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
- -                    cpu == this_cpu, en_flags);
- -      success = 1;
- -out_running:
- -      ttwu_post_activation(p, rq, wake_flags, success);
+ +
+ +      ttwu_queue(p, cpu);
+ +stat:
+ +      ttwu_stat(p, cpu, wake_flags);
   out:
- -      task_rq_unlock(rq, &flags);
- -      put_cpu();
+ +      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
         return success;
   }
@@@ -2695,34 -2583,31 +2695,34 @@@
    * try_to_wake_up_local - try to wake up a local task with rq lock held
    * @p: the thread to be awakened
    *
- - * Put @p on the run-queue if it's not alredy there.  The caller must
+ + * Put @p on the run-queue if it's not already there. The caller must
    * ensure that this_rq() is locked, @p is bound to this_rq() and not
- - * the current task.  this_rq() stays locked over invocation.
+ + * the current task.
    */
   static void try_to_wake_up_local(struct task_struct *p)
   {
         struct rq *rq = task_rq(p);
- -      bool success = false;
   
         BUG_ON(rq != this_rq());
         BUG_ON(p == current);
         lockdep_assert_held(&rq->lock);
   
+ +      if (!raw_spin_trylock(&p->pi_lock)) {
+ +              raw_spin_unlock(&rq->lock);
+ +              raw_spin_lock(&p->pi_lock);
+ +              raw_spin_lock(&rq->lock);
+ +      }
+ +
         if (!(p->state & TASK_NORMAL))
- -              return;
+ +              goto out;
   
- -      if (!p->se.on_rq) {
- -              if (likely(!task_running(rq, p))) {
- -                      schedstat_inc(rq, ttwu_count);
- -                      schedstat_inc(rq, ttwu_local);
- -              }
- -              ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
- -              success = true;
- -      }
- -      ttwu_post_activation(p, rq, 0, success);
+ +      if (!p->on_rq)
+ +              ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ +
+ +      ttwu_do_wakeup(rq, p, 0);
+ +      ttwu_stat(p, smp_processor_id(), 0);
+ +out:
+ +      raw_spin_unlock(&p->pi_lock);
   }
   
   /**
@@@ -2755,21 -2640,18 +2755,21 @@@ int wake_up_state(struct task_struct *p
    */
   static void __sched_fork(struct task_struct *p)
   {
+ +      p->on_rq                        = 0;
+ +
+ +      p->se.on_rq                     = 0;
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
         p->se.nr_migrations             = 0;
+ +      p->se.vruntime                  = 0;
+ +      INIT_LIST_HEAD(&p->se.group_node);
   
   #ifdef CONFIG_SCHEDSTATS
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
   #endif
   
         INIT_LIST_HEAD(&p->rt.run_list);
- -      p->se.on_rq = 0;
- -      INIT_LIST_HEAD(&p->se.group_node);
   
   #ifdef CONFIG_PREEMPT_NOTIFIERS
         INIT_HLIST_HEAD(&p->preempt_notifiers);
@@@ -2779,9 -2661,8 +2779,9 @@@
   /*
    * fork()/clone()-time setup:
    */
- -void sched_fork(struct task_struct *p, int clone_flags)
+ +void sched_fork(struct task_struct *p)
   {
+ +      unsigned long flags;
         int cpu = get_cpu();
   
         __sched_fork(p);
@@@ -2832,24 -2713,22 +2832,24 @@@
          *
          * Silence PROVE_RCU.
          */
- -      rcu_read_lock();
+ +      raw_spin_lock_irqsave(&p->pi_lock, flags);
         set_task_cpu(p, cpu);
- -      rcu_read_unlock();
+ +      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
   #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (likely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
   #endif
- -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- -      p->oncpu = 0;
+ +#if defined(CONFIG_SMP)
+ +      p->on_cpu = 0;
   #endif
   #ifdef CONFIG_PREEMPT
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
   #endif
+ +#ifdef CONFIG_SMP
         plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ +#endif
   
         put_cpu();
   }
@@@ -2861,31 -2740,41 +2861,31 @@@
    * that must be done for every newly created context, then puts the task
    * on the runqueue and wakes it.
    */
- -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+ +void wake_up_new_task(struct task_struct *p)
   {
         unsigned long flags;
         struct rq *rq;
- -      int cpu __maybe_unused = get_cpu();
   
+ +      raw_spin_lock_irqsave(&p->pi_lock, flags);
   #ifdef CONFIG_SMP
- -      rq = task_rq_lock(p, &flags);
- -      p->state = TASK_WAKING;
- -
         /*
          * Fork balancing, do it here and not earlier because:
          *  - cpus_allowed can change in the fork path
          *  - any previously selected cpu might disappear through hotplug
- -       *
- -       * We set TASK_WAKING so that select_task_rq() can drop rq->lock
- -       * without people poking at ->cpus_allowed.
          */
- -      cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
- -      set_task_cpu(p, cpu);
- -
- -      p->state = TASK_RUNNING;
- -      task_rq_unlock(rq, &flags);
+ +      set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
   #endif
   
- -      rq = task_rq_lock(p, &flags);
+ +      rq = __task_rq_lock(p);
         activate_task(rq, p, 0);
- -      trace_sched_wakeup_new(p, 1);
+ +      p->on_rq = 1;
+ +      trace_sched_wakeup_new(p, true);
         check_preempt_curr(rq, p, WF_FORK);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_woken)
                 p->sched_class->task_woken(rq, p);
   #endif
- -      task_rq_unlock(rq, &flags);
- -      put_cpu();
+ +      task_rq_unlock(rq, p, &flags);
   }
   
   #ifdef CONFIG_PREEMPT_NOTIFIERS
@@@ -2963,12 -2852,9 +2963,12 @@@ static inline voi
   prepare_task_switch(struct rq *rq, struct task_struct *prev,
                     struct task_struct *next)
   {
+ +      sched_info_switch(prev, next);
+ +      perf_event_task_sched_out(prev, next);
         fire_sched_out_preempt_notifiers(prev, next);
         prepare_lock_switch(rq, next);
         prepare_arch_switch(next);
+ +      trace_sched_switch(prev, next);
   }
   
   /**
@@@ -3101,7 -2987,7 +3101,7 @@@ context_switch(struct rq *rq, struct ta
         struct mm_struct *mm, *oldmm;
   
         prepare_task_switch(rq, prev, next);
- -      trace_sched_switch(prev, next);
+ +
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@@ -3233,15 -3119,6 +3233,15 @@@ static long calc_load_fold_active(struc
         return delta;
   }
   
+ +static unsigned long
+ +calc_load(unsigned long load, unsigned long exp, unsigned long active)
+ +{
+ +      load *= exp;
+ +      load += active * (FIXED_1 - exp);
+ +      load += 1UL << (FSHIFT - 1);
+ +      return load >> FSHIFT;
+ +}
+ +
   #ifdef CONFIG_NO_HZ
   /*
    * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@@ -3271,128 -3148,6 +3271,128 @@@ static long calc_load_fold_idle(void
   
         return delta;
   }
+ +
+ +/**
+ + * fixed_power_int - compute: x^n, in O(log n) time
+ + *
+ + * @x:         base of the power
+ + * @frac_bits: fractional bits of @x
+ + * @n:         power to raise @x to.
+ + *
+ + * By exploiting the relation between the definition of the natural power
+ + * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ + * (where: n_i \elem {0, 1}, the binary vector representing n),
+ + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ + * of course trivially computable in O(log_2 n), the length of our binary
+ + * vector.
+ + */
+ +static unsigned long
+ +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+ +{
+ +      unsigned long result = 1UL << frac_bits;
+ +
+ +      if (n) for (;;) {
+ +              if (n & 1) {
+ +                      result *= x;
+ +                      result += 1UL << (frac_bits - 1);
+ +                      result >>= frac_bits;
+ +              }
+ +              n >>= 1;
+ +              if (!n)
+ +                      break;
+ +              x *= x;
+ +              x += 1UL << (frac_bits - 1);
+ +              x >>= frac_bits;
+ +      }
+ +
+ +      return result;
+ +}
+ +
+ +/*
+ + * a1 = a0 * e + a * (1 - e)
+ + *
+ + * a2 = a1 * e + a * (1 - e)
+ + *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ + *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ + *
+ + * a3 = a2 * e + a * (1 - e)
+ + *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ + *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ + *
+ + *  ...
+ + *
+ + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
+ + *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ + *    = a0 * e^n + a * (1 - e^n)
+ + *
+ + * [1] application of the geometric series:
+ + *
+ + *              n         1 - x^(n+1)
+ + *     S_n := \Sum x^i = -------------
+ + *             i=0          1 - x
+ + */
+ +static unsigned long
+ +calc_load_n(unsigned long load, unsigned long exp,
+ +          unsigned long active, unsigned int n)
+ +{
+ +
+ +      return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+ +}
+ +
+ +/*
+ + * NO_HZ can leave us missing all per-cpu ticks calling
+ + * calc_load_account_active(), but since an idle CPU folds its delta into
+ + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ + * in the pending idle delta if our idle period crossed a load cycle boundary.
+ + *
+ + * Once we've updated the global active value, we need to apply the exponential
+ + * weights adjusted to the number of cycles missed.
+ + */
+ +static void calc_global_nohz(unsigned long ticks)
+ +{
+ +      long delta, active, n;
+ +
+ +      if (time_before(jiffies, calc_load_update))
+ +              return;
+ +
+ +      /*
+ +       * If we crossed a calc_load_update boundary, make sure to fold
+ +       * any pending idle changes, the respective CPUs might have
+ +       * missed the tick driven calc_load_account_active() update
+ +       * due to NO_HZ.
+ +       */
+ +      delta = calc_load_fold_idle();
+ +      if (delta)
+ +              atomic_long_add(delta, &calc_load_tasks);
+ +
+ +      /*
+ +       * If we were idle for multiple load cycles, apply them.
+ +       */
+ +      if (ticks >= LOAD_FREQ) {
+ +              n = ticks / LOAD_FREQ;
+ +
+ +              active = atomic_long_read(&calc_load_tasks);
+ +              active = active > 0 ? active * FIXED_1 : 0;
+ +
+ +              avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ +              avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ +              avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ +
+ +              calc_load_update += n * LOAD_FREQ;
+ +      }
+ +
+ +      /*
+ +       * It's possible the remainder of the above division also crosses
+ +       * a LOAD_FREQ period, the regular check in calc_global_load()
+ +       * which comes after this will take care of that.
+ +       *
+ +       * Consider us being 11 ticks before a cycle completion, and us
+ +       * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+ +       * age us 4 cycles, and the test in calc_global_load() will
+ +       * pick up the final one.
+ +       */
+ +}
   #else
   static void calc_load_account_idle(struct rq *this_rq)
   {
@@@ -3402,10 -3157,6 +3402,10 @@@ static inline long calc_load_fold_idle(
   {
         return 0;
   }
+ +
+ +static void calc_global_nohz(unsigned long ticks)
+ +{
+ +}
   #endif
   
   /**
@@@ -3423,17 -3174,24 +3423,17 @@@ void get_avenrun(unsigned long *loads, 
         loads[2] = (avenrun[2] + offset) << shift;
   }
   
- -static unsigned long
- -calc_load(unsigned long load, unsigned long exp, unsigned long active)
- -{
- -      load *= exp;
- -      load += active * (FIXED_1 - exp);
- -      return load >> FSHIFT;
- -}
- -
   /*
    * calc_load - update the avenrun load estimates 10 ticks after the
    * CPUs have updated calc_load_tasks.
    */
- -void calc_global_load(void)
+ +void calc_global_load(unsigned long ticks)
   {
- -      unsigned long upd = calc_load_update + 10;
         long active;
   
- -      if (time_before(jiffies, upd))
+ +      calc_global_nohz(ticks);
+ +
+ +      if (time_before(jiffies, calc_load_update + 10))
                 return;
   
         active = atomic_long_read(&calc_load_tasks);
@@@ -3594,22 -3352,27 +3594,22 @@@ void sched_exec(void
   {
         struct task_struct *p = current;
         unsigned long flags;
- -      struct rq *rq;
         int dest_cpu;
   
- -      rq = task_rq_lock(p, &flags);
- -      dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+ +      raw_spin_lock_irqsave(&p->pi_lock, flags);
+ +      dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
         if (dest_cpu == smp_processor_id())
                 goto unlock;
   
- -      /*
- -       * select_task_rq() can race against ->cpus_allowed
- -       */
- -      if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- -          likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+ +      if (likely(cpu_active(dest_cpu))) {
                 struct migration_arg arg = { p, dest_cpu };
   
- -              task_rq_unlock(rq, &flags);
- -              stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ +              raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ +              stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
                 return;
         }
   unlock:
- -      task_rq_unlock(rq, &flags);
+ +      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   }
   
   #endif
@@@ -3646,7 -3409,7 +3646,7 @@@ unsigned long long task_delta_exec(stru
   
         rq = task_rq_lock(p, &flags);
         ns = do_task_delta_exec(p, rq);
- -      task_rq_unlock(rq, &flags);
+ +      task_rq_unlock(rq, p, &flags);
   
         return ns;
   }
@@@ -3664,7 -3427,7 +3664,7 @@@ unsigned long long task_sched_runtime(s
   
         rq = task_rq_lock(p, &flags);
         ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
- -      task_rq_unlock(rq, &flags);
+ +      task_rq_unlock(rq, p, &flags);
   
         return ns;
   }
@@@ -3688,7 -3451,7 +3688,7 @@@ unsigned long long thread_group_sched_r
         rq = task_rq_lock(p, &flags);
         thread_group_cputime(p, &totals);
         ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
- -      task_rq_unlock(rq, &flags);
+ +      task_rq_unlock(rq, p, &flags);
   
         return ns;
   }
@@@ -3752,32 -3515,6 +3752,32 @@@ static void account_guest_time(struct t
         }
   }
   
+ +/*
+ + * Account system cpu time to a process and desired cpustat field
+ + * @p: the process that the cpu time gets accounted to
+ + * @cputime: the cpu time spent in kernel space since the last update
+ + * @cputime_scaled: cputime scaled by cpu frequency
+ + * @target_cputime64: pointer to cpustat field that has to be updated
+ + */
+ +static inline
+ +void __account_system_time(struct task_struct *p, cputime_t cputime,
+ +                      cputime_t cputime_scaled, cputime64_t *target_cputime64)
+ +{
+ +      cputime64_t tmp = cputime_to_cputime64(cputime);
+ +
+ +      /* Add system time to process. */
+ +      p->stime = cputime_add(p->stime, cputime);
+ +      p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ +      account_group_system_time(p, cputime);
+ +
+ +      /* Add system time to cpustat. */
+ +      *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+ +      cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ +
+ +      /* Account for system time used */
+ +      acct_update_integrals(p);
+ +}
+ +
   /*
    * Account system cpu time to a process.
    * @p: the process that the cpu time gets accounted to
@@@ -3789,26 -3526,36 +3789,26 @@@ void account_system_time(struct task_st
                          cputime_t cputime, cputime_t cputime_scaled)
   {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- -      cputime64_t tmp;
+ +      cputime64_t *target_cputime64;
   
         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                 account_guest_time(p, cputime, cputime_scaled);
                 return;
         }
   
- -      /* Add system time to process. */
- -      p->stime = cputime_add(p->stime, cputime);
- -      p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
- -      account_group_system_time(p, cputime);
- -
- -      /* Add system time to cpustat. */
- -      tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
- -              cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ +              target_cputime64 = &cpustat->irq;
         else if (in_serving_softirq())
- -              cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ +              target_cputime64 = &cpustat->softirq;
         else
- -              cpustat->system = cputime64_add(cpustat->system, tmp);
- -
- -      cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ +              target_cputime64 = &cpustat->system;
   
- -      /* Account for system time used */
- -      acct_update_integrals(p);
+ +      __account_system_time(p, cputime, cputime_scaled, target_cputime64);
   }
   
   /*
    * Account for involuntary wait time.
- - * @steal: the cpu time spent in involuntary wait
+ + * @cputime: the cpu time spent in involuntary wait
    */
   void account_steal_time(cputime_t cputime)
   {
@@@ -3828,80 -3575,13 +3828,80 @@@ void account_idle_time(cputime_t cputim
         cputime64_t cputime64 = cputime_to_cputime64(cputime);
         struct rq *rq = this_rq();
   
- -      if (atomic_read(&rq->nr_iowait) > 0)
- -              cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
- -      else
- -              cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+ +      if (atomic_read(&rq->nr_iowait) > 0)
+ +              cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+ +      else
+ +              cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+ +}
+ +
+ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ +
+ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ +/*
+ + * Account a tick to a process and cpustat
+ + * @p: the process that the cpu time gets accounted to
+ + * @user_tick: is the tick from userspace
+ + * @rq: the pointer to rq
+ + *
+ + * Tick demultiplexing follows the order
+ + * - pending hardirq update
+ + * - pending softirq update
+ + * - user_time
+ + * - idle_time
+ + * - system time
+ + *   - check for guest_time
+ + *   - else account as system_time
+ + *
+ + * The hardirq check is done for both system and user time, as no timer
+ + * goes off while we are in hardirq and hence we may never get an
+ + * opportunity to update it solely in system time.
+ + * p->stime and friends are only updated on system time, not on irq or
+ + * softirq time, as those no longer count in task exec_runtime.
+ + */
+ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ +                                              struct rq *rq)
+ +{
+ +      cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ +      cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+ +      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ +
+ +      if (irqtime_account_hi_update()) {
+ +              cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ +      } else if (irqtime_account_si_update()) {
+ +              cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ +      } else if (this_cpu_ksoftirqd() == p) {
+ +              /*
+ +               * ksoftirqd time does not get accounted in cpu_softirq_time,
+ +               * so we have to handle it separately here.
+ +               * Also, p->stime needs to be updated for ksoftirqd.
+ +               */
+ +              __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ +                                      &cpustat->softirq);
+ +      } else if (user_tick) {
+ +              account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ +      } else if (p == rq->idle) {
+ +              account_idle_time(cputime_one_jiffy);
+ +      } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ +              account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ +      } else {
+ +              __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ +                                      &cpustat->system);
+ +      }
+ +}
+ +
+ +static void irqtime_account_idle_ticks(int ticks)
+ +{
+ +      int i;
+ +      struct rq *rq = this_rq();
+ +
+ +      for (i = 0; i < ticks; i++)
+ +              irqtime_account_process_tick(current, 0, rq);
   }
- -
- -#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ +#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+ +static void irqtime_account_idle_ticks(int ticks) {}
+ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ +                                              struct rq *rq) {}
+ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
   
   /*
    * Account a single tick of cpu time.
@@@ -3913,11 -3593,6 +3913,11 @@@ void account_process_tick(struct task_s
         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
         struct rq *rq = this_rq();
   
+ +      if (sched_clock_irqtime) {
+ +              irqtime_account_process_tick(p, user_tick, rq);
+ +              return;
+ +      }
+ +
         if (user_tick)
                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
         else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@@ -3943,12 -3618,6 +3943,12 @@@ void account_steal_ticks(unsigned long 
    */
   void account_idle_ticks(unsigned long ticks)
   {
+ +
+ +      if (sched_clock_irqtime) {
+ +              irqtime_account_idle_ticks(ticks);
+ +              return;
+ +      }
+ +
         account_idle_time(jiffies_to_cputime(ticks));
   }
   
@@@ -4042,6 -3711,9 +4042,6 @@@ void thread_group_times(struct task_str
   /*
    * This function gets called by the timer code, with HZ frequency.
    * We call it with interrupts disabled.
- - *
- - * It also gets called by the fork code, when changing the parent's
- - * timeslices.
    */
   void scheduler_tick(void)
   {
@@@ -4161,12 -3833,19 +4161,12 @@@ static inline void schedule_debug(struc
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
   
         schedstat_inc(this_rq(), sched_count);
- -#ifdef CONFIG_SCHEDSTATS
- -      if (unlikely(prev->lock_depth >= 0)) {
- -              schedstat_inc(this_rq(), bkl_count);
- -              schedstat_inc(prev, sched_info.bkl_count);
- -      }
- -#endif
   }
   
   static void put_prev_task(struct rq *rq, struct task_struct *prev)
   {
- -      if (prev->se.on_rq)
+ +      if (prev->on_rq || rq->skip_clock_update < 0)
                 update_rq_clock(rq);
- -      rq->skip_clock_update = 0;
         prev->sched_class->put_prev_task(rq, prev);
   }
   
@@@ -4215,25 -3894,27 +4215,25 @@@ need_resched
         rcu_note_context_switch(cpu);
         prev = rq->curr;
   
- -      release_kernel_lock(prev);
- -need_resched_nonpreemptible:
- -
         schedule_debug(prev);
   
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
   
         raw_spin_lock_irq(&rq->lock);
- -      clear_tsk_need_resched(prev);
   
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                 if (unlikely(signal_pending_state(prev->state, prev))) {
                         prev->state = TASK_RUNNING;
                 } else {
+ +                      deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ +                      prev->on_rq = 0;
+ +
                         /*
- -                       * If a worker is going to sleep, notify and
- -                       * ask workqueue whether it wants to wake up a
- -                       * task to maintain concurrency.  If so, wake
- -                       * up the task.
+ +                       * If a worker went to sleep, notify and ask workqueue
+ +                       * whether it wants to wake up a task to maintain
+ +                       * concurrency.
                          */
                         if (prev->flags & PF_WQ_WORKER) {
                                 struct task_struct *to_wakeup;
@@@ -4242,16 -3923,7 +4242,16 @@@
                                 if (to_wakeup)
                                         try_to_wake_up_local(to_wakeup);
                         }
- -                      deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ +
+ +                      /*
+ +                       * If we are going to sleep and we have plugged IO
+ +                       * queued, make sure to submit it to avoid deadlocks.
+ +                       */
+ +                      if (blk_needs_flush_plug(prev)) {
+ +                              raw_spin_unlock(&rq->lock);
+ +                              blk_schedule_flush_plug(prev);
+ +                              raw_spin_lock(&rq->lock);
+ +                      }
                 }
                 switch_count = &prev->nvcsw;
         }
@@@ -4263,10 -3935,11 +4263,10 @@@
   
         put_prev_task(rq, prev);
         next = pick_next_task(rq);
+ +      clear_tsk_need_resched(prev);
+ +      rq->skip_clock_update = 0;
   
         if (likely(prev != next)) {
- -              sched_info_switch(prev, next);
- -              perf_event_task_sched_out(prev, next);
- -
                 rq->nr_switches++;
                 rq->curr = next;
                 ++*switch_count;
@@@ -4285,6 -3958,9 +4285,6 @@@
   
         post_schedule(rq);
   
- -      if (unlikely(reacquire_kernel_lock(prev)))
- -              goto need_resched_nonpreemptible;
- -
         preempt_enable_no_resched();
         if (need_resched())
                 goto need_resched;
@@@ -4292,53 -3968,70 +4292,53 @@@
   EXPORT_SYMBOL(schedule);
   
   #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ +
+ +static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+ +{
+ +      bool ret = false;
+ +
+ +      rcu_read_lock();
+ +      if (lock->owner != owner)
+ +              goto fail;
+ +
+ +      /*
+ +       * Ensure we emit the owner->on_cpu dereference _after_ checking
+ +       * that lock->owner still matches owner. If that fails, owner might
+ +       * point to free()d memory; if it still matches, the rcu_read_lock()
+ +       * ensures the memory stays valid.
+ +       */
+ +      barrier();
+ +
+ +      ret = owner->on_cpu;
+ +fail:
+ +      rcu_read_unlock();
+ +
+ +      return ret;
+ +}
+ +
   /*
    * Look out! "owner" is an entirely speculative pointer
    * access and not reliable.
    */
- -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
   {
- -      unsigned int cpu;
- -      struct rq *rq;
- -
         if (!sched_feat(OWNER_SPIN))
                 return 0;
   
- -#ifdef CONFIG_DEBUG_PAGEALLOC
- -      /*
- -       * Need to access the cpu field knowing that
- -       * DEBUG_PAGEALLOC could have unmapped it if
- -       * the mutex owner just released it and exited.
- -       */
- -      if (probe_kernel_address(&owner->cpu, cpu))
- -              return 0;
- -#else
- -      cpu = owner->cpu;
- -#endif
+ +      while (owner_running(lock, owner)) {
+ +              if (need_resched())
+ +                      return 0;
   
- -      /*
- -       * Even if the access succeeded (likely case),
- -       * the cpu field may no longer be valid.
- -       */
- -      if (cpu >= nr_cpumask_bits)
- -              return 0;
+ +              arch_mutex_cpu_relax();
+ +      }
   
         /*
- -       * We need to validate that we can do a
- -       * get_cpu() and that we have the percpu area.
+ +       * If the owner changed to another task there is likely
+ +       * heavy contention, stop spinning.
          */
- -      if (!cpu_online(cpu))
+ +      if (lock->owner)
                 return 0;
   
- -      rq = cpu_rq(cpu);
- -
- -      for (;;) {
- -              /*
- -               * Owner changed, break to re-assess state.
- -               */
- -              if (lock->owner != owner) {
- -                      /*
- -                       * If the lock has switched to a different owner,
- -                       * we likely have heavy contention. Return 0 to quit
- -                       * optimistic spinning and not contend further:
- -                       */
- -                      if (lock->owner)
- -                              return 0;
- -                      break;
- -              }
- -
- -              /*
- -               * Is that owner really running on that cpu?
- -               */
- -              if (task_thread_info(rq->curr) != owner || need_resched())
- -                      return 0;
- -
- -              cpu_relax();
- -      }
- -
         return 1;
   }
   #endif
@@@ -4468,7 -4161,6 +4468,7 @@@ void __wake_up_locked_key(wait_queue_he
   {
         __wake_up_common(q, mode, 1, 0, key);
   }
+ +EXPORT_SYMBOL_GPL(__wake_up_locked_key);
   
   /**
    * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@@ -4649,7 -4341,7 +4649,7 @@@ EXPORT_SYMBOL(wait_for_completion_inter
    * This waits for either a completion of a specific task to be signaled or for a
    * specified timeout to expire. It is interruptible. The timeout is in jiffies.
    */
- -unsigned long __sched
+ +long __sched
   wait_for_completion_interruptible_timeout(struct completion *x,
                                           unsigned long timeout)
   {
@@@ -4682,7 -4374,7 +4682,7 @@@ EXPORT_SYMBOL(wait_for_completion_killa
    * signaled or for a specified timeout to expire. It can be
    * interrupted by a kill signal. The timeout is in jiffies.
    */
- -unsigned long __sched
+ +long __sched
   wait_for_completion_killable_timeout(struct completion *x,
                                      unsigned long timeout)
   {
@@@ -4798,18 -4490,19 +4798,18 @@@ EXPORT_SYMBOL(sleep_on_timeout)
    */
   void rt_mutex_setprio(struct task_struct *p, int prio)
   {
- -      unsigned long flags;
         int oldprio, on_rq, running;
         struct rq *rq;
         const struct sched_class *prev_class;
   
         BUG_ON(prio < 0 || prio > MAX_PRIO);
   
- -      rq = task_rq_lock(p, &flags);
+ +      rq = __task_rq_lock(p);
   
         trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
- -      on_rq = p->se.on_rq;
+ +      on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
                 dequeue_task(rq, p, 0);
@@@ -4825,11 -4518,12 +4825,11 @@@
   
         if (running)
                 p->sched_class->set_curr_task(rq);
- -      if (on_rq) {
+ +      if (on_rq)
                 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
   
- -              check_class_changed(rq, p, prev_class, oldprio, running);
- -      }
- -      task_rq_unlock(rq, &flags);
+ +      check_class_changed(rq, p, prev_class, oldprio);
+ +      __task_rq_unlock(rq);
   }
   
   #endif
@@@ -4857,7 -4551,7 +4857,7 @@@ void set_user_nice(struct task_struct *
                 p->static_prio = NICE_TO_PRIO(nice);
                 goto out_unlock;
         }
- -      on_rq = p->se.on_rq;
+ +      on_rq = p->on_rq;
         if (on_rq)
                 dequeue_task(rq, p, 0);
   
@@@ -4877,7 -4571,7 +4877,7 @@@
                         resched_task(rq->curr);
         }
   out_unlock:
- -      task_rq_unlock(rq, &flags);
+ +      task_rq_unlock(rq, p, &flags);
   }
   EXPORT_SYMBOL(set_user_nice);
   
@@@ -4991,6 -4685,8 +4991,6 @@@ static struct task_struct *find_process
   static void
   __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
   {
- -      BUG_ON(p->se.on_rq);
- -
         p->policy = policy;
         p->rt_priority = prio;
         p->normal_prio = normal_prio(p);
@@@ -5013,17 -4709,14 +5013,17 @@@ static bool check_same_owner(struct tas
   
         rcu_read_lock();
         pcred = __task_cred(p);
- -      match = (cred->euid == pcred->euid ||
- -               cred->euid == pcred->uid);
+ +      if (cred->user->user_ns == pcred->user->user_ns)
+ +              match = (cred->euid == pcred->euid ||
+ +                       cred->euid == pcred->uid);
+ +      else
+ +              match = false;
         rcu_read_unlock();
         return match;
   }
   
   static int __sched_setscheduler(struct task_struct *p, int policy,
- -                              struct sched_param *param, bool user)
+ +                              const struct sched_param *param, bool user)
   {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         unsigned long flags;
@@@ -5077,15 -4770,12 +5077,15 @@@ recheck
                             param->sched_priority > rlim_rtprio)
                                 return -EPERM;
                 }
+ +
                 /*
- -               * Like positive nice levels, dont allow tasks to
- -               * move out of SCHED_IDLE either:
+ +               * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ +               * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                  */
- -              if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
- -                      return -EPERM;
+ +              if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ +                      if (!can_nice(p, TASK_NICE(p)))
+ +                              return -EPERM;
+ +              }
   
                 /* can't change other user's priorities */
                 if (!check_same_owner(p))
@@@ -5105,29 -4795,21 +5105,29 @@@
         /*
          * make sure no PI-waiters arrive (or leave) while we are
          * changing the priority of the task:
- -       */
- -      raw_spin_lock_irqsave(&p->pi_lock, flags);
- -      /*
- -       * To be able to change p->policy safely, the apropriate
+ +       *
+ +       * To be able to change p->policy safely, the appropriate
          * runqueue lock must be held.
          */
- -      rq = __task_rq_lock(p);
+ +      rq = task_rq_lock(p, &flags);
   
         /*
          * Changing the policy of the stop threads its a very bad idea
          */
         if (p == rq->stop) {
+ +              task_rq_unlock(rq, p, &flags);
+ +              return -EINVAL;
+ +      }
+ +
+ +      /*
+ +       * If not changing anything there's no need to proceed further:
+ +       */
+ +      if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+ +                      param->sched_priority == p->rt_priority))) {
+ +
                 __task_rq_unlock(rq);
                 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- -              return -EINVAL;
+ +              return 0;
         }
   
   #ifdef CONFIG_RT_GROUP_SCHED
@@@ -5137,9 -4819,9 +5137,9 @@@
                  * assigned.
                  */
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
- -                              task_group(p)->rt_bandwidth.rt_runtime == 0) {
- -                      __task_rq_unlock(rq);
- -                      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ +                              task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ +                              !task_group_is_autogroup(task_group(p))) {
+ +                      task_rq_unlock(rq, p, &flags);
                         return -EPERM;
                 }
         }
@@@ -5148,10 -4830,11 +5148,10 @@@
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
- -              __task_rq_unlock(rq);
- -              raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ +              task_rq_unlock(rq, p, &flags);
                 goto recheck;
         }
- -      on_rq = p->se.on_rq;
+ +      on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
                 deactivate_task(rq, p, 0);
@@@ -5166,11 -4849,13 +5166,11 @@@
   
         if (running)
                 p->sched_class->set_curr_task(rq);
- -      if (on_rq) {
+ +      if (on_rq)
                 activate_task(rq, p, 0);
   
- -              check_class_changed(rq, p, prev_class, oldprio, running);
- -      }
- -      __task_rq_unlock(rq);
- -      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ +      check_class_changed(rq, p, prev_class, oldprio);
+ +      task_rq_unlock(rq, p, &flags);
   
         rt_mutex_adjust_pi(p);
   
@@@ -5186,7 -4871,7 +5186,7 @@@
    * NOTE that the task may be already dead.
    */
   int sched_setscheduler(struct task_struct *p, int policy,
- -                     struct sched_param *param)
+ +                     const struct sched_param *param)
   {
         return __sched_setscheduler(p, policy, param, true);
   }
@@@ -5204,7 -4889,7 +5204,7 @@@ EXPORT_SYMBOL_GPL(sched_setscheduler)
    * but our caller might not have that capability.
    */
   int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- -                             struct sched_param *param)
+ +                             const struct sched_param *param)
   {
         return __sched_setscheduler(p, policy, param, false);
   }
@@@ -5350,7 -5035,7 +5350,7 @@@ long sched_setaffinity(pid_t pid, cons
                 goto out_free_cpus_allowed;
         }
         retval = -EPERM;
- -      if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+ +      if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
                 goto out_unlock;
   
         retval = security_task_setscheduler(p);
@@@ -5421,6 -5106,7 +5421,6 @@@ long sched_getaffinity(pid_t pid, struc
   {
         struct task_struct *p;
         unsigned long flags;
- -      struct rq *rq;
         int retval;
   
         get_online_cpus();
@@@ -5435,9 -5121,9 +5435,9 @@@
         if (retval)
                 goto out_unlock;
   
- -      rq = task_rq_lock(p, &flags);
+ +      raw_spin_lock_irqsave(&p->pi_lock, flags);
         cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
- -      task_rq_unlock(rq, &flags);
+ +      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
   out_unlock:
         rcu_read_unlock();
@@@ -5584,67 -5270,6 +5584,67 @@@ void __sched yield(void
   }
   EXPORT_SYMBOL(yield);
   
+ +/**
+ + * yield_to - yield the current processor to another thread in
+ + * your thread group, or accelerate that thread toward the
+ + * processor it's on.
+ + * @p: target task
+ + * @preempt: whether task preemption is allowed or not
+ + *
+ + * It's the caller's job to ensure that the target task struct
+ + * can't go away on us before we can do any checks.
+ + *
+ + * Returns true if we indeed boosted the target task.
+ + */
+ +bool __sched yield_to(struct task_struct *p, bool preempt)
+ +{
+ +      struct task_struct *curr = current;
+ +      struct rq *rq, *p_rq;
+ +      unsigned long flags;
+ +      bool yielded = 0;
+ +
+ +      local_irq_save(flags);
+ +      rq = this_rq();
+ +
+ +again:
+ +      p_rq = task_rq(p);
+ +      double_rq_lock(rq, p_rq);
+ +      while (task_rq(p) != p_rq) {
+ +              double_rq_unlock(rq, p_rq);
+ +              goto again;
+ +      }
+ +
+ +      if (!curr->sched_class->yield_to_task)
+ +              goto out;
+ +
+ +      if (curr->sched_class != p->sched_class)
+ +              goto out;
+ +
+ +      if (task_running(p_rq, p) || p->state)
+ +              goto out;
+ +
+ +      yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ +      if (yielded) {
+ +              schedstat_inc(rq, yld_count);
+ +              /*
+ +               * Make p's CPU reschedule; pick_next_entity takes care of
+ +               * fairness.
+ +               */
+ +              if (preempt && rq != p_rq)
+ +                      resched_task(p_rq->curr);
+ +      }
+ +
+ +out:
+ +      double_rq_unlock(rq, p_rq);
+ +      local_irq_restore(flags);
+ +
+ +      if (yielded)
+ +              schedule();
+ +
+ +      return yielded;
+ +}
+ +EXPORT_SYMBOL_GPL(yield_to);
+ +
   /*
    * This task is about to go to sleep on IO. Increment rq->nr_iowait so
    * that process accounting knows that this is a task in IO wait state.
@@@ -5655,7 -5280,6 +5655,7 @@@ void __sched io_schedule(void
   
         delayacct_blkio_start();
         atomic_inc(&rq->nr_iowait);
+ +      blk_flush_plug(current);
         current->in_iowait = 1;
         schedule();
         current->in_iowait = 0;
@@@ -5671,7 -5295,6 +5671,7 @@@ long __sched io_schedule_timeout(long t
   
         delayacct_blkio_start();
         atomic_inc(&rq->nr_iowait);
+ +      blk_flush_plug(current);
         current->in_iowait = 1;
         ret = schedule_timeout(timeout);
         current->in_iowait = 0;
@@@ -5762,7 -5385,7 +5762,7 @@@ SYSCALL_DEFINE2(sched_rr_get_interval, 
   
         rq = task_rq_lock(p, &flags);
         time_slice = p->sched_class->get_rr_interval(rq, p);
- -      task_rq_unlock(rq, &flags);
+ +      task_rq_unlock(rq, p, &flags);
   
         rcu_read_unlock();
         jiffies_to_timespec(time_slice, &t);
@@@ -5782,7 -5405,7 +5782,7 @@@ void sched_show_task(struct task_struc
         unsigned state;
   
         state = p->state ? __ffs(p->state) + 1 : 0;
- -      printk(KERN_INFO "%-13.13s %c", p->comm,
+ +      printk(KERN_INFO "%-15.15s %c", p->comm,
                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
   #if BITS_PER_LONG == 32
         if (state == TASK_RUNNING)
@@@ -5820,7 -5443,7 +5820,7 @@@ void show_state_filter(unsigned long st
         do_each_thread(g, p) {
                 /*
                  * reset the NMI-timeout, listing all files on a slow
- -               * console might take alot of time:
+ +               * console might take a lot of time:
                  */
                 touch_nmi_watchdog();
                 if (!state_filter || (p->state & state_filter))
@@@ -5864,7 -5487,7 +5864,7 @@@ void __cpuinit init_idle(struct task_st
         idle->state = TASK_RUNNING;
         idle->se.exec_start = sched_clock();
   
- -      cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ +      do_set_cpus_allowed(idle, cpumask_of(cpu));
         /*
          * We're having a chicken and egg problem, even though we are
          * holding rq->lock, the cpu isn't yet set to this cpu so the
@@@ -5880,19 -5503,22 +5880,19 @@@
         rcu_read_unlock();
   
         rq->curr = rq->idle = idle;
- -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- -      idle->oncpu = 1;
+ +#if defined(CONFIG_SMP)
+ +      idle->on_cpu = 1;
   #endif
         raw_spin_unlock_irqrestore(&rq->lock, flags);
   
         /* Set the preempt count _outside_ the spinlocks! */
- -#if defined(CONFIG_PREEMPT)
- -      task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
- -#else
         task_thread_info(idle)->preempt_count = 0;
- -#endif
+ +
         /*
          * The idle tasks have their own, simple scheduling class:
          */
         idle->sched_class = &idle_sched_class;
- -      ftrace_graph_init_task(idle);
+ +      ftrace_graph_init_idle_task(idle, cpu);
   }
   
   /*
@@@ -5943,6 -5569,7 +5943,6 @@@ static void update_sysctl(void
         SET_SYSCTL(sched_min_granularity);
         SET_SYSCTL(sched_latency);
         SET_SYSCTL(sched_wakeup_granularity);
- -      SET_SYSCTL(sched_shares_ratelimit);
   #undef SET_SYSCTL
   }
   
@@@ -5952,16 -5579,6 +5952,16 @@@ static inline void sched_init_granulari
   }
   
   #ifdef CONFIG_SMP
+ +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+ +{
+ +      if (p->sched_class && p->sched_class->set_cpus_allowed)
+ +              p->sched_class->set_cpus_allowed(p, new_mask);
+ +      else {
+ +              cpumask_copy(&p->cpus_allowed, new_mask);
+ +              p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ +      }
+ +}
+ +
   /*
    * This is how migration works:
    *
@@@ -5992,38 -5609,52 +5992,38 @@@ int set_cpus_allowed_ptr(struct task_st
         unsigned int dest_cpu;
         int ret = 0;
   
- -      /*
- -       * Serialize against TASK_WAKING so that ttwu() and wunt() can
- -       * drop the rq->lock and still rely on ->cpus_allowed.
- -       */
- -again:
- -      while (task_is_waking(p))
- -              cpu_relax();
         rq = task_rq_lock(p, &flags);
- -      if (task_is_waking(p)) {
- -              task_rq_unlock(rq, &flags);
- -              goto again;
- -      }
+ +
+ +      if (cpumask_equal(&p->cpus_allowed, new_mask))
+ +              goto out;
   
         if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                 ret = -EINVAL;
                 goto out;
         }
   
- -      if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
- -                   !cpumask_equal(&p->cpus_allowed, new_mask))) {
+ +      if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
                 ret = -EINVAL;
                 goto out;
         }
   
- -      if (p->sched_class->set_cpus_allowed)
- -              p->sched_class->set_cpus_allowed(p, new_mask);
- -      else {
- -              cpumask_copy(&p->cpus_allowed, new_mask);
- -              p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
- -      }
+ +      do_set_cpus_allowed(p, new_mask);
   
         /* Can the task run on the task's current CPU? If so, we're done */
         if (cpumask_test_cpu(task_cpu(p), new_mask))
                 goto out;
   
         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- -      if (migrate_task(p, dest_cpu)) {
+ +      if (p->on_rq) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
- -              task_rq_unlock(rq, &flags);
+ +              task_rq_unlock(rq, p, &flags);
                 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                 tlb_migrate_finish(p->mm);
                 return 0;
         }
   out:
- -      task_rq_unlock(rq, &flags);
+ +      task_rq_unlock(rq, p, &flags);
   
         return ret;
   }
@@@ -6051,7 -5682,6 +6051,7 @@@ static int __migrate_task(struct task_s
         rq_src = cpu_rq(src_cpu);
         rq_dest = cpu_rq(dest_cpu);
   
+ +      raw_spin_lock(&p->pi_lock);
         double_rq_lock(rq_src, rq_dest);
         /* Already moved. */
         if (task_cpu(p) != src_cpu)
@@@ -6064,7 -5694,7 +6064,7 @@@
          * If we're not on a rq, the next wake-up will ensure we're
          * placed properly.
          */
- -      if (p->se.on_rq) {
+ +      if (p->on_rq) {
                 deactivate_task(rq_src, p, 0);
                 set_task_cpu(p, dest_cpu);
                 activate_task(rq_dest, p, 0);
@@@ -6074,7 -5704,6 +6074,7 @@@ done
         ret = 1;
   fail:
         double_rq_unlock(rq_src, rq_dest);
+ +      raw_spin_unlock(&p->pi_lock);
         return ret;
   }
   
@@@ -6098,20 -5727,29 +6098,20 @@@ static int migration_cpu_stop(void *dat
   }
   
   #ifdef CONFIG_HOTPLUG_CPU
+ +
   /*
- - * Figure out where task on dead CPU should go, use force if necessary.
+ + * Ensures that the idle task is using init_mm right before its cpu goes
+ + * offline.
    */
- -void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+ +void idle_task_exit(void)
   {
- -      struct rq *rq = cpu_rq(dead_cpu);
- -      int needs_cpu, uninitialized_var(dest_cpu);
- -      unsigned long flags;
+ +      struct mm_struct *mm = current->active_mm;
   
- -      local_irq_save(flags);
+ +      BUG_ON(cpu_online(smp_processor_id()));
   
- -      raw_spin_lock(&rq->lock);
- -      needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
- -      if (needs_cpu)
- -              dest_cpu = select_fallback_rq(dead_cpu, p);
- -      raw_spin_unlock(&rq->lock);
- -      /*
- -       * It can only fail if we race with set_cpus_allowed(),
- -       * in the racer should migrate the task anyway.
- -       */
- -      if (needs_cpu)
- -              __migrate_task(p, dead_cpu, dest_cpu);
- -      local_irq_restore(flags);
+ +      if (mm != &init_mm)
+ +              switch_mm(mm, &init_mm, current);
+ +      mmdrop(mm);
   }
   
   /*
@@@ -6124,69 -5762,128 +6124,69 @@@
   static void migrate_nr_uninterruptible(struct rq *rq_src)
   {
         struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
- -      unsigned long flags;
   
- -      local_irq_save(flags);
- -      double_rq_lock(rq_src, rq_dest);
         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
         rq_src->nr_uninterruptible = 0;
- -      double_rq_unlock(rq_src, rq_dest);
- -      local_irq_restore(flags);
- -}
- -
- -/* Run through task list and migrate tasks from the dead cpu. */
- -static void migrate_live_tasks(int src_cpu)
- -{
- -      struct task_struct *p, *t;
- -
- -      read_lock(&tasklist_lock);
- -
- -      do_each_thread(t, p) {
- -              if (p == current)
- -                      continue;
- -
- -              if (task_cpu(p) == src_cpu)
- -                      move_task_off_dead_cpu(src_cpu, p);
- -      } while_each_thread(t, p);
- -
- -      read_unlock(&tasklist_lock);
   }
   
   /*
- - * Schedules idle task to be the next runnable task on current CPU.
- - * It does so by boosting its priority to highest possible.
- - * Used by CPU offline code.
+ + * remove the tasks which were accounted by rq from calc_load_tasks.
    */
- -void sched_idle_next(void)
+ +static void calc_global_load_remove(struct rq *rq)
   {
- -      int this_cpu = smp_processor_id();
- -      struct rq *rq = cpu_rq(this_cpu);
- -      struct task_struct *p = rq->idle;
- -      unsigned long flags;
- -
- -      /* cpu has to be offline */
- -      BUG_ON(cpu_online(this_cpu));
- -
- -      /*
- -       * Strictly not necessary since rest of the CPUs are stopped by now
- -       * and interrupts disabled on the current cpu.
- -       */
- -      raw_spin_lock_irqsave(&rq->lock, flags);
- -
- -      __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
- -
- -      activate_task(rq, p, 0);
- -
- -      raw_spin_unlock_irqrestore(&rq->lock, flags);
+ +      atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ +      rq->calc_load_active = 0;
   }
   
   /*
- - * Ensures that the idle task is using init_mm right before its cpu goes
- - * offline.
+ + * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ + * try_to_wake_up()->select_task_rq().
+ + *
+ + * Called with rq->lock held even though we're in stop_machine() and
+ + * there's no concurrency possible, we hold the required locks anyway
+ + * because of lock validation efforts.
    */
- -void idle_task_exit(void)
- -{
- -      struct mm_struct *mm = current->active_mm;
- -
- -      BUG_ON(cpu_online(smp_processor_id()));
- -
- -      if (mm != &init_mm)
- -              switch_mm(mm, &init_mm, current);
- -      mmdrop(mm);
- -}
- -
- -/* called under rq->lock with disabled interrupts */
- -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+ +static void migrate_tasks(unsigned int dead_cpu)
   {
         struct rq *rq = cpu_rq(dead_cpu);
- -
- -      /* Must be exiting, otherwise would be on tasklist. */
- -      BUG_ON(!p->exit_state);
- -
- -      /* Cannot have done final schedule yet: would have vanished. */
- -      BUG_ON(p->state == TASK_DEAD);
- -
- -      get_task_struct(p);
+ +      struct task_struct *next, *stop = rq->stop;
+ +      int dest_cpu;
   
         /*
- -       * Drop lock around migration; if someone else moves it,
- -       * that's OK. No task can be added to this CPU, so iteration is
- -       * fine.
+ +       * Fudge the rq selection such that the below task selection loop
+ +       * doesn't get stuck on the currently eligible stop task.
+ +       *
+ +       * We're currently inside stop_machine() and the rq is either stuck
+ +       * in the stop_machine_cpu_stop() loop, or we're executing this code,
+ +       * either way we should never end up calling schedule() until we're
+ +       * done here.
          */
- -      raw_spin_unlock_irq(&rq->lock);
- -      move_task_off_dead_cpu(dead_cpu, p);
- -      raw_spin_lock_irq(&rq->lock);
- -
- -      put_task_struct(p);
- -}
- -
- -/* release_task() removes task from tasklist, so we won't find dead tasks. */
- -static void migrate_dead_tasks(unsigned int dead_cpu)
- -{
- -      struct rq *rq = cpu_rq(dead_cpu);
- -      struct task_struct *next;
+ +      rq->stop = NULL;
   
         for ( ; ; ) {
- -              if (!rq->nr_running)
+ +              /*
+ +               * There's this thread running, bail when that's the only
+ +               * remaining thread.
+ +               */
+ +              if (rq->nr_running == 1)
                         break;
+ +
                 next = pick_next_task(rq);
- -              if (!next)
- -                      break;
+ +              BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
- -              migrate_dead(dead_cpu, next);
   
+ +              /* Find suitable destination for @next, with force if needed. */
+ +              dest_cpu = select_fallback_rq(dead_cpu, next);
+ +              raw_spin_unlock(&rq->lock);
+ +
+ +              __migrate_task(next, dead_cpu, dest_cpu);
+ +
+ +              raw_spin_lock(&rq->lock);
         }
- -}
   
- -/*
- - * remove the tasks which were accounted by rq from calc_load_tasks.
- - */
- -static void calc_global_load_remove(struct rq *rq)
- -{
- -      atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- -      rq->calc_load_active = 0;
+ +      rq->stop = stop;
   }
+ +
   #endif /* CONFIG_HOTPLUG_CPU */
   
   #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@@ -6396,13 -6093,15 +6396,13 @@@ migration_call(struct notifier_block *n
         unsigned long flags;
         struct rq *rq = cpu_rq(cpu);
   
- -      switch (action) {
+ +      switch (action & ~CPU_TASKS_FROZEN) {
   
         case CPU_UP_PREPARE:
- -      case CPU_UP_PREPARE_FROZEN:
                 rq->calc_load_update = calc_load_update;
                 break;
   
         case CPU_ONLINE:
- -      case CPU_ONLINE_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
@@@ -6414,26 -6113,33 +6414,26 @@@
                 break;
   
   #ifdef CONFIG_HOTPLUG_CPU
- -      case CPU_DEAD:
- -      case CPU_DEAD_FROZEN:
- -              migrate_live_tasks(cpu);
- -              /* Idle task back to normal (off runqueue, low prio) */
- -              raw_spin_lock_irq(&rq->lock);
- -              deactivate_task(rq, rq->idle, 0);
- -              __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
- -              rq->idle->sched_class = &idle_sched_class;
- -              migrate_dead_tasks(cpu);
- -              raw_spin_unlock_irq(&rq->lock);
- -              migrate_nr_uninterruptible(rq);
- -              BUG_ON(rq->nr_running != 0);
- -              calc_global_load_remove(rq);
- -              break;
- -
         case CPU_DYING:
- -      case CPU_DYING_FROZEN:
+ +              sched_ttwu_pending();
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
                 }
+ +              migrate_tasks(cpu);
+ +              BUG_ON(rq->nr_running != 1); /* the migration thread */
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
+ +
+ +              migrate_nr_uninterruptible(rq);
+ +              calc_global_load_remove(rq);
                 break;
   #endif
         }
+ +
+ +      update_max_interval();
+ +
         return NOTIFY_OK;
   }
   
@@@ -6494,8 -6200,6 +6494,8 @@@ early_initcall(migration_init)
   
   #ifdef CONFIG_SMP
   
+ +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+ +
   #ifdef CONFIG_SCHED_DEBUG
   
   static __read_mostly int sched_domain_debug_enabled;
@@@ -6570,7 -6274,7 +6570,7 @@@ static int sched_domain_debug_one(struc
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
   
                 printk(KERN_CONT " %s", str);
- -              if (group->cpu_power != SCHED_LOAD_SCALE) {
+ +              if (group->cpu_power != SCHED_POWER_SCALE) {
                         printk(KERN_CONT " (cpu_power = %d)",
                                 group->cpu_power);
                 }
@@@ -6591,6 -6295,7 +6591,6 @@@
   
   static void sched_domain_debug(struct sched_domain *sd, int cpu)
   {
- -      cpumask_var_t groupmask;
         int level = 0;
   
         if (!sched_domain_debug_enabled)
@@@ -6603,14 -6308,20 +6603,14 @@@
   
         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
   
- -      if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
- -              printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
- -              return;
- -      }
- -
         for (;;) {
- -              if (sched_domain_debug_one(sd, cpu, level, groupmask))
+ +              if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
                         break;
                 level++;
                 sd = sd->parent;
                 if (!sd)
                         break;
         }
- -      free_cpumask_var(groupmask);
   }
   #else /* !CONFIG_SCHED_DEBUG */
   # define sched_domain_debug(sd, cpu) do { } while (0)
@@@ -6667,11 -6378,12 +6667,11 @@@ sd_parent_degenerate(struct sched_domai
         return 1;
   }
   
- -static void free_rootdomain(struct root_domain *rd)
+ +static void free_rootdomain(struct rcu_head *rcu)
   {
- -      synchronize_sched();
+ +      struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
   
         cpupri_cleanup(&rd->cpupri);
- -
         free_cpumask_var(rd->rto_mask);
         free_cpumask_var(rd->online);
         free_cpumask_var(rd->span);
@@@ -6712,7 -6424,7 +6712,7 @@@ static void rq_attach_root(struct rq *r
         raw_spin_unlock_irqrestore(&rq->lock, flags);
   
         if (old_rd)
- -              free_rootdomain(old_rd);
+ +              call_rcu_sched(&old_rd->rcu, free_rootdomain);
   }
   
   static int init_rootdomain(struct root_domain *rd)
@@@ -6763,25 -6475,6 +6763,25 @@@ static struct root_domain *alloc_rootdo
         return rd;
   }
   
+ +static void free_sched_domain(struct rcu_head *rcu)
+ +{
+ +      struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+ +      if (atomic_dec_and_test(&sd->groups->ref))
+ +              kfree(sd->groups);
+ +      kfree(sd);
+ +}
+ +
+ +static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+ +{
+ +      call_rcu(&sd->rcu, free_sched_domain);
+ +}
+ +
+ +static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+ +{
+ +      for (; sd; sd = sd->parent)
+ +              destroy_sched_domain(sd, cpu);
+ +}
+ +
   /*
    * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
    * hold the hotplug lock.
@@@ -6792,6 -6485,9 +6792,6 @@@ cpu_attach_domain(struct sched_domain *
         struct rq *rq = cpu_rq(cpu);
         struct sched_domain *tmp;
   
- -      for (tmp = sd; tmp; tmp = tmp->parent)
- -              tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
- -
         /* Remove the sched domains which do not contribute to scheduling. */
         for (tmp = sd; tmp; ) {
                 struct sched_domain *parent = tmp->parent;
@@@ -6802,15 -6498,12 +6802,15 @@@
                         tmp->parent = parent->parent;
                         if (parent->parent)
                                 parent->parent->child = tmp;
+ +                      destroy_sched_domain(parent, cpu);
                 } else
                         tmp = tmp->parent;
         }
   
         if (sd && sd_degenerate(sd)) {
+ +              tmp = sd;
                 sd = sd->parent;
+ +              destroy_sched_domain(tmp, cpu);
                 if (sd)
                         sd->child = NULL;
         }
@@@ -6818,9 -6511,7 +6818,9 @@@
         sched_domain_debug(sd, cpu);
   
         rq_attach_root(rq, rd);
+ +      tmp = rq->sd;
         rcu_assign_pointer(rq->sd, sd);
+ +      destroy_sched_domains(tmp, cpu);
   }
   
   /* cpus with isolated domains */
@@@ -6836,6 -6527,56 +6836,6 @@@ static int __init isolated_cpu_setup(ch
   
   __setup("isolcpus=", isolated_cpu_setup);
   
- -/*
- - * init_sched_build_groups takes the cpumask we wish to span, and a pointer
- - * to a function which identifies what group(along with sched group) a CPU
- - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- - * (due to the fact that we keep track of groups covered with a struct cpumask).
- - *
- - * init_sched_build_groups will build a circular linked list of the groups
- - * covered by the given span, and will set each group's ->cpumask correctly,
- - * and ->cpu_power to 0.
- - */
- -static void
- -init_sched_build_groups(const struct cpumask *span,
- -                      const struct cpumask *cpu_map,
- -                      int (*group_fn)(int cpu, const struct cpumask *cpu_map,
- -                                      struct sched_group **sg,
- -                                      struct cpumask *tmpmask),
- -                      struct cpumask *covered, struct cpumask *tmpmask)
- -{
- -      struct sched_group *first = NULL, *last = NULL;
- -      int i;
- -
- -      cpumask_clear(covered);
- -
- -      for_each_cpu(i, span) {
- -              struct sched_group *sg;
- -              int group = group_fn(i, cpu_map, &sg, tmpmask);
- -              int j;
- -
- -              if (cpumask_test_cpu(i, covered))
- -                      continue;
- -
- -              cpumask_clear(sched_group_cpus(sg));
- -              sg->cpu_power = 0;
- -
- -              for_each_cpu(j, span) {
- -                      if (group_fn(j, cpu_map, NULL, tmpmask) != group)
- -                              continue;
- -
- -                      cpumask_set_cpu(j, covered);
- -                      cpumask_set_cpu(j, sched_group_cpus(sg));
- -              }
- -              if (!first)
- -                      first = sg;
- -              if (last)
- -                      last->next = sg;
- -              last = sg;
- -      }
- -      last->next = first;
- -}
- -
   #define SD_NODES_PER_DOMAIN 16
   
   #ifdef CONFIG_NUMA
@@@ -6852,7 -6593,7 +6852,7 @@@
    */
   static int find_next_best_node(int node, nodemask_t *used_nodes)
   {
- -      int i, n, val, min_val, best_node = 0;
+ +      int i, n, val, min_val, best_node = -1;
   
         min_val = INT_MAX;
   
@@@ -6876,8 -6617,7 +6876,8 @@@
                 }
         }
   
- -      node_set(best_node, *used_nodes);
+ +      if (best_node != -1)
+ +              node_set(best_node, *used_nodes);
         return best_node;
   }
   
@@@ -6903,130 -6643,315 +6903,130 @@@ static void sched_domain_node_span(int 
   
         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                 int next_node = find_next_best_node(node, &used_nodes);
- -
+ +              if (next_node < 0)
+ +                      break;
                 cpumask_or(span, span, cpumask_of_node(next_node));
         }
   }
+ +
+ +static const struct cpumask *cpu_node_mask(int cpu)
+ +{
+ +      lockdep_assert_held(&sched_domains_mutex);
+ +
+ +      sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+ +
+ +      return sched_domains_tmpmask;
+ +}
+ +
+ +static const struct cpumask *cpu_allnodes_mask(int cpu)
+ +{
+ +      return cpu_possible_mask;
+ +}
   #endif /* CONFIG_NUMA */
   
- -int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+ +static const struct cpumask *cpu_cpu_mask(int cpu)
+ +{
+ +      return cpumask_of_node(cpu_to_node(cpu));
+ +}
   
- -/*
- - * The cpus mask in sched_group and sched_domain hangs off the end.
- - *
- - * ( See the the comments in include/linux/sched.h:struct sched_group
- - *   and struct sched_domain. )
- - */
- -struct static_sched_group {
- -      struct sched_group sg;
- -      DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
- -};
+ +int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
   
- -struct static_sched_domain {
- -      struct sched_domain sd;
- -      DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+ +struct sd_data {
+ +      struct sched_domain **__percpu sd;
+ +      struct sched_group **__percpu sg;
   };
   
   struct s_data {
- -#ifdef CONFIG_NUMA
- -      int                     sd_allnodes;
- -      cpumask_var_t           domainspan;
- -      cpumask_var_t           covered;
- -      cpumask_var_t           notcovered;
- -#endif
- -      cpumask_var_t           nodemask;
- -      cpumask_var_t           this_sibling_map;
- -      cpumask_var_t           this_core_map;
- -      cpumask_var_t           this_book_map;
- -      cpumask_var_t           send_covered;
- -      cpumask_var_t           tmpmask;
- -      struct sched_group      **sched_group_nodes;
+ +      struct sched_domain ** __percpu sd;
         struct root_domain      *rd;
   };
   
   enum s_alloc {
- -      sa_sched_groups = 0,
         sa_rootdomain,
- -      sa_tmpmask,
- -      sa_send_covered,
- -      sa_this_book_map,
- -      sa_this_core_map,
- -      sa_this_sibling_map,
- -      sa_nodemask,
- -      sa_sched_group_nodes,
- -#ifdef CONFIG_NUMA
- -      sa_notcovered,
- -      sa_covered,
- -      sa_domainspan,
- -#endif
+ +      sa_sd,
+ +      sa_sd_storage,
         sa_none,
   };
   
- -/*
- - * SMT sched-domains:
- - */
- -#ifdef CONFIG_SCHED_SMT
- -static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
- -
- -static int
- -cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
- -               struct sched_group **sg, struct cpumask *unused)
- -{
- -      if (sg)
- -              *sg = &per_cpu(sched_groups, cpu).sg;
- -      return cpu;
- -}
- -#endif /* CONFIG_SCHED_SMT */
+ +struct sched_domain_topology_level;
   
- -/*
- - * multi-core sched-domains:
- - */
- -#ifdef CONFIG_SCHED_MC
- -static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
+ +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+ +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
   
- -static int
- -cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- -                struct sched_group **sg, struct cpumask *mask)
- -{
- -      int group;
- -#ifdef CONFIG_SCHED_SMT
- -      cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#else
- -      group = cpu;
- -#endif
- -      if (sg)
- -              *sg = &per_cpu(sched_group_core, group).sg;
- -      return group;
- -}
- -#endif /* CONFIG_SCHED_MC */
+ +struct sched_domain_topology_level {
+ +      sched_domain_init_f init;
+ +      sched_domain_mask_f mask;
+ +      struct sd_data      data;
+ +};
   
   /*
- - * book sched-domains:
+ + * Assumes the sched_domain tree is fully constructed
    */
- -#ifdef CONFIG_SCHED_BOOK
- -static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
- -
- -static int
- -cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
- -                struct sched_group **sg, struct cpumask *mask)
+ +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
   {
- -      int group = cpu;
- -#ifdef CONFIG_SCHED_MC
- -      cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#elif defined(CONFIG_SCHED_SMT)
- -      cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#endif
- -      if (sg)
- -              *sg = &per_cpu(sched_group_book, group).sg;
- -      return group;
- -}
- -#endif /* CONFIG_SCHED_BOOK */
+ +      struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ +      struct sched_domain *child = sd->child;
   
- -static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
+ +      if (child)
+ +              cpu = cpumask_first(sched_domain_span(child));
   
- -static int
- -cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
- -                struct sched_group **sg, struct cpumask *mask)
- -{
- -      int group;
- -#ifdef CONFIG_SCHED_BOOK
- -      cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#elif defined(CONFIG_SCHED_MC)
- -      cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#elif defined(CONFIG_SCHED_SMT)
- -      cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#else
- -      group = cpu;
- -#endif
         if (sg)
- -              *sg = &per_cpu(sched_group_phys, group).sg;
- -      return group;
+ +              *sg = *per_cpu_ptr(sdd->sg, cpu);
+ +
+ +      return cpu;
   }
   
- -#ifdef CONFIG_NUMA
   /*
- - * The init_sched_build_groups can't handle what we want to do with node
- - * groups, so roll our own. Now each node has its own list of groups which
- - * gets dynamically allocated.
+ + * build_sched_groups takes the sched_domain whose span we wish to cover
+ + * and uses get_group() to identify which group (along with its sched_group)
+ + * each CPU belongs to. get_group() returns a CPU number, so the CPUs
+ + * already placed in a group can be tracked with a struct cpumask.
+ + *
+ + * build_sched_groups will build a circular linked list of the groups
+ + * covered by the given span, and will set each group's ->cpumask correctly,
+ + * and ->cpu_power to 0.
    */
- -static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
- -static struct sched_group ***sched_group_nodes_bycpu;
- -
- -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
- -
- -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
- -                               struct sched_group **sg,
- -                               struct cpumask *nodemask)
- -{
- -      int group;
- -
- -      cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
- -      group = cpumask_first(nodemask);
- -
- -      if (sg)
- -              *sg = &per_cpu(sched_group_allnodes, group).sg;
- -      return group;
- -}
- -
- -static void init_numa_sched_groups_power(struct sched_group *group_head)
- -{
- -      struct sched_group *sg = group_head;
- -      int j;
- -
- -      if (!sg)
- -              return;
- -      do {
- -              for_each_cpu(j, sched_group_cpus(sg)) {
- -                      struct sched_domain *sd;
- -
- -                      sd = &per_cpu(phys_domains, j).sd;
- -                      if (j != group_first_cpu(sd->groups)) {
- -                              /*
- -                               * Only add "power" once for each
- -                               * physical package.
- -                               */
- -                              continue;
- -                      }
- -
- -                      sg->cpu_power += sd->groups->cpu_power;
- -              }
- -              sg = sg->next;
- -      } while (sg != group_head);
- -}
- -
- -static int build_numa_sched_groups(struct s_data *d,
- -                                 const struct cpumask *cpu_map, int num)
+ +static void
+ +build_sched_groups(struct sched_domain *sd)
   {
- -      struct sched_domain *sd;
- -      struct sched_group *sg, *prev;
- -      int n, j;
- -
- -      cpumask_clear(d->covered);
- -      cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
- -      if (cpumask_empty(d->nodemask)) {
- -              d->sched_group_nodes[num] = NULL;
- -              goto out;
- -      }
- -
- -      sched_domain_node_span(num, d->domainspan);
- -      cpumask_and(d->domainspan, d->domainspan, cpu_map);
- -
- -      sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- -                        GFP_KERNEL, num);
- -      if (!sg) {
- -              printk(KERN_WARNING "Can not alloc domain group for node %d\n",
- -                     num);
- -              return -ENOMEM;
- -      }
- -      d->sched_group_nodes[num] = sg;
- -
- -      for_each_cpu(j, d->nodemask) {
- -              sd = &per_cpu(node_domains, j).sd;
- -              sd->groups = sg;
- -      }
- -
- -      sg->cpu_power = 0;
- -      cpumask_copy(sched_group_cpus(sg), d->nodemask);
- -      sg->next = sg;
- -      cpumask_or(d->covered, d->covered, d->nodemask);
+ +      struct sched_group *first = NULL, *last = NULL;
+ +      struct sd_data *sdd = sd->private;
+ +      const struct cpumask *span = sched_domain_span(sd);
+ +      struct cpumask *covered;
+ +      int i;
   
- -      prev = sg;
- -      for (j = 0; j < nr_node_ids; j++) {
- -              n = (num + j) % nr_node_ids;
- -              cpumask_complement(d->notcovered, d->covered);
- -              cpumask_and(d->tmpmask, d->notcovered, cpu_map);
- -              cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
- -              if (cpumask_empty(d->tmpmask))
- -                      break;
- -              cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
- -              if (cpumask_empty(d->tmpmask))
- -                      continue;
- -              sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- -                                GFP_KERNEL, num);
- -              if (!sg) {
- -                      printk(KERN_WARNING
- -                             "Can not alloc domain group for node %d\n", j);
- -                      return -ENOMEM;
- -              }
- -              sg->cpu_power = 0;
- -              cpumask_copy(sched_group_cpus(sg), d->tmpmask);
- -              sg->next = prev->next;
- -              cpumask_or(d->covered, d->covered, d->tmpmask);
- -              prev->next = sg;
- -              prev = sg;
- -      }
- -out:
- -      return 0;
- -}
- -#endif /* CONFIG_NUMA */
+ +      lockdep_assert_held(&sched_domains_mutex);
+ +      covered = sched_domains_tmpmask;
   
- -#ifdef CONFIG_NUMA
- -/* Free memory allocated for various sched_group structures */
- -static void free_sched_groups(const struct cpumask *cpu_map,
- -                            struct cpumask *nodemask)
- -{
- -      int cpu, i;
+ +      cpumask_clear(covered);
   
- -      for_each_cpu(cpu, cpu_map) {
- -              struct sched_group **sched_group_nodes
- -                      = sched_group_nodes_bycpu[cpu];
+ +      for_each_cpu(i, span) {
+ +              struct sched_group *sg;
+ +              int group = get_group(i, sdd, &sg);
+ +              int j;
   
- -              if (!sched_group_nodes)
+ +              if (cpumask_test_cpu(i, covered))
                         continue;
   
- -              for (i = 0; i < nr_node_ids; i++) {
- -                      struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ +              cpumask_clear(sched_group_cpus(sg));
+ +              sg->cpu_power = 0;
   
- -                      cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- -                      if (cpumask_empty(nodemask))
+ +              for_each_cpu(j, span) {
+ +                      if (get_group(j, sdd, NULL) != group)
                                 continue;
   
- -                      if (sg == NULL)
- -                              continue;
- -                      sg = sg->next;
- -next_sg:
- -                      oldsg = sg;
- -                      sg = sg->next;
- -                      kfree(oldsg);
- -                      if (oldsg != sched_group_nodes[i])
- -                              goto next_sg;
+ +                      cpumask_set_cpu(j, covered);
+ +                      cpumask_set_cpu(j, sched_group_cpus(sg));
                 }
- -              kfree(sched_group_nodes);
- -              sched_group_nodes_bycpu[cpu] = NULL;
+ +
+ +              if (!first)
+ +                      first = sg;
+ +              if (last)
+ +                      last->next = sg;
+ +              last = sg;
         }
+ +      last->next = first;
   }
- -#else /* !CONFIG_NUMA */
- -static void free_sched_groups(const struct cpumask *cpu_map,
- -                            struct cpumask *nodemask)
- -{
- -}
- -#endif /* CONFIG_NUMA */
   
   /*
    * Initialize sched groups cpu_power.
@@@ -7040,6 -6965,11 +7040,6 @@@
    */
   static void init_sched_groups_power(int cpu, struct sched_domain *sd)
   {
- -      struct sched_domain *child;
- -      struct sched_group *group;
- -      long power;
- -      int weight;
- -
         WARN_ON(!sd || !sd->groups);
   
         if (cpu != group_first_cpu(sd->groups))
@@@ -7047,7 -6977,36 +7047,7 @@@
   
         sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
   
- -      child = sd->child;
- -
- -      sd->groups->cpu_power = 0;
- -
- -      if (!child) {
- -              power = SCHED_LOAD_SCALE;
- -              weight = cpumask_weight(sched_domain_span(sd));
- -              /*
- -               * SMT siblings share the power of a single core.
- -               * Usually multiple threads get a better yield out of
- -               * that one core than a single thread would have,
- -               * reflect that in sd->smt_gain.
- -               */
- -              if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- -                      power *= sd->smt_gain;
- -                      power /= weight;
- -                      power >>= SCHED_LOAD_SHIFT;
- -              }
- -              sd->groups->cpu_power += power;
- -              return;
- -      }
- -
- -      /*
- -       * Add cpu_power of each child group to this groups cpu_power.
- -       */
- -      group = child->groups;
- -      do {
- -              sd->groups->cpu_power += group->cpu_power;
- -              group = group->next;
- -      } while (group != child->groups);
+ +      update_group_power(sd, cpu);
   }
   
   /*
@@@ -7061,15 -7020,15 +7061,15 @@@
   # define SD_INIT_NAME(sd, type)               do { } while (0)
   #endif
   
- -#define       SD_INIT(sd, type)       sd_init_##type(sd)
- -
- -#define SD_INIT_FUNC(type)    \
- -static noinline void sd_init_##type(struct sched_domain *sd)  \
- -{                                                             \
- -      memset(sd, 0, sizeof(*sd));                             \
- -      *sd = SD_##type##_INIT;                                 \
- -      sd->level = SD_LV_##type;                               \
- -      SD_INIT_NAME(sd, type);                                 \
+ +#define SD_INIT_FUNC(type)                                            \
+ +static noinline struct sched_domain *                                 \
+ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu)       \
+ +{                                                                     \
+ +      struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
+ +      *sd = SD_##type##_INIT;                                         \
+ +      SD_INIT_NAME(sd, type);                                         \
+ +      sd->private = &tl->data;                                        \
+ +      return sd;                                                      \
   }
   
   SD_INIT_FUNC(CPU)
@@@ -7088,14 -7047,13 +7088,14 @@@
   #endif
   
   static int default_relax_domain_level = -1;
+ +int sched_domain_level_max;
   
   static int __init setup_relax_domain_level(char *str)
   {
         unsigned long val;
   
         val = simple_strtoul(str, NULL, 0);
- -      if (val < SD_LV_MAX)
+ +      if (val < sched_domain_level_max)
                 default_relax_domain_level = val;
   
         return 1;
@@@ -7123,20 -7081,37 +7123,20 @@@ static void set_domain_attribute(struc
         }
   }
   
+ +static void __sdt_free(const struct cpumask *cpu_map);
+ +static int __sdt_alloc(const struct cpumask *cpu_map);
+ +
   static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
                                  const struct cpumask *cpu_map)
   {
         switch (what) {
- -      case sa_sched_groups:
- -              free_sched_groups(cpu_map, d->tmpmask); /* fall through */
- -              d->sched_group_nodes = NULL;
         case sa_rootdomain:
- -              free_rootdomain(d->rd); /* fall through */
- -      case sa_tmpmask:
- -              free_cpumask_var(d->tmpmask); /* fall through */
- -      case sa_send_covered:
- -              free_cpumask_var(d->send_covered); /* fall through */
- -      case sa_this_book_map:
- -              free_cpumask_var(d->this_book_map); /* fall through */
- -      case sa_this_core_map:
- -              free_cpumask_var(d->this_core_map); /* fall through */
- -      case sa_this_sibling_map:
- -              free_cpumask_var(d->this_sibling_map); /* fall through */
- -      case sa_nodemask:
- -              free_cpumask_var(d->nodemask); /* fall through */
- -      case sa_sched_group_nodes:
- -#ifdef CONFIG_NUMA
- -              kfree(d->sched_group_nodes); /* fall through */
- -      case sa_notcovered:
- -              free_cpumask_var(d->notcovered); /* fall through */
- -      case sa_covered:
- -              free_cpumask_var(d->covered); /* fall through */
- -      case sa_domainspan:
- -              free_cpumask_var(d->domainspan); /* fall through */
- -#endif
+ +              if (!atomic_read(&d->rd->refcount))
+ +                      free_rootdomain(&d->rd->rcu); /* fall through */
+ +      case sa_sd:
+ +              free_percpu(d->sd); /* fall through */
+ +      case sa_sd_storage:
+ +              __sdt_free(cpu_map); /* fall through */
         case sa_none:
                 break;
         }
@@@ -7144,213 -7119,309 +7144,213 @@@
   
   static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
                                                    const struct cpumask *cpu_map)
- -{
- -#ifdef CONFIG_NUMA
- -      if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
- -              return sa_none;
- -      if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
- -              return sa_domainspan;
- -      if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
- -              return sa_covered;
- -      /* Allocate the per-node list of sched groups */
- -      d->sched_group_nodes = kcalloc(nr_node_ids,
- -                                    sizeof(struct sched_group *), GFP_KERNEL);
- -      if (!d->sched_group_nodes) {
- -              printk(KERN_WARNING "Can not alloc sched group node list\n");
- -              return sa_notcovered;
- -      }
- -      sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
- -#endif
- -      if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- -              return sa_sched_group_nodes;
- -      if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
- -              return sa_nodemask;
- -      if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
- -              return sa_this_sibling_map;
- -      if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
- -              return sa_this_core_map;
- -      if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
- -              return sa_this_book_map;
- -      if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
- -              return sa_send_covered;
- -      d->rd = alloc_rootdomain();
- -      if (!d->rd) {
- -              printk(KERN_WARNING "Cannot alloc root domain\n");
- -              return sa_tmpmask;
- -      }
- -      return sa_rootdomain;
- -}
- -
- -static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
- -{
- -      struct sched_domain *sd = NULL;
- -#ifdef CONFIG_NUMA
- -      struct sched_domain *parent;
- -
- -      d->sd_allnodes = 0;
- -      if (cpumask_weight(cpu_map) >
- -          SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
- -              sd = &per_cpu(allnodes_domains, i).sd;
- -              SD_INIT(sd, ALLNODES);
- -              set_domain_attribute(sd, attr);
- -              cpumask_copy(sched_domain_span(sd), cpu_map);
- -              cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
- -              d->sd_allnodes = 1;
- -      }
- -      parent = sd;
- -
- -      sd = &per_cpu(node_domains, i).sd;
- -      SD_INIT(sd, NODE);
- -      set_domain_attribute(sd, attr);
- -      sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
- -      sd->parent = parent;
- -      if (parent)
- -              parent->child = sd;
- -      cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
- -#endif
- -      return sd;
- -}
- -
- -static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- -      struct sched_domain *parent, int i)
- -{
- -      struct sched_domain *sd;
- -      sd = &per_cpu(phys_domains, i).sd;
- -      SD_INIT(sd, CPU);
- -      set_domain_attribute(sd, attr);
- -      cpumask_copy(sched_domain_span(sd), d->nodemask);
- -      sd->parent = parent;
- -      if (parent)
- -              parent->child = sd;
- -      cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
- -      return sd;
- -}
- -
- -static struct sched_domain *__build_book_sched_domain(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- -      struct sched_domain *parent, int i)
- -{
- -      struct sched_domain *sd = parent;
- -#ifdef CONFIG_SCHED_BOOK
- -      sd = &per_cpu(book_domains, i).sd;
- -      SD_INIT(sd, BOOK);
- -      set_domain_attribute(sd, attr);
- -      cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
- -      sd->parent = parent;
- -      parent->child = sd;
- -      cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
- -#endif
- -      return sd;
+ +{
+ +      /* Zero *d first, so d->sd and d->rd read as NULL until assigned below. */
+ +      memset(d, 0, sizeof(*d));
+ +
+ +      /*
+ +       * Every return value names the state __free_domain_allocs() has to
+ +       * unwind from.  A failing __sdt_alloc() still reports sa_sd_storage,
+ +       * which is the state that runs __sdt_free().
+ +       */
+ +      if (__sdt_alloc(cpu_map))
+ +              return sa_sd_storage;
+ +      d->sd = alloc_percpu(struct sched_domain *);
+ +      if (!d->sd)
+ +              return sa_sd_storage;
+ +      d->rd = alloc_rootdomain();
+ +      if (!d->rd)
+ +              return sa_sd;
+ +      return sa_rootdomain;
   }
   
- -static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- -      struct sched_domain *parent, int i)
+ +/*
+ + * Clear the sd_data slots holding the sched_domain and sched_group that
+ + * were used to build the domain hierarchy, so that the subsequent
+ + * __free_domain_allocs() does not free data that is still in use.
+ + */
+ +static void claim_allocations(int cpu, struct sched_domain *sd)
   {
- -      struct sched_domain *sd = parent;
- -#ifdef CONFIG_SCHED_MC
- -      sd = &per_cpu(core_domains, i).sd;
- -      SD_INIT(sd, MC);
- -      set_domain_attribute(sd, attr);
- -      cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
- -      sd->parent = parent;
- -      parent->child = sd;
- -      cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
- -#endif
- -      return sd;
+ +      struct sd_data *storage = sd->private;
+ +      struct sched_domain **sd_slot = per_cpu_ptr(storage->sd, cpu);
+ +      struct sched_group *group = sd->groups;
+ +      struct sched_group **sg_slot;
+ +
+ +      WARN_ON_ONCE(*sd_slot != sd);
+ +      *sd_slot = NULL;
+ +
+ +      /* The group slot is cleared only via the first cpu in the group's mask. */
+ +      if (cpu != cpumask_first(sched_group_cpus(group)))
+ +              return;
+ +
+ +      sg_slot = per_cpu_ptr(storage->sg, cpu);
+ +      WARN_ON_ONCE(*sg_slot != group);
+ +      *sg_slot = NULL;
   }
   
- -static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- -      struct sched_domain *parent, int i)
- -{
- -      struct sched_domain *sd = parent;
   #ifdef CONFIG_SCHED_SMT
- -      sd = &per_cpu(cpu_domains, i).sd;
- -      SD_INIT(sd, SIBLING);
- -      set_domain_attribute(sd, attr);
- -      cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
- -      sd->parent = parent;
- -      parent->child = sd;
- -      cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
- -#endif
- -      return sd;
+ +/* Mask function paired with sd_init_SIBLING: returns topology_thread_cpumask(cpu). */
+ +static const struct cpumask *cpu_smt_mask(int cpu)
+ +{
+ +      return topology_thread_cpumask(cpu);
   }
+ +#endif
   
- -static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
- -                             const struct cpumask *cpu_map, int cpu)
- -{
- -      switch (l) {
+ +/*
+ + * Topology list, bottom-up.
+ + *
+ + * Each entry pairs an sd_init_<type>() constructor with the function that
+ + * returns that level's cpumask for a cpu.  The trailing { NULL, } entry
+ + * ends the list for the loops that walk it while tl->init is set.
+ + */
+ +static struct sched_domain_topology_level default_topology[] = {
   #ifdef CONFIG_SCHED_SMT
- -      case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- -              cpumask_and(d->this_sibling_map, cpu_map,
- -                          topology_thread_cpumask(cpu));
- -              if (cpu == cpumask_first(d->this_sibling_map))
- -                      init_sched_build_groups(d->this_sibling_map, cpu_map,
- -                                              &cpu_to_cpu_group,
- -                                              d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_SIBLING, cpu_smt_mask, },
   #endif
   #ifdef CONFIG_SCHED_MC
- -      case SD_LV_MC: /* set up multi-core groups */
- -              cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
- -              if (cpu == cpumask_first(d->this_core_map))
- -                      init_sched_build_groups(d->this_core_map, cpu_map,
- -                                              &cpu_to_core_group,
- -                                              d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_MC, cpu_coregroup_mask, },
   #endif
   #ifdef CONFIG_SCHED_BOOK
- -      case SD_LV_BOOK: /* set up book groups */
- -              cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
- -              if (cpu == cpumask_first(d->this_book_map))
- -                      init_sched_build_groups(d->this_book_map, cpu_map,
- -                                              &cpu_to_book_group,
- -                                              d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_BOOK, cpu_book_mask, },
   #endif
- -      case SD_LV_CPU: /* set up physical groups */
- -              cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
- -              if (!cpumask_empty(d->nodemask))
- -                      init_sched_build_groups(d->nodemask, cpu_map,
- -                                              &cpu_to_phys_group,
- -                                              d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_CPU, cpu_cpu_mask, },
   #ifdef CONFIG_NUMA
- -      case SD_LV_ALLNODES:
- -              init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
- -                                      d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_NODE, cpu_node_mask, },
+ +      { sd_init_ALLNODES, cpu_allnodes_mask, },
   #endif
- -      default:
- -              break;
+ +      { NULL, },
+ +};
+ +
+ +static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+ +
+ +/*
+ + * For every topology level, allocate the per-cpu pointer arrays and, for
+ + * each cpu in @cpu_map, one zeroed sched_domain and one zeroed sched_group
+ + * (each with room for a trailing cpumask) on that cpu's node.
+ + *
+ + * Returns 0 on success, or -ENOMEM as soon as any allocation fails;
+ + * nothing is released here on failure.
+ + */
+ +static int __sdt_alloc(const struct cpumask *cpu_map)
+ +{
+ +      const size_t sd_size = sizeof(struct sched_domain) + cpumask_size();
+ +      const size_t sg_size = sizeof(struct sched_group) + cpumask_size();
+ +      struct sched_domain_topology_level *level;
+ +      int cpu;
+ +
+ +      for (level = sched_domain_topology; level->init; level++) {
+ +              struct sd_data *store = &level->data;
+ +
+ +              store->sd = alloc_percpu(struct sched_domain *);
+ +              if (!store->sd)
+ +                      return -ENOMEM;
+ +
+ +              store->sg = alloc_percpu(struct sched_group *);
+ +              if (!store->sg)
+ +                      return -ENOMEM;
+ +
+ +              for_each_cpu(cpu, cpu_map) {
+ +                      const int node = cpu_to_node(cpu);
+ +                      struct sched_domain *domain;
+ +                      struct sched_group *group;
+ +
+ +                      domain = kzalloc_node(sd_size, GFP_KERNEL, node);
+ +                      if (!domain)
+ +                              return -ENOMEM;
+ +                      *per_cpu_ptr(store->sd, cpu) = domain;
+ +
+ +                      group = kzalloc_node(sg_size, GFP_KERNEL, node);
+ +                      if (!group)
+ +                              return -ENOMEM;
+ +                      *per_cpu_ptr(store->sg, cpu) = group;
+ +              }
+ +      }
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Free whatever is still referenced from the topology levels' sd_data for
+ + * the cpus in @cpu_map, then the per-cpu pointer arrays themselves.
+ + *
+ + * __sdt_alloc() returns as soon as one allocation fails, so a level may
+ + * have no sd and/or sg array at all; such arrays are skipped instead of
+ + * being dereferenced.  The array pointers are reset to NULL once freed so
+ + * that a later call cannot find pointers to released memory in a level
+ + * that a failed __sdt_alloc() never reached.  Slots already NULLed by
+ + * claim_allocations() are harmless: kfree(NULL) is a no-op.
+ + */
+ +static void __sdt_free(const struct cpumask *cpu_map)
+ +{
+ +      struct sched_domain_topology_level *tl;
+ +      int j;
+ +
+ +      for (tl = sched_domain_topology; tl->init; tl++) {
+ +              struct sd_data *sdd = &tl->data;
+ +
+ +              for_each_cpu(j, cpu_map) {
+ +                      if (sdd->sd)
+ +                              kfree(*per_cpu_ptr(sdd->sd, j));
+ +                      if (sdd->sg)
+ +                              kfree(*per_cpu_ptr(sdd->sg, j));
+ +              }
+ +              free_percpu(sdd->sd);
+ +              sdd->sd = NULL;
+ +              free_percpu(sdd->sg);
+ +              sdd->sg = NULL;
         }
   }
   
+ +/*
+ + * Set up @cpu's domain for topology level @tl and stack it on top of
+ + * @child, which may be NULL.  The span is @cpu_map restricted to the
+ + * level's mask.  With a child present, the new domain sits one level
+ + * above it and sched_domain_level_max is raised if needed.
+ + *
+ + * Returns the new domain, or @child unchanged when tl->init() yields NULL.
+ + */
+ +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ +              struct s_data *d, const struct cpumask *cpu_map,
+ +              struct sched_domain_attr *attr, struct sched_domain *child,
+ +              int cpu)
+ +{
+ +      struct sched_domain *domain;
+ +
+ +      domain = tl->init(tl, cpu);
+ +      if (domain == NULL)
+ +              return child;
+ +
+ +      set_domain_attribute(domain, attr);
+ +      cpumask_and(sched_domain_span(domain), cpu_map, tl->mask(cpu));
+ +      domain->child = child;
+ +
+ +      /* Nothing below us: no level or parent link to update. */
+ +      if (child == NULL)
+ +              return domain;
+ +
+ +      domain->level = child->level + 1;
+ +      sched_domain_level_max = max(sched_domain_level_max, domain->level);
+ +      child->parent = domain;
+ +
+ +      return domain;
+ +}
+ +
   /*
    * Build sched domains for a given set of cpus and attach the sched domains
    * to the individual cpus
    */
- -static int __build_sched_domains(const struct cpumask *cpu_map,
- -                               struct sched_domain_attr *attr)
+ +static int build_sched_domains(const struct cpumask *cpu_map,
+ +                             struct sched_domain_attr *attr)
   {
         enum s_alloc alloc_state = sa_none;
- -      struct s_data d;
         struct sched_domain *sd;
- -      int i;
- -#ifdef CONFIG_NUMA
- -      d.sd_allnodes = 0;
- -#endif
+ +      struct s_data d;
+ +      int i, ret = -ENOMEM;
   
         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
         if (alloc_state != sa_rootdomain)
                 goto error;
- -      alloc_state = sa_sched_groups;
- -
- -      /*
- -       * Set up domains for cpus specified by the cpu_map.
- -       */
- -      for_each_cpu(i, cpu_map) {
- -              cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
- -                          cpu_map);
- -
- -              sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
- -              sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
- -              sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
- -              sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
- -              sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
- -      }
   
+ +      /* Set up domains for cpus specified by the cpu_map. */
         for_each_cpu(i, cpu_map) {
- -              build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
- -              build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
- -              build_sched_groups(&d, SD_LV_MC, cpu_map, i);
- -      }
- -
- -      /* Set up physical groups */
- -      for (i = 0; i < nr_node_ids; i++)
- -              build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+ +              struct sched_domain_topology_level *tl;
   
- -#ifdef CONFIG_NUMA
- -      /* Set up node groups */
- -      if (d.sd_allnodes)
- -              build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
+ +              /* Stack one domain per topology level, each on the previous one. */
+ +              sd = NULL;
+ +              for (tl = sched_domain_topology; tl->init; tl++)
+ +                      sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
   
- -      for (i = 0; i < nr_node_ids; i++)
- -              if (build_numa_sched_groups(&d, cpu_map, i))
- -                      goto error;
- -#endif
+ +              /* Walk back down so that d.sd records this cpu's lowest domain. */
+ +              while (sd->child)
+ +                      sd = sd->child;
   
- -      /* Calculate CPU power for physical packages and nodes */
- -#ifdef CONFIG_SCHED_SMT
- -      for_each_cpu(i, cpu_map) {
- -              sd = &per_cpu(cpu_domains, i).sd;
- -              init_sched_groups_power(i, sd);
- -      }
- -#endif
- -#ifdef CONFIG_SCHED_MC
- -      for_each_cpu(i, cpu_map) {
- -              sd = &per_cpu(core_domains, i).sd;
- -              init_sched_groups_power(i, sd);
- -      }
- -#endif
- -#ifdef CONFIG_SCHED_BOOK
- -      for_each_cpu(i, cpu_map) {
- -              sd = &per_cpu(book_domains, i).sd;
- -              init_sched_groups_power(i, sd);
+ +              *per_cpu_ptr(d.sd, i) = sd;
         }
- -#endif
   
+ +      /* Build the groups for the domains */
         for_each_cpu(i, cpu_map) {
- -              sd = &per_cpu(phys_domains, i).sd;
- -              init_sched_groups_power(i, sd);
- -      }
+ +              for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ +                      sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ +                      get_group(i, sd->private, &sd->groups);
+ +                      atomic_inc(&sd->groups->ref);
   
- -#ifdef CONFIG_NUMA
- -      for (i = 0; i < nr_node_ids; i++)
- -              init_numa_sched_groups_power(d.sched_group_nodes[i]);
+ +                      /* Only the first cpu of the span goes on to build its groups. */
+ +                      if (i != cpumask_first(sched_domain_span(sd)))
+ +                              continue;
   
- -      if (d.sd_allnodes) {
- -              struct sched_group *sg;
+ +                      build_sched_groups(sd);
+ +              }
+ +      }
+ +
+ +      /* Calculate CPU power for physical packages and nodes */
+ +      for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ +              if (!cpumask_test_cpu(i, cpu_map))
+ +                      continue;
   
- -              cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- -                                                              d.tmpmask);
- -              init_numa_sched_groups_power(sg);
+ +              for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ +                      claim_allocations(i, sd);
+ +                      init_sched_groups_power(i, sd);
+ +              }
         }
- -#endif
   
         /* Attach the domains */
+ +      rcu_read_lock();
         for_each_cpu(i, cpu_map) {
- -#ifdef CONFIG_SCHED_SMT
- -              sd = &per_cpu(cpu_domains, i).sd;
- -#elif defined(CONFIG_SCHED_MC)
- -              sd = &per_cpu(core_domains, i).sd;
- -#elif defined(CONFIG_SCHED_BOOK)
- -              sd = &per_cpu(book_domains, i).sd;
- -#else
- -              sd = &per_cpu(phys_domains, i).sd;
- -#endif
+ +              sd = *per_cpu_ptr(d.sd, i);
                 cpu_attach_domain(sd, d.rd, i);
         }
+ +      rcu_read_unlock();
   
- -      d.sched_group_nodes = NULL; /* don't free this we still need it */
- -      __free_domain_allocs(&d, sa_tmpmask, cpu_map);
- -      return 0;
- -
+ +      /*
+ +       * Success falls through to the error label as well:
+ +       * __free_domain_allocs() then releases only what claim_allocations()
+ +       * did not NULL out above.
+ +       */
+ +      ret = 0;
   error:
         __free_domain_allocs(&d, alloc_state, cpu_map);
- -      return -ENOMEM;
- -}
- -
- -static int build_sched_domains(const struct cpumask *cpu_map)
- -{
- -      return __build_sched_domains(cpu_map, NULL);
+ +      return ret;
   }
   
   static cpumask_var_t *doms_cur;       /* current sched domains */
@@@ -7405,7 -7476,7 +7405,7 @@@ void free_sched_domains(cpumask_var_t d
    * For now this just excludes isolated cpus, but could be used to
    * exclude other special cases in the future.
    */
- -static int arch_init_sched_domains(const struct cpumask *cpu_map)
+ +static int init_sched_domains(const struct cpumask *cpu_map)
   {
         int err;
   
@@@ -7416,24 -7487,32 +7416,24 @@@
                 doms_cur = &fallback_doms;
         cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
         dattr_cur = NULL;
- -      err = build_sched_domains(doms_cur[0]);
+ +      err = build_sched_domains(doms_cur[0], NULL);
         register_sched_domain_sysctl();
   
         return err;
   }
   
- -static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
- -                                     struct cpumask *tmpmask)
- -{
- -      free_sched_groups(cpu_map, tmpmask);
- -}
- -
   /*
    * Detach sched domains from a group of cpus specified in cpu_map
    * These cpus will now be attached to the NULL domain
    */
   static void detach_destroy_domains(const struct cpumask *cpu_map)
   {
- -      /* Save because hotplug lock held. */
- -      static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
         int i;
   
+ +      rcu_read_lock();
+ +      /* Hand every cpu in cpu_map to def_root_domain with no sched domain. */
         for_each_cpu(i, cpu_map)
                 cpu_attach_domain(NULL, &def_root_domain, i);
- -      synchronize_sched();
- -      arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+ +      rcu_read_unlock();
   }
   
   /* handle null as "default" */
@@@ -7522,7 -7601,8 +7522,7 @@@ match1
                                 goto match2;
                 }
                 /* no match - add a new doms_new */
- -              __build_sched_domains(doms_new[i],
- -                                      dattr_new ? dattr_new + i : NULL);
+ +              build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
   match2:
                 ;
         }
@@@ -7541,7 -7621,7 +7541,7 @@@
   }
   
   #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- -static void arch_reinit_sched_domains(void)
+ +static void reinit_sched_domains(void)
   {
         get_online_cpus();
   
@@@ -7574,7 -7654,7 +7574,7 @@@ static ssize_t sched_power_savings_stor
         else
                 sched_mc_power_savings = level;
   
- -      arch_reinit_sched_domains();
+ +      reinit_sched_domains();
   
         return count;
   }
@@@ -7693,9 -7773,14 +7693,9 @@@ void __init sched_init_smp(void
         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
         alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
   
- -#if defined(CONFIG_NUMA)
- -      sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
- -                                                              GFP_KERNEL);
- -      BUG_ON(sched_group_nodes_bycpu == NULL);
- -#endif
         get_online_cpus();
         mutex_lock(&sched_domains_mutex);
- -      arch_init_sched_domains(cpu_active_mask);
+ +      init_sched_domains(cpu_active_mask);
         cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
         if (cpumask_empty(non_isolated_cpus))
                 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@@ -7740,10 -7825,6 +7740,10 @@@ static void init_cfs_rq(struct cfs_rq *
         INIT_LIST_HEAD(&cfs_rq->tasks);
   #ifdef CONFIG_FAIR_GROUP_SCHED
         cfs_rq->rq = rq;
+ +      /* allow initial update_cfs_load() to truncate */
+ +#ifdef CONFIG_SMP
+ +      cfs_rq->load_stamp = 1;
+ +#endif
   #endif
         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
   }
@@@ -7786,16 -7867,18 +7786,16 @@@ static void init_rt_rq(struct rt_rq *rt
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
- -                              struct sched_entity *se, int cpu, int add,
+ +                              struct sched_entity *se, int cpu,
                                 struct sched_entity *parent)
   {
         struct rq *rq = cpu_rq(cpu);
         tg->cfs_rq[cpu] = cfs_rq;
         init_cfs_rq(cfs_rq, rq);
         cfs_rq->tg = tg;
- -      if (add)
- -              list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
   
         tg->se[cpu] = se;
- -      /* se could be NULL for init_task_group */
+ +      /* se could be NULL for root_task_group */
         if (!se)
                 return;
   
@@@ -7805,14 -7888,15 +7805,14 @@@
                 se->cfs_rq = parent->my_q;
   
         se->my_q = cfs_rq;
- -      se->load.weight = tg->shares;
- -      se->load.inv_weight = 0;
+ +      update_load_set(&se->load, 0);
         se->parent = parent;
   }
   #endif
   
   #ifdef CONFIG_RT_GROUP_SCHED
   static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- -              struct sched_rt_entity *rt_se, int cpu, int add,
+ +              struct sched_rt_entity *rt_se, int cpu,
                 struct sched_rt_entity *parent)
   {
         struct rq *rq = cpu_rq(cpu);
@@@ -7821,6 -7905,8 +7821,6 @@@
         init_rt_rq(rt_rq, rq);
         rt_rq->tg = tg;
         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- -      if (add)
- -              list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
   
         tg->rt_se[cpu] = rt_se;
         if (!rt_se)
@@@ -7855,18 -7941,18 +7855,18 @@@ void __init sched_init(void
                 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -              init_task_group.se = (struct sched_entity **)ptr;
+ +              root_task_group.se = (struct sched_entity **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
   
- -              init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+ +              root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
   
   #endif /* CONFIG_FAIR_GROUP_SCHED */
   #ifdef CONFIG_RT_GROUP_SCHED
- -              init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ +              root_task_group.rt_se = (struct sched_rt_entity **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
   
- -              init_task_group.rt_rq = (struct rt_rq **)ptr;
+ +              root_task_group.rt_rq = (struct rt_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
   
   #endif /* CONFIG_RT_GROUP_SCHED */
@@@ -7886,16 -7972,20 +7886,16 @@@
                         global_rt_period(), global_rt_runtime());
   
   #ifdef CONFIG_RT_GROUP_SCHED
- -      init_rt_bandwidth(&init_task_group.rt_bandwidth,
+ +      init_rt_bandwidth(&root_task_group.rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
   #endif /* CONFIG_RT_GROUP_SCHED */
   
   #ifdef CONFIG_CGROUP_SCHED
- -      list_add(&init_task_group.list, &task_groups);
- -      INIT_LIST_HEAD(&init_task_group.children);
- -
+ +      list_add(&root_task_group.list, &task_groups);
+ +      INIT_LIST_HEAD(&root_task_group.children);
+ +      autogroup_init(&init_task);
   #endif /* CONFIG_CGROUP_SCHED */
   
- -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
- -      update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
- -                                          __alignof__(unsigned long));
- -#endif
         for_each_possible_cpu(i) {
                 struct rq *rq;
   
@@@ -7907,34 -7997,38 +7907,34 @@@
                 init_cfs_rq(&rq->cfs, rq);
                 init_rt_rq(&rq->rt, rq);
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -              init_task_group.shares = init_task_group_load;
+ +              root_task_group.shares = root_task_group_load;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
- -#ifdef CONFIG_CGROUP_SCHED
                 /*
- -               * How much cpu bandwidth does init_task_group get?
+ +               * How much cpu bandwidth does root_task_group get?
                  *
                  * In case of task-groups formed thr' the cgroup filesystem, it
                  * gets 100% of the cpu resources in the system. This overall
                  * system cpu resource is divided among the tasks of
- -               * init_task_group and its child task-groups in a fair manner,
+ +               * root_task_group and its child task-groups in a fair manner,
                  * based on each entity's (task or task-group's) weight
                  * (se->load.weight).
                  *
- -               * In other words, if init_task_group has 10 tasks of weight
+ +               * In other words, if root_task_group has 10 tasks of weight
                  * 1024) and two child groups A0 and A1 (of weight 1024 each),
                  * then A0's share of the cpu resource is:
                  *
                  *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                  *
- -               * We achieve this by letting init_task_group's tasks sit
- -               * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+ +               * We achieve this by letting root_task_group's tasks sit
+ +               * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                  */
- -              init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
- -#endif
+ +              init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
   #endif /* CONFIG_FAIR_GROUP_SCHED */
   
                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
   #ifdef CONFIG_RT_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
- -#ifdef CONFIG_CGROUP_SCHED
- -              init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
- -#endif
+ +              init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
   #endif
   
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@@ -7945,7 -8039,7 +7945,7 @@@
   #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
- -              rq->cpu_power = SCHED_LOAD_SCALE;
+ +              rq->cpu_power = SCHED_POWER_SCALE;
                 rq->post_schedule = 0;
                 rq->active_balance = 0;
                 rq->next_balance = jiffies;
@@@ -8002,7 -8096,6 +8002,7 @@@
         /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
         zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
   #ifdef CONFIG_SMP
+ +      zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
   #ifdef CONFIG_NO_HZ
         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
         alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@@ -8015,6 -8108,8 +8015,6 @@@
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
   #endif /* SMP */
   
- -      perf_event_init();
- -
         scheduler_running = 1;
   }
   
@@@ -8023,7 -8118,7 +8023,7 @@@ static inline int preempt_count_equals(
   {
         int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
   
- -      return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+ +      return (nested == preempt_offset);
   }
   
   void __might_sleep(const char *file, int line, int preempt_offset)
@@@ -8058,11 -8153,9 +8058,11 @@@ EXPORT_SYMBOL(__might_sleep)
   #ifdef CONFIG_MAGIC_SYSRQ
   static void normalize_task(struct rq *rq, struct task_struct *p)
   {
+ +      const struct sched_class *prev_class = p->sched_class;
+ +      int old_prio = p->prio;
         int on_rq;
   
- -      on_rq = p->se.on_rq;
+ +      on_rq = p->on_rq;
         if (on_rq)
                 deactivate_task(rq, p, 0);
         __setscheduler(rq, p, SCHED_NORMAL, 0);
@@@ -8070,8 -8163,6 +8070,8 @@@
                 activate_task(rq, p, 0);
                 resched_task(rq->curr);
         }
+ +
+ +      check_class_changed(rq, p, prev_class, old_prio);
   }
   
   void normalize_rt_tasks(void)
@@@ -8187,6 -8278,7 +8187,6 @@@ int alloc_fair_sched_group(struct task_
   {
         struct cfs_rq *cfs_rq;
         struct sched_entity *se;
- -      struct rq *rq;
         int i;
   
         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@@ -8199,6 -8291,8 +8199,6 @@@
         tg->shares = NICE_0_LOAD;
   
         for_each_possible_cpu(i) {
- -              rq = cpu_rq(i);
- -
                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                       GFP_KERNEL, cpu_to_node(i));
                 if (!cfs_rq)
@@@ -8209,7 -8303,7 +8209,7 @@@
                 if (!se)
                         goto err_free_rq;
   
- -              init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+ +              init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
         }
   
         return 1;
@@@ -8220,21 -8314,15 +8220,21 @@@ err
         return 0;
   }
   
- -static inline void register_fair_sched_group(struct task_group *tg, int cpu)
- -{
- -      list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
- -                      &cpu_rq(cpu)->leaf_cfs_rq_list);
- -}
- -
   static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
   {
- -      list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+ +      struct rq *rq = cpu_rq(cpu);
+ +      unsigned long flags;
+ +
+ +      /*
+ +      * Only empty task groups can be destroyed; so we can speculatively
+ +      * check on_list without danger of it being re-added.
+ +      */
+ +      if (!tg->cfs_rq[cpu]->on_list)
+ +              return;
+ +
+ +      raw_spin_lock_irqsave(&rq->lock, flags);
+ +      list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ +      raw_spin_unlock_irqrestore(&rq->lock, flags);
   }
   #else /* !CONFIG_FAIR_GROUP_SCHED */
   static inline void free_fair_sched_group(struct task_group *tg)
@@@ -8247,6 -8335,10 +8247,6 @@@ int alloc_fair_sched_group(struct task_
         return 1;
   }
   
- -static inline void register_fair_sched_group(struct task_group *tg, int cpu)
- -{
- -}
- -
   static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
   {
   }
@@@ -8275,6 -8367,7 +8275,6 @@@ int alloc_rt_sched_group(struct task_gr
   {
         struct rt_rq *rt_rq;
         struct sched_rt_entity *rt_se;
- -      struct rq *rq;
         int i;
   
         tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@@ -8288,6 -8381,8 +8288,6 @@@
                         ktime_to_ns(def_rt_bandwidth.rt_period), 0);
   
         for_each_possible_cpu(i) {
- -              rq = cpu_rq(i);
- -
                 rt_rq = kzalloc_node(sizeof(struct rt_rq),
                                      GFP_KERNEL, cpu_to_node(i));
                 if (!rt_rq)
@@@ -8298,7 -8393,7 +8298,7 @@@
                 if (!rt_se)
                         goto err_free_rq;
   
- -              init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+ +              init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
         }
   
         return 1;
@@@ -8308,6 -8403,17 +8308,6 @@@ err_free_rq
   err:
         return 0;
   }
- -
- -static inline void register_rt_sched_group(struct task_group *tg, int cpu)
- -{
- -      list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
- -                      &cpu_rq(cpu)->leaf_rt_rq_list);
- -}
- -
- -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
- -{
- -      list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
- -}
   #else /* !CONFIG_RT_GROUP_SCHED */
   static inline void free_rt_sched_group(struct task_group *tg)
   {
@@@ -8318,6 -8424,14 +8318,6 @@@ int alloc_rt_sched_group(struct task_gr
   {
         return 1;
   }
- -
- -static inline void register_rt_sched_group(struct task_group *tg, int cpu)
- -{
- -}
- -
- -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
- -{
- -}
   #endif /* CONFIG_RT_GROUP_SCHED */
   
   #ifdef CONFIG_CGROUP_SCHED
@@@ -8325,7 -8439,6 +8325,7 @@@ static void free_sched_group(struct tas
   {
         free_fair_sched_group(tg);
         free_rt_sched_group(tg);
+ +      autogroup_free(tg);
         kfree(tg);
   }
   
@@@ -8334,6 -8447,7 +8334,6 @@@ struct task_group *sched_create_group(s
   {
         struct task_group *tg;
         unsigned long flags;
- -      int i;
   
         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
         if (!tg)
@@@ -8346,6 -8460,10 +8346,6 @@@
                 goto err;
   
         spin_lock_irqsave(&task_group_lock, flags);
- -      for_each_possible_cpu(i) {
- -              register_fair_sched_group(tg, i);
- -              register_rt_sched_group(tg, i);
- -      }
         list_add_rcu(&tg->list, &task_groups);
   
         WARN_ON(!parent); /* root should already exist */
@@@ -8375,11 -8493,11 +8375,11 @@@ void sched_destroy_group(struct task_gr
         unsigned long flags;
         int i;
   
- -      spin_lock_irqsave(&task_group_lock, flags);
- -      for_each_possible_cpu(i) {
+ +      /* end participation in shares distribution */
+ +      for_each_possible_cpu(i)
                 unregister_fair_sched_group(tg, i);
- -              unregister_rt_sched_group(tg, i);
- -      }
+ +
+ +      spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
@@@ -8402,7 -8520,7 +8402,7 @@@ void sched_move_task(struct task_struc
         rq = task_rq_lock(tsk, &flags);
   
         running = task_current(rq, tsk);
- -      on_rq = tsk->se.on_rq;
+ +      on_rq = tsk->on_rq;
   
         if (on_rq)
                 dequeue_task(rq, tsk, 0);
@@@ -8421,11 -8539,38 +8421,11 @@@
         if (on_rq)
                 enqueue_task(rq, tsk, 0);
   
- -      task_rq_unlock(rq, &flags);
+ +      task_rq_unlock(rq, tsk, &flags);
   }
   #endif /* CONFIG_CGROUP_SCHED */
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -static void __set_se_shares(struct sched_entity *se, unsigned long shares)
- -{
- -      struct cfs_rq *cfs_rq = se->cfs_rq;
- -      int on_rq;
- -
- -      on_rq = se->on_rq;
- -      if (on_rq)
- -              dequeue_entity(cfs_rq, se, 0);
- -
- -      se->load.weight = shares;
- -      se->load.inv_weight = 0;
- -
- -      if (on_rq)
- -              enqueue_entity(cfs_rq, se, 0);
- -}
- -
- -static void set_se_shares(struct sched_entity *se, unsigned long shares)
- -{
- -      struct cfs_rq *cfs_rq = se->cfs_rq;
- -      struct rq *rq = cfs_rq->rq;
- -      unsigned long flags;
- -
- -      raw_spin_lock_irqsave(&rq->lock, flags);
- -      __set_se_shares(se, shares);
- -      raw_spin_unlock_irqrestore(&rq->lock, flags);
- -}
- -
   static DEFINE_MUTEX(shares_mutex);
   
   int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@@ -8448,19 -8593,37 +8448,19 @@@
         if (tg->shares == shares)
                 goto done;
   
- -      spin_lock_irqsave(&task_group_lock, flags);
- -      for_each_possible_cpu(i)
- -              unregister_fair_sched_group(tg, i);
- -      list_del_rcu(&tg->siblings);
- -      spin_unlock_irqrestore(&task_group_lock, flags);
- -
- -      /* wait for any ongoing reference to this group to finish */
- -      synchronize_sched();
- -
- -      /*
- -       * Now we are free to modify the group's share on each cpu
- -       * w/o tripping rebalance_share or load_balance_fair.
- -       */
         tg->shares = shares;
         for_each_possible_cpu(i) {
- -              /*
- -               * force a rebalance
- -               */
- -              cfs_rq_set_shares(tg->cfs_rq[i], 0);
- -              set_se_shares(tg->se[i], shares);
+ +              struct rq *rq = cpu_rq(i);
+ +              struct sched_entity *se;
+ +
+ +              se = tg->se[i];
+ +              /* Propagate contribution to hierarchy */
+ +              raw_spin_lock_irqsave(&rq->lock, flags);
+ +              for_each_sched_entity(se)
+ +                      update_cfs_shares(group_cfs_rq(se));
+ +              raw_spin_unlock_irqrestore(&rq->lock, flags);
         }
   
- -      /*
- -       * Enable load balance activity on this group, by inserting it back on
- -       * each cpu's rq->leaf_cfs_rq_list.
- -       */
- -      spin_lock_irqsave(&task_group_lock, flags);
- -      for_each_possible_cpu(i)
- -              register_fair_sched_group(tg, i);
- -      list_add_rcu(&tg->siblings, &tg->parent->children);
- -      spin_unlock_irqrestore(&task_group_lock, flags);
   done:
         mutex_unlock(&shares_mutex);
         return 0;
@@@ -8759,7 -8922,7 +8759,7 @@@ cpu_cgroup_create(struct cgroup_subsys 
   
         if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
- -              return &init_task_group.css;
+ +              return &root_task_group.css;
         }
   
         parent = cgroup_tg(cgrp->parent);
@@@ -8792,39 -8955,56 +8792,39 @@@ cpu_cgroup_can_attach_task(struct cgrou
         return 0;
   }
   
- -static int
- -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- -                    struct task_struct *tsk, bool threadgroup)
+ +static void
+ +cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
   {
- -      int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- -      if (retval)
- -              return retval;
- -      if (threadgroup) {
- -              struct task_struct *c;
- -              rcu_read_lock();
- -              list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- -                      retval = cpu_cgroup_can_attach_task(cgrp, c);
- -                      if (retval) {
- -                              rcu_read_unlock();
- -                              return retval;
- -                      }
- -              }
- -              rcu_read_unlock();
- -      }
- -      return 0;
+ +      sched_move_task(tsk);
   }
   
   static void
- -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- -                struct cgroup *old_cont, struct task_struct *tsk,
- -                bool threadgroup)
+ +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ +              struct cgroup *old_cgrp, struct task_struct *task)
   {
- -      sched_move_task(tsk);
- -      if (threadgroup) {
- -              struct task_struct *c;
- -              rcu_read_lock();
- -              list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- -                      sched_move_task(c);
- -              }
- -              rcu_read_unlock();
- -      }
+ +      /*
+ +       * cgroup_exit() is called in the copy_process() failure path.
+ +       * Ignore this case since the task hasn't run yet; this avoids
+ +       * trying to poke a half freed task state from generic code.
+ +       */
+ +      if (!(task->flags & PF_EXITING))
+ +              return;
+ +
+ +      sched_move_task(task);
   }
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
                                 u64 shareval)
   {
- -      return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+ +      return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
   }
   
   static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
   {
         struct task_group *tg = cgroup_tg(cgrp);
   
- -      return (u64) tg->shares;
+ +      return (u64) scale_load_down(tg->shares);
   }
   #endif /* CONFIG_FAIR_GROUP_SCHED */
   
@@@ -8883,9 -9063,8 +8883,9 @@@ struct cgroup_subsys cpu_cgroup_subsys 
         .name           = "cpu",
         .create         = cpu_cgroup_create,
         .destroy        = cpu_cgroup_destroy,
- -      .can_attach     = cpu_cgroup_can_attach,
- -      .attach         = cpu_cgroup_attach,
+ +      .can_attach_task = cpu_cgroup_can_attach_task,
+ +      .attach_task    = cpu_cgroup_attach_task,
+ +      .exit           = cpu_cgroup_exit,
         .populate       = cpu_cgroup_populate,
         .subsys_id      = cpu_cgroup_subsys_id,
         .early_init     = 1,
@@@ -9170,3 -9349,72 +9170,3 @@@ struct cgroup_subsys cpuacct_subsys = 
   };
   #endif        /* CONFIG_CGROUP_CPUACCT */
   
- -#ifndef CONFIG_SMP
- -
- -void synchronize_sched_expedited(void)
- -{
- -      barrier();
- -}
- -EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
- -
- -#else /* #ifndef CONFIG_SMP */
- -
- -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
- -
- -static int synchronize_sched_expedited_cpu_stop(void *data)
- -{
- -      /*
- -       * There must be a full memory barrier on each affected CPU
- -       * between the time that try_stop_cpus() is called and the
- -       * time that it returns.
- -       *
- -       * In the current initial implementation of cpu_stop, the
- -       * above condition is already met when the control reaches
- -       * this point and the following smp_mb() is not strictly
- -       * necessary.  Do smp_mb() anyway for documentation and
- -       * robustness against future implementation changes.
- -       */
- -      smp_mb(); /* See above comment block. */
- -      return 0;
- -}
- -
- -/*
- - * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- - * approach to force grace period to end quickly.  This consumes
- - * significant time on all CPUs, and is thus not recommended for
- - * any sort of common-case code.
- - *
- - * Note that it is illegal to call this function while holding any
- - * lock that is acquired by a CPU-hotplug notifier.  Failing to
- - * observe this restriction will result in deadlock.
- - */
- -void synchronize_sched_expedited(void)
- -{
- -      int snap, trycount = 0;
- -
- -      smp_mb();  /* ensure prior mod happens before capturing snap. */
- -      snap = atomic_read(&synchronize_sched_expedited_count) + 1;
- -      get_online_cpus();
- -      while (try_stop_cpus(cpu_online_mask,
- -                           synchronize_sched_expedited_cpu_stop,
- -                           NULL) == -EAGAIN) {
- -              put_online_cpus();
- -              if (trycount++ < 10)
- -                      udelay(trycount * num_online_cpus());
- -              else {
- -                      synchronize_sched();
- -                      return;
- -              }
- -              if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
- -                      smp_mb(); /* ensure test happens before caller kfree */
- -                      return;
- -              }
- -              get_online_cpus();
- -      }
- -      atomic_inc(&synchronize_sched_expedited_count);
- -      smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
- -      put_online_cpus();
- -}
- -EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
- -
- -#endif /* #else #ifndef CONFIG_SMP */
diff --combined kernel/sched_rt.c

index 88725c9,c2266c4..08e9374
--- 1/kernel/sched_rt.c
--- 2/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@@ -183,25 -183,6 +183,25 @@@ static inline u64 sched_rt_period(struc
         return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
   }
   
+ +typedef struct task_group *rt_rq_iter_t;
+ +
+ +#define for_each_rt_rq(rt_rq, iter, rq) \
+ +      for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
+ +           (&iter->list != &task_groups) && \
+ +           (rt_rq = iter->rt_rq[cpu_of(rq)]); \
+ +           iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
+ +
+ +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+ +{
+ +      list_add_rcu(&rt_rq->leaf_rt_rq_list,
+ +                      &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+ +}
+ +
+ +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+ +{
+ +      list_del_rcu(&rt_rq->leaf_rt_rq_list);
+ +}
+ +
   #define for_each_leaf_rt_rq(rt_rq, rq) \
         list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
   
@@@ -218,12 -199,11 +218,12 @@@ static void dequeue_rt_entity(struct sc
   
   static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
   {
- -      int this_cpu = smp_processor_id();
         struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
         struct sched_rt_entity *rt_se;
   
- -      rt_se = rt_rq->tg->rt_se[this_cpu];
+ +      int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+ +
+ +      rt_se = rt_rq->tg->rt_se[cpu];
   
         if (rt_rq->rt_nr_running) {
                 if (rt_se && !on_rt_rq(rt_se))
@@@ -235,10 -215,10 +235,10 @@@
   
   static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
   {
- -      int this_cpu = smp_processor_id();
         struct sched_rt_entity *rt_se;
+ +      int cpu = cpu_of(rq_of_rt_rq(rt_rq));
   
- -      rt_se = rt_rq->tg->rt_se[this_cpu];
+ +      rt_se = rt_rq->tg->rt_se[cpu];
   
         if (rt_se && on_rt_rq(rt_se))
                 dequeue_rt_entity(rt_se);
@@@ -296,19 -276,6 +296,19 @@@ static inline u64 sched_rt_period(struc
         return ktime_to_ns(def_rt_bandwidth.rt_period);
   }
   
+ +typedef struct rt_rq *rt_rq_iter_t;
+ +
+ +#define for_each_rt_rq(rt_rq, iter, rq) \
+ +      for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+ +
+ +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+ +{
+ +}
+ +
+ +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+ +{
+ +}
+ +
   #define for_each_leaf_rt_rq(rt_rq, rq) \
         for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
   
@@@ -415,13 -382,12 +415,13 @@@ next
   static void __disable_runtime(struct rq *rq)
   {
         struct root_domain *rd = rq->rd;
+ +      rt_rq_iter_t iter;
         struct rt_rq *rt_rq;
   
         if (unlikely(!scheduler_running))
                 return;
   
- -      for_each_leaf_rt_rq(rt_rq, rq) {
+ +      for_each_rt_rq(rt_rq, iter, rq) {
                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
                 s64 want;
                 int i;
@@@ -501,7 -467,6 +501,7 @@@ static void disable_runtime(struct rq *
   
   static void __enable_runtime(struct rq *rq)
   {
+ +      rt_rq_iter_t iter;
         struct rt_rq *rt_rq;
   
         if (unlikely(!scheduler_running))
@@@ -510,7 -475,7 +510,7 @@@
         /*
          * Reset each runqueue's bandwidth settings
          */
- -      for_each_leaf_rt_rq(rt_rq, rq) {
+ +      for_each_rt_rq(rt_rq, iter, rq) {
                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
   
                 raw_spin_lock(&rt_b->rt_runtime_lock);
@@@ -577,22 -542,12 +577,22 @@@ static int do_sched_rt_period_timer(str
                         if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
                                 rt_rq->rt_throttled = 0;
                                 enqueue = 1;
+ +
+ +                              /*
+ +                               * Force a clock update if the CPU was idle,
+ +                               * lest wakeup -> unthrottle time accumulate.
+ +                               */
+ +                              if (rt_rq->rt_nr_running && rq->curr == rq->idle)
+ +                                      rq->skip_clock_update = -1;
                         }
                         if (rt_rq->rt_time || rt_rq->rt_nr_running)
                                 idle = 0;
                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
- -              } else if (rt_rq->rt_nr_running)
+ +              } else if (rt_rq->rt_nr_running) {
                         idle = 0;
+ +                      if (!rt_rq_throttled(rt_rq))
+ +                              enqueue = 1;
+ +              }
   
                 if (enqueue)
                         sched_rt_rq_enqueue(rt_rq);
@@@ -651,7 -606,7 +651,7 @@@ static void update_curr_rt(struct rq *r
         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
         u64 delta_exec;
   
- -      if (!task_has_rt_policy(curr))
+ +      if (curr->sched_class != &rt_sched_class)
                 return;
   
         delta_exec = rq->clock_task - curr->se.exec_start;
@@@ -870,9 -825,6 +870,9 @@@ static void __enqueue_rt_entity(struct 
         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                 return;
   
+ +      if (!rt_rq->rt_nr_running)
+ +              list_add_leaf_rt_rq(rt_rq);
+ +
         if (head)
                 list_add(&rt_se->run_list, queue);
         else
@@@ -892,8 -844,6 +892,8 @@@ static void __dequeue_rt_entity(struct 
                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
   
         dec_rt_tasks(rt_se, rt_rq);
+ +      if (!rt_rq->rt_nr_running)
+ +              list_del_leaf_rt_rq(rt_rq);
   }
   
   /*
@@@ -999,23 -949,13 +999,23 @@@ static void yield_task_rt(struct rq *rq
   static int find_lowest_rq(struct task_struct *task);
   
   static int
- -select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+ +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
   {
+ +      struct task_struct *curr;
+ +      struct rq *rq;
+ +      int cpu;
+ +
         if (sd_flag != SD_BALANCE_WAKE)
                 return smp_processor_id();
   
+ +      cpu = task_cpu(p);
+ +      rq = cpu_rq(cpu);
+ +
+ +      rcu_read_lock();
+ +      curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ +
         /*
- -       * If the current task is an RT task, then
+ +       * If the current task on @p's runqueue is an RT task, then
          * try to see if we can wake this RT task up on another
          * runqueue. Otherwise simply start this RT task
          * on its current runqueue.
@@@ -1029,25 -969,21 +1029,25 @@@
          * lock?
          *
          * For equal prio tasks, we just let the scheduler sort it out.
+ +       *
+ +       * Otherwise, just let it ride on the affined RQ and the
+ +       * post-schedule router will push the preempted task away.
+ +       *
+ +       * This test is optimistic; if we get it wrong the load-balancer
+ +       * will have to sort it out.
          */
- -      if (unlikely(rt_task(rq->curr)) &&
- -          (rq->curr->rt.nr_cpus_allowed < 2 ||
- -           rq->curr->prio < p->prio) &&
+ +      if (curr && unlikely(rt_task(curr)) &&
+ +          (curr->rt.nr_cpus_allowed < 2 ||
+ +           curr->prio < p->prio) &&
             (p->rt.nr_cpus_allowed > 1)) {
- -              int cpu = find_lowest_rq(p);
+ +              int target = find_lowest_rq(p);
   
- -              return (cpu == -1) ? task_cpu(p) : cpu;
+ +              if (target != -1)
+ +                      cpu = target;
         }
+ +      rcu_read_unlock();
   
- -      /*
- -       * Otherwise, just let it ride on the affined RQ and the
- -       * post-schedule router will push the preempted task away
- -       */
- -      return task_cpu(p);
+ +      return cpu;
   }
   
   static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
@@@ -1126,7 -1062,7 +1126,7 @@@ static struct task_struct *_pick_next_t
   
         rt_rq = &rq->rt;
   
-       if (unlikely(!rt_rq->rt_nr_running))
+       if (!rt_rq->rt_nr_running)
                 return NULL;
   
         if (rt_rq_throttled(rt_rq))
@@@ -1172,7 -1108,7 +1172,7 @@@ static void put_prev_task_rt(struct rq 
          * The previous task needs to be made eligible for pushing
          * if it is still active
          */
- -      if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ +      if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
                 enqueue_pushable_task(rq, p);
   }
   
@@@ -1263,7 -1199,6 +1263,7 @@@ static int find_lowest_rq(struct task_s
         if (!cpumask_test_cpu(this_cpu, lowest_mask))
                 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
   
+ +      rcu_read_lock();
         for_each_domain(cpu, sd) {
                 if (sd->flags & SD_WAKE_AFFINE) {
                         int best_cpu;
@@@ -1273,20 -1208,15 +1273,20 @@@
                          * remote processor.
                          */
                         if (this_cpu != -1 &&
- -                          cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+ +                          cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ +                              rcu_read_unlock();
                                 return this_cpu;
+ +                      }
   
                         best_cpu = cpumask_first_and(lowest_mask,
                                                      sched_domain_span(sd));
- -                      if (best_cpu < nr_cpu_ids)
+ +                      if (best_cpu < nr_cpu_ids) {
+ +                              rcu_read_unlock();
                                 return best_cpu;
+ +                      }
                 }
         }
+ +      rcu_read_unlock();
   
         /*
          * And finally, if there were no matches within the domains
@@@ -1329,7 -1259,7 +1329,7 @@@ static struct rq *find_lock_lowest_rq(s
                                      !cpumask_test_cpu(lowest_rq->cpu,
                                                        &task->cpus_allowed) ||
                                      task_running(rq, task) ||
- -                                   !task->se.on_rq)) {
+ +                                   !task->on_rq)) {
   
                                 raw_spin_unlock(&lowest_rq->lock);
                                 lowest_rq = NULL;
@@@ -1363,7 -1293,7 +1363,7 @@@ static struct task_struct *pick_next_pu
         BUG_ON(task_current(rq, p));
         BUG_ON(p->rt.nr_cpus_allowed <= 1);
   
- -      BUG_ON(!p->se.on_rq);
+ +      BUG_ON(!p->on_rq);
         BUG_ON(!rt_task(p));
   
         return p;
@@@ -1420,7 -1350,7 +1420,7 @@@ retry
                 task = pick_next_pushable_task(rq);
                 if (task_cpu(next_task) == rq->cpu && task == next_task) {
                         /*
- -                       * If we get here, the task hasnt moved at all, but
+ +                       * If we get here, the task hasn't moved at all, but
                          * it has failed to push.  We will not try again,
                          * since the other cpus will pull from us when they
                          * are ready.
@@@ -1509,7 -1439,7 +1509,7 @@@ static int pull_rt_task(struct rq *this
                  */
                 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
                         WARN_ON(p == src_rq->curr);
- -                      WARN_ON(!p->se.on_rq);
+ +                      WARN_ON(!p->on_rq);
   
                         /*
                          * There's a chance that p is higher in priority
@@@ -1530,7 -1460,7 +1530,7 @@@
                         /*
                          * We continue with the search, just in
                          * case there's an even higher prio task
- -                       * in another runqueue. (low likelyhood
+ +                       * in another runqueue. (low likelihood
                          * but possible)
                          */
                 }
@@@ -1544,7 -1474,7 +1544,7 @@@ skip
   static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
   {
         /* Try to pull RT tasks here if we lower this rq's prio */
-       if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
+       if (rq->rt.highest_prio.curr > prev->prio)
                 pull_rt_task(rq);
   }
   
@@@ -1580,7 -1510,7 +1580,7 @@@ static void set_cpus_allowed_rt(struct 
          * Update the migration status of the RQ if we have an RT task
          * which is running AND changing its weight value.
          */
- -      if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
+ +      if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
                 struct rq *rq = task_rq(p);
   
                 if (!task_current(rq, p)) {
@@@ -1641,7 -1571,8 +1641,7 @@@ static void rq_offline_rt(struct rq *rq
    * When switching from the rt queue, we bring ourselves to a position
    * that we might want to pull RT tasks from other runqueues.
    */
- -static void switched_from_rt(struct rq *rq, struct task_struct *p,
- -                         int running)
+ +static void switched_from_rt(struct rq *rq, struct task_struct *p)
   {
         /*
          * If there are other RT tasks then we will reschedule
@@@ -1650,7 -1581,7 +1650,7 @@@
          * we may need to handle the pulling of RT tasks
          * now.
          */
- -      if (!rq->rt.rt_nr_running)
+ +      if (p->on_rq && !rq->rt.rt_nr_running)
                 pull_rt_task(rq);
   }
   
@@@ -1669,7 -1600,8 +1669,7 @@@ static inline void init_sched_rt_class(
    * with RT tasks. In this case we try to push them off to
    * other runqueues.
    */
- -static void switched_to_rt(struct rq *rq, struct task_struct *p,
- -                         int running)
+ +static void switched_to_rt(struct rq *rq, struct task_struct *p)
   {
         int check_resched = 1;
   
@@@ -1680,7 -1612,7 +1680,7 @@@
          * If that current running task is also an RT task
          * then see if we can move to another run queue.
          */
- -      if (!running) {
+ +      if (p->on_rq && rq->curr != p) {
   #ifdef CONFIG_SMP
                 if (rq->rt.overloaded && push_rt_task(rq) &&
                     /* Don't resched if we changed runqueues */
@@@ -1696,13 -1628,10 +1696,13 @@@
    * Priority of the task has changed. This may cause
    * us to initiate a push or pull.
    */
- -static void prio_changed_rt(struct rq *rq, struct task_struct *p,
- -                          int oldprio, int running)
+ +static void
+ +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
   {
- -      if (running) {
+ +      if (!p->on_rq)
+ +              return;
+ +
+ +      if (rq->curr == p) {
   #ifdef CONFIG_SMP
                 /*
                  * If our priority decreases while running, we
@@@ -1838,11 -1767,10 +1838,11 @@@ extern void print_rt_rq(struct seq_fil
   
   static void print_rt_stats(struct seq_file *m, int cpu)
   {
+ +      rt_rq_iter_t iter;
         struct rt_rq *rt_rq;
   
         rcu_read_lock();
- -      for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
+ +      for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
                 print_rt_rq(m, cpu, rt_rq);
         rcu_read_unlock();
   }
author	Ingo Molnar <mingo@elte.hu>
	Fri, 3 Jun 2011 08:27:47 +0000 (10:27 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Fri, 3 Jun 2011 08:27:47 +0000 (10:27 +0200)
		1	2
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_rt.c	patch \|	diff1 \|	diff2 \|	blob \| history