Merge branch 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[pandora-kernel.git] / kernel / sched.c
index 8f977d5..2f76e06 100644 (file)
@@ -39,7 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -64,7 +64,6 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
-#include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
  */
 #define RUNTIME_INF    ((u64)~0ULL)
 
-#ifdef CONFIG_SMP
-
-static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-
-/*
- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
- * Since cpu_power is a 'constant', we can use a reciprocal divide.
- */
-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
-{
-       return reciprocal_divide(load, sg->reciprocal_cpu_power);
-}
-
-/*
- * Each time a sched group cpu_power is changed,
- * we must compute its reciprocal value
- */
-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
-{
-       sg->__cpu_power += val;
-       sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
-}
-#endif
-
 static inline int rt_policy(int policy)
 {
        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user)
 
 /*
  * Root task group.
- *     Every UID task group (including init_task_group aka UID-0) will
- *     be a child to this group.
+ *     Every UID task group (including init_task_group aka UID-0) will
+ *     be a child to this group.
  */
 struct task_group root_task_group;
 
@@ -318,12 +293,12 @@ struct task_group root_task_group;
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
 #endif /* CONFIG_RT_GROUP_SCHED */
 #else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 
 #else
 
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-       return 1;
-}
-#endif
-
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -537,14 +505,6 @@ struct root_domain {
 #ifdef CONFIG_SMP
        struct cpupri cpupri;
 #endif
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       /*
-        * Preferred wake up cpu nominated by sched_mc balance that will be
-        * used when most cpus are idle in the system indicating overall very
-        * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
-        */
-       unsigned int sched_mc_preferred_wakeup_cpu;
-#endif
 };
 
 /*
@@ -616,6 +576,7 @@ struct rq {
 
        unsigned char idle_at_tick;
        /* For active balancing */
+       int post_schedule;
        int active_balance;
        int push_cpu;
        /* cpu of this runqueue: */
@@ -626,6 +587,9 @@ struct rq {
 
        struct task_struct *migration_thread;
        struct list_head migration_queue;
+
+       u64 rt_avg;
+       u64 age_stamp;
 #endif
 
        /* calc_load related fields */
@@ -665,9 +629,10 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
+static inline
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
-       rq->curr->sched_class->check_preempt_curr(rq, p, sync);
+       rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq)
 #define this_rq()              (&__get_cpu_var(runqueues))
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
+#define raw_rq()               (&__raw_get_cpu_var(runqueues))
 
 inline void update_rq_clock(struct rq *rq)
 {
@@ -715,15 +681,9 @@ inline void update_rq_clock(struct rq *rq)
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
-int runqueue_is_locked(void)
+int runqueue_is_locked(int cpu)
 {
-       int cpu = get_cpu();
-       struct rq *rq = cpu_rq(cpu);
-       int ret;
-
-       ret = spin_is_locked(&rq->lock);
-       put_cpu();
-       return ret;
+       return spin_is_locked(&cpu_rq(cpu)->lock);
 }
 
 /*
@@ -860,6 +820,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
  */
 unsigned int sysctl_sched_shares_thresh = 4;
 
+/*
+ * period over which we average the RT time consumption, measured
+ * in ms.
+ *
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
@@ -1278,12 +1246,37 @@ void wake_up_idle_cpu(int cpu)
 }
 #endif /* CONFIG_NO_HZ */
 
+static u64 sched_avg_period(void)
+{
+       return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+static void sched_avg_update(struct rq *rq)
+{
+       s64 period = sched_avg_period();
+
+       while ((s64)(rq->clock - rq->age_stamp) > period) {
+               rq->age_stamp += period;
+               rq->rt_avg /= 2;
+       }
+}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+       rq->rt_avg += rt_delta;
+       sched_avg_update(rq);
+}
+
 #else /* !CONFIG_SMP */
 static void resched_task(struct task_struct *p)
 {
        assert_spin_locked(&task_rq(p)->lock);
        set_tsk_need_resched(p);
 }
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+}
 #endif /* CONFIG_SMP */
 
 #if BITS_PER_LONG == 32
@@ -1494,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data)
 #endif
 
 #ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+       return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long total = weighted_cpuload(cpu);
+
+       if (type == 0 || !sched_feat(LB_BIAS))
+               return total;
+
+       return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long total = weighted_cpuload(cpu);
+
+       if (type == 0 || !sched_feat(LB_BIAS))
+               return total;
+
+       return max(rq->cpu_load[type-1], total);
+}
+
+static struct sched_group *group_of(int cpu)
+{
+       struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+       if (!sd)
+               return NULL;
+
+       return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+       struct sched_group *group = group_of(cpu);
+
+       if (!group)
+               return SCHED_LOAD_SCALE;
+
+       return group->cpu_power;
+}
+
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1513,28 +1563,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+struct update_shares_data {
+       unsigned long rq_weight[NR_CPUS];
+};
+
+static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
  * Calculate and set the cpu's group shares.
  */
-static void
-update_group_shares_cpu(struct task_group *tg, int cpu,
-                       unsigned long sd_shares, unsigned long sd_rq_weight)
+static void update_group_shares_cpu(struct task_group *tg, int cpu,
+                                   unsigned long sd_shares,
+                                   unsigned long sd_rq_weight,
+                                   struct update_shares_data *usd)
 {
-       unsigned long shares;
-       unsigned long rq_weight;
+       unsigned long shares, rq_weight;
+       int boost = 0;
 
-       if (!tg->se[cpu])
-               return;
-
-       rq_weight = tg->cfs_rq[cpu]->rq_weight;
+       rq_weight = usd->rq_weight[cpu];
+       if (!rq_weight) {
+               boost = 1;
+               rq_weight = NICE_0_LOAD;
+       }
 
        /*
-        *           \Sum shares * rq_weight
-        * shares =  -----------------------
-        *               \Sum rq_weight
-        *
+        *             \Sum_j shares_j * rq_weight_i
+        * shares_i =  -----------------------------
+        *                  \Sum_j rq_weight_j
         */
        shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1602,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                unsigned long flags;
 
                spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->shares = shares;
-
+               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
@@ -1559,22 +1616,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-       unsigned long weight, rq_weight = 0;
-       unsigned long shares = 0;
+       unsigned long weight, rq_weight = 0, shares = 0;
+       struct update_shares_data *usd;
        struct sched_domain *sd = data;
+       unsigned long flags;
        int i;
 
+       if (!tg->se[0])
+               return 0;
+
+       local_irq_save(flags);
+       usd = &__get_cpu_var(update_shares_data);
+
        for_each_cpu(i, sched_domain_span(sd)) {
+               weight = tg->cfs_rq[i]->load.weight;
+               usd->rq_weight[i] = weight;
+
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
                 * run here it will not get delayed by group starvation.
                 */
-               weight = tg->cfs_rq[i]->load.weight;
                if (!weight)
                        weight = NICE_0_LOAD;
 
-               tg->cfs_rq[i]->rq_weight = weight;
                rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
@@ -1586,7 +1651,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
                shares = tg->shares;
 
        for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight);
+               update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+
+       local_irq_restore(flags);
 
        return 0;
 }
@@ -1616,8 +1683,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_shares(struct sched_domain *sd)
 {
-       u64 now = cpu_clock(raw_smp_processor_id());
-       s64 elapsed = now - sd->last_update;
+       s64 elapsed;
+       u64 now;
+
+       if (root_task_group_empty())
+               return;
+
+       now = cpu_clock(raw_smp_processor_id());
+       elapsed = now - sd->last_update;
 
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                sd->last_update = now;
@@ -1627,6 +1700,9 @@ static void update_shares(struct sched_domain *sd)
 
 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 {
+       if (root_task_group_empty())
+               return;
+
        spin_unlock(&rq->lock);
        update_shares(sd);
        spin_lock(&rq->lock);
@@ -1634,6 +1710,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 static void update_h_load(long cpu)
 {
+       if (root_task_group_empty())
+               return;
+
        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
@@ -1651,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #ifdef CONFIG_PREEMPT
 
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
 /*
  * fair double_lock_balance: Safely acquires both rq->locks in a fair
  * way at the expense of forcing extra atomic operations in all
@@ -1915,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SMP
-
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
-       return cpu_rq(cpu)->load.weight;
-}
-
 /*
  * Is this task likely cache-hot:
  */
@@ -1979,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                if (task_hot(p, old_rq->clock, NULL))
                        schedstat_inc(p, se.nr_forced2_migrations);
 #endif
-               perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
                                     1, 1, NULL, 0);
        }
        p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2195,186 +2269,6 @@ void kick_process(struct task_struct *p)
        preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
-
-/*
- * Return a low guess at the load of a migration-source cpu weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
-
-       if (type == 0 || !sched_feat(LB_BIAS))
-               return total;
-
-       return min(rq->cpu_load[type-1], total);
-}
-
-/*
- * Return a high guess at the load of a migration-target cpu weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
-
-       if (type == 0 || !sched_feat(LB_BIAS))
-               return total;
-
-       return max(rq->cpu_load[type-1], total);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- */
-static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
-{
-       struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
-       unsigned long min_load = ULONG_MAX, this_load = 0;
-       int load_idx = sd->forkexec_idx;
-       int imbalance = 100 + (sd->imbalance_pct-100)/2;
-
-       do {
-               unsigned long load, avg_load;
-               int local_group;
-               int i;
-
-               /* Skip over this group if it has no CPUs allowed */
-               if (!cpumask_intersects(sched_group_cpus(group),
-                                       &p->cpus_allowed))
-                       continue;
-
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
-
-               /* Tally up the load of all CPUs in the group */
-               avg_load = 0;
-
-               for_each_cpu(i, sched_group_cpus(group)) {
-                       /* Bias balancing toward cpus of our domain */
-                       if (local_group)
-                               load = source_load(i, load_idx);
-                       else
-                               load = target_load(i, load_idx);
-
-                       avg_load += load;
-               }
-
-               /* Adjust by relative CPU power of the group */
-               avg_load = sg_div_cpu_power(group,
-                               avg_load * SCHED_LOAD_SCALE);
-
-               if (local_group) {
-                       this_load = avg_load;
-                       this = group;
-               } else if (avg_load < min_load) {
-                       min_load = avg_load;
-                       idlest = group;
-               }
-       } while (group = group->next, group != sd->groups);
-
-       if (!idlest || 100*this_load < imbalance*min_load)
-               return NULL;
-       return idlest;
-}
-
-/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
- */
-static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
-{
-       unsigned long load, min_load = ULONG_MAX;
-       int idlest = -1;
-       int i;
-
-       /* Traverse only the allowed CPUs */
-       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
-               load = weighted_cpuload(i);
-
-               if (load < min_load || (load == min_load && i == this_cpu)) {
-                       min_load = load;
-                       idlest = i;
-               }
-       }
-
-       return idlest;
-}
-
-/*
- * sched_balance_self: balance the current task (running on cpu) in domains
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
- * SD_BALANCE_EXEC.
- *
- * Balance, ie. select the least loaded group.
- *
- * Returns the target CPU number, or the same CPU if no balancing is needed.
- *
- * preempt must be disabled.
- */
-static int sched_balance_self(int cpu, int flag)
-{
-       struct task_struct *t = current;
-       struct sched_domain *tmp, *sd = NULL;
-
-       for_each_domain(cpu, tmp) {
-               /*
-                * If power savings logic is enabled for a domain, stop there.
-                */
-               if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-                       break;
-               if (tmp->flags & flag)
-                       sd = tmp;
-       }
-
-       if (sd)
-               update_shares(sd);
-
-       while (sd) {
-               struct sched_group *group;
-               int new_cpu, weight;
-
-               if (!(sd->flags & flag)) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               group = find_idlest_group(sd, t, cpu);
-               if (!group) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               new_cpu = find_idlest_cpu(group, t, cpu);
-               if (new_cpu == -1 || new_cpu == cpu) {
-                       /* Now try balancing at a lower domain level of cpu */
-                       sd = sd->child;
-                       continue;
-               }
-
-               /* Now try balancing at a lower domain level of new_cpu */
-               cpu = new_cpu;
-               weight = cpumask_weight(sched_domain_span(sd));
-               sd = NULL;
-               for_each_domain(cpu, tmp) {
-                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
-                               break;
-                       if (tmp->flags & flag)
-                               sd = tmp;
-               }
-               /* while loop will break here if sd == NULL */
-       }
-
-       return cpu;
-}
-
 #endif /* CONFIG_SMP */
 
 /**
@@ -2412,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p,
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+static int try_to_wake_up(struct task_struct *p, unsigned int state,
+                         int wake_flags)
 {
        int cpu, orig_cpu, this_cpu, success = 0;
        unsigned long flags;
-       long old_state;
        struct rq *rq;
 
        if (!sched_feat(SYNC_WAKEUPS))
-               sync = 0;
-
-#ifdef CONFIG_SMP
-       if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
-               struct sched_domain *sd;
-
-               this_cpu = raw_smp_processor_id();
-               cpu = task_cpu(p);
+               wake_flags &= ~WF_SYNC;
 
-               for_each_domain(this_cpu, sd) {
-                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               update_shares(sd);
-                               break;
-                       }
-               }
-       }
-#endif
+       this_cpu = get_cpu();
 
        smp_wmb();
        rq = task_rq_lock(p, &flags);
        update_rq_clock(rq);
-       old_state = p->state;
-       if (!(old_state & state))
+       if (!(p->state & state))
                goto out;
 
        if (p->se.on_rq)
@@ -2450,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
        cpu = task_cpu(p);
        orig_cpu = cpu;
-       this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
        if (unlikely(task_running(rq, p)))
                goto out_activate;
 
-       cpu = p->sched_class->select_task_rq(p, sync);
-       if (cpu != orig_cpu) {
+       /*
+        * In order to handle concurrent wakeups and release the rq->lock
+        * we put the task in TASK_WAKING state.
+        *
+        * First fix up the nr_uninterruptible count:
+        */
+       if (task_contributes_to_load(p))
+               rq->nr_uninterruptible--;
+       p->state = TASK_WAKING;
+       task_rq_unlock(rq, &flags);
+
+       cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       if (cpu != orig_cpu)
                set_task_cpu(p, cpu);
-               task_rq_unlock(rq, &flags);
-               /* might preempt at this point */
-               rq = task_rq_lock(p, &flags);
-               old_state = p->state;
-               if (!(old_state & state))
-                       goto out;
-               if (p->se.on_rq)
-                       goto out_running;
 
-               this_cpu = smp_processor_id();
-               cpu = task_cpu(p);
-       }
+       rq = task_rq_lock(p, &flags);
+       WARN_ON(p->state != TASK_WAKING);
+       cpu = task_cpu(p);
 
 #ifdef CONFIG_SCHEDSTATS
        schedstat_inc(rq, ttwu_count);
@@ -2490,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 out_activate:
 #endif /* CONFIG_SMP */
        schedstat_inc(p, se.nr_wakeups);
-       if (sync)
+       if (wake_flags & WF_SYNC)
                schedstat_inc(p, se.nr_wakeups_sync);
        if (orig_cpu != cpu)
                schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2519,7 +2400,7 @@ out_activate:
 
 out_running:
        trace_sched_wakeup(rq, p, success);
-       check_preempt_curr(rq, p, sync);
+       check_preempt_curr(rq, p, wake_flags);
 
        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2528,6 +2409,7 @@ out_running:
 #endif
 out:
        task_rq_unlock(rq, &flags);
+       put_cpu();
 
        return success;
 }
@@ -2570,6 +2452,7 @@ static void __sched_fork(struct task_struct *p)
        p->se.avg_overlap               = 0;
        p->se.start_runtime             = 0;
        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
+       p->se.avg_running               = 0;
 
 #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                        = 0;
@@ -2631,18 +2514,41 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
        __sched_fork(p);
 
-#ifdef CONFIG_SMP
-       cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
-#endif
-       set_task_cpu(p, cpu);
-
        /*
-        * Make sure we do not leak PI boosting priority to the child:
+        * Make sure we do not leak PI boosting priority to the child.
         */
        p->prio = current->normal_prio;
+
+       /*
+        * Revert to default priority/policy on fork if requested.
+        */
+       if (unlikely(p->sched_reset_on_fork)) {
+               if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+                       p->policy = SCHED_NORMAL;
+
+               if (p->normal_prio < DEFAULT_PRIO)
+                       p->prio = DEFAULT_PRIO;
+
+               if (PRIO_TO_NICE(p->static_prio) < 0) {
+                       p->static_prio = NICE_TO_PRIO(0);
+                       set_load_weight(p);
+               }
+
+               /*
+                * We don't need the reset flag anymore after the fork. It has
+                * fulfilled its duty:
+                */
+               p->sched_reset_on_fork = 0;
+       }
+
        if (!rt_prio(p->prio))
                p->sched_class = &fair_sched_class;
 
+#ifdef CONFIG_SMP
+       cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+#endif
+       set_task_cpu(p, cpu);
+
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        if (likely(sched_info_on()))
                memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2688,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                inc_nr_running(rq);
        }
        trace_sched_wakeup_new(rq, p, 1);
-       check_preempt_curr(rq, p, 0);
+       check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
                p->sched_class->task_wake_up(rq, p);
@@ -2796,12 +2702,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
-#ifdef CONFIG_SMP
-       int post_schedule = 0;
-
-       if (current->sched_class->needs_post_schedule)
-               post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
 
        rq->prev_mm = NULL;
 
@@ -2818,12 +2718,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
-       perf_counter_task_sched_in(current, cpu_of(rq));
+       perf_event_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
-       if (post_schedule)
-               current->sched_class->post_schedule(rq);
-#endif
 
        fire_sched_in_preempt_notifiers(current);
        if (mm)
@@ -2838,6 +2734,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        }
 }
 
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+       if (prev->sched_class->pre_schedule)
+               prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+       if (rq->post_schedule) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&rq->lock, flags);
+               if (rq->curr->sched_class->post_schedule)
+                       rq->curr->sched_class->post_schedule(rq);
+               spin_unlock_irqrestore(&rq->lock, flags);
+
+               rq->post_schedule = 0;
+       }
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -2848,6 +2780,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
        struct rq *rq = this_rq();
 
        finish_task_switch(rq, prev);
+
+       /*
+        * FIXME: do we need to worry about rq being invalidated by the
+        * task_switch?
+        */
+       post_schedule(rq);
+
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
        /* In this case, finish_task_switch does not reenable preemption */
        preempt_enable();
@@ -2965,6 +2904,19 @@ unsigned long nr_iowait(void)
        return sum;
 }
 
+unsigned long nr_iowait_cpu(void)
+{
+       struct rq *this = this_rq();
+       return atomic_read(&this->nr_iowait);
+}
+
+unsigned long this_cpu_load(void)
+{
+       struct rq *this = this_rq();
+       return this->cpu_load[0];
+}
+
+
 /* Variables and functions for calc_load */
 static atomic_long_t calc_load_tasks;
 static unsigned long calc_load_update;
@@ -3164,7 +3116,7 @@ out:
 void sched_exec(void)
 {
        int new_cpu, this_cpu = get_cpu();
-       new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
+       new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
        put_cpu();
        if (new_cpu != this_cpu)
                sched_migrate_task(current, new_cpu);
@@ -3379,9 +3331,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 {
        const struct sched_class *class;
 
-       for (class = sched_class_highest; class; class = class->next)
+       for_each_class(class) {
                if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
                        return 1;
+       }
 
        return 0;
 }
@@ -3544,7 +3497,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
         * capacity but still has some space to pick up some load
         * from other group and save more power
         */
-       if (sgs->sum_nr_running > sgs->group_capacity - 1)
+       if (sgs->sum_nr_running + 1 > sgs->group_capacity)
                return;
 
        if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3583,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
        *imbalance = sds->min_load_per_task;
        sds->busiest = sds->group_min;
 
-       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-                       group_first_cpu(sds->group_leader);
-       }
-
        return 1;
 
 }
@@ -3612,6 +3560,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
 
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+       return SCHED_LOAD_SCALE;
+}
+
+unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+       return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long smt_gain = sd->smt_gain;
+
+       smt_gain /= weight;
+
+       return smt_gain;
+}
+
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+       return default_scale_smt_power(sd, cpu);
+}
+
+unsigned long scale_rt_power(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       u64 total, available;
+
+       sched_avg_update(rq);
+
+       total = sched_avg_period() + (rq->clock - rq->age_stamp);
+       available = total - rq->rt_avg;
+
+       if (unlikely((s64)total < SCHED_LOAD_SCALE))
+               total = SCHED_LOAD_SCALE;
+
+       total >>= SCHED_LOAD_SHIFT;
+
+       return div_u64(available, total);
+}
+
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long power = SCHED_LOAD_SCALE;
+       struct sched_group *sdg = sd->groups;
+
+       if (sched_feat(ARCH_POWER))
+               power *= arch_scale_freq_power(sd, cpu);
+       else
+               power *= default_scale_freq_power(sd, cpu);
+
+       power >>= SCHED_LOAD_SHIFT;
+
+       if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+               if (sched_feat(ARCH_POWER))
+                       power *= arch_scale_smt_power(sd, cpu);
+               else
+                       power *= default_scale_smt_power(sd, cpu);
+
+               power >>= SCHED_LOAD_SHIFT;
+       }
+
+       power *= scale_rt_power(cpu);
+       power >>= SCHED_LOAD_SHIFT;
+
+       if (!power)
+               power = 1;
+
+       sdg->cpu_power = power;
+}
+
+static void update_group_power(struct sched_domain *sd, int cpu)
+{
+       struct sched_domain *child = sd->child;
+       struct sched_group *group, *sdg = sd->groups;
+       unsigned long power;
+
+       if (!child) {
+               update_cpu_power(sd, cpu);
+               return;
+       }
+
+       power = 0;
+
+       group = child->groups;
+       do {
+               power += group->cpu_power;
+               group = group->next;
+       } while (group != child->groups);
+
+       sdg->cpu_power = power;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @group: sched_group whose statistics are to be updated.
@@ -3624,7 +3668,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
  * @balance: Should we balance.
  * @sgs: variable to hold the statistics for this group.
  */
-static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+static inline void update_sg_lb_stats(struct sched_domain *sd,
+                       struct sched_group *group, int this_cpu,
                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
                        int local_group, const struct cpumask *cpus,
                        int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3680,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
        unsigned long sum_avg_load_per_task;
        unsigned long avg_load_per_task;
 
-       if (local_group)
+       if (local_group) {
                balance_cpu = group_first_cpu(group);
+               if (balance_cpu == this_cpu)
+                       update_group_power(sd, this_cpu);
+       }
 
        /* Tally up the load of all CPUs in the group */
        sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3733,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
        }
 
        /* Adjust by relative CPU power of the group */
-       sgs->avg_load = sg_div_cpu_power(group,
-                       sgs->group_load * SCHED_LOAD_SCALE);
+       sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
 
        /*
@@ -3698,14 +3745,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
         *      normalized nr_running number somewhere that negates
         *      the hierarchy?
         */
-       avg_load_per_task = sg_div_cpu_power(group,
-                       sum_avg_load_per_task * SCHED_LOAD_SCALE);
+       avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
+               group->cpu_power;
 
        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                sgs->group_imb = 1;
 
-       sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
-
+       sgs->group_capacity =
+               DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 }
 
 /**
@@ -3723,9 +3770,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                        const struct cpumask *cpus, int *balance,
                        struct sd_lb_stats *sds)
 {
+       struct sched_domain *child = sd->child;
        struct sched_group *group = sd->groups;
        struct sg_lb_stats sgs;
-       int load_idx;
+       int load_idx, prefer_sibling = 0;
+
+       if (child && child->flags & SD_PREFER_SIBLING)
+               prefer_sibling = 1;
 
        init_sd_power_savings_stats(sd, sds, idle);
        load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3787,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_cpus(group));
                memset(&sgs, 0, sizeof(sgs));
-               update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+               update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
                                local_group, cpus, balance, &sgs);
 
                if (local_group && balance && !(*balance))
                        return;
 
                sds->total_load += sgs.group_load;
-               sds->total_pwr += group->__cpu_power;
+               sds->total_pwr += group->cpu_power;
+
+               /*
+                * In case the child domain prefers tasks go to siblings
+                * first, lower the group capacity to one so that we'll try
+                * and move all the excess tasks away.
+                */
+               if (prefer_sibling)
+                       sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
                if (local_group) {
                        sds->this_load = sgs.avg_load;
@@ -3763,7 +3822,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
-
 }
 
 /**
@@ -3801,28 +3859,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
         * moving them.
         */
 
-       pwr_now += sds->busiest->__cpu_power *
+       pwr_now += sds->busiest->cpu_power *
                        min(sds->busiest_load_per_task, sds->max_load);
-       pwr_now += sds->this->__cpu_power *
+       pwr_now += sds->this->cpu_power *
                        min(sds->this_load_per_task, sds->this_load);
        pwr_now /= SCHED_LOAD_SCALE;
 
        /* Amount of load we'd subtract */
-       tmp = sg_div_cpu_power(sds->busiest,
-                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+               sds->busiest->cpu_power;
        if (sds->max_load > tmp)
-               pwr_move += sds->busiest->__cpu_power *
+               pwr_move += sds->busiest->cpu_power *
                        min(sds->busiest_load_per_task, sds->max_load - tmp);
 
        /* Amount of load we'd add */
-       if (sds->max_load * sds->busiest->__cpu_power <
+       if (sds->max_load * sds->busiest->cpu_power <
                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
-               tmp = sg_div_cpu_power(sds->this,
-                       sds->max_load * sds->busiest->__cpu_power);
+               tmp = (sds->max_load * sds->busiest->cpu_power) /
+                       sds->this->cpu_power;
        else
-               tmp = sg_div_cpu_power(sds->this,
-                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
-       pwr_move += sds->this->__cpu_power *
+               tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+                       sds->this->cpu_power;
+       pwr_move += sds->this->cpu_power *
                        min(sds->this_load_per_task, sds->this_load + tmp);
        pwr_move /= SCHED_LOAD_SCALE;
 
@@ -3857,8 +3915,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                        sds->max_load - sds->busiest_load_per_task);
 
        /* How much load to actually move to equalise the imbalance */
-       *imbalance = min(max_pull * sds->busiest->__cpu_power,
-               (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
+       *imbalance = min(max_pull * sds->busiest->cpu_power,
+               (sds->avg_load - sds->this_load) * sds->this->cpu_power)
                        / SCHED_LOAD_SCALE;
 
        /*
@@ -3988,15 +4046,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
        int i;
 
        for_each_cpu(i, sched_group_cpus(group)) {
+               unsigned long power = power_of(i);
+               unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
                unsigned long wl;
 
                if (!cpumask_test_cpu(i, cpus))
                        continue;
 
                rq = cpu_rq(i);
-               wl = weighted_cpuload(i);
+               wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
+               wl /= power;
 
-               if (rq->nr_running == 1 && wl > imbalance)
+               if (capacity && rq->nr_running == 1 && wl > imbalance)
                        continue;
 
                if (wl > max_load) {
@@ -5144,7 +5205,7 @@ void scheduler_tick(void)
        curr->sched_class->task_tick(rq, curr, 0);
        spin_unlock(&rq->lock);
 
-       perf_counter_task_tick(curr, cpu);
+       perf_event_task_tick(curr, cpu);
 
 #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
@@ -5256,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
+static void put_prev_task(struct rq *rq, struct task_struct *p)
 {
-       if (prev->state == TASK_RUNNING) {
-               u64 runtime = prev->se.sum_exec_runtime;
+       u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
 
-               runtime -= prev->se.prev_sum_exec_runtime;
-               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+       update_avg(&p->se.avg_running, runtime);
 
+       if (p->state == TASK_RUNNING) {
                /*
                 * In order to avoid avg_overlap growing stale when we are
                 * indeed overlapping and hence not getting put to sleep, grow
@@ -5273,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
                 * correlates to the amount of cache footprint a task can
                 * build up.
                 */
-               update_avg(&prev->se.avg_overlap, runtime);
+               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+               update_avg(&p->se.avg_overlap, runtime);
+       } else {
+               update_avg(&p->se.avg_running, 0);
        }
-       prev->sched_class->put_prev_task(rq, prev);
+       p->sched_class->put_prev_task(rq, p);
 }
 
 /*
@@ -5324,7 +5387,7 @@ need_resched:
        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
-       rcu_qsctr_inc(cpu);
+       rcu_sched_qs(cpu);
        prev = rq->curr;
        switch_count = &prev->nivcsw;
 
@@ -5348,10 +5411,7 @@ need_resched_nonpreemptible:
                switch_count = &prev->nvcsw;
        }
 
-#ifdef CONFIG_SMP
-       if (prev->sched_class->pre_schedule)
-               prev->sched_class->pre_schedule(rq, prev);
-#endif
+       pre_schedule(rq, prev);
 
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
@@ -5361,7 +5421,7 @@ need_resched_nonpreemptible:
 
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
-               perf_counter_task_sched_out(prev, next, cpu);
+               perf_event_task_sched_out(prev, next, cpu);
 
                rq->nr_switches++;
                rq->curr = next;
@@ -5377,6 +5437,8 @@ need_resched_nonpreemptible:
        } else
                spin_unlock_irq(&rq->lock);
 
+       post_schedule(rq);
+
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
 
@@ -5508,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
 
 #endif /* CONFIG_PREEMPT */
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
                          void *key)
 {
-       return try_to_wake_up(curr->private, mode, sync);
+       return try_to_wake_up(curr->private, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
 
@@ -5525,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function);
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, int sync, void *key)
+                       int nr_exclusive, int wake_flags, void *key)
 {
        wait_queue_t *curr, *next;
 
        list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
                unsigned flags = curr->flags;
 
-               if (curr->func(curr, mode, sync, key) &&
+               if (curr->func(curr, mode, wake_flags, key) &&
                                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
                        break;
        }
@@ -5593,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
 {
        unsigned long flags;
-       int sync = 1;
+       int wake_flags = WF_SYNC;
 
        if (unlikely(!q))
                return;
 
        if (unlikely(!nr_exclusive))
-               sync = 0;
+               wake_flags = 0;
 
        spin_lock_irqsave(&q->lock, flags);
-       __wake_up_common(q, mode, nr_exclusive, sync, key);
+       __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
        spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6122,17 +6184,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
        unsigned long flags;
        const struct sched_class *prev_class = p->sched_class;
        struct rq *rq;
+       int reset_on_fork;
 
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
 recheck:
        /* double check policy once rq lock held */
-       if (policy < 0)
+       if (policy < 0) {
+               reset_on_fork = p->sched_reset_on_fork;
                policy = oldpolicy = p->policy;
-       else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                       policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                       policy != SCHED_IDLE)
-               return -EINVAL;
+       } else {
+               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+               policy &= ~SCHED_RESET_ON_FORK;
+
+               if (policy != SCHED_FIFO && policy != SCHED_RR &&
+                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+                               policy != SCHED_IDLE)
+                       return -EINVAL;
+       }
+
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6176,6 +6246,10 @@ recheck:
                /* can't change other user's priorities */
                if (!check_same_owner(p))
                        return -EPERM;
+
+               /* Normal users shall not reset the sched_reset_on_fork flag */
+               if (p->sched_reset_on_fork && !reset_on_fork)
+                       return -EPERM;
        }
 
        if (user) {
@@ -6219,6 +6293,8 @@ recheck:
        if (running)
                p->sched_class->put_prev_task(rq, p);
 
+       p->sched_reset_on_fork = reset_on_fork;
+
        oldprio = p->prio;
        __setscheduler(rq, p, policy, param->sched_priority);
 
@@ -6335,14 +6411,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
        if (p) {
                retval = security_task_getscheduler(p);
                if (!retval)
-                       retval = p->policy;
+                       retval = p->policy
+                               | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
        }
        read_unlock(&tasklist_lock);
        return retval;
 }
 
 /**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
@@ -6567,22 +6644,12 @@ static inline int should_resched(void)
 {
        return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
 }
-
-static void __cond_resched(void)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-       __might_sleep(__FILE__, __LINE__);
-#endif
-       /*
-        * The BKS might be reacquired before we have dropped
-        * PREEMPT_ACTIVE, which could trigger a second
-        * cond_resched() call.
-        */
-       do {
-               add_preempt_count(PREEMPT_ACTIVE);
-               schedule();
-               sub_preempt_count(PREEMPT_ACTIVE);
-       } while (need_resched());
+
+static void __cond_resched(void)
+{
+       add_preempt_count(PREEMPT_ACTIVE);
+       schedule();
+       sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -6596,18 +6663,20 @@ int __sched _cond_resched(void)
 EXPORT_SYMBOL(_cond_resched);
 
 /*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
 {
        int resched = should_resched();
        int ret = 0;
 
+       lockdep_assert_held(lock);
+
        if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
                if (resched)
@@ -6619,9 +6688,9 @@ int cond_resched_lock(spinlock_t *lock)
        }
        return ret;
 }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
 
@@ -6633,7 +6702,7 @@ int __sched cond_resched_softirq(void)
        }
        return 0;
 }
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
 
 /**
  * yield - yield the current processor to other threads.
@@ -6657,11 +6726,13 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       current->in_iowait = 1;
        schedule();
+       current->in_iowait = 0;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
 }
@@ -6669,12 +6740,14 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
        long ret;
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       current->in_iowait = 1;
        ret = schedule_timeout(timeout);
+       current->in_iowait = 0;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
        return ret;
@@ -6758,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
        if (retval)
                goto out_unlock;
 
-       /*
-        * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
-        * tasks that are on an otherwise idle runqueue:
-        */
-       time_slice = 0;
-       if (p->policy == SCHED_RR) {
-               time_slice = DEF_TIMESLICE;
-       } else if (p->policy != SCHED_FIFO) {
-               struct sched_entity *se = &p->se;
-               unsigned long flags;
-               struct rq *rq;
+       time_slice = p->sched_class->get_rr_interval(p);
 
-               rq = task_rq_lock(p, &flags);
-               if (rq->cfs.load.weight)
-                       time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-               task_rq_unlock(rq, &flags);
-       }
        read_unlock(&tasklist_lock);
        jiffies_to_timespec(time_slice, &t);
        retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -6991,8 +7049,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
        if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
+               struct task_struct *mt = rq->migration_thread;
+
+               get_task_struct(mt);
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
+               put_task_struct(mt);
                wait_for_completion(&req.done);
                tlb_migrate_finish(p->mm);
                return 0;
@@ -7050,6 +7112,11 @@ fail:
        return ret;
 }
 
+#define RCU_MIGRATION_IDLE     0
+#define RCU_MIGRATION_NEED_QS  1
+#define RCU_MIGRATION_GOT_QS   2
+#define RCU_MIGRATION_MUST_SYNC        3
+
 /*
  * migration_thread - this is a highprio system thread that performs
  * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7057,6 +7124,7 @@ fail:
  */
 static int migration_thread(void *data)
 {
+       int badcpu;
        int cpu = (long)data;
        struct rq *rq;
 
@@ -7091,8 +7159,17 @@ static int migration_thread(void *data)
                req = list_entry(head->next, struct migration_req, list);
                list_del_init(head->next);
 
-               spin_unlock(&rq->lock);
-               __migrate_task(req->task, cpu, req->dest_cpu);
+               if (req->task != NULL) {
+                       spin_unlock(&rq->lock);
+                       __migrate_task(req->task, cpu, req->dest_cpu);
+               } else if (likely(cpu == (badcpu = smp_processor_id()))) {
+                       req->dest_cpu = RCU_MIGRATION_GOT_QS;
+                       spin_unlock(&rq->lock);
+               } else {
+                       req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
+                       spin_unlock(&rq->lock);
+                       WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
+               }
                local_irq_enable();
 
                complete(&req->done);
@@ -7606,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 /*
  * Register at high priority so that task migration (migrate_all_tasks)
  * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_counter subsystem, though.
+ * the notifier in the perf_event subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
        .notifier_call = migration_call,
@@ -7624,7 +7701,7 @@ static int __init migration_init(void)
        migration_call(&migration_notifier, CPU_ONLINE, cpu);
        register_cpu_notifier(&migration_notifier);
 
-       return err;
+       return 0;
 }
 early_initcall(migration_init);
 #endif
@@ -7671,7 +7748,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }
 
-               if (!group->__cpu_power) {
+               if (!group->cpu_power) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: domain->cpu_power not "
                                        "set\n");
@@ -7695,9 +7772,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
                printk(KERN_CONT " %s", str);
-               if (group->__cpu_power != SCHED_LOAD_SCALE) {
-                       printk(KERN_CONT " (__cpu_power = %d)",
-                               group->__cpu_power);
+               if (group->cpu_power != SCHED_LOAD_SCALE) {
+                       printk(KERN_CONT " (cpu_power = %d)",
+                               group->cpu_power);
                }
 
                group = group->next;
@@ -7762,9 +7839,7 @@ static int sd_degenerate(struct sched_domain *sd)
        }
 
        /* Following flags don't use groups */
-       if (sd->flags & (SD_WAKE_IDLE |
-                        SD_WAKE_AFFINE |
-                        SD_WAKE_BALANCE))
+       if (sd->flags & (SD_WAKE_AFFINE))
                return 0;
 
        return 1;
@@ -7781,10 +7856,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
        if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;
 
-       /* Does parent contain flags not in child? */
-       /* WAKE_BALANCE is a subset of WAKE_AFFINE */
-       if (cflags & SD_WAKE_AFFINE)
-               pflags &= ~SD_WAKE_BALANCE;
        /* Flags needing groups don't count if only 1 group in parent */
        if (parent->groups == parent->groups->next) {
                pflags &= ~(SD_LOAD_BALANCE |
@@ -7840,7 +7911,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
        rq->rd = rd;
 
        cpumask_set_cpu(rq->cpu, rd->span);
-       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+       if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
                set_rq_online(rq);
 
        spin_unlock_irqrestore(&rq->lock, flags);
@@ -7982,7 +8053,7 @@ init_sched_build_groups(const struct cpumask *span,
                        continue;
 
                cpumask_clear(sched_group_cpus(sg));
-               sg->__cpu_power = 0;
+               sg->cpu_power = 0;
 
                for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8090,6 +8161,39 @@ struct static_sched_domain {
        DECLARE_BITMAP(span, CONFIG_NR_CPUS);
 };
 
+struct s_data {
+#ifdef CONFIG_NUMA
+       int                     sd_allnodes;
+       cpumask_var_t           domainspan;
+       cpumask_var_t           covered;
+       cpumask_var_t           notcovered;
+#endif
+       cpumask_var_t           nodemask;
+       cpumask_var_t           this_sibling_map;
+       cpumask_var_t           this_core_map;
+       cpumask_var_t           send_covered;
+       cpumask_var_t           tmpmask;
+       struct sched_group      **sched_group_nodes;
+       struct root_domain      *rd;
+};
+
+enum s_alloc {
+       sa_sched_groups = 0,
+       sa_rootdomain,
+       sa_tmpmask,
+       sa_send_covered,
+       sa_this_core_map,
+       sa_this_sibling_map,
+       sa_nodemask,
+       sa_sched_group_nodes,
+#ifdef CONFIG_NUMA
+       sa_notcovered,
+       sa_covered,
+       sa_domainspan,
+#endif
+       sa_none,
+};
+
 /*
  * SMT sched-domains:
  */
@@ -8207,11 +8311,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                                continue;
                        }
 
-                       sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+                       sg->cpu_power += sd->groups->cpu_power;
                }
                sg = sg->next;
        } while (sg != group_head);
 }
+
+static int build_numa_sched_groups(struct s_data *d,
+                                  const struct cpumask *cpu_map, int num)
+{
+       struct sched_domain *sd;
+       struct sched_group *sg, *prev;
+       int n, j;
+
+       cpumask_clear(d->covered);
+       cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
+       if (cpumask_empty(d->nodemask)) {
+               d->sched_group_nodes[num] = NULL;
+               goto out;
+       }
+
+       sched_domain_node_span(num, d->domainspan);
+       cpumask_and(d->domainspan, d->domainspan, cpu_map);
+
+       sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                         GFP_KERNEL, num);
+       if (!sg) {
+               printk(KERN_WARNING "Can not alloc domain group for node %d\n",
+                      num);
+               return -ENOMEM;
+       }
+       d->sched_group_nodes[num] = sg;
+
+       for_each_cpu(j, d->nodemask) {
+               sd = &per_cpu(node_domains, j).sd;
+               sd->groups = sg;
+       }
+
+       sg->cpu_power = 0;
+       cpumask_copy(sched_group_cpus(sg), d->nodemask);
+       sg->next = sg;
+       cpumask_or(d->covered, d->covered, d->nodemask);
+
+       prev = sg;
+       for (j = 0; j < nr_node_ids; j++) {
+               n = (num + j) % nr_node_ids;
+               cpumask_complement(d->notcovered, d->covered);
+               cpumask_and(d->tmpmask, d->notcovered, cpu_map);
+               cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
+               if (cpumask_empty(d->tmpmask))
+                       break;
+               cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
+               if (cpumask_empty(d->tmpmask))
+                       continue;
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, num);
+               if (!sg) {
+                       printk(KERN_WARNING
+                              "Can not alloc domain group for node %d\n", j);
+                       return -ENOMEM;
+               }
+               sg->cpu_power = 0;
+               cpumask_copy(sched_group_cpus(sg), d->tmpmask);
+               sg->next = prev->next;
+               cpumask_or(d->covered, d->covered, d->tmpmask);
+               prev->next = sg;
+               prev = sg;
+       }
+out:
+       return 0;
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_NUMA
@@ -8265,15 +8434,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
  * there are asymmetries in the topology. If there are asymmetries, group
  * having more cpu_power will pickup more load compared to the group having
  * less cpu_power.
- *
- * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
- * the maximum number of tasks a group can handle in the presence of other idle
- * or lightly loaded groups in the same sched domain.
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
        struct sched_domain *child;
        struct sched_group *group;
+       long power;
+       int weight;
 
        WARN_ON(!sd || !sd->groups);
 
@@ -8282,28 +8449,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
        child = sd->child;
 
-       sd->groups->__cpu_power = 0;
+       sd->groups->cpu_power = 0;
 
-       /*
-        * For perf policy, if the groups in child domain share resources
-        * (for example cores sharing some portions of the cache hierarchy
-        * or SMT), then set this domain groups cpu_power such that each group
-        * can handle only one task, when there are other idle groups in the
-        * same sched domain.
-        */
-       if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
-                      (child->flags &
-                       (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
-               sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+       if (!child) {
+               power = SCHED_LOAD_SCALE;
+               weight = cpumask_weight(sched_domain_span(sd));
+               /*
+                * SMT siblings share the power of a single core.
+                * Usually multiple threads get a better yield out of
+                * that one core than a single thread would have,
+                * reflect that in sd->smt_gain.
+                */
+               if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+                       power *= sd->smt_gain;
+                       power /= weight;
+                       power >>= SCHED_LOAD_SHIFT;
+               }
+               sd->groups->cpu_power += power;
                return;
        }
 
        /*
-        * add cpu_power of each child group to this groups cpu_power
+        * Add cpu_power of each child group to this groups cpu_power.
         */
        group = child->groups;
        do {
-               sg_inc_cpu_power(sd->groups, group->__cpu_power);
+               sd->groups->cpu_power += group->cpu_power;
                group = group->next;
        } while (group != child->groups);
 }
@@ -8370,287 +8541,292 @@ static void set_domain_attribute(struct sched_domain *sd,
                request = attr->relax_domain_level;
        if (request < sd->level) {
                /* turn off idle balance on this domain */
-               sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+               sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
        } else {
                /* turn on idle balance on this domain */
-               sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+               sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+       }
+}
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+                                const struct cpumask *cpu_map)
+{
+       switch (what) {
+       case sa_sched_groups:
+               free_sched_groups(cpu_map, d->tmpmask); /* fall through */
+               d->sched_group_nodes = NULL;
+       case sa_rootdomain:
+               free_rootdomain(d->rd); /* fall through */
+       case sa_tmpmask:
+               free_cpumask_var(d->tmpmask); /* fall through */
+       case sa_send_covered:
+               free_cpumask_var(d->send_covered); /* fall through */
+       case sa_this_core_map:
+               free_cpumask_var(d->this_core_map); /* fall through */
+       case sa_this_sibling_map:
+               free_cpumask_var(d->this_sibling_map); /* fall through */
+       case sa_nodemask:
+               free_cpumask_var(d->nodemask); /* fall through */
+       case sa_sched_group_nodes:
+#ifdef CONFIG_NUMA
+               kfree(d->sched_group_nodes); /* fall through */
+       case sa_notcovered:
+               free_cpumask_var(d->notcovered); /* fall through */
+       case sa_covered:
+               free_cpumask_var(d->covered); /* fall through */
+       case sa_domainspan:
+               free_cpumask_var(d->domainspan); /* fall through */
+#endif
+       case sa_none:
+               break;
        }
 }
 
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int __build_sched_domains(const struct cpumask *cpu_map,
-                                struct sched_domain_attr *attr)
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+                                                  const struct cpumask *cpu_map)
 {
-       int i, err = -ENOMEM;
-       struct root_domain *rd;
-       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
-               tmpmask;
-#ifdef CONFIG_NUMA
-       cpumask_var_t domainspan, covered, notcovered;
-       struct sched_group **sched_group_nodes = NULL;
-       int sd_allnodes = 0;
-
-       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
-               goto out;
-       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
-               goto free_domainspan;
-       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
-               goto free_covered;
-#endif
-
-       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
-               goto free_notcovered;
-       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
-               goto free_nodemask;
-       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
-               goto free_this_sibling_map;
-       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
-               goto free_this_core_map;
-       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
-               goto free_send_covered;
-
 #ifdef CONFIG_NUMA
-       /*
-        * Allocate the per-node list of sched groups
-        */
-       sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
-                                   GFP_KERNEL);
-       if (!sched_group_nodes) {
+       if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
+               return sa_none;
+       if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
+               return sa_domainspan;
+       if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
+               return sa_covered;
+       /* Allocate the per-node list of sched groups */
+       d->sched_group_nodes = kcalloc(nr_node_ids,
+                                     sizeof(struct sched_group *), GFP_KERNEL);
+       if (!d->sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               goto free_tmpmask;
-       }
-#endif
-
-       rd = alloc_rootdomain();
-       if (!rd) {
+               return sa_notcovered;
+       }
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
+#endif
+       if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
+               return sa_sched_group_nodes;
+       if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
+               return sa_nodemask;
+       if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
+               return sa_this_sibling_map;
+       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+               return sa_this_core_map;
+       if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
+               return sa_send_covered;
+       d->rd = alloc_rootdomain();
+       if (!d->rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
-               goto free_sched_groups;
+               return sa_tmpmask;
        }
+       return sa_rootdomain;
+}
 
+static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+{
+       struct sched_domain *sd = NULL;
 #ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
-#endif
-
-       /*
-        * Set up domains for cpus specified by the cpu_map.
-        */
-       for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd = NULL, *p;
-
-               cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
-
-#ifdef CONFIG_NUMA
-               if (cpumask_weight(cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
-                       sd = &per_cpu(allnodes_domains, i).sd;
-                       SD_INIT(sd, ALLNODES);
-                       set_domain_attribute(sd, attr);
-                       cpumask_copy(sched_domain_span(sd), cpu_map);
-                       cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
-                       p = sd;
-                       sd_allnodes = 1;
-               } else
-                       p = NULL;
+       struct sched_domain *parent;
 
-               sd = &per_cpu(node_domains, i).sd;
-               SD_INIT(sd, NODE);
+       d->sd_allnodes = 0;
+       if (cpumask_weight(cpu_map) >
+           SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
+               sd = &per_cpu(allnodes_domains, i).sd;
+               SD_INIT(sd, ALLNODES);
                set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
-               sd->parent = p;
-               if (p)
-                       p->child = sd;
-               cpumask_and(sched_domain_span(sd),
-                           sched_domain_span(sd), cpu_map);
+               cpumask_copy(sched_domain_span(sd), cpu_map);
+               cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
+               d->sd_allnodes = 1;
+       }
+       parent = sd;
+
+       sd = &per_cpu(node_domains, i).sd;
+       SD_INIT(sd, NODE);
+       set_domain_attribute(sd, attr);
+       sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+       sd->parent = parent;
+       if (parent)
+               parent->child = sd;
+       cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
 #endif
+       return sd;
+}
 
-               p = sd;
-               sd = &per_cpu(phys_domains, i).sd;
-               SD_INIT(sd, CPU);
-               set_domain_attribute(sd, attr);
-               cpumask_copy(sched_domain_span(sd), nodemask);
-               sd->parent = p;
-               if (p)
-                       p->child = sd;
-               cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+{
+       struct sched_domain *sd;
+       sd = &per_cpu(phys_domains, i).sd;
+       SD_INIT(sd, CPU);
+       set_domain_attribute(sd, attr);
+       cpumask_copy(sched_domain_span(sd), d->nodemask);
+       sd->parent = parent;
+       if (parent)
+               parent->child = sd;
+       cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
+       return sd;
+}
 
+static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+{
+       struct sched_domain *sd = parent;
 #ifdef CONFIG_SCHED_MC
-               p = sd;
-               sd = &per_cpu(core_domains, i).sd;
-               SD_INIT(sd, MC);
-               set_domain_attribute(sd, attr);
-               cpumask_and(sched_domain_span(sd), cpu_map,
-                                                  cpu_coregroup_mask(i));
-               sd->parent = p;
-               p->child = sd;
-               cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+       sd = &per_cpu(core_domains, i).sd;
+       SD_INIT(sd, MC);
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
+       sd->parent = parent;
+       parent->child = sd;
+       cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
 #endif
+       return sd;
+}
 
+static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+{
+       struct sched_domain *sd = parent;
 #ifdef CONFIG_SCHED_SMT
-               p = sd;
-               sd = &per_cpu(cpu_domains, i).sd;
-               SD_INIT(sd, SIBLING);
-               set_domain_attribute(sd, attr);
-               cpumask_and(sched_domain_span(sd),
-                           topology_thread_cpumask(i), cpu_map);
-               sd->parent = p;
-               p->child = sd;
-               cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+       sd = &per_cpu(cpu_domains, i).sd;
+       SD_INIT(sd, SIBLING);
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
+       sd->parent = parent;
+       parent->child = sd;
+       cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
 #endif
-       }
+       return sd;
+}
 
+static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
+                              const struct cpumask *cpu_map, int cpu)
+{
+       switch (l) {
 #ifdef CONFIG_SCHED_SMT
-       /* Set up CPU (sibling) groups */
-       for_each_cpu(i, cpu_map) {
-               cpumask_and(this_sibling_map,
-                           topology_thread_cpumask(i), cpu_map);
-               if (i != cpumask_first(this_sibling_map))
-                       continue;
-
-               init_sched_build_groups(this_sibling_map, cpu_map,
-                                       &cpu_to_cpu_group,
-                                       send_covered, tmpmask);
-       }
+       case SD_LV_SIBLING: /* set up CPU (sibling) groups */
+               cpumask_and(d->this_sibling_map, cpu_map,
+                           topology_thread_cpumask(cpu));
+               if (cpu == cpumask_first(d->this_sibling_map))
+                       init_sched_build_groups(d->this_sibling_map, cpu_map,
+                                               &cpu_to_cpu_group,
+                                               d->send_covered, d->tmpmask);
+               break;
 #endif
-
 #ifdef CONFIG_SCHED_MC
-       /* Set up multi-core groups */
-       for_each_cpu(i, cpu_map) {
-               cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
-               if (i != cpumask_first(this_core_map))
-                       continue;
-
-               init_sched_build_groups(this_core_map, cpu_map,
-                                       &cpu_to_core_group,
-                                       send_covered, tmpmask);
-       }
+       case SD_LV_MC: /* set up multi-core groups */
+               cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
+               if (cpu == cpumask_first(d->this_core_map))
+                       init_sched_build_groups(d->this_core_map, cpu_map,
+                                               &cpu_to_core_group,
+                                               d->send_covered, d->tmpmask);
+               break;
 #endif
-
-       /* Set up physical groups */
-       for (i = 0; i < nr_node_ids; i++) {
-               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-               if (cpumask_empty(nodemask))
-                       continue;
-
-               init_sched_build_groups(nodemask, cpu_map,
-                                       &cpu_to_phys_group,
-                                       send_covered, tmpmask);
-       }
-
+       case SD_LV_CPU: /* set up physical groups */
+               cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
+               if (!cpumask_empty(d->nodemask))
+                       init_sched_build_groups(d->nodemask, cpu_map,
+                                               &cpu_to_phys_group,
+                                               d->send_covered, d->tmpmask);
+               break;
 #ifdef CONFIG_NUMA
-       /* Set up node groups */
-       if (sd_allnodes) {
-               init_sched_build_groups(cpu_map, cpu_map,
-                                       &cpu_to_allnodes_group,
-                                       send_covered, tmpmask);
+       case SD_LV_ALLNODES:
+               init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
+                                       d->send_covered, d->tmpmask);
+               break;
+#endif
+       default:
+               break;
        }
+}
 
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Set up node groups */
-               struct sched_group *sg, *prev;
-               int j;
-
-               cpumask_clear(covered);
-               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-               if (cpumask_empty(nodemask)) {
-                       sched_group_nodes[i] = NULL;
-                       continue;
-               }
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int __build_sched_domains(const struct cpumask *cpu_map,
+                                struct sched_domain_attr *attr)
+{
+       enum s_alloc alloc_state = sa_none;
+       struct s_data d;
+       struct sched_domain *sd;
+       int i;
+#ifdef CONFIG_NUMA
+       d.sd_allnodes = 0;
+#endif
 
-               sched_domain_node_span(i, domainspan);
-               cpumask_and(domainspan, domainspan, cpu_map);
+       alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+       if (alloc_state != sa_rootdomain)
+               goto error;
+       alloc_state = sa_sched_groups;
 
-               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-                                 GFP_KERNEL, i);
-               if (!sg) {
-                       printk(KERN_WARNING "Can not alloc domain group for "
-                               "node %d\n", i);
-                       goto error;
-               }
-               sched_group_nodes[i] = sg;
-               for_each_cpu(j, nodemask) {
-                       struct sched_domain *sd;
+       /*
+        * Set up domains for cpus specified by the cpu_map.
+        */
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
+                           cpu_map);
 
-                       sd = &per_cpu(node_domains, j).sd;
-                       sd->groups = sg;
-               }
-               sg->__cpu_power = 0;
-               cpumask_copy(sched_group_cpus(sg), nodemask);
-               sg->next = sg;
-               cpumask_or(covered, covered, nodemask);
-               prev = sg;
+               sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
+               sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+               sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
+               sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
+       }
 
-               for (j = 0; j < nr_node_ids; j++) {
-                       int n = (i + j) % nr_node_ids;
+       for_each_cpu(i, cpu_map) {
+               build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+               build_sched_groups(&d, SD_LV_MC, cpu_map, i);
+       }
 
-                       cpumask_complement(notcovered, covered);
-                       cpumask_and(tmpmask, notcovered, cpu_map);
-                       cpumask_and(tmpmask, tmpmask, domainspan);
-                       if (cpumask_empty(tmpmask))
-                               break;
+       /* Set up physical groups */
+       for (i = 0; i < nr_node_ids; i++)
+               build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
 
-                       cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
-                       if (cpumask_empty(tmpmask))
-                               continue;
+#ifdef CONFIG_NUMA
+       /* Set up node groups */
+       if (d.sd_allnodes)
+               build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
 
-                       sg = kmalloc_node(sizeof(struct sched_group) +
-                                         cpumask_size(),
-                                         GFP_KERNEL, i);
-                       if (!sg) {
-                               printk(KERN_WARNING
-                               "Can not alloc domain group for node %d\n", j);
-                               goto error;
-                       }
-                       sg->__cpu_power = 0;
-                       cpumask_copy(sched_group_cpus(sg), tmpmask);
-                       sg->next = prev->next;
-                       cpumask_or(covered, covered, tmpmask);
-                       prev->next = sg;
-                       prev = sg;
-               }
-       }
+       for (i = 0; i < nr_node_ids; i++)
+               if (build_numa_sched_groups(&d, cpu_map, i))
+                       goto error;
 #endif
 
        /* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
        for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
-
+               sd = &per_cpu(cpu_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
 #endif
 #ifdef CONFIG_SCHED_MC
        for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
-
+               sd = &per_cpu(core_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
 #endif
 
        for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
-
+               sd = &per_cpu(phys_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
 
 #ifdef CONFIG_NUMA
        for (i = 0; i < nr_node_ids; i++)
-               init_numa_sched_groups_power(sched_group_nodes[i]);
+               init_numa_sched_groups_power(d.sched_group_nodes[i]);
 
-       if (sd_allnodes) {
+       if (d.sd_allnodes) {
                struct sched_group *sg;
 
                cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
-                                                               tmpmask);
+                                                               d.tmpmask);
                init_numa_sched_groups_power(sg);
        }
 #endif
 
        /* Attach the domains */
        for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
@@ -8658,44 +8834,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 #else
                sd = &per_cpu(phys_domains, i).sd;
 #endif
-               cpu_attach_domain(sd, rd, i);
+               cpu_attach_domain(sd, d.rd, i);
        }
 
-       err = 0;
-
-free_tmpmask:
-       free_cpumask_var(tmpmask);
-free_send_covered:
-       free_cpumask_var(send_covered);
-free_this_core_map:
-       free_cpumask_var(this_core_map);
-free_this_sibling_map:
-       free_cpumask_var(this_sibling_map);
-free_nodemask:
-       free_cpumask_var(nodemask);
-free_notcovered:
-#ifdef CONFIG_NUMA
-       free_cpumask_var(notcovered);
-free_covered:
-       free_cpumask_var(covered);
-free_domainspan:
-       free_cpumask_var(domainspan);
-out:
-#endif
-       return err;
-
-free_sched_groups:
-#ifdef CONFIG_NUMA
-       kfree(sched_group_nodes);
-#endif
-       goto free_tmpmask;
+       d.sched_group_nodes = NULL; /* don't free this we still need it */
+       __free_domain_allocs(&d, sa_tmpmask, cpu_map);
+       return 0;
 
-#ifdef CONFIG_NUMA
 error:
-       free_sched_groups(cpu_map, tmpmask);
-       free_rootdomain(rd);
-       goto free_tmpmask;
-#endif
+       __free_domain_allocs(&d, alloc_state, cpu_map);
+       return -ENOMEM;
 }
 
 static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9014,6 +9162,7 @@ void __init sched_init_smp(void)
        cpumask_var_t non_isolated_cpus;
 
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
 #if defined(CONFIG_NUMA)
        sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9045,7 +9194,6 @@ void __init sched_init_smp(void)
        sched_init_granularity();
        free_cpumask_var(non_isolated_cpus);
 
-       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
        init_sched_rt_class();
 }
 #else
@@ -9303,11 +9451,11 @@ void __init sched_init(void)
                 * system cpu resource, based on the weight assigned to root
                 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
                 * by letting tasks of init_task_group sit in a separate cfs_rq
-                * (init_cfs_rq) and having one entity represent this group of
+                * (init_tg_cfs_rq) and having one entity represent this group of
                 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
                 */
                init_tg_cfs_entry(&init_task_group,
-                               &per_cpu(init_cfs_rq, i),
+                               &per_cpu(init_tg_cfs_rq, i),
                                &per_cpu(init_sched_entity, i), i, 1,
                                root_task_group.se[i]);
 
@@ -9333,6 +9481,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
+               rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
@@ -9391,19 +9540,26 @@ void __init sched_init(void)
        alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
-       perf_counter_init();
+       perf_event_init();
 
        scheduler_running = 1;
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+       int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+       return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
 
-       if ((!in_atomic() && !irqs_disabled()) ||
-                   system_state != SYSTEM_RUNNING || oops_in_progress)
+       if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+           system_state != SYSTEM_RUNNING || oops_in_progress)
                return;
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
@@ -10580,3 +10736,113 @@ struct cgroup_subsys cpuacct_subsys = {
        .subsys_id = cpuacct_subsys_id,
 };
 #endif /* CONFIG_CGROUP_CPUACCT */
+
+#ifndef CONFIG_SMP
+
+int rcu_expedited_torture_stats(char *page)
+{
+       return 0;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+void synchronize_sched_expedited(void)
+{
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
+static DEFINE_MUTEX(rcu_sched_expedited_mutex);
+
+#define RCU_EXPEDITED_STATE_POST -2
+#define RCU_EXPEDITED_STATE_IDLE -1
+
+static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+
+int rcu_expedited_torture_stats(char *page)
+{
+       int cnt = 0;
+       int cpu;
+
+       cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
+       for_each_online_cpu(cpu) {
+                cnt += sprintf(&page[cnt], " %d:%d",
+                               cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
+       }
+       cnt += sprintf(&page[cnt], "\n");
+       return cnt;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+static long synchronize_sched_expedited_count;
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly.  This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier.  Failing to
+ * observe this restriction will result in deadlock.
+ */
+void synchronize_sched_expedited(void)
+{
+       int cpu;
+       unsigned long flags;
+       bool need_full_sync = 0;
+       struct rq *rq;
+       struct migration_req *req;
+       long snap;
+       int trycount = 0;
+
+       smp_mb();  /* ensure prior mod happens before capturing snap. */
+       snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+       get_online_cpus();
+       while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+               put_online_cpus();
+               if (trycount++ < 10)
+                       udelay(trycount * num_online_cpus());
+               else {
+                       synchronize_sched();
+                       return;
+               }
+               if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       return;
+               }
+               get_online_cpus();
+       }
+       rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
+       for_each_online_cpu(cpu) {
+               rq = cpu_rq(cpu);
+               req = &per_cpu(rcu_migration_req, cpu);
+               init_completion(&req->done);
+               req->task = NULL;
+               req->dest_cpu = RCU_MIGRATION_NEED_QS;
+               spin_lock_irqsave(&rq->lock, flags);
+               list_add(&req->list, &rq->migration_queue);
+               spin_unlock_irqrestore(&rq->lock, flags);
+               wake_up_process(rq->migration_thread);
+       }
+       for_each_online_cpu(cpu) {
+               rcu_expedited_state = cpu;
+               req = &per_cpu(rcu_migration_req, cpu);
+               rq = cpu_rq(cpu);
+               wait_for_completion(&req->done);
+               spin_lock_irqsave(&rq->lock, flags);
+               if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
+                       need_full_sync = 1;
+               req->dest_cpu = RCU_MIGRATION_IDLE;
+               spin_unlock_irqrestore(&rq->lock, flags);
+       }
+       rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+       mutex_unlock(&rcu_sched_expedited_mutex);
+       put_online_cpus();
+       if (need_full_sync)
+               synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */