Merge branch 'fix/hda' into for-linus

[pandora-kernel.git] / kernel / sched_fair.c
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 0c26e2d..7f00772 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
  
  #include <linux/latencytop.h>
  #include <linux/sched.h>
+#include <linux/cpumask.h>
  
  /*
   * Targeted preemption latency for CPU-bound tasks:
@@ -68,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
   */
  unsigned int sysctl_sched_child_runs_first __read_mostly;
  
-/*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
  /*
   * SCHED_OTHER wake-up granularity.
   * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
@@ -419,7 +412,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
  }
  
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  {
         struct rb_node *left = cfs_rq->rb_leftmost;
  
@@ -429,6 +422,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
         return rb_entry(left, struct sched_entity, run_node);
  }
  
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+       struct rb_node *next = rb_next(&se->run_node);
+
+       if (!next)
+               return NULL;
+
+       return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
  static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  {
         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +447,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
   * Scheduling class statistics methods:
   */
  
-#ifdef CONFIG_SCHED_DEBUG
  int sched_proc_update_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
                 loff_t *ppos)
@@ -540,7 +543,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
  
  /*
   * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +736,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
             now - cfs_rq->load_last > 4 * period) {
                 cfs_rq->load_period = 0;
                 cfs_rq->load_avg = 0;
+               delta = period - 1;
         }
  
         cfs_rq->load_stamp = now;
@@ -763,16 +767,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
                 list_del_leaf_cfs_rq(cfs_rq);
  }
  
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-                               long weight_delta)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  {
         long load_weight, load, shares;
  
-       load = cfs_rq->load.weight + weight_delta;
+       load = cfs_rq->load.weight;
  
         load_weight = atomic_read(&tg->load_weight);
-       load_weight -= cfs_rq->load_contribution;
         load_weight += load;
+       load_weight -= cfs_rq->load_contribution;
  
         shares = (tg->shares * load);
         if (load_weight)
@@ -790,7 +793,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
  {
         if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
                 update_cfs_load(cfs_rq, 0);
-               update_cfs_shares(cfs_rq, 0);
+               update_cfs_shares(cfs_rq);
         }
  }
  # else /* CONFIG_SMP */
@@ -798,8 +801,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
  {
  }
  
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-                               long weight_delta)
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  {
         return tg->shares;
  }
@@ -824,7 +826,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                 account_entity_enqueue(cfs_rq, se);
  }
  
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
  {
         struct task_group *tg;
         struct sched_entity *se;
@@ -838,7 +840,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
         if (likely(se->load.weight == tg->shares))
                 return;
  #endif
-       shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+       shares = calc_cfs_shares(cfs_rq, tg);
  
         reweight_entity(cfs_rq_of(se), se, shares);
  }
@@ -847,7 +849,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
  {
  }
  
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
  {
  }
  
@@ -978,8 +980,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          */
         update_curr(cfs_rq);
         update_cfs_load(cfs_rq, 0);
-       update_cfs_shares(cfs_rq, se->load.weight);
         account_entity_enqueue(cfs_rq, se);
+       update_cfs_shares(cfs_rq);
  
         if (flags & ENQUEUE_WAKEUP) {
                 place_entity(cfs_rq, se, 0);
@@ -996,19 +998,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
                 list_add_leaf_cfs_rq(cfs_rq);
  }
  
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_last(struct sched_entity *se)
  {
-       if (!se || cfs_rq->last == se)
-               cfs_rq->last = NULL;
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               if (cfs_rq->last == se)
+                       cfs_rq->last = NULL;
+               else
+                       break;
+       }
+}
+
+static void __clear_buddies_next(struct sched_entity *se)
+{
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               if (cfs_rq->next == se)
+                       cfs_rq->next = NULL;
+               else
+                       break;
+       }
+}
  
-       if (!se || cfs_rq->next == se)
-               cfs_rq->next = NULL;
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               if (cfs_rq->skip == se)
+                       cfs_rq->skip = NULL;
+               else
+                       break;
+       }
  }
  
  static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       for_each_sched_entity(se)
-               __clear_buddies(cfs_rq_of(se), se);
+       if (cfs_rq->last == se)
+               __clear_buddies_last(se);
+
+       if (cfs_rq->next == se)
+               __clear_buddies_next(se);
+
+       if (cfs_rq->skip == se)
+               __clear_buddies_skip(se);
  }
  
  static void
@@ -1041,7 +1073,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         update_cfs_load(cfs_rq, 0);
         account_entity_dequeue(cfs_rq, se);
         update_min_vruntime(cfs_rq);
-       update_cfs_shares(cfs_rq, 0);
+       update_cfs_shares(cfs_rq);
  
         /*
          * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1116,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
                 return;
  
         if (cfs_rq->nr_running > 1) {
-               struct sched_entity *se = __pick_next_entity(cfs_rq);
+               struct sched_entity *se = __pick_first_entity(cfs_rq);
                 s64 delta = curr->vruntime - se->vruntime;
  
                 if (delta < 0)
@@ -1128,13 +1160,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  static int
  wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
  static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
  {
-       struct sched_entity *se = __pick_next_entity(cfs_rq);
+       struct sched_entity *se = __pick_first_entity(cfs_rq);
         struct sched_entity *left = se;
  
-       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
-               se = cfs_rq->next;
+       /*
+        * Avoid running the skip buddy, if running something else can
+        * be done without getting too unfair.
+        */
+       if (cfs_rq->skip == se) {
+               struct sched_entity *second = __pick_next_entity(se);
+               if (second && wakeup_preempt_entity(second, left) < 1)
+                       se = second;
+       }
  
         /*
          * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1188,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
         if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
                 se = cfs_rq->last;
  
+       /*
+        * Someone really wants this to run. If it's not unfair, run it.
+        */
+       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+               se = cfs_rq->next;
+
         clear_buddies(cfs_rq, se);
  
         return se;
@@ -1282,7 +1334,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
                 update_cfs_load(cfs_rq, 0);
-               update_cfs_shares(cfs_rq, 0);
+               update_cfs_shares(cfs_rq);
         }
  
         hrtick_update(rq);
@@ -1312,58 +1364,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
                 update_cfs_load(cfs_rq, 0);
-               update_cfs_shares(cfs_rq, 0);
+               update_cfs_shares(cfs_rq);
         }
  
         hrtick_update(rq);
  }
  
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
-       struct task_struct *curr = rq->curr;
-       struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-       struct sched_entity *rightmost, *se = &curr->se;
-
-       /*
-        * Are we the only task in the tree?
-        */
-       if (unlikely(cfs_rq->nr_running == 1))
-               return;
-
-       clear_buddies(cfs_rq, se);
-
-       if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-               update_rq_clock(rq);
-               /*
-                * Update run-time statistics of the 'current'.
-                */
-               update_curr(cfs_rq);
-
-               return;
-       }
-       /*
-        * Find the rightmost entry in the rbtree:
-        */
-       rightmost = __pick_last_entity(cfs_rq);
-       /*
-        * Already in the rightmost position?
-        */
-       if (unlikely(!rightmost || entity_before(rightmost, se)))
-               return;
-
-       /*
-        * Minimally necessary key value to be last in the tree:
-        * Upon rescheduling, sched_class::put_prev_task() will place
-        * 'current' within the tree based on its new key value.
-        */
-       se->vruntime = rightmost->vruntime + 1;
-}
-
  #ifdef CONFIG_SMP
  
  static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1840,14 @@ static void set_next_buddy(struct sched_entity *se)
         }
  }
  
+static void set_skip_buddy(struct sched_entity *se)
+{
+       if (likely(task_of(se)->policy != SCHED_IDLE)) {
+               for_each_sched_entity(se)
+                       cfs_rq_of(se)->skip = se;
+       }
+}
+
  /*
   * Preempt the current task with a newly woken task if needed:
   */
@@ -1857,16 +1871,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         if (test_tsk_need_resched(curr))
                 return;
  
+       /* Idle tasks are by definition preempted by non-idle tasks. */
+       if (unlikely(curr->policy == SCHED_IDLE) &&
+           likely(p->policy != SCHED_IDLE))
+               goto preempt;
+
         /*
-        * Batch and idle tasks do not preempt (their preemption is driven by
-        * the tick):
+        * Batch and idle tasks do not preempt non-idle tasks (their preemption
+        * is driven by the tick):
          */
         if (unlikely(p->policy != SCHED_NORMAL))
                 return;
  
-       /* Idle tasks are by definition preempted by everybody. */
-       if (unlikely(curr->policy == SCHED_IDLE))
-               goto preempt;
  
         if (!sched_feat(WAKEUP_PREEMPT))
                 return;
@@ -1932,6 +1948,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
         }
  }
  
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+       struct task_struct *curr = rq->curr;
+       struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+       struct sched_entity *se = &curr->se;
+
+       /*
+        * Are we the only task in the tree?
+        */
+       if (unlikely(rq->nr_running == 1))
+               return;
+
+       clear_buddies(cfs_rq, se);
+
+       if (curr->policy != SCHED_BATCH) {
+               update_rq_clock(rq);
+               /*
+                * Update run-time statistics of the 'current'.
+                */
+               update_curr(cfs_rq);
+       }
+
+       set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+       struct sched_entity *se = &p->se;
+
+       if (!se->on_rq)
+               return false;
+
+       /* Tell the scheduler that we'd really like se to run next. */
+       set_next_buddy(se);
+
+       yield_task_fair(rq);
+
+       return true;
+}
+
  #ifdef CONFIG_SMP
  /**************************************************
   * Fair scheduling class load-balancing methods:
@@ -2123,7 +2184,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
          * We need to update shares after updating tg->load_weight in
          * order to adjust the weight of groups with long running tasks.
          */
-       update_cfs_shares(cfs_rq, 0);
+       update_cfs_shares(cfs_rq);
  
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  
@@ -2610,7 +2671,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
   * @this_cpu: Cpu for which load balance is currently performed.
   * @idle: Idle status of this_cpu
   * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
   * @local_group: Does group contain this_cpu.
   * @cpus: Set of cpus considered for load balancing.
   * @balance: Should we balance.
@@ -2618,7 +2678,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
   */
  static inline void update_sg_lb_stats(struct sched_domain *sd,
                         struct sched_group *group, int this_cpu,
-                       enum cpu_idle_type idle, int load_idx, int *sd_idle,
+                       enum cpu_idle_type idle, int load_idx,
                         int local_group, const struct cpumask *cpus,
                         int *balance, struct sg_lb_stats *sgs)
  {
@@ -2638,9 +2698,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                 struct rq *rq = cpu_rq(i);
  
-               if (*sd_idle && rq->nr_running)
-                       *sd_idle = 0;
-
                 /* Bias balancing toward cpus of our domain */
                 if (local_group) {
                         if (idle_cpu(i) && !first_idle_cpu) {
@@ -2685,7 +2742,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  
         /*
          * Consider the group unbalanced when the imbalance is larger
-        * than the average weight of two tasks.
+        * than the average weight of a task.
          *
          * APZ: with cgroup the avg task weight can vary wildly and
          *      might not be a suitable number - should we keep a
@@ -2695,7 +2752,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         if (sgs->sum_nr_running)
                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
  
-       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
+       if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
                 sgs->group_imb = 1;
  
         sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -2755,15 +2812,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
   * @sd: sched_domain whose statistics are to be updated.
   * @this_cpu: Cpu for which load balance is currently performed.
   * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing sg.
   * @cpus: Set of cpus considered for load balancing.
   * @balance: Should we balance.
   * @sds: variable to hold the statistics for this sched_domain.
   */
  static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-                       enum cpu_idle_type idle, int *sd_idle,
-                       const struct cpumask *cpus, int *balance,
-                       struct sd_lb_stats *sds)
+                       enum cpu_idle_type idle, const struct cpumask *cpus,
+                       int *balance, struct sd_lb_stats *sds)
  {
         struct sched_domain *child = sd->child;
         struct sched_group *sg = sd->groups;
@@ -2781,7 +2836,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
  
                 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
                 memset(&sgs, 0, sizeof(sgs));
-               update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
+               update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
                                 local_group, cpus, balance, &sgs);
  
                 if (local_group && !(*balance))
@@ -3007,7 +3062,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  
         /*
          * if *imbalance is less than the average load per runnable task
-        * there is no gaurantee that any tasks will be moved so we'll have
+        * there is no guarantee that any tasks will be moved so we'll have
          * a think about bumping its value to force at least one task to be
          * moved
          */
@@ -3033,7 +3088,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
   * @imbalance: Variable which stores amount of weighted load which should
   *             be moved to restore balance/put a group to idle.
   * @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
   * @cpus: The set of CPUs under consideration for load-balancing.
   * @balance: Pointer to a variable indicating if this_cpu
   *     is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3100,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
                    unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const struct cpumask *cpus, int *balance)
+                  const struct cpumask *cpus, int *balance)
  {
         struct sd_lb_stats sds;
  
@@ -3056,22 +3110,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
          * Compute the various statistics relavent for load balancing at
          * this level.
          */
-       update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
-                                       balance, &sds);
-
-       /* Cases where imbalance does not exist from POV of this_cpu */
-       /* 1) this_cpu is not the appropriate cpu to perform load balancing
-        *    at this level.
-        * 2) There is no busy sibling group to pull from.
-        * 3) This group is the busiest group.
-        * 4) This group is more busy than the avg busieness at this
-        *    sched_domain.
-        * 5) The imbalance is within the specified limit.
-        *
-        * Note: when doing newidle balance, if the local group has excess
-        * capacity (i.e. nr_running < group_capacity) and the busiest group
-        * does not have any capacity, we force a load balance to pull tasks
-        * to the local group. In this case, we skip past checks 3, 4 and 5.
+       update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+
+       /*
+        * this_cpu is not the appropriate cpu to perform load balancing at
+        * this level.
          */
         if (!(*balance))
                 goto ret;
@@ -3080,41 +3123,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
             check_asym_packing(sd, &sds, this_cpu, imbalance))
                 return sds.busiest;
  
+       /* There is no busy sibling group to pull tasks from */
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
  
-       /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+       /*
+        * If the busiest group is imbalanced the below checks don't
+        * work because they assume all things are equal, which typically
+        * isn't true due to cpus_allowed constraints and the like.
+        */
+       if (sds.group_imb)
+               goto force_balance;
+
+       /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
         if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
                         !sds.busiest_has_capacity)
                 goto force_balance;
  
+       /*
+        * If the local group is more busy than the selected busiest group
+        * don't try and pull any tasks.
+        */
         if (sds.this_load >= sds.max_load)
                 goto out_balanced;
  
+       /*
+        * Don't pull any tasks if this group is already above the domain
+        * average load.
+        */
         sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-
         if (sds.this_load >= sds.avg_load)
                 goto out_balanced;
  
-       /*
-        * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
-        * And to check for busy balance use !idle_cpu instead of
-        * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
-        * even when they are idle.
-        */
-       if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
-               if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-                       goto out_balanced;
-       } else {
+       if (idle == CPU_IDLE) {
                 /*
                  * This cpu is idle. If the busiest group load doesn't
                  * have more tasks than the number of available cpu's and
                  * there is no imbalance between this and busiest group
                  * wrt to idle cpu's, it is balanced.
                  */
-               if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+               if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
                     sds.busiest_nr_running <= sds.busiest_group_weight)
                         goto out_balanced;
+       } else {
+               /*
+                * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+                * imbalance_pct to be conservative.
+                */
+               if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                       goto out_balanced;
         }
  
  force_balance:
@@ -3193,7 +3250,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
  /* Working cpumask for load_balance and load_balance_newidle. */
  static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
  
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+static int need_active_balance(struct sched_domain *sd, int idle,
                                int busiest_cpu, int this_cpu)
  {
         if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3282,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
                  * move_tasks() will succeed.  ld_moved will be true and this
                  * active balance code will not be triggered.
                  */
-               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-                   !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                       return 0;
-
                 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
                         return 0;
         }
@@ -3246,7 +3299,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *balance)
  {
-       int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+       int ld_moved, all_pinned = 0, active_balance = 0;
         struct sched_group *group;
         unsigned long imbalance;
         struct rq *busiest;
@@ -3255,20 +3308,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
  
         cpumask_copy(cpus, cpu_active_mask);
  
-       /*
-        * When power savings policy is enabled for the parent domain, idle
-        * sibling can pick up load irrespective of busy siblings. In this case,
-        * let the state of idle sibling percolate up as CPU_IDLE, instead of
-        * portraying it as CPU_NOT_IDLE.
-        */
-       if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               sd_idle = 1;
-
         schedstat_inc(sd, lb_count[idle]);
  
  redo:
-       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+       group = find_busiest_group(sd, this_cpu, &imbalance, idle,
                                    cpus, balance);
  
         if (*balance == 0)
@@ -3330,8 +3373,7 @@ redo:
                 if (idle != CPU_NEWLY_IDLE)
                         sd->nr_balance_failed++;
  
-               if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
-                                       this_cpu)) {
+               if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
                         raw_spin_lock_irqsave(&busiest->lock, flags);
  
                         /* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3428,6 @@ redo:
                         sd->balance_interval *= 2;
         }
  
-       if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               ld_moved = -1;
-
         goto out;
  
  out_balanced:
@@ -3403,11 +3441,7 @@ out_one_pinned:
                         (sd->balance_interval < sd->max_interval))
                 sd->balance_interval *= 2;
  
-       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               ld_moved = -1;
-       else
-               ld_moved = 0;
+       ld_moved = 0;
  out:
         return ld_moved;
  }
@@ -3786,6 +3820,17 @@ void select_nohz_load_balancer(int stop_tick)
  
  static DEFINE_SPINLOCK(balancing);
  
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+static void update_max_interval(void)
+{
+       max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
  /*
   * It checks each scheduling domain to see if it is due to be balanced,
   * and initiates a balancing operation if so.
@@ -3815,10 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
  
                 /* scale ms to jiffies */
                 interval = msecs_to_jiffies(interval);
-               if (unlikely(!interval))
-                       interval = 1;
-               if (interval > HZ*NR_CPUS/10)
-                       interval = HZ*NR_CPUS/10;
+               interval = clamp(interval, 1UL, max_load_balance_interval);
  
                 need_serialize = sd->flags & SD_SERIALIZE;
  
@@ -3831,8 +3873,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                         if (load_balance(cpu, rq, sd, idle, &balance)) {
                                 /*
                                  * We've pulled tasks over so either we're no
-                                * longer idle, or one of our SMT siblings is
-                                * not idle.
+                                * longer idle.
                                  */
                                 idle = CPU_NOT_IDLE;
                         }
@@ -4079,33 +4120,62 @@ static void task_fork_fair(struct task_struct *p)
   * Priority of the task has changed. Check to see if we preempt
   * the current task.
   */
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
-                             int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
  {
+       if (!p->se.on_rq)
+               return;
+
         /*
          * Reschedule if we are currently running on this runqueue and
          * our priority decreased, or if we are not currently running on
          * this runqueue and our priority is higher than the current's
          */
-       if (running) {
+       if (rq->curr == p) {
                 if (p->prio > oldprio)
                         resched_task(rq->curr);
         } else
                 check_preempt_curr(rq, p, 0);
  }
  
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       /*
+        * Ensure the task's vruntime is normalized, so that when it's
+        * switched back to the fair class the enqueue_entity(.flags=0) will
+        * do the right thing.
+        *
+        * If it was on_rq, then the dequeue_entity(.flags=0) will already
+        * have normalized the vruntime; if it was !on_rq, then only when
+        * the task is sleeping will it still have non-normalized vruntime.
+        */
+       if (!se->on_rq && p->state != TASK_RUNNING) {
+               /*
+                * Fix up our vruntime so that the current sleep doesn't
+                * cause 'unlimited' sleep bonus.
+                */
+               place_entity(cfs_rq, se, 0);
+               se->vruntime -= cfs_rq->min_vruntime;
+       }
+}
+
  /*
   * We switched to the sched_fair class.
   */
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
-                            int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
  {
+       if (!p->se.on_rq)
+               return;
+
         /*
          * We were most likely switched from sched_rt, so
          * kick off the schedule if running, otherwise just see
          * if we can still preempt the current task.
          */
-       if (running)
+       if (rq->curr == p)
                 resched_task(rq->curr);
         else
                 check_preempt_curr(rq, p, 0);
@@ -4171,6 +4241,7 @@ static const struct sched_class fair_sched_class = {
         .enqueue_task           = enqueue_task_fair,
         .dequeue_task           = dequeue_task_fair,
         .yield_task             = yield_task_fair,
+       .yield_to_task          = yield_to_task_fair,
  
         .check_preempt_curr     = check_preempt_wakeup,
  
@@ -4191,6 +4262,7 @@ static const struct sched_class fair_sched_class = {
         .task_fork              = task_fork_fair,
  
         .prio_changed           = prio_changed_fair,
+       .switched_from          = switched_from_fair,
         .switched_to            = switched_to_fair,
  
         .get_rr_interval        = get_rr_interval_fair,