Merge branch 'sched/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip...
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 26ebe18..cf2cd6c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -726,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
                __enqueue_entity(cfs_rq, se);
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-       s64 diff = sample - *avg;
-       *avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       if (!se->last_wakeup)
-               return;
-
-       update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-       se->last_wakeup = 0;
-}
-
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -751,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
        update_stats_dequeue(cfs_rq, se);
        if (sleep) {
-               update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);
@@ -894,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
-       int requeue = rq->curr == p;
        struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
@@ -915,13 +898,13 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
                 * Don't schedule slices shorter than 10000ns, that just
                 * doesn't make sense. Rely on vruntime for fairness.
                 */
-               if (!requeue)
+               if (rq->curr != p)
                        delta = max(10000LL, delta);
 
-               hrtick_start(rq, delta, requeue);
+               hrtick_start(rq, delta);
        }
 }
-#else
+#else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -1020,6 +1003,8 @@ static void yield_task_fair(struct rq *rq)
  * not idle and an idle cpu is available.  The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map).
  *
  * Returns the CPU we should wake onto.
  */
@@ -1047,7 +1032,8 @@ static int wake_idle(int cpu, struct task_struct *p)
                    || ((sd->flags & SD_WAKE_IDLE_FAR)
                        && !task_hot(p, task_rq(p)->clock, sd))) {
                        cpus_and(tmp, sd->span, p->cpus_allowed);
-                       for_each_cpu_mask(i, tmp) {
+                       cpus_and(tmp, tmp, cpu_active_map);
+                       for_each_cpu_mask_nr(i, tmp) {
                                if (idle_cpu(i)) {
                                        if (i != task_cpu(p)) {
                                                schedstat_inc(p,
@@ -1062,7 +1048,7 @@ static int wake_idle(int cpu, struct task_struct *p)
        }
        return cpu;
 }
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE */
 static inline int wake_idle(int cpu, struct task_struct *p)
 {
        return cpu;
@@ -1074,10 +1060,50 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make the group heavier, but it can cause
+ * movement of group shares between cpus. Assuming the shares were perfectly
+ * aligned, one can calculate the resulting shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive;
+ * hence we try to avoid doing that too often - see update_shares(), which
+ * ratelimits this change.
+ *
+ * We compensate for this by not only taking the current delta into account,
+ * but also considering the delta between when the shares were last adjusted
+ * and now.
+ *
+ * We still saw a performance dip; tracing showed that when balancing between
+ * cgroup:/ and cgroup:/foo the number of affine wakeups increased
+ * significantly. We therefore try to bias the error in the direction of
+ * failing the affine wakeup.
+ *
+ */
+static long effective_load(struct task_group *tg, int cpu,
+               long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
-       long wg = wl;
+       long more_w;
+
+       if (!tg->parent)
+               return wl;
+
+       /*
+        * By not taking the decrease of shares on the other cpu into
+        * account our error leans towards reducing the affine wakeups.
+        */
+       if (!wl && sched_feat(ASYM_EFF_LOAD))
+               return wl;
+
+       /*
+        * In addition to the provided increment, also add the weight
+        * change accumulated since the shares were last updated.
+        */
+       more_w = se->my_q->load.weight - se->my_q->rq_weight;
+       wl += more_w;
+       wg += more_w;
 
        for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
@@ -1086,12 +1112,19 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 
                S = se->my_q->tg->shares;
                s = se->my_q->shares;
-               rw = se->my_q->load.weight;
+               rw = se->my_q->rq_weight;
 
                a = S*(rw + wl);
                b = S*rw + s*wg;
 
                wl = s*(a-b)/D(b);
+               /*
+                * Assume the group is already running and will
+                * thus already be accounted for in the weight.
+                *
+                * That is, moving shares between CPUs does not
+                * alter the group weight.
+                */
                wg = 0;
 #undef D
        }
@@ -1099,26 +1132,12 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
        return wl;
 }
 
-static unsigned long task_load_sub(struct task_struct *p)
-{
-       return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p));
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
-       return effective_load(task_group(p), p->se.load.weight, cpu);
-}
-
 #else
 
-static unsigned long task_load_sub(struct task_struct *p)
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+               unsigned long wl, unsigned long wg)
 {
-       return -p->se.load.weight;
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
-       return p->se.load.weight;
+       return wl;
 }
 
 #endif
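
As a rough editorial sketch (not part of the patch, made-up numbers): each
iteration of the loop above computes how much this cpu's share of the group
shifts when wl is added to its runqueue weight and wg to the group as a whole,
assuming the shares were distributed as s ~= S*rw/W, with W the group's total
runqueue weight across cpus. The helper name below is hypothetical; S, s, rw,
wl and wg mirror the variables in the patch.

	/*
	 * Hypothetical standalone version of one loop iteration of
	 * effective_load(); not part of the kernel source.
	 */
	static long effective_load_one_level(long S, long s, long rw,
					     long wl, long wg)
	{
		long a = S * (rw + wl);
		long b = S * rw + s * wg;

		return s * (a - b) / (b ? b : 1);	/* D(b): avoid div by 0 */
	}

	/*
	 * Example: S = 1024, s = 512, rw = 2048, wl = wg = 1024
	 *   a   = 1024 * 3072             = 3145728
	 *   b   = 2097152 + 512 * 1024    = 2621440
	 *   wl' = 512 * 524288 / 2621440  = 102
	 *
	 * i.e. seen one level up, the wakeup adds roughly 102 units of
	 * load rather than the task's raw weight of 1024.
	 */
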
@@ -1130,8 +1149,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
            unsigned int imbalance)
 {
        struct task_struct *curr = this_rq->curr;
+       struct task_group *tg;
        unsigned long tl = this_load;
        unsigned long tl_per_task;
+       unsigned long weight;
        int balanced;
 
        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1142,19 +1163,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
         * effect of the currently running task from the load
         * of the current CPU:
         */
-       if (sync)
-               tl += task_load_sub(current);
+       if (sync) {
+               tg = task_group(current);
+               weight = current->se.load.weight;
 
-       balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load;
+               tl += effective_load(tg, this_cpu, -weight, -weight);
+               load += effective_load(tg, prev_cpu, 0, -weight);
+       }
+
+       tg = task_group(p);
+       weight = p->se.load.weight;
+
+       balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+               imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
        /*
         * If the currently running task will sleep within
         * a reasonable amount of time then attract this newly
         * woken task:
         */
-       if (sync && balanced && curr->sched_class == &fair_sched_class) {
+       if (sync && balanced) {
                if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-                               p->se.avg_overlap < sysctl_sched_migration_cost)
+                   p->se.avg_overlap < sysctl_sched_migration_cost)
                        return 1;
        }
 
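
A hedged illustration (again not part of the patch, made-up numbers) of the
balance test above: with CONFIG_FAIR_GROUP_SCHED disabled, effective_load()
falls back to the stub that simply returns wl, so the condition reduces to
comparing this cpu's load plus the waking task's weight against prev_cpu's
load scaled by the imbalance percentage.

	/*
	 * effective_load(tg, this_cpu, weight, weight) -> weight
	 * effective_load(tg, prev_cpu, 0, weight)      -> 0
	 *
	 *   balanced = 100 * (tl + weight) <= imbalance * load
	 *
	 * e.g. tl = 1024, weight = 1024, load = 2048, imbalance = 125:
	 *   100 * 2048 = 204800  <=  125 * 2048 = 256000  -> balanced
	 */
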
@@ -1315,7 +1345,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
                return;
        }
 
-       se->last_wakeup = se->sum_exec_runtime;
        if (unlikely(se == pse))
                return;
 
@@ -1542,7 +1571,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
        return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * scheduler tick hitting a task of our scheduling class: