Merge branch 'next' of git://git.monstr.eu/linux-2.6-microblaze

[pandora-kernel.git] / kernel / sched_fair.c
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 00ebd76..c62ebae 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  
  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec; the value is expressed in nanoseconds)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
  static const struct sched_class fair_sched_class;
  
  /**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
         return cfs_rq->tg->cfs_rq[this_cpu];
  }
  
+/*
+ * Link @cfs_rq into the leaf_cfs_rq_list of its runqueue and set on_list.
+ * Does nothing when on_list is already set.
+ */
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       struct task_group *parent_tg;
+       struct rq *rq;
+
+       if (cfs_rq->on_list)
+               return;
+
+       rq = rq_of(cfs_rq);
+       parent_tg = cfs_rq->tg->parent;
+
+       /*
+        * A child has to come before its parent in the list.  Because
+        * enqueueing always proceeds bottom-up there are only two cases:
+        * the parent is already listed, so we insert ahead of it, or it
+        * is not, so we append and the parent ends up after us once it
+        * is enqueued.
+        */
+       if (parent_tg && parent_tg->cfs_rq[cpu_of(rq)]->on_list)
+               list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                       &rq->leaf_cfs_rq_list);
+       else
+               list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                       &rq->leaf_cfs_rq_list);
+
+       cfs_rq->on_list = 1;
+}
+
+/*
+ * Unlink @cfs_rq from the leaf list and clear on_list.
+ * Does nothing when on_list is not set.
+ */
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_rq->on_list)
+               return;
+
+       list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+       cfs_rq->on_list = 0;
+}
+
  /* Iterate thr' all leaf cfs_rq's on a runqueue */
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
         return &cpu_rq(this_cpu)->cfs;
  }
  
+/*
+ * Empty variant: in this branch for_each_leaf_cfs_rq() only visits
+ * &rq->cfs, so there is no leaf list to add to.
+ * NOTE(review): presumably the !CONFIG_FAIR_GROUP_SCHED build - confirm
+ * against the enclosing #if, which is outside this hunk.
+ */
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+/*
+ * Empty variant: in this branch for_each_leaf_cfs_rq() only visits
+ * &rq->cfs, so there is no leaf list to remove from.
+ * NOTE(review): presumably the !CONFIG_FAIR_GROUP_SCHED build - confirm
+ * against the enclosing #if, which is outside this hunk.
+ */
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
  
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
         WRT_SYSCTL(sched_min_granularity);
         WRT_SYSCTL(sched_latency);
         WRT_SYSCTL(sched_wakeup_granularity);
-       WRT_SYSCTL(sched_shares_ratelimit);
  #undef WRT_SYSCTL
  
         return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
         return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
  /*
   * Update the current task's runtime statistics. Skip current tasks that
   * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  
         curr->vruntime += delta_exec_weighted;
         update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
  }
  
  static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 list_add(&se->group_node, &cfs_rq->tasks);
         }
         cfs_rq->nr_running++;
-       se->on_rq = 1;
  }
  
  static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 list_del_init(&se->group_node);
         }
         cfs_rq->nr_running--;
-       se->on_rq = 0;
  }
  
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+/*
+ * Publish the change in this cfs_rq's averaged load to tg->load_weight.
+ *
+ * The current average is load_avg / (load_period + 1); the +1 presumably
+ * guards against a zero divisor.  Its difference from the previously
+ * published load_contribution is applied when @global_update is set or
+ * when the difference is larger than one eighth of load_contribution.
+ */
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+                                           int global_update)
+{
+       struct task_group *tg = cfs_rq->tg;
+       long delta;
+
+       delta = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+       delta -= cfs_rq->load_contribution;
+
+       /* small change and no forced update: leave the group total alone */
+       if (!global_update && abs(delta) <= cfs_rq->load_contribution / 8)
+               return;
+
+       atomic_add(delta, &tg->load_weight);
+       cfs_rq->load_contribution += delta;
+}
+
+/*
+ * Bring the load-tracking state of @cfs_rq up to date with the rq clock.
+ *
+ * @cfs_rq:        runqueue to update; NULL is ignored
+ * @global_update: non-zero forces update_cfs_rq_load_contribution() to be
+ *                 called and to publish its delta unconditionally
+ *
+ * The time elapsed since load_stamp is added to load_period and, weighted
+ * by the current load, to load_avg.  Both are then halved until
+ * load_period no longer exceeds sysctl_sched_shares_window.  A cfs_rq with
+ * no current entity, nothing running and a zero load_avg is removed from
+ * the leaf list.
+ */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+       u64 period = sysctl_sched_shares_window;
+       u64 now, delta;
+       unsigned long load;
+
+       if (!cfs_rq)
+               return;
+
+       /*
+        * Read the weight only after the NULL check above; doing it in
+        * the declaration would dereference cfs_rq before the check.
+        */
+       load = cfs_rq->load.weight;
+
+       now = rq_of(cfs_rq)->clock;
+       delta = now - cfs_rq->load_stamp;
+
+       /* truncate load history at 4 idle periods */
+       if (cfs_rq->load_stamp > cfs_rq->load_last &&
+           now - cfs_rq->load_last > 4 * period) {
+               cfs_rq->load_period = 0;
+               cfs_rq->load_avg = 0;
+       }
+
+       cfs_rq->load_stamp = now;
+       cfs_rq->load_unacc_exec_time = 0;
+       cfs_rq->load_period += delta;
+       if (load) {
+               /* load_last records the last time this cfs_rq had weight */
+               cfs_rq->load_last = now;
+               cfs_rq->load_avg += delta * load;
+       }
+
+       /* consider updating load contribution on each fold or truncate */
+       if (global_update || cfs_rq->load_period > period
+           || !cfs_rq->load_period)
+               update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+       while (cfs_rq->load_period > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (cfs_rq->load_period));
+               cfs_rq->load_period /= 2;
+               cfs_rq->load_avg /= 2;
+       }
+
+       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+               list_del_leaf_cfs_rq(cfs_rq);
+}
+
+/*
+ * Set the load weight of @se to @weight.
+ *
+ * @cfs_rq: runqueue whose accounting holds @se
+ * @se:     entity to reweight
+ * @weight: new load weight
+ *
+ * When @se is on a runqueue it is taken out of @cfs_rq's accounting before
+ * the weight changes and put back afterwards; presumably so the accounted
+ * totals are adjusted with matching old/new weights.
+ */
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                           unsigned long weight)
+{
+       if (se->on_rq) {
+               /* commit outstanding execution time */
+               if (cfs_rq->curr == se)
+                       update_curr(cfs_rq);
+               account_entity_dequeue(cfs_rq, se);
+       }
+
+       update_load_set(&se->load, weight);
+
+       /* re-account with the new weight if we removed it above */
+       if (se->on_rq)
+               account_entity_enqueue(cfs_rq, se);
+}
+
+/*
+ * Recompute the weight of the group entity that represents @cfs_rq's task
+ * group on this cpu.
+ *
+ * @cfs_rq:       group runqueue whose owning entity is reweighted; NULL is
+ *                ignored
+ * @weight_delta: added to cfs_rq->load.weight before the ratio is computed
+ *
+ * shares = tg->shares * load / load_weight, where load_weight is
+ * tg->load_weight with this cfs_rq's load_contribution replaced by load.
+ * The result is raised to at least MIN_SHARES and then capped at
+ * tg->shares.  Nothing is done when the group has no entity on this cpu.
+ */
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+       struct task_group *tg;
+       struct sched_entity *se;
+       long load_weight, load, shares;
+
+       if (!cfs_rq)
+               return;
+
+       tg = cfs_rq->tg;
+       se = tg->se[cpu_of(rq_of(cfs_rq))];
+       if (!se)
+               return;
+
+       load = cfs_rq->load.weight + weight_delta;
+
+       /* group-wide load, with our published contribution swapped for load */
+       load_weight = atomic_read(&tg->load_weight);
+       load_weight -= cfs_rq->load_contribution;
+       load_weight += load;
+
+       shares = (tg->shares * load);
+       /* skip the division when the group-wide load is zero */
+       if (load_weight)
+               shares /= load_weight;
+
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       if (shares > tg->shares)
+               shares = tg->shares;
+
+       reweight_entity(cfs_rq_of(se), se, shares);
+}
+
+/*
+ * Tick-time hook: once more than sysctl_sched_shares_window of execution
+ * time has accumulated in load_unacc_exec_time, refresh this cfs_rq's
+ * load average and shares.
+ */
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load_unacc_exec_time <= sysctl_sched_shares_window)
+               return;
+
+       update_cfs_load(cfs_rq, 0);
+       update_cfs_shares(cfs_rq, 0);
+}
+#else /* !(CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED) */
+/*
+ * No-op stub used when the CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED
+ * implementation above is compiled out.
+ */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+/*
+ * No-op stub used when the CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED
+ * implementation above is compiled out.
+ */
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+/*
+ * No-op stub used when the CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED
+ * implementation above is compiled out.
+ */
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED */
+
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
+       update_cfs_load(cfs_rq, 0);
+       update_cfs_shares(cfs_rq, se->load.weight);
         account_entity_enqueue(cfs_rq, se);
  
         if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         check_spread(cfs_rq, se);
         if (se != cfs_rq->curr)
                 __enqueue_entity(cfs_rq, se);
+       se->on_rq = 1;
+
+       if (cfs_rq->nr_running == 1)
+               list_add_leaf_cfs_rq(cfs_rq);
  }
  
  static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
         if (se != cfs_rq->curr)
                 __dequeue_entity(cfs_rq, se);
+       se->on_rq = 0;
+       update_cfs_load(cfs_rq, 0);
         account_entity_dequeue(cfs_rq, se);
         update_min_vruntime(cfs_rq);
+       update_cfs_shares(cfs_rq, 0);
  
         /*
          * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
          */
         update_curr(cfs_rq);
  
+       /*
+        * Update share accounting for long-running entities.
+        */
+       update_entity_shares_tick(cfs_rq);
+
  #ifdef CONFIG_SCHED_HRTICK
         /*
          * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 flags = ENQUEUE_WAKEUP;
         }
  
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
         hrtick_update(rq);
  }
  
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 dequeue_entity(cfs_rq, se, flags);
+
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight)
                         break;
                 flags |= DEQUEUE_SLEEP;
         }
  
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
         hrtick_update(rq);
  }
  
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
   * Adding load to a group doesn't make a group heavier, but can cause movement
   * of group shares between cpus. Assuming the shares were perfectly aligned one
   * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
   */
-static long effective_load(struct task_group *tg, int cpu,
-               long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  {
         struct sched_entity *se = tg->se[cpu];
  
         if (!tg->parent)
                 return wl;
  
-       /*
-        * By not taking the decrease of shares on the other cpu into
-        * account our error leans towards reducing the affine wakeups.
-        */
-       if (!wl && sched_feat(ASYM_EFF_LOAD))
-               return wl;
-
         for_each_sched_entity(se) {
                 long S, rw, s, a, b;
-               long more_w;
-
-               /*
-                * Instead of using this increment, also add the difference
-                * between when the shares were last updated and now.
-                */
-               more_w = se->my_q->load.weight - se->my_q->rq_weight;
-               wl += more_w;
-               wg += more_w;
  
                 S = se->my_q->tg->shares;
-               s = se->my_q->shares;
-               rw = se->my_q->rq_weight;
+               s = se->load.weight;
+               rw = se->my_q->load.weight;
  
                 a = S*(rw + wl);
                 b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
                         sd = tmp;
         }
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       if (sched_feat(LB_SHARES_UPDATE)) {
-               /*
-                * Pick the largest domain to update shares over
-                */
-               tmp = sd;
-               if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-                       tmp = affine_sd;
-
-               if (tmp) {
-                       raw_spin_unlock(&rq->lock);
-                       update_shares(tmp);
-                       raw_spin_lock(&rq->lock);
-               }
-       }
-#endif
-
         if (affine_sd) {
                 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
                         return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ *
+ * @tg:  task group to update
+ * @cpu: cpu whose per-group cfs_rq is folded
+ *
+ * Nothing is done when @tg has no sched entity on @cpu.  Otherwise the
+ * update runs under rq->lock with interrupt state saved and restored.
+ * Always returns 0.
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       unsigned long flags;
+       struct rq *rq;
+
+       if (!tg->se[cpu])
+               return 0;
+
+       rq = cpu_rq(cpu);
+       cfs_rq = tg->cfs_rq[cpu];
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       /* refresh the rq clock first; update_cfs_load() reads rq->clock */
+       update_rq_clock(rq);
+       update_cfs_load(cfs_rq, 1);
+
+       /*
+        * We need to update shares after updating tg->load_weight in
+        * order to adjust the weight of groups with long running tasks.
+        */
+       update_cfs_shares(cfs_rq, 0);
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       return 0;
+}
+
+/*
+ * Refresh load and shares for every leaf cfs_rq on @cpu's runqueue.
+ * The list walk runs inside an RCU read-side section.
+ */
+static void update_shares(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct cfs_rq *leaf;
+
+       rcu_read_lock();
+       for_each_leaf_cfs_rq(rq, leaf) {
+               update_shares_cpu(leaf->tg, cpu);
+       }
+       rcu_read_unlock();
+}
+
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return max_load_move - rem_load_move;
  }
  #else
+/* No-op stub for builds without CONFIG_FAIR_GROUP_SCHED. */
+static inline void update_shares(int cpu)
+{
+}
+
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         schedstat_inc(sd, lb_count[idle]);
  
  redo:
-       update_shares(sd);
         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                    cpus, balance);
  
@@ -3174,8 +3381,6 @@ out_one_pinned:
         else
                 ld_moved = 0;
  out:
-       if (ld_moved)
-               update_shares(sd);
         return ld_moved;
  }
  
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
          */
         raw_spin_unlock(&this_rq->lock);
  
+       update_shares(this_cpu);
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
                 int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
         int update_next_balance = 0;
         int need_serialize;
  
+       update_shares(cpu);
+
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
                         continue;