X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=9cd8ca76bbaca843180baae39d1d9031c946862e;hb=c812fe636b6e1e3b3b409b316157f56054252e25;hp=d6b149ccf925c320841e8a42f31fd23b6ee64dc6;hpb=42a3b63bb2ca4996a3d1210a004eae2333f1119e;p=pandora-kernel.git diff --git a/kernel/sched.c b/kernel/sched.c index d6b149ccf925..9cd8ca76bbac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1885,10 +1885,9 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif -static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); -static void update_cpu_load(struct rq *this_rq); +static void update_idle_cpu_load(struct rq *this_rq); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -3401,11 +3400,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -3422,6 +3483,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -3433,30 +3497,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} -static void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_read_idx(void) { + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void) { - long delta = 0; + struct rq *this_rq = this_rq(); /* - * Its got a race, we don't care... + * If we're still before the sample window, we're done. */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -3538,28 +3690,16 @@ calc_load_n(unsigned long load, unsigned long exp, * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. */ -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { long delta, active, n; - if (time_before(jiffies, calc_load_update)) - return; - - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * If we were idle for multiple load cycles, apply them. - */ - if (ticks >= LOAD_FREQ) { - n = ticks / LOAD_FREQ; + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -3572,45 +3712,21 @@ static void calc_global_nohz(unsigned long ticks) } /* - * Its possible the remainder of the above division also crosses - * a LOAD_FREQ period, the regular check in calc_global_load() - * which comes after this will take care of that. + * Flip the idle index... * - * Consider us being 11 ticks before a cycle completion, and us - * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will - * age us 4 cycles, and the test in calc_global_load() will - * pick up the final one. + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. */ + smp_wmb(); + calc_load_idx++; } -#else -static void calc_load_account_idle(struct rq *this_rq) -{ -} - -static inline long calc_load_fold_idle(void) -{ - return 0; -} +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(unsigned long ticks) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -3618,13 +3734,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; - - calc_global_nohz(ticks); + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -3633,6 +3754,11 @@ void calc_global_load(unsigned long ticks) avenrun[2] = calc_load(avenrun[2], EXP_15, active); calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); } /* @@ -3647,13 +3773,16 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); this_rq->calc_load_update += LOAD_FREQ; } +/* + * End of global load-average stuff + */ + /* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load @@ -3726,22 +3855,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -static void update_cpu_load(struct rq *this_rq) +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long this_load = this_rq->load.weight; - unsigned long curr_jiffies = jiffies; - unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; - /* Avoid repeated calls on same jiffy, when moving in and out of idle */ - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - /* Update our load: */ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -3766,9 +3886,78 @@ static void update_cpu_load(struct rq *this_rq) sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +static void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->load.weight; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ static void update_cpu_load_active(struct rq *this_rq) { - update_cpu_load(this_rq); + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, this_rq->load.weight, 1); calc_load_account_active(this_rq); } @@ -7430,11 +7619,8 @@ int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -7605,16 +7791,26 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - kfree(*per_cpu_ptr(sdd->sd, j)); - kfree(*per_cpu_ptr(sdd->sg, j)); - kfree(*per_cpu_ptr(sdd->sgp, j)); + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgp) + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); + sdd->sd = NULL; free_percpu(sdd->sg); + sdd->sg = NULL; free_percpu(sdd->sgp); + sdd->sgp = NULL; } } @@ -7627,7 +7823,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; @@ -7635,6 +7830,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, child->parent = sd; } sd->child = child; + set_domain_attribute(sd, attr); return sd; }