}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+ __acquires(rq->lock)
+{
+ struct runqueue *rq;
+
+repeat_lock_task:
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (unlikely(rq != task_rq(p))) {
+ spin_unlock(&rq->lock);
+ goto repeat_lock_task;
+ }
+ return rq;
+}
+
/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
*/
-static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
__acquires(rq->lock)
{
struct runqueue *rq;
return rq;
}
+static inline void __task_rq_unlock(runqueue_t *rq)
+ __releases(rq->lock)
+{
+ spin_unlock(&rq->lock);
+}
+
static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
__releases(rq->lock)
{
}
/*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
*
* We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
*
* Both properties are important to certain workloads.
*/
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
{
int bonus, prio;
- if (rt_task(p))
- return p->prio;
-
bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
prio = p->static_prio - bonus;
static void set_load_weight(task_t *p)
{
- if (rt_task(p)) {
+ if (has_rt_policy(p)) {
#ifdef CONFIG_SMP
if (p == task_rq(p)->migration_thread)
/*
dec_raw_weighted_load(rq, p);
}
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+ int prio;
+
+ if (has_rt_policy(p))
+ prio = MAX_RT_PRIO-1 - p->rt_priority;
+ else
+ prio = __normal_prio(p);
+ return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
/*
* __activate_task - move a task to the runqueue.
*/
inc_nr_running(p, rq);
}
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
static int recalc_task_prio(task_t *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
struct sched_domain *tmp, *sd = NULL;
for_each_domain(cpu, tmp) {
+ /*
+ * If power savings logic is enabled for a domain, stop there.
+ */
+ if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+ break;
if (tmp->flags & flag)
sd = tmp;
}
* event cannot wake it up and insert it on the runqueue either.
*/
p->state = TASK_RUNNING;
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child:
+ */
+ p->prio = current->normal_prio;
+
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
#ifdef CONFIG_SCHEDSTATS
__activate_task(p, rq);
else {
p->prio = current->prio;
+ p->normal_prio = current->normal_prio;
list_add_tail(&p->run_list, ¤t->run_list);
p->array = current->array;
p->array->nr_active++;
unsigned long busiest_load_per_task, busiest_nr_running;
unsigned long this_load_per_task, this_nr_running;
int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ int power_savings_balance = 1;
+ unsigned long leader_nr_running = 0, min_load_per_task = 0;
+ unsigned long min_nr_running = ULONG_MAX;
+ struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
max_load = this_load = total_load = total_pwr = 0;
busiest_load_per_task = busiest_nr_running = 0;
load_idx = sd->idle_idx;
do {
- unsigned long load;
+ unsigned long load, group_capacity;
int local_group;
int i;
unsigned long sum_nr_running, sum_weighted_load;
/* Adjust by relative CPU power of the group */
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
if (local_group) {
this_load = avg_load;
this = group;
this_nr_running = sum_nr_running;
this_load_per_task = sum_weighted_load;
} else if (avg_load > max_load &&
- sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
+ sum_nr_running > group_capacity) {
max_load = avg_load;
busiest = group;
busiest_nr_running = sum_nr_running;
busiest_load_per_task = sum_weighted_load;
}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /*
+ * Busy processors will not participate in power savings
+ * balance.
+ */
+ if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto group_next;
+
+ /*
+ * If the local group is idle or completely loaded
+ * no need to do power savings balance at this domain
+ */
+ if (local_group && (this_nr_running >= group_capacity ||
+ !this_nr_running))
+ power_savings_balance = 0;
+
+ /*
+ * If a group is already running at full capacity or idle,
+ * don't include that group in power savings calculations
+ */
+ if (!power_savings_balance || sum_nr_running >= group_capacity
+ || !sum_nr_running)
+ goto group_next;
+
+ /*
+ * Calculate the group which has the least non-idle load.
+ * This is the group from where we need to pick up the load
+ * for saving power
+ */
+ if ((sum_nr_running < min_nr_running) ||
+ (sum_nr_running == min_nr_running &&
+ first_cpu(group->cpumask) <
+ first_cpu(group_min->cpumask))) {
+ group_min = group;
+ min_nr_running = sum_nr_running;
+ min_load_per_task = sum_weighted_load /
+ sum_nr_running;
+ }
+
+ /*
+ * Calculate the group which is almost near its
+ * capacity but still has some space to pick up some load
+ * from other group and save more power
+ */
+ if (sum_nr_running <= group_capacity - 1)
+ if (sum_nr_running > leader_nr_running ||
+ (sum_nr_running == leader_nr_running &&
+ first_cpu(group->cpumask) >
+ first_cpu(group_leader->cpumask))) {
+ group_leader = group;
+ leader_nr_running = sum_nr_running;
+ }
+
+group_next:
+#endif
group = group->next;
} while (group != sd->groups);
return busiest;
out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto ret;
+ if (this == group_leader && group_leader != group_min) {
+ *imbalance = min_load_per_task;
+ return group_min;
+ }
+ret:
+#endif
*imbalance = 0;
return NULL;
}
int active_balance = 0;
int sd_idle = 0;
- if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
sd_idle = 1;
schedstat_inc(sd, lb_cnt[idle]);
sd->balance_interval *= 2;
}
- if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
return -1;
return nr_moved;
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
return -1;
return 0;
}
int nr_moved = 0;
int sd_idle = 0;
- if (sd->flags & SD_SHARE_CPUPOWER)
+ if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
return -1;
sd->nr_balance_failed = 0;
return 0;
EXPORT_SYMBOL(sleep_on_timeout);
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+ unsigned long flags;
+ prio_array_t *array;
+ runqueue_t *rq;
+ int oldprio;
+
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+ rq = task_rq_lock(p, &flags);
+
+ oldprio = p->prio;
+ array = p->array;
+ if (array)
+ dequeue_task(p, array);
+ p->prio = prio;
+
+ if (array) {
+ /*
+ * If changing to an RT priority then queue it
+ * in the active array!
+ */
+ if (rt_task(p))
+ array = rq->active;
+ enqueue_task(p, array);
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (task_running(rq, p)) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else if (TASK_PREEMPTS_CURR(p, rq))
+ resched_task(rq->curr);
+ }
+ task_rq_unlock(rq, &flags);
+}
+
+#endif
+
void set_user_nice(task_t *p, long nice)
{
unsigned long flags;
prio_array_t *array;
runqueue_t *rq;
- int old_prio, new_prio, delta;
+ int old_prio, delta;
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
* it wont have any effect on scheduling until the task is
* not SCHED_NORMAL/SCHED_BATCH:
*/
- if (rt_task(p)) {
+ if (has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
dec_raw_weighted_load(rq, p);
}
- old_prio = p->prio;
- new_prio = NICE_TO_PRIO(nice);
- delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
- p->prio += delta;
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ delta = p->prio - old_prio;
if (array) {
enqueue_task(p, array);
out_unlock:
task_rq_unlock(rq, &flags);
}
-
EXPORT_SYMBOL(set_user_nice);
/*
BUG_ON(p->array);
p->policy = policy;
p->rt_priority = prio;
- if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
- p->prio = MAX_RT_PRIO-1 - p->rt_priority;
- } else {
- p->prio = p->static_prio;
- /*
- * SCHED_BATCH tasks are treated as perpetual CPU hogs:
- */
- if (policy == SCHED_BATCH)
- p->sleep_avg = 0;
- }
+ p->normal_prio = normal_prio(p);
+ /* we are holding p->pi_lock already */
+ p->prio = rt_mutex_getprio(p);
+ /*
+ * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+ */
+ if (policy == SCHED_BATCH)
+ p->sleep_avg = 0;
set_load_weight(p);
}
unsigned long flags;
runqueue_t *rq;
+ /* may grab non-irq protected spin_locks */
+ BUG_ON(in_interrupt());
recheck:
/* double check policy once rq lock held */
if (policy < 0)
retval = security_task_setscheduler(p, policy, param);
if (retval)
return retval;
+ /*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ */
+ spin_lock_irqsave(&p->pi_lock, flags);
/*
* To be able to change p->policy safely, the apropriate
* runqueue lock must be held.
*/
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
array = p->array;
} else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ rt_mutex_adjust_pi(p);
+
return 0;
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
read_unlock_irq(&tasklist_lock);
return -ESRCH;
}
- retval = sched_setscheduler(p, policy, &lparam);
+ get_task_struct(p);
read_unlock_irq(&tasklist_lock);
+ retval = sched_setscheduler(p, policy, &lparam);
+ put_task_struct(p);
return retval;
}
return 0;
}
-static inline void __cond_resched(void)
+static inline int __resched_legal(void)
+{
+ if (unlikely(preempt_count()))
+ return 0;
+ if (unlikely(system_state != SYSTEM_RUNNING))
+ return 0;
+ return 1;
+}
+
+static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
__might_sleep(__FILE__, __LINE__);
* PREEMPT_ACTIVE, which could trigger a second
* cond_resched() call.
*/
- if (unlikely(preempt_count()))
- return;
- if (unlikely(system_state != SYSTEM_RUNNING))
- return;
do {
add_preempt_count(PREEMPT_ACTIVE);
schedule();
int __sched cond_resched(void)
{
- if (need_resched()) {
+ if (need_resched() && __resched_legal()) {
__cond_resched();
return 1;
}
return 0;
}
-
EXPORT_SYMBOL(cond_resched);
/*
ret = 1;
spin_lock(lock);
}
- if (need_resched()) {
+ if (need_resched() && __resched_legal()) {
_raw_spin_unlock(lock);
preempt_enable_no_resched();
__cond_resched();
}
return ret;
}
-
EXPORT_SYMBOL(cond_resched_lock);
int __sched cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (need_resched()) {
+ if (need_resched() && __resched_legal()) {
__local_bh_enable();
__cond_resched();
local_bh_disable();
}
return 0;
}
-
EXPORT_SYMBOL(cond_resched_softirq);
-
/**
* yield - yield the current processor to other threads.
*
idle->timestamp = sched_clock();
idle->sleep_avg = 0;
idle->array = NULL;
- idle->prio = MAX_PRIO;
+ idle->prio = idle->normal_prio = MAX_PRIO;
idle->state = TASK_RUNNING;
idle->cpus_allowed = cpumask_of_cpu(cpu);
set_task_cpu(idle, cpu);
}
#endif
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
/*
* At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
* can switch it on easily if needed.
#endif
/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
for_each_cpu_mask(i, *cpu_map) {
- int power;
struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
- power = SCHED_LOAD_SCALE;
- sd->groups->cpu_power = power;
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ }
#endif
#ifdef CONFIG_SCHED_MC
+ for_each_cpu_mask(i, *cpu_map) {
+ int power;
+ struct sched_domain *sd;
sd = &per_cpu(core_domains, i);
- power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+ if (sched_smt_power_savings)
+ power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+ else
+ power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
* SCHED_LOAD_SCALE / 10;
sd->groups->cpu_power = power;
+ }
+#endif
+ for_each_cpu_mask(i, *cpu_map) {
+ struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
sd = &per_cpu(phys_domains, i);
+ if (i != first_cpu(sd->groups->cpumask))
+ continue;
- /*
- * This has to be < 2 * SCHED_LOAD_SCALE
- * Lets keep it SCHED_LOAD_SCALE, so that
- * while calculating NUMA group's cpu_power
- * we can simply do
- * numa_group->cpu_power += phys_group->cpu_power;
- *
- * See "only add power once for each physical pkg"
- * comment below
- */
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sd->groups->cpu_power = 0;
+ if (sched_mc_power_savings || sched_smt_power_savings) {
+ int j;
+
+ for_each_cpu_mask(j, sd->groups->cpumask) {
+ struct sched_domain *sd1;
+ sd1 = &per_cpu(core_domains, j);
+ /*
+ * for each core we will add once
+ * to the group in physical domain
+ */
+ if (j != first_cpu(sd1->groups->cpumask))
+ continue;
+
+ if (sched_smt_power_savings)
+ sd->groups->cpu_power += sd1->groups->cpu_power;
+ else
+ sd->groups->cpu_power += SCHED_LOAD_SCALE;
+ }
+ } else
+ /*
+ * This has to be < 2 * SCHED_LOAD_SCALE
+ * Lets keep it SCHED_LOAD_SCALE, so that
+ * while calculating NUMA group's cpu_power
+ * we can simply do
+ * numa_group->cpu_power += phys_group->cpu_power;
+ *
+ * See "only add power once for each physical pkg"
+ * comment below
+ */
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
#else
+ int power;
sd = &per_cpu(phys_domains, i);
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
+ if (sched_smt_power_savings)
+ power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+ else
+ power = SCHED_LOAD_SCALE;
sd->groups->cpu_power = power;
#endif
}
return err;
}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+ int err;
+
+ lock_cpu_hotplug();
+ detach_destroy_domains(&cpu_online_map);
+ err = arch_init_sched_domains(&cpu_online_map);
+ unlock_cpu_hotplug();
+
+ return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+ int ret;
+
+ if (buf[0] != '0' && buf[0] != '1')
+ return -EINVAL;
+
+ if (smt)
+ sched_smt_power_savings = (buf[0] == '1');
+ else
+ sched_mc_power_savings = (buf[0] == '1');
+
+ ret = arch_reinit_sched_domains();
+
+ return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+ int err = 0;
+#ifdef CONFIG_SCHED_SMT
+ if (smt_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+ if (!err && mc_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_mc_power_savings.attr);
+#endif
+ return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+ return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+ return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+ return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+ return sched_power_savings_store(buf, count, 1);
+}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+ sched_smt_power_savings_store);
+#endif
+
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* Force a reinitialization of the sched domains hierarchy. The domains
if (!rt_task(p))
continue;
- rq = task_rq_lock(p, &flags);
+ spin_lock_irqsave(&p->pi_lock, flags);
+ rq = __task_rq_lock(p);
array = p->array;
if (array)
resched_task(rq->curr);
}
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
}
read_unlock_irq(&tasklist_lock);
}