/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See update_sched_domains: synchronize_kernel for details.
+ * See detach_destroy_domains: synchronize_sched for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
rq->nr_running++;
}
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
unsigned long long __sleep_time = now - p->timestamp;
}
}
- p->prio = effective_prio(p);
+ return effective_prio(p);
}
/*
}
#endif
- recalc_task_prio(p, now);
+ p->prio = recalc_task_prio(p, now);
/*
* This checks to make sure it's not an uninterruptible task
return idlest;
}
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+ struct task_struct *t = current;
+ struct sched_domain *tmp, *sd = NULL;
-#endif
+ for_each_domain(cpu, tmp)
+ if (tmp->flags & flag)
+ sd = tmp;
+
+ while (sd) {
+ cpumask_t span;
+ struct sched_group *group;
+ int new_cpu;
+ int weight;
+
+ span = sd->span;
+ group = find_idlest_group(sd, t, cpu);
+ if (!group)
+ goto nextlevel;
+
+ new_cpu = find_idlest_cpu(group, cpu);
+ if (new_cpu == -1 || new_cpu == cpu)
+ goto nextlevel;
+
+ /* Now try balancing at a lower domain level */
+ cpu = new_cpu;
+nextlevel:
+ sd = NULL;
+ weight = cpus_weight(span);
+ for_each_domain(cpu, tmp) {
+ if (weight <= cpus_weight(tmp->span))
+ break;
+ if (tmp->flags & flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return cpu;
+}
+
+#endif /* CONFIG_SMP */
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*/
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
{
+ int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+ cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+ set_task_cpu(p, cpu);
+
/*
* We mark the process as running here, but have not actually
* inserted it onto the runqueue yet. This guarantees that
* runqueue lock is not a problem.
*/
current->time_slice = 1;
- preempt_disable();
scheduler_tick();
- local_irq_enable();
- preempt_enable();
- } else
- local_irq_enable();
+ }
+ local_irq_enable();
+ put_cpu();
}
/*
unsigned long flags;
int this_cpu, cpu;
runqueue_t *rq, *this_rq;
-#ifdef CONFIG_SMP
- struct sched_domain *tmp, *sd = NULL;
-#endif
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
this_cpu = smp_processor_id();
cpu = task_cpu(p);
-#ifdef CONFIG_SMP
- for_each_domain(cpu, tmp)
- if (tmp->flags & SD_BALANCE_FORK)
- sd = tmp;
-
- if (sd) {
- cpumask_t span;
- int new_cpu;
- struct sched_group *group;
-
-again:
- schedstat_inc(sd, sbf_cnt);
- span = sd->span;
- cpu = task_cpu(p);
- group = find_idlest_group(sd, p, cpu);
- if (!group) {
- schedstat_inc(sd, sbf_balanced);
- goto nextlevel;
- }
-
- new_cpu = find_idlest_cpu(group, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- schedstat_inc(sd, sbf_balanced);
- goto nextlevel;
- }
-
- if (cpu_isset(new_cpu, p->cpus_allowed)) {
- schedstat_inc(sd, sbf_pushed);
- set_task_cpu(p, new_cpu);
- task_rq_unlock(rq, &flags);
- rq = task_rq_lock(p, &flags);
- cpu = task_cpu(p);
- }
-
- /* Now try balancing at a lower domain level */
-nextlevel:
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (cpus_subset(span, tmp->span))
- break;
- if (tmp->flags & SD_BALANCE_FORK)
- sd = tmp;
- }
-
- if (sd)
- goto again;
- }
-
-#endif
/*
* We decrease the sleep average of forking parents
* and children as well, to keep max-interactive tasks
}
/*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
*/
void sched_exec(void)
{
- struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
-
- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_BALANCE_EXEC)
- sd = tmp;
-
- if (sd) {
- cpumask_t span;
- struct sched_group *group;
-again:
- schedstat_inc(sd, sbe_cnt);
- span = sd->span;
- group = find_idlest_group(sd, current, this_cpu);
- if (!group) {
- schedstat_inc(sd, sbe_balanced);
- goto nextlevel;
- }
- new_cpu = find_idlest_cpu(group, this_cpu);
- if (new_cpu == -1 || new_cpu == this_cpu) {
- schedstat_inc(sd, sbe_balanced);
- goto nextlevel;
- }
-
- schedstat_inc(sd, sbe_pushed);
- put_cpu();
- sched_migrate_task(current, new_cpu);
-
- /* Now try balancing at a lower domain level */
- this_cpu = get_cpu();
-nextlevel:
- sd = NULL;
- for_each_domain(this_cpu, tmp) {
- if (cpus_subset(span, tmp->span))
- break;
- if (tmp->flags & SD_BALANCE_EXEC)
- sd = tmp;
- }
-
- if (sd)
- goto again;
- }
-
+ new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
put_cpu();
+ if (new_cpu != this_cpu)
+ sched_migrate_task(current, new_cpu);
}
/*
return busiest;
}
+/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL 512
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
struct sched_group *group;
runqueue_t *busiest;
unsigned long imbalance;
- int nr_moved, all_pinned;
+ int nr_moved, all_pinned = 0;
int active_balance = 0;
spin_lock(&this_rq->lock);
sd->nr_balance_failed = 0;
/* tune up the balancing interval */
- if (sd->balance_interval < sd->max_interval)
+ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ (sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
return 0;
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu, idx, new_prio;
/*
* Test if we are atomic. Since do_exit() needs to call into
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
- dequeue_task(next, array);
- recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
+ new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+ if (unlikely(next->prio != new_prio)) {
+ dequeue_task(next, array);
+ next->prio = new_prio;
+ enqueue_task(next, array);
+ } else
+ requeue_task(next, array);
}
next->activated = 0;
switch_tasks:
{
return TASK_NICE(p);
}
-
-/*
- * The only users of task_nice are binfmt_elf and binfmt_elf32.
- * binfmt_elf is no longer modular, but binfmt_elf32 still is.
- * Therefore, task_nice is needed if there is a compat_mode.
- */
-#ifdef CONFIG_COMPAT
EXPORT_SYMBOL_GPL(task_nice);
-#endif
/**
* idle_cpu - is a given cpu idle currently?
if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
return -EINVAL;
- if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
- param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
- !capable(CAP_SYS_NICE))
- return -EPERM;
- if ((current->euid != p->euid) && (current->euid != p->uid) &&
- !capable(CAP_SYS_NICE))
- return -EPERM;
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (!capable(CAP_SYS_NICE)) {
+ /* can't change policy */
+ if (policy != p->policy)
+ return -EPERM;
+ /* can't increase priority */
+ if (policy != SCHED_NORMAL &&
+ param->sched_priority > p->rt_priority &&
+ param->sched_priority >
+ p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+ return -EPERM;
+ /* can't change other user's priorities */
+ if ((current->euid != p->euid) &&
+ (current->euid != p->uid))
+ return -EPERM;
+ }
retval = security_task_setscheduler(p, policy, param);
if (retval)
struct list_head *head;
migration_req_t *req;
- if (current->flags & PF_FREEZE)
- refrigerator(PF_FREEZE);
+ try_to_freeze();
spin_lock_irq(&rq->lock);
#endif
#ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
#define sched_domain_debug(sd, cpu) {}
#endif
-static int __devinit sd_degenerate(struct sched_domain *sd)
+static int sd_degenerate(struct sched_domain *sd)
{
if (cpus_weight(sd->span) == 1)
return 1;
return 1;
}
-static int __devinit sd_parent_degenerate(struct sched_domain *sd,
+static int sd_parent_degenerate(struct sched_domain *sd,
struct sched_domain *parent)
{
unsigned long cflags = sd->flags, pflags = parent->flags;
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
runqueue_t *rq = cpu_rq(cpu);
struct sched_domain *tmp;
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
cpumask_t span, int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
#else
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
{
return cpu;
}
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
return first_cpu(cpu_sibling_map[cpu]);
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
{
return cpu_to_node(cpu);
}
#endif
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
*/
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
- cpumask_t cpu_default_map;
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- check_sibling_maps();
-#endif
/*
- * Setup mask for cpus without special case scheduling requirements.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
- cpus_complement(cpu_default_map, cpu_isolated_map);
- cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
-
- /*
- * Set up domains. Isolated domains just stay on the NULL domain.
+ * Set up domains for cpus specified by the cpu_map.
*/
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
sd = &per_cpu(node_domains, i);
group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- sd->span = cpu_default_map;
+ sd->span = *cpu_map;
sd->groups = &sched_group_nodes[group];
#endif
group = cpu_to_cpu_group(i);
*sd = SD_SIBLING_INIT;
sd->span = cpu_sibling_map[i];
- cpus_and(sd->span, sd->span, cpu_default_map);
+ cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
sd->groups = &sched_group_cpus[group];
#endif
/* Set up CPU (sibling) groups */
for_each_online_cpu(i) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
- cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+ cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
continue;
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
if (cpus_empty(nodemask))
continue;
#ifdef CONFIG_NUMA
/* Set up node groups */
- init_sched_build_groups(sched_group_nodes, cpu_default_map,
+ init_sched_build_groups(sched_group_nodes, *cpu_map,
&cpu_to_node_group);
#endif
/* Calculate CPU power for physical packages and nodes */
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int power;
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
}
/* Attach the domains */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
cpu_attach_domain(sd, i);
}
}
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(cpumask_t *cpu_map)
+{
+ cpumask_t cpu_default_map;
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+ check_sibling_maps();
+#endif
+ /*
+ * Setup mask for cpus without special case scheduling requirements.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+ build_sched_domains(&cpu_default_map);
+}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
/* Do nothing: everything is statically allocated. */
}
-#endif
#endif /* ARCH_HAS_SCHED_DOMAIN */
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+ int i;
+
+ for_each_cpu_mask(i, *cpu_map)
+ cpu_attach_domain(NULL, i);
+ synchronize_sched();
+ arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for a RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+ cpumask_t change_map;
+
+ cpus_and(*partition1, *partition1, cpu_online_map);
+ cpus_and(*partition2, *partition2, cpu_online_map);
+ cpus_or(change_map, *partition1, *partition2);
+
+ /* Detach sched domains from all of the affected cpus */
+ detach_destroy_domains(&change_map);
+ if (!cpus_empty(*partition1))
+ build_sched_domains(partition1);
+ if (!cpus_empty(*partition2))
+ build_sched_domains(partition2);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* Force a reinitialization of the sched domains hierarchy. The domains
static int update_sched_domains(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- int i;
-
switch (action) {
case CPU_UP_PREPARE:
case CPU_DOWN_PREPARE:
- for_each_online_cpu(i)
- cpu_attach_domain(NULL, i);
- synchronize_kernel();
- arch_destroy_sched_domains();
+ detach_destroy_domains(&cpu_online_map);
return NOTIFY_OK;
case CPU_UP_CANCELED:
}
/* The hotplug lock is already held by cpu_up/cpu_down */
- arch_init_sched_domains();
+ arch_init_sched_domains(&cpu_online_map);
return NOTIFY_OK;
}
void __init sched_init_smp(void)
{
lock_cpu_hotplug();
- arch_init_sched_domains();
+ arch_init_sched_domains(&cpu_online_map);
unlock_cpu_hotplug();
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);