[PATCH] sched: fix SMT scheduler latency bug
diff --git a/kernel/sched.c b/kernel/sched.c
index a646e4f..c61ee34 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-void wait_task_inactive(task_t * p)
+void wait_task_inactive(task_t *p)
 {
        unsigned long flags;
        runqueue_t *rq;
@@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                int local_group;
                int i;
 
+               /* Skip over this group if it has no CPUs allowed */
+               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+                       goto nextgroup;
+
                local_group = cpu_isset(this_cpu, group->cpumask);
-               /* XXX: put a cpus allowed check */
 
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
@@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                        min_load = avg_load;
                        idlest = group;
                }
+nextgroup:
                group = group->next;
        } while (group != sd->groups);
 
@@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 /*
  * find_idlest_queue - find the idlest runqueue among the cpus in group.
  */
-static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
+       cpumask_t tmp;
        unsigned long load, min_load = ULONG_MAX;
        int idlest = -1;
        int i;
 
-       for_each_cpu_mask(i, group->cpumask) {
+       /* Traverse only the allowed CPUs */
+       cpus_and(tmp, group->cpumask, p->cpus_allowed);
+
+       for_each_cpu_mask(i, tmp) {
                load = source_load(i, 0);
 
                if (load < min_load || (load == min_load && i == this_cpu)) {
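Aside (not part of the patch): a minimal userspace sketch of the filtering added above, where the least-loaded CPU is chosen only from CPUs present in both the group mask and the task's cpus_allowed. find_idlest_cpu_demo(), the plain unsigned long masks and the load[] values are invented for the example, and the this_cpu tie-break of the real code is left out.

/* Minimal userspace sketch (invented names, not kernel code): pick the
 * least-loaded CPU, considering only CPUs present in both the group mask
 * and the task's allowed mask, mirroring the cpus_and() filter above. */
#include <limits.h>
#include <stdio.h>

static int find_idlest_cpu_demo(unsigned long group_mask,
                                unsigned long allowed_mask,
                                const unsigned long load[], int nr_cpus)
{
        unsigned long candidates = group_mask & allowed_mask;
        unsigned long min_load = ULONG_MAX;
        int idlest = -1, i;

        for (i = 0; i < nr_cpus; i++) {
                if (!(candidates & (1UL << i)))
                        continue;
                if (load[i] < min_load) {
                        min_load = load[i];
                        idlest = i;
                }
        }
        return idlest;
}

int main(void)
{
        unsigned long load[4] = { 10, 2, 5, 1 };

        /* CPU 3 is idlest overall, but the task may only run on CPUs 0-2 */
        printf("%d\n", find_idlest_cpu_demo(0xfUL, 0x7UL, load, 4));
        return 0;
}
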
@@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag)
                if (!group)
                        goto nextlevel;
 
-               new_cpu = find_idlest_cpu(group, cpu);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                if (new_cpu == -1 || new_cpu == cpu)
                        goto nextlevel;
 
@@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p)
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 {
        int cpu, this_cpu, success = 0;
        unsigned long flags;
@@ -1251,6 +1260,16 @@ out_activate:
                p->activated = -1;
        }
 
+       /*
+        * Tasks that have marked their sleep as noninteractive get
+        * woken up without updating their sleep average. (i.e. their
+        * sleep is handled in a priority-neutral manner, no priority
+        * boost and no penalty.)
+        */
+       if (old_state & TASK_NONINTERACTIVE)
+               __activate_task(p, rq);
+       else
+               activate_task(p, rq, cpu == this_cpu);
        /*
         * Sync wakeups (i.e. those types of wakeups where the waker
         * has indicated that it will leave the CPU in short order)
@@ -1259,7 +1278,6 @@ out_activate:
         * the waker guarantees that the freshly woken up task is going
         * to be considered on this CPU.)
         */
-       activate_task(p, rq, cpu == this_cpu);
        if (!sync || cpu != this_cpu) {
                if (TASK_PREEMPTS_CURR(p, rq))
                        resched_task(rq->curr);
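Aside (not part of the patch): a toy model of the priority-neutral sleep described in the comment above. This is not the scheduler's actual interactivity estimator; toy_wakeup() and its bonus arithmetic are invented only to show that a TASK_NONINTERACTIVE sleep earns neither a boost nor a penalty.

/* Toy model (not the scheduler's real interactivity code): an ordinary
 * sleep earns a wakeup bonus proportional to the time slept, while a
 * TASK_NONINTERACTIVE-style sleep is credited neutrally. */
#include <stdio.h>

struct toy_task {
        int bonus;              /* higher means "looks more interactive" */
};

static void toy_wakeup(struct toy_task *t, int slept_ticks, int noninteractive)
{
        if (!noninteractive)
                t->bonus += slept_ticks;  /* boost for interactive sleep */
        /* noninteractive sleep: no boost, no penalty */
}

int main(void)
{
        struct toy_task a = { 0 }, b = { 0 };

        toy_wakeup(&a, 5, 0);
        toy_wakeup(&b, 5, 1);
        printf("%d %d\n", a.bonus, b.bonus);    /* prints "5 0" */
        return 0;
}
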
@@ -1274,7 +1292,7 @@ out:
        return success;
 }
 
-int fastcall wake_up_process(task_t * p)
+int fastcall wake_up_process(task_t *p)
 {
        return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
                                 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
  * that must be done for every newly created context, then puts the task
  * on the runqueue and wakes it.
  */
-void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
+void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 {
        unsigned long flags;
        int this_cpu, cpu;
@@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
  * artificially, because any timeslice recovered here
  * was given away by the parent in the first place.)
  */
-void fastcall sched_exit(task_t * p)
+void fastcall sched_exit(task_t *p)
 {
        unsigned long flags;
        runqueue_t *rq;
@@ -1478,6 +1496,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
 
 /**
  * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
@@ -1510,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
         *              Manfred Spraul <manfred@colorfullife.com>
         */
        prev_task_flags = prev->flags;
+#ifdef CONFIG_DEBUG_SPINLOCK
+       /* this is a valid case when another task releases the spinlock */
+       rq->lock.owner = current;
+#endif
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
        if (mm)
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-            struct sched_domain *sd, enum idle_type idle, int *all_pinned)
+                    struct sched_domain *sd, enum idle_type idle,
+                    int *all_pinned)
 {
        /*
         * We do not migrate tasks that are:
@@ -2575,6 +2599,13 @@ out:
 }
 
 #ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(runqueue_t *rq)
+{
+       /* If an SMT runqueue is sleeping due to priority reasons, wake it up */
+       if (rq->curr == rq->idle && rq->nr_running)
+               resched_task(rq->idle);
+}
+
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
        struct sched_domain *tmp, *sd = NULL;
@@ -2608,12 +2639,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
        for_each_cpu_mask(i, sibling_map) {
                runqueue_t *smt_rq = cpu_rq(i);
 
-               /*
-                * If an SMT sibling task is sleeping due to priority
-                * reasons wake it up now.
-                */
-               if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
-                       resched_task(smt_rq->idle);
+               wakeup_busy_runqueue(smt_rq);
        }
 
        for_each_cpu_mask(i, sibling_map)
@@ -2624,6 +2650,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
         */
 }
 
+/*
+ * number of 'lost' timeslices this task won't be able to fully
+ * utilize if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+       return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
        struct sched_domain *tmp, *sd = NULL;
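Aside (not part of the patch): a worked example of the smt_slice() model added above. The 100 ms timeslice and the per_cpu_gain of 25 are illustrative values, not taken from this patch.

/* Worked example of the smt_slice() model above; the 100 ms timeslice and
 * per_cpu_gain of 25 are illustrative values. */
#include <stdio.h>

static unsigned long smt_slice_demo(unsigned long time_slice,
                                    unsigned int per_cpu_gain)
{
        return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
        /* with per_cpu_gain = 25, 75 of 100 ms are "lost" to the sibling */
        printf("%lu\n", smt_slice_demo(100, 25));
        return 0;
}
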
@@ -2667,6 +2703,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
                runqueue_t *smt_rq = cpu_rq(i);
                task_t *smt_curr = smt_rq->curr;
 
+               /* Kernel threads do not participate in dependent sleeping */
+               if (!p->mm || !smt_curr->mm || rt_task(p))
+                       goto check_smt_task;
+
                /*
                 * If a user task with lower static priority than the
                 * running task on the SMT sibling is trying to schedule,
@@ -2675,21 +2715,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
                 * task from using an unfair proportion of the
                 * physical cpu's resources. -ck
                 */
-               if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
-                       task_timeslice(p) || rt_task(smt_curr)) &&
-                       p->mm && smt_curr->mm && !rt_task(p))
-                               ret = 1;
+               if (rt_task(smt_curr)) {
+                       /*
+                        * With real time tasks we run non-rt tasks only
+                        * per_cpu_gain% of the time.
+                        */
+                       if ((jiffies % DEF_TIMESLICE) >
+                               (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+                                       ret = 1;
+               } else
+                       if (smt_curr->static_prio < p->static_prio &&
+                               !TASK_PREEMPTS_CURR(p, smt_rq) &&
+                               smt_slice(smt_curr, sd) > task_timeslice(p))
+                                       ret = 1;
+
+check_smt_task:
+               if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
+                       rt_task(smt_curr))
+                               continue;
+               if (!p->mm) {
+                       wakeup_busy_runqueue(smt_rq);
+                       continue;
+               }
 
                /*
-                * Reschedule a lower priority task on the SMT sibling,
-                * or wake it up if it has been put to sleep for priority
-                * reasons.
+                * Reschedule a lower priority task on the SMT sibling for
+                * it to be put to sleep, or wake it up if it has been put to
+                * sleep for priority reasons to see if it should run now.
                 */
-               if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
-                       task_timeslice(smt_curr) || rt_task(p)) &&
-                       smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
-                       (smt_curr == smt_rq->idle && smt_rq->nr_running))
-                               resched_task(smt_curr);
+               if (rt_task(p)) {
+                       if ((jiffies % DEF_TIMESLICE) >
+                               (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+                                       resched_task(smt_curr);
+               } else {
+                       if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+                               smt_slice(p, sd) > task_timeslice(smt_curr))
+                                       resched_task(smt_curr);
+                       else
+                               wakeup_busy_runqueue(smt_rq);
+               }
        }
 out_unlock:
        for_each_cpu_mask(i, sibling_map)
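Aside (not part of the patch): the rt_task() branches above gate a non-RT task with a simple duty cycle, letting it run only during the first per_cpu_gain percent of each DEF_TIMESLICE window. The sketch below reproduces that check in userspace; DEF_TIMESLICE_DEMO and the value 25 are assumptions for the example.

/* Sketch of the duty-cycle gate used above; DEF_TIMESLICE_DEMO and the
 * per_cpu_gain value are assumptions for illustration. A non-RT task is
 * allowed to run only while (jiffies % window) is within the first
 * per_cpu_gain percent of the window. */
#include <stdio.h>

#define DEF_TIMESLICE_DEMO 100          /* window length in ticks (assumed) */

static int throttled(unsigned long jiffies, unsigned int per_cpu_gain)
{
        return (jiffies % DEF_TIMESLICE_DEMO) >
               (per_cpu_gain * DEF_TIMESLICE_DEMO / 100);
}

int main(void)
{
        unsigned long j;
        unsigned int allowed = 0;

        for (j = 0; j < DEF_TIMESLICE_DEMO; j++)
                if (!throttled(j, 25))
                        allowed++;

        /* roughly per_cpu_gain% of the window is allowed: 26 of 100 ticks */
        printf("%u\n", allowed);
        return 0;
}
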
@@ -2887,6 +2951,7 @@ switch_tasks:
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
        prefetch(next);
+       prefetch_stack(next);
        clear_tsk_need_resched(prev);
        rcu_qsctr_inc(task_cpu(prev));
 
@@ -3014,7 +3079,8 @@ need_resched:
 
 #endif /* CONFIG_PREEMPT */
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+                         void *key)
 {
        task_t *p = curr->private;
        return try_to_wake_up(p, mode, sync);
@@ -3056,7 +3122,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @key: is directly passed to the wakeup function
  */
 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
-                               int nr_exclusive, void *key)
+                       int nr_exclusive, void *key)
 {
        unsigned long flags;
 
@@ -3088,7 +3154,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
  *
  * On UP it can prevent extra preemption.
  */
-void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 {
        unsigned long flags;
        int sync = 1;
@@ -3279,7 +3346,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
 
 EXPORT_SYMBOL(interruptible_sleep_on);
 
-long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
        SLEEP_ON_VAR
 
@@ -3378,8 +3446,8 @@ EXPORT_SYMBOL(set_user_nice);
  */
 int can_nice(const task_t *p, const int nice)
 {
-       /* convert nice value [19,-20] to rlimit style value [0,39] */
-       int nice_rlim = 19 - nice;
+       /* convert nice value [19,-20] to rlimit style value [1,40] */
+       int nice_rlim = 20 - nice;
        return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
                capable(CAP_SYS_NICE));
 }
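Aside (not part of the patch): a worked example of the corrected nice-to-rlimit conversion in can_nice(). With nice_rlim = 20 - nice, an RLIMIT_NICE of 20 permits renicing down to nice 0 but not to -1; can_nice_demo() is a simplified stand-in that drops the CAP_SYS_NICE override.

/* Worked example of the corrected mapping: nice 19..-20 corresponds to
 * rlimit-style values 1..40 via nice_rlim = 20 - nice. The CAP_SYS_NICE
 * override of the real can_nice() is omitted here. */
#include <stdio.h>

static int can_nice_demo(int nice, unsigned long rlim_cur)
{
        int nice_rlim = 20 - nice;      /* 19 -> 1, 0 -> 20, -20 -> 40 */

        return nice_rlim <= (int)rlim_cur;
}

int main(void)
{
        /* with RLIMIT_NICE at 20, renicing to 0 is allowed, to -1 is not */
        printf("%d %d\n", can_nice_demo(0, 20), can_nice_demo(-1, 20));
        return 0;
}
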
@@ -3498,7 +3566,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
+int sched_setscheduler(struct task_struct *p, int policy,
+                      struct sched_param *param)
 {
        int retval;
        int oldprio, oldpolicy = -1;
@@ -3518,7 +3587,7 @@ recheck:
         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
         */
        if (param->sched_priority < 0 ||
-           (p->mm &&  param->sched_priority > MAX_USER_RT_PRIO-1) ||
+           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
            (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                return -EINVAL;
        if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3581,7 +3650,8 @@ recheck:
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
-static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
        int retval;
        struct sched_param lparam;
@@ -3912,7 +3982,7 @@ EXPORT_SYMBOL(cond_resched);
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t * lock)
+int cond_resched_lock(spinlock_t *lock)
 {
        int ret = 0;
 
@@ -4095,7 +4165,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
        return list_entry(p->sibling.next,struct task_struct,sibling);
 }
 
-static void show_task(task_t * p)
+static void show_task(task_t *p)
 {
        task_t *relative;
        unsigned state;
@@ -4121,7 +4191,7 @@ static void show_task(task_t * p)
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
        {
-               unsigned long * n = (unsigned long *) (p->thread_info+1);
+               unsigned long *n = (unsigned long *) (p->thread_info+1);
                while (!*n)
                        n++;
                free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4330,7 +4400,7 @@ out:
  * thread migration by bumping thread off CPU then 'pushing' onto
  * another runqueue.
  */
-static int migration_thread(void * data)
+static int migration_thread(void *data)
 {
        runqueue_t *rq;
        int cpu = (long)data;
@@ -4779,7 +4849,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
        runqueue_t *rq = cpu_rq(cpu);
        struct sched_domain *tmp;
@@ -4802,7 +4872,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4900,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void init_sched_build_groups(struct sched_group groups[],
-                       cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+                                   int (*group_fn)(int cpu))
 {
        struct sched_group *first = NULL, *last = NULL;
        cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4934,85 @@ void init_sched_build_groups(struct sched_group groups[],
        last->next = first;
 }
 
+#define SD_NODES_PER_DOMAIN 16
 
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+       int i, n, val, min_val, best_node = 0;
+
+       min_val = INT_MAX;
+
+       for (i = 0; i < MAX_NUMNODES; i++) {
+               /* Start at @node */
+               n = (node + i) % MAX_NUMNODES;
+
+               if (!nr_cpus_node(n))
+                       continue;
+
+               /* Skip already used nodes */
+               if (test_bit(n, used_nodes))
+                       continue;
+
+               /* Simple min distance search */
+               val = node_distance(node, n);
+
+               if (val < min_val) {
+                       min_val = val;
+                       best_node = n;
+               }
+       }
+
+       set_bit(best_node, used_nodes);
+       return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+       int i;
+       cpumask_t span, nodemask;
+       DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+       cpus_clear(span);
+       bitmap_zero(used_nodes, MAX_NUMNODES);
+
+       nodemask = node_to_cpumask(node);
+       cpus_or(span, span, nodemask);
+       set_bit(node, used_nodes);
+
+       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+               int next_node = find_next_best_node(node, used_nodes);
+               nodemask = node_to_cpumask(next_node);
+               cpus_or(span, span, nodemask);
+       }
+
+       return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
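Aside (not part of the patch): find_next_best_node() above is a greedy nearest-first selection over node_distance(). The toy program below shows the same idea with a made-up 4-node distance table; starting from node 0 the remaining nodes are picked in the order 1, 3, 2.

/* Toy illustration of the greedy selection in find_next_best_node(): with
 * the made-up distance table below, nodes are added to node 0's span in
 * order of increasing distance. All numbers are invented. */
#include <limits.h>
#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 40, 30 },
        { 20, 10, 30, 40 },
        { 40, 30, 10, 20 },
        { 30, 40, 20, 10 },
};

static int next_best_node(int node, int used[NR_NODES])
{
        int n, best = -1, min_val = INT_MAX;

        for (n = 0; n < NR_NODES; n++) {
                if (used[n])
                        continue;
                if (dist[node][n] < min_val) {
                        min_val = dist[node][n];
                        best = n;
                }
        }
        if (best >= 0)
                used[best] = 1;
        return best;
}

int main(void)
{
        int used[NR_NODES] = { 1, 0, 0, 0 };    /* node 0 already in the span */
        int i;

        for (i = 1; i < NR_NODES; i++)
                printf("%d ", next_best_node(0, used));
        printf("\n");                           /* prints "1 3 2" */
        return 0;
}
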
@@ -4891,36 +5034,20 @@ static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-       return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
 /*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * init_sched_build_groups() can't handle what we want to do with node
+ * groups, so we roll our own. Each node now has its own list of groups,
+ * which is dynamically allocated.
  */
-static void check_sibling_maps(void)
-{
-       int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 
-       for_each_online_cpu(i) {
-               for_each_cpu_mask(j, cpu_sibling_map[i]) {
-                       if (cpu_to_node(i) != cpu_to_node(j)) {
-                               printk(KERN_INFO "warning: CPU %d siblings map "
-                                       "to different node - isolating "
-                                       "them.\n", i);
-                               cpu_sibling_map[i] = cpumask_of_cpu(i);
-                               break;
-                       }
-               }
-       }
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+       return cpu_to_node(cpu);
 }
 #endif
 
@@ -4928,9 +5055,24 @@ static void check_sibling_maps(void)
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
        int i;
+#ifdef CONFIG_NUMA
+       struct sched_group **sched_group_nodes = NULL;
+       struct sched_group *sched_group_allnodes = NULL;
+
+       /*
+        * Allocate the per-node list of sched groups
+        */
+       sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+                                          GFP_ATOMIC);
+       if (!sched_group_nodes) {
+               printk(KERN_WARNING "Can not alloc sched group node list\n");
+               return;
+       }
+       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
 
        /*
         * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5085,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
                cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
+               if (cpus_weight(*cpu_map)
+                               > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+                       if (!sched_group_allnodes) {
+                               sched_group_allnodes
+                                       = kmalloc(sizeof(struct sched_group)
+                                                       * MAX_NUMNODES,
+                                                 GFP_KERNEL);
+                               if (!sched_group_allnodes) {
+                                       printk(KERN_WARNING
+                                       "Can not alloc allnodes sched group\n");
+                                       break;
+                               }
+                               sched_group_allnodes_bycpu[i]
+                                               = sched_group_allnodes;
+                       }
+                       sd = &per_cpu(allnodes_domains, i);
+                       *sd = SD_ALLNODES_INIT;
+                       sd->span = *cpu_map;
+                       group = cpu_to_allnodes_group(i);
+                       sd->groups = &sched_group_allnodes[group];
+                       p = sd;
+               } else
+                       p = NULL;
+
                sd = &per_cpu(node_domains, i);
-               group = cpu_to_node_group(i);
                *sd = SD_NODE_INIT;
-               sd->span = *cpu_map;
-               sd->groups = &sched_group_nodes[group];
+               sd->span = sched_domain_node_span(cpu_to_node(i));
+               sd->parent = p;
+               cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
                p = sd;
@@ -4972,7 +5138,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-       for_each_online_cpu(i) {
+       for_each_cpu_mask(i, *cpu_map) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
                cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5163,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
        /* Set up node groups */
-       init_sched_build_groups(sched_group_nodes, *cpu_map,
-                                       &cpu_to_node_group);
+       if (sched_group_allnodes)
+               init_sched_build_groups(sched_group_allnodes, *cpu_map,
+                                       &cpu_to_allnodes_group);
+
+       for (i = 0; i < MAX_NUMNODES; i++) {
+               /* Set up node groups */
+               struct sched_group *sg, *prev;
+               cpumask_t nodemask = node_to_cpumask(i);
+               cpumask_t domainspan;
+               cpumask_t covered = CPU_MASK_NONE;
+               int j;
+
+               cpus_and(nodemask, nodemask, *cpu_map);
+               if (cpus_empty(nodemask)) {
+                       sched_group_nodes[i] = NULL;
+                       continue;
+               }
+
+               domainspan = sched_domain_node_span(i);
+               cpus_and(domainspan, domainspan, *cpu_map);
+
+               sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+               sched_group_nodes[i] = sg;
+               for_each_cpu_mask(j, nodemask) {
+                       struct sched_domain *sd;
+                       sd = &per_cpu(node_domains, j);
+                       sd->groups = sg;
+                       if (sd->groups == NULL) {
+                               /* Turn off balancing if we have no groups */
+                               sd->flags = 0;
+                       }
+               }
+               if (!sg) {
+                       printk(KERN_WARNING
+                       "Can not alloc domain group for node %d\n", i);
+                       continue;
+               }
+               sg->cpu_power = 0;
+               sg->cpumask = nodemask;
+               cpus_or(covered, covered, nodemask);
+               prev = sg;
+
+               for (j = 0; j < MAX_NUMNODES; j++) {
+                       cpumask_t tmp, notcovered;
+                       int n = (i + j) % MAX_NUMNODES;
+
+                       cpus_complement(notcovered, covered);
+                       cpus_and(tmp, notcovered, *cpu_map);
+                       cpus_and(tmp, tmp, domainspan);
+                       if (cpus_empty(tmp))
+                               break;
+
+                       nodemask = node_to_cpumask(n);
+                       cpus_and(tmp, tmp, nodemask);
+                       if (cpus_empty(tmp))
+                               continue;
+
+                       sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+                       if (!sg) {
+                               printk(KERN_WARNING
+                               "Can not alloc domain group for node %d\n", j);
+                               break;
+                       }
+                       sg->cpu_power = 0;
+                       sg->cpumask = tmp;
+                       cpus_or(covered, covered, tmp);
+                       prev->next = sg;
+                       prev = sg;
+               }
+               prev->next = sched_group_nodes[i];
+       }
 #endif
 
        /* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5252,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
                sd->groups->cpu_power = power;
 
 #ifdef CONFIG_NUMA
-               if (i == first_cpu(sd->groups->cpumask)) {
-                       /* Only add "power" once for each physical package. */
-                       sd = &per_cpu(node_domains, i);
-                       sd->groups->cpu_power += power;
+               sd = &per_cpu(allnodes_domains, i);
+               if (sd->groups) {
+                       power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+                               (cpus_weight(sd->groups->cpumask)-1) / 10;
+                       sd->groups->cpu_power = power;
                }
 #endif
        }
 
+#ifdef CONFIG_NUMA
+       for (i = 0; i < MAX_NUMNODES; i++) {
+               struct sched_group *sg = sched_group_nodes[i];
+               int j;
+
+               if (sg == NULL)
+                       continue;
+next_sg:
+               for_each_cpu_mask(j, sg->cpumask) {
+                       struct sched_domain *sd;
+                       int power;
+
+                       sd = &per_cpu(phys_domains, j);
+                       if (j != first_cpu(sd->groups->cpumask)) {
+                               /*
+                                * Only add "power" once for each
+                                * physical package.
+                                */
+                               continue;
+                       }
+                       power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+                               (cpus_weight(sd->groups->cpumask)-1) / 10;
+
+                       sg->cpu_power += power;
+               }
+               sg = sg->next;
+               if (sg != sched_group_nodes[i])
+                       goto next_sg;
+       }
+#endif
+
        /* Attach the domains */
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd;
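Aside (not part of the patch): a worked example of the cpu_power formula used above for the physical and node groups, assuming SCHED_LOAD_SCALE is 128 (its historical value); group_power() is an invented helper name.

/* Worked example of the group cpu_power formula used above, assuming
 * SCHED_LOAD_SCALE is 128: each CPU beyond the first adds only 10% of
 * SCHED_LOAD_SCALE to the group's capacity. */
#include <stdio.h>

#define SCHED_LOAD_SCALE_DEMO 128UL

static unsigned long group_power(unsigned int nr_cpus)
{
        return SCHED_LOAD_SCALE_DEMO +
               SCHED_LOAD_SCALE_DEMO * (nr_cpus - 1) / 10;
}

int main(void)
{
        /* 1 CPU -> 128, 2 CPUs -> 140, 4 CPUs -> 166 */
        printf("%lu %lu %lu\n", group_power(1), group_power(2),
               group_power(4));
        return 0;
}
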
@@ -5039,13 +5306,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
        cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-       check_sibling_maps();
-#endif
        /*
         * Setup mask for cpus without special case scheduling requirements.
         * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5322,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-       /* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+       int i;
+       int cpu;
+
+       for_each_cpu_mask(cpu, *cpu_map) {
+               struct sched_group *sched_group_allnodes
+                       = sched_group_allnodes_bycpu[cpu];
+               struct sched_group **sched_group_nodes
+                       = sched_group_nodes_bycpu[cpu];
 
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+               if (sched_group_allnodes) {
+                       kfree(sched_group_allnodes);
+                       sched_group_allnodes_bycpu[cpu] = NULL;
+               }
+
+               if (!sched_group_nodes)
+                       continue;
+
+               for (i = 0; i < MAX_NUMNODES; i++) {
+                       cpumask_t nodemask = node_to_cpumask(i);
+                       struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+                       cpus_and(nodemask, nodemask, *cpu_map);
+                       if (cpus_empty(nodemask))
+                               continue;
+
+                       if (sg == NULL)
+                               continue;
+                       sg = sg->next;
+next_sg:
+                       oldsg = sg;
+                       sg = sg->next;
+                       kfree(oldsg);
+                       if (oldsg != sched_group_nodes[i])
+                               goto next_sg;
+               }
+               kfree(sched_group_nodes);
+               sched_group_nodes_bycpu[cpu] = NULL;
+       }
+#endif
+}
 
 /*
  * Detach sched domains from a group of cpus specified in cpu_map