/home/lenb/src/to-linus-stable branch 'acpi-2.6.12'

[pandora-kernel.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 2711130..a646e4f 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
  #define SCALE_PRIO(x, prio) \
         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
  
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
  {
         if (p->static_prio < NICE_TO_PRIO(0))
                 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -260,22 +260,86 @@ struct runqueue {
  
  static DEFINE_PER_CPU(struct runqueue, runqueues);
  
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
  #define for_each_domain(cpu, domain) \
-       for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
  
  #define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
  #define this_rq()              (&__get_cpu_var(runqueues))
  #define task_rq(p)             cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
  
-/*
- * Default context-switch locking:
- */
  #ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next) do { } while (0)
-# define finish_arch_switch(rq, next)  spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)           ((rq)->curr == (p))
+# define prepare_arch_switch(next)     do { } while (0)
  #endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)      do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+       return rq->curr == p;   /* p runs iff it is rq's current task */
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{      /* no-op here; finish_lock_switch() drops rq->lock after the switch */
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+       spin_unlock_irq(&rq->lock);     /* drop rq->lock, interrupts back on */
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+       return p->oncpu;        /* flag kept by {prepare,finish}_lock_switch */
+#else
+       return rq->curr == p;   /* !SMP: ->oncpu is not maintained */
+#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+       /*
+        * Mark 'next' as on-CPU before rq->lock is dropped. This can be
+        * optimised out completely for !SMP, because the SMP rebalancing
+        * from interrupt is the only thing that cares here.
+        */
+       next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       spin_unlock_irq(&rq->lock);     /* switch runs with irqs enabled */
+#else
+       spin_unlock(&rq->lock);         /* irqs stay off until finish_lock_switch */
+#endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+       /*
+        * After ->oncpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        */
+       smp_wmb();              /* earlier stores become visible before the clear */
+       prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       local_irq_enable();     /* prepare_lock_switch used plain spin_unlock() */
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
  
  /*
   * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
  
  #ifdef CONFIG_SMP
                 /* domain-specific stats */
+               preempt_disable();
                 for_each_domain(cpu, sd) {
                         enum idle_type itype;
                         char mask_str[NR_CPUS];
@@ -362,6 +427,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
                             sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
                             sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
                 }
+               preempt_enable();
  #endif
         }
         return 0;
@@ -607,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
         rq->nr_running++;
  }
  
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
  {
         /* Caller must always ensure 'now >= p->timestamp' */
         unsigned long long __sleep_time = now - p->timestamp;
@@ -666,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
                 }
         }
  
-       p->prio = effective_prio(p);
+       return effective_prio(p);
  }
  
  /*
@@ -689,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
         }
  #endif
  
-       recalc_task_prio(p, now);
+       p->prio = recalc_task_prio(p, now);
  
         /*
          * This checks to make sure it's not an uninterruptible task
@@ -767,22 +833,12 @@ inline int task_curr(const task_t *p)
  }
  
  #ifdef CONFIG_SMP
-enum request_type {
-       REQ_MOVE_TASK,
-       REQ_SET_DOMAIN,
-};
-
  typedef struct {
         struct list_head list;
-       enum request_type type;
  
-       /* For REQ_MOVE_TASK */
         task_t *task;
         int dest_cpu;
  
-       /* For REQ_SET_DOMAIN */
-       struct sched_domain *sd;
-
         struct completion done;
  } migration_req_t;
  
@@ -804,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
         }
  
         init_completion(&req->done);
-       req->type = REQ_MOVE_TASK;
         req->task = p;
         req->dest_cpu = dest_cpu;
         list_add(&req->list, &rq->migration_queue);
@@ -966,8 +1021,59 @@ static int find_idlest_cpu(struct sched_group *group, int this_cpu)
         return idlest;
  }
  
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group, one domain level at a time.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled (for_each_domain() walks the RCU-protected tree).
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+       struct task_struct *t = current;
+       struct sched_domain *tmp, *sd = NULL;
  
-#endif
+       for_each_domain(cpu, tmp)
+               if (tmp->flags & flag)
+                       sd = tmp;       /* last match is furthest up ->parent */
+
+       while (sd) {
+               cpumask_t span;
+               struct sched_group *group;
+               int new_cpu;
+               int weight;
+
+               span = sd->span;        /* remembered to bound the next search */
+               group = find_idlest_group(sd, t, cpu);
+               if (!group)
+                       goto nextlevel;
+
+               new_cpu = find_idlest_cpu(group, cpu);
+               if (new_cpu == -1 || new_cpu == cpu)
+                       goto nextlevel;
+
+               /* Now try balancing at a lower domain level */
+               cpu = new_cpu;
+nextlevel:
+               sd = NULL;
+               weight = cpus_weight(span);     /* size of the level just tried */
+               for_each_domain(cpu, tmp) {
+                       if (weight <= cpus_weight(tmp->span))
+                               break;  /* only strictly smaller domains qualify */
+                       if (tmp->flags & flag)
+                               sd = tmp;
+               }
+               /* while loop will break here if sd == NULL */
+       }
+
+       return cpu;
+}
+
+#endif /* CONFIG_SMP */
  
  /*
   * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -1185,8 +1291,15 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
   * Perform scheduler related setup for a newly forked process p.
   * p is forked by current.
   */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
  {
+       int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+       cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+       set_task_cpu(p, cpu);
+
         /*
          * We mark the process as running here, but have not actually
          * inserted it onto the runqueue yet. This guarantees that
@@ -1196,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
         p->state = TASK_RUNNING;
         INIT_LIST_HEAD(&p->run_list);
         p->array = NULL;
-       spin_lock_init(&p->switch_lock);
  #ifdef CONFIG_SCHEDSTATS
         memset(&p->sched_info, 0, sizeof(p->sched_info));
  #endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+       p->oncpu = 0;
+#endif
  #ifdef CONFIG_PREEMPT
-       /*
-        * During context-switch we hold precisely one spinlock, which
-        * schedule_tail drops. (in the common case it's this_rq()->lock,
-        * but it also can be p->switch_lock.) So we compensate with a count
-        * of 1. Also, we want to start with kernel preemption disabled.
-        */
+       /* Want to start with kernel preemption disabled. */
         p->thread_info->preempt_count = 1;
  #endif
         /*
@@ -1230,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
                  * runqueue lock is not a problem.
                  */
                 current->time_slice = 1;
-               preempt_disable();
                 scheduler_tick();
-               local_irq_enable();
-               preempt_enable();
-       } else
-               local_irq_enable();
+       }
+       local_irq_enable();
+       put_cpu();
  }
  
  /*
@@ -1250,49 +1358,12 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
         unsigned long flags;
         int this_cpu, cpu;
         runqueue_t *rq, *this_rq;
-#ifdef CONFIG_SMP
-       struct sched_domain *tmp, *sd = NULL;
-#endif
  
         rq = task_rq_lock(p, &flags);
         BUG_ON(p->state != TASK_RUNNING);
         this_cpu = smp_processor_id();
         cpu = task_cpu(p);
  
-#ifdef CONFIG_SMP
-       for_each_domain(cpu, tmp)
-               if (tmp->flags & SD_BALANCE_FORK)
-                       sd = tmp;
-
-       if (sd) {
-               int new_cpu;
-               struct sched_group *group;
-
-               schedstat_inc(sd, sbf_cnt);
-               cpu = task_cpu(p);
-               group = find_idlest_group(sd, p, cpu);
-               if (!group) {
-                       schedstat_inc(sd, sbf_balanced);
-                       goto no_forkbalance;
-               }
-
-               new_cpu = find_idlest_cpu(group, cpu);
-               if (new_cpu == -1 || new_cpu == cpu) {
-                       schedstat_inc(sd, sbf_balanced);
-                       goto no_forkbalance;
-               }
-
-               if (cpu_isset(new_cpu, p->cpus_allowed)) {
-                       schedstat_inc(sd, sbf_pushed);
-                       set_task_cpu(p, new_cpu);
-                       task_rq_unlock(rq, &flags);
-                       rq = task_rq_lock(p, &flags);
-                       cpu = task_cpu(p);
-               }
-       }
-
-no_forkbalance:
-#endif
         /*
          * We decrease the sleep average of forking parents
          * and children as well, to keep max-interactive tasks
@@ -1387,23 +1458,41 @@ void fastcall sched_exit(task_t * p)
         task_rq_unlock(rq, &flags);
  }
  
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+       prepare_lock_switch(rq, next);  /* runqueue locking side first */
+       prepare_arch_switch(next);      /* arch hook; default is a no-op */
+}
+
  /**
   * finish_task_switch - clean up after a task-switch
   * @prev: the thread we just switched away from.
   *
- * We enter this with the runqueue still locked, and finish_arch_switch()
- * will unlock it along with doing any other architecture-specific cleanup
- * actions.
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
   *
   * Note that we may have delayed dropping an mm in context_switch(). If
   * so, we finish that here outside of the runqueue lock.  (Doing it
   * with the lock held can cause deadlocks; see schedule() for
   * details.)
   */
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
         __releases(rq->lock)
  {
-       runqueue_t *rq = this_rq();
         struct mm_struct *mm = rq->prev_mm;
         unsigned long prev_task_flags;
  
@@ -1421,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
          *              Manfred Spraul <manfred@colorfullife.com>
          */
         prev_task_flags = prev->flags;
-       finish_arch_switch(rq, prev);
+       finish_arch_switch(prev);
+       finish_lock_switch(rq, prev);
         if (mm)
                 mmdrop(mm);
         if (unlikely(prev_task_flags & PF_DEAD))
@@ -1435,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
  asmlinkage void schedule_tail(task_t *prev)
         __releases(rq->lock)
  {
-       finish_task_switch(prev);
-
+       runqueue_t *rq = this_rq();
+       finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+       /* In this case, finish_task_switch does not reenable preemption */
+       preempt_enable();
+#endif
         if (current->set_child_tid)
                 put_user(current->pid, current->set_child_tid);
  }
@@ -1618,42 +1712,16 @@ out:
  }
  
  /*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
   */
  void sched_exec(void)
  {
-       struct sched_domain *tmp, *sd = NULL;
         int new_cpu, this_cpu = get_cpu();
-
-       for_each_domain(this_cpu, tmp)
-               if (tmp->flags & SD_BALANCE_EXEC)
-                       sd = tmp;
-
-       if (sd) {
-               struct sched_group *group;
-               schedstat_inc(sd, sbe_cnt);
-               group = find_idlest_group(sd, current, this_cpu);
-               if (!group) {
-                       schedstat_inc(sd, sbe_balanced);
-                       goto out;
-               }
-               new_cpu = find_idlest_cpu(group, this_cpu);
-               if (new_cpu == -1 || new_cpu == this_cpu) {
-                       schedstat_inc(sd, sbe_balanced);
-                       goto out;
-               }
-
-               schedstat_inc(sd, sbe_pushed);
-               put_cpu();
-               sched_migrate_task(current, new_cpu);
-               return;
-       }
-out:
+       new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
         put_cpu();
+       if (new_cpu != this_cpu)
+               sched_migrate_task(current, new_cpu);
  }
  
  /*
@@ -1962,6 +2030,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
         return busiest;
  }
  
+/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * fine so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL    512
+
  /*
   * Check this_cpu to ensure it is balanced within domain. Attempt to move
   * tasks if there is an imbalance.
@@ -1974,7 +2048,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
         struct sched_group *group;
         runqueue_t *busiest;
         unsigned long imbalance;
-       int nr_moved, all_pinned;
+       int nr_moved, all_pinned = 0;
         int active_balance = 0;
  
         spin_lock(&this_rq->lock);
@@ -2065,7 +2139,8 @@ out_balanced:
  
         sd->nr_balance_failed = 0;
         /* tune up the balancing interval */
-       if (sd->balance_interval < sd->max_interval)
+       if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+                       (sd->balance_interval < sd->max_interval))
                 sd->balance_interval *= 2;
  
         return 0;
@@ -2502,11 +2577,15 @@ out:
  #ifdef CONFIG_SCHED_SMT
  static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
  {
-       struct sched_domain *sd = this_rq->sd;
+       struct sched_domain *tmp, *sd = NULL;
         cpumask_t sibling_map;
         int i;
  
-       if (!(sd->flags & SD_SHARE_CPUPOWER))
+       for_each_domain(this_cpu, tmp)
+               if (tmp->flags & SD_SHARE_CPUPOWER)
+                       sd = tmp;
+
+       if (!sd)
                 return;
  
         /*
@@ -2547,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
  
  static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
  {
-       struct sched_domain *sd = this_rq->sd;
+       struct sched_domain *tmp, *sd = NULL;
         cpumask_t sibling_map;
         prio_array_t *array;
         int ret = 0, i;
         task_t *p;
  
-       if (!(sd->flags & SD_SHARE_CPUPOWER))
+       for_each_domain(this_cpu, tmp)
+               if (tmp->flags & SD_SHARE_CPUPOWER)
+                       sd = tmp;
+
+       if (!sd)
                 return 0;
  
         /*
@@ -2668,7 +2751,7 @@ asmlinkage void __sched schedule(void)
         struct list_head *queue;
         unsigned long long now;
         unsigned long run_time;
-       int cpu, idx;
+       int cpu, idx, new_prio;
  
         /*
          * Test if we are atomic.  Since do_exit() needs to call into
@@ -2790,9 +2873,14 @@ go_idle:
                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
  
                 array = next->array;
-               dequeue_task(next, array);
-               recalc_task_prio(next, next->timestamp + delta);
-               enqueue_task(next, array);
+               new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+               if (unlikely(next->prio != new_prio)) {
+                       dequeue_task(next, array);
+                       next->prio = new_prio;
+                       enqueue_task(next, array);
+               } else
+                       requeue_task(next, array);
         }
         next->activated = 0;
  switch_tasks:
@@ -2816,11 +2904,15 @@ switch_tasks:
                 rq->curr = next;
                 ++*switch_count;
  
-               prepare_arch_switch(rq, next);
+               prepare_task_switch(rq, next);
                 prev = context_switch(rq, prev, next);
                 barrier();
-
-               finish_task_switch(prev);
+               /*
+                * this_rq must be evaluated again because prev may have moved
+                * CPUs since it called schedule(), thus the 'rq' on its stack
+                * frame will be invalid.
+                */
+               finish_task_switch(this_rq(), prev);
         } else
                 spin_unlock_irq(&rq->lock);
  
@@ -3356,15 +3448,7 @@ int task_nice(const task_t *p)
  {
         return TASK_NICE(p);
  }
-
-/*
- * The only users of task_nice are binfmt_elf and binfmt_elf32.
- * binfmt_elf is no longer modular, but binfmt_elf32 still is.
- * Therefore, task_nice is needed if there is a compat_mode.
- */
-#ifdef CONFIG_COMPAT
  EXPORT_SYMBOL_GPL(task_nice);
-#endif
  
  /**
   * idle_cpu - is a given cpu idle currently?
@@ -3402,7 +3486,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
         p->policy = policy;
         p->rt_priority = prio;
         if (policy != SCHED_NORMAL)
-               p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
+               p->prio = MAX_RT_PRIO-1 - p->rt_priority;
         else
                 p->prio = p->static_prio;
  }
@@ -3434,18 +3518,31 @@ recheck:
          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
          */
         if (param->sched_priority < 0 ||
-           param->sched_priority > MAX_USER_RT_PRIO-1)
+           (p->mm &&  param->sched_priority > MAX_USER_RT_PRIO-1) ||
+           (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                 return -EINVAL;
         if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
                 return -EINVAL;
  
-       if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
-           param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
-           !capable(CAP_SYS_NICE))
-               return -EPERM;
-       if ((current->euid != p->euid) && (current->euid != p->uid) &&
-           !capable(CAP_SYS_NICE))
-               return -EPERM;
+       /*
+        * Allow unprivileged RT tasks to decrease priority:
+        */
+       if (!capable(CAP_SYS_NICE)) {
+               /* can't change policy */
+               if (policy != p->policy &&
+                       !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+                       return -EPERM;
+               /* can't increase priority */
+               if (policy != SCHED_NORMAL &&
+                   param->sched_priority > p->rt_priority &&
+                   param->sched_priority >
+                               p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+                       return -EPERM;
+               /* can't change other user's priorities */
+               if ((current->euid != p->euid) &&
+                   (current->euid != p->uid))
+                       return -EPERM;
+       }
  
         retval = security_task_setscheduler(p, policy, param);
         if (retval)
@@ -3782,6 +3879,13 @@ asmlinkage long sys_sched_yield(void)
  
  static inline void __cond_resched(void)
  {
+       /*
+        * The BKS might be reacquired before we have dropped
+        * PREEMPT_ACTIVE, which could trigger a second
+        * cond_resched() call.
+        */
+       if (unlikely(preempt_count()))
+               return;
         do {
                 add_preempt_count(PREEMPT_ACTIVE);
                 schedule();
@@ -4071,6 +4175,14 @@ void show_state(void)
         read_unlock(&tasklist_lock);
  }
  
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
  void __devinit init_idle(task_t *idle, int cpu)
  {
         runqueue_t *rq = cpu_rq(cpu);
@@ -4085,7 +4197,9 @@ void __devinit init_idle(task_t *idle, int cpu)
  
         spin_lock_irqsave(&rq->lock, flags);
         rq->curr = rq->idle = idle;
-       set_tsk_need_resched(idle);
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+       idle->oncpu = 1;
+#endif
         spin_unlock_irqrestore(&rq->lock, flags);
  
         /* Set the preempt count _outside_ the spinlocks! */
@@ -4229,8 +4343,7 @@ static int migration_thread(void * data)
                 struct list_head *head;
                 migration_req_t *req;
  
-               if (current->flags & PF_FREEZE)
-                       refrigerator(PF_FREEZE);
+               try_to_freeze();
  
                 spin_lock_irq(&rq->lock);
  
@@ -4255,17 +4368,9 @@ static int migration_thread(void * data)
                 req = list_entry(head->next, migration_req_t, list);
                 list_del_init(head->next);
  
-               if (req->type == REQ_MOVE_TASK) {
-                       spin_unlock(&rq->lock);
-                       __migrate_task(req->task, cpu, req->dest_cpu);
-                       local_irq_enable();
-               } else if (req->type == REQ_SET_DOMAIN) {
-                       rq->sd = req->sd;
-                       spin_unlock_irq(&rq->lock);
-               } else {
-                       spin_unlock_irq(&rq->lock);
-                       WARN_ON(1);
-               }
+               spin_unlock(&rq->lock);
+               __migrate_task(req->task, cpu, req->dest_cpu);
+               local_irq_enable();
  
                 complete(&req->done);
         }
@@ -4496,7 +4601,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
                         migration_req_t *req;
                         req = list_entry(rq->migration_queue.next,
                                          migration_req_t, list);
-                       BUG_ON(req->type != REQ_MOVE_TASK);
                         list_del_init(&req->list);
                         complete(&req->done);
                 }
@@ -4527,12 +4631,17 @@ int __init migration_init(void)
  #endif
  
  #ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
  #ifdef SCHED_DOMAIN_DEBUG
  static void sched_domain_debug(struct sched_domain *sd, int cpu)
  {
         int level = 0;
  
+       if (!sd) {
+               printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+               return;
+       }
+
         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
  
         do {
@@ -4615,37 +4724,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
  #define sched_domain_debug(sd, cpu) {}
  #endif
  
+static int sd_degenerate(struct sched_domain *sd)
+{
+       if (cpus_weight(sd->span) == 1) /* domain spans a single CPU */
+               return 1;
+
+       /* Following flags need at least 2 groups */
+       if (sd->flags & (SD_LOAD_BALANCE |
+                        SD_BALANCE_NEWIDLE |
+                        SD_BALANCE_FORK |
+                        SD_BALANCE_EXEC)) {
+               if (sd->groups != sd->groups->next)     /* more than one group */
+                       return 0;
+       }
+
+       /* Following flags don't use groups */
+       if (sd->flags & (SD_WAKE_IDLE |
+                        SD_WAKE_AFFINE |
+                        SD_WAKE_BALANCE))
+               return 0;
+
+       return 1;       /* none of the flags tested above is usable here */
+}
+
+static int sd_parent_degenerate(struct sched_domain *sd,
+                                               struct sched_domain *parent)
+{
+       unsigned long cflags = sd->flags, pflags = parent->flags;
+
+       if (sd_degenerate(parent))      /* parent is degenerate by itself */
+               return 1;
+
+       if (!cpus_equal(sd->span, parent->span))        /* spans differ: keep it */
+               return 0;
+
+       /* Does parent contain flags not in child? */
+       /* WAKE_BALANCE is a subset of WAKE_AFFINE */
+       if (cflags & SD_WAKE_AFFINE)
+               pflags &= ~SD_WAKE_BALANCE;
+       /* Flags needing groups don't count if only 1 group in parent */
+       if (parent->groups == parent->groups->next) {
+               pflags &= ~(SD_LOAD_BALANCE |
+                               SD_BALANCE_NEWIDLE |
+                               SD_BALANCE_FORK |
+                               SD_BALANCE_EXEC);
+       }
+       if (~cflags & pflags)   /* a remaining parent flag is missing in child */
+               return 0;
+
+       return 1;       /* same span, and the child covers every remaining flag */
+}
+
  /*
   * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
   * hold the hotplug lock.
   */
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
  {
-       migration_req_t req;
-       unsigned long flags;
         runqueue_t *rq = cpu_rq(cpu);
-       int local = 1;
-
-       sched_domain_debug(sd, cpu);
-
-       spin_lock_irqsave(&rq->lock, flags);
+       struct sched_domain *tmp;
  
-       if (cpu == smp_processor_id() || !cpu_online(cpu)) {
-               rq->sd = sd;
-       } else {
-               init_completion(&req.done);
-               req.type = REQ_SET_DOMAIN;
-               req.sd = sd;
-               list_add(&req.list, &rq->migration_queue);
-               local = 0;
+       /* Remove the sched domains which do not contribute to scheduling. */
+       for (tmp = sd; tmp; tmp = tmp->parent) { /* NOTE(review): a newly linked parent is not re-checked - confirm */
+               struct sched_domain *parent = tmp->parent;
+               if (!parent)
+                       break;
+               if (sd_parent_degenerate(tmp, parent))
+                       tmp->parent = parent->parent;   /* splice parent out of the chain */
         }
  
-       spin_unlock_irqrestore(&rq->lock, flags);
+       if (sd && sd_degenerate(sd))
+               sd = sd->parent;        /* base domain itself is degenerate: skip it */
  
-       if (!local) {
-               wake_up_process(rq->migration_thread);
-               wait_for_completion(&req.done);
-       }
+       sched_domain_debug(sd, cpu);
+
+       rcu_assign_pointer(rq->sd, sd); /* publish for rcu_dereference() readers */
  }
  
  /* cpus with isolated domains */
@@ -4677,7 +4830,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
   * covered by the given span, and will set each group's ->cpumask correctly,
   * and ->cpu_power to 0.
   */
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
                         cpumask_t span, int (*group_fn)(int cpu))
  {
         struct sched_group *first = NULL, *last = NULL;
@@ -4713,13 +4866,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
  
  
  #ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
  #else
  #ifdef CONFIG_SCHED_SMT
  static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
  static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
  {
         return cpu;
  }
@@ -4727,7 +4881,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
  
  static DEFINE_PER_CPU(struct sched_domain, phys_domains);
  static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
  {
  #ifdef CONFIG_SCHED_SMT
         return first_cpu(cpu_sibling_map[cpu]);
@@ -4740,7 +4894,7 @@ static int __devinit cpu_to_phys_group(int cpu)
  
  static DEFINE_PER_CPU(struct sched_domain, node_domains);
  static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
  {
         return cpu_to_node(cpu);
  }
@@ -4771,39 +4925,28 @@ static void check_sibling_maps(void)
  #endif
  
  /*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
   */
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
  {
         int i;
-       cpumask_t cpu_default_map;
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-       check_sibling_maps();
-#endif
-       /*
-        * Setup mask for cpus without special case scheduling requirements.
-        * For now this just excludes isolated cpus, but could be used to
-        * exclude other special cases in the future.
-        */
-       cpus_complement(cpu_default_map, cpu_isolated_map);
-       cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
  
         /*
-        * Set up domains. Isolated domains just stay on the dummy domain.
+        * Set up domains for cpus specified by the cpu_map.
          */
-       for_each_cpu_mask(i, cpu_default_map) {
+       for_each_cpu_mask(i, *cpu_map) {
                 int group;
                 struct sched_domain *sd = NULL, *p;
                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
  
-               cpus_and(nodemask, nodemask, cpu_default_map);
+               cpus_and(nodemask, nodemask, *cpu_map);
  
  #ifdef CONFIG_NUMA
                 sd = &per_cpu(node_domains, i);
                 group = cpu_to_node_group(i);
                 *sd = SD_NODE_INIT;
-               sd->span = cpu_default_map;
+               sd->span = *cpu_map;
                 sd->groups = &sched_group_nodes[group];
  #endif
  
@@ -4821,7 +4964,7 @@ static void __devinit arch_init_sched_domains(void)
                 group = cpu_to_cpu_group(i);
                 *sd = SD_SIBLING_INIT;
                 sd->span = cpu_sibling_map[i];
-               cpus_and(sd->span, sd->span, cpu_default_map);
+               cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
                 sd->groups = &sched_group_cpus[group];
  #endif
@@ -4831,7 +4974,7 @@ static void __devinit arch_init_sched_domains(void)
         /* Set up CPU (sibling) groups */
         for_each_online_cpu(i) {
                 cpumask_t this_sibling_map = cpu_sibling_map[i];
-               cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+               cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                 if (i != first_cpu(this_sibling_map))
                         continue;
  
@@ -4844,7 +4987,7 @@ static void __devinit arch_init_sched_domains(void)
         for (i = 0; i < MAX_NUMNODES; i++) {
                 cpumask_t nodemask = node_to_cpumask(i);
  
-               cpus_and(nodemask, nodemask, cpu_default_map);
+               cpus_and(nodemask, nodemask, *cpu_map);
                 if (cpus_empty(nodemask))
                         continue;
  
@@ -4854,12 +4997,12 @@ static void __devinit arch_init_sched_domains(void)
  
  #ifdef CONFIG_NUMA
         /* Set up node groups */
-       init_sched_build_groups(sched_group_nodes, cpu_default_map,
+       init_sched_build_groups(sched_group_nodes, *cpu_map,
                                         &cpu_to_node_group);
  #endif
  
         /* Calculate CPU power for physical packages and nodes */
-       for_each_cpu_mask(i, cpu_default_map) {
+       for_each_cpu_mask(i, *cpu_map) {
                 int power;
                 struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
@@ -4883,7 +5026,7 @@ static void __devinit arch_init_sched_domains(void)
         }
  
         /* Attach the domains */
-       for_each_online_cpu(i) {
+       for_each_cpu_mask(i, *cpu_map) {
                 struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
                 sd = &per_cpu(cpu_domains, i);
@@ -4893,41 +5036,85 @@ static void __devinit arch_init_sched_domains(void)
                 cpu_attach_domain(sd, i);
         }
  }
+/*
+ * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(cpumask_t *cpu_map)
+{
+       cpumask_t cpu_default_map;
  
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+       check_sibling_maps();
+#endif
+       /*
+        * Setup mask for cpus without special case scheduling requirements.
+        * For now this just excludes isolated cpus, but could be used to
+        * exclude other special cases in the future.
+        */
+       cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+       build_sched_domains(&cpu_default_map);
+}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
  {
         /* Do nothing: everything is statically allocated. */
  }
-#endif
  
  #endif /* ARCH_HAS_SCHED_DOMAIN */
  
  /*
- * Initial dummy domain for early boot and for hotplug cpu. Being static,
- * it is initialized to zero, so all balancing flags are cleared which is
- * what we want.
+ * Detach sched domains from a group of cpus specified in cpu_map.
+ * These cpus will now be attached to the NULL domain.
+ */
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+       int i;
+
+       for_each_cpu_mask(i, *cpu_map)
+               cpu_attach_domain(NULL, i);     /* NULL: no domain attached */
+       synchronize_sched();    /* wait for an RCU quiescent period */
+       arch_destroy_sched_domains(cpu_map);    /* runs only after the wait */
+}
+
+/*
+ * Partition sched domains as specified by the two cpumasks.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for an RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains.
+ * Call with hotplug lock held.
   */
-static struct sched_domain sched_domain_dummy;
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+       cpumask_t change_map;   /* partition1 | partition2, online cpus only */
+
+       cpus_and(*partition1, *partition1, cpu_online_map); /* in place */
+       cpus_and(*partition2, *partition2, cpu_online_map); /* in place */
+       cpus_or(change_map, *partition1, *partition2);
+
+       /* Detach sched domains from all of the affected cpus */
+       detach_destroy_domains(&change_map);
+       if (!cpus_empty(*partition1))
+               build_sched_domains(partition1);
+       if (!cpus_empty(*partition2))
+               build_sched_domains(partition2);
+}
  
  #ifdef CONFIG_HOTPLUG_CPU
  /*
   * Force a reinitialization of the sched domains hierarchy.  The domains
   * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to a "dummy" domain
+ * code, so we temporarily attach all running cpus to the NULL domain
   * which will prevent rebalancing while the sched domains are recalculated.
   */
  static int update_sched_domains(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
  {
-       int i;
-
         switch (action) {
         case CPU_UP_PREPARE:
         case CPU_DOWN_PREPARE:
-               for_each_online_cpu(i)
-                       cpu_attach_domain(&sched_domain_dummy, i);
-               arch_destroy_sched_domains();
+               detach_destroy_domains(&cpu_online_map);
                 return NOTIFY_OK;
  
         case CPU_UP_CANCELED:
@@ -4943,7 +5130,7 @@ static int update_sched_domains(struct notifier_block *nfb,
         }
  
         /* The hotplug lock is already held by cpu_up/cpu_down */
-       arch_init_sched_domains();
+       arch_init_sched_domains(&cpu_online_map);
  
         return NOTIFY_OK;
  }
@@ -4952,7 +5139,7 @@ static int update_sched_domains(struct notifier_block *nfb,
  void __init sched_init_smp(void)
  {
         lock_cpu_hotplug();
-       arch_init_sched_domains();
+       arch_init_sched_domains(&cpu_online_map);
         unlock_cpu_hotplug();
         /* XXX: Theoretical race here - CPU may be hotplugged now */
         hotcpu_notifier(update_sched_domains, 0);
@@ -4988,7 +5175,7 @@ void __init sched_init(void)
                 rq->best_expired_prio = MAX_PRIO;
  
  #ifdef CONFIG_SMP
-               rq->sd = &sched_domain_dummy;
+               rq->sd = NULL;
                 for (j = 1; j < 3; j++)
                         rq->cpu_load[j] = 0;
                 rq->active_balance = 0;