Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 28 Feb 2010 18:23:41 +0000 (10:23 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 28 Feb 2010 18:23:41 +0000 (10:23 -0800)
* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: Fix race between ttwu() and task_rq_lock()
  sched: Fix SMT scheduler regression in find_busiest_queue()
  sched: Fix sched_mv_power_savings for !SMT
  kernel/sched.c: Suppress unused var warning

kernel/sched.c
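
Note on the headline fix, "sched: Fix race between ttwu() and task_rq_lock()":
try_to_wake_up() now migrates the task to its new cpu without holding either
runqueue lock, so a concurrent task_rq_lock() could otherwise lock a runqueue
the task is just leaving.  The fix makes the rq-locking helpers spin while the
task is mid-wakeup.  A minimal sketch of the pattern, distilled from the hunks
below (simplified: the __acquires() annotation and the irq-saving
task_rq_lock() variant are omitted):

    /*
     * True while ttwu() may still move the task.  PF_STARTING exempts the
     * fork path, which legitimately takes task_rq_lock() before the task is
     * woken (e.g. set_cpus_allowed_ptr() from the cpuset clone_ns code).
     */
    static inline int task_is_waking(struct task_struct *p)
    {
            return unlikely((p->state == TASK_WAKING) &&
                            !(p->flags & PF_STARTING));
    }

    static inline struct rq *__task_rq_lock(struct task_struct *p)
    {
            struct rq *rq;

            for (;;) {
                    while (task_is_waking(p))
                            cpu_relax();            /* wait out the wakeup */
                    rq = task_rq(p);
                    raw_spin_lock(&rq->lock);
                    /* recheck: the task may have moved before we took the lock */
                    if (likely(rq == task_rq(p) && !task_is_waking(p)))
                            return rq;
                    raw_spin_unlock(&rq->lock);
            }
    }

With the wait centralized here, ttwu() and wake_up_new_task() can take
cpu_rq(cpu)->lock directly after set_task_cpu(), and the open-coded
TASK_WAKING loop in set_cpus_allowed_ptr() (removed in the last hunk) is no
longer needed.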

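Note on "sched: Fix SMT scheduler regression in find_busiest_queue()": the
imbalance passed in is expressed in unscaled weighted load, so comparing it
against a load that had already been scaled by cpu power could wrongly skip a
busy SMT sibling as a migration source.  The hunk below keeps the imbalance
check in raw weighted_cpuload() terms and applies the power scaling only when
ranking candidate runqueues.  Net effect of the per-cpu loop body, shown as a
sketch (local names as in the diff, surrounding loop omitted):

    wl = weighted_cpuload(i);               /* raw, unscaled load */

    /* Compare like with like: imbalance is also unscaled. */
    if (capacity && rq->nr_running == 1 && wl > imbalance)
            continue;

    /*
     * For picking the busiest runqueue, scale by cpu power so load is
     * pulled away from cpus running at lower capacity.
     */
    wl = (wl * SCHED_LOAD_SCALE) / power;
    if (wl > max_load) {
            max_load = wl;
            busiest = rq;
    }
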
diff --combined kernel/sched.c
@@@ -645,11 -645,6 +645,11 @@@ static inline int cpu_of(struct rq *rq
  #endif
  }
  
 +#define rcu_dereference_check_sched_domain(p) \
 +      rcu_dereference_check((p), \
 +                            rcu_read_lock_sched_held() || \
 +                            lockdep_is_held(&sched_domains_mutex))
 +
  /*
   * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
   * See detach_destroy_domains: synchronize_sched for details.
   *
   * The domain tree of any CPU may only be accessed from within
   * preempt-disabled sections.
   */
  #define for_each_domain(cpu, __sd) \
 -      for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 +      for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
  
  #define cpu_rq(cpu)           (&per_cpu(runqueues, (cpu)))
  #define this_rq()             (&__get_cpu_var(runqueues))
@@@ -945,6 -940,19 +945,19 @@@ static inline void finish_lock_switch(s
  }
  #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
  
+ /*
+  * Check whether the task is waking; we use this to synchronize against
+  * ttwu() so that task_cpu() reports a stable number.
+  *
+  * We need to make an exception for PF_STARTING tasks because the fork
+  * path might require task_rq_lock() to work, e.g. it can call
+  * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+  */
+ static inline int task_is_waking(struct task_struct *p)
+ {
+       return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+ }
+
  /*
   * __task_rq_lock - lock the runqueue a given task resides on.
   * Must be called interrupts disabled.
   */
  static inline struct rq *__task_rq_lock(struct task_struct *p)
        __acquires(rq->lock)
  {
+       struct rq *rq;
        for (;;) {
-               struct rq *rq = task_rq(p);
+               while (task_is_waking(p))
+                       cpu_relax();
+               rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p)))
+               if (likely(rq == task_rq(p) && !task_is_waking(p)))
                        return rq;
                raw_spin_unlock(&rq->lock);
        }
@@@ -972,10 -984,12 +989,12 @@@ static struct rq *task_rq_lock(struct t
        struct rq *rq;
  
        for (;;) {
+               while (task_is_waking(p))
+                       cpu_relax();
                local_irq_save(*flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p)))
+               if (likely(rq == task_rq(p) && !task_is_waking(p)))
                        return rq;
                raw_spin_unlock_irqrestore(&rq->lock, *flags);
        }
@@@ -1536,7 -1550,7 +1555,7 @@@ static unsigned long target_load(int cp
  
  static struct sched_group *group_of(int cpu)
  {
 -      struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
 +      struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
  
        if (!sd)
                return NULL;
@@@ -2413,14 -2427,27 +2432,27 @@@ static int try_to_wake_up(struct task_s
        __task_rq_unlock(rq);
  
        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-       if (cpu != orig_cpu)
+       if (cpu != orig_cpu) {
+               /*
+                * Since we migrate the task without holding any rq->lock,
+                * we need to be careful with task_rq_lock(), since that
+                * might end up locking an invalid rq.
+                */
                set_task_cpu(p, cpu);
+       }
  
-       rq = __task_rq_lock(p);
+       rq = cpu_rq(cpu);
+       raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
  
+       /*
+        * We migrated the task without holding either rq->lock, however
+        * since the task is not on the task list itself, nobody else
+        * will try and migrate the task, hence the rq should match the
+        * cpu we just moved it to.
+        */
+       WARN_ON(task_cpu(p) != cpu);
        WARN_ON(p->state != TASK_WAKING);
-       cpu = task_cpu(p);
  
  #ifdef CONFIG_SCHEDSTATS
        schedstat_inc(rq, ttwu_count);
@@@ -2668,7 -2695,13 +2700,13 @@@ void wake_up_new_task(struct task_struc
        set_task_cpu(p, cpu);
  #endif
  
-       rq = task_rq_lock(p, &flags);
+       /*
+        * Since the task is not on the rq and we still have TASK_WAKING set
+        * nobody else will migrate this task.
+        */
+       rq = cpu_rq(cpu);
+       raw_spin_lock_irqsave(&rq->lock, flags);
        BUG_ON(p->state != TASK_WAKING);
        p->state = TASK_RUNNING;
        update_rq_clock(rq);
@@@ -2799,13 -2832,7 +2837,13 @@@ static void finish_task_switch(struct r
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
 -      perf_event_task_sched_in(current, cpu_of(rq));
 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 +      local_irq_disable();
 +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 +      perf_event_task_sched_in(current);
 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 +      local_irq_enable();
 +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
        finish_lock_switch(rq, prev);
  
        fire_sched_in_preempt_notifiers(current);
@@@ -4130,12 -4157,23 +4168,23 @@@ find_busiest_queue(struct sched_group *
                        continue;
  
                rq = cpu_rq(i);
-               wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
-               wl /= power;
+               wl = weighted_cpuload(i);
  
+               /*
+                * When comparing with imbalance, use weighted_cpuload()
+                * which is not scaled with the cpu power.
+                */
                if (capacity && rq->nr_running == 1 && wl > imbalance)
                        continue;
  
+               /*
+                * For the load comparisons with the other cpu's, consider
+                * the weighted_cpuload() scaled with the cpu power, so that
+                * the load can be moved away from the cpu that is potentially
+                * running at a lower capacity.
+                */
+               wl = (wl * SCHED_LOAD_SCALE) / power;
                if (wl > max_load) {
                        max_load = wl;
                        busiest = rq;
@@@ -4899,7 -4937,7 +4948,7 @@@ static void run_rebalance_domains(struc
  
  static inline int on_null_domain(int cpu)
  {
 -      return !rcu_dereference(cpu_rq(cpu)->sd);
 +      return !rcu_dereference_sched(cpu_rq(cpu)->sd);
  }
  
  /*
@@@ -5320,7 -5358,7 +5369,7 @@@ void scheduler_tick(void
        curr->sched_class->task_tick(rq, curr, 0);
        raw_spin_unlock(&rq->lock);
  
 -      perf_event_task_tick(curr, cpu);
 +      perf_event_task_tick(curr);
  
  #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
@@@ -5534,7 -5572,7 +5583,7 @@@ need_resched_nonpreemptible
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
 -              perf_event_task_sched_out(prev, next, cpu);
 +              perf_event_task_sched_out(prev, next);
  
                rq->nr_switches++;
                rq->curr = next;
@@@ -7156,27 -7194,8 +7205,8 @@@ int set_cpus_allowed_ptr(struct task_st
        struct rq *rq;
        int ret = 0;
  
-       /*
-        * Since we rely on wake-ups to migrate sleeping tasks, don't change
-        * the ->cpus_allowed mask from under waking tasks, which would be
-        * possible when we change rq->lock in ttwu(), so synchronize against
-        * TASK_WAKING to avoid that.
-        *
-        * Make an exception for freshly cloned tasks, since cpuset namespaces
-        * might move the task about, we have to validate the target in
-        * wake_up_new_task() anyway since the cpu might have gone away.
-        */
- again:
-       while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
-               cpu_relax();
        rq = task_rq_lock(p, &flags);
  
-       if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
-               task_rq_unlock(rq, &flags);
-               goto again;
-       }
        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
                goto out;