Merge branch 'ipi-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 4 Apr 2009 00:33:30 +0000 (17:33 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 4 Apr 2009 00:33:30 +0000 (17:33 -0700)
* 'ipi-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  s390: remove arch specific smp_send_stop()
  panic: clean up kernel/panic.c
  panic, smp: provide smp_send_stop() wrapper on UP too
  panic: decrease oops_in_progress only after having done the panic
  generic-ipi: eliminate WARN_ON()s during oops/panic
  generic-ipi: cleanups
  generic-ipi: remove CSD_FLAG_WAIT
  generic-ipi: remove kmalloc()
  generic IPI: simplify barriers and locking
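
For context on the generic-ipi entries above: the call-site hunks in kernel/sched.c and kernel/softirq.c below add an explicit third argument to __smp_call_function_single(). A minimal sketch of the resulting call pattern, assuming that argument is a wait flag replacing CSD_FLAG_WAIT; the helper name here is hypothetical and only illustrates the shape of the call:

/*
 * Sketch of the post-cleanup IPI call pattern: instead of encoding
 * "wait" in csd->flags via CSD_FLAG_WAIT, callers pass it explicitly.
 * kick_remote_cpu() is a made-up name; the real call sites in this
 * merge are hrtick_start() and __try_remote_softirq().
 */
static void kick_remote_cpu(int cpu, struct call_single_data *csd)
{
	/* 0 == fire-and-forget: do not wait for the remote call to finish */
	__smp_call_function_single(cpu, csd, 0);
}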

arch/s390/include/asm/smp.h
kernel/sched.c
kernel/softirq.c

@@@ -50,7 -50,12 +50,7 @@@ extern void machine_power_off_smp(void)
   
  #define PROC_CHANGE_PENALTY   20              /* Schedule penalty */
  
 -#define raw_smp_processor_id()        (S390_lowcore.cpu_data.cpu_nr)
 -
 -static inline __u16 hard_smp_processor_id(void)
 -{
 -      return stap();
 -}
 +#define raw_smp_processor_id()        (S390_lowcore.cpu_nr)
  
  /*
   * returns 1 if cpu is in stopped/check stopped state or not operational
@@@ -92,12 -97,6 +92,6 @@@ extern void arch_send_call_function_ipi
  #endif
  
  #ifndef CONFIG_SMP
- static inline void smp_send_stop(void)
- {
-       /* Disable all interrupts/machine checks */
-       __load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK);
- }
  #define hard_smp_processor_id()               0
  #define smp_cpu_not_running(cpu)      1
  #endif
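
The hunk above drops the arch-specific UP stub of smp_send_stop() because the "panic, smp: provide smp_send_stop() wrapper on UP too" change supplies a generic fallback. A minimal sketch of such a UP fallback, assuming it lives in the generic SMP header; treat it as illustrative rather than the exact upstream definition:

/*
 * Illustrative !CONFIG_SMP fallback: with no other CPUs to stop,
 * smp_send_stop() can be a no-op, which is all panic() needs.
 */
#ifndef CONFIG_SMP
static inline void smp_send_stop(void)
{
}
#endif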
diff --combined kernel/sched.c
@@@ -331,13 -331,6 +331,13 @@@ static DEFINE_PER_CPU(struct rt_rq, ini
   */
  static DEFINE_SPINLOCK(task_group_lock);
  
 +#ifdef CONFIG_SMP
 +static int root_task_group_empty(void)
 +{
 +      return list_empty(&root_task_group.children);
 +}
 +#endif
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@@ -398,13 -391,6 +398,13 @@@ static inline void set_task_rq(struct t
  
  #else
  
 +#ifdef CONFIG_SMP
 +static int root_task_group_empty(void)
 +{
 +      return 1;
 +}
 +#endif
 +
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
  static inline struct task_group *task_group(struct task_struct *p)
  {
@@@ -481,17 -467,11 +481,17 @@@ struct rt_rq 
        struct rt_prio_array active;
        unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 -      int highest_prio; /* highest queued rt task prio */
 +      struct {
 +              int curr; /* highest queued rt task prio */
 +#ifdef CONFIG_SMP
 +              int next; /* next highest */
 +#endif
 +      } highest_prio;
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
 +      struct plist_head pushable_tasks;
  #endif
        int rt_throttled;
        u64 rt_time;
@@@ -569,6 -549,7 +569,6 @@@ struct rq 
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 -      unsigned char idle_at_tick;
  #ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
        struct root_domain *rd;
        struct sched_domain *sd;
  
 +      unsigned char idle_at_tick;
        /* For active balancing */
        int active_balance;
        int push_cpu;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
 -      unsigned int yld_exp_empty;
 -      unsigned int yld_act_empty;
 -      unsigned int yld_both_empty;
        unsigned int yld_count;
  
        /* schedule() stats */
@@@ -1110,7 -1093,7 +1110,7 @@@ static void hrtick_start(struct rq *rq
        if (rq == this_rq()) {
                hrtimer_restart(timer);
        } else if (!rq->hrtick_csd_pending) {
-               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                rq->hrtick_csd_pending = 1;
        }
  }
@@@ -1200,10 -1183,10 +1200,10 @@@ static void resched_task(struct task_st
  
        assert_spin_locked(&task_rq(p)->lock);
  
 -      if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 +      if (test_tsk_need_resched(p))
                return;
  
 -      set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 +      set_tsk_need_resched(p);
  
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@@ -1259,7 -1242,7 +1259,7 @@@ void wake_up_idle_cpu(int cpu
         * lockless. The worst case is that the other CPU runs the
         * idle task through an additional NOOP schedule()
         */
 -      set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
 +      set_tsk_need_resched(rq->idle);
  
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
@@@ -1627,42 -1610,21 +1627,42 @@@ static inline void update_shares_locked
  
  #endif
  
 +#ifdef CONFIG_PREEMPT
 +
  /*
 - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + * fair double_lock_balance: Safely acquires both rq->locks in a fair
 + * way at the expense of forcing extra atomic operations in all
 + * invocations.  This assures that the double_lock is acquired using the
 + * same underlying policy as the spinlock_t on this architecture, which
 + * reduces latency compared to the unfair variant below.  However, it
 + * also adds more overhead and therefore may reduce throughput.
   */
 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +      __releases(this_rq->lock)
 +      __acquires(busiest->lock)
 +      __acquires(this_rq->lock)
 +{
 +      spin_unlock(&this_rq->lock);
 +      double_rq_lock(this_rq, busiest);
 +
 +      return 1;
 +}
 +
 +#else
 +/*
 + * Unfair double_lock_balance: Optimizes throughput at the expense of
 + * latency by eliminating extra atomic operations when the locks are
 + * already in proper order on entry.  This favors lower cpu-ids and will
 + * grant the double lock to lower cpus over higher ids under contention,
 + * regardless of entry order into the function.
 + */
 +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
  {
        int ret = 0;
  
 -      if (unlikely(!irqs_disabled())) {
 -              /* printk() doesn't work good under rq->lock */
 -              spin_unlock(&this_rq->lock);
 -              BUG_ON(1);
 -      }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
        return ret;
  }
  
 +#endif /* CONFIG_PREEMPT */
 +
 +/*
 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + */
 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +{
 +      if (unlikely(!irqs_disabled())) {
 +              /* printk() doesn't work good under rq->lock */
 +              spin_unlock(&this_rq->lock);
 +              BUG_ON(1);
 +      }
 +
 +      return _double_lock_balance(this_rq, busiest);
 +}
 +
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
  {
@@@ -1759,9 -1705,6 +1759,9 @@@ static void update_avg(u64 *avg, u64 sa
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
 +      if (wakeup)
 +              p->se.start_runtime = p->se.sum_exec_runtime;
 +
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
 -      if (sleep && p->se.last_wakeup) {
 -              update_avg(&p->se.avg_overlap,
 -                         p->se.sum_exec_runtime - p->se.last_wakeup);
 -              p->se.last_wakeup = 0;
 +      if (sleep) {
 +              if (p->se.last_wakeup) {
 +                      update_avg(&p->se.avg_overlap,
 +                              p->se.sum_exec_runtime - p->se.last_wakeup);
 +                      p->se.last_wakeup = 0;
 +              } else {
 +                      update_avg(&p->se.avg_wakeup,
 +                              sysctl_sched_wakeup_granularity);
 +              }
        }
  
        sched_info_dequeued(p);
@@@ -2079,7 -2017,7 +2079,7 @@@ unsigned long wait_task_inactive(struc
                 * it must be off the runqueue _entirely_, and not
                 * preempted!
                 *
 -               * So if it wa still runnable (but just not actively
 +               * So if it was still runnable (but just not actively
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
                 */
@@@ -2329,7 -2267,7 +2329,7 @@@ static int try_to_wake_up(struct task_s
                sync = 0;
  
  #ifdef CONFIG_SMP
 -      if (sched_feat(LB_WAKEUP_UPDATE)) {
 +      if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                struct sched_domain *sd;
  
                this_cpu = raw_smp_processor_id();
@@@ -2407,22 -2345,6 +2407,22 @@@ out_activate
        activate_task(rq, p, 1);
        success = 1;
  
 +      /*
 +       * Only attribute actual wakeups done by this task.
 +       */
 +      if (!in_interrupt()) {
 +              struct sched_entity *se = &current->se;
 +              u64 sample = se->sum_exec_runtime;
 +
 +              if (se->last_wakeup)
 +                      sample -= se->last_wakeup;
 +              else
 +                      sample -= se->start_runtime;
 +              update_avg(&se->avg_wakeup, sample);
 +
 +              se->last_wakeup = se->sum_exec_runtime;
 +      }
 +
  out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
                p->sched_class->task_wake_up(rq, p);
  #endif
  out:
 -      current->se.last_wakeup = current->se.sum_exec_runtime;
 -
        task_rq_unlock(rq, &flags);
  
        return success;
@@@ -2462,8 -2386,6 +2462,8 @@@ static void __sched_fork(struct task_st
        p->se.prev_sum_exec_runtime     = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
 +      p->se.start_runtime             = 0;
 +      p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                = 0;
@@@ -2526,8 -2448,6 +2526,8 @@@ void sched_fork(struct task_struct *p, 
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
  #endif
 +      plist_node_init(&p->pushable_tasks, MAX_PRIO);
 +
        put_cpu();
  }
  
@@@ -2571,7 -2491,7 +2571,7 @@@ void wake_up_new_task(struct task_struc
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
  /**
 - * preempt_notifier_register - tell me when current is being being preempted & rescheduled
 + * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
@@@ -2668,12 -2588,6 +2668,12 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
 +#ifdef CONFIG_SMP
 +      int post_schedule = 0;
 +
 +      if (current->sched_class->needs_post_schedule)
 +              post_schedule = current->sched_class->needs_post_schedule(rq);
 +#endif
  
        rq->prev_mm = NULL;
  
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
 -      if (current->sched_class->post_schedule)
 +      if (post_schedule)
                current->sched_class->post_schedule(rq);
  #endif
  
@@@ -2999,7 -2913,6 +2999,7 @@@ int can_migrate_task(struct task_struc
                     struct sched_domain *sd, enum cpu_idle_type idle,
                     int *all_pinned)
  {
 +      int tsk_cache_hot = 0;
        /*
         * We do not migrate tasks that are:
         * 1) running (obviously), or
         * 2) too many balance attempts have failed.
         */
  
 -      if (!task_hot(p, rq->clock, sd) ||
 -                      sd->nr_balance_failed > sd->cache_nice_tries) {
 +      tsk_cache_hot = task_hot(p, rq->clock, sd);
 +      if (!tsk_cache_hot ||
 +              sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
 -              if (task_hot(p, rq->clock, sd)) {
 +              if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
                        schedstat_inc(p, se.nr_forced_migrations);
                }
                return 1;
        }
  
 -      if (task_hot(p, rq->clock, sd)) {
 +      if (tsk_cache_hot) {
                schedstat_inc(p, se.nr_failed_migrations_hot);
                return 0;
        }
@@@ -3075,16 -2987,6 +3075,16 @@@ next
        pulled++;
        rem_load_move -= p->se.load.weight;
  
 +#ifdef CONFIG_PREEMPT
 +      /*
 +       * NEWIDLE balancing is a source of latency, so preemptible kernels
 +       * will stop after the first task is pulled to minimize the critical
 +       * section.
 +       */
 +      if (idle == CPU_NEWLY_IDLE)
 +              goto out;
 +#endif
 +
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@@ -3131,15 -3033,9 +3131,15 @@@ static int move_tasks(struct rq *this_r
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
  
 +#ifdef CONFIG_PREEMPT
 +              /*
 +               * NEWIDLE balancing is a source of latency, so preemptible
 +               * kernels will stop after the first task is pulled to minimize
 +               * the critical section.
 +               */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
 -
 +#endif
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -3189,480 -3085,246 +3189,480 @@@ static int move_one_task(struct rq *thi
  
        return 0;
  }
 -
 +/********** Helpers for find_busiest_group ************************/
  /*
 - * find_busiest_group finds and returns the busiest CPU group within the
 - * domain. It calculates and returns the amount of weighted load which
 - * should be moved to restore balance via the imbalance parameter.
 + * sd_lb_stats - Structure to store the statistics of a sched_domain
 + *            during load balancing.
   */
 -static struct sched_group *
 -find_busiest_group(struct sched_domain *sd, int this_cpu,
 -                 unsigned long *imbalance, enum cpu_idle_type idle,
 -                 int *sd_idle, const struct cpumask *cpus, int *balance)
 -{
 -      struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 -      unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 -      unsigned long max_pull;
 -      unsigned long busiest_load_per_task, busiest_nr_running;
 -      unsigned long this_load_per_task, this_nr_running;
 -      int load_idx, group_imb = 0;
 +struct sd_lb_stats {
 +      struct sched_group *busiest; /* Busiest group in this sd */
 +      struct sched_group *this;  /* Local group in this sd */
 +      unsigned long total_load;  /* Total load of all groups in sd */
 +      unsigned long total_pwr;   /*   Total power of all groups in sd */
 +      unsigned long avg_load;    /* Average load across all groups in sd */
 +
 +      /** Statistics of this group */
 +      unsigned long this_load;
 +      unsigned long this_load_per_task;
 +      unsigned long this_nr_running;
 +
 +      /* Statistics of the busiest group */
 +      unsigned long max_load;
 +      unsigned long busiest_load_per_task;
 +      unsigned long busiest_nr_running;
 +
 +      int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -      int power_savings_balance = 1;
 -      unsigned long leader_nr_running = 0, min_load_per_task = 0;
 -      unsigned long min_nr_running = ULONG_MAX;
 -      struct sched_group *group_min = NULL, *group_leader = NULL;
 +      int power_savings_balance; /* Is powersave balance needed for this sd */
 +      struct sched_group *group_min; /* Least loaded group in sd */
 +      struct sched_group *group_leader; /* Group which relieves group_min */
 +      unsigned long min_load_per_task; /* load_per_task in group_min */
 +      unsigned long leader_nr_running; /* Nr running of group_leader */
 +      unsigned long min_nr_running; /* Nr running of group_min */
  #endif
 +};
 +
 +/*
 + * sg_lb_stats - stats of a sched_group required for load_balancing
 + */
 +struct sg_lb_stats {
 +      unsigned long avg_load; /*Avg load across the CPUs of the group */
 +      unsigned long group_load; /* Total load over the CPUs of the group */
 +      unsigned long sum_nr_running; /* Nr tasks running in the group */
 +      unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 +      unsigned long group_capacity;
 +      int group_imb; /* Is there an imbalance in the group ? */
 +};
 +
 +/**
 + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 + * @group: The group whose first cpu is to be returned.
 + */
 +static inline unsigned int group_first_cpu(struct sched_group *group)
 +{
 +      return cpumask_first(sched_group_cpus(group));
 +}
  
 -      max_load = this_load = total_load = total_pwr = 0;
 -      busiest_load_per_task = busiest_nr_running = 0;
 -      this_load_per_task = this_nr_running = 0;
 +/**
 + * get_sd_load_idx - Obtain the load index for a given sched domain.
 + * @sd: The sched_domain whose load_idx is to be obtained.
 + * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
 + */
 +static inline int get_sd_load_idx(struct sched_domain *sd,
 +                                      enum cpu_idle_type idle)
 +{
 +      int load_idx;
  
 -      if (idle == CPU_NOT_IDLE)
 +      switch (idle) {
 +      case CPU_NOT_IDLE:
                load_idx = sd->busy_idx;
 -      else if (idle == CPU_NEWLY_IDLE)
 +              break;
 +
 +      case CPU_NEWLY_IDLE:
                load_idx = sd->newidle_idx;
 -      else
 +              break;
 +      default:
                load_idx = sd->idle_idx;
 +              break;
 +      }
  
 -      do {
 -              unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
 -              int local_group;
 -              int i;
 -              int __group_imb = 0;
 -              unsigned int balance_cpu = -1, first_idle_cpu = 0;
 -              unsigned long sum_nr_running, sum_weighted_load;
 -              unsigned long sum_avg_load_per_task;
 -              unsigned long avg_load_per_task;
 +      return load_idx;
 +}
  
 -              local_group = cpumask_test_cpu(this_cpu,
 -                                             sched_group_cpus(group));
  
 -              if (local_group)
 -                      balance_cpu = cpumask_first(sched_group_cpus(group));
 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 +/**
 + * init_sd_power_savings_stats - Initialize power savings statistics for
 + * the given sched_domain, during load balancing.
 + *
 + * @sd: Sched domain whose power-savings statistics are to be initialized.
 + * @sds: Variable containing the statistics for sd.
 + * @idle: Idle status of the CPU at which we're performing load-balancing.
 + */
 +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
 +{
 +      /*
 +       * Busy processors will not participate in power savings
 +       * balance.
 +       */
 +      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 +              sds->power_savings_balance = 0;
 +      else {
 +              sds->power_savings_balance = 1;
 +              sds->min_nr_running = ULONG_MAX;
 +              sds->leader_nr_running = 0;
 +      }
 +}
  
 -              /* Tally up the load of all CPUs in the group */
 -              sum_weighted_load = sum_nr_running = avg_load = 0;
 -              sum_avg_load_per_task = avg_load_per_task = 0;
 +/**
 + * update_sd_power_savings_stats - Update the power saving stats for a
 + * sched_domain while performing load balancing.
 + *
 + * @group: sched_group belonging to the sched_domain under consideration.
 + * @sds: Variable containing the statistics of the sched_domain
 + * @local_group: Does group contain the CPU for which we're performing
 + *            load balancing ?
 + * @sgs: Variable containing the statistics of the group.
 + */
 +static inline void update_sd_power_savings_stats(struct sched_group *group,
 +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
 +{
  
 -              max_cpu_load = 0;
 -              min_cpu_load = ~0UL;
 +      if (!sds->power_savings_balance)
 +              return;
  
 -              for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 -                      struct rq *rq = cpu_rq(i);
 +      /*
 +       * If the local group is idle or completely loaded
 +       * no need to do power savings balance at this domain
 +       */
 +      if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
 +                              !sds->this_nr_running))
 +              sds->power_savings_balance = 0;
  
 -                      if (*sd_idle && rq->nr_running)
 -                              *sd_idle = 0;
 +      /*
 +       * If a group is already running at full capacity or idle,
 +       * don't include that group in power savings calculations
 +       */
 +      if (!sds->power_savings_balance ||
 +              sgs->sum_nr_running >= sgs->group_capacity ||
 +              !sgs->sum_nr_running)
 +              return;
  
 -                      /* Bias balancing toward cpus of our domain */
 -                      if (local_group) {
 -                              if (idle_cpu(i) && !first_idle_cpu) {
 -                                      first_idle_cpu = 1;
 -                                      balance_cpu = i;
 -                              }
 +      /*
 +       * Calculate the group which has the least non-idle load.
 +       * This is the group from where we need to pick up the load
 +       * for saving power
 +       */
 +      if ((sgs->sum_nr_running < sds->min_nr_running) ||
 +          (sgs->sum_nr_running == sds->min_nr_running &&
 +           group_first_cpu(group) > group_first_cpu(sds->group_min))) {
 +              sds->group_min = group;
 +              sds->min_nr_running = sgs->sum_nr_running;
 +              sds->min_load_per_task = sgs->sum_weighted_load /
 +                                              sgs->sum_nr_running;
 +      }
  
 -                              load = target_load(i, load_idx);
 -                      } else {
 -                              load = source_load(i, load_idx);
 -                              if (load > max_cpu_load)
 -                                      max_cpu_load = load;
 -                              if (min_cpu_load > load)
 -                                      min_cpu_load = load;
 -                      }
 +      /*
 +       * Calculate the group which is almost near its
 +       * capacity but still has some space to pick up some load
 +       * from other group and save more power
 +       */
 +      if (sgs->sum_nr_running > sgs->group_capacity - 1)
 +              return;
  
 -                      avg_load += load;
 -                      sum_nr_running += rq->nr_running;
 -                      sum_weighted_load += weighted_cpuload(i);
 +      if (sgs->sum_nr_running > sds->leader_nr_running ||
 +          (sgs->sum_nr_running == sds->leader_nr_running &&
 +           group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
 +              sds->group_leader = group;
 +              sds->leader_nr_running = sgs->sum_nr_running;
 +      }
 +}
  
 -                      sum_avg_load_per_task += cpu_avg_load_per_task(i);
 -              }
 +/**
 + * check_power_save_busiest_group - see if there is potential for some power-savings balance
 + * @sds: Variable containing the statistics of the sched_domain
 + *    under consideration.
 + * @this_cpu: Cpu at which we're currently performing load-balancing.
 + * @imbalance: Variable to store the imbalance.
 + *
 + * Description:
 + * Check if we have potential to perform some power-savings balance.
 + * If yes, set the busiest group to be the least loaded group in the
 + * sched_domain, so that it's CPUs can be put to idle.
 + *
 + * Returns 1 if there is potential to perform power-savings balance.
 + * Else returns 0.
 + */
 +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 +                                      int this_cpu, unsigned long *imbalance)
 +{
 +      if (!sds->power_savings_balance)
 +              return 0;
  
 -              /*
 -               * First idle cpu or the first cpu(busiest) in this sched group
 -               * is eligible for doing load balancing at this and above
 -               * domains. In the newly idle case, we will allow all the cpu's
 -               * to do the newly idle load balance.
 -               */
 -              if (idle != CPU_NEWLY_IDLE && local_group &&
 -                  balance_cpu != this_cpu && balance) {
 -                      *balance = 0;
 -                      goto ret;
 -              }
 +      if (sds->this != sds->group_leader ||
 +                      sds->group_leader == sds->group_min)
 +              return 0;
  
 -              total_load += avg_load;
 -              total_pwr += group->__cpu_power;
 +      *imbalance = sds->min_load_per_task;
 +      sds->busiest = sds->group_min;
  
 -              /* Adjust by relative CPU power of the group */
 -              avg_load = sg_div_cpu_power(group,
 -                              avg_load * SCHED_LOAD_SCALE);
 +      if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 +              cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 +                      group_first_cpu(sds->group_leader);
 +      }
  
 +      return 1;
  
 -              /*
 -               * Consider the group unbalanced when the imbalance is larger
 -               * than the average weight of two tasks.
 -               *
 -               * APZ: with cgroup the avg task weight can vary wildly and
 -               *      might not be a suitable number - should we keep a
 -               *      normalized nr_running number somewhere that negates
 -               *      the hierarchy?
 -               */
 -              avg_load_per_task = sg_div_cpu_power(group,
 -                              sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +}
 +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
 +{
 +      return;
 +}
  
 -              if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 -                      __group_imb = 1;
 +static inline void update_sd_power_savings_stats(struct sched_group *group,
 +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
 +{
 +      return;
 +}
 +
 +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 +                                      int this_cpu, unsigned long *imbalance)
 +{
 +      return 0;
 +}
 +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 +
 +
 +/**
 + * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 + * @group: sched_group whose statistics are to be updated.
 + * @this_cpu: Cpu for which load balance is currently performed.
 + * @idle: Idle status of this_cpu
 + * @load_idx: Load index of sched_domain of this_cpu for load calc.
 + * @sd_idle: Idle status of the sched_domain containing group.
 + * @local_group: Does group contain this_cpu.
 + * @cpus: Set of cpus considered for load balancing.
 + * @balance: Should we balance.
 + * @sgs: variable to hold the statistics for this group.
 + */
 +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
 +                      enum cpu_idle_type idle, int load_idx, int *sd_idle,
 +                      int local_group, const struct cpumask *cpus,
 +                      int *balance, struct sg_lb_stats *sgs)
 +{
 +      unsigned long load, max_cpu_load, min_cpu_load;
 +      int i;
 +      unsigned int balance_cpu = -1, first_idle_cpu = 0;
 +      unsigned long sum_avg_load_per_task;
 +      unsigned long avg_load_per_task;
  
 -              group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 +      if (local_group)
 +              balance_cpu = group_first_cpu(group);
  
 +      /* Tally up the load of all CPUs in the group */
 +      sum_avg_load_per_task = avg_load_per_task = 0;
 +      max_cpu_load = 0;
 +      min_cpu_load = ~0UL;
 +
 +      for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 +              struct rq *rq = cpu_rq(i);
 +
 +              if (*sd_idle && rq->nr_running)
 +                      *sd_idle = 0;
 +
 +              /* Bias balancing toward cpus of our domain */
                if (local_group) {
 -                      this_load = avg_load;
 -                      this = group;
 -                      this_nr_running = sum_nr_running;
 -                      this_load_per_task = sum_weighted_load;
 -              } else if (avg_load > max_load &&
 -                         (sum_nr_running > group_capacity || __group_imb)) {
 -                      max_load = avg_load;
 -                      busiest = group;
 -                      busiest_nr_running = sum_nr_running;
 -                      busiest_load_per_task = sum_weighted_load;
 -                      group_imb = __group_imb;
 +                      if (idle_cpu(i) && !first_idle_cpu) {
 +                              first_idle_cpu = 1;
 +                              balance_cpu = i;
 +                      }
 +
 +                      load = target_load(i, load_idx);
 +              } else {
 +                      load = source_load(i, load_idx);
 +                      if (load > max_cpu_load)
 +                              max_cpu_load = load;
 +                      if (min_cpu_load > load)
 +                              min_cpu_load = load;
                }
  
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -              /*
 -               * Busy processors will not participate in power savings
 -               * balance.
 -               */
 -              if (idle == CPU_NOT_IDLE ||
 -                              !(sd->flags & SD_POWERSAVINGS_BALANCE))
 -                      goto group_next;
 +              sgs->group_load += load;
 +              sgs->sum_nr_running += rq->nr_running;
 +              sgs->sum_weighted_load += weighted_cpuload(i);
  
 -              /*
 -               * If the local group is idle or completely loaded
 -               * no need to do power savings balance at this domain
 -               */
 -              if (local_group && (this_nr_running >= group_capacity ||
 -                                  !this_nr_running))
 -                      power_savings_balance = 0;
 +              sum_avg_load_per_task += cpu_avg_load_per_task(i);
 +      }
  
 -              /*
 -               * If a group is already running at full capacity or idle,
 -               * don't include that group in power savings calculations
 -               */
 -              if (!power_savings_balance || sum_nr_running >= group_capacity
 -                  || !sum_nr_running)
 -                      goto group_next;
 +      /*
 +       * First idle cpu or the first cpu(busiest) in this sched group
 +       * is eligible for doing load balancing at this and above
 +       * domains. In the newly idle case, we will allow all the cpu's
 +       * to do the newly idle load balance.
 +       */
 +      if (idle != CPU_NEWLY_IDLE && local_group &&
 +          balance_cpu != this_cpu && balance) {
 +              *balance = 0;
 +              return;
 +      }
  
 -              /*
 -               * Calculate the group which has the least non-idle load.
 -               * This is the group from where we need to pick up the load
 -               * for saving power
 -               */
 -              if ((sum_nr_running < min_nr_running) ||
 -                  (sum_nr_running == min_nr_running &&
 -                   cpumask_first(sched_group_cpus(group)) >
 -                   cpumask_first(sched_group_cpus(group_min)))) {
 -                      group_min = group;
 -                      min_nr_running = sum_nr_running;
 -                      min_load_per_task = sum_weighted_load /
 -                                              sum_nr_running;
 -              }
 +      /* Adjust by relative CPU power of the group */
 +      sgs->avg_load = sg_div_cpu_power(group,
 +                      sgs->group_load * SCHED_LOAD_SCALE);
  
 -              /*
 -               * Calculate the group which is almost near its
 -               * capacity but still has some space to pick up some load
 -               * from other group and save more power
 -               */
 -              if (sum_nr_running <= group_capacity - 1) {
 -                      if (sum_nr_running > leader_nr_running ||
 -                          (sum_nr_running == leader_nr_running &&
 -                           cpumask_first(sched_group_cpus(group)) <
 -                           cpumask_first(sched_group_cpus(group_leader)))) {
 -                              group_leader = group;
 -                              leader_nr_running = sum_nr_running;
 -                      }
 +
 +      /*
 +       * Consider the group unbalanced when the imbalance is larger
 +       * than the average weight of two tasks.
 +       *
 +       * APZ: with cgroup the avg task weight can vary wildly and
 +       *      might not be a suitable number - should we keep a
 +       *      normalized nr_running number somewhere that negates
 +       *      the hierarchy?
 +       */
 +      avg_load_per_task = sg_div_cpu_power(group,
 +                      sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +
 +      if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 +              sgs->group_imb = 1;
 +
 +      sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 +
 +}
 +
 +/**
 + * update_sd_lb_stats - Update sched_group's statistics for load balancing.
 + * @sd: sched_domain whose statistics are to be updated.
 + * @this_cpu: Cpu for which load balance is currently performed.
 + * @idle: Idle status of this_cpu
 + * @sd_idle: Idle status of the sched_domain containing group.
 + * @cpus: Set of cpus considered for load balancing.
 + * @balance: Should we balance.
 + * @sds: variable to hold the statistics for this sched_domain.
 + */
 +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 +                      enum cpu_idle_type idle, int *sd_idle,
 +                      const struct cpumask *cpus, int *balance,
 +                      struct sd_lb_stats *sds)
 +{
 +      struct sched_group *group = sd->groups;
 +      struct sg_lb_stats sgs;
 +      int load_idx;
 +
 +      init_sd_power_savings_stats(sd, sds, idle);
 +      load_idx = get_sd_load_idx(sd, idle);
 +
 +      do {
 +              int local_group;
 +
 +              local_group = cpumask_test_cpu(this_cpu,
 +                                             sched_group_cpus(group));
 +              memset(&sgs, 0, sizeof(sgs));
 +              update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
 +                              local_group, cpus, balance, &sgs);
 +
 +              if (local_group && balance && !(*balance))
 +                      return;
 +
 +              sds->total_load += sgs.group_load;
 +              sds->total_pwr += group->__cpu_power;
 +
 +              if (local_group) {
 +                      sds->this_load = sgs.avg_load;
 +                      sds->this = group;
 +                      sds->this_nr_running = sgs.sum_nr_running;
 +                      sds->this_load_per_task = sgs.sum_weighted_load;
 +              } else if (sgs.avg_load > sds->max_load &&
 +                         (sgs.sum_nr_running > sgs.group_capacity ||
 +                              sgs.group_imb)) {
 +                      sds->max_load = sgs.avg_load;
 +                      sds->busiest = group;
 +                      sds->busiest_nr_running = sgs.sum_nr_running;
 +                      sds->busiest_load_per_task = sgs.sum_weighted_load;
 +                      sds->group_imb = sgs.group_imb;
                }
 -group_next:
 -#endif
 +
 +              update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
  
 -      if (!busiest || this_load >= max_load || busiest_nr_running == 0)
 -              goto out_balanced;
 -
 -      avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 +}
  
 -      if (this_load >= avg_load ||
 -                      100*max_load <= sd->imbalance_pct*this_load)
 -              goto out_balanced;
 +/**
 + * fix_small_imbalance - Calculate the minor imbalance that exists
 + *                    amongst the groups of a sched_domain, during
 + *                    load balancing.
 + * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 + * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 + * @imbalance: Variable to store the imbalance.
 + */
 +static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 +                              int this_cpu, unsigned long *imbalance)
 +{
 +      unsigned long tmp, pwr_now = 0, pwr_move = 0;
 +      unsigned int imbn = 2;
 +
 +      if (sds->this_nr_running) {
 +              sds->this_load_per_task /= sds->this_nr_running;
 +              if (sds->busiest_load_per_task >
 +                              sds->this_load_per_task)
 +                      imbn = 1;
 +      } else
 +              sds->this_load_per_task =
 +                      cpu_avg_load_per_task(this_cpu);
  
 -      busiest_load_per_task /= busiest_nr_running;
 -      if (group_imb)
 -              busiest_load_per_task = min(busiest_load_per_task, avg_load);
 +      if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
 +                      sds->busiest_load_per_task * imbn) {
 +              *imbalance = sds->busiest_load_per_task;
 +              return;
 +      }
  
        /*
 -       * We're trying to get all the cpus to the average_load, so we don't
 -       * want to push ourselves above the average load, nor do we wish to
 -       * reduce the max loaded cpu below the average load, as either of these
 -       * actions would just result in more rebalancing later, and ping-pong
 -       * tasks around. Thus we look for the minimum possible imbalance.
 -       * Negative imbalances (*we* are more loaded than anyone else) will
 -       * be counted as no imbalance for these purposes -- we can't fix that
 -       * by pulling tasks to us. Be careful of negative numbers as they'll
 -       * appear as very large values with unsigned longs.
 +       * OK, we don't have enough imbalance to justify moving tasks,
 +       * however we may be able to increase total CPU power used by
 +       * moving them.
         */
 -      if (max_load <= busiest_load_per_task)
 -              goto out_balanced;
  
 +      pwr_now += sds->busiest->__cpu_power *
 +                      min(sds->busiest_load_per_task, sds->max_load);
 +      pwr_now += sds->this->__cpu_power *
 +                      min(sds->this_load_per_task, sds->this_load);
 +      pwr_now /= SCHED_LOAD_SCALE;
 +
 +      /* Amount of load we'd subtract */
 +      tmp = sg_div_cpu_power(sds->busiest,
 +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      if (sds->max_load > tmp)
 +              pwr_move += sds->busiest->__cpu_power *
 +                      min(sds->busiest_load_per_task, sds->max_load - tmp);
 +
 +      /* Amount of load we'd add */
 +      if (sds->max_load * sds->busiest->__cpu_power <
 +              sds->busiest_load_per_task * SCHED_LOAD_SCALE)
 +              tmp = sg_div_cpu_power(sds->this,
 +                      sds->max_load * sds->busiest->__cpu_power);
 +      else
 +              tmp = sg_div_cpu_power(sds->this,
 +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      pwr_move += sds->this->__cpu_power *
 +                      min(sds->this_load_per_task, sds->this_load + tmp);
 +      pwr_move /= SCHED_LOAD_SCALE;
 +
 +      /* Move if we gain throughput */
 +      if (pwr_move > pwr_now)
 +              *imbalance = sds->busiest_load_per_task;
 +}
 +
 +/**
 + * calculate_imbalance - Calculate the amount of imbalance present within the
 + *                     groups of a given sched_domain during load balance.
 + * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 + * @this_cpu: Cpu for which currently load balance is being performed.
 + * @imbalance: The variable to store the imbalance.
 + */
 +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 +              unsigned long *imbalance)
 +{
 +      unsigned long max_pull;
        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
 -      if (max_load < avg_load) {
 +      if (sds->max_load < sds->avg_load) {
                *imbalance = 0;
 -              goto small_imbalance;
 +              return fix_small_imbalance(sds, this_cpu, imbalance);
        }
  
        /* Don't want to pull so many tasks that a group would go idle */
 -      max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 +      max_pull = min(sds->max_load - sds->avg_load,
 +                      sds->max_load - sds->busiest_load_per_task);
  
        /* How much load to actually move to equalise the imbalance */
 -      *imbalance = min(max_pull * busiest->__cpu_power,
 -                              (avg_load - this_load) * this->__cpu_power)
 +      *imbalance = min(max_pull * sds->busiest->__cpu_power,
 +              (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                        / SCHED_LOAD_SCALE;
  
        /*
         * a think about bumping its value to force at least one task to be
         * moved
         */
 -      if (*imbalance < busiest_load_per_task) {
 -              unsigned long tmp, pwr_now, pwr_move;
 -              unsigned int imbn;
 -
 -small_imbalance:
 -              pwr_move = pwr_now = 0;
 -              imbn = 2;
 -              if (this_nr_running) {
 -                      this_load_per_task /= this_nr_running;
 -                      if (busiest_load_per_task > this_load_per_task)
 -                              imbn = 1;
 -              } else
 -                      this_load_per_task = cpu_avg_load_per_task(this_cpu);
 +      if (*imbalance < sds->busiest_load_per_task)
 +              return fix_small_imbalance(sds, this_cpu, imbalance);
  
 -              if (max_load - this_load + busiest_load_per_task >=
 -                                      busiest_load_per_task * imbn) {
 -                      *imbalance = busiest_load_per_task;
 -                      return busiest;
 -              }
 +}
 +/******* find_busiest_group() helpers end here *********************/
  
 -              /*
 -               * OK, we don't have enough imbalance to justify moving tasks,
 -               * however we may be able to increase total CPU power used by
 -               * moving them.
 -               */
 +/**
 + * find_busiest_group - Returns the busiest group within the sched_domain
 + * if there is an imbalance. If there isn't an imbalance, and
 + * the user has opted for power-savings, it returns a group whose
 + * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 + * such a group exists.
 + *
 + * Also calculates the amount of weighted load which should be moved
 + * to restore balance.
 + *
 + * @sd: The sched_domain whose busiest group is to be returned.
 + * @this_cpu: The cpu for which load balancing is currently being performed.
 + * @imbalance: Variable which stores amount of weighted load which should
 + *            be moved to restore balance/put a group to idle.
 + * @idle: The idle status of this_cpu.
 + * @sd_idle: The idleness of sd
 + * @cpus: The set of CPUs under consideration for load-balancing.
 + * @balance: Pointer to a variable indicating if this_cpu
 + *    is the appropriate cpu to perform load balancing at this_level.
 + *
 + * Returns:   - the busiest group if imbalance exists.
 + *            - If no imbalance and user has opted for power-savings balance,
 + *               return the least loaded group whose CPUs can be
 + *               put to idle by rebalancing its tasks onto our group.
 + */
 +static struct sched_group *
 +find_busiest_group(struct sched_domain *sd, int this_cpu,
 +                 unsigned long *imbalance, enum cpu_idle_type idle,
 +                 int *sd_idle, const struct cpumask *cpus, int *balance)
 +{
 +      struct sd_lb_stats sds;
  
 -              pwr_now += busiest->__cpu_power *
 -                              min(busiest_load_per_task, max_load);
 -              pwr_now += this->__cpu_power *
 -                              min(this_load_per_task, this_load);
 -              pwr_now /= SCHED_LOAD_SCALE;
 -
 -              /* Amount of load we'd subtract */
 -              tmp = sg_div_cpu_power(busiest,
 -                              busiest_load_per_task * SCHED_LOAD_SCALE);
 -              if (max_load > tmp)
 -                      pwr_move += busiest->__cpu_power *
 -                              min(busiest_load_per_task, max_load - tmp);
 -
 -              /* Amount of load we'd add */
 -              if (max_load * busiest->__cpu_power <
 -                              busiest_load_per_task * SCHED_LOAD_SCALE)
 -                      tmp = sg_div_cpu_power(this,
 -                                      max_load * busiest->__cpu_power);
 -              else
 -                      tmp = sg_div_cpu_power(this,
 -                              busiest_load_per_task * SCHED_LOAD_SCALE);
 -              pwr_move += this->__cpu_power *
 -                              min(this_load_per_task, this_load + tmp);
 -              pwr_move /= SCHED_LOAD_SCALE;
 +      memset(&sds, 0, sizeof(sds));
  
 -              /* Move if we gain throughput */
 -              if (pwr_move > pwr_now)
 -                      *imbalance = busiest_load_per_task;
 -      }
 +      /*
 +       * Compute the various statistics relavent for load balancing at
 +       * this level.
 +       */
 +      update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
 +                                      balance, &sds);
 +
 +      /* Cases where imbalance does not exist from POV of this_cpu */
 +      /* 1) this_cpu is not the appropriate cpu to perform load balancing
 +       *    at this level.
 +       * 2) There is no busy sibling group to pull from.
 +       * 3) This group is the busiest group.
 +       * 4) This group is more busy than the avg busieness at this
 +       *    sched_domain.
 +       * 5) The imbalance is within the specified limit.
 +       * 6) Any rebalance would lead to ping-pong
 +       */
 +      if (balance && !(*balance))
 +              goto ret;
  
 -      return busiest;
 +      if (!sds.busiest || sds.busiest_nr_running == 0)
 +              goto out_balanced;
  
 -out_balanced:
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 -              goto ret;
 +      if (sds.this_load >= sds.max_load)
 +              goto out_balanced;
  
 -      if (this == group_leader && group_leader != group_min) {
 -              *imbalance = min_load_per_task;
 -              if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 -                      cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 -                              cpumask_first(sched_group_cpus(group_leader));
 -              }
 -              return group_min;
 -      }
 -#endif
 +      sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 +
 +      if (sds.this_load >= sds.avg_load)
 +              goto out_balanced;
 +
 +      if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 +              goto out_balanced;
 +
 +      sds.busiest_load_per_task /= sds.busiest_nr_running;
 +      if (sds.group_imb)
 +              sds.busiest_load_per_task =
 +                      min(sds.busiest_load_per_task, sds.avg_load);
 +
 +      /*
 +       * We're trying to get all the cpus to the average_load, so we don't
 +       * want to push ourselves above the average load, nor do we wish to
 +       * reduce the max loaded cpu below the average load, as either of these
 +       * actions would just result in more rebalancing later, and ping-pong
 +       * tasks around. Thus we look for the minimum possible imbalance.
 +       * Negative imbalances (*we* are more loaded than anyone else) will
 +       * be counted as no imbalance for these purposes -- we can't fix that
 +       * by pulling tasks to us. Be careful of negative numbers as they'll
 +       * appear as very large values with unsigned longs.
 +       */
 +      if (sds.max_load <= sds.busiest_load_per_task)
 +              goto out_balanced;
 +
 +      /* Looks like there is an imbalance. Compute it */
 +      calculate_imbalance(&sds, this_cpu, imbalance);
 +      return sds.busiest;
 +
 +out_balanced:
 +      /*
 +       * There is no obvious imbalance. But check if we can do some balancing
 +       * to save power.
 +       */
 +      if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
 +              return sds.busiest;
  ret:
        *imbalance = 0;
        return NULL;
@@@ -4427,11 -4057,6 +4427,11 @@@ static void run_rebalance_domains(struc
  #endif
  }
  
 +static inline int on_null_domain(int cpu)
 +{
 +      return !rcu_dereference(cpu_rq(cpu)->sd);
 +}
 +
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   *
@@@ -4489,9 -4114,7 +4489,9 @@@ static inline void trigger_load_balance
            cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
 -      if (time_after_eq(jiffies, rq->next_balance))
 +      /* Don't need to rebalance while attached to NULL domain */
 +      if (time_after_eq(jiffies, rq->next_balance) &&
 +          likely(!on_null_domain(cpu)))
                raise_softirq(SCHED_SOFTIRQ);
  }
  
@@@ -4885,33 -4508,11 +4885,33 @@@ static inline void schedule_debug(struc
  #endif
  }
  
 +static void put_prev_task(struct rq *rq, struct task_struct *prev)
 +{
 +      if (prev->state == TASK_RUNNING) {
 +              u64 runtime = prev->se.sum_exec_runtime;
 +
 +              runtime -= prev->se.prev_sum_exec_runtime;
 +              runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
 +
 +              /*
 +               * In order to avoid avg_overlap growing stale when we are
 +               * indeed overlapping and hence not getting put to sleep, grow
 +               * the avg_overlap on preemption.
 +               *
 +               * We use the average preemption runtime because that
 +               * correlates to the amount of cache footprint a task can
 +               * build up.
 +               */
 +              update_avg(&prev->se.avg_overlap, runtime);
 +      }
 +      prev->sched_class->put_prev_task(rq, prev);
 +}
 +
  /*
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
 -pick_next_task(struct rq *rq, struct task_struct *prev)
 +pick_next_task(struct rq *rq)
  {
        const struct sched_class *class;
        struct task_struct *p;
  /*
   * schedule() is the main scheduler function.
   */
 -asmlinkage void __sched schedule(void)
 +asmlinkage void __sched __schedule(void)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
  
 -need_resched:
 -      preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@@ -4983,8 -4586,8 +4983,8 @@@ need_resched_nonpreemptible
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
  
 -      prev->sched_class->put_prev_task(rq, prev);
 -      next = pick_next_task(rq, prev);
 +      put_prev_task(rq, prev);
 +      next = pick_next_task(rq);
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
  
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
 +}
  
 +asmlinkage void __sched schedule(void)
 +{
 +need_resched:
 +      preempt_disable();
 +      __schedule();
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
  
 +#ifdef CONFIG_SMP
 +/*
 + * Look out! "owner" is an entirely speculative pointer
 + * access and not reliable.
 + */
 +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
 +{
 +      unsigned int cpu;
 +      struct rq *rq;
 +
 +      if (!sched_feat(OWNER_SPIN))
 +              return 0;
 +
 +#ifdef CONFIG_DEBUG_PAGEALLOC
 +      /*
 +       * Need to access the cpu field knowing that
 +       * DEBUG_PAGEALLOC could have unmapped it if
 +       * the mutex owner just released it and exited.
 +       */
 +      if (probe_kernel_address(&owner->cpu, cpu))
 +              goto out;
 +#else
 +      cpu = owner->cpu;
 +#endif
 +
 +      /*
 +       * Even if the access succeeded (likely case),
 +       * the cpu field may no longer be valid.
 +       */
 +      if (cpu >= nr_cpumask_bits)
 +              goto out;
 +
 +      /*
 +       * We need to validate that we can do a
 +       * get_cpu() and that we have the percpu area.
 +       */
 +      if (!cpu_online(cpu))
 +              goto out;
 +
 +      rq = cpu_rq(cpu);
 +
 +      for (;;) {
 +              /*
 +               * Owner changed, break to re-assess state.
 +               */
 +              if (lock->owner != owner)
 +                      break;
 +
 +              /*
 +               * Is that owner really running on that cpu?
 +               */
 +              if (task_thread_info(rq->curr) != owner || need_resched())
 +                      return 0;
 +
 +              cpu_relax();
 +      }
 +out:
 +      return 1;
 +}
 +#endif
 +
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@@ -5106,7 -4642,7 +5106,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
 -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
 +      } while (need_resched());
  }
  EXPORT_SYMBOL(preempt_schedule);
  
@@@ -5135,7 -4671,7 +5135,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
 -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
 +      } while (need_resched());
  }
  
  #endif /* CONFIG_PREEMPT */
@@@ -5196,17 -4732,11 +5196,17 @@@ void __wake_up_locked(wait_queue_head_
        __wake_up_common(q, mode, 1, 0, NULL);
  }
  
 +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 +{
 +      __wake_up_common(q, mode, 1, 0, key);
 +}
 +
  /**
 - * __wake_up_sync - wake up threads blocked on a waitqueue.
 + * __wake_up_sync_key - wake up threads blocked on a waitqueue.
   * @q: the waitqueue
   * @mode: which threads
   * @nr_exclusive: how many wake-one or wake-many threads to wake up
 + * @key: opaque value to be passed to wakeup targets
   *
   * The sync wakeup differs that the waker knows that it will schedule
   * away soon, so while the target thread will be woken up, it will not
   *
   * On UP it can prevent extra preemption.
   */
 -void
 -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 +                      int nr_exclusive, void *key)
  {
        unsigned long flags;
        int sync = 1;
                sync = 0;
  
        spin_lock_irqsave(&q->lock, flags);
 -      __wake_up_common(q, mode, nr_exclusive, sync, NULL);
 +      __wake_up_common(q, mode, nr_exclusive, sync, key);
        spin_unlock_irqrestore(&q->lock, flags);
  }
 +EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 +
 +/*
 + * __wake_up_sync - see __wake_up_sync_key()
 + */
 +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 +{
 +      __wake_up_sync_key(q, mode, nr_exclusive, NULL);
 +}
  EXPORT_SYMBOL_GPL(__wake_up_sync);    /* For internal use only */
  
  /**
@@@ -5624,7 -5145,7 +5624,7 @@@ SYSCALL_DEFINE1(nice, int, increment
        if (increment > 40)
                increment = 40;
  
 -      nice = PRIO_TO_NICE(current->static_prio) + increment;
 +      nice = TASK_NICE(current) + increment;
        if (nice < -20)
                nice = -20;
        if (nice > 19)
@@@ -6897,7 -6418,7 +6897,7 @@@ static void migrate_dead_tasks(unsigne
                if (!rq->nr_running)
                        break;
                update_rq_clock(rq);
 -              next = pick_next_task(rq, rq->curr);
 +              next = pick_next_task(rq);
                if (!next)
                        break;
                next->sched_class->put_prev_task(rq, next);
@@@ -8692,15 -8213,11 +8692,15 @@@ static void init_rt_rq(struct rt_rq *rt
        __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 -      rt_rq->highest_prio = MAX_RT_PRIO;
 +      rt_rq->highest_prio.curr = MAX_RT_PRIO;
 +#ifdef CONFIG_SMP
 +      rt_rq->highest_prio.next = MAX_RT_PRIO;
 +#endif
  #endif
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
 +      plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -10076,7 -9593,7 +10076,7 @@@ static void cpuacct_charge(struct task_
        struct cpuacct *ca;
        int cpu;
  
 -      if (!cpuacct_subsys.active)
 +      if (unlikely(!cpuacct_subsys.active))
                return;
  
        cpu = task_cpu(tsk);
diff --combined kernel/softirq.c
@@@ -180,7 -180,7 +180,7 @@@ asmlinkage void __do_softirq(void
        account_system_vtime(current);
  
        __local_bh_disable((unsigned long)__builtin_return_address(0));
 -      trace_softirq_enter();
 +      lockdep_softirq_enter();
  
        cpu = smp_processor_id();
  restart:
        if (pending)
                wakeup_softirqd();
  
 -      trace_softirq_exit();
 +      lockdep_softirq_exit();
  
        account_system_vtime(current);
        _local_bh_enable();
@@@ -496,7 -496,7 +496,7 @@@ static int __try_remote_softirq(struct 
                cp->flags = 0;
                cp->priv = softirq;
  
-               __smp_call_function_single(cpu, cp);
+               __smp_call_function_single(cpu, cp, 0);
                return 0;
        }
        return 1;