diff --git a/kernel/sched.c b/kernel/sched.c
index 3f5bfdc..756d981 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -267,6 +267,10 @@ struct task_group {
        struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+       uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
@@ -292,6 +296,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+       user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *     Every UID task group (including init_task_group aka UID-0) will
@@ -499,6 +509,14 @@ struct root_domain {
 #ifdef CONFIG_SMP
        struct cpupri cpupri;
 #endif
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       /*
+        * Preferred wake-up cpu nominated by sched_mc balance, used when most
+        * cpus in the system are idle, indicating overall very low system
+        * utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP (2).
+        */
+       unsigned int sched_mc_preferred_wakeup_cpu;
+#endif
 };
 
 /*
@@ -1439,9 +1457,10 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
+       unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
-       if (rq->nr_running)
-               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+       if (nr_running)
+               rq->avg_load_per_task = rq->load.weight / nr_running;
        else
                rq->avg_load_per_task = 0;
 
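The hunk above takes one snapshot of rq->nr_running via ACCESS_ONCE() and uses that snapshot for both the zero test and the division, so a concurrent drop of rq->nr_running to zero between the two can no longer cause a divide-by-zero (or a torn average). A minimal sketch of the same read-once-then-divide pattern, with invented names and a plain volatile read standing in for ACCESS_ONCE():

/* Sketch only: 'total' and 'counter' are invented fields that other
 * CPUs may update concurrently.  One snapshot feeds both the zero
 * check and the division, so the two cannot see different values. */
static unsigned long safe_average(const volatile unsigned long *total,
                                  const volatile unsigned long *counter)
{
        unsigned long n = *counter;     /* single read, like ACCESS_ONCE() */

        return n ? *total / n : 0;      /* n cannot change under us */
}
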
@@ -1586,6 +1605,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       int ret = 0;
+
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work well under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       if (unlikely(!spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) {
+                       spin_unlock(&this_rq->lock);
+                       spin_lock(&busiest->lock);
+                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+{
+       spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
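
The relocated double_lock_balance() avoids ABBA deadlock between two runqueue locks: when the trylock on busiest fails, it falls back to a fixed ordering (the runqueue at the lower address is locked first) and returns 1 so the caller knows this_rq->lock was dropped and re-acquired in the meantime. A minimal sketch of the same address-ordered double-locking idea, using plain pthread mutexes and invented names rather than kernel spinlocks:

#include <pthread.h>

struct resource { pthread_mutex_t lock; };

/* Lock 'other' while 'held' is already locked, without risking an
 * ABBA deadlock against a thread doing the same in the opposite
 * direction.  Returns 1 if 'held' had to be dropped and re-taken,
 * so the caller knows state protected by it may have changed. */
static int double_lock(struct resource *held, struct resource *other)
{
        if (pthread_mutex_trylock(&other->lock) == 0)
                return 0;                       /* fast path */

        if (other < held) {                     /* lower address first */
                pthread_mutex_unlock(&held->lock);
                pthread_mutex_lock(&other->lock);
                pthread_mutex_lock(&held->lock);
                return 1;                       /* 'held' was dropped */
        }
        pthread_mutex_lock(&other->lock);       /* already in order */
        return 0;
}
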
@@ -2782,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
                __release(rq2->lock);
 }
 
-/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
-{
-       int ret = 0;
-
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-       if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       spin_unlock(&this_rq->lock);
-                       spin_lock(&busiest->lock);
-                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-       }
-       return ret;
-}
-
-static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
-{
-       spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -3231,7 +3249,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
-                    cpumask_first(sched_group_cpus(group)) <
+                    cpumask_first(sched_group_cpus(group)) >
                     cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
@@ -3247,7 +3265,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
-                            cpumask_first(sched_group_cpus(group)) >
+                            cpumask_first(sched_group_cpus(group)) <
                             cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
@@ -3374,6 +3392,10 @@ out_balanced:
 
        if (this == group_leader && group_leader != group_min) {
                *imbalance = min_load_per_task;
+               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                               cpumask_first(sched_group_cpus(group_leader));
+               }
                return group_min;
        }
 #endif
@@ -3648,10 +3670,64 @@ redo:
        }
 
        if (!ld_moved) {
+               int active_balance = 0;
+
                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                        return -1;
+
+               if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                       return -1;
+
+               if (sd->nr_balance_failed++ < 2)
+                       return -1;
+
+               /*
+                * The only task running on a non-idle cpu can be moved to this
+                * cpu in an attempt to completely free up the other CPU
+                * package. The same method used to move tasks in load_balance()
+                * has been extended for load_balance_newidle() to speed up
+                * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
+                *
+                * The package power saving logic comes from
+                * find_busiest_group().  If there is no imbalance, then
+                * f_b_g() will return NULL.  However, when sched_mc={1,2},
+                * f_b_g() will select a group from which a running task may be
+                * pulled to this cpu in order to make the other package idle.
+                * If there is no opportunity to make a package idle and if
+                * there is no imbalance, then f_b_g() will return NULL and no
+                * action will be taken in load_balance_newidle().
+                *
+                * Under normal task pull operation due to imbalance, there
+                * will be more than one task in the source run queue and
+                * move_tasks() will succeed.  ld_moved will be true and this
+                * active balance code will not be triggered.
+                */
+
+               /* Lock busiest in correct order while this_rq is held */
+               double_lock_balance(this_rq, busiest);
+
+               /*
+                * don't kick the migration_thread if the curr
+                * task on busiest cpu can't be moved to this_cpu
+                */
+               if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       double_unlock_balance(this_rq, busiest);
+                       all_pinned = 1;
+                       return ld_moved;
+               }
+
+               if (!busiest->active_balance) {
+                       busiest->active_balance = 1;
+                       busiest->push_cpu = this_cpu;
+                       active_balance = 1;
+               }
+
+               double_unlock_balance(this_rq, busiest);
+               if (active_balance)
+                       wake_up_process(busiest->migration_thread);
+
        } else
                sd->nr_balance_failed = 0;
 
@@ -3675,7 +3751,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
        struct sched_domain *sd;
-       int pulled_task = -1;
+       int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
        cpumask_var_t tmpmask;
 
@@ -5892,7 +5968,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
-       ftrace_retfunc_init_task(idle);
+       ftrace_graph_init_task(idle);
 }
 
 /*
@@ -6576,7 +6652,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                        req = list_entry(rq->migration_queue.next,
                                         struct migration_req, list);
                        list_del_init(&req->list);
+                       spin_unlock_irq(&rq->lock);
                        complete(&req->done);
+                       spin_lock_irq(&rq->lock);
                }
                spin_unlock_irq(&rq->lock);
                break;
@@ -6631,7 +6709,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
        struct sched_group *group = sd->groups;
        char str[256];
 
-       cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
+       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
        cpumask_clear(groupmask);
 
        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -6684,7 +6762,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
                cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
-               cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group));
+               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                printk(KERN_CONT " %s", str);
 
                group = group->next;
@@ -6780,6 +6858,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
        }
        if (~cflags & pflags)
                return 0;
@@ -6924,7 +7004,7 @@ static cpumask_var_t cpu_isolated_map;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-       cpulist_parse(str, *cpu_isolated_map);
+       cpulist_parse(str, cpu_isolated_map);
        return 1;
 }
 
@@ -7715,8 +7795,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_var_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+       return 0;
 }
 
 /*
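
arch_update_cpu_topology() is a weak symbol: the default above now reports that nothing changed (0), while an architecture whose cpu core maps can change at runtime (e.g. a virtualized platform) is expected to override it and return 1, which the partition_sched_domains() hunks below use to bypass the old/new domain matching and rebuild every domain. A sketch of what such an override could look like; 'topology_generation' and 'last_seen_generation' are invented for illustration and are not existing kernel symbols:

/* Hypothetical arch-side override (sketch): the platform bumps
 * 'topology_generation' whenever its core maps change; a non-zero
 * return tells the scheduler to rebuild all sched domains. */
static unsigned long topology_generation;
static unsigned long last_seen_generation;

int arch_update_cpu_topology(void)
{
        if (topology_generation == last_seen_generation)
                return 0;               /* core maps unchanged */

        last_seen_generation = topology_generation;
        return 1;                       /* changed: force full rebuild */
}
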
@@ -7810,17 +7896,21 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                             struct sched_domain_attr *dattr_new)
 {
        int i, j, n;
+       int new_topology;
 
        mutex_lock(&sched_domains_mutex);
 
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
 
+       /* Let architecture update cpu core mappings. */
+       new_topology = arch_update_cpu_topology();
+
        n = doms_new ? ndoms_new : 0;
 
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < n; j++) {
+               for (j = 0; j < n && !new_topology; j++) {
                        if (cpumask_equal(&doms_cur[i], &doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
@@ -7840,7 +7930,7 @@ match1:
 
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur; j++) {
+               for (j = 0; j < ndoms_cur && !new_topology; j++) {
                        if (cpumask_equal(&doms_new[i], &doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
@@ -7882,14 +7972,25 @@ int arch_reinit_sched_domains(void)
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 {
        int ret;
+       unsigned int level = 0;
+
+       if (sscanf(buf, "%u", &level) != 1)
+               return -EINVAL;
+
+       /*
+        * level is always positive, so there is no need to check for
+        * level < POWERSAVINGS_BALANCE_NONE, which is 0.
+        * What happens on a 0 or 1 byte write?  Do we need to check
+        * count as well?
+        */
 
-       if (buf[0] != '0' && buf[0] != '1')
+       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
                return -EINVAL;
 
        if (smt)
-               sched_smt_power_savings = (buf[0] == '1');
+               sched_smt_power_savings = level;
        else
-               sched_mc_power_savings = (buf[0] == '1');
+               sched_mc_power_savings = level;
 
        ret = arch_reinit_sched_domains();