[PATCH] sched: fix SMT scheduler latency bug

[pandora-kernel.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 18b9552..c61ee34 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
   * smp_call_function() if an IPI is sent by the same process we are
   * waiting to become inactive.
   */
-void wait_task_inactive(task_t * p)
+void wait_task_inactive(task_t *p)
  {
         unsigned long flags;
         runqueue_t *rq;
@@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                 int local_group;
                 int i;
  
+               /* Skip over this group if it has no CPUs allowed */
+               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+                       goto nextgroup;
+
                 local_group = cpu_isset(this_cpu, group->cpumask);
-               /* XXX: put a cpus allowed check */
  
                 /* Tally up the load of all CPUs in the group */
                 avg_load = 0;
@@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                         min_load = avg_load;
                         idlest = group;
                 }
+nextgroup:
                 group = group->next;
         } while (group != sd->groups);
  
@@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
  /*
   * find_idlest_queue - find the idlest runqueue among the cpus in group.
   */
-static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
+       cpumask_t tmp;
         unsigned long load, min_load = ULONG_MAX;
         int idlest = -1;
         int i;
  
-       for_each_cpu_mask(i, group->cpumask) {
+       /* Traverse only the allowed CPUs */
+       cpus_and(tmp, group->cpumask, p->cpus_allowed);
+
+       for_each_cpu_mask(i, tmp) {
                 load = source_load(i, 0);
  
                 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag)
                 if (!group)
                         goto nextlevel;
  
-               new_cpu = find_idlest_cpu(group, cpu);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                 if (new_cpu == -1 || new_cpu == cpu)
                         goto nextlevel;
  
@@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p)
   *
   * returns failure only if the task is already active.
   */
-static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+static int try_to_wake_up(task_t *p, unsigned int state, int sync)
  {
         int cpu, this_cpu, success = 0;
         unsigned long flags;
@@ -1251,6 +1260,16 @@ out_activate:
                 p->activated = -1;
         }
  
+       /*
+        * Tasks that have marked their sleep as noninteractive get
+        * woken up without updating their sleep average. (i.e. their
+        * sleep is handled in a priority-neutral manner, no priority
+        * boost and no penalty.)
+        */
+       if (old_state & TASK_NONINTERACTIVE)
+               __activate_task(p, rq);
+       else
+               activate_task(p, rq, cpu == this_cpu);
         /*
          * Sync wakeups (i.e. those types of wakeups where the waker
          * has indicated that it will leave the CPU in short order)
@@ -1259,7 +1278,6 @@ out_activate:
          * the waker guarantees that the freshly woken up task is going
          * to be considered on this CPU.)
          */
-       activate_task(p, rq, cpu == this_cpu);
         if (!sync || cpu != this_cpu) {
                 if (TASK_PREEMPTS_CURR(p, rq))
                         resched_task(rq->curr);
@@ -1274,7 +1292,7 @@ out:
         return success;
  }
  
-int fastcall wake_up_process(task_t * p)
+int fastcall wake_up_process(task_t *p)
  {
         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
   * that must be done for every newly created context, then puts the task
   * on the runqueue and wakes it.
   */
-void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
+void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
  {
         unsigned long flags;
         int this_cpu, cpu;
@@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
   * artificially, because any timeslice recovered here
   * was given away by the parent in the first place.)
   */
-void fastcall sched_exit(task_t * p)
+void fastcall sched_exit(task_t *p)
  {
         unsigned long flags;
         runqueue_t *rq;
@@ -1511,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
          *              Manfred Spraul <manfred@colorfullife.com>
          */
         prev_task_flags = prev->flags;
+#ifdef CONFIG_DEBUG_SPINLOCK
+       /* this is a valid case when another task releases the spinlock */
+       rq->lock.owner = current;
+#endif
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
         if (mm)
@@ -1753,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
   */
  static inline
  int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-            struct sched_domain *sd, enum idle_type idle, int *all_pinned)
+                    struct sched_domain *sd, enum idle_type idle,
+                    int *all_pinned)
  {
         /*
          * We do not migrate tasks that are:
@@ -2576,6 +2599,13 @@ out:
  }
  
  #ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(runqueue_t *rq)
+{
+       /* Running idle despite queued tasks (SMT priority sleep): wake it up */
+       if (rq->curr == rq->idle && rq->nr_running)
+               resched_task(rq->idle);
+}
+
  static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
  {
         struct sched_domain *tmp, *sd = NULL;
@@ -2609,12 +2639,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
         for_each_cpu_mask(i, sibling_map) {
                 runqueue_t *smt_rq = cpu_rq(i);
  
-               /*
-                * If an SMT sibling task is sleeping due to priority
-                * reasons wake it up now.
-                */
-               if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
-                       resched_task(smt_rq->idle);
+               wakeup_busy_runqueue(smt_rq);
         }
  
         for_each_cpu_mask(i, sibling_map)
@@ -2625,6 +2650,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
          */
  }
  
+/*
+ * Portion of p's time_slice that it won't be able to fully utilize
+ * if another task runs on a sibling: time_slice scaled by
+ * (100 - sd->per_cpu_gain)%. Models the slowdown caused by sibling tasks.
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+       return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
  static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
  {
         struct sched_domain *tmp, *sd = NULL;
@@ -2668,6 +2703,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
                 runqueue_t *smt_rq = cpu_rq(i);
                 task_t *smt_curr = smt_rq->curr;
  
+               /* Kernel threads and RT p skip the dependent-sleep check */
+               if (!p->mm || !smt_curr->mm || rt_task(p))
+                       goto check_smt_task;
+
                 /*
                  * If a user task with lower static priority than the
                  * running task on the SMT sibling is trying to schedule,
@@ -2676,21 +2715,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
                  * task from using an unfair proportion of the
                  * physical cpu's resources. -ck
                  */
-               if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
-                       task_timeslice(p) || rt_task(smt_curr)) &&
-                       p->mm && smt_curr->mm && !rt_task(p))
-                               ret = 1;
+               if (rt_task(smt_curr)) {
+                       /*
+                        * With real time tasks we run non-rt tasks only
+                        * per_cpu_gain% of the time.
+                        */
+                       if ((jiffies % DEF_TIMESLICE) >
+                               (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+                                       ret = 1;
+               } else
+                       if (smt_curr->static_prio < p->static_prio &&
+                               !TASK_PREEMPTS_CURR(p, smt_rq) &&
+                               smt_slice(smt_curr, sd) > task_timeslice(p))
+                                       ret = 1;
+
+check_smt_task:
+               if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
+                       rt_task(smt_curr))
+                               continue;
+               if (!p->mm) {
+                       wakeup_busy_runqueue(smt_rq);
+                       continue;
+               }
  
                 /*
-                * Reschedule a lower priority task on the SMT sibling,
-                * or wake it up if it has been put to sleep for priority
-                * reasons.
+                * Reschedule a lower priority task on the SMT sibling for
+                * it to be put to sleep, or wake it up if it has been put to
+                * sleep for priority reasons to see if it should run now.
                  */
-               if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
-                       task_timeslice(smt_curr) || rt_task(p)) &&
-                       smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
-                       (smt_curr == smt_rq->idle && smt_rq->nr_running))
-                               resched_task(smt_curr);
+               if (rt_task(p)) {
+                       if ((jiffies % DEF_TIMESLICE) >
+                               (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+                                       resched_task(smt_curr);
+               } else {
+                       if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+                               smt_slice(p, sd) > task_timeslice(smt_curr))
+                                       resched_task(smt_curr);
+                       else
+                               wakeup_busy_runqueue(smt_rq);
+               }
         }
  out_unlock:
         for_each_cpu_mask(i, sibling_map)
@@ -2888,6 +2951,7 @@ switch_tasks:
         if (next == rq->idle)
                 schedstat_inc(rq, sched_goidle);
         prefetch(next);
+       prefetch_stack(next);
         clear_tsk_need_resched(prev);
         rcu_qsctr_inc(task_cpu(prev));
  
@@ -3015,7 +3079,8 @@ need_resched:
  
  #endif /* CONFIG_PREEMPT */
  
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+                         void *key)
  {
         task_t *p = curr->private;
         return try_to_wake_up(p, mode, sync);
@@ -3057,7 +3122,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
   * @key: is directly passed to the wakeup function
   */
  void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
-                               int nr_exclusive, void *key)
+                       int nr_exclusive, void *key)
  {
         unsigned long flags;
  
@@ -3089,7 +3154,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
   *
   * On UP it can prevent extra preemption.
   */
-void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
  {
         unsigned long flags;
         int sync = 1;
@@ -3280,7 +3346,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
  
  EXPORT_SYMBOL(interruptible_sleep_on);
  
-long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
  {
         SLEEP_ON_VAR
  
@@ -3499,7 +3566,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
   * @policy: new policy.
   * @param: structure containing the new RT priority.
   */
-int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
+int sched_setscheduler(struct task_struct *p, int policy,
+                      struct sched_param *param)
  {
         int retval;
         int oldprio, oldpolicy = -1;
@@ -3519,7 +3587,7 @@ recheck:
          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
          */
         if (param->sched_priority < 0 ||
-           (p->mm &&  param->sched_priority > MAX_USER_RT_PRIO-1) ||
+           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                 return -EINVAL;
         if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3582,7 +3650,8 @@ recheck:
  }
  EXPORT_SYMBOL_GPL(sched_setscheduler);
  
-static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  {
         int retval;
         struct sched_param lparam;
@@ -3913,7 +3982,7 @@ EXPORT_SYMBOL(cond_resched);
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
-int cond_resched_lock(spinlock_t * lock)
+int cond_resched_lock(spinlock_t *lock)
  {
         int ret = 0;
  
@@ -4096,7 +4165,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
         return list_entry(p->sibling.next,struct task_struct,sibling);
  }
  
-static void show_task(task_t * p)
+static void show_task(task_t *p)
  {
         task_t *relative;
         unsigned state;
@@ -4122,7 +4191,7 @@ static void show_task(task_t * p)
  #endif
  #ifdef CONFIG_DEBUG_STACK_USAGE
         {
-               unsigned long * n = (unsigned long *) (p->thread_info+1);
+               unsigned long *n = (unsigned long *) (p->thread_info+1);
                 while (!*n)
                         n++;
                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4331,7 +4400,7 @@ out:
   * thread migration by bumping thread off CPU then 'pushing' onto
   * another runqueue.
   */
-static int migration_thread(void * data)
+static int migration_thread(void *data)
  {
         runqueue_t *rq;
         int cpu = (long)data;