Merge branch 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block

[pandora-kernel.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 2111491..d42992b 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -486,11 +486,12 @@ struct rq {
          */
         unsigned long nr_uninterruptible;
  
-       struct task_struct *curr, *idle;
+       struct task_struct *curr, *idle, *stop;
         unsigned long next_balance;
         struct mm_struct *prev_mm;
  
         u64 clock;
+       u64 clock_task;
  
         atomic_t nr_iowait;
  
@@ -518,6 +519,10 @@ struct rq {
         u64 avg_idle;
  #endif
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       u64 prev_irq_time;
+#endif
+
         /* calc_load related fields */
         unsigned long calc_load_update;
         long calc_load_active;
@@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
  
  #endif /* CONFIG_CGROUP_SCHED */
  
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
  inline void update_rq_clock(struct rq *rq)
  {
-       if (!rq->skip_clock_update)
-               rq->clock = sched_clock_cpu(cpu_of(rq));
+       if (!rq->skip_clock_update) {
+               int cpu = cpu_of(rq);
+               u64 irq_time;
+
+               rq->clock = sched_clock_cpu(cpu);
+               irq_time = irq_time_cpu(cpu);
+               if (rq->clock - irq_time > rq->clock_task)
+                       rq->clock_task = rq->clock - irq_time;
+
+               sched_irq_time_avg_update(rq, irq_time);
+       }
  }
  
  /*
@@ -1837,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  
  static const struct sched_class rt_sched_class;
  
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
  #define for_each_class(class) \
     for (class = sched_class_highest; class; class = class->next)
  
@@ -1855,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
  
  static void set_load_weight(struct task_struct *p)
  {
-       if (task_has_rt_policy(p)) {
-               p->se.load.weight = 0;
-               p->se.load.inv_weight = WMULT_CONST;
-               return;
-       }
-
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
@@ -1914,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
         dec_nr_running(rq);
  }
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+       if (!sched_clock_irqtime)
+               return 0;
+
+       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+       unsigned long flags;
+       int cpu;
+       u64 now, delta;
+
+       if (!sched_clock_irqtime)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       now = sched_clock_cpu(cpu);
+       delta = now - per_cpu(irq_start_time, cpu);
+       per_cpu(irq_start_time, cpu) = now;
+       /*
+        * We do not account for softirq time from ksoftirqd here.
+        * We want to continue accounting softirq time to ksoftirqd thread
+        * in that case, so as not to confuse scheduler with a special task
+        * that do not consume any time, but still wants to run.
+        */
+       if (hardirq_count())
+               per_cpu(cpu_hardirq_time, cpu) += delta;
+       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+               per_cpu(cpu_softirq_time, cpu) += delta;
+
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+               rq->prev_irq_time = curr_irq_time;
+               sched_rt_avg_update(rq, delta_irq);
+       }
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+       return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
  #include "sched_idletask.c"
  #include "sched_fair.c"
  #include "sched_rt.c"
+#include "sched_stoptask.c"
  #ifdef CONFIG_SCHED_DEBUG
  # include "sched_debug.c"
  #endif
  
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+       if (stop) {
+               /*
+                * Make it appear like a SCHED_FIFO task, its something
+                * userspace knows about and won't get confused about.
+                *
+                * Also, it will make PI more or less work without too
+                * much confusion -- but then, stop work should not
+                * rely on PI working anyway.
+                */
+               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+               stop->sched_class = &stop_sched_class;
+       }
+
+       cpu_rq(cpu)->stop = stop;
+
+       if (old_stop) {
+               /*
+                * Reset it back to a normal scheduling class so that
+                * it can die in pieces.
+                */
+               old_stop->sched_class = &rt_sched_class;
+       }
+}
+
  /*
   * __normal_prio - return the priority that is based on the static prio
   */
@@ -2000,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         if (p->sched_class != &fair_sched_class)
                 return 0;
  
+       if (unlikely(p->policy == SCHED_IDLE))
+               return 0;
+
         /*
          * Buddy candidates are cache hot:
          */
@@ -3245,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
  
         if (task_current(rq, p)) {
                 update_rq_clock(rq);
-               ns = rq->clock - p->se.exec_start;
+               ns = rq->clock_task - p->se.exec_start;
                 if ((s64)ns < 0)
                         ns = 0;
         }
@@ -3394,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-       else if (softirq_count())
+       else if (in_serving_softirq())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3581,7 +3714,7 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
  
-       perf_event_task_tick(curr);
+       perf_event_task_tick();
  
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
@@ -3720,17 +3853,13 @@ pick_next_task(struct rq *rq)
                         return p;
         }
  
-       class = sched_class_highest;
-       for ( ; ; ) {
+       for_each_class(class) {
                 p = class->pick_next_task(rq);
                 if (p)
                         return p;
-               /*
-                * Will never be NULL as the idle class always
-                * returns a non-NULL p:
-                */
-               class = class->next;
         }
+
+       BUG(); /* the idle class will always have a runnable task */
  }
  
  /*
@@ -4643,7 +4772,7 @@ recheck:
         }
  
         if (user) {
-               retval = security_task_setscheduler(p, policy, param);
+               retval = security_task_setscheduler(p);
                 if (retval)
                         return retval;
         }
@@ -4659,6 +4788,15 @@ recheck:
          */
         rq = __task_rq_lock(p);
  
+       /*
+        * Changing the policy of the stop threads its a very bad idea
+        */
+       if (p == rq->stop) {
+               __task_rq_unlock(rq);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               return -EINVAL;
+       }
+
  #ifdef CONFIG_RT_GROUP_SCHED
         if (user) {
                 /*
@@ -4885,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
  
-       retval = security_task_setscheduler(p, 0, NULL);
+       retval = security_task_setscheduler(p);
         if (retval)
                 goto out_unlock;
  
         cpuset_cpus_allowed(p, cpus_allowed);
         cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
         retval = set_cpus_allowed_ptr(p, new_mask);
  
         if (!retval) {
@@ -5335,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         idle->se.exec_start = sched_clock();
  
         cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+       /*
+        * We're having a chicken and egg problem, even though we are
+        * holding rq->lock, the cpu isn't yet set to this cpu so the
+        * lockdep check in task_group() will fail.
+        *
+        * Similar case to sched_fork(). / Alternatively we could
+        * use task_rq_lock() here and obtain the other rq->lock.
+        *
+        * Silence PROVE_RCU
+        */
+       rcu_read_lock();
         __set_task_cpu(idle, cpu);
+       rcu_read_unlock();
  
         rq->curr = rq->idle = idle;
  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -8141,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  
         return 1;
  
- err_free_rq:
+err_free_rq:
         kfree(cfs_rq);
- err:
+err:
         return 0;
  }
  
@@ -8231,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  
         return 1;
  
- err_free_rq:
+err_free_rq:
         kfree(rt_rq);
- err:
+err:
         return 0;
  }
  
@@ -8591,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
         }
         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
         read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);