Merge branch 'sched-fixes-for-linus' of git://tesla.tglx.de/git/linux-2.6-tip
[pandora-kernel.git] / kernel / sched.c
index fde6ff9..ec5f472 100644
@@ -75,6 +75,9 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
 
 static inline int rt_policy(int policy)
 {
-       if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+       if (policy == SCHED_FIFO || policy == SCHED_RR)
                return 1;
        return 0;
 }
@@ -422,6 +425,7 @@ struct rt_rq {
  */
 struct root_domain {
        atomic_t refcount;
+       atomic_t rto_count;
        struct rcu_head rcu;
        cpumask_var_t span;
        cpumask_var_t online;
@@ -431,7 +435,6 @@ struct root_domain {
         * one runnable RT task.
         */
        cpumask_var_t rto_mask;
-       atomic_t rto_count;
        struct cpupri cpupri;
 };
 
@@ -528,6 +531,12 @@ struct rq {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64 prev_irq_time;
 #endif
+#ifdef CONFIG_PARAVIRT
+       u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       u64 prev_steal_time_rq;
+#endif
 
        /* calc_load related fields */
        unsigned long calc_load_update;
@@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
 
 #define rcu_dereference_check_sched_domain(p) \
        rcu_dereference_check((p), \
-                             rcu_read_lock_held() || \
                              lockdep_is_held(&sched_domains_mutex))
 
 /*
@@ -1568,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
        return rq->avg_load_per_task;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-       unsigned long load;
-       long cpu = (long)data;
-
-       if (!tg->parent) {
-               load = cpu_rq(cpu)->load.weight;
-       } else {
-               load = tg->parent->cfs_rq[cpu]->h_load;
-               load *= tg->se[cpu]->load.weight;
-               load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
-       }
-
-       tg->cfs_rq[cpu]->h_load = load;
-
-       return 0;
-}
-
-static void update_h_load(long cpu)
-{
-       walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
 #ifdef CONFIG_PREEMPT
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
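
For reference, the tg_load_down() walk removed above derived each group's per-cpu h_load top-down: the root group takes the cpu's runqueue weight directly, and every child scales its parent's h_load by its own entity weight over the parent's cfs_rq weight (the +1 guards against a zero divisor). A standalone sketch of that arithmetic with made-up weights (illustrative only, not part of the patch):

/* Illustrative only -- not part of the patch. Replays the h_load formula
 * from the removed tg_load_down() with made-up weights. */
#include <stdio.h>

int main(void)
{
	unsigned long root_h_load = 2048;	/* cpu_rq(cpu)->load.weight        */
	unsigned long child_se    = 1024;	/* tg->se[cpu]->load.weight        */
	unsigned long parent_cfs  = 2048;	/* parent cfs_rq[cpu]->load.weight */

	/* the child owns half the parent's queue weight, so it inherits
	 * (just under) half of the parent's hierarchical load */
	unsigned long child_h_load = root_h_load * child_se / (parent_cfs + 1);

	printf("root h_load=%lu child h_load=%lu\n", root_h_load, child_h_load);
	return 0;
}
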
@@ -1953,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
 {
-       s64 irq_delta;
+       if (unlikely(steal > NSEC_PER_SEC))
+               return div_u64(steal, TICK_NSEC);
+
+       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
 
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
        /*
@@ -1979,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
        rq->prev_irq_time += irq_delta;
        delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       if (static_branch((&paravirt_steal_rq_enabled))) {
+               u64 st;
+
+               steal = paravirt_steal_clock(cpu_of(rq));
+               steal -= rq->prev_steal_time_rq;
+
+               if (unlikely(steal > delta))
+                       steal = delta;
+
+               st = steal_ticks(steal);
+               steal = st * TICK_NSEC;
+
+               rq->prev_steal_time_rq += steal;
+
+               delta -= steal;
+       }
+#endif
+
        rq->clock_task += delta;
 
-       if (irq_delta && sched_feat(NONIRQ_POWER))
-               sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+               sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
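
A minimal userspace sketch of the steal handling above, assuming HZ=1000 (TICK_NSEC = 1,000,000 ns); it is illustrative only and not part of the patch. The stolen time is capped at the elapsed delta, rounded down to whole ticks, and only that portion is withheld from rq->clock_task; the sub-tick remainder stays pending because prev_steal_time_rq advances in whole ticks.

/* Illustrative only -- not part of the patch. Assumes HZ=1000. */
#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL		/* 1 ms tick at HZ=1000 */

int main(void)
{
	uint64_t delta = 4500000;	/* 4.5 ms elapsed on rq->clock        */
	uint64_t steal = 2700000;	/* 2.7 ms reported by the hypervisor  */

	if (steal > delta)		/* never withhold more than elapsed   */
		steal = delta;

	uint64_t whole = (steal / TICK_NSEC) * TICK_NSEC;	/* 2 ms */

	printf("clock_task advances by %llu ns, %llu ns of steal stays pending\n",
	       (unsigned long long)(delta - whole),
	       (unsigned long long)(steal - whole));
	return 0;
}
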
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2019,12 +2036,7 @@ static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime    (0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-       rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2220,7 +2232,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
        if (task_cpu(p) != new_cpu) {
                p->se.nr_migrations++;
-               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
        }
 
        __set_task_cpu(p, new_cpu);
@@ -2497,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
        if (p->sched_class->task_woken)
                p->sched_class->task_woken(rq, p);
 
-       if (unlikely(rq->idle_stamp)) {
+       if (rq->idle_stamp) {
                u64 delta = rq->clock - rq->idle_stamp;
                u64 max = 2*sysctl_sched_migration_cost;
 
@@ -2886,7 +2898,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
        p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
 #endif
@@ -3053,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_disable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-       perf_event_task_sched_in(current);
+       perf_event_task_sched_in(prev, current);
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_enable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -3877,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
                cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
 
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+       if (static_branch(&paravirt_steal_enabled)) {
+               u64 steal, st = 0;
+
+               steal = paravirt_steal_clock(smp_processor_id());
+               steal -= this_rq()->prev_steal_time;
+
+               st = steal_ticks(steal);
+               this_rq()->prev_steal_time += st * TICK_NSEC;
+
+               account_steal_time(st);
+               return st;
+       }
+#endif
+       return false;
+}
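
The same rounding shows up in steal_account_process_tick() above: prev_steal_time only advances by whole ticks, so sub-tick steal carries over from one timer tick to the next, and the helper returns non-zero exactly when at least one full stolen tick was accounted, which lets the callers below skip charging that tick as user or system time. A small standalone model of that carry-over, assuming HZ=1000 (illustrative only, not part of the patch):

/* Illustrative only -- not part of the patch. Models three timer ticks
 * with 0.6 ms of steal reported per tick (HZ=1000): sub-tick remainders
 * accumulate until a whole stolen tick can be accounted. */
#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL

int main(void)
{
	uint64_t steal_clock = 0;	/* paravirt_steal_clock() stand-in */
	uint64_t prev_steal_time = 0;	/* rq->prev_steal_time stand-in    */

	for (int tick = 1; tick <= 3; tick++) {
		steal_clock += 600000;			/* +0.6 ms stolen     */

		uint64_t steal = steal_clock - prev_steal_time;
		uint64_t ticks = steal / TICK_NSEC;	/* steal_ticks()      */

		prev_steal_time += ticks * TICK_NSEC;	/* keep the remainder */
		printf("tick %d: accounted %llu stolen tick(s)\n",
		       tick, (unsigned long long)ticks);
	}
	return 0;
}
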
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3908,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
        cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
+       if (steal_account_process_tick())
+               return;
+
        if (irqtime_account_hi_update()) {
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        } else if (irqtime_account_si_update()) {
@@ -3961,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
        }
 
+       if (steal_account_process_tick())
+               return;
+
        if (user_tick)
                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4242,9 +4279,9 @@ pick_next_task(struct rq *rq)
 }
 
 /*
- * schedule() is the main scheduler function.
+ * __schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+static void __sched __schedule(void)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
@@ -4285,16 +4322,6 @@ need_resched:
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
-
-                       /*
-                        * If we are going to sleep and we have plugged IO
-                        * queued, make sure to submit it to avoid deadlocks.
-                        */
-                       if (blk_needs_flush_plug(prev)) {
-                               raw_spin_unlock(&rq->lock);
-                               blk_schedule_flush_plug(prev);
-                               raw_spin_lock(&rq->lock);
-                       }
                }
                switch_count = &prev->nvcsw;
        }
@@ -4332,17 +4359,34 @@ need_resched:
        if (need_resched())
                goto need_resched;
 }
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+       if (!tsk->state)
+               return;
+       /*
+        * If we are going to sleep and we have plugged IO queued,
+        * make sure to submit it to avoid deadlocks.
+        */
+       if (blk_needs_flush_plug(tsk))
+               blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage void __sched schedule(void)
+{
+       struct task_struct *tsk = current;
+
+       sched_submit_work(tsk);
+       __schedule();
+}
 EXPORT_SYMBOL(schedule);
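
schedule() is now a thin wrapper: when the task is actually going to sleep, sched_submit_work() flushes its plugged block requests before __schedule() is entered, so the flush happens before the runqueue lock is ever taken instead of having to drop and re-take rq->lock from inside the scheduler, and a sleeper cannot deadlock waiting on IO it still holds in its private plug. A hedged userspace analogue of that "submit deferred work before blocking" shape (all names below are invented for illustration):

/* Illustrative only -- not part of the patch; names are invented. */
#include <stdbool.h>
#include <stdio.h>

struct task {
	bool going_to_sleep;	/* analogue of tsk->state != TASK_RUNNING  */
	int plugged_requests;	/* work queued privately, not yet submitted */
};

static void flush_plug(struct task *t)
{
	if (t->plugged_requests) {
		printf("submitting %d plugged request(s)\n", t->plugged_requests);
		t->plugged_requests = 0;
	}
}

static void do_block(struct task *t)
{
	printf("blocking now (the __schedule() part)\n");
	(void)t;
}

/* Analogue of schedule(): submit deferred work first, then block. */
static void sleep_point(struct task *t)
{
	if (t->going_to_sleep)
		flush_plug(t);
	do_block(t);
}

int main(void)
{
	struct task t = { .going_to_sleep = true, .plugged_requests = 2 };
	sleep_point(&t);
	return 0;
}
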
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
-       bool ret = false;
-
-       rcu_read_lock();
        if (lock->owner != owner)
-               goto fail;
+               return false;
 
        /*
         * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4352,11 +4396,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
         */
        barrier();
 
-       ret = owner->on_cpu;
-fail:
-       rcu_read_unlock();
-
-       return ret;
+       return owner->on_cpu;
 }
 
 /*
@@ -4368,21 +4408,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
        if (!sched_feat(OWNER_SPIN))
                return 0;
 
+       rcu_read_lock();
        while (owner_running(lock, owner)) {
                if (need_resched())
-                       return 0;
+                       break;
 
                arch_mutex_cpu_relax();
        }
+       rcu_read_unlock();
 
        /*
-        * If the owner changed to another task there is likely
-        * heavy contention, stop spinning.
+        * We break out the loop above on need_resched() and when the
+        * owner changed, which is a sign for heavy contention. Return
+        * success only when lock->owner is NULL.
         */
-       if (lock->owner)
-               return 0;
-
-       return 1;
+       return lock->owner == NULL;
 }
 #endif
 
@@ -4405,7 +4445,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
 
        do {
                add_preempt_count_notrace(PREEMPT_ACTIVE);
-               schedule();
+               __schedule();
                sub_preempt_count_notrace(PREEMPT_ACTIVE);
 
                /*
@@ -4433,7 +4473,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
        do {
                add_preempt_count(PREEMPT_ACTIVE);
                local_irq_enable();
-               schedule();
+               __schedule();
                local_irq_disable();
                sub_preempt_count(PREEMPT_ACTIVE);
 
@@ -5558,7 +5598,7 @@ static inline int should_resched(void)
 static void __cond_resched(void)
 {
        add_preempt_count(PREEMPT_ACTIVE);
-       schedule();
+       __schedule();
        sub_preempt_count(PREEMPT_ACTIVE);
 }
 
@@ -7413,6 +7453,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
                        struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
                        if (sd && (sd->flags & SD_OVERLAP))
                                free_sched_groups(sd->groups, 0);
+                       kfree(*per_cpu_ptr(sdd->sd, j));
                        kfree(*per_cpu_ptr(sdd->sg, j));
                        kfree(*per_cpu_ptr(sdd->sgp, j));
                }
@@ -7898,17 +7939,10 @@ int in_sched_functions(unsigned long addr)
                && addr < (unsigned long)__sched_text_end);
 }
 
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
        cfs_rq->tasks_timeline = RB_ROOT;
        INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       cfs_rq->rq = rq;
-       /* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
-       cfs_rq->load_stamp = 1;
-#endif
-#endif
        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -7928,27 +7962,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
        /* delimiter for bitsearch: */
        __set_bit(MAX_RT_PRIO, array->bitmap);
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
        rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
        rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
-       plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+       plist_head_init(&rt_rq->pushable_tasks);
 #endif
 
        rt_rq->rt_time = 0;
        rt_rq->rt_throttled = 0;
        rt_rq->rt_runtime = 0;
        raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       rt_rq->rt_nr_boosted = 0;
-       rt_rq->rq = rq;
-#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7957,11 +7982,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                                struct sched_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
-       tg->cfs_rq[cpu] = cfs_rq;
-       init_cfs_rq(cfs_rq, rq);
+
        cfs_rq->tg = tg;
+       cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+       /* allow initial update_cfs_load() to truncate */
+       cfs_rq->load_stamp = 1;
+#endif
 
+       tg->cfs_rq[cpu] = cfs_rq;
        tg->se[cpu] = se;
+
        /* se could be NULL for root_task_group */
        if (!se)
                return;
@@ -7984,12 +8015,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 {
        struct rq *rq = cpu_rq(cpu);
 
-       tg->rt_rq[cpu] = rt_rq;
-       init_rt_rq(rt_rq, rq);
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+       rt_rq->rt_nr_boosted = 0;
+       rt_rq->rq = rq;
        rt_rq->tg = tg;
-       rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+       tg->rt_rq[cpu] = rt_rq;
        tg->rt_se[cpu] = rt_se;
+
        if (!rt_se)
                return;
 
@@ -8071,7 +8104,7 @@ void __init sched_init(void)
                rq->nr_running = 0;
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
-               init_cfs_rq(&rq->cfs, rq);
+               init_cfs_rq(&rq->cfs);
                init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.shares = root_task_group_load;
@@ -8142,7 +8175,7 @@ void __init sched_init(void)
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
-       plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+       plist_head_init(&init_task.pi_waiters);
 #endif
 
        /*
@@ -8185,7 +8218,7 @@ void __init sched_init(void)
        scheduler_running = 1;
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
        int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8195,7 +8228,6 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
-#ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
 
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8217,7 +8249,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
        if (irqs_disabled())
                print_irqtrace_events(current);
        dump_stack();
-#endif
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
@@ -8376,6 +8407,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                if (!se)
                        goto err_free_rq;
 
+               init_cfs_rq(cfs_rq);
                init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
        }
 
@@ -8403,7 +8435,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
        list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8424,7 +8456,8 @@ static void free_rt_sched_group(struct task_group *tg)
 {
        int i;
 
-       destroy_rt_bandwidth(&tg->rt_bandwidth);
+       if (tg->rt_se)
+               destroy_rt_bandwidth(&tg->rt_bandwidth);
 
        for_each_possible_cpu(i) {
                if (tg->rt_rq)
@@ -8465,6 +8498,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                if (!rt_se)
                        goto err_free_rq;
 
+               init_rt_rq(rt_rq, cpu_rq(i));
+               rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
        }