[PATCH] sched: make cpu_clock() not use the rq clock

[pandora-kernel.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index cb31fb4..3eed860 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -263,8 +263,6 @@ struct rq {
         unsigned int clock_warps, clock_overflows;
         unsigned int clock_unstable_events;
  
-       struct sched_class *load_balance_class;
-
         atomic_t nr_iowait;
  
  #ifdef CONFIG_SMP
@@ -301,7 +299,7 @@ struct rq {
         struct lock_class_key rq_lock_key;
  };
  
-static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  static DEFINE_MUTEX(sched_hotcpu_mutex);
  
  static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
@@ -379,6 +377,22 @@ static inline unsigned long long rq_clock(struct rq *rq)
  #define task_rq(p)             cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
  
+/*
+ * For kernel-internal use: fast (but slightly incorrect) per-cpu clock built
+ * from sched_clock(); reads @cpu's rq_clock() with local irqs disabled.
+ */
+unsigned long long cpu_clock(int cpu)
+{
+       unsigned long long now;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       now = rq_clock(cpu_rq(cpu));
+       local_irq_restore(flags);
+
+       return now;
+}
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /* Change a task's ->cfs_rq if it moves across CPUs */
  static inline void set_task_cfs_rq(struct task_struct *p)
@@ -1575,6 +1589,10 @@ static void __sched_fork(struct task_struct *p)
         INIT_LIST_HEAD(&p->run_list);
         p->se.on_rq = 0;
  
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+       INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+
         /*
          * We mark the process as running here, but have not actually
          * inserted it onto the runqueue yet. This guarantees that
@@ -1656,6 +1674,63 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         task_rq_unlock(rq, &flags);
  }
  
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is preempted & rescheduled
+ * @notifier: notifier struct to add at the head of current's notifier list
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+       hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ *
+ * Not safe to call from within a preemption notifier (non-_safe hlist walk).
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+       hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+       struct preempt_notifier *notifier;
+       struct hlist_node *node;
+
+       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+               notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+       struct preempt_notifier *notifier;
+       struct hlist_node *node;
+
+       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+               notifier->ops->sched_out(notifier, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
  /**
   * prepare_task_switch - prepare to switch tasks
   * @rq: the runqueue preparing to switch
@@ -1668,8 +1743,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
   * prepare_task_switch sets up locking and calls architecture specific
   * hooks.
   */
-static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+                   struct task_struct *next)
  {
+       fire_sched_out_preempt_notifiers(prev, next);
         prepare_lock_switch(rq, next);
         prepare_arch_switch(next);
  }
@@ -1711,6 +1789,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
         prev_state = prev->state;
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
+       fire_sched_in_preempt_notifiers(current);
         if (mm)
                 mmdrop(mm);
         if (unlikely(prev_state == TASK_DEAD)) {
@@ -1751,7 +1830,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
  {
         struct mm_struct *mm, *oldmm;
  
-       prepare_task_switch(rq, next);
+       prepare_task_switch(rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@ -2235,7 +2314,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  
                         rq = cpu_rq(i);
  
-                       if (*sd_idle && !idle_cpu(i))
+                       if (*sd_idle && rq->nr_running)
                                 *sd_idle = 0;
  
                         /* Bias balancing toward cpus of our domain */
@@ -2257,9 +2336,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 /*
                  * First idle cpu or the first cpu(busiest) in this sched group
                  * is eligible for doing load balancing at this and above
-                * domains.
+                * domains. In the newly idle case, we will allow all the cpus
+                * to do the newly idle load balance.
                  */
-               if (local_group && balance_cpu != this_cpu && balance) {
+               if (idle != CPU_NEWLY_IDLE && local_group &&
+                   balance_cpu != this_cpu && balance) {
                         *balance = 0;
                         goto ret;
                 }
@@ -2677,6 +2758,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
         unsigned long imbalance;
         int nr_moved = 0;
         int sd_idle = 0;
+       int all_pinned = 0;
         cpumask_t cpus = CPU_MASK_ALL;
  
         /*
@@ -2715,10 +2797,11 @@ redo:
                 double_lock_balance(this_rq, busiest);
                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
                                         minus_1_or_zero(busiest->nr_running),
-                                       imbalance, sd, CPU_NEWLY_IDLE, NULL);
+                                       imbalance, sd, CPU_NEWLY_IDLE,
+                                       &all_pinned);
                 spin_unlock(&busiest->lock);
  
-               if (!nr_moved) {
+               if (unlikely(all_pinned)) {
                         cpu_clear(cpu_of(busiest), cpus);
                         if (!cpus_empty(cpus))
                                 goto redo;
@@ -6314,6 +6397,10 @@ void __init sched_init(void)
  
         set_load_weight(&init_task);
  
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
  #ifdef CONFIG_SMP
         nr_cpu_ids = highest_cpu + 1;
         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);