workqueue: backport assorted fixes (work item recycling, delayed work grabbing, CPU hotplug notifier ordering)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1783aab..563820c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -23,7 +23,7 @@
  * Please read Documentation/workqueue.txt for details.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/init.h>
@@ -128,6 +128,7 @@ struct worker {
        };
 
        struct work_struct      *current_work;  /* L: work being processed */
+       work_func_t             current_func;   /* L: current_work's fn */
        struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
        struct list_head        scheduled;      /* L: scheduled works */
        struct task_struct      *task;          /* I: worker task */
@@ -253,11 +254,13 @@ struct workqueue_struct *system_long_wq __read_mostly;
 struct workqueue_struct *system_nrt_wq __read_mostly;
 struct workqueue_struct *system_unbound_wq __read_mostly;
 struct workqueue_struct *system_freezable_wq __read_mostly;
+struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_wq);
 EXPORT_SYMBOL_GPL(system_long_wq);
 EXPORT_SYMBOL_GPL(system_nrt_wq);
 EXPORT_SYMBOL_GPL(system_unbound_wq);
 EXPORT_SYMBOL_GPL(system_freezable_wq);
+EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
@@ -841,7 +844,8 @@ static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
        struct hlist_node *tmp;
 
        hlist_for_each_entry(worker, tmp, bwh, hentry)
-               if (worker->current_work == work)
+               if (worker->current_work == work &&
+                   worker->current_func == work->func)
                        return worker;
        return NULL;
 }
@@ -851,9 +855,27 @@ static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
  * @gcwq: gcwq of interest
  * @work: work to find worker for
  *
- * Find a worker which is executing @work on @gcwq.  This function is
- * identical to __find_worker_executing_work() except that this
- * function calculates @bwh itself.
+ * Find a worker which is executing @work on @gcwq by searching
+ * @gcwq->busy_hash which is keyed by the address of @work.  For a worker
+ * to match, its current execution should match the address of @work and
+ * its work function.  This is to avoid unwanted dependency between
+ * unrelated work executions through a work item being recycled while still
+ * being executed.
+ *
+ * This is a bit tricky.  A work item may be freed once its execution
+ * starts and nothing prevents the freed area from being recycled for
+ * another work item.  If the same work item address ends up being reused
+ * before the original execution finishes, workqueue will identify the
+ * recycled work item as currently executing and make it wait until the
+ * current execution finishes, introducing an unwanted dependency.
+ *
+ * This function checks the work item address and work function to
+ * avoid false positives.  Note that this isn't complete as one may
+ * construct a work function which can introduce dependency onto itself
+ * through a recycled work item.  Well, if somebody wants to shoot oneself
+ * in the foot that badly, there's only so much we can do, and if such
+ * deadlock actually occurs, it should be easy to locate the culprit work
+ * function.
  *
  * CONTEXT:
  * spin_lock_irq(gcwq->lock).
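[note] The recycling hazard described in the hunk above is easiest to see with a concrete work function that frees its own work item. A hedged sketch, not part of this patch; my_ctx, my_work_fn and do_long_running_tail are hypothetical:

    /* Hypothetical illustration of the recycling hazard. */
    struct my_ctx {
            struct work_struct work;
            /* ... payload ... */
    };

    static void do_long_running_tail(void);     /* hypothetical helper */

    static void my_work_fn(struct work_struct *work)
    {
            struct my_ctx *ctx = container_of(work, struct my_ctx, work);

            kfree(ctx);             /* work item freed while fn still runs ... */
            do_long_running_tail(); /* ... and the worker keeps executing      */
    }

If kmalloc() hands the same address to an unrelated work item before my_work_fn() returns, matching on the address alone would falsely report the new item as already executing. Also requiring worker->current_func == work->func filters out such collisions unless the recycled item reuses the very same function.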
@@ -1144,8 +1166,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
                unsigned int lcpu;
 
-               BUG_ON(timer_pending(timer));
-               BUG_ON(!list_empty(&work->entry));
+               WARN_ON_ONCE(timer_pending(timer));
+               WARN_ON_ONCE(!list_empty(&work->entry));
 
                timer_stats_timer_set_start_info(&dwork->timer);
 
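[note] For context, a minimal sketch of the API this hunk touches (identifiers hypothetical). With the change, a violated invariant now produces a one-shot warning instead of taking the machine down with BUG_ON():

    /* Hypothetical caller of queue_delayed_work_on(). */
    static void my_poll_fn(struct work_struct *work);
    static DECLARE_DELAYED_WORK(my_poll_work, my_poll_fn);

    static void my_poll_fn(struct work_struct *work)
    {
            /* poll the device, then re-arm ~100ms later on CPU 0 */
            queue_delayed_work_on(0, system_wq, &my_poll_work, HZ / 10);
    }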
@@ -1213,8 +1235,13 @@ static void worker_enter_idle(struct worker *worker)
        } else
                wake_up_all(&gcwq->trustee_wait);
 
-       /* sanity check nr_running */
-       WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
+       /*
+        * Sanity check nr_running.  Because trustee releases gcwq->lock
+        * between setting %WORKER_ROGUE and zapping nr_running, the
+        * warning may trigger spuriously.  Check iff trustee is idle.
+        */
+       WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+                    gcwq->nr_workers == gcwq->nr_idle &&
                     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
 }
 
@@ -1447,12 +1474,19 @@ static void destroy_worker(struct worker *worker)
        if (worker->flags & WORKER_IDLE)
                gcwq->nr_idle--;
 
+       /*
+        * Once WORKER_DIE is set, the kworker may destroy itself at any
+        * point.  Pin to ensure the task stays until we're done with it.
+        */
+       get_task_struct(worker->task);
+
        list_del_init(&worker->entry);
        worker->flags |= WORKER_DIE;
 
        spin_unlock_irq(&gcwq->lock);
 
        kthread_stop(worker->task);
+       put_task_struct(worker->task);
        kfree(worker);
 
        spin_lock_irq(&gcwq->lock);
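[note] The hunk above, distilled: whenever setting a flag hands lifetime control to the target thread, take a reference before publishing the flag. A sketch of the same idiom, restating the patch rather than adding to it:

    get_task_struct(worker->task);  /* task_struct can't be freed under us */
    worker->flags |= WORKER_DIE;    /* from here the kworker may exit ...  */
    spin_unlock_irq(&gcwq->lock);   /* ... as soon as we drop the lock     */

    kthread_stop(worker->task);     /* safe: our reference keeps it valid  */
    put_task_struct(worker->task);  /* drop the pin                        */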
@@ -1719,10 +1753,9 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
                *nextp = n;
 }
 
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+static void cwq_activate_delayed_work(struct work_struct *work)
 {
-       struct work_struct *work = list_first_entry(&cwq->delayed_works,
-                                                   struct work_struct, entry);
+       struct cpu_workqueue_struct *cwq = get_work_cwq(work);
        struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
 
        trace_workqueue_activate_work(work);
@@ -1731,6 +1764,14 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
        cwq->nr_active++;
 }
 
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+       struct work_struct *work = list_first_entry(&cwq->delayed_works,
+                                                   struct work_struct, entry);
+
+       cwq_activate_delayed_work(work);
+}
+
 /**
  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
  * @cwq: cwq of interest
@@ -1802,7 +1843,6 @@ __acquires(&gcwq->lock)
        struct global_cwq *gcwq = cwq->gcwq;
        struct hlist_head *bwh = busy_worker_head(gcwq, work);
        bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
-       work_func_t f = work->func;
        int work_color;
        struct worker *collision;
 #ifdef CONFIG_LOCKDEP
@@ -1831,6 +1871,7 @@ __acquires(&gcwq->lock)
        debug_work_deactivate(work);
        hlist_add_head(&worker->hentry, bwh);
        worker->current_work = work;
+       worker->current_func = work->func;
        worker->current_cwq = cwq;
        work_color = get_work_color(work);
 
@@ -1862,11 +1903,13 @@ __acquires(&gcwq->lock)
 
        spin_unlock_irq(&gcwq->lock);
 
+       smp_wmb();      /* paired with test_and_set_bit(PENDING) */
        work_clear_pending(work);
+
        lock_map_acquire_read(&cwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        trace_workqueue_execute_start(work);
-       f(work);
+       worker->current_func(work);
        /*
         * While we must be careful to not use "work" after this, the trace
         * point will only record its address.
@@ -1876,15 +1919,23 @@ __acquires(&gcwq->lock)
        lock_map_release(&cwq->wq->lockdep_map);
 
        if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
-               printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
-                      "%s/0x%08x/%d\n",
-                      current->comm, preempt_count(), task_pid_nr(current));
-               printk(KERN_ERR "    last function: ");
-               print_symbol("%s\n", (unsigned long)f);
+               pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
+                      "     last function: %pf\n",
+                      current->comm, preempt_count(), task_pid_nr(current),
+                      worker->current_func);
                debug_show_held_locks(current);
                dump_stack();
        }
 
+       /*
+        * The following prevents a kworker from hogging CPU on !PREEMPT
+        * kernels, where a requeueing work item waiting for something to
+        * happen could deadlock with stop_machine as such work item could
+        * indefinitely requeue itself while all other CPUs are trapped in
+        * stop_machine.
+        */
+       cond_resched();
+
        spin_lock_irq(&gcwq->lock);
 
        /* clear cpu intensive status */
@@ -1894,6 +1945,7 @@ __acquires(&gcwq->lock)
        /* we're done with it, release */
        hlist_del_init(&worker->hentry);
        worker->current_work = NULL;
+       worker->current_func = NULL;
        worker->current_cwq = NULL;
        cwq_dec_nr_in_flight(cwq, work_color, false);
 }
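[note] A sketch of the kind of work item the new cond_resched() defends against (names hypothetical):

    static bool device_ready(void);         /* hypothetical predicate */

    static void poll_until_ready_fn(struct work_struct *work)
    {
            if (!device_ready())
                    queue_work(system_wq, work);    /* requeue immediately */
    }

On a !PREEMPT kernel, a kworker processing such an item back-to-back never yields the CPU, so a stop_machine() waiting for every CPU to enter its handler can block forever; the cond_resched() between items lets the scheduler run the stopper thread.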
@@ -2036,8 +2088,10 @@ static int rescuer_thread(void *__wq)
 repeat:
        set_current_state(TASK_INTERRUPTIBLE);
 
-       if (kthread_should_stop())
+       if (kthread_should_stop()) {
+               __set_current_state(TASK_RUNNING);
                return 0;
+       }
 
        /*
         * See whether any cpu is asking for help.  Unbounded
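[note] The fix above follows from the prepare-to-wait pattern: returning while still in TASK_INTERRUPTIBLE leaves a stale sleep state that a later schedule() point can act on. The general shape, as a sketch (work_available is hypothetical):

    static bool work_available(void);       /* hypothetical wait condition */

    for (;;) {
            set_current_state(TASK_INTERRUPTIBLE); /* publish intent to sleep */
            if (kthread_should_stop()) {
                    __set_current_state(TASK_RUNNING); /* undo before return */
                    break;
            }
            if (!work_available())
                    schedule();             /* actually sleep */
            __set_current_state(TASK_RUNNING);
            /* ... handle work ... */
    }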
@@ -2619,6 +2673,18 @@ static int try_to_grab_pending(struct work_struct *work)
                smp_rmb();
                if (gcwq == get_work_gcwq(work)) {
                        debug_work_deactivate(work);
+
+                       /*
+                        * A delayed work item cannot be grabbed directly
+                        * because it might have linked NO_COLOR work items
+                        * which, if left on the delayed_list, will confuse
+                        * cwq->nr_active management later on and cause
+                        * stall.  Make sure the work item is activated
+                        * before grabbing.
+                        */
+                       if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
+                               cwq_activate_delayed_work(work);
+
                        list_del_init(&work->entry);
                        cwq_dec_nr_in_flight(get_work_cwq(work),
                                get_work_color(work),
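[note] A hedged sketch of the stall scenario the activation above prevents (w1, w2 and some_fn are hypothetical): once max_active is exhausted, the second item parks on cwq->delayed_works, where a concurrent flush may link NO_COLOR barrier items to it.

    static void some_fn(struct work_struct *work); /* hypothetical */
    static DECLARE_WORK(w1, some_fn);
    static DECLARE_WORK(w2, some_fn);

    static void demo(void)
    {
            struct workqueue_struct *wq = alloc_workqueue("demo", 0, 1);

            queue_work(wq, &w1);    /* becomes the active item                */
            queue_work(wq, &w2);    /* max_active hit: parks on delayed_works */
            cancel_work_sync(&w2);  /* must activate w2 before unlinking it,
                                     * or nr_active accounting stalls the wq  */
            destroy_workqueue(wq);
    }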
@@ -3430,14 +3496,17 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
        for_each_busy_worker(worker, i, pos, gcwq) {
                struct work_struct *rebind_work = &worker->rebind_work;
+               unsigned long worker_flags = worker->flags;
 
                /*
                 * Rebind_work may race with future cpu hotplug
                 * operations.  Use a separate flag to mark that
-                * rebinding is scheduled.
+                * rebinding is scheduled.  The morphing should
+                * be atomic.
                 */
-               worker->flags |= WORKER_REBIND;
-               worker->flags &= ~WORKER_ROGUE;
+               worker_flags |= WORKER_REBIND;
+               worker_flags &= ~WORKER_ROGUE;
+               ACCESS_ONCE(worker->flags) = worker_flags;
 
                /* queue rebind_work, wq doesn't matter, use the default one */
                if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
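[note] Why the temporary plus ACCESS_ONCE(): the two original statements are independent read-modify-write accesses, so a concurrent reader could observe the half-morphed state. An interleaving the old code permitted:

    /*
     *   CPU0 (trustee)                      CPU1 (reader)
     *   worker->flags |= WORKER_REBIND;
     *                                       f = worker->flags;
     *                                       // sees REBIND and ROGUE both set
     *   worker->flags &= ~WORKER_ROGUE;
     *
     * Computing the new mask in a local and publishing it with a single
     * ACCESS_ONCE() store makes both changes visible at once, and keeps
     * the compiler from tearing or re-splitting the store.
     */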
@@ -3579,21 +3648,55 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
        return notifier_from_errno(0);
 }
 
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered as a high priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+                                              unsigned long action,
+                                              void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_CANCELED:
+       case CPU_DOWN_FAILED:
+       case CPU_ONLINE:
+               return workqueue_cpu_callback(nfb, action, hcpu);
+       }
+       return NOTIFY_OK;
+}
+
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+                                                unsigned long action,
+                                                void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+       case CPU_DYING:
+       case CPU_POST_DEAD:
+               return workqueue_cpu_callback(nfb, action, hcpu);
+       }
+       return NOTIFY_OK;
+}
+
 #ifdef CONFIG_SMP
 
 struct work_for_cpu {
-       struct completion completion;
+       struct work_struct work;
        long (*fn)(void *);
        void *arg;
        long ret;
 };
 
-static int do_work_for_cpu(void *_wfc)
+static void work_for_cpu_fn(struct work_struct *work)
 {
-       struct work_for_cpu *wfc = _wfc;
+       struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
        wfc->ret = wfc->fn(wfc->arg);
-       complete(&wfc->completion);
-       return 0;
 }
 
 /**
@@ -3608,19 +3711,11 @@ static int do_work_for_cpu(void *_wfc)
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-       struct task_struct *sub_thread;
-       struct work_for_cpu wfc = {
-               .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
-               .fn = fn,
-               .arg = arg,
-       };
+       struct work_for_cpu wfc = { .fn = fn, .arg = arg };
 
-       sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
-       if (IS_ERR(sub_thread))
-               return PTR_ERR(sub_thread);
-       kthread_bind(sub_thread, cpu);
-       wake_up_process(sub_thread);
-       wait_for_completion(&wfc.completion);
+       INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+       schedule_work_on(cpu, &wfc.work);
+       flush_work(&wfc.work);
        return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
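[note] The rewrite above swaps a throwaway kthread per call for the system workqueue: schedule_work_on() queues onto the per-cpu bound kworker and flush_work() waits for completion, so no task creation can fail mid-call. A hypothetical caller (my_probe_fn and do_local_probe are assumptions):

    static long do_local_probe(void *arg);  /* hypothetical helper */

    static long my_probe_fn(void *arg)
    {
            /* runs in a kworker bound to the requested CPU */
            return do_local_probe(arg);
    }

    static int my_init(void)
    {
            long ret = work_on_cpu(2, my_probe_fn, NULL);

            return ret < 0 ? (int)ret : 0;
    }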
@@ -3772,7 +3867,8 @@ static int __init init_workqueues(void)
        unsigned int cpu;
        int i;
 
-       cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+       cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+       cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
        /* initialize gcwqs */
        for_each_gcwq_cpu(cpu) {
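[note] On the registrations above: notifier chains run callbacks in descending priority for every event, so registering the up callback high and the down callback low brackets all normal (priority 0) hotplug notifiers from both sides. A sketch of the assumed layout in include/linux/cpu.h for this era (the exact values are an assumption, matching mainline):

    enum {
            CPU_PRI_WORKQUEUE_UP    = 5,    /* runs before prio-0 notifiers */
            CPU_PRI_WORKQUEUE_DOWN  = -5,   /* runs after prio-0 notifiers  */
    };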
@@ -3821,8 +3917,11 @@ static int __init init_workqueues(void)
                                            WQ_UNBOUND_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE, 0);
+       system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
+                       WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
        BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
-              !system_unbound_wq || !system_freezable_wq);
+              !system_unbound_wq || !system_freezable_wq ||
+              !system_nrt_freezable_wq);
        return 0;
 }
 early_initcall(init_workqueues);
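[note] A hedged usage sketch for the new system workqueue (identifiers hypothetical): WQ_NON_REENTRANT guarantees at most one instance of a given work item runs system-wide, and WQ_FREEZABLE parks execution across suspend, which suits work that touches storage:

    static void sync_cache_fn(struct work_struct *work)
    {
            /* at most one instance runs system-wide (WQ_NON_REENTRANT),
             * and execution is deferred across suspend (WQ_FREEZABLE) */
    }
    static DECLARE_WORK(sync_cache_work, sync_cache_fn);

    /* somewhere in driver code: */
    queue_work(system_nrt_freezable_wq, &sync_cache_work);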