Merge branch 'for-3.17' of git://linux-nfs.org/~bfields/linux
[pandora-kernel.git] kernel/rcu/tree_plugin.h
index 02ac0fb..00dc411 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -33,6 +33,7 @@
 #define RCU_KTHREAD_PRIO 1
 
 #ifdef CONFIG_RCU_BOOST
+#include "../locking/rtmutex_common.h"
 #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
 #else
 #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)
        unsigned long flags;
        struct list_head *np;
 #ifdef CONFIG_RCU_BOOST
-       struct rt_mutex *rbmp = NULL;
+       bool drop_boost_mutex = false;
 #endif /* #ifdef CONFIG_RCU_BOOST */
        struct rcu_node *rnp;
        int special;
@@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
                if (&t->rcu_node_entry == rnp->boost_tasks)
                        rnp->boost_tasks = np;
-               /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
-               if (t->rcu_boost_mutex) {
-                       rbmp = t->rcu_boost_mutex;
-                       t->rcu_boost_mutex = NULL;
-               }
+               /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
+               drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
                /*
@@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t)
 
 #ifdef CONFIG_RCU_BOOST
                /* Unboost if we were boosted. */
-               if (rbmp)
-                       rt_mutex_unlock(rbmp);
+               if (drop_boost_mutex) {
+                       rt_mutex_unlock(&rnp->boost_mtx);
+                       complete(&rnp->boost_completion);
+               }
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
                /*
@@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 
 /* Because preemptible RCU does not exist, no quieting of tasks. */
 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+       __releases(rnp->lock)
 {
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
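
The __releases(rnp->lock) annotation added here (and repeated below for both versions of rcu_initiate_boost()) is a sparse lock-context marker rather than anything the compiler acts on: it records that the function is entered with rnp->lock held and returns with it released. As a rough standalone illustration of the convention, the macro bodies below follow include/linux/compiler.h, while the pthread lock and the demo function are invented for this sketch and are not part of the patch:

    #include <pthread.h>

    #ifdef __CHECKER__                     /* only sparse sees these */
    # define __acquires(x)  __attribute__((context(x, 0, 1)))
    # define __releases(x)  __attribute__((context(x, 1, 0)))
    #else
    # define __acquires(x)
    # define __releases(x)
    #endif

    static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Called with demo_lock held; returns with it released, which is
     * the contract that __releases() lets sparse check. */
    static void finish_and_unlock(void) __releases(demo_lock)
    {
            pthread_mutex_unlock(&demo_lock);
    }

    int main(void)
    {
            pthread_mutex_lock(&demo_lock);
            finish_and_unlock();
            return 0;
    }

Sparse predefines __CHECKER__, so running it over a caller that forgets to take the lock first, or that returns with the lock still held, produces a "context imbalance" warning; a plain C compiler sees empty macros and ignores the annotation entirely.
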
@@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)
 static int rcu_boost(struct rcu_node *rnp)
 {
        unsigned long flags;
-       struct rt_mutex mtx;
        struct task_struct *t;
        struct list_head *tb;
 
@@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp)
         * section.
         */
        t = container_of(tb, struct task_struct, rcu_node_entry);
-       rt_mutex_init_proxy_locked(&mtx, t);
-       t->rcu_boost_mutex = &mtx;
+       rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+       init_completion(&rnp->boost_completion);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
-       rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
-       rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
+       /* Lock only for side effect: boosts task t's priority. */
+       rt_mutex_lock(&rnp->boost_mtx);
+       rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
+
+       /* Wait for boostee to be done with boost_mtx before reinitializing. */
+       wait_for_completion(&rnp->boost_completion);
 
        return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
               ACCESS_ONCE(rnp->boost_tasks) != NULL;
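
Taken together with the rcu_read_unlock_special() hunks above, the boosting handshake now works against a mutex embedded in the rcu_node: rcu_boost() proxy-locks rnp->boost_mtx on behalf of the blocked reader, blocks on it to lend the reader its priority, and then waits on rnp->boost_completion so that the shared mutex is not reinitialized while the boostee is still referencing it. The sketch below is a userspace analogy of that handshake only, written with POSIX threads and semaphores; it does not model rt_mutex priority inheritance or proxy locking, and every name in it is invented:

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    static pthread_mutex_t boost_mtx = PTHREAD_MUTEX_INITIALIZER;
    static sem_t mtx_owned;    /* boostee has taken boost_mtx        */
    static sem_t boost_done;   /* boostee is completely done with it */

    static void *boostee(void *arg)
    {
            (void)arg;
            /* rt_mutex_init_proxy_locked() makes the reader the owner
             * without it acquiring the lock itself; pthreads has no
             * proxy locking, so this stand-in reader takes the lock. */
            pthread_mutex_lock(&boost_mtx);
            sem_post(&mtx_owned);

            /* ... remainder of the read-side critical section ... */

            pthread_mutex_unlock(&boost_mtx); /* ~ rt_mutex_unlock(&rnp->boost_mtx) */
            sem_post(&boost_done);            /* ~ complete(&rnp->boost_completion) */
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            sem_init(&mtx_owned, 0, 0);
            sem_init(&boost_done, 0, 0);
            pthread_create(&t, NULL, boostee, NULL);
            sem_wait(&mtx_owned);

            /* ~ rt_mutex_lock()/rt_mutex_unlock() in rcu_boost(); with a
             * real rt_mutex the blocking acquisition is what lends the
             * booster's priority to the current owner. */
            pthread_mutex_lock(&boost_mtx);
            pthread_mutex_unlock(&boost_mtx);

            /* ~ wait_for_completion(&rnp->boost_completion): only now is
             * it safe to reuse the shared mutex for the next boost. */
            sem_wait(&boost_done);

            pthread_join(t, NULL);
            puts("boost handshake complete");
            return 0;
    }

Build with cc -pthread. The final sem_wait() plays the role of the new wait_for_completion() call: without it, the next rt_mutex_init_proxy_locked() on the same rnp->boost_mtx could race with the previous boostee's unlock.
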
@@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg)
  * about it going away.
  */
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+       __releases(rnp->lock)
 {
        struct task_struct *t;
 
@@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu)
 #else /* #ifdef CONFIG_RCU_BOOST */
 
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+       __releases(rnp->lock)
 {
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
@@ -2059,6 +2065,22 @@ bool rcu_is_nocb_cpu(int cpu)
 }
 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
+/*
+ * Kick the leader kthread for this NOCB group.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+       struct rcu_data *rdp_leader = rdp->nocb_leader;
+
+       if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+               return;
+       if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
+               /* Prior xchg orders against prior callback enqueue. */
+               ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
+               wake_up(&rdp_leader->nocb_wq);
+       }
+}
+
 /*
  * Enqueue the specified string of rcu_head structures onto the specified
  * CPU's no-CBs lists.  The CPU is specified by rdp, the head of the
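
wake_nocb_leader() treats ->nocb_leader_wake as a latch: the first caller that finds it clear sets it and pays for the wake_up(), later callers see it already set and return, and force overrides the latch for the callback-overflow path. A minimal userspace rendering of that idiom, using C11 atomics and a pthread condition variable standing in for the wait queue (all names and types here are illustrative, not kernel API):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct leader {
            atomic_bool     wake_pending;  /* ~ rdp_leader->nocb_leader_wake */
            pthread_mutex_t lock;
            pthread_cond_t  wq;            /* ~ rdp_leader->nocb_wq          */
    };

    /* ~ wake_nocb_leader(): only the first waker (or a forced one)
     * signals; later callers see wake_pending set and skip the wakeup. */
    static void wake_leader(struct leader *l, bool force)
    {
            if (!atomic_load(&l->wake_pending) || force) {
                    atomic_store(&l->wake_pending, true);
                    pthread_mutex_lock(&l->lock);
                    pthread_cond_signal(&l->wq);
                    pthread_mutex_unlock(&l->lock);
            }
    }

    int main(void)
    {
            static struct leader l = {
                    .lock = PTHREAD_MUTEX_INITIALIZER,
                    .wq   = PTHREAD_COND_INITIALIZER,
            };

            wake_leader(&l, false); /* empty-queue enqueue: signal the leader */
            wake_leader(&l, false); /* latch already set: no redundant signal */
            wake_leader(&l, true);  /* overflow path: force a signal anyway   */
            return 0;
    }

The toy leaves out what makes the real latch safe: in nocb_leader_wait() the leader clears ->nocb_leader_wake and then rescans every follower's ->nocb_head behind a memory barrier, so a wakeup can never be lost to the "already pending" short-circuit.
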
@@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
        len = atomic_long_read(&rdp->nocb_q_count);
        if (old_rhpp == &rdp->nocb_head) {
                if (!irqs_disabled_flags(flags)) {
-                       wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+                       /* ... if queue was empty ... */
+                       wake_nocb_leader(rdp, false);
                        trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                            TPS("WakeEmpty"));
                } else {
@@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
                }
                rdp->qlen_last_fqs_check = 0;
        } else if (len > rdp->qlen_last_fqs_check + qhimark) {
-               wake_up_process(t); /* ... or if many callbacks queued. */
+               /* ... or if many callbacks queued. */
+               wake_nocb_leader(rdp, true);
                rdp->qlen_last_fqs_check = LONG_MAX / 2;
                trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
        } else {
@@ -2212,14 +2236,151 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
        smp_mb(); /* Ensure that CB invocation happens after GP end. */
 }
 
+/*
+ * Leaders come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_leader_wait(struct rcu_data *my_rdp)
+{
+       bool firsttime = true;
+       bool gotcbs;
+       struct rcu_data *rdp;
+       struct rcu_head **tail;
+
+wait_again:
+
+       /* Wait for callbacks to appear. */
+       if (!rcu_nocb_poll) {
+               trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+               wait_event_interruptible(my_rdp->nocb_wq,
+                                        ACCESS_ONCE(my_rdp->nocb_leader_wake));
+               /* Memory barrier handled by smp_mb() calls below and repoll. */
+       } else if (firsttime) {
+               firsttime = false; /* Don't drown trace log with "Poll"! */
+               trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+       }
+
+       /*
+        * Each pass through the following loop checks a follower for CBs.
+        * We are our own first follower.  Any CBs found are moved to
+        * nocb_gp_head, where they await a grace period.
+        */
+       gotcbs = false;
+       for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+               rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+               if (!rdp->nocb_gp_head)
+                       continue;  /* No CBs here, try next follower. */
+
+               /* Move callbacks to wait-for-GP list, which is empty. */
+               ACCESS_ONCE(rdp->nocb_head) = NULL;
+               rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
+               rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
+               rdp->nocb_gp_count_lazy =
+                       atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
+               gotcbs = true;
+       }
+
+       /*
+        * If there were no callbacks, sleep a bit, rescan after a
+        * memory barrier, and go retry.
+        */
+       if (unlikely(!gotcbs)) {
+               if (!rcu_nocb_poll)
+                       trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+                                           "WokeEmpty");
+               flush_signals(current);
+               schedule_timeout_interruptible(1);
+
+               /* Rescan in case we were a victim of memory ordering. */
+               my_rdp->nocb_leader_wake = false;
+               smp_mb();  /* Ensure _wake false before scan. */
+               for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
+                       if (ACCESS_ONCE(rdp->nocb_head)) {
+                               /* Found CB, so short-circuit next wait. */
+                               my_rdp->nocb_leader_wake = true;
+                               break;
+                       }
+               goto wait_again;
+       }
+
+       /* Wait for one grace period. */
+       rcu_nocb_wait_gp(my_rdp);
+
+       /*
+        * We left ->nocb_leader_wake set to reduce cache thrashing.
+        * We clear it now, but recheck for new callbacks while
+        * traversing our follower list.
+        */
+       my_rdp->nocb_leader_wake = false;
+       smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
+
+       /* Each pass through the following loop wakes a follower, if needed. */
+       for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+               if (ACCESS_ONCE(rdp->nocb_head))
+                       my_rdp->nocb_leader_wake = true; /* No need to wait. */
+               if (!rdp->nocb_gp_head)
+                       continue; /* No CBs, so no need to wake follower. */
+
+               /* Append callbacks to follower's "done" list. */
+               tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+               *tail = rdp->nocb_gp_head;
+               atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
+               atomic_long_add(rdp->nocb_gp_count_lazy,
+                               &rdp->nocb_follower_count_lazy);
+               if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
+                       /*
+                        * List was empty, wake up the follower.
+                        * Memory barriers supplied by atomic_long_add().
+                        */
+                       wake_up(&rdp->nocb_wq);
+               }
+       }
+
+       /* If we (the leader) don't have CBs, go wait some more. */
+       if (!my_rdp->nocb_follower_head)
+               goto wait_again;
+}
+
+/*
+ * Followers come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_follower_wait(struct rcu_data *rdp)
+{
+       bool firsttime = true;
+
+       for (;;) {
+               if (!rcu_nocb_poll) {
+                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                           "FollowerSleep");
+                       wait_event_interruptible(rdp->nocb_wq,
+                                                ACCESS_ONCE(rdp->nocb_follower_head));
+               } else if (firsttime) {
+                       /* Don't drown trace log with "Poll"! */
+                       firsttime = false;
+                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
+               }
+               if (smp_load_acquire(&rdp->nocb_follower_head)) {
+                       /* ^^^ Ensure CB invocation follows _head test. */
+                       return;
+               }
+               if (!rcu_nocb_poll)
+                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                           "WokeEmpty");
+               flush_signals(current);
+               schedule_timeout_interruptible(1);
+       }
+}
+
 /*
  * Per-rcu_data kthread, but only for no-CBs CPUs.  Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU.
+ * callbacks queued by the corresponding no-CBs CPU.  However, there is
+ * an optional leader-follower relationship so that the grace-period
+ * kthreads don't have to do quite so many wakeups.
  */
 static int rcu_nocb_kthread(void *arg)
 {
        int c, cl;
-       bool firsttime = 1;
        struct rcu_head *list;
        struct rcu_head *next;
        struct rcu_head **tail;
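
Underneath the new leader/follower machinery is the same tail-pointer callback list the no-CBs code has always used: ->nocb_head points at the first callback and ->nocb_tail points at the pointer to fill in next, so producers append with an xchg() on the tail and the leader detaches a whole queue by clearing the head and xchg()ing the tail back to &head; the same splice trick later appends the detached segment to a follower's ->nocb_follower_head list. The standalone, single-threaded sketch below shows only that pointer discipline; it omits the counters, the ordering guarantees the kernel gets from xchg(), and the transient head/tail window a concurrent enqueuer can open, and all names in it are invented:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct cb {
            struct cb *next;
            int        id;
    };

    struct cblist {
            struct cb             *head;  /* ~ rdp->nocb_head */
            _Atomic(struct cb **)  tail;  /* ~ rdp->nocb_tail */
    };

    static void cblist_init(struct cblist *l)
    {
            l->head = NULL;
            atomic_store(&l->tail, &l->head);
    }

    /* ~ __call_rcu_nocb_enqueue(): swing the tail to the new element's
     * ->next pointer, then store the element through the old tail. */
    static void cblist_enqueue(struct cblist *l, struct cb *c)
    {
            struct cb **old_tail;

            c->next = NULL;
            old_tail = atomic_exchange(&l->tail, &c->next);
            *old_tail = c;
    }

    /* ~ the leader's snapshot in nocb_leader_wait(): detach everything
     * queued so far and leave the list empty.  The kernel also keeps
     * the returned old tail (->nocb_gp_tail) so it knows where the
     * detached segment ends when splicing it onto a follower's list. */
    static struct cb *cblist_detach_all(struct cblist *l)
    {
            struct cb *list = l->head;

            l->head = NULL;
            atomic_exchange(&l->tail, &l->head);
            return list;
    }

    int main(void)
    {
            struct cblist l;
            struct cb a = { .id = 1 }, b = { .id = 2 };
            struct cb *c;

            cblist_init(&l);
            cblist_enqueue(&l, &a);
            cblist_enqueue(&l, &b);
            for (c = cblist_detach_all(&l); c; c = c->next)
                    printf("invoking callback %d\n", c->id);
            return 0;
    }
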
@@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg)
 
        /* Each pass through this loop invokes one batch of callbacks */
        for (;;) {
-               /* If not polling, wait for next batch of callbacks. */
-               if (!rcu_nocb_poll) {
-                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
-                                           TPS("Sleep"));
-                       wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
-                       /* Memory barrier provide by xchg() below. */
-               } else if (firsttime) {
-                       firsttime = 0;
-                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
-                                           TPS("Poll"));
-               }
-               list = ACCESS_ONCE(rdp->nocb_head);
-               if (!list) {
-                       if (!rcu_nocb_poll)
-                               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
-                                                   TPS("WokeEmpty"));
-                       schedule_timeout_interruptible(1);
-                       flush_signals(current);
-                       continue;
-               }
-               firsttime = 1;
-               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
-                                   TPS("WokeNonEmpty"));
-
-               /*
-                * Extract queued callbacks, update counts, and wait
-                * for a grace period to elapse.
-                */
-               ACCESS_ONCE(rdp->nocb_head) = NULL;
-               tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
-               c = atomic_long_xchg(&rdp->nocb_q_count, 0);
-               cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
-               ACCESS_ONCE(rdp->nocb_p_count) += c;
-               ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
-               rcu_nocb_wait_gp(rdp);
+               /* Wait for callbacks. */
+               if (rdp->nocb_leader == rdp)
+                       nocb_leader_wait(rdp);
+               else
+                       nocb_follower_wait(rdp);
+
+               /* Pull the ready-to-invoke callbacks onto local list. */
+               list = ACCESS_ONCE(rdp->nocb_follower_head);
+               BUG_ON(!list);
+               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
+               ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+               tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+               c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
+               cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
+               rdp->nocb_p_count += c;
+               rdp->nocb_p_count_lazy += cl;
 
                /* Each pass through the following loop invokes a callback. */
                trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
        if (!rcu_nocb_need_deferred_wakeup(rdp))
                return;
        ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
-       wake_up(&rdp->nocb_wq);
+       wake_nocb_leader(rdp, false);
        trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
 }
 
@@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 {
        rdp->nocb_tail = &rdp->nocb_head;
        init_waitqueue_head(&rdp->nocb_wq);
+       rdp->nocb_follower_tail = &rdp->nocb_follower_head;
 }
 
-/* Create a kthread for each RCU flavor for each no-CBs CPU. */
+/* How many follower CPU IDs per leader?  Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_leader_stride = -1;
+module_param(rcu_nocb_leader_stride, int, 0444);
+
+/*
+ * Create a kthread for each RCU flavor for each no-CBs CPU.
+ * Also initialize leader-follower relationships.
+ */
 static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
 {
        int cpu;
+       int ls = rcu_nocb_leader_stride;
+       int nl = 0;  /* Next leader. */
        struct rcu_data *rdp;
+       struct rcu_data *rdp_leader = NULL;  /* Suppress misguided gcc warn. */
+       struct rcu_data *rdp_prev = NULL;
        struct task_struct *t;
 
        if (rcu_nocb_mask == NULL)
                return;
+#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
+       if (tick_nohz_full_running)
+               cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
+       if (ls == -1) {
+               ls = int_sqrt(nr_cpu_ids);
+               rcu_nocb_leader_stride = ls;
+       }
+
+       /*
+        * Each pass through this loop sets up one rcu_data structure and
+        * spawns one rcu_nocb_kthread().
+        */
        for_each_cpu(cpu, rcu_nocb_mask) {
                rdp = per_cpu_ptr(rsp->rda, cpu);
+               if (rdp->cpu >= nl) {
+                       /* New leader, set up for followers & next leader. */
+                       nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+                       rdp->nocb_leader = rdp;
+                       rdp_leader = rdp;
+               } else {
+                       /* Another follower, link to previous leader. */
+                       rdp->nocb_leader = rdp_leader;
+                       rdp_prev->nocb_next_follower = rdp;
+               }
+               rdp_prev = rdp;
+
+               /* Spawn the kthread for this CPU. */
                t = kthread_run(rcu_nocb_kthread, rdp,
                                "rcuo%c/%d", rsp->abbr, cpu);
                BUG_ON(IS_ERR(t));
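
The leader assignment walks the no-CBs CPUs in increasing order and carves the CPU ID space into aligned blocks of rcu_nocb_leader_stride IDs (sqrt(nr_cpu_ids) by default): the first no-CBs CPU encountered in each block becomes a leader, and every later no-CBs CPU in the same block is chained onto it as a follower. A small freestanding demo of just that arithmetic, using a made-up CPU mask and nr_cpu_ids:

    #include <math.h>
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
            int nocb_cpus[] = { 0, 1, 3, 4, 6, 9, 10, 15 };  /* example no-CBs mask */
            int nr_cpu_ids = 16;
            int ls = (int)sqrt(nr_cpu_ids);  /* ~ int_sqrt(nr_cpu_ids), here 4 */
            int nl = 0;                      /* boundary of the next leader    */
            int leader = -1;

            for (unsigned int i = 0; i < sizeof(nocb_cpus) / sizeof(nocb_cpus[0]); i++) {
                    int cpu = nocb_cpus[i];

                    if (cpu >= nl) {         /* first no-CBs CPU in this block */
                            nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
                            leader = cpu;
                    }
                    printf("cpu %2d -> leader %2d\n", cpu, leader);
            }
            return 0;
    }

Built with cc demo.c -lm, the example reports leaders 0, 4, 9 and 15, one for each block of four CPU IDs that actually contains a no-CBs CPU, which is exactly how the kthreads spawned above end up grouped.
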
@@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
  */
 static void rcu_bind_gp_kthread(void)
 {
-#ifdef CONFIG_NO_HZ_FULL
-       int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+       int __maybe_unused cpu;
 
-       if (cpu < 0 || cpu >= nr_cpu_ids)
+       if (!tick_nohz_full_enabled())
                return;
-       if (raw_smp_processor_id() != cpu)
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+       cpu = tick_do_timer_cpu;
+       if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
                set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#endif /* #ifdef CONFIG_NO_HZ_FULL */
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+       if (!is_housekeeping_cpu(raw_smp_processor_id()))
+               housekeeping_affine(current);
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 }