[pandora-kernel.git] / kernel / rcutree.c
index 4b97bba..f280e54 100644
 
 /* Data structures. */
 
-static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
-
-#define RCU_STATE_INITIALIZER(structname) { \
-       .level = { &structname##_state.node[0] }, \
-       .levelcnt = { \
-               NUM_RCU_LVL_0,  /* root of hierarchy. */ \
-               NUM_RCU_LVL_1, \
-               NUM_RCU_LVL_2, \
-               NUM_RCU_LVL_3, \
-               NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
-       }, \
+static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+
+#define RCU_STATE_INITIALIZER(sname, cr) { \
+       .level = { &sname##_state.node[0] }, \
+       .call = cr, \
        .fqs_state = RCU_GP_IDLE, \
        .gpnum = -300, \
        .completed = -300, \
-       .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
-       .orphan_nxttail = &structname##_state.orphan_nxtlist, \
-       .orphan_donetail = &structname##_state.orphan_donelist, \
-       .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
-       .n_force_qs = 0, \
-       .n_force_qs_ngp = 0, \
-       .name = #structname, \
+       .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
+       .orphan_nxttail = &sname##_state.orphan_nxtlist, \
+       .orphan_donetail = &sname##_state.orphan_donelist, \
+       .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+       .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
+       .name = #sname, \
 }
 
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
+struct rcu_state rcu_sched_state =
+       RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
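For reference, here is roughly what RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched) expands to after this change; this is only an illustrative sketch derived from the macro above, not an additional hunk. Note that the ->levelcnt initializer is gone from the macro (rcu_init_one() now copies num_rcu_lvl[] into it at boot) and the redundant zero initializers for ->n_force_qs and ->n_force_qs_ngp were simply dropped.

	/* Illustrative expansion only -- not part of the patch. */
	struct rcu_state rcu_sched_state = {
		.level		 = { &rcu_sched_state.node[0] },
		.call		 = call_rcu_sched,	/* new: flavor-specific call_rcu() */
		.fqs_state	 = RCU_GP_IDLE,
		.gpnum		 = -300,
		.completed	 = -300,
		.onofflock	 = __RAW_SPIN_LOCK_UNLOCKED(&rcu_sched_state.onofflock),
		.orphan_nxttail	 = &rcu_sched_state.orphan_nxtlist,
		.orphan_donetail = &rcu_sched_state.orphan_donelist,
		.barrier_mutex	 = __MUTEX_INITIALIZER(rcu_sched_state.barrier_mutex),
		.fqslock	 = __RAW_SPIN_LOCK_UNLOCKED(&rcu_sched_state.fqslock),
		.name		 = "rcu_sched",
	};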
 
 static struct rcu_state *rcu_state;
+LIST_HEAD(rcu_struct_flavors);
+
+/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+module_param(rcu_fanout_leaf, int, 0);
+int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
+static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
+       NUM_RCU_LVL_0,
+       NUM_RCU_LVL_1,
+       NUM_RCU_LVL_2,
+       NUM_RCU_LVL_3,
+       NUM_RCU_LVL_4,
+};
+int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
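The new rcu_struct_flavors list is what for_each_rcu_flavor() walks; each flavor's rcu_state is linked onto it at the end of rcu_init_one() below. The iterator itself lives in the companion rcutree.h change (not shown here); presumably it is just a list_for_each_entry() wrapper along these lines (sketch only):

	/* Assumed definition, cf. the ->flavors list_head used in rcu_init_one(). */
	#define for_each_rcu_flavor(rsp) \
		list_for_each_entry((rsp), &rcu_struct_flavors, flavors)

This is what lets the later hunks replace the open-coded rcu_sched/rcu_bh/rcu_preempt triples (stall reset, softirq processing, hotplug handling, and so on) with a single loop.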
 
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 unsigned long rcutorture_testseq;
 unsigned long rcutorture_vernum;
 
-/* State information for rcu_barrier() and friends. */
-
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
@@ -358,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
                struct task_struct *idle = idle_task(smp_processor_id());
 
                trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
-               ftrace_dump(DUMP_ALL);
+               ftrace_dump(DUMP_ORIG);
                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
                          current->pid, current->comm,
                          idle->pid, idle->comm); /* must be idle task! */
@@ -468,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
 
                trace_rcu_dyntick("Error on exit: not idle task",
                                  oldval, rdtp->dynticks_nesting);
-               ftrace_dump(DUMP_ALL);
+               ftrace_dump(DUMP_ORIG);
                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
                          current->pid, current->comm,
                          idle->pid, idle->comm); /* must be idle task! */
@@ -585,8 +586,6 @@ void rcu_nmi_exit(void)
        WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 }
 
-#ifdef CONFIG_PROVE_RCU
-
 /**
  * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
  *
@@ -604,7 +603,7 @@ int rcu_is_cpu_idle(void)
 }
 EXPORT_SYMBOL(rcu_is_cpu_idle);
 
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
 
 /*
  * Is the current CPU online?  Disable preemption to avoid false positives
@@ -645,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
 }
 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
 
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-#endif /* #ifdef CONFIG_PROVE_RCU */
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
 
 /**
  * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -733,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
        int cpu;
        long delta;
        unsigned long flags;
-       int ndetected;
+       int ndetected = 0;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
        /* Only let one CPU complain about others per time interval. */
@@ -774,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
         */
        rnp = rcu_get_root(rsp);
        raw_spin_lock_irqsave(&rnp->lock, flags);
-       ndetected = rcu_print_task_stall(rnp);
+       ndetected += rcu_print_task_stall(rnp);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
        print_cpu_stall_info_end();
@@ -860,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
  */
 void rcu_cpu_stall_reset(void)
 {
-       rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
-       rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
-       rcu_preempt_stall_reset();
+       struct rcu_state *rsp;
+
+       for_each_rcu_flavor(rsp)
+               rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
 }
 
 static struct notifier_block rcu_panic_block = {
@@ -894,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
                if (rnp->qsmask & rdp->grpmask) {
                        rdp->qs_pending = 1;
                        rdp->passed_quiesce = 0;
-               } else
+               } else {
                        rdp->qs_pending = 0;
+               }
                zero_cpu_stall_ticks(rdp);
        }
 }
@@ -936,6 +935,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
        return ret;
 }
 
+/*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+       int i;
+
+       rdp->nxtlist = NULL;
+       for (i = 0; i < RCU_NEXT_SIZE; i++)
+               rdp->nxttail[i] = &rdp->nxtlist;
+}
+
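init_callback_list() resets the standard segmented callback list: ->nxtlist is one singly linked list of rcu_head structures, and the RCU_NEXT_SIZE tail pointers (RCU_DONE_TAIL, RCU_WAIT_TAIL, RCU_NEXT_READY_TAIL and RCU_NEXT_TAIL in rcutree.h) partition it by grace-period progress; on an empty list every tail pointer aims back at ->nxtlist itself. A self-contained toy sketch of the reset and of the enqueue step that __call_rcu() performs at the RCU_NEXT_TAIL segment (toy names; the real types are struct rcu_data and struct rcu_head):

	struct cb { struct cb *next; };

	#define SEG_NEXT	3	/* cf. RCU_NEXT_TAIL */
	#define SEG_COUNT	4	/* cf. RCU_NEXT_SIZE */

	struct cblist {
		struct cb *head;		/* cf. rdp->nxtlist */
		struct cb **tail[SEG_COUNT];	/* cf. rdp->nxttail[] */
	};

	static void cblist_init(struct cblist *l)	/* cf. init_callback_list() */
	{
		int i;

		l->head = NULL;
		for (i = 0; i < SEG_COUNT; i++)
			l->tail[i] = &l->head;	/* empty: every segment ends at head */
	}

	static void cblist_enqueue(struct cblist *l, struct cb *cb) /* cf. __call_rcu() */
	{
		cb->next = NULL;
		*l->tail[SEG_NEXT] = cb;	/* append to the list... */
		l->tail[SEG_NEXT] = &cb->next;	/* ...and advance the last tail */
	}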
 /*
  * Advance this CPU's callbacks, but only if the current grace period
  * has ended.  This may be called only from the CPU to whom the rdp
@@ -1328,8 +1339,6 @@ static void
 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
                          struct rcu_node *rnp, struct rcu_data *rdp)
 {
-       int i;
-
        /*
         * Orphan the callbacks.  First adjust the counts.  This is safe
         * because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1340,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
                rsp->qlen += rdp->qlen;
                rdp->n_cbs_orphaned += rdp->qlen;
                rdp->qlen_lazy = 0;
-               rdp->qlen = 0;
+               ACCESS_ONCE(rdp->qlen) = 0;
        }
 
        /*
@@ -1369,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
        }
 
        /* Finally, initialize the rcu_data structure's list to empty.  */
-       rdp->nxtlist = NULL;
-       for (i = 0; i < RCU_NEXT_SIZE; i++)
-               rdp->nxttail[i] = &rdp->nxtlist;
+       init_callback_list(rdp);
 }
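The plain "rdp->qlen = 0" store becomes ACCESS_ONCE(rdp->qlen) = 0 here (and the updates in rcu_do_batch() and __call_rcu() follow suit) because _rcu_barrier() now reads other CPUs' ->qlen locklessly in its per-CPU loop below. ACCESS_ONCE() is the usual volatile-cast helper from linux/compiler.h, which keeps the compiler from tearing, fusing, or re-fetching the access:

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))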
 
 /*
@@ -1505,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        if (need_report & RCU_OFL_TASKS_EXP_GP)
                rcu_report_exp_rnp(rsp, rnp, true);
+       WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
+                 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
+                 cpu, rdp->qlen, rdp->nxtlist);
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1592,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
        }
        smp_mb(); /* List handling before counting for rcu_barrier(). */
        rdp->qlen_lazy -= count_lazy;
-       rdp->qlen -= count;
+       ACCESS_ONCE(rdp->qlen) -= count;
        rdp->n_cbs_invoked += count;
 
        /* Reinstate batch limit if we have worked down the excess. */
@@ -1605,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
                rdp->n_force_qs_snap = rsp->n_force_qs;
        } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
                rdp->qlen_last_fqs_check = rdp->qlen;
+       WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
 
        local_irq_restore(flags);
 
@@ -1745,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                break; /* grace period idle or initializing, ignore. */
 
        case RCU_SAVE_DYNTICK:
-               if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
-                       break; /* So gcc recognizes the dead code. */
 
                raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
 
@@ -1788,9 +1797,10 @@ unlock_fqs_ret:
  * whom the rdp belongs.
  */
 static void
-__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+__rcu_process_callbacks(struct rcu_state *rsp)
 {
        unsigned long flags;
+       struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 
        WARN_ON_ONCE(rdp->beenonline == 0);
 
@@ -1826,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
+       struct rcu_state *rsp;
+
        trace_rcu_utilization("Start RCU core");
-       __rcu_process_callbacks(&rcu_sched_state,
-                               &__get_cpu_var(rcu_sched_data));
-       __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
-       rcu_preempt_process_callbacks();
+       for_each_rcu_flavor(rsp)
+               __rcu_process_callbacks(rsp);
        trace_rcu_utilization("End RCU core");
 }
 
@@ -1857,6 +1867,56 @@ static void invoke_rcu_core(void)
        raise_softirq(RCU_SOFTIRQ);
 }
 
+/*
+ * Handle any core-RCU processing required by a call_rcu() invocation.
+ */
+static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
+                           struct rcu_head *head, unsigned long flags)
+{
+       /*
+        * If called from an extended quiescent state, invoke the RCU
+        * core in order to force a re-evaluation of RCU's idleness.
+        */
+       if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
+               invoke_rcu_core();
+
+       /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
+       if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
+               return;
+
+       /*
+        * Force the grace period if too many callbacks or too long waiting.
+        * Enforce hysteresis, and don't invoke force_quiescent_state()
+        * if some other CPU has recently done so.  Also, don't bother
+        * invoking force_quiescent_state() if the newly enqueued callback
+        * is the only one waiting for a grace period to complete.
+        */
+       if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+
+               /* Are we ignoring a completed grace period? */
+               rcu_process_gp_end(rsp, rdp);
+               check_for_new_grace_period(rsp, rdp);
+
+               /* Start a new grace period if one not already started. */
+               if (!rcu_gp_in_progress(rsp)) {
+                       unsigned long nestflag;
+                       struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+                       raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+                       rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
+               } else {
+                       /* Give the grace period a kick. */
+                       rdp->blimit = LONG_MAX;
+                       if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+                           *rdp->nxttail[RCU_DONE_TAIL] != head)
+                               force_quiescent_state(rsp, 0);
+                       rdp->n_force_qs_snap = rsp->n_force_qs;
+                       rdp->qlen_last_fqs_check = rdp->qlen;
+               }
+       } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
+               force_quiescent_state(rsp, 1);
+}
+
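The hysteresis in __call_rcu_core() hinges on two per-CPU snapshots: ->qlen_last_fqs_check (the queue length the last time this CPU forced or started a grace period) and ->n_force_qs_snap (the global force_quiescent_state() count at that time). A worked sketch, assuming qhimark keeps its stock default of 10000 (the default is defined elsewhere in this file and is not part of this patch):

	/*
	 * Assume rdp->qlen_last_fqs_check == 50 and callbacks pile up until
	 * rdp->qlen exceeds 10050: the unlikely() branch runs, catches up on
	 * any completed grace period, then either starts a new one or kicks
	 * the current one, and finally records the new (large) qlen in
	 * qlen_last_fqs_check.
	 *
	 * Later rcu_do_batch() drains callbacks; once
	 *	rdp->qlen < rdp->qlen_last_fqs_check - qhimark
	 * it lowers qlen_last_fqs_check again, so a short burst of fresh
	 * callbacks does not immediately re-enter this slow path.
	 */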
 static void
 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
           struct rcu_state *rsp, bool lazy)
@@ -1881,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
        rdp = this_cpu_ptr(rsp->rda);
 
        /* Add the callback to our list. */
-       rdp->qlen++;
+       ACCESS_ONCE(rdp->qlen)++;
        if (lazy)
                rdp->qlen_lazy++;
        else
@@ -1896,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
        else
                trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
 
-       /* If interrupts were disabled, don't dive into RCU core. */
-       if (irqs_disabled_flags(flags)) {
-               local_irq_restore(flags);
-               return;
-       }
-
-       /*
-        * Force the grace period if too many callbacks or too long waiting.
-        * Enforce hysteresis, and don't invoke force_quiescent_state()
-        * if some other CPU has recently done so.  Also, don't bother
-        * invoking force_quiescent_state() if the newly enqueued callback
-        * is the only one waiting for a grace period to complete.
-        */
-       if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-
-               /* Are we ignoring a completed grace period? */
-               rcu_process_gp_end(rsp, rdp);
-               check_for_new_grace_period(rsp, rdp);
-
-               /* Start a new grace period if one not already started. */
-               if (!rcu_gp_in_progress(rsp)) {
-                       unsigned long nestflag;
-                       struct rcu_node *rnp_root = rcu_get_root(rsp);
-
-                       raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
-                       rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
-               } else {
-                       /* Give the grace period a kick. */
-                       rdp->blimit = LONG_MAX;
-                       if (rsp->n_force_qs == rdp->n_force_qs_snap &&
-                           *rdp->nxttail[RCU_DONE_TAIL] != head)
-                               force_quiescent_state(rsp, 0);
-                       rdp->n_force_qs_snap = rsp->n_force_qs;
-                       rdp->qlen_last_fqs_check = rdp->qlen;
-               }
-       } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
-               force_quiescent_state(rsp, 1);
+       /* Go handle any RCU core processing required. */
+       __call_rcu_core(rsp, rdp, head, flags);
        local_irq_restore(flags);
 }
 
@@ -1962,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
  * occasionally incorrectly indicate that there are multiple CPUs online
  * when there was in fact only one the whole time, as this just adds
  * some overhead: RCU still operates correctly.
- *
- * Of course, sampling num_online_cpus() with preemption enabled can
- * give erroneous results if there are concurrent CPU-hotplug operations.
- * For example, given a demonic sequence of preemptions in num_online_cpus()
- * and CPU-hotplug operations, there could be two or more CPUs online at
- * all times, but num_online_cpus() might well return one (or even zero).
- *
- * However, all such demonic sequences require at least one CPU-offline
- * operation.  Furthermore, rcu_blocking_is_gp() giving the wrong answer
- * is only a problem if there is an RCU read-side critical section executing
- * throughout.  But RCU-sched and RCU-bh read-side critical sections
- * disable either preemption or bh, which prevents a CPU from going offline.
- * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
- * that there is only one CPU when in fact there was more than one throughout
- * is when there were no RCU readers in the system.  If there are no
- * RCU readers, the grace period by definition can be of zero length,
- * regardless of the number of online CPUs.
  */
 static inline int rcu_blocking_is_gp(void)
 {
+       int ret;
+
        might_sleep();  /* Check for RCU read-side critical section. */
-       return num_online_cpus() <= 1;
+       preempt_disable();
+       ret = num_online_cpus() <= 1;
+       preempt_enable();
+       return ret;
 }
 
 /**
@@ -2118,9 +2131,9 @@ void synchronize_sched_expedited(void)
                put_online_cpus();
 
                /* No joy, try again later.  Or just synchronize_sched(). */
-               if (trycount++ < 10)
+               if (trycount++ < 10) {
                        udelay(trycount * num_online_cpus());
-               else {
+               } else {
                        synchronize_sched();
                        return;
                }
@@ -2241,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static int rcu_pending(int cpu)
 {
-       return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
-              __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
-              rcu_preempt_pending(cpu);
+       struct rcu_state *rsp;
+
+       for_each_rcu_flavor(rsp)
+               if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+                       return 1;
+       return 0;
 }
 
 /*
@@ -2253,20 +2269,41 @@ static int rcu_pending(int cpu)
  */
 static int rcu_cpu_has_callbacks(int cpu)
 {
+       struct rcu_state *rsp;
+
        /* RCU callbacks either ready or pending? */
-       return per_cpu(rcu_sched_data, cpu).nxtlist ||
-              per_cpu(rcu_bh_data, cpu).nxtlist ||
-              rcu_preempt_cpu_has_callbacks(cpu);
+       for_each_rcu_flavor(rsp)
+               if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
+                       return 1;
+       return 0;
+}
+
+/*
+ * Helper function for _rcu_barrier() tracing.  If tracing is disabled,
+ * the compiler is expected to optimize this away.
+ */
+static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
+                              int cpu, unsigned long done)
+{
+       trace_rcu_barrier(rsp->name, s, cpu,
+                         atomic_read(&rsp->barrier_cpu_count), done);
 }
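_rcu_barrier_trace() wraps a new trace_rcu_barrier tracepoint whose TRACE_EVENT definition belongs to the companion include/trace/events/rcu.h change, not to this file. Judging only from the call site above, its shape is presumably something like the following sketch (field layout and format string are assumptions, not the actual event definition):

	TRACE_EVENT(rcu_barrier,
		TP_PROTO(char *rcuname, char *s, int cpu, int cnt, unsigned long done),
		TP_ARGS(rcuname, s, cpu, cnt, done),
		TP_STRUCT__entry(
			__field(char *, rcuname)
			__field(char *, s)
			__field(int, cpu)
			__field(int, cnt)
			__field(unsigned long, done)
		),
		TP_fast_assign(
			__entry->rcuname = rcuname;
			__entry->s = s;
			__entry->cpu = cpu;
			__entry->cnt = cnt;
			__entry->done = done;
		),
		TP_printk("%s %s cpu %d remaining %d # %lu",
			  __entry->rcuname, __entry->s, __entry->cpu,
			  __entry->cnt, __entry->done)
	);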
 
 /*
  * RCU callback function for _rcu_barrier().  If we are last, wake
  * up the task executing _rcu_barrier().
  */
-static void rcu_barrier_callback(struct rcu_head *notused)
+static void rcu_barrier_callback(struct rcu_head *rhp)
 {
-       if (atomic_dec_and_test(&rcu_barrier_cpu_count))
-               complete(&rcu_barrier_completion);
+       struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
+       struct rcu_state *rsp = rdp->rsp;
+
+       if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
+               _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+               complete(&rsp->barrier_completion);
+       } else {
+               _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+       }
 }
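rcu_barrier_callback() no longer needs file-scope state: the rcu_head it receives is the ->barrier_head now embedded in each CPU's rcu_data, so container_of() recovers the enclosing rcu_data and, through the new ->rsp backpointer, the flavor whose barrier this is. The same embed-and-recover pattern in a self-contained toy form (hypothetical names, standard container_of arithmetic):

	#include <stddef.h>

	struct head { struct head *next; void (*func)(struct head *); };

	struct flavor;				/* stands in for struct rcu_state */
	struct pcpu_data {			/* stands in for struct rcu_data */
		long qlen;
		struct head barrier_head;	/* embedded callback, as above */
		struct flavor *rsp;		/* backpointer, as above */
	};

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	static void barrier_cb(struct head *rhp)
	{
		struct pcpu_data *pdp = container_of(rhp, struct pcpu_data,
						     barrier_head);
		struct flavor *rsp = pdp->rsp;	/* which flavor's barrier completed */

		(void)rsp;
	}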
 
 /*
@@ -2274,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
  */
 static void rcu_barrier_func(void *type)
 {
-       int cpu = smp_processor_id();
-       struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
-       void (*call_rcu_func)(struct rcu_head *head,
-                             void (*func)(struct rcu_head *head));
+       struct rcu_state *rsp = type;
+       struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 
-       atomic_inc(&rcu_barrier_cpu_count);
-       call_rcu_func = type;
-       call_rcu_func(head, rcu_barrier_callback);
+       _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+       atomic_inc(&rsp->barrier_cpu_count);
+       rsp->call(&rdp->barrier_head, rcu_barrier_callback);
 }
 
 /*
  * Orchestrate the specified type of RCU barrier, waiting for all
  * RCU callbacks of the specified type to complete.
  */
-static void _rcu_barrier(struct rcu_state *rsp,
-                        void (*call_rcu_func)(struct rcu_head *head,
-                                              void (*func)(struct rcu_head *head)))
+static void _rcu_barrier(struct rcu_state *rsp)
 {
        int cpu;
        unsigned long flags;
        struct rcu_data *rdp;
-       struct rcu_head rh;
+       struct rcu_data rd;
+       unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+       unsigned long snap_done;
 
-       init_rcu_head_on_stack(&rh);
+       init_rcu_head_on_stack(&rd.barrier_head);
+       _rcu_barrier_trace(rsp, "Begin", -1, snap);
 
        /* Take mutex to serialize concurrent rcu_barrier() requests. */
-       mutex_lock(&rcu_barrier_mutex);
+       mutex_lock(&rsp->barrier_mutex);
+
+       /*
+        * Ensure that all prior references, including to ->n_barrier_done,
+        * are ordered before the _rcu_barrier() machinery.
+        */
+       smp_mb();  /* See above block comment. */
+
+       /*
+        * Recheck ->n_barrier_done to see if others did our work for us.
+        * This means checking ->n_barrier_done for an even-to-odd-to-even
+        * transition.  The "if" expression below therefore rounds the old
+        * value up to the next even number and adds two before comparing.
+        */
+       snap_done = ACCESS_ONCE(rsp->n_barrier_done);
+       _rcu_barrier_trace(rsp, "Check", -1, snap_done);
+       if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
+               _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+               smp_mb(); /* caller's subsequent code after above check. */
+               mutex_unlock(&rsp->barrier_mutex);
+               return;
+       }
 
-       smp_mb();  /* Prevent any prior operations from leaking in. */
+       /*
+        * Increment ->n_barrier_done to avoid duplicate work.  Use
+        * ACCESS_ONCE() to prevent the compiler from speculating
+        * the increment to precede the early-exit check.
+        */
+       ACCESS_ONCE(rsp->n_barrier_done)++;
+       WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
+       _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
+       smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
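->n_barrier_done is even whenever no _rcu_barrier() is in flight and odd while one is running: the "Inc1" increment above takes it odd, and the "Inc2" increment near the end of _rcu_barrier() takes it back to even. The early-exit check therefore asks whether, since our snapshot, at least one complete barrier has both started and finished. Two worked values:

	/*
	 * snap == 4 (even: idle when we sampled):
	 *	((4 + 1) & ~0x1) + 2 == 6, so snap_done >= 6 suffices --
	 *	some other barrier went 4 -> 5 (start) -> 6 (done) after us.
	 *
	 * snap == 5 (odd: a barrier was already running when we sampled):
	 *	((5 + 1) & ~0x1) + 2 == 8, so we need snap_done >= 8.  The
	 *	in-flight barrier finishing (-> 6) is not enough, because it
	 *	may have started before our callbacks were queued; only a
	 *	barrier that went 6 -> 7 -> 8 is guaranteed to cover them.
	 */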
 
        /*
         * Initialize the count to one rather than to zero in order to
@@ -2321,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
         * 6.   Both rcu_barrier_callback() callbacks are invoked, awakening
         *      us -- but before CPU 1's orphaned callbacks are invoked!!!
         */
-       init_completion(&rcu_barrier_completion);
-       atomic_set(&rcu_barrier_cpu_count, 1);
+       init_completion(&rsp->barrier_completion);
+       atomic_set(&rsp->barrier_cpu_count, 1);
        raw_spin_lock_irqsave(&rsp->onofflock, flags);
        rsp->rcu_barrier_in_progress = current;
        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2338,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
                preempt_disable();
                rdp = per_cpu_ptr(rsp->rda, cpu);
                if (cpu_is_offline(cpu)) {
+                       _rcu_barrier_trace(rsp, "Offline", cpu,
+                                          rsp->n_barrier_done);
                        preempt_enable();
                        while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
                                schedule_timeout_interruptible(1);
                } else if (ACCESS_ONCE(rdp->qlen)) {
-                       smp_call_function_single(cpu, rcu_barrier_func,
-                                                (void *)call_rcu_func, 1);
+                       _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+                                          rsp->n_barrier_done);
+                       smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
                        preempt_enable();
                } else {
+                       _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+                                          rsp->n_barrier_done);
                        preempt_enable();
                }
        }
@@ -2362,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
        rcu_adopt_orphan_cbs(rsp);
        rsp->rcu_barrier_in_progress = NULL;
        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-       atomic_inc(&rcu_barrier_cpu_count);
+       atomic_inc(&rsp->barrier_cpu_count);
        smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
-       call_rcu_func(&rh, rcu_barrier_callback);
+       rd.rsp = rsp;
+       rsp->call(&rd.barrier_head, rcu_barrier_callback);
 
        /*
         * Now that we have an rcu_barrier_callback() callback on each
         * CPU, and thus each counted, remove the initial count.
         */
-       if (atomic_dec_and_test(&rcu_barrier_cpu_count))
-               complete(&rcu_barrier_completion);
+       if (atomic_dec_and_test(&rsp->barrier_cpu_count))
+               complete(&rsp->barrier_completion);
+
+       /* Increment ->n_barrier_done to prevent duplicate work. */
+       smp_mb(); /* Keep increment after above mechanism. */
+       ACCESS_ONCE(rsp->n_barrier_done)++;
+       WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
+       _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
+       smp_mb(); /* Keep increment before caller's subsequent code. */
 
        /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
-       wait_for_completion(&rcu_barrier_completion);
+       wait_for_completion(&rsp->barrier_completion);
 
        /* Other rcu_barrier() invocations can now safely proceed. */
-       mutex_unlock(&rcu_barrier_mutex);
+       mutex_unlock(&rsp->barrier_mutex);
 
-       destroy_rcu_head_on_stack(&rh);
+       destroy_rcu_head_on_stack(&rd.barrier_head);
 }
 
 /**
@@ -2387,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
  */
 void rcu_barrier_bh(void)
 {
-       _rcu_barrier(&rcu_bh_state, call_rcu_bh);
+       _rcu_barrier(&rcu_bh_state);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
 
@@ -2396,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
  */
 void rcu_barrier_sched(void)
 {
-       _rcu_barrier(&rcu_sched_state, call_rcu_sched);
+       _rcu_barrier(&rcu_sched_state);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
@@ -2407,18 +2485,15 @@ static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
        unsigned long flags;
-       int i;
        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
        struct rcu_node *rnp = rcu_get_root(rsp);
 
        /* Set up local state, ensuring consistent view of global state. */
        raw_spin_lock_irqsave(&rnp->lock, flags);
        rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
-       rdp->nxtlist = NULL;
-       for (i = 0; i < RCU_NEXT_SIZE; i++)
-               rdp->nxttail[i] = &rdp->nxtlist;
+       init_callback_list(rdp);
        rdp->qlen_lazy = 0;
-       rdp->qlen = 0;
+       ACCESS_ONCE(rdp->qlen) = 0;
        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
        WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
        WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2492,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 
 static void __cpuinit rcu_prepare_cpu(int cpu)
 {
-       rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
-       rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
-       rcu_preempt_init_percpu_data(cpu);
+       struct rcu_state *rsp;
+
+       for_each_rcu_flavor(rsp)
+               rcu_init_percpu_data(cpu, rsp,
+                                    strcmp(rsp->name, "rcu_preempt") == 0);
 }
 
 /*
@@ -2506,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
        long cpu = (long)hcpu;
        struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
        struct rcu_node *rnp = rdp->mynode;
+       struct rcu_state *rsp;
 
        trace_rcu_utilization("Start CPU hotplug");
        switch (action) {
@@ -2530,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
                 * touch any data without introducing corruption. We send the
                 * dying CPU's callbacks to an arbitrarily chosen online CPU.
                 */
-               rcu_cleanup_dying_cpu(&rcu_bh_state);
-               rcu_cleanup_dying_cpu(&rcu_sched_state);
-               rcu_preempt_cleanup_dying_cpu();
+               for_each_rcu_flavor(rsp)
+                       rcu_cleanup_dying_cpu(rsp);
                rcu_cleanup_after_idle(cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
-               rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
-               rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
-               rcu_preempt_cleanup_dead_cpu(cpu);
+               for_each_rcu_flavor(rsp)
+                       rcu_cleanup_dead_cpu(cpu, rsp);
                break;
        default:
                break;
@@ -2574,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
        int i;
 
-       for (i = NUM_RCU_LVLS - 1; i > 0; i--)
+       for (i = rcu_num_lvls - 1; i > 0; i--)
                rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-       rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
+       rsp->levelspread[0] = rcu_fanout_leaf;
 }
 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2586,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
        int i;
 
        cprv = NR_CPUS;
-       for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+       for (i = rcu_num_lvls - 1; i >= 0; i--) {
                ccur = rsp->levelcnt[i];
                rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
                cprv = ccur;
@@ -2613,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 
        /* Initialize the level-tracking arrays. */
 
-       for (i = 1; i < NUM_RCU_LVLS; i++)
+       for (i = 0; i < rcu_num_lvls; i++)
+               rsp->levelcnt[i] = num_rcu_lvl[i];
+       for (i = 1; i < rcu_num_lvls; i++)
                rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
        rcu_init_levelspread(rsp);
 
        /* Initialize the elements themselves, starting from the leaves. */
 
-       for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+       for (i = rcu_num_lvls - 1; i >= 0; i--) {
                cpustride *= rsp->levelspread[i];
                rnp = rsp->level[i];
                for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2649,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
        }
 
        rsp->rda = rda;
-       rnp = rsp->level[NUM_RCU_LVLS - 1];
+       rnp = rsp->level[rcu_num_lvls - 1];
        for_each_possible_cpu(i) {
                while (i > rnp->grphi)
                        rnp++;
                per_cpu_ptr(rsp->rda, i)->mynode = rnp;
                rcu_boot_init_percpu_data(i, rsp);
        }
+       list_add(&rsp->flavors, &rcu_struct_flavors);
+}
+
+/*
+ * Compute the rcu_node tree geometry from kernel parameters.  This cannot
+ * replace the definitions in rcutree.h because those are needed to size
+ * the ->node array in the rcu_state structure.
+ */
+static void __init rcu_init_geometry(void)
+{
+       int i;
+       int j;
+       int n = nr_cpu_ids;
+       int rcu_capacity[MAX_RCU_LVLS + 1];
+
+       /* If the compile-time values are accurate, just leave. */
+       if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
+               return;
+
+       /*
+        * Compute number of nodes that can be handled by an rcu_node tree
+        * with the given number of levels.  Setting rcu_capacity[0] makes
+        * some of the arithmetic easier.
+        */
+       rcu_capacity[0] = 1;
+       rcu_capacity[1] = rcu_fanout_leaf;
+       for (i = 2; i <= MAX_RCU_LVLS; i++)
+               rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+
+       /*
+        * The boot-time rcu_fanout_leaf parameter is only permitted
+        * to increase the leaf-level fanout, not decrease it.  Of course,
+        * the leaf-level fanout cannot exceed the number of bits in
+        * the rcu_node masks.  Finally, the tree must be able to accommodate
+        * the configured number of CPUs.  Complain and fall back to the
+        * compile-time values if these limits are exceeded.
+        */
+       if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+           rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
+           n > rcu_capacity[MAX_RCU_LVLS]) {
+               WARN_ON(1);
+               return;
+       }
+
+       /* Calculate the number of rcu_nodes at each level of the tree. */
+       for (i = 1; i <= MAX_RCU_LVLS; i++)
+               if (n <= rcu_capacity[i]) {
+                       for (j = 0; j <= i; j++)
+                               num_rcu_lvl[j] =
+                                       DIV_ROUND_UP(n, rcu_capacity[i - j]);
+                       rcu_num_lvls = i;
+                       for (j = i + 1; j <= MAX_RCU_LVLS; j++)
+                               num_rcu_lvl[j] = 0;
+                       break;
+               }
+
+       /* Calculate the total number of rcu_node structures. */
+       rcu_num_nodes = 0;
+       for (i = 0; i <= MAX_RCU_LVLS; i++)
+               rcu_num_nodes += num_rcu_lvl[i];
+       rcu_num_nodes -= n;
 }
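rcu_init_geometry() is a no-op unless rcu_fanout_leaf was raised above CONFIG_RCU_FANOUT_LEAF at boot, in which case it recomputes how many rcu_node structures each level needs. A worked example of the arithmetic under assumed values (CONFIG_RCU_FANOUT=64, CONFIG_RCU_FANOUT_LEAF=16, rcu_fanout_leaf boosted to 32, nr_cpu_ids=96):

	/*
	 * rcu_capacity[0] = 1
	 * rcu_capacity[1] = 32			(rcu_fanout_leaf)
	 * rcu_capacity[2] = 32 * 64 = 2048	(and so on up to MAX_RCU_LVLS)
	 *
	 * n = 96 first fits at i = 2 (96 <= 2048), so:
	 *	num_rcu_lvl[0] = DIV_ROUND_UP(96, rcu_capacity[2]) = 1	 (root)
	 *	num_rcu_lvl[1] = DIV_ROUND_UP(96, rcu_capacity[1]) = 3	 (leaves)
	 *	num_rcu_lvl[2] = DIV_ROUND_UP(96, rcu_capacity[0]) = 96	 (CPUs)
	 *	rcu_num_lvls   = 2
	 *
	 * The final loop sums 1 + 3 + 96 = 100 and subtracts n, because the
	 * last level counts CPUs rather than rcu_node structures, leaving
	 * rcu_num_nodes = 4 (one root plus three leaves).
	 */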
 
 void __init rcu_init(void)
@@ -2663,6 +2802,7 @@ void __init rcu_init(void)
        int cpu;
 
        rcu_bootup_announce();
+       rcu_init_geometry();
        rcu_init_one(&rcu_sched_state, &rcu_sched_data);
        rcu_init_one(&rcu_bh_state, &rcu_bh_data);
        __rcu_init_preempt();