rcu: Optionally run grace-period kthreads at real-time priority

[pandora-kernel.git] / kernel / rcu / tree.c
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c

index 7680fc2..75ce123 100644 (file)
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
  static void invoke_rcu_core(void);
  static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
  
+/* rcuc/rcub and grace-period kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
  /*
   * Track the rcutorture test sequence number and the update version
   * number within a given test.  The rcutorture_testseq is incremented
@@ -759,39 +763,71 @@ void rcu_irq_enter(void)
  /**
   * rcu_nmi_enter - inform RCU of entry to NMI context
   *
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is active.
+ * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
+ * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active.  This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int.  (You will probably
+ * run out of stack space first.)
   */
  void rcu_nmi_enter(void)
  {
         struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+       int incby = 2;
  
-       if (rdtp->dynticks_nmi_nesting == 0 &&
-           (atomic_read(&rdtp->dynticks) & 0x1))
-               return;
-       rdtp->dynticks_nmi_nesting++;
-       smp_mb__before_atomic();  /* Force delay from prior write. */
-       atomic_inc(&rdtp->dynticks);
-       /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
-       smp_mb__after_atomic();  /* See above. */
-       WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+       /* Complain about underflow. */
+       WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
+
+       /*
+        * If idle from RCU viewpoint, atomically increment ->dynticks
+        * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+        * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
+        * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+        * to be in the outermost NMI handler that interrupted an RCU-idle
+        * period (observation due to Andy Lutomirski).
+        */
+       if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+               smp_mb__before_atomic();  /* Force delay from prior write. */
+               atomic_inc(&rdtp->dynticks);
+               /* atomic_inc() before later RCU read-side crit sects */
+               smp_mb__after_atomic();  /* See above. */
+               WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+               incby = 1;
+       }
+       rdtp->dynticks_nmi_nesting += incby;
+       barrier();
  }
  
  /**
   * rcu_nmi_exit - inform RCU of exit from NMI context
   *
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is no longer active.
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
   */
  void rcu_nmi_exit(void)
  {
         struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
  
-       if (rdtp->dynticks_nmi_nesting == 0 ||
-           --rdtp->dynticks_nmi_nesting != 0)
+       /*
+        * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+        * (We are exiting an NMI handler, so RCU better be paying attention
+        * to us!)
+        */
+       WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+       WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+
+       /*
+        * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+        * leave it in non-RCU-idle state.
+        */
+       if (rdtp->dynticks_nmi_nesting != 1) {
+               rdtp->dynticks_nmi_nesting -= 2;
                 return;
+       }
+
+       /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+       rdtp->dynticks_nmi_nesting = 0;
         /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
         smp_mb__before_atomic();  /* See above. */
         atomic_inc(&rdtp->dynticks);
@@ -898,6 +934,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
                 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
                 return 1;
         } else {
+               if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+                                rdp->mynode->gpnum))
+                       ACCESS_ONCE(rdp->gpwrap) = true;
                 return 0;
         }
  }
@@ -1011,6 +1050,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
         j1 = rcu_jiffies_till_stall_check();
         ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
         rsp->jiffies_resched = j + j1 / 2;
+       rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
  }
  
  /*
@@ -1033,11 +1073,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
         }
  }
  
-static void print_other_cpu_stall(struct rcu_state *rsp)
+static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
  {
         int cpu;
         long delta;
         unsigned long flags;
+       unsigned long gpa;
+       unsigned long j;
         int ndetected = 0;
         struct rcu_node *rnp = rcu_get_root(rsp);
         long totqlen = 0;
@@ -1090,10 +1132,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
         pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
                smp_processor_id(), (long)(jiffies - rsp->gp_start),
                (long)rsp->gpnum, (long)rsp->completed, totqlen);
-       if (ndetected == 0)
-               pr_err("INFO: Stall ended before state dump start\n");
-       else
+       if (ndetected) {
                 rcu_dump_cpu_stacks(rsp);
+       } else {
+               if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
+                   ACCESS_ONCE(rsp->completed) == gpnum) {
+                       pr_err("INFO: Stall ended before state dump start\n");
+               } else {
+                       j = jiffies;
+                       gpa = ACCESS_ONCE(rsp->gp_activity);
+                       pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+                              rsp->name, j - gpa, j, gpa,
+                              jiffies_till_next_fqs);
+                       /* In this case, the current CPU might be at fault. */
+                       sched_show_task(current);
+               }
+       }
  
         /* Complain about tasks blocking the grace period. */
  
@@ -1193,7 +1247,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
                    ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
  
                 /* They had a few time units to dump stack, so complain. */
-               print_other_cpu_stall(rsp);
+               print_other_cpu_stall(rsp, gpnum);
         }
  }
  
@@ -1530,7 +1584,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
         bool ret;
  
         /* Handle the ends of any preceding grace periods first. */
-       if (rdp->completed == rnp->completed) {
+       if (rdp->completed == rnp->completed &&
+           !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
  
                 /* No grace period end, so just accelerate recent callbacks. */
                 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1600,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
         }
  
-       if (rdp->gpnum != rnp->gpnum) {
+       if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
                 /*
                  * If the current grace period is waiting for this CPU,
                  * set up to detect a quiescent state, otherwise don't
@@ -1556,6 +1611,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                 rdp->passed_quiesce = 0;
                 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
                 zero_cpu_stall_ticks(rdp);
+               ACCESS_ONCE(rdp->gpwrap) = false;
         }
         return ret;
  }
@@ -1569,7 +1625,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
         local_irq_save(flags);
         rnp = rdp->mynode;
         if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-            rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+            rdp->completed == ACCESS_ONCE(rnp->completed) &&
+            !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
             !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
                 local_irq_restore(flags);
                 return;
@@ -1589,6 +1646,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
         struct rcu_data *rdp;
         struct rcu_node *rnp = rcu_get_root(rsp);
  
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
         rcu_bind_gp_kthread();
         raw_spin_lock_irq(&rnp->lock);
         smp_mb__after_unlock_lock();
@@ -1649,6 +1707,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
                                             rnp->grphi, rnp->qsmask);
                 raw_spin_unlock_irq(&rnp->lock);
                 cond_resched_rcu_qs();
+               ACCESS_ONCE(rsp->gp_activity) = jiffies;
         }
  
         mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1724,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
         unsigned long maxj;
         struct rcu_node *rnp = rcu_get_root(rsp);
  
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
         rsp->n_force_qs++;
         if (fqs_state == RCU_SAVE_DYNTICK) {
                 /* Collect dyntick-idle snapshots. */
@@ -1703,6 +1763,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
         struct rcu_data *rdp;
         struct rcu_node *rnp = rcu_get_root(rsp);
  
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
         raw_spin_lock_irq(&rnp->lock);
         smp_mb__after_unlock_lock();
         gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1800,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                 nocb += rcu_future_gp_cleanup(rsp, rnp);
                 raw_spin_unlock_irq(&rnp->lock);
                 cond_resched_rcu_qs();
+               ACCESS_ONCE(rsp->gp_activity) = jiffies;
         }
         rnp = rcu_get_root(rsp);
         raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1850,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                         if (rcu_gp_init(rsp))
                                 break;
                         cond_resched_rcu_qs();
+                       ACCESS_ONCE(rsp->gp_activity) = jiffies;
                         WARN_ON(signal_pending(current));
                         trace_rcu_grace_period(rsp->name,
                                                ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1894,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                                        ACCESS_ONCE(rsp->gpnum),
                                                        TPS("fqsend"));
                                 cond_resched_rcu_qs();
+                               ACCESS_ONCE(rsp->gp_activity) = jiffies;
                         } else {
                                 /* Deal with stray signal. */
                                 cond_resched_rcu_qs();
+                               ACCESS_ONCE(rsp->gp_activity) = jiffies;
                                 WARN_ON(signal_pending(current));
                                 trace_rcu_grace_period(rsp->name,
                                                        ACCESS_ONCE(rsp->gpnum),
@@ -2011,7 +2076,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
         raw_spin_lock_irqsave(&rnp->lock, flags);
         smp_mb__after_unlock_lock();
         if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
-           rnp->completed == rnp->gpnum) {
+           rnp->completed == rnp->gpnum || rdp->gpwrap) {
  
                 /*
                  * The grace period in which this quiescent state was
@@ -3135,7 +3200,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
         }
  
         /* Has a new RCU grace period started? */
-       if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+       if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+           unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
                 rdp->n_rp_gp_started++;
                 return 1;
         }
@@ -3535,17 +3601,35 @@ static int rcu_pm_notify(struct notifier_block *self,
  static int __init rcu_spawn_gp_kthread(void)
  {
         unsigned long flags;
+       int kthread_prio_in = kthread_prio;
         struct rcu_node *rnp;
         struct rcu_state *rsp;
+       struct sched_param sp;
         struct task_struct *t;
  
+       /* Force priority into range. */
+       if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+               kthread_prio = 1;
+       else if (kthread_prio < 0)
+               kthread_prio = 0;
+       else if (kthread_prio > 99)
+               kthread_prio = 99;
+       if (kthread_prio != kthread_prio_in)
+               pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
+                        kthread_prio, kthread_prio_in);
+
         rcu_scheduler_fully_active = 1;
         for_each_rcu_flavor(rsp) {
-               t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
+               t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
                 BUG_ON(IS_ERR(t));
                 rnp = rcu_get_root(rsp);
                 raw_spin_lock_irqsave(&rnp->lock, flags);
                 rsp->gp_kthread = t;
+               if (kthread_prio) {
+                       sp.sched_priority = kthread_prio;
+                       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+               }
+               wake_up_process(t);
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
         }
         rcu_spawn_nocb_kthreads();