rcu: Optionally run grace-period kthreads at real-time priority
[pandora-kernel.git] / kernel / rcu / tree.c
index 7680fc2..75ce123 100644 (file)
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
 /*
  * Track the rcutorture test sequence number and the update version
  * number within a given test.  The rcutorture_testseq is incremented
@@ -759,39 +763,71 @@ void rcu_irq_enter(void)
 /**
  * rcu_nmi_enter - inform RCU of entry to NMI context
  *
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is active.
+ * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
+ * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active.  This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int.  (You will probably
+ * run out of stack space first.)
  */
 void rcu_nmi_enter(void)
 {
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+       int incby = 2;
 
-       if (rdtp->dynticks_nmi_nesting == 0 &&
-           (atomic_read(&rdtp->dynticks) & 0x1))
-               return;
-       rdtp->dynticks_nmi_nesting++;
-       smp_mb__before_atomic();  /* Force delay from prior write. */
-       atomic_inc(&rdtp->dynticks);
-       /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
-       smp_mb__after_atomic();  /* See above. */
-       WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+       /* Complain about underflow. */
+       WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
+
+       /*
+        * If idle from RCU viewpoint, atomically increment ->dynticks
+        * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+        * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
+        * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+        * to be in the outermost NMI handler that interrupted an RCU-idle
+        * period (observation due to Andy Lutomirski).
+        */
+       if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+               smp_mb__before_atomic();  /* Force delay from prior write. */
+               atomic_inc(&rdtp->dynticks);
+               /* atomic_inc() before later RCU read-side crit sects */
+               smp_mb__after_atomic();  /* See above. */
+               WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+               incby = 1;
+       }
+       rdtp->dynticks_nmi_nesting += incby;
+       barrier();
 }
 
 /**
  * rcu_nmi_exit - inform RCU of exit from NMI context
  *
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is no longer active.
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
  */
 void rcu_nmi_exit(void)
 {
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
-       if (rdtp->dynticks_nmi_nesting == 0 ||
-           --rdtp->dynticks_nmi_nesting != 0)
+       /*
+        * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+        * (We are exiting an NMI handler, so RCU better be paying attention
+        * to us!)
+        */
+       WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+       WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+
+       /*
+        * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+        * leave it in non-RCU-idle state.
+        */
+       if (rdtp->dynticks_nmi_nesting != 1) {
+               rdtp->dynticks_nmi_nesting -= 2;
                return;
+       }
+
+       /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+       rdtp->dynticks_nmi_nesting = 0;
        /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
        smp_mb__before_atomic();  /* See above. */
        atomic_inc(&rdtp->dynticks);
@@ -898,6 +934,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
                trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
                return 1;
        } else {
+               if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+                                rdp->mynode->gpnum))
+                       ACCESS_ONCE(rdp->gpwrap) = true;
                return 0;
        }
 }
@@ -1011,6 +1050,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
        j1 = rcu_jiffies_till_stall_check();
        ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
        rsp->jiffies_resched = j + j1 / 2;
+       rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
 }
 
 /*
@@ -1033,11 +1073,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
        }
 }
 
-static void print_other_cpu_stall(struct rcu_state *rsp)
+static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 {
        int cpu;
        long delta;
        unsigned long flags;
+       unsigned long gpa;
+       unsigned long j;
        int ndetected = 0;
        struct rcu_node *rnp = rcu_get_root(rsp);
        long totqlen = 0;
@@ -1090,10 +1132,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
        pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
               smp_processor_id(), (long)(jiffies - rsp->gp_start),
               (long)rsp->gpnum, (long)rsp->completed, totqlen);
-       if (ndetected == 0)
-               pr_err("INFO: Stall ended before state dump start\n");
-       else
+       if (ndetected) {
                rcu_dump_cpu_stacks(rsp);
+       } else {
+               if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
+                   ACCESS_ONCE(rsp->completed) == gpnum) {
+                       pr_err("INFO: Stall ended before state dump start\n");
+               } else {
+                       j = jiffies;
+                       gpa = ACCESS_ONCE(rsp->gp_activity);
+                       pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+                              rsp->name, j - gpa, j, gpa,
+                              jiffies_till_next_fqs);
+                       /* In this case, the current CPU might be at fault. */
+                       sched_show_task(current);
+               }
+       }
 
        /* Complain about tasks blocking the grace period. */
 
@@ -1193,7 +1247,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
                   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
 
                /* They had a few time units to dump stack, so complain. */
-               print_other_cpu_stall(rsp);
+               print_other_cpu_stall(rsp, gpnum);
        }
 }
 
@@ -1530,7 +1584,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
        bool ret;
 
        /* Handle the ends of any preceding grace periods first. */
-       if (rdp->completed == rnp->completed) {
+       if (rdp->completed == rnp->completed &&
+           !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
 
                /* No grace period end, so just accelerate recent callbacks. */
                ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1600,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
        }
 
-       if (rdp->gpnum != rnp->gpnum) {
+       if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
                /*
                 * If the current grace period is waiting for this CPU,
                 * set up to detect a quiescent state, otherwise don't
@@ -1556,6 +1611,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                rdp->passed_quiesce = 0;
                rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
                zero_cpu_stall_ticks(rdp);
+               ACCESS_ONCE(rdp->gpwrap) = false;
        }
        return ret;
 }
@@ -1569,7 +1625,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
        local_irq_save(flags);
        rnp = rdp->mynode;
        if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-            rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+            rdp->completed == ACCESS_ONCE(rnp->completed) &&
+            !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
            !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
                local_irq_restore(flags);
                return;
@@ -1589,6 +1646,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
        rcu_bind_gp_kthread();
        raw_spin_lock_irq(&rnp->lock);
        smp_mb__after_unlock_lock();
@@ -1649,6 +1707,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
                                            rnp->grphi, rnp->qsmask);
                raw_spin_unlock_irq(&rnp->lock);
                cond_resched_rcu_qs();
+               ACCESS_ONCE(rsp->gp_activity) = jiffies;
        }
 
        mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1724,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
        unsigned long maxj;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
        rsp->n_force_qs++;
        if (fqs_state == RCU_SAVE_DYNTICK) {
                /* Collect dyntick-idle snapshots. */
@@ -1703,6 +1763,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
        raw_spin_lock_irq(&rnp->lock);
        smp_mb__after_unlock_lock();
        gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1800,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                nocb += rcu_future_gp_cleanup(rsp, rnp);
                raw_spin_unlock_irq(&rnp->lock);
                cond_resched_rcu_qs();
+               ACCESS_ONCE(rsp->gp_activity) = jiffies;
        }
        rnp = rcu_get_root(rsp);
        raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1850,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                        if (rcu_gp_init(rsp))
                                break;
                        cond_resched_rcu_qs();
+                       ACCESS_ONCE(rsp->gp_activity) = jiffies;
                        WARN_ON(signal_pending(current));
                        trace_rcu_grace_period(rsp->name,
                                               ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1894,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                                       ACCESS_ONCE(rsp->gpnum),
                                                       TPS("fqsend"));
                                cond_resched_rcu_qs();
+                               ACCESS_ONCE(rsp->gp_activity) = jiffies;
                        } else {
                                /* Deal with stray signal. */
                                cond_resched_rcu_qs();
+                               ACCESS_ONCE(rsp->gp_activity) = jiffies;
                                WARN_ON(signal_pending(current));
                                trace_rcu_grace_period(rsp->name,
                                                       ACCESS_ONCE(rsp->gpnum),
@@ -2011,7 +2076,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();
        if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
-           rnp->completed == rnp->gpnum) {
+           rnp->completed == rnp->gpnum || rdp->gpwrap) {
 
                /*
                 * The grace period in which this quiescent state was
@@ -3135,7 +3200,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
        }
 
        /* Has a new RCU grace period started? */
-       if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+       if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+           unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
                rdp->n_rp_gp_started++;
                return 1;
        }
@@ -3535,17 +3601,35 @@ static int rcu_pm_notify(struct notifier_block *self,
 static int __init rcu_spawn_gp_kthread(void)
 {
        unsigned long flags;
+       int kthread_prio_in = kthread_prio;
        struct rcu_node *rnp;
        struct rcu_state *rsp;
+       struct sched_param sp;
        struct task_struct *t;
 
+       /* Force priority into range. */
+       if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+               kthread_prio = 1;
+       else if (kthread_prio < 0)
+               kthread_prio = 0;
+       else if (kthread_prio > 99)
+               kthread_prio = 99;
+       if (kthread_prio != kthread_prio_in)
+               pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
+                        kthread_prio, kthread_prio_in);
+
        rcu_scheduler_fully_active = 1;
        for_each_rcu_flavor(rsp) {
-               t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
+               t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
                BUG_ON(IS_ERR(t));
                rnp = rcu_get_root(rsp);
                raw_spin_lock_irqsave(&rnp->lock, flags);
                rsp->gp_kthread = t;
+               if (kthread_prio) {
+                       sp.sched_priority = kthread_prio;
+                       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+               }
+               wake_up_process(t);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
        rcu_spawn_nocb_kthreads();