timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC

[pandora-kernel.git] / kernel / time / timekeeping.c
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c

index dee23c9..8980fb7 100644 (file)
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -44,6 +44,22 @@ static struct {
  static DEFINE_RAW_SPINLOCK(timekeeper_lock);
  static struct timekeeper shadow_timekeeper;
  
+/**
+ * struct tk_fast - NMI safe timekeeper
+ * @seq:       Sequence counter for protecting updates. The lowest bit
+ *             is the index for the tk_read_base array
+ * @base:      tk_read_base array. Access is indexed by the lowest bit of
+ *             @seq.
+ *
+ * See update_fast_timekeeper() below.
+ */
+struct tk_fast {
+       seqcount_t              seq;
+       struct tk_read_base     base[2];
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned;
+
  /* flag for if timekeeping is suspended */
  int __read_mostly timekeeping_suspended;
  
@@ -210,6 +226,112 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
         return nsec + arch_gettimeoffset();
  }
  
+/**
+ * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
+ * @tk:                The timekeeper from which we take the update
+ *
+ * Updates tk_fast_mono (referred to as tkf below) from @tk->tkr.
+ *
+ * We want to use this from any context including NMI and tracing /
+ * instrumenting the timekeeping code itself.
+ *
+ * So we handle this differently than the other timekeeping accessor
+ * functions which retry when the sequence count has changed. The
+ * update side does:
+ *
+ * smp_wmb();  <- Ensure that the last base[1] update is visible
+ * tkf->seq++;
+ * smp_wmb();  <- Ensure that the seqcount update is visible
+ * update(tkf->base[0], tk);
+ * smp_wmb();  <- Ensure that the base[0] update is visible
+ * tkf->seq++;
+ * smp_wmb();  <- Ensure that the seqcount update is visible
+ * update(tkf->base[1], tk);
+ *
+ * The reader side does:
+ *
+ * do {
+ *     seq = tkf->seq;
+ *     smp_rmb();
+ *     idx = seq & 0x01;
+ *     now = now(tkf->base[idx]);
+ *     smp_rmb();
+ * } while (seq != tkf->seq)
+ *
+ * While base[0] is being updated, readers are forced off to
+ * base[1]. Once base[0] is updated readers are redirected to base[0]
+ * and the base[1] update takes place.
+ *
+ * So if a NMI hits the update of base[0] then it will use base[1]
+ * which is still consistent. In the worst case this can result in a
+ * slightly wrong timestamp (a few nanoseconds). See
+ * ktime_get_mono_fast_ns().
+ */
+static void update_fast_timekeeper(struct timekeeper *tk)
+{
+       struct tk_read_base *base = tk_fast_mono.base;
+
+       /* Force readers off to base[1] */
+       raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+       /* Update base[0] */
+       memcpy(base, &tk->tkr, sizeof(*base));
+
+       /* Force readers back to base[0] */
+       raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+       /* Update base[1] */
+       memcpy(base + 1, base, sizeof(*base));
+}
+
+/**
+ * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
+ *
+ * This timestamp is not guaranteed to be monotonic across an update.
+ * The timestamp is calculated by:
+ *
+ *     now = base_mono + clock_delta * slope
+ *
+ * So if the update lowers the slope, readers who are forced to the
+ * not yet updated second array are still using the old steeper slope.
+ *
+ * tmono
+ * ^
+ * |    o  n
+ * |   o n
+ * |  u
+ * | o
+ * |o
+ * |12345678---> reader order
+ *
+ * o = old slope
+ * u = update
+ * n = new slope
+ *
+ * So reader 6 will observe time going backwards versus reader 5.
+ *
+ * While other CPUs are likely to be able to observe that, the only way
+ * for a CPU local observation is when an NMI hits in the middle of
+ * the update. Timestamps taken from that NMI context might be ahead
+ * of the following timestamps. Callers need to be aware of that and
+ * deal with it.
+ */
+u64 notrace ktime_get_mono_fast_ns(void)
+{
+       struct tk_read_base *tkr;
+       unsigned int seq;
+       u64 now;
+
+       do {
+               seq = raw_read_seqcount(&tk_fast_mono.seq);
+               tkr = tk_fast_mono.base + (seq & 0x01);
+               now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+
+       } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
+       return now;
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+
  #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
  
  static inline void update_vsyscall(struct timekeeper *tk)
@@ -325,6 +447,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
         if (action & TK_MIRROR)
                 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
                        sizeof(tk_core.timekeeper));
+
+       update_fast_timekeeper(tk);
  }
  
  /**