Merge branch 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 28 Oct 2010 01:48:00 +0000 (18:48 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 28 Oct 2010 01:48:00 +0000 (18:48 -0700)
* 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (50 commits)
  perf python scripting: Add futex-contention script
  perf python scripting: Fixup cut'n'paste error in sctop script
  perf scripting: Shut up 'perf record' final status
  perf record: Remove newline character from perror() argument
  perf python scripting: Support fedora 11 (audit 1.7.17)
  perf python scripting: Improve the syscalls-by-pid script
  perf python scripting: print the syscall name on sctop
  perf python scripting: Improve the syscalls-counts script
  perf python scripting: Improve the failed-syscalls-by-pid script
  kprobes: Remove redundant text_mutex lock in optimize
  x86/oprofile: Fix uninitialized variable use in debug printk
  tracing: Fix 'faild' -> 'failed' typo
  perf probe: Fix format specified for Dwarf_Off parameter
  perf trace: Fix detection of script extension
  perf trace: Use $PERF_EXEC_PATH in canned report scripts
  perf tools: Document event modifiers
  perf tools: Remove direct slang.h include
  perf_events: Fix for transaction recovery in group_sched_in()
  perf_events: Revert: Fix transaction recovery in group_sched_in()
  perf, x86: Use NUMA aware allocations for PEBS/BTS/DS allocations
  ...

arch/x86/include/asm/msr-index.h
arch/x86/kernel/cpu/perf_event.c
include/linux/interrupt.h
kernel/kprobes.c
kernel/softirq.c
kernel/trace/ring_buffer.c

diff --combined arch/x86/include/asm/msr-index.h
  #define MSR_AMD64_IBSDCLINAD          0xc0011038
  #define MSR_AMD64_IBSDCPHYSAD         0xc0011039
  #define MSR_AMD64_IBSCTL              0xc001103a
+ #define MSR_AMD64_IBSBRTARGET         0xc001103b
  
  /* Fam 10h MSRs */
  #define MSR_FAM10H_MMIO_CONF_BASE     0xc0010058
  #define MSR_IA32_TSC                  0x00000010
  #define MSR_IA32_PLATFORM_ID          0x00000017
  #define MSR_IA32_EBL_CR_POWERON               0x0000002a
 +#define MSR_EBC_FREQUENCY_ID          0x0000002c
  #define MSR_IA32_FEATURE_CONTROL        0x0000003a
  
  #define FEATURE_CONTROL_LOCKED                                (1<<0)
diff --combined arch/x86/kernel/cpu/perf_event.c
@@@ -49,6 -49,7 +49,6 @@@ static unsigned lon
  copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
  {
        unsigned long offset, addr = (unsigned long)from;
 -      int type = in_nmi() ? KM_NMI : KM_IRQ0;
        unsigned long size, len = 0;
        struct page *page;
        void *map;
@@@ -62,9 -63,9 +62,9 @@@
                offset = addr & (PAGE_SIZE - 1);
                size = min(PAGE_SIZE - offset, n - len);
  
 -              map = kmap_atomic(page, type);
 +              map = kmap_atomic(page);
                memcpy(to, map+offset, size);
 -              kunmap_atomic(map, type);
 +              kunmap_atomic(map);
                put_page(page);
  
                len  += size;
@@@ -237,6 -238,7 +237,7 @@@ struct x86_pmu 
         * Intel DebugStore bits
         */
        int             bts, pebs;
+       int             bts_active, pebs_active;
        int             pebs_record_size;
        void            (*drain_pebs)(struct pt_regs *regs);
        struct event_constraint *pebs_constraints;
@@@ -380,7 -382,7 +381,7 @@@ static void release_pmc_hardware(void) 
  
  #endif
  
- static int reserve_ds_buffers(void);
+ static void reserve_ds_buffers(void);
  static void release_ds_buffers(void);
  
  static void hw_perf_event_destroy(struct perf_event *event)
@@@ -477,7 -479,7 +478,7 @@@ static int x86_setup_perfctr(struct per
        if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
            (hwc->sample_period == 1)) {
                /* BTS is not supported by this architecture. */
-               if (!x86_pmu.bts)
+               if (!x86_pmu.bts_active)
                        return -EOPNOTSUPP;
  
                /* BTS is currently only allowed for user-mode. */
@@@ -496,12 -498,13 +497,13 @@@ static int x86_pmu_hw_config(struct per
                int precise = 0;
  
                /* Support for constant skid */
-               if (x86_pmu.pebs)
+               if (x86_pmu.pebs_active) {
                        precise++;
  
-               /* Support for IP fixup */
-               if (x86_pmu.lbr_nr)
-                       precise++;
+                       /* Support for IP fixup */
+                       if (x86_pmu.lbr_nr)
+                               precise++;
+               }
  
                if (event->attr.precise_ip > precise)
                        return -EOPNOTSUPP;
@@@ -543,11 -546,8 +545,8 @@@ static int __x86_pmu_event_init(struct 
                if (atomic_read(&active_events) == 0) {
                        if (!reserve_pmc_hardware())
                                err = -EBUSY;
-                       else {
-                               err = reserve_ds_buffers();
-                               if (err)
-                                       release_pmc_hardware();
-                       }
+                       else
+                               reserve_ds_buffers();
                }
                if (!err)
                        atomic_inc(&active_events);
diff --combined include/linux/interrupt.h
@@@ -410,12 -410,13 +410,12 @@@ extern void open_softirq(int nr, void (
  extern void softirq_init(void);
  static inline void __raise_softirq_irqoff(unsigned int nr)
  {
-       trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL);
+       trace_softirq_raise(nr);
        or_softirq_pending(1UL << nr);
  }
  
  extern void raise_softirq_irqoff(unsigned int nr);
  extern void raise_softirq(unsigned int nr);
 -extern void wakeup_softirqd(void);
  
  /* This is the worklist that queues up per-cpu softirq work.
   *
diff --combined kernel/kprobes.c
@@@ -74,7 -74,8 +74,8 @@@ static struct hlist_head kretprobe_inst
  /* NOTE: change this value only with kprobe_mutex held */
  static bool kprobes_all_disarmed;
  
- static DEFINE_MUTEX(kprobe_mutex);    /* Protects kprobe_table */
+ /* This protects kprobe_table and optimizing_list */
+ static DEFINE_MUTEX(kprobe_mutex);
  static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
  static struct {
        spinlock_t lock ____cacheline_aligned_in_smp;
@@@ -595,6 -596,7 +596,7 @@@ static __kprobes void try_to_optimize_k
  }
  
  #ifdef CONFIG_SYSCTL
+ /* This should be called with kprobe_mutex locked */
  static void __kprobes optimize_all_kprobes(void)
  {
        struct hlist_head *head;
                return;
  
        kprobes_allow_optimization = true;
-       mutex_lock(&text_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry_rcu(p, node, head, hlist)
                        if (!kprobe_disabled(p))
                                optimize_kprobe(p);
        }
-       mutex_unlock(&text_mutex);
        printk(KERN_INFO "Kprobes globally optimized\n");
  }
  
+ /* This should be called with kprobe_mutex locked */
  static void __kprobes unoptimize_all_kprobes(void)
  {
        struct hlist_head *head;
@@@ -2000,7 -2001,6 +2001,7 @@@ static ssize_t write_enabled_file_bool(
  static const struct file_operations fops_kp = {
        .read =         read_enabled_file_bool,
        .write =        write_enabled_file_bool,
 +      .llseek =       default_llseek,
  };
  
  static int __kprobes debugfs_kprobe_init(void)
diff --combined kernel/softirq.c
@@@ -67,7 -67,7 +67,7 @@@ char *softirq_to_name[NR_SOFTIRQS] = 
   * to the pending events, so lets the scheduler to balance
   * the softirq load for us.
   */
 -void wakeup_softirqd(void)
 +static void wakeup_softirqd(void)
  {
        /* Interrupts are disabled: no need to stop preemption */
        struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@@ -229,18 -229,20 +229,20 @@@ restart
  
        do {
                if (pending & 1) {
+                       unsigned int vec_nr = h - softirq_vec;
                        int prev_count = preempt_count();
-                       kstat_incr_softirqs_this_cpu(h - softirq_vec);
  
-                       trace_softirq_entry(h, softirq_vec);
+                       kstat_incr_softirqs_this_cpu(vec_nr);
+                       trace_softirq_entry(vec_nr);
                        h->action(h);
-                       trace_softirq_exit(h, softirq_vec);
+                       trace_softirq_exit(vec_nr);
                        if (unlikely(prev_count != preempt_count())) {
-                               printk(KERN_ERR "huh, entered softirq %td %s %p"
+                               printk(KERN_ERR "huh, entered softirq %u %s %p"
                                       "with preempt_count %08x,"
-                                      " exited with %08x?\n", h - softirq_vec,
-                                      softirq_to_name[h - softirq_vec],
-                                      h->action, prev_count, preempt_count());
+                                      " exited with %08x?\n", vec_nr,
+                                      softirq_to_name[vec_nr], h->action,
+                                      prev_count, preempt_count());
                                preempt_count() = prev_count;
                        }
  
diff --combined kernel/trace/ring_buffer.c
@@@ -224,6 -224,9 +224,9 @@@ enum 
        RB_LEN_TIME_STAMP = 16,
  };
  
+ #define skip_time_extend(event) \
+       ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
  static inline int rb_null_event(struct ring_buffer_event *event)
  {
        return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@@ -248,8 -251,12 +251,12 @@@ rb_event_data_length(struct ring_buffer
        return length + RB_EVNT_HDR_SIZE;
  }
  
- /* inline for ring buffer fast paths */
- static unsigned
+ /*
+  * Return the length of the given event. Will return
+  * the length of the time extend if the event is a
+  * time extend.
+  */
+ static inline unsigned
  rb_event_length(struct ring_buffer_event *event)
  {
        switch (event->type_len) {
        return 0;
  }
  
+ /*
+  * Return total length of time extend and data,
+  *   or just the event length for all other events.
+  */
+ static inline unsigned
+ rb_event_ts_length(struct ring_buffer_event *event)
+ {
+       unsigned len = 0;
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+               /* time extends include the data event after it */
+               len = RB_LEN_TIME_EXTEND;
+               event = skip_time_extend(event);
+       }
+       return len + rb_event_length(event);
+ }
  /**
   * ring_buffer_event_length - return the length of the event
   * @event: the event to get the length of
+  *
+  * Returns the size of the data load of a data event.
+  * If the event is something other than a data event, it
+  * returns the size of the event itself. With the exception
+  * of a TIME EXTEND, where it still returns the size of the
+  * data load of the data event after it.
   */
  unsigned ring_buffer_event_length(struct ring_buffer_event *event)
  {
-       unsigned length = rb_event_length(event);
+       unsigned length;
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
+       length = rb_event_length(event);
        if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
                return length;
        length -= RB_EVNT_HDR_SIZE;
@@@ -294,6 -329,8 +329,8 @@@ EXPORT_SYMBOL_GPL(ring_buffer_event_len
  static void *
  rb_event_data(struct ring_buffer_event *event)
  {
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
        BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
        /* If length is in len field, then array[0] has the data */
        if (event->type_len)
@@@ -404,9 -441,6 +441,6 @@@ static inline int test_time_stamp(u64 d
  /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
  #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
  
- /* Max number of timestamps that can fit on a page */
- #define RB_TIMESTAMPS_PER_PAGE        (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
  int ring_buffer_print_page_header(struct trace_seq *s)
  {
        struct buffer_data_page field;
@@@ -1546,6 -1580,25 +1580,25 @@@ static void rb_inc_iter(struct ring_buf
        iter->head = 0;
  }
  
+ /* Slow path, do not inline */
+ static noinline struct ring_buffer_event *
+ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+ {
+       event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+       /* Not the first event on the page? */
+       if (rb_event_index(event)) {
+               event->time_delta = delta & TS_MASK;
+               event->array[0] = delta >> TS_SHIFT;
+       } else {
+               /* nope, just zero it */
+               event->time_delta = 0;
+               event->array[0] = 0;
+       }
+       return skip_time_extend(event);
+ }
  /**
   * ring_buffer_update_event - update event type and data
   * @event: the even to update
   * data field.
   */
  static void
- rb_update_event(struct ring_buffer_event *event,
-                        unsigned type, unsigned length)
+ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+               struct ring_buffer_event *event, unsigned length,
+               int add_timestamp, u64 delta)
  {
-       event->type_len = type;
-       switch (type) {
-       case RINGBUF_TYPE_PADDING:
-       case RINGBUF_TYPE_TIME_EXTEND:
-       case RINGBUF_TYPE_TIME_STAMP:
-               break;
+       /* Only a commit updates the timestamp */
+       if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+               delta = 0;
  
-       case 0:
-               length -= RB_EVNT_HDR_SIZE;
-               if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
-                       event->array[0] = length;
-               else
-                       event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-               break;
-       default:
-               BUG();
+       /*
+        * If we need to add a timestamp, then we
+        * add it to the start of the reserved space.
+        */
+       if (unlikely(add_timestamp)) {
+               event = rb_add_time_stamp(event, delta);
+               length -= RB_LEN_TIME_EXTEND;
+               delta = 0;
        }
+       event->time_delta = delta;
+       length -= RB_EVNT_HDR_SIZE;
+       if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+               event->type_len = 0;
+               event->array[0] = length;
+       } else
+               event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
  }
  
  /*
@@@ -1823,10 -1879,13 +1879,13 @@@ rb_reset_tail(struct ring_buffer_per_cp
        local_sub(length, &tail_page->write);
  }
  
- static struct ring_buffer_event *
+ /*
+  * This is the slow path, force gcc not to inline it.
+  */
+ static noinline struct ring_buffer_event *
  rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
             unsigned long length, unsigned long tail,
-            struct buffer_page *tail_page, u64 *ts)
+            struct buffer_page *tail_page, u64 ts)
  {
        struct buffer_page *commit_page = cpu_buffer->commit_page;
        struct ring_buffer *buffer = cpu_buffer->buffer;
                 * Nested commits always have zero deltas, so
                 * just reread the time stamp
                 */
-               *ts = rb_time_stamp(buffer);
-               next_page->page->time_stamp = *ts;
+               ts = rb_time_stamp(buffer);
+               next_page->page->time_stamp = ts;
        }
  
   out_again:
  
  static struct ring_buffer_event *
  __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-                 unsigned type, unsigned long length, u64 *ts)
+                 unsigned long length, u64 ts,
+                 u64 delta, int add_timestamp)
  {
        struct buffer_page *tail_page;
        struct ring_buffer_event *event;
        unsigned long tail, write;
  
+       /*
+        * If the time delta since the last event is too big to
+        * hold in the time field of the event, then we append a
+        * TIME EXTEND event ahead of the data event.
+        */
+       if (unlikely(add_timestamp))
+               length += RB_LEN_TIME_EXTEND;
        tail_page = cpu_buffer->tail_page;
        write = local_add_return(length, &tail_page->write);
  
        tail = write - length;
  
        /* See if we shot pass the end of this buffer page */
-       if (write > BUF_PAGE_SIZE)
+       if (unlikely(write > BUF_PAGE_SIZE))
                return rb_move_tail(cpu_buffer, length, tail,
                                    tail_page, ts);
  
  
        event = __rb_page_index(tail_page, tail);
        kmemcheck_annotate_bitfield(event, bitfield);
-       rb_update_event(event, type, length);
+       rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
  
-       /* The passed in type is zero for DATA */
-       if (likely(!type))
-               local_inc(&tail_page->entries);
+       local_inc(&tail_page->entries);
  
        /*
         * If this is the first commit on the page, then update
         * its timestamp.
         */
        if (!tail)
-               tail_page->page->time_stamp = *ts;
+               tail_page->page->time_stamp = ts;
  
        return event;
  }
@@@ -1977,7 -2043,7 +2043,7 @@@ rb_try_to_discard(struct ring_buffer_pe
        unsigned long addr;
  
        new_index = rb_event_index(event);
-       old_index = new_index + rb_event_length(event);
+       old_index = new_index + rb_event_ts_length(event);
        addr = (unsigned long)event;
        addr &= PAGE_MASK;
  
        return 0;
  }
  
- static int
- rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-                 u64 *ts, u64 *delta)
- {
-       struct ring_buffer_event *event;
-       int ret;
-       WARN_ONCE(*delta > (1ULL << 59),
-                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
-                 (unsigned long long)*delta,
-                 (unsigned long long)*ts,
-                 (unsigned long long)cpu_buffer->write_stamp);
-       /*
-        * The delta is too big, we to add a
-        * new timestamp.
-        */
-       event = __rb_reserve_next(cpu_buffer,
-                                 RINGBUF_TYPE_TIME_EXTEND,
-                                 RB_LEN_TIME_EXTEND,
-                                 ts);
-       if (!event)
-               return -EBUSY;
-       if (PTR_ERR(event) == -EAGAIN)
-               return -EAGAIN;
-       /* Only a commited time event can update the write stamp */
-       if (rb_event_is_commit(cpu_buffer, event)) {
-               /*
-                * If this is the first on the page, then it was
-                * updated with the page itself. Try to discard it
-                * and if we can't just make it zero.
-                */
-               if (rb_event_index(event)) {
-                       event->time_delta = *delta & TS_MASK;
-                       event->array[0] = *delta >> TS_SHIFT;
-               } else {
-                       /* try to discard, since we do not need this */
-                       if (!rb_try_to_discard(cpu_buffer, event)) {
-                               /* nope, just zero it */
-                               event->time_delta = 0;
-                               event->array[0] = 0;
-                       }
-               }
-               cpu_buffer->write_stamp = *ts;
-               /* let the caller know this was the commit */
-               ret = 1;
-       } else {
-               /* Try to discard the event */
-               if (!rb_try_to_discard(cpu_buffer, event)) {
-                       /* Darn, this is just wasted space */
-                       event->time_delta = 0;
-                       event->array[0] = 0;
-               }
-               ret = 0;
-       }
-       *delta = 0;
-       return ret;
- }
  static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
  {
        local_inc(&cpu_buffer->committing);
        local_inc(&cpu_buffer->commits);
  }
  
- static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
  {
        unsigned long commits;
  
@@@ -2110,9 -2113,10 +2113,10 @@@ rb_reserve_next_event(struct ring_buffe
                      unsigned long length)
  {
        struct ring_buffer_event *event;
-       u64 ts, delta = 0;
-       int commit = 0;
+       u64 ts, delta;
        int nr_loops = 0;
+       int add_timestamp;
+       u64 diff;
  
        rb_start_commit(cpu_buffer);
  
  
        length = rb_calculate_event_length(length);
   again:
+       add_timestamp = 0;
+       delta = 0;
        /*
         * We allow for interrupts to reenter here and do a trace.
         * If one does, it will cause this original code to loop
                goto out_fail;
  
        ts = rb_time_stamp(cpu_buffer->buffer);
+       diff = ts - cpu_buffer->write_stamp;
  
-       /*
-        * Only the first commit can update the timestamp.
-        * Yes there is a race here. If an interrupt comes in
-        * just after the conditional and it traces too, then it
-        * will also check the deltas. More than one timestamp may
-        * also be made. But only the entry that did the actual
-        * commit will be something other than zero.
-        */
-       if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
-                  rb_page_write(cpu_buffer->tail_page) ==
-                  rb_commit_index(cpu_buffer))) {
-               u64 diff;
-               diff = ts - cpu_buffer->write_stamp;
-               /* make sure this diff is calculated here */
-               barrier();
-               /* Did the write stamp get updated already? */
-               if (unlikely(ts < cpu_buffer->write_stamp))
-                       goto get_event;
+       /* make sure this diff is calculated here */
+       barrier();
  
+       /* Did the write stamp get updated already? */
+       if (likely(ts >= cpu_buffer->write_stamp)) {
                delta = diff;
                if (unlikely(test_time_stamp(delta))) {
-                       commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-                       if (commit == -EBUSY)
-                               goto out_fail;
-                       if (commit == -EAGAIN)
-                               goto again;
-                       RB_WARN_ON(cpu_buffer, commit < 0);
+                       WARN_ONCE(delta > (1ULL << 59),
+                                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+                                 (unsigned long long)delta,
+                                 (unsigned long long)ts,
+                                 (unsigned long long)cpu_buffer->write_stamp);
+                       add_timestamp = 1;
                }
        }
  
-  get_event:
-       event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+       event = __rb_reserve_next(cpu_buffer, length, ts,
+                                 delta, add_timestamp);
        if (unlikely(PTR_ERR(event) == -EAGAIN))
                goto again;
  
        if (!event)
                goto out_fail;
  
-       if (!rb_event_is_commit(cpu_buffer, event))
-               delta = 0;
-       event->time_delta = delta;
        return event;
  
   out_fail:
  
  #define TRACE_RECURSIVE_DEPTH 16
  
- static int trace_recursive_lock(void)
+ /* Keep this code out of the fast path cache */
+ static noinline void trace_recursive_fail(void)
  {
-       current->trace_recursion++;
-       if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
-               return 0;
        /* Disable all tracing before we do anything else */
        tracing_off_permanent();
  
                    in_nmi());
  
        WARN_ON_ONCE(1);
+ }
+ static inline int trace_recursive_lock(void)
+ {
+       current->trace_recursion++;
+       if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+               return 0;
+       trace_recursive_fail();
        return -1;
  }
  
- static void trace_recursive_unlock(void)
+ static inline void trace_recursive_unlock(void)
  {
        WARN_ON_ONCE(!current->trace_recursion);
  
@@@ -2308,12 -2298,28 +2298,28 @@@ static voi
  rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                      struct ring_buffer_event *event)
  {
+       u64 delta;
        /*
         * The event first in the commit queue updates the
         * time stamp.
         */
-       if (rb_event_is_commit(cpu_buffer, event))
-               cpu_buffer->write_stamp += event->time_delta;
+       if (rb_event_is_commit(cpu_buffer, event)) {
+               /*
+                * A commit event that is first on a page
+                * updates the write timestamp with the page stamp
+                */
+               if (!rb_event_index(event))
+                       cpu_buffer->write_stamp =
+                               cpu_buffer->commit_page->page->time_stamp;
+               else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+                       delta = event->array[0];
+                       delta <<= TS_SHIFT;
+                       delta += event->time_delta;
+                       cpu_buffer->write_stamp += delta;
+               } else
+                       cpu_buffer->write_stamp += event->time_delta;
+       }
  }
  
  static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@@ -2353,6 -2359,9 +2359,9 @@@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_co
  
  static inline void rb_event_discard(struct ring_buffer_event *event)
  {
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
        /* array[0] holds the actual length for the discarded event */
        event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
        event->type_len = RINGBUF_TYPE_PADDING;
@@@ -3049,12 -3058,12 +3058,12 @@@ rb_buffer_peek(struct ring_buffer_per_c
  
   again:
        /*
-        * We repeat when a timestamp is encountered. It is possible
-        * to get multiple timestamps from an interrupt entering just
-        * as one timestamp is about to be written, or from discarded
-        * commits. The most that we can have is the number on a single page.
+        * We repeat when a time extend is encountered.
+        * Since the time extend is always attached to a data event,
+        * we should never loop more than once.
+        * (We never hit the following condition more than twice).
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
                return NULL;
  
        reader = rb_get_reader_page(cpu_buffer);
@@@ -3130,14 -3139,12 +3139,12 @@@ rb_iter_peek(struct ring_buffer_iter *i
                return NULL;
  
        /*
-        * We repeat when a timestamp is encountered.
-        * We can get multiple timestamps by nested interrupts or also
-        * if filtering is on (discarding commits). Since discarding
-        * commits can be frequent we can get a lot of timestamps.
-        * But we limit them by not adding timestamps if they begin
-        * at the start of a page.
+        * We repeat when a time extend is encountered.
+        * Since the time extend is always attached to a data event,
+        * we should never loop more than once.
+        * (We never hit the following condition more than twice).
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
                return NULL;
  
        if (rb_per_cpu_empty(cpu_buffer))
@@@ -3835,7 -3842,8 +3842,8 @@@ int ring_buffer_read_page(struct ring_b
                if (len > (commit - read))
                        len = (commit - read);
  
-               size = rb_event_length(event);
+               /* Always keep the time extend and data together */
+               size = rb_event_ts_length(event);
  
                if (len < size)
                        goto out_unlock;
                                break;
  
                        event = rb_reader_event(cpu_buffer);
-                       size = rb_event_length(event);
+                       /* Always keep the time extend and data together */
+                       size = rb_event_ts_length(event);
                } while (len > size);
  
                /* update bpage */
@@@ -3974,7 -3983,6 +3983,7 @@@ static const struct file_operations rb_
        .open           = tracing_open_generic,
        .read           = rb_simple_read,
        .write          = rb_simple_write,
 +      .llseek         = default_llseek,
  };