perf: Optimize perf_tp_event_match()
[pandora-kernel.git] / kernel / perf_event.c
index 3d1552d..e099650 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
 void __weak hw_perf_disable(void)              { barrier(); }
 void __weak hw_perf_enable(void)               { barrier(); }
 
-int __weak
-hw_perf_group_sched_in(struct perf_event *group_leader,
-              struct perf_cpu_context *cpuctx,
-              struct perf_event_context *ctx)
-{
-       return 0;
-}
-
 void __weak perf_event_print_debug(void)       { }
 
 static DEFINE_PER_CPU(int, perf_disable_count);
@@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event)
        event->total_time_running = run_end - event->tstamp_running;
 }
 
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+       struct perf_event *event;
+
+       update_event_times(leader);
+       list_for_each_entry(event, &leader->sibling_list, group_entry)
+               update_event_times(event);
+}
+
 static struct list_head *
 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -315,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-       struct perf_event *sibling, *tmp;
-
        if (list_empty(&event->group_entry))
                return;
        ctx->nr_events--;
@@ -329,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
        if (event->group_leader != event)
                event->group_leader->nr_siblings--;
 
-       update_event_times(event);
+       update_group_times(event);
 
        /*
         * If event was in error state, then keep it
@@ -340,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
         */
        if (event->state > PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_OFF;
+}
+
+static void
+perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
+{
+       struct perf_event *sibling, *tmp;
 
        /*
         * If this was a group event with sibling events then
@@ -504,18 +513,6 @@ retry:
        raw_spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
-       struct perf_event *event;
-
-       update_event_times(leader);
-       list_for_each_entry(event, &leader->sibling_list, group_entry)
-               update_event_times(event);
-}
-
 /*
  * Cross CPU call to disable a performance event
  */
@@ -640,15 +637,20 @@ group_sched_in(struct perf_event *group_event,
               struct perf_cpu_context *cpuctx,
               struct perf_event_context *ctx)
 {
-       struct perf_event *event, *partial_group;
+       struct perf_event *event, *partial_group = NULL;
+       const struct pmu *pmu = group_event->pmu;
+       bool txn = false;
        int ret;
 
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
 
-       ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
-       if (ret)
-               return ret < 0 ? ret : 0;
+       /* Check if group transaction available */
+       if (pmu->start_txn)
+               txn = true;
+
+       if (txn)
+               pmu->start_txn(pmu);
 
        if (event_sched_in(group_event, cpuctx, ctx))
                return -EAGAIN;
@@ -663,9 +665,19 @@ group_sched_in(struct perf_event *group_event,
                }
        }
 
-       return 0;
+       if (!txn)
+               return 0;
+
+       ret = pmu->commit_txn(pmu);
+       if (!ret) {
+               pmu->cancel_txn(pmu);
+               return 0;
+       }
 
 group_error:
+       if (txn)
+               pmu->cancel_txn(pmu);
+
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
@@ -1367,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task)
        if (cpuctx->task_ctx == ctx)
                return;
 
+       perf_disable();
+
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
@@ -1379,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task)
        ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 
        cpuctx->task_ctx = ctx;
+
+       perf_enable();
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1856,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
 
+       /*
+        * Remove from the PMU, can't get re-enabled since we got
+        * here because the last ref went.
+        */
+       perf_event_disable(event);
+
        WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
-       perf_event_remove_from_context(event);
+       /*
+        * There are two ways this annotation is useful:
+        *
+        *  1) there is a lock recursion from perf_event_exit_task
+        *     see the comment there.
+        *
+        *  2) there is a lock-inversion with mmap_sem through
+        *     perf_event_read_group(), which takes faults while
+        *     holding ctx->mutex, however this is called after
+        *     the last filedesc died, so there is no possibility
+        *     to trigger the AB-BA case.
+        */
+       mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+       raw_spin_lock_irq(&ctx->lock);
+       list_del_event(event, ctx);
+       perf_destroy_group(event, ctx);
+       raw_spin_unlock_irq(&ctx->lock);
        mutex_unlock(&ctx->mutex);
 
        mutex_lock(&event->owner->perf_event_mutex);
@@ -2260,11 +2297,6 @@ unlock:
        rcu_read_unlock();
 }
 
-static unsigned long perf_data_size(struct perf_mmap_data *data)
-{
-       return data->nr_pages << (PAGE_SHIFT + data->data_order);
-}
-
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
@@ -2283,6 +2315,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
        return virt_to_page(data->data_pages[pgoff - 1]);
 }
 
+static void *perf_mmap_alloc_page(int cpu)
+{
+       struct page *page;
+       int node;
+
+       node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+       page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+       if (!page)
+               return NULL;
+
+       return page_address(page);
+}
+
 static struct perf_mmap_data *
 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 {
@@ -2299,17 +2344,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
        if (!data)
                goto fail;
 
-       data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+       data->user_page = perf_mmap_alloc_page(event->cpu);
        if (!data->user_page)
                goto fail_user_page;
 
        for (i = 0; i < nr_pages; i++) {
-               data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+               data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
                if (!data->data_pages[i])
                        goto fail_data_pages;
        }
 
-       data->data_order = 0;
        data->nr_pages = nr_pages;
 
        return data;
@@ -2345,6 +2389,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
        kfree(data);
 }
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+       return 0;
+}
+
 #else
 
 /*
@@ -2353,10 +2402,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
  * Required for architectures that have d-cache aliasing issues.
  */
 
+static inline int page_order(struct perf_mmap_data *data)
+{
+       return data->page_order;
+}
+
 static struct page *
 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
 {
-       if (pgoff > (1UL << data->data_order))
+       if (pgoff > (1UL << page_order(data)))
                return NULL;
 
        return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2376,7 +2430,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
        int i, nr;
 
        data = container_of(work, struct perf_mmap_data, work);
-       nr = 1 << data->data_order;
+       nr = 1 << page_order(data);
 
        base = data->user_page;
        for (i = 0; i < nr + 1; i++)
@@ -2415,7 +2469,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 
        data->user_page = all_buf;
        data->data_pages[0] = all_buf + PAGE_SIZE;
-       data->data_order = ilog2(nr_pages);
+       data->page_order = ilog2(nr_pages);
        data->nr_pages = 1;
 
        return data;
@@ -2429,6 +2483,11 @@ fail:
 
 #endif
 
+static unsigned long perf_data_size(struct perf_mmap_data *data)
+{
+       return data->nr_pages << (PAGE_SHIFT + page_order(data));
+}
+
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct perf_event *event = vma->vm_file->private_data;
@@ -2469,8 +2528,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
 {
        long max_size = perf_data_size(data);
 
-       atomic_set(&data->lock, -1);
-
        if (event->attr.watermark) {
                data->watermark = min_t(long, max_size,
                                        event->attr.wakeup_watermark);
@@ -2543,6 +2600,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        long user_extra, extra;
        int ret = 0;
 
+       /*
+        * Don't allow mmap() of inherited per-task counters. This would
+        * create a performance issue due to all children writing to the
+        * same buffer.
+        */
+       if (event->cpu == -1 && event->attr.inherit)
+               return -EINVAL;
+
        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;
 
@@ -2642,6 +2707,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
 }
 
 static const struct file_operations perf_fops = {
+       .llseek                 = no_llseek,
        .release                = perf_release,
        .read                   = perf_read,
        .poll                   = perf_poll,
@@ -2791,6 +2857,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 }
 
 
+/*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+       perf_guest_cbs = cbs;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+       perf_guest_cbs = NULL;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
 /*
  * Output
  */
@@ -2826,120 +2913,80 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 }
 
 /*
- * Curious locking construct.
- *
  * We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
  * cannot fully serialize things.
  *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
  * We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
  */
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
 {
        struct perf_mmap_data *data = handle->data;
-       int cur, cpu = get_cpu();
-
-       handle->locked = 0;
 
-       for (;;) {
-               cur = atomic_cmpxchg(&data->lock, -1, cpu);
-               if (cur == -1) {
-                       handle->locked = 1;
-                       break;
-               }
-               if (cur == cpu)
-                       break;
-
-               cpu_relax();
-       }
+       preempt_disable();
+       local_inc(&data->nest);
+       handle->wakeup = local_read(&data->wakeup);
 }
 
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
 {
        struct perf_mmap_data *data = handle->data;
        unsigned long head;
-       int cpu;
-
-       data->done_head = data->head;
-
-       if (!handle->locked)
-               goto out;
 
 again:
-       /*
-        * The xchg implies a full barrier that ensures all writes are done
-        * before we publish the new head, matched by a rmb() in userspace when
-        * reading this position.
-        */
-       while ((head = atomic_long_xchg(&data->done_head, 0)))
-               data->user_page->data_head = head;
+       head = local_read(&data->head);
 
        /*
-        * NMI can happen here, which means we can miss a done_head update.
+        * IRQ/NMI can happen here, which means we can miss a head update.
         */
 
-       cpu = atomic_xchg(&data->lock, -1);
-       WARN_ON_ONCE(cpu != smp_processor_id());
+       if (!local_dec_and_test(&data->nest))
+               goto out;
 
        /*
-        * Therefore we have to validate we did not indeed do so.
+        * Publish the known good head. Rely on the full barrier implied
+        * by local_dec_and_test() to order the data->head read and this
+        * write.
         */
-       if (unlikely(atomic_long_read(&data->done_head))) {
-               /*
-                * Since we had it locked, we can lock it again.
-                */
-               while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-                       cpu_relax();
+       data->user_page->data_head = head;
 
+       /*
+        * Now check if we missed an update, rely on the (compiler)
+        * barrier in local_dec_and_test() to re-read data->head.
+        */
+       if (unlikely(head != local_read(&data->head))) {
+               local_inc(&data->nest);
                goto again;
        }
 
-       if (atomic_xchg(&data->wakeup, 0))
+       if (handle->wakeup != local_read(&data->wakeup))
                perf_output_wakeup(handle);
-out:
-       put_cpu();
+
+ out:
+       preempt_enable();
 }
 
-void perf_output_copy(struct perf_output_handle *handle,
+__always_inline void perf_output_copy(struct perf_output_handle *handle,
                      const void *buf, unsigned int len)
 {
-       unsigned int pages_mask;
-       unsigned long offset;
-       unsigned int size;
-       void **pages;
-
-       offset          = handle->offset;
-       pages_mask      = handle->data->nr_pages - 1;
-       pages           = handle->data->data_pages;
-
        do {
-               unsigned long page_offset;
-               unsigned long page_size;
-               int nr;
+               unsigned long size = min_t(unsigned long, handle->size, len);
 
-               nr          = (offset >> PAGE_SHIFT) & pages_mask;
-               page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
-               page_offset = offset & (page_size - 1);
-               size        = min_t(unsigned int, page_size - page_offset, len);
+               memcpy(handle->addr, buf, size);
 
-               memcpy(pages[nr] + page_offset, buf, size);
+               len -= size;
+               handle->addr += size;
+               handle->size -= size;
+               if (!handle->size) {
+                       struct perf_mmap_data *data = handle->data;
 
-               len         -= size;
-               buf         += size;
-               offset      += size;
+                       handle->page++;
+                       handle->page &= data->nr_pages - 1;
+                       handle->addr = data->data_pages[handle->page];
+                       handle->size = PAGE_SIZE << page_order(data);
+               }
        } while (len);
-
-       handle->offset = offset;
-
-       /*
-        * Check we didn't copy past our reservation window, taking the
-        * possible unsigned int wrap into account.
-        */
-       WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
@@ -2977,13 +3024,13 @@ int perf_output_begin(struct perf_output_handle *handle,
        handle->sample  = sample;
 
        if (!data->nr_pages)
-               goto fail;
+               goto out;
 
-       have_lost = atomic_read(&data->lost);
+       have_lost = local_read(&data->lost);
        if (have_lost)
                size += sizeof(lost_event);
 
-       perf_output_lock(handle);
+       perf_output_get_handle(handle);
 
        do {
                /*
@@ -2993,24 +3040,28 @@ int perf_output_begin(struct perf_output_handle *handle,
                 */
                tail = ACCESS_ONCE(data->user_page->data_tail);
                smp_rmb();
-               offset = head = atomic_long_read(&data->head);
+               offset = head = local_read(&data->head);
                head += size;
                if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
-       } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+       } while (local_cmpxchg(&data->head, offset, head) != offset);
 
-       handle->offset  = offset;
-       handle->head    = head;
+       if (head - local_read(&data->wakeup) > data->watermark)
+               local_add(data->watermark, &data->wakeup);
 
-       if (head - tail > data->watermark)
-               atomic_set(&data->wakeup, 1);
+       handle->page = offset >> (PAGE_SHIFT + page_order(data));
+       handle->page &= data->nr_pages - 1;
+       handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
+       handle->addr = data->data_pages[handle->page];
+       handle->addr += handle->size;
+       handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
 
        if (have_lost) {
                lost_event.header.type = PERF_RECORD_LOST;
                lost_event.header.misc = 0;
                lost_event.header.size = sizeof(lost_event);
                lost_event.id          = event->id;
-               lost_event.lost        = atomic_xchg(&data->lost, 0);
+               lost_event.lost        = local_xchg(&data->lost, 0);
 
                perf_output_put(handle, lost_event);
        }
@@ -3018,8 +3069,8 @@ int perf_output_begin(struct perf_output_handle *handle,
        return 0;
 
 fail:
-       atomic_inc(&data->lost);
-       perf_output_unlock(handle);
+       local_inc(&data->lost);
+       perf_output_put_handle(handle);
 out:
        rcu_read_unlock();
 
@@ -3034,14 +3085,14 @@ void perf_output_end(struct perf_output_handle *handle)
        int wakeup_events = event->attr.wakeup_events;
 
        if (handle->sample && wakeup_events) {
-               int events = atomic_inc_return(&data->events);
+               int events = local_inc_return(&data->events);
                if (events >= wakeup_events) {
-                       atomic_sub(wakeup_events, &data->events);
-                       atomic_set(&data->wakeup, 1);
+                       local_sub(wakeup_events, &data->events);
+                       local_inc(&data->wakeup);
                }
        }
 
-       perf_output_unlock(handle);
+       perf_output_put_handle(handle);
        rcu_read_unlock();
 }
 
@@ -3377,22 +3428,13 @@ static void perf_event_task_output(struct perf_event *event,
 {
        struct perf_output_handle handle;
        struct task_struct *task = task_event->task;
-       unsigned long flags;
        int size, ret;
 
-       /*
-        * If this CPU attempts to acquire an rq lock held by a CPU spinning
-        * in perf_output_lock() from interrupt context, it's game over.
-        */
-       local_irq_save(flags);
-
        size  = task_event->event_id.header.size;
        ret = perf_output_begin(&handle, event, size, 0, 0);
 
-       if (ret) {
-               local_irq_restore(flags);
+       if (ret)
                return;
-       }
 
        task_event->event_id.pid = perf_event_pid(event, task);
        task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3403,7 +3445,6 @@ static void perf_event_task_output(struct perf_event *event,
        perf_output_put(&handle, task_event->event_id);
 
        perf_output_end(&handle);
-       local_irq_restore(flags);
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -3743,7 +3784,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_MMAP,
-                               .misc = 0,
+                               .misc = PERF_RECORD_MISC_USER,
                                /* .size */
                        },
                        /* .pid */
@@ -3961,39 +4002,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
        perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_swevent_is_counting(struct perf_event *event)
-{
-       /*
-        * The event is active, we're good!
-        */
-       if (event->state == PERF_EVENT_STATE_ACTIVE)
-               return 1;
-
-       /*
-        * The event is off/error, not counting.
-        */
-       if (event->state != PERF_EVENT_STATE_INACTIVE)
-               return 0;
-
-       /*
-        * The event is inactive, if the context is active
-        * we're part of a group that didn't make it on the 'pmu',
-        * not counting.
-        */
-       if (event->ctx->is_active)
-               return 0;
-
-       /*
-        * We're inactive and the context is too, this means the
-        * task is scheduled out, we're counting events that happen
-        * to us, like migration events.
-        */
-       return 1;
-}
-
-static int perf_tp_event_match(struct perf_event *event,
-                               struct perf_sample_data *data);
-
 static int perf_exclude_event(struct perf_event *event,
                              struct pt_regs *regs)
 {
@@ -4014,12 +4022,6 @@ static int perf_swevent_match(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
 {
-       if (event->cpu != -1 && event->cpu != smp_processor_id())
-               return 0;
-
-       if (!perf_swevent_is_counting(event))
-               return 0;
-
        if (event->attr.type != type)
                return 0;
 
@@ -4029,30 +4031,88 @@ static int perf_swevent_match(struct perf_event *event,
        if (perf_exclude_event(event, regs))
                return 0;
 
-       if (event->attr.type == PERF_TYPE_TRACEPOINT &&
-           !perf_tp_event_match(event, data))
-               return 0;
-
        return 1;
 }
 
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
-                                    enum perf_type_id type,
-                                    u32 event_id, u64 nr, int nmi,
-                                    struct perf_sample_data *data,
-                                    struct pt_regs *regs)
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+       u64 val = event_id | (type << 32);
+
+       return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+{
+       u64 hash = swevent_hash(type, event_id);
+
+       return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+       struct swevent_hlist *hlist;
+
+       hlist = rcu_dereference(ctx->swevent_hlist);
+       if (!hlist)
+               return NULL;
+
+       return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
 {
+       struct swevent_hlist *hlist;
+       u32 event_id = event->attr.config;
+       u64 type = event->attr.type;
+
+       /*
+        * Event scheduling is always serialized against hlist allocation
+        * and release, which makes the protected version suitable here.
+        * The context lock guarantees that.
+        */
+       hlist = rcu_dereference_protected(ctx->swevent_hlist,
+                                         lockdep_is_held(&event->ctx->lock));
+       if (!hlist)
+               return NULL;
+
+       return __find_swevent_head(hlist, type, event_id);
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+                                   u64 nr, int nmi,
+                                   struct perf_sample_data *data,
+                                   struct pt_regs *regs)
+{
+       struct perf_cpu_context *cpuctx;
        struct perf_event *event;
+       struct hlist_node *node;
+       struct hlist_head *head;
 
-       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+       cpuctx = &__get_cpu_var(perf_cpu_context);
+
+       rcu_read_lock();
+
+       head = find_swevent_head_rcu(cpuctx, type, event_id);
+
+       if (!head)
+               goto end;
+
+       hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                if (perf_swevent_match(event, type, event_id, data, regs))
                        perf_swevent_add(event, nr, nmi, data, regs);
        }
+end:
+       rcu_read_unlock();
 }
 
 int perf_swevent_get_recursion_context(void)
 {
-       struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        int rctx;
 
        if (in_nmi())
@@ -4064,10 +4124,8 @@ int perf_swevent_get_recursion_context(void)
        else
                rctx = 0;
 
-       if (cpuctx->recursion[rctx]) {
-               put_cpu_var(perf_cpu_context);
+       if (cpuctx->recursion[rctx])
                return -1;
-       }
 
        cpuctx->recursion[rctx]++;
        barrier();
@@ -4081,31 +4139,9 @@ void perf_swevent_put_recursion_context(int rctx)
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        barrier();
        cpuctx->recursion[rctx]--;
-       put_cpu_var(perf_cpu_context);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-                                   u64 nr, int nmi,
-                                   struct perf_sample_data *data,
-                                   struct pt_regs *regs)
-{
-       struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx;
-
-       cpuctx = &__get_cpu_var(perf_cpu_context);
-       rcu_read_lock();
-       perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
-                                nr, nmi, data, regs);
-       /*
-        * doesn't really matter which of the child contexts the
-        * events ends up in.
-        */
-       ctx = rcu_dereference(current->perf_event_ctxp);
-       if (ctx)
-               perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
-       rcu_read_unlock();
-}
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
                            struct pt_regs *regs, u64 addr)
@@ -4113,6 +4149,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
        struct perf_sample_data data;
        int rctx;
 
+       preempt_disable_notrace();
        rctx = perf_swevent_get_recursion_context();
        if (rctx < 0)
                return;
@@ -4122,6 +4159,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
 
        perf_swevent_put_recursion_context(rctx);
+       preempt_enable_notrace();
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -4131,16 +4169,28 @@ static void perf_swevent_read(struct perf_event *event)
 static int perf_swevent_enable(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
+       struct perf_cpu_context *cpuctx;
+       struct hlist_head *head;
+
+       cpuctx = &__get_cpu_var(perf_cpu_context);
 
        if (hwc->sample_period) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(event);
        }
+
+       head = find_swevent_head(cpuctx, event);
+       if (WARN_ON_ONCE(!head))
+               return -EINVAL;
+
+       hlist_add_head_rcu(&event->hlist_entry, head);
+
        return 0;
 }
 
 static void perf_swevent_disable(struct perf_event *event)
 {
+       hlist_del_rcu(&event->hlist_entry);
 }
 
 static const struct pmu perf_ops_generic = {
@@ -4168,15 +4218,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        perf_sample_data_init(&data, 0);
        data.period = event->hw.last_period;
        regs = get_irq_regs();
-       /*
-        * In case we exclude kernel IPs or are somehow not in interrupt
-        * context, provide the next best thing, the user IP.
-        */
-       if ((event->attr.exclude_kernel || !regs) &&
-                       !event->attr.exclude_user)
-               regs = task_pt_regs(current);
 
-       if (regs) {
+       if (regs && !perf_exclude_event(event, regs)) {
                if (!(event->attr.exclude_idle && current->pid == 0))
                        if (perf_event_overflow(event, 0, &data, regs))
                                ret = HRTIMER_NORESTART;
@@ -4324,27 +4367,122 @@ static const struct pmu perf_ops_task_clock = {
        .read           = task_clock_perf_event_read,
 };
 
-#ifdef CONFIG_EVENT_TRACING
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+{
+       return rcu_dereference_protected(cpuctx->swevent_hlist,
+                                        lockdep_is_held(&cpuctx->hlist_mutex));
+}
 
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-                  int entry_size, struct pt_regs *regs)
+static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 {
-       struct perf_sample_data data;
-       struct perf_raw_record raw = {
-               .size = entry_size,
-               .data = record,
-       };
+       struct swevent_hlist *hlist;
 
-       perf_sample_data_init(&data, addr);
-       data.raw = &raw;
+       hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
+       kfree(hlist);
+}
+
+static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+{
+       struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
 
-       /* Trace events already protected against recursion */
-       do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-                        &data, regs);
+       if (!hlist)
+               return;
+
+       rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+       call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 }
-EXPORT_SYMBOL_GPL(perf_tp_event);
 
-static int perf_tp_event_match(struct perf_event *event,
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+       mutex_lock(&cpuctx->hlist_mutex);
+
+       if (!--cpuctx->hlist_refcount)
+               swevent_hlist_release(cpuctx);
+
+       mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+       int cpu;
+
+       if (event->cpu != -1) {
+               swevent_hlist_put_cpu(event, event->cpu);
+               return;
+       }
+
+       for_each_possible_cpu(cpu)
+               swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       int err = 0;
+
+       mutex_lock(&cpuctx->hlist_mutex);
+
+       if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
+               struct swevent_hlist *hlist;
+
+               hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+               if (!hlist) {
+                       err = -ENOMEM;
+                       goto exit;
+               }
+               rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+       }
+       cpuctx->hlist_refcount++;
+ exit:
+       mutex_unlock(&cpuctx->hlist_mutex);
+
+       return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+       int err;
+       int cpu, failed_cpu;
+
+       if (event->cpu != -1)
+               return swevent_hlist_get_cpu(event, event->cpu);
+
+       get_online_cpus();
+       for_each_possible_cpu(cpu) {
+               err = swevent_hlist_get_cpu(event, cpu);
+               if (err) {
+                       failed_cpu = cpu;
+                       goto fail;
+               }
+       }
+       put_online_cpus();
+
+       return 0;
+ fail:
+       for_each_possible_cpu(cpu) {
+               if (cpu == failed_cpu)
+                       break;
+               swevent_hlist_put_cpu(event, cpu);
+       }
+
+       put_online_cpus();
+       return err;
+}
+
+#ifdef CONFIG_EVENT_TRACING
+
+static const struct pmu perf_ops_tracepoint = {
+       .enable         = perf_trace_enable,
+       .disable        = perf_trace_disable,
+       .read           = perf_swevent_read,
+       .unthrottle     = perf_swevent_unthrottle,
+};
+
+static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
 {
        void *record = data->raw->data;
@@ -4354,13 +4492,55 @@ static int perf_tp_event_match(struct perf_event *event,
        return 0;
 }
 
+static int perf_tp_event_match(struct perf_event *event,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
+{
+       /*
+        * All tracepoints are from kernel-space.
+        */
+       if (event->attr.exclude_kernel)
+               return 0;
+
+       if (!perf_tp_filter_match(event, data))
+               return 0;
+
+       return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+                  struct pt_regs *regs, struct hlist_head *head)
+{
+       struct perf_sample_data data;
+       struct perf_event *event;
+       struct hlist_node *node;
+
+       struct perf_raw_record raw = {
+               .size = entry_size,
+               .data = record,
+       };
+
+       perf_sample_data_init(&data, addr);
+       data.raw = &raw;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+               if (perf_tp_event_match(event, &data, regs))
+                       perf_swevent_add(event, count, 1, &data, regs);
+       }
+       rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-       perf_trace_disable(event->attr.config);
+       perf_trace_destroy(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
+       int err;
+
        /*
         * Raw tracepoint data is a severe data leak, only allow root to
         * have these.
@@ -4370,12 +4550,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
 
-       if (perf_trace_enable(event->attr.config))
+       err = perf_trace_init(event);
+       if (err)
                return NULL;
 
        event->destroy = tp_perf_event_destroy;
 
-       return &perf_ops_generic;
+       return &perf_ops_tracepoint;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4403,12 +4584,6 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static int perf_tp_event_match(struct perf_event *event,
-                               struct perf_sample_data *data)
-{
-       return 1;
-}
-
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
        return NULL;
@@ -4474,6 +4649,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
        WARN_ON(event->parent);
 
        atomic_dec(&perf_swevent_enabled[event_id]);
+       swevent_hlist_put(event);
 }
 
 static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4512,6 +4688,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
        case PERF_COUNT_SW_ALIGNMENT_FAULTS:
        case PERF_COUNT_SW_EMULATION_FAULTS:
                if (!event->parent) {
+                       int err;
+
+                       err = swevent_hlist_get(event);
+                       if (err)
+                               return ERR_PTR(err);
+
                        atomic_inc(&perf_swevent_enabled[event_id]);
                        event->destroy = sw_perf_event_destroy;
                }
@@ -4738,6 +4920,13 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
        int fput_needed = 0;
        int ret = -EINVAL;
 
+       /*
+        * Don't allow output of inherited per-task events. This would
+        * create performance issues due to cross cpu access.
+        */
+       if (event->cpu == -1 && event->attr.inherit)
+               return -EINVAL;
+
        if (!output_fd)
                goto set;
 
@@ -4758,6 +4947,18 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
        if (event->data)
                goto out;
 
+       /*
+        * Don't allow cross-cpu buffers
+        */
+       if (output_event->cpu != event->cpu)
+               goto out;
+
+       /*
+        * If its not a per-cpu buffer, it must be the same task.
+        */
+       if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+               goto out;
+
        atomic_long_inc(&output_file->f_count);
 
 set:
@@ -5176,7 +5377,7 @@ void perf_event_exit_task(struct task_struct *child)
         *
         * But since its the parent context it won't be the same instance.
         */
-       mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+       mutex_lock(&child_ctx->mutex);
 
 again:
        list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5384,6 +5585,7 @@ static void __init perf_event_init_all_cpus(void)
 
        for_each_possible_cpu(cpu) {
                cpuctx = &per_cpu(perf_cpu_context, cpu);
+               mutex_init(&cpuctx->hlist_mutex);
                __perf_event_init_context(&cpuctx->ctx, NULL);
        }
 }
@@ -5397,6 +5599,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
        spin_lock(&perf_resource_lock);
        cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
        spin_unlock(&perf_resource_lock);
+
+       mutex_lock(&cpuctx->hlist_mutex);
+       if (cpuctx->hlist_refcount > 0) {
+               struct swevent_hlist *hlist;
+
+               hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+               WARN_ON_ONCE(!hlist);
+               rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+       }
+       mutex_unlock(&cpuctx->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5416,6 +5628,10 @@ static void perf_event_exit_cpu(int cpu)
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_event_context *ctx = &cpuctx->ctx;
 
+       mutex_lock(&cpuctx->hlist_mutex);
+       swevent_hlist_release(cpuctx);
+       mutex_unlock(&cpuctx->hlist_mutex);
+
        mutex_lock(&ctx->mutex);
        smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
        mutex_unlock(&ctx->mutex);