perf: Fix inherited events vs. tracepoint filters

[pandora-kernel.git] / kernel / events / core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 600c162..7c0b4f0 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,7 @@
  #include <linux/perf_event.h>
  #include <linux/ftrace_event.h>
  #include <linux/hw_breakpoint.h>
+#include <linux/compat.h>
  
  #include "internal.h"
  
@@ -185,9 +186,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
  static void update_context_time(struct perf_event_context *ctx);
  static u64 perf_event_time(struct perf_event *event);
  
-static void ring_buffer_attach(struct perf_event *event,
-                              struct ring_buffer *rb);
-
  void __weak perf_event_print_debug(void)       { }
  
  extern __weak const char *perf_pmu_name(void)
@@ -245,9 +243,9 @@ perf_cgroup_match(struct perf_event *event)
         return !event->cgrp || event->cgrp == cpuctx->cgrp;
  }
  
-static inline void perf_get_cgroup(struct perf_event *event)
+static inline bool perf_tryget_cgroup(struct perf_event *event)
  {
-       css_get(&event->cgrp->css);
+       return css_tryget(&event->cgrp->css);
  }
  
  static inline void perf_put_cgroup(struct perf_event *event)
@@ -363,6 +361,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
  
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+               if (cpuctx->unique_pmu != pmu)
+                       continue; /* ensure we process each cpuctx once */
  
                 /*
                  * perf_cgroup_events says at least one
@@ -386,9 +386,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
  
                         if (mode & PERF_CGROUP_SWIN) {
                                 WARN_ON_ONCE(cpuctx->cgrp);
-                               /* set cgrp before ctxsw in to
-                                * allow event_filter_match() to not
-                                * have to pass task around
+                               /*
+                                * set cgrp before ctxsw in to allow
+                                * event_filter_match() to not have to pass
+                                * task around
                                  */
                                 cpuctx->cgrp = perf_cgroup_from_task(task);
                                 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -476,7 +477,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
         event->cgrp = cgrp;
  
         /* must be done before we fput() the file */
-       perf_get_cgroup(event);
+       if (!perf_tryget_cgroup(event)) {
+               event->cgrp = NULL;
+               ret = -ENOENT;
+               goto out;
+       }
  
         /*
          * all events in a group must monitor
@@ -714,8 +719,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
  {
         struct perf_event_context *ctx;
  
-       rcu_read_lock();
  retry:
+       /*
+        * One of the few rules of preemptible RCU is that one cannot do
+        * rcu_read_unlock() while holding a scheduler (or nested) lock when
+        * part of the read side critical section was preemptible -- see
+        * rcu_read_unlock_special().
+        *
+        * Since ctx->lock nests under rq->lock we must ensure the entire read
+        * side critical section is non-preemptible.
+        */
+       preempt_disable();
+       rcu_read_lock();
         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
         if (ctx) {
                 /*
@@ -731,6 +746,8 @@ retry:
                 raw_spin_lock_irqsave(&ctx->lock, *flags);
                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+                       rcu_read_unlock();
+                       preempt_enable();
                         goto retry;
                 }
  
@@ -740,6 +757,7 @@ retry:
                 }
         }
         rcu_read_unlock();
+       preempt_enable();
         return ctx;
  }
  
@@ -889,6 +907,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                 ctx->nr_stat++;
  }
  
+/*
+ * Set event->state from attr.disabled: OFF if set, otherwise INACTIVE.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+       event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+                                             PERF_EVENT_STATE_INACTIVE;
+}
+
  /*
   * Called at perf_event creation and when events are attached/detached from a
   * group.
@@ -1154,6 +1181,11 @@ group_sched_out(struct perf_event *group_event,
                 cpuctx->exclusive = 0;
  }
  
+struct remove_event {
+       struct perf_event *event;       /* event to remove from its context */
+       bool detach_group;              /* also perf_group_detach() the event */
+};
+
  /*
   * Cross CPU call to remove a performance event
   *
@@ -1162,12 +1194,15 @@ group_sched_out(struct perf_event *group_event,
   */
  static int __perf_remove_from_context(void *info)
  {
-       struct perf_event *event = info;
+       struct remove_event *re = info;
+       struct perf_event *event = re->event;
         struct perf_event_context *ctx = event->ctx;
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         raw_spin_lock(&ctx->lock);
         event_sched_out(event, cpuctx, ctx);
+       if (re->detach_group)
+               perf_group_detach(event);
         list_del_event(event, ctx);
         if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
                 ctx->is_active = 0;
@@ -1192,10 +1227,14 @@ static int __perf_remove_from_context(void *info)
   * When called from perf_event_exit_task, it's OK because the
   * context has been detached from its task.
   */
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
  {
         struct perf_event_context *ctx = event->ctx;
         struct task_struct *task = ctx->task;
+       struct remove_event re = {
+               .event = event,
+               .detach_group = detach_group,
+       };
  
         lockdep_assert_held(&ctx->mutex);
  
@@ -1204,12 +1243,12 @@ static void perf_remove_from_context(struct perf_event *event)
                  * Per cpu events are removed via an smp call and
                  * the removal is always successful.
                  */
-               cpu_function_call(event->cpu, __perf_remove_from_context, event);
+               cpu_function_call(event->cpu, __perf_remove_from_context, &re);
                 return;
         }
  
  retry:
-       if (!task_function_call(task, __perf_remove_from_context, event))
+       if (!task_function_call(task, __perf_remove_from_context, &re))
                 return;
  
         raw_spin_lock_irq(&ctx->lock);
@@ -1226,6 +1265,8 @@ retry:
          * Since the task isn't running, its safe to remove the event, us
          * holding the ctx->lock ensures the task won't get scheduled in.
          */
+       if (detach_group)
+               perf_group_detach(event);
         list_del_event(event, ctx);
         raw_spin_unlock_irq(&ctx->lock);
  }
@@ -1643,6 +1684,16 @@ retry:
          */
         if (ctx->is_active) {
                 raw_spin_unlock_irq(&ctx->lock);
+               /*
+                * Reload the task pointer, it might have been changed by
+                * a concurrent perf_event_context_sched_out().
+                */
+               task = ctx->task;
+               /*
+                * NOTE(review): this second reload repeats the one directly
+                * above (the hunk looks double-applied); redundant but harmless.
+                */
+               task = ctx->task;
                 goto retry;
         }
  
@@ -1687,7 +1738,16 @@ static int __perf_event_enable(void *info)
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         int err;
  
-       if (WARN_ON_ONCE(!ctx->is_active))
+       /*
+        * There's a time window between the 'ctx->is_active' check
+        * in perf_event_enable() and this point, during which:
+        *   - IRQs are on
+        *   - ctx->lock is unlocked
+        *
+        * so the task could be killed and 'ctx' deactivated
+        * by perf_event_exit_task().
+        */
+       if (!ctx->is_active)
                 return -EINVAL;
  
         raw_spin_lock(&ctx->lock);
@@ -2174,11 +2234,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
          */
         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
  
-       perf_event_sched_in(cpuctx, ctx, task);
-
         if (ctx->nr_events)
                 cpuctx->task_ctx = ctx;
  
+       perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
+
         perf_pmu_enable(ctx->pmu);
         perf_ctx_unlock(cpuctx, ctx);
  
@@ -2939,6 +2999,7 @@ static void free_event_rcu(struct rcu_head *head)
  }
  
  static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
  
  static void free_event(struct perf_event *event)
  {
@@ -2962,8 +3023,22 @@ static void free_event(struct perf_event *event)
         }
  
         if (event->rb) {
-               ring_buffer_put(event->rb);
-               event->rb = NULL;
+               struct ring_buffer *rb;
+
+               /*
+                * Can happen when we close an event with re-directed output.
+                *
+                * Since we have a 0 refcount, perf_mmap_close() will skip
+                * over us; possibly making our ring_buffer_put() the last.
+                */
+               mutex_lock(&event->mmap_mutex);
+               rb = event->rb;
+               if (rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* could be last */
+               }
+               mutex_unlock(&event->mmap_mutex);
         }
  
         if (is_cgroup_event(event))
@@ -2996,10 +3071,7 @@ int perf_event_release_kernel(struct perf_event *event)
          *     to trigger the AB-BA case.
          */
         mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
-       raw_spin_lock_irq(&ctx->lock);
-       perf_group_detach(event);
-       raw_spin_unlock_irq(&ctx->lock);
-       perf_remove_from_context(event);
+       perf_remove_from_context(event, true);
         mutex_unlock(&ctx->mutex);
  
         free_event(event);
@@ -3011,12 +3083,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
  /*
   * Called when the last reference to the file is gone.
   */
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
  {
-       struct perf_event *event = file->private_data;
         struct task_struct *owner;
  
-       file->private_data = NULL;
+       if (!atomic_long_dec_and_test(&event->refcount))
+               return;
  
         rcu_read_lock();
         owner = ACCESS_ONCE(event->owner);
@@ -3051,7 +3123,13 @@ static int perf_release(struct inode *inode, struct file *file)
                 put_task_struct(owner);
         }
  
-       return perf_event_release_kernel(event);
+       perf_event_release_kernel(event);
+}
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+       put_event(file->private_data);
+       return 0;
  }
  
  u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3195,30 +3273,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
         unsigned int events = POLL_HUP;
  
         /*
-        * Race between perf_event_set_output() and perf_poll(): perf_poll()
-        * grabs the rb reference but perf_event_set_output() overrides it.
-        * Here is the timeline for two threads T1, T2:
-        * t0: T1, rb = rcu_dereference(event->rb)
-        * t1: T2, old_rb = event->rb
-        * t2: T2, event->rb = new rb
-        * t3: T2, ring_buffer_detach(old_rb)
-        * t4: T1, ring_buffer_attach(rb1)
-        * t5: T1, poll_wait(event->waitq)
-        *
-        * To avoid this problem, we grab mmap_mutex in perf_poll()
-        * thereby ensuring that the assignment of the new ring buffer
-        * and the detachment of the old buffer appear atomic to perf_poll()
+        * Pin the event->rb by taking event->mmap_mutex; otherwise
+        * perf_event_set_output() can swizzle our rb and make us miss wakeups.
          */
         mutex_lock(&event->mmap_mutex);
-
-       rcu_read_lock();
-       rb = rcu_dereference(event->rb);
-       if (rb) {
-               ring_buffer_attach(event, rb);
+       rb = event->rb;
+       if (rb)
                 events = atomic_xchg(&rb->poll, 0);
-       }
-       rcu_read_unlock();
-
         mutex_unlock(&event->mmap_mutex);
  
         poll_wait(file, &event->waitq, wait);
@@ -3304,7 +3365,7 @@ unlock:
  
  static const struct file_operations perf_fops;
  
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static struct file *perf_fget_light(int fd, int *fput_needed)
  {
         struct file *file;
  
@@ -3318,7 +3379,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed)
                 return ERR_PTR(-EBADF);
         }
  
-       return file->private_data;
+       return file;
  }
  
  static int perf_event_set_output(struct perf_event *event,
@@ -3350,19 +3411,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
  
         case PERF_EVENT_IOC_SET_OUTPUT:
         {
+               struct file *output_file = NULL;
                 struct perf_event *output_event = NULL;
                 int fput_needed = 0;
                 int ret;
  
                 if (arg != -1) {
-                       output_event = perf_fget_light(arg, &fput_needed);
-                       if (IS_ERR(output_event))
-                               return PTR_ERR(output_event);
+                       output_file = perf_fget_light(arg, &fput_needed);
+                       if (IS_ERR(output_file))
+                               return PTR_ERR(output_file);
+                       output_event = output_file->private_data;
                 }
  
                 ret = perf_event_set_output(event, output_event);
                 if (output_event)
-                       fput_light(output_event->filp, fput_needed);
+                       fput_light(output_file, fput_needed);
  
                 return ret;
         }
@@ -3382,6 +3445,25 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
         return 0;
  }
  
+#ifdef CONFIG_COMPAT
+static long perf_compat_ioctl(struct file *file, unsigned int cmd,
+                               unsigned long arg)
+{
+       switch (_IOC_NR(cmd)) {
+       case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
+               /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
+               if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
+                       cmd &= ~IOCSIZE_MASK;
+                       cmd |= sizeof(void *) << IOCSIZE_SHIFT;
+               }
+               break;
+       }
+       return perf_ioctl(file, cmd, arg);
+}
+#else /* !CONFIG_COMPAT */
+# define perf_compat_ioctl NULL
+#endif /* CONFIG_COMPAT */
+
  int perf_event_task_enable(void)
  {
         struct perf_event *event;
@@ -3530,16 +3612,12 @@ static void ring_buffer_attach(struct perf_event *event,
                 return;
  
         spin_lock_irqsave(&rb->event_lock, flags);
-       if (!list_empty(&event->rb_entry))
-               goto unlock;
-
-       list_add(&event->rb_entry, &rb->event_list);
-unlock:
+       if (list_empty(&event->rb_entry))
+               list_add(&event->rb_entry, &rb->event_list);
         spin_unlock_irqrestore(&rb->event_lock, flags);
  }
  
-static void ring_buffer_detach(struct perf_event *event,
-                              struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
  {
         unsigned long flags;
  
@@ -3558,8 +3636,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
  
         rcu_read_lock();
         rb = rcu_dereference(event->rb);
-       list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
-               wake_up_all(&event->waitq);
+       if (rb) {
+               list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+                       wake_up_all(&event->waitq);
         }
         rcu_read_unlock();
  }
@@ -3589,18 +3668,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
  
  static void ring_buffer_put(struct ring_buffer *rb)
  {
-       struct perf_event *event, *n;
-       unsigned long flags;
-
         if (!atomic_dec_and_test(&rb->refcount))
                 return;
  
-       spin_lock_irqsave(&rb->event_lock, flags);
-       list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-               list_del_init(&event->rb_entry);
-               wake_up_all(&event->waitq);
-       }
-       spin_unlock_irqrestore(&rb->event_lock, flags);
+       WARN_ON_ONCE(!list_empty(&rb->event_list));
  
         call_rcu(&rb->rcu_head, rb_free_rcu);
  }
@@ -3610,26 +3681,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
         struct perf_event *event = vma->vm_file->private_data;
  
         atomic_inc(&event->mmap_count);
+       atomic_inc(&event->rb->mmap_count);
  }
  
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
  static void perf_mmap_close(struct vm_area_struct *vma)
  {
         struct perf_event *event = vma->vm_file->private_data;
  
-       if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-               unsigned long size = perf_data_size(event->rb);
-               struct user_struct *user = event->mmap_user;
-               struct ring_buffer *rb = event->rb;
+       struct ring_buffer *rb = event->rb;
+       struct user_struct *mmap_user = rb->mmap_user;
+       int mmap_locked = rb->mmap_locked;
+       unsigned long size = perf_data_size(rb);
+
+       atomic_dec(&rb->mmap_count);
+
+       if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+               return;
+
+       /* Detach current event from the buffer. */
+       rcu_assign_pointer(event->rb, NULL);
+       ring_buffer_detach(event, rb);
+       mutex_unlock(&event->mmap_mutex);
+
+       /* If there's still other mmap()s of this buffer, we're done. */
+       if (atomic_read(&rb->mmap_count)) {
+               ring_buffer_put(rb); /* can't be last */
+               return;
+       }
+
+       /*
+        * No other mmap()s, detach from all other events that might redirect
+        * into the now unreachable buffer. Somewhat complicated by the
+        * fact that rb::event_lock otherwise nests inside mmap_mutex.
+        */
+again:
+       rcu_read_lock();
+       list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+               if (!atomic_long_inc_not_zero(&event->refcount)) {
+                       /*
+                        * This event is en-route to free_event() which will
+                        * detach it and remove it from the list.
+                        */
+                       continue;
+               }
+               rcu_read_unlock();
  
-               atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-               vma->vm_mm->pinned_vm -= event->mmap_locked;
-               rcu_assign_pointer(event->rb, NULL);
-               ring_buffer_detach(event, rb);
+               mutex_lock(&event->mmap_mutex);
+               /*
+                * Check we didn't race with perf_event_set_output() which can
+                * swizzle the rb from under us while we were waiting to
+                * acquire mmap_mutex.
+                *
+                * If we find a different rb, ignore this event; the next
+                * iteration will no longer find it on the list. We have to
+                * still restart the iteration to make sure we're not now
+                * iterating the wrong list.
+                */
+               if (event->rb == rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* can't be last, we still have one */
+               }
                 mutex_unlock(&event->mmap_mutex);
+               put_event(event);
  
-               ring_buffer_put(rb);
-               free_uid(user);
+               /*
+                * Restart the iteration; either we're on the wrong list or
+                * destroyed its integrity by doing a deletion.
+                */
+               goto again;
         }
+       rcu_read_unlock();
+
+       /*
+        * There may still be a few 0-ref events on the list; they'll get
+        * cleaned up by free_event() -- they also still hold their ref on
+        * the rb and will free it whenever they are done with it.
+        *
+        * Aside from that, this buffer is 'fully' detached and unmapped,
+        * undo the VM accounting.
+        */
+
+       atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+       vma->vm_mm->pinned_vm -= mmap_locked;
+       free_uid(mmap_user);
+
+       ring_buffer_put(rb); /* could be last */
  }
  
  static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3679,12 +3824,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                 return -EINVAL;
  
         WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
         mutex_lock(&event->mmap_mutex);
         if (event->rb) {
-               if (event->rb->nr_pages == nr_pages)
-                       atomic_inc(&event->rb->refcount);
-               else
+               if (event->rb->nr_pages != nr_pages) {
                         ret = -EINVAL;
+                       goto unlock;
+               }
+
+               if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+                       /*
+                        * Raced against perf_mmap_close() through
+                        * perf_event_set_output(). Try again, hope for better
+                        * luck.
+                        */
+                       mutex_unlock(&event->mmap_mutex);
+                       goto again;
+               }
+
                 goto unlock;
         }
  
@@ -3725,19 +3882,27 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                 ret = -ENOMEM;
                 goto unlock;
         }
-       rcu_assign_pointer(event->rb, rb);
+
+       atomic_set(&rb->mmap_count, 1);
+       rb->mmap_locked = extra;
+       rb->mmap_user = get_current_user();
  
         atomic_long_add(user_extra, &user->locked_vm);
-       event->mmap_locked = extra;
-       event->mmap_user = get_current_user();
-       vma->vm_mm->pinned_vm += event->mmap_locked;
+       vma->vm_mm->pinned_vm += extra;
+
+       ring_buffer_attach(event, rb);
+       rcu_assign_pointer(event->rb, rb);
  
  unlock:
         if (!ret)
                 atomic_inc(&event->mmap_count);
         mutex_unlock(&event->mmap_mutex);
  
-       vma->vm_flags |= VM_RESERVED;
+       /*
+        * Since pinned accounting is per vm we cannot allow fork() to copy our
+        * vma.
+        */
+       vma->vm_flags |= VM_DONTCOPY | VM_RESERVED;
         vma->vm_ops = &perf_mmap_vmops;
  
         return ret;
@@ -3765,7 +3930,7 @@ static const struct file_operations perf_fops = {
         .read                   = perf_read,
         .poll                   = perf_poll,
         .unlocked_ioctl         = perf_ioctl,
-       .compat_ioctl           = perf_ioctl,
+       .compat_ioctl           = perf_compat_ioctl,
         .mmap                   = perf_mmap,
         .fasync                 = perf_fasync,
  };
@@ -3777,12 +3942,20 @@ static const struct file_operations perf_fops = {
   * to user-space before waking everybody up.
   */
  
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+       /* only the parent has fasync state */
+       if (event->parent)
+               event = event->parent;
+       return &event->fasync;
+}
+
  void perf_event_wakeup(struct perf_event *event)
  {
         ring_buffer_wakeup(event);
  
         if (event->pending_kill) {
-               kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+               kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
                 event->pending_kill = 0;
         }
  }
@@ -3791,6 +3964,13 @@ static void perf_pending_event(struct irq_work *entry)
  {
         struct perf_event *event = container_of(entry,
                         struct perf_event, pending);
+       int rctx;
+
+       rctx = perf_swevent_get_recursion_context();
+       /*
+        * If we 'fail' here, that's OK, it means recursion is already disabled
+        * and we won't recurse 'further'.
+        */
  
         if (event->pending_disable) {
                 event->pending_disable = 0;
@@ -3801,6 +3981,9 @@ static void perf_pending_event(struct irq_work *entry)
                 event->pending_wakeup = 0;
                 perf_event_wakeup(event);
         }
+
+       if (rctx >= 0)
+               perf_swevent_put_recursion_context(rctx);
  }
  
  /*
@@ -4260,7 +4443,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
         rcu_read_lock();
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->active_pmu != pmu)
+               if (cpuctx->unique_pmu != pmu)
                         goto next;
                 perf_event_task_ctx(&cpuctx->ctx, task_event);
  
@@ -4406,7 +4589,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
         rcu_read_lock();
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->active_pmu != pmu)
+               if (cpuctx->unique_pmu != pmu)
                         goto next;
                 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
  
@@ -4602,7 +4785,7 @@ got_name:
         rcu_read_lock();
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->active_pmu != pmu)
+               if (cpuctx->unique_pmu != pmu)
                         goto next;
                 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
                                         vma->vm_flags & VM_EXEC);
@@ -4749,7 +4932,7 @@ static int __perf_event_overflow(struct perf_event *event,
         else
                 perf_event_output(event, data, regs);
  
-       if (event->fasync && event->pending_kill) {
+       if (*perf_event_fasync(event) && event->pending_kill) {
                 event->pending_wakeup = 1;
                 irq_work_queue(&event->pending);
         }
@@ -4775,6 +4958,9 @@ struct swevent_htable {
  
         /* Recursion avoidance in each contexts */
         int                             recursion[PERF_NR_CONTEXTS];
+
+       /* Keeps track of cpu being initialized/exited */
+       bool                            online;
  };
  
  static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5017,8 +5203,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
         hwc->state = !(flags & PERF_EF_START);
  
         head = find_swevent_head(swhash, event);
-       if (WARN_ON_ONCE(!head))
+       if (!head) {
+               /*
+                * We can race with cpu hotplug code. Do not
+                * WARN if the cpu just got unplugged.
+                */
+               WARN_ON_ONCE(swhash->online);
                 return -EINVAL;
+       }
  
         hlist_add_head_rcu(&event->hlist_entry, head);
  
@@ -5152,7 +5344,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
  
  static int perf_swevent_init(struct perf_event *event)
  {
-       int event_id = event->attr.config;
+       u64 event_id = event->attr.config;
  
         if (event->attr.type != PERF_TYPE_SOFTWARE)
                 return -ENOENT;
@@ -5201,6 +5393,10 @@ static int perf_tp_filter_match(struct perf_event *event,
  {
         void *record = data->raw->data;
  
+       /* only top level events have filters set */
+       if (event->parent)
+               event = event->parent;
+
         if (likely(!event->filter) || filter_match_preds(event->filter, record))
                 return 1;
         return 0;
@@ -5624,8 +5820,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
  
                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  
-               if (cpuctx->active_pmu == old_pmu)
-                       cpuctx->active_pmu = pmu;
+               if (cpuctx->unique_pmu == old_pmu)
+                       cpuctx->unique_pmu = pmu;
         }
  }
  
@@ -5744,6 +5940,7 @@ skip_type:
         if (pmu->pmu_cpu_context)
                 goto got_cpu_context;
  
+       ret = -ENOMEM;
         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
         if (!pmu->pmu_cpu_context)
                 goto free_dev;
@@ -5759,7 +5956,7 @@ skip_type:
                 cpuctx->ctx.pmu = pmu;
                 cpuctx->jiffies_interval = 1;
                 INIT_LIST_HEAD(&cpuctx->rotation_list);
-               cpuctx->active_pmu = pmu;
+               cpuctx->unique_pmu = pmu;
         }
  
  got_cpu_context:
@@ -5908,6 +6105,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
  
         mutex_init(&event->mmap_mutex);
  
+       atomic_long_set(&event->refcount, 1);
         event->cpu              = cpu;
         event->attr             = *attr;
         event->group_leader     = group_leader;
@@ -5940,8 +6138,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         event->overflow_handler = overflow_handler;
         event->overflow_handler_context = context;
  
-       if (attr->disabled)
-               event->state = PERF_EVENT_STATE_OFF;
+       perf_event__state_init(event);
  
         pmu = NULL;
  
@@ -6100,6 +6297,8 @@ set:
         if (atomic_read(&event->mmap_count))
                 goto unlock;
  
+       old_rb = event->rb;
+
         if (output_event) {
                 /* get the rb we want to redirect to */
                 rb = ring_buffer_get(output_event);
@@ -6107,16 +6306,28 @@ set:
                         goto unlock;
         }
  
-       old_rb = event->rb;
-       rcu_assign_pointer(event->rb, rb);
         if (old_rb)
                 ring_buffer_detach(event, old_rb);
+
+       if (rb)
+               ring_buffer_attach(event, rb);
+
+       rcu_assign_pointer(event->rb, rb);
+
+       if (old_rb) {
+               ring_buffer_put(old_rb);
+               /*
+                * We detached before setting the new rb (so that we could
+                * attach the new rb), so we could have missed a wakeup.
+                * Provide it now.
+                */
+               wake_up_all(&event->waitq);
+       }
+
         ret = 0;
  unlock:
         mutex_unlock(&event->mmap_mutex);
  
-       if (old_rb)
-               ring_buffer_put(old_rb);
  out:
         return ret;
  }
@@ -6162,6 +6373,9 @@ SYSCALL_DEFINE5(perf_event_open,
         if (attr.freq) {
                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
                         return -EINVAL;
+       } else {
+               if (attr.sample_period & (1ULL << 63))
+                       return -EINVAL;
         }
  
         /*
@@ -6178,12 +6392,12 @@ SYSCALL_DEFINE5(perf_event_open,
                 return event_fd;
  
         if (group_fd != -1) {
-               group_leader = perf_fget_light(group_fd, &fput_needed);
-               if (IS_ERR(group_leader)) {
-                       err = PTR_ERR(group_leader);
+               group_file = perf_fget_light(group_fd, &fput_needed);
+               if (IS_ERR(group_file)) {
+                       err = PTR_ERR(group_file);
                         goto err_fd;
                 }
-               group_file = group_leader->filp;
+               group_leader = group_file->private_data;
                 if (flags & PERF_FLAG_FD_OUTPUT)
                         output_event = group_leader;
                 if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6308,17 +6522,24 @@ SYSCALL_DEFINE5(perf_event_open,
                 struct perf_event_context *gctx = group_leader->ctx;
  
                 mutex_lock(&gctx->mutex);
-               perf_remove_from_context(group_leader);
+               perf_remove_from_context(group_leader, false);
+
+               /*
+                * Removing from the context leaves the event disabled.
+                * What we want here is the event in its initial
+                * startup state, ready to be added to the new context.
+                */
+               perf_event__state_init(group_leader);
                 list_for_each_entry(sibling, &group_leader->sibling_list,
                                     group_entry) {
-                       perf_remove_from_context(sibling);
+                       perf_remove_from_context(sibling, false);
+                       perf_event__state_init(sibling);
                         put_ctx(gctx);
                 }
                 mutex_unlock(&gctx->mutex);
                 put_ctx(gctx);
         }
  
-       event->filp = event_file;
         WARN_ON_ONCE(ctx->parent_ctx);
         mutex_lock(&ctx->mutex);
  
@@ -6408,7 +6629,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
                 goto err_free;
         }
  
-       event->filp = NULL;
         WARN_ON_ONCE(ctx->parent_ctx);
         mutex_lock(&ctx->mutex);
         perf_install_in_context(ctx, event, cpu);
@@ -6457,7 +6677,7 @@ static void sync_child_event(struct perf_event *child_event,
          * Release the parent event, if this was the last
          * reference to it.
          */
-       fput(parent_event->filp);
+       put_event(parent_event);
  }
  
  static void
@@ -6465,13 +6685,7 @@ __perf_event_exit_task(struct perf_event *child_event,
                          struct perf_event_context *child_ctx,
                          struct task_struct *child)
  {
-       if (child_event->parent) {
-               raw_spin_lock_irq(&child_ctx->lock);
-               perf_group_detach(child_event);
-               raw_spin_unlock_irq(&child_ctx->lock);
-       }
-
-       perf_remove_from_context(child_event);
+       perf_remove_from_context(child_event, !!child_event->parent);
  
         /*
          * It can happen that the parent exits first, and has events
@@ -6533,9 +6747,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
          *
          *   __perf_event_exit_task()
          *     sync_child_event()
-        *       fput(parent_event->filp)
-        *         perf_release()
-        *           mutex_lock(&ctx->mutex)
+        *       put_event()
+        *         mutex_lock(&ctx->mutex)
          *
          * But since its the parent context it won't be the same instance.
          */
@@ -6603,7 +6816,7 @@ static void perf_free_event(struct perf_event *event,
         list_del_init(&event->child_list);
         mutex_unlock(&parent->child_mutex);
  
-       fput(parent->filp);
+       put_event(parent);
  
         perf_group_detach(event);
         list_del_event(event, ctx);
@@ -6683,6 +6896,12 @@ inherit_event(struct perf_event *parent_event,
                                            NULL, NULL);
         if (IS_ERR(child_event))
                 return child_event;
+
+       if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+               free_event(child_event);
+               return NULL;
+       }
+
         get_ctx(child_ctx);
  
         /*
@@ -6723,14 +6942,6 @@ inherit_event(struct perf_event *parent_event,
         add_event_to_ctx(child_event, child_ctx);
         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
  
-       /*
-        * Get a reference to the parent filp - we will fput it
-        * when the child event exits. This is safe to do because
-        * we are in the parent and we know that the filp still
-        * exists and has a nonzero count:
-        */
-       atomic_long_inc(&parent_event->filp->f_count);
-
         /*
          * Link this into the parent event's child list
          */
@@ -6788,7 +6999,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
                  * child.
                  */
  
-               child_ctx = alloc_perf_context(event->pmu, child);
+               child_ctx = alloc_perf_context(parent_ctx->pmu, child);
                 if (!child_ctx)
                         return -ENOMEM;
  
@@ -6912,8 +7123,10 @@ int perf_event_init_task(struct task_struct *child)
  
         for_each_task_context_nr(ctxn) {
                 ret = perf_event_init_context(child, ctxn);
-               if (ret)
+               if (ret) {
+                       perf_event_free_task(child);
                         return ret;
+               }
         }
  
         return 0;
@@ -6936,6 +7149,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
         mutex_lock(&swhash->hlist_mutex);
+       swhash->online = true;
         if (swhash->hlist_refcount > 0) {
                 struct swevent_hlist *hlist;
  
@@ -6958,15 +7172,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
  
  static void __perf_event_exit_context(void *__info)
  {
+       struct remove_event re = { .detach_group = false };
         struct perf_event_context *ctx = __info;
-       struct perf_event *event, *tmp;
  
         perf_pmu_rotate_stop(ctx->pmu);
  
-       list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-               __perf_remove_from_context(event);
-       list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
-               __perf_remove_from_context(event);
+       rcu_read_lock();
+       list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+               __perf_remove_from_context(&re);
+       rcu_read_unlock();
  }
  
  static void perf_event_exit_cpu_context(int cpu)
@@ -6990,11 +7204,12 @@ static void perf_event_exit_cpu(int cpu)
  {
         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
+       perf_event_exit_cpu_context(cpu);
+
         mutex_lock(&swhash->hlist_mutex);
+       swhash->online = false;
         swevent_hlist_release(swhash);
         mutex_unlock(&swhash->hlist_mutex);
-
-       perf_event_exit_cpu_context(cpu);
  }
  #else
  static inline void perf_event_exit_cpu(int cpu) { }