ptrace: use fsuid, fsgid, effective creds for fs access checks

[pandora-kernel.git] / kernel / events / core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 77e035a..bc94278 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,7 @@
  #include <linux/perf_event.h>
  #include <linux/ftrace_event.h>
  #include <linux/hw_breakpoint.h>
+#include <linux/compat.h>
  
  #include "internal.h"
  
@@ -185,9 +186,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
  static void update_context_time(struct perf_event_context *ctx);
  static u64 perf_event_time(struct perf_event *event);
  
-static void ring_buffer_attach(struct perf_event *event,
-                              struct ring_buffer *rb);
-
  void __weak perf_event_print_debug(void)       { }
  
  extern __weak const char *perf_pmu_name(void)
@@ -245,9 +243,9 @@ perf_cgroup_match(struct perf_event *event)
         return !event->cgrp || event->cgrp == cpuctx->cgrp;
  }
  
-static inline void perf_get_cgroup(struct perf_event *event)
+static inline bool perf_tryget_cgroup(struct perf_event *event)
  {
-       css_get(&event->cgrp->css);
+       return css_tryget(&event->cgrp->css);
  }
  
  static inline void perf_put_cgroup(struct perf_event *event)
@@ -363,6 +361,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
  
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+               if (cpuctx->unique_pmu != pmu)
+                       continue; /* ensure we process each cpuctx once */
  
                 /*
                  * perf_cgroup_events says at least one
@@ -386,9 +386,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
  
                         if (mode & PERF_CGROUP_SWIN) {
                                 WARN_ON_ONCE(cpuctx->cgrp);
-                               /* set cgrp before ctxsw in to
-                                * allow event_filter_match() to not
-                                * have to pass task around
+                               /*
+                                * set cgrp before ctxsw in to allow
+                                * event_filter_match() to not have to pass
+                                * task around
                                  */
                                 cpuctx->cgrp = perf_cgroup_from_task(task);
                                 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -476,7 +477,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
         event->cgrp = cgrp;
  
         /* must be done before we fput() the file */
-       perf_get_cgroup(event);
+       if (!perf_tryget_cgroup(event)) {
+               event->cgrp = NULL;
+               ret = -ENOENT;
+               goto out;
+       }
  
         /*
          * all events in a group must monitor
@@ -660,6 +665,76 @@ static void put_ctx(struct perf_event_context *ctx)
         }
  }
  
+/*
+ * Because of perf_event::ctx migration in sys_perf_event_open::move_group we
+ * need some magic.
+ *
+ * Those places that change perf_event::ctx will hold both
+ * perf_event_context::mutex of the 'old' and 'new' ctx value.
+ *
+ * Lock ordering is by mutex address. There is one other site where
+ * perf_event_context::mutex nests and that is put_event(). But remember that
+ * that is a parent<->child context relation, and migration does not affect
+ * children, therefore these two orderings should not interact.
+ *
+ * The change in perf_event::ctx does not affect children (as claimed above)
+ * because the sys_perf_event_open() case will install a new event and break
+ * the ctx parent<->child relation.
+ *
+ * The places that change perf_event::ctx will issue:
+ *
+ *   perf_remove_from_context();
+ *   synchronize_rcu();
+ *   perf_install_in_context();
+ *
+ * to effect the change. The remove_from_context() + synchronize_rcu() should
+ * quiesce the event, after which we can install it in the new location. This
+ * means that only external vectors (perf_fops, prctl) can perturb the event
+ * while in transit. Therefore all such accessors should also acquire
+ * perf_event_context::mutex to serialize against this.
+ *
+ * However; because event->ctx can change while we wait for ctx->mutex, use
+ * perf_event_ctx_lock() below: it takes a ctx reference, locks ctx->mutex and
+ * retries until event->ctx is unchanged. Returns the locked, referenced ctx.
+ *
+ * Lock order:
+ *     task_struct::perf_event_mutex
+ *       perf_event_context::mutex
+ *         perf_event_context::lock
+ *         perf_event::child_mutex
+ *         perf_event::mmap_mutex
+ *         mmap_sem
+ */
+static struct perf_event_context *perf_event_ctx_lock(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+again:
+       rcu_read_lock();
+       ctx = ACCESS_ONCE(event->ctx);
+       if (!atomic_inc_not_zero(&ctx->refcount)) {
+               rcu_read_unlock();
+               goto again;     /* refcount already zero; re-read event->ctx */
+       }
+       rcu_read_unlock();
+
+       mutex_lock(&ctx->mutex);
+       if (event->ctx != ctx) {        /* ctx changed while we waited */
+               mutex_unlock(&ctx->mutex);
+               put_ctx(ctx);
+               goto again;
+       }
+
+       return ctx;
+}
+
+static void perf_event_ctx_unlock(struct perf_event *event,
+                                 struct perf_event_context *ctx)
+{
+       mutex_unlock(&ctx->mutex);
+       put_ctx(ctx);   /* pairs with the ref taken in perf_event_ctx_lock() */
+}
+
  static void unclone_ctx(struct perf_event_context *ctx)
  {
         if (ctx->parent_ctx) {
@@ -714,8 +789,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
  {
         struct perf_event_context *ctx;
  
-       rcu_read_lock();
  retry:
+       /*
+        * One of the few rules of preemptible RCU is that one cannot do
+        * rcu_read_unlock() while holding a scheduler (or nested) lock when
+        * part of the read side critical section was preemptible -- see
+        * rcu_read_unlock_special().
+        *
+        * Since ctx->lock nests under rq->lock we must ensure the entire read
+        * side critical section is non-preemptible.
+        */
+       preempt_disable();
+       rcu_read_lock();
         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
         if (ctx) {
                 /*
@@ -731,6 +816,8 @@ retry:
                 raw_spin_lock_irqsave(&ctx->lock, *flags);
                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+                       rcu_read_unlock();
+                       preempt_enable();
                         goto retry;
                 }
  
@@ -740,6 +827,7 @@ retry:
                 }
         }
         rcu_read_unlock();
+       preempt_enable();
         return ctx;
  }
  
@@ -889,6 +977,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                 ctx->nr_stat++;
  }
  
+/*
+ * Set the initial event->state: OFF if attr.disabled is set, else INACTIVE.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+       event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+                                             PERF_EVENT_STATE_INACTIVE;
+}
+
  /*
   * Called at perf_event creation and when events are attached/detached from a
   * group.
@@ -1154,6 +1251,11 @@ group_sched_out(struct perf_event *group_event,
                 cpuctx->exclusive = 0;
  }
  
+struct remove_event {  /* argument bundle for __perf_remove_from_context() */
+       struct perf_event *event;       /* event to take out of its context */
+       bool detach_group;              /* also perf_group_detach() the event */
+};
+
  /*
   * Cross CPU call to remove a performance event
   *
@@ -1162,12 +1264,15 @@ group_sched_out(struct perf_event *group_event,
   */
  static int __perf_remove_from_context(void *info)
  {
-       struct perf_event *event = info;
+       struct remove_event *re = info;
+       struct perf_event *event = re->event;
         struct perf_event_context *ctx = event->ctx;
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         raw_spin_lock(&ctx->lock);
         event_sched_out(event, cpuctx, ctx);
+       if (re->detach_group)
+               perf_group_detach(event);
         list_del_event(event, ctx);
         if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
                 ctx->is_active = 0;
@@ -1192,10 +1297,14 @@ static int __perf_remove_from_context(void *info)
   * When called from perf_event_exit_task, it's OK because the
   * context has been detached from its task.
   */
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
  {
         struct perf_event_context *ctx = event->ctx;
         struct task_struct *task = ctx->task;
+       struct remove_event re = {
+               .event = event,
+               .detach_group = detach_group,
+       };
  
         lockdep_assert_held(&ctx->mutex);
  
@@ -1204,12 +1313,12 @@ static void perf_remove_from_context(struct perf_event *event)
                  * Per cpu events are removed via an smp call and
                  * the removal is always successful.
                  */
-               cpu_function_call(event->cpu, __perf_remove_from_context, event);
+               cpu_function_call(event->cpu, __perf_remove_from_context, &re);
                 return;
         }
  
  retry:
-       if (!task_function_call(task, __perf_remove_from_context, event))
+       if (!task_function_call(task, __perf_remove_from_context, &re))
                 return;
  
         raw_spin_lock_irq(&ctx->lock);
@@ -1226,6 +1335,8 @@ retry:
          * Since the task isn't running, its safe to remove the event, us
          * holding the ctx->lock ensures the task won't get scheduled in.
          */
+       if (detach_group)
+               perf_group_detach(event);
         list_del_event(event, ctx);
         raw_spin_unlock_irq(&ctx->lock);
  }
@@ -1284,7 +1395,7 @@ static int __perf_event_disable(void *info)
   * is the current context on this CPU and preemption is disabled,
   * hence we can't get into perf_event_task_sched_out for this context.
   */
-void perf_event_disable(struct perf_event *event)
+static void _perf_event_disable(struct perf_event *event)
  {
         struct perf_event_context *ctx = event->ctx;
         struct task_struct *task = ctx->task;
@@ -1326,6 +1437,19 @@ retry:
         raw_spin_unlock_irq(&ctx->lock);
  }
  
+/*
+ * Locked wrapper for _perf_event_disable(). Strictly speaking kernel users
+ * cannot create groups, so the perf_event_ctx_lock() magic is not needed.
+ */
+void perf_event_disable(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+       ctx = perf_event_ctx_lock(event);
+       _perf_event_disable(event);
+       perf_event_ctx_unlock(event, ctx);
+}
+
  static void perf_set_shadow_time(struct perf_event *event,
                                  struct perf_event_context *ctx,
                                  u64 tstamp)
@@ -1643,6 +1767,11 @@ retry:
          */
         if (ctx->is_active) {
                 raw_spin_unlock_irq(&ctx->lock);
+               /*
+                * Reload the task pointer, it might have been changed by
+                * a concurrent perf_event_context_sched_out().
+                */
+               task = ctx->task;
                 goto retry;
         }
  
@@ -1762,7 +1896,7 @@ unlock:
   * perf_event_for_each_child or perf_event_for_each as described
   * for perf_event_disable.
   */
-void perf_event_enable(struct perf_event *event)
+static void _perf_event_enable(struct perf_event *event)
  {
         struct perf_event_context *ctx = event->ctx;
         struct task_struct *task = ctx->task;
@@ -1819,7 +1953,19 @@ out:
         raw_spin_unlock_irq(&ctx->lock);
  }
  
-int perf_event_refresh(struct perf_event *event, int refresh)
+/*
+ * Locked wrapper for _perf_event_enable(); see perf_event_disable().
+ */
+void perf_event_enable(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+       ctx = perf_event_ctx_lock(event);
+       _perf_event_enable(event);
+       perf_event_ctx_unlock(event, ctx);
+}
+
+static int _perf_event_refresh(struct perf_event *event, int refresh)
  {
         /*
          * not supported on inherited events
@@ -1828,10 +1974,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
                 return -EINVAL;
  
         atomic_add(refresh, &event->event_limit);
-       perf_event_enable(event);
+       _perf_event_enable(event);
  
         return 0;
  }
+
+/*
+ * Locked wrapper for _perf_event_refresh(); see perf_event_disable().
+ */
+int perf_event_refresh(struct perf_event *event, int refresh)
+{
+       struct perf_event_context *ctx;
+       int ret;
+
+       ctx = perf_event_ctx_lock(event);
+       ret = _perf_event_refresh(event, refresh);
+       perf_event_ctx_unlock(event, ctx);
+
+       return ret;
+}
  EXPORT_SYMBOL_GPL(perf_event_refresh);
  
  static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2843,7 +3004,7 @@ find_lively_task_by_vpid(pid_t vpid)
  
         /* Reuse ptrace permission checks for now. */
         err = -EACCES;
-       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+       if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
                 goto errout;
  
         return task;
@@ -2948,6 +3109,7 @@ static void free_event_rcu(struct rcu_head *head)
  }
  
  static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
  
  static void free_event(struct perf_event *event)
  {
@@ -2971,8 +3133,22 @@ static void free_event(struct perf_event *event)
         }
  
         if (event->rb) {
-               ring_buffer_put(event->rb);
-               event->rb = NULL;
+               struct ring_buffer *rb;
+
+               /*
+                * Can happen when we close an event with re-directed output.
+                *
+                * Since we have a 0 refcount, perf_mmap_close() will skip
+                * over us; possibly making our ring_buffer_put() the last.
+                */
+               mutex_lock(&event->mmap_mutex);
+               rb = event->rb;
+               if (rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* could be last */
+               }
+               mutex_unlock(&event->mmap_mutex);
         }
  
         if (is_cgroup_event(event))
@@ -3005,10 +3181,7 @@ int perf_event_release_kernel(struct perf_event *event)
          *     to trigger the AB-BA case.
          */
         mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
-       raw_spin_lock_irq(&ctx->lock);
-       perf_group_detach(event);
-       raw_spin_unlock_irq(&ctx->lock);
-       perf_remove_from_context(event);
+       perf_remove_from_context(event, true);
         mutex_unlock(&ctx->mutex);
  
         free_event(event);
@@ -3047,7 +3220,16 @@ static void put_event(struct perf_event *event)
         rcu_read_unlock();
  
         if (owner) {
-               mutex_lock(&owner->perf_event_mutex);
+               /*
+                * If we're here through perf_event_exit_task() we're already
+                * holding ctx->mutex which would be an inversion wrt. the
+                * normal lock order.
+                *
+                * However we can safely take this lock because its the child
+                * ctx->mutex.
+                */
+               mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
+
                 /*
                  * We have to re-check the event->owner field, if it is cleared
                  * we raced with perf_event_exit_task(), acquiring the mutex
@@ -3099,12 +3281,13 @@ static int perf_event_read_group(struct perf_event *event,
                                    u64 read_format, char __user *buf)
  {
         struct perf_event *leader = event->group_leader, *sub;
-       int n = 0, size = 0, ret = -EFAULT;
         struct perf_event_context *ctx = leader->ctx;
-       u64 values[5];
+       int n = 0, size = 0, ret;
         u64 count, enabled, running;
+       u64 values[5];
+
+       lockdep_assert_held(&ctx->mutex);
  
-       mutex_lock(&ctx->mutex);
         count = perf_event_read_value(leader, &enabled, &running);
  
         values[n++] = 1 + leader->nr_siblings;
@@ -3119,7 +3302,7 @@ static int perf_event_read_group(struct perf_event *event,
         size = n * sizeof(u64);
  
         if (copy_to_user(buf, values, size))
-               goto unlock;
+               return -EFAULT;
  
         ret = size;
  
@@ -3133,14 +3316,11 @@ static int perf_event_read_group(struct perf_event *event,
                 size = n * sizeof(u64);
  
                 if (copy_to_user(buf + ret, values, size)) {
-                       ret = -EFAULT;
-                       goto unlock;
+                       return -EFAULT;
                 }
  
                 ret += size;
         }
-unlock:
-       mutex_unlock(&ctx->mutex);
  
         return ret;
  }
@@ -3199,8 +3379,14 @@ static ssize_t
  perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
  {
         struct perf_event *event = file->private_data;
+       struct perf_event_context *ctx;
+       int ret;
+
+       ctx = perf_event_ctx_lock(event);
+       ret = perf_read_hw(event, buf, count);
+       perf_event_ctx_unlock(event, ctx);
  
-       return perf_read_hw(event, buf, count);
+       return ret;
  }
  
  static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -3210,30 +3396,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
         unsigned int events = POLL_HUP;
  
         /*
-        * Race between perf_event_set_output() and perf_poll(): perf_poll()
-        * grabs the rb reference but perf_event_set_output() overrides it.
-        * Here is the timeline for two threads T1, T2:
-        * t0: T1, rb = rcu_dereference(event->rb)
-        * t1: T2, old_rb = event->rb
-        * t2: T2, event->rb = new rb
-        * t3: T2, ring_buffer_detach(old_rb)
-        * t4: T1, ring_buffer_attach(rb1)
-        * t5: T1, poll_wait(event->waitq)
-        *
-        * To avoid this problem, we grab mmap_mutex in perf_poll()
-        * thereby ensuring that the assignment of the new ring buffer
-        * and the detachment of the old buffer appear atomic to perf_poll()
+        * Pin the event->rb by taking event->mmap_mutex; otherwise
+        * perf_event_set_output() can swizzle our rb and make us miss wakeups.
          */
         mutex_lock(&event->mmap_mutex);
-
-       rcu_read_lock();
-       rb = rcu_dereference(event->rb);
-       if (rb) {
-               ring_buffer_attach(event, rb);
+       rb = event->rb;
+       if (rb)
                 events = atomic_xchg(&rb->poll, 0);
-       }
-       rcu_read_unlock();
-
         mutex_unlock(&event->mmap_mutex);
  
         poll_wait(file, &event->waitq, wait);
@@ -3241,7 +3410,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
         return events;
  }
  
-static void perf_event_reset(struct perf_event *event)
+static void _perf_event_reset(struct perf_event *event)
  {
         (void)perf_event_read(event);
         local64_set(&event->count, 0);
@@ -3260,6 +3429,7 @@ static void perf_event_for_each_child(struct perf_event *event,
         struct perf_event *child;
  
         WARN_ON_ONCE(event->ctx->parent_ctx);
+
         mutex_lock(&event->child_mutex);
         func(event);
         list_for_each_entry(child, &event->child_list, child_list)
@@ -3273,15 +3443,14 @@ static void perf_event_for_each(struct perf_event *event,
         struct perf_event_context *ctx = event->ctx;
         struct perf_event *sibling;
  
-       WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
+       lockdep_assert_held(&ctx->mutex);
+
         event = event->group_leader;
  
         perf_event_for_each_child(event, func);
         func(event);
         list_for_each_entry(sibling, &event->sibling_list, group_entry)
-               perf_event_for_each_child(event, func);
-       mutex_unlock(&ctx->mutex);
+               perf_event_for_each_child(sibling, func);
  }
  
  static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3340,25 +3509,24 @@ static int perf_event_set_output(struct perf_event *event,
                                  struct perf_event *output_event);
  static int perf_event_set_filter(struct perf_event *event, void __user *arg);
  
-static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
  {
-       struct perf_event *event = file->private_data;
         void (*func)(struct perf_event *);
         u32 flags = arg;
  
         switch (cmd) {
         case PERF_EVENT_IOC_ENABLE:
-               func = perf_event_enable;
+               func = _perf_event_enable;
                 break;
         case PERF_EVENT_IOC_DISABLE:
-               func = perf_event_disable;
+               func = _perf_event_disable;
                 break;
         case PERF_EVENT_IOC_RESET:
-               func = perf_event_reset;
+               func = _perf_event_reset;
                 break;
  
         case PERF_EVENT_IOC_REFRESH:
-               return perf_event_refresh(event, arg);
+               return _perf_event_refresh(event, arg);
  
         case PERF_EVENT_IOC_PERIOD:
                 return perf_event_period(event, (u64 __user *)arg);
@@ -3399,13 +3567,49 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
         return 0;
  }
  
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct perf_event *event = file->private_data;
+       struct perf_event_context *ctx;
+       long ret;
+
+       ctx = perf_event_ctx_lock(event);       /* ref + lock event->ctx */
+       ret = _perf_ioctl(event, cmd, arg);     /* runs with ctx->mutex held */
+       perf_event_ctx_unlock(event, ctx);
+
+       return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long perf_compat_ioctl(struct file *file, unsigned int cmd,
+                               unsigned long arg)
+{
+       switch (_IOC_NR(cmd)) {
+       case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
+               /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
+               if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
+                       cmd &= ~IOCSIZE_MASK;
+                       cmd |= sizeof(void *) << IOCSIZE_SHIFT;
+               }
+               break;
+       }
+       return perf_ioctl(file, cmd, arg);      /* other cmds pass unchanged */
+}
+#else
+# define perf_compat_ioctl NULL
+#endif /* CONFIG_COMPAT */
+
  int perf_event_task_enable(void)
  {
+       struct perf_event_context *ctx;
         struct perf_event *event;
  
         mutex_lock(&current->perf_event_mutex);
-       list_for_each_entry(event, &current->perf_event_list, owner_entry)
-               perf_event_for_each_child(event, perf_event_enable);
+       list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+               ctx = perf_event_ctx_lock(event);
+               perf_event_for_each_child(event, _perf_event_enable);
+               perf_event_ctx_unlock(event, ctx);
+       }
         mutex_unlock(&current->perf_event_mutex);
  
         return 0;
@@ -3413,11 +3617,15 @@ int perf_event_task_enable(void)
  
  int perf_event_task_disable(void)
  {
+       struct perf_event_context *ctx;
         struct perf_event *event;
  
         mutex_lock(&current->perf_event_mutex);
-       list_for_each_entry(event, &current->perf_event_list, owner_entry)
-               perf_event_for_each_child(event, perf_event_disable);
+       list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+               ctx = perf_event_ctx_lock(event);
+               perf_event_for_each_child(event, _perf_event_disable);
+               perf_event_ctx_unlock(event, ctx);
+       }
         mutex_unlock(&current->perf_event_mutex);
  
         return 0;
@@ -3547,16 +3755,12 @@ static void ring_buffer_attach(struct perf_event *event,
                 return;
  
         spin_lock_irqsave(&rb->event_lock, flags);
-       if (!list_empty(&event->rb_entry))
-               goto unlock;
-
-       list_add(&event->rb_entry, &rb->event_list);
-unlock:
+       if (list_empty(&event->rb_entry))
+               list_add(&event->rb_entry, &rb->event_list);
         spin_unlock_irqrestore(&rb->event_lock, flags);
  }
  
-static void ring_buffer_detach(struct perf_event *event,
-                              struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
  {
         unsigned long flags;
  
@@ -3575,13 +3779,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
  
         rcu_read_lock();
         rb = rcu_dereference(event->rb);
-       if (!rb)
-               goto unlock;
-
-       list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-               wake_up_all(&event->waitq);
-
-unlock:
+       if (rb) {
+               list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+                       wake_up_all(&event->waitq);
+       }
         rcu_read_unlock();
  }
  
@@ -3610,18 +3811,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
  
  static void ring_buffer_put(struct ring_buffer *rb)
  {
-       struct perf_event *event, *n;
-       unsigned long flags;
-
         if (!atomic_dec_and_test(&rb->refcount))
                 return;
  
-       spin_lock_irqsave(&rb->event_lock, flags);
-       list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-               list_del_init(&event->rb_entry);
-               wake_up_all(&event->waitq);
-       }
-       spin_unlock_irqrestore(&rb->event_lock, flags);
+       WARN_ON_ONCE(!list_empty(&rb->event_list));
  
         call_rcu(&rb->rcu_head, rb_free_rcu);
  }
@@ -3631,26 +3824,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
         struct perf_event *event = vma->vm_file->private_data;
  
         atomic_inc(&event->mmap_count);
+       atomic_inc(&event->rb->mmap_count);
  }
  
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
  static void perf_mmap_close(struct vm_area_struct *vma)
  {
         struct perf_event *event = vma->vm_file->private_data;
  
-       if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-               unsigned long size = perf_data_size(event->rb);
-               struct user_struct *user = event->mmap_user;
-               struct ring_buffer *rb = event->rb;
+       struct ring_buffer *rb = event->rb;
+       struct user_struct *mmap_user = rb->mmap_user;
+       int mmap_locked = rb->mmap_locked;
+       unsigned long size = perf_data_size(rb);
+
+       atomic_dec(&rb->mmap_count);
+
+       if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+               return;
+
+       /* Detach current event from the buffer. */
+       rcu_assign_pointer(event->rb, NULL);
+       ring_buffer_detach(event, rb);
+       mutex_unlock(&event->mmap_mutex);
+
+       /* If there's still other mmap()s of this buffer, we're done. */
+       if (atomic_read(&rb->mmap_count)) {
+               ring_buffer_put(rb); /* can't be last */
+               return;
+       }
  
-               atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-               vma->vm_mm->pinned_vm -= event->mmap_locked;
-               rcu_assign_pointer(event->rb, NULL);
-               ring_buffer_detach(event, rb);
+       /*
+        * No other mmap()s, detach from all other events that might redirect
+        * into the now unreachable buffer. Somewhat complicated by the
+        * fact that rb::event_lock otherwise nests inside mmap_mutex.
+        */
+again:
+       rcu_read_lock();
+       list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+               if (!atomic_long_inc_not_zero(&event->refcount)) {
+                       /*
+                        * This event is en-route to free_event() which will
+                        * detach it and remove it from the list.
+                        */
+                       continue;
+               }
+               rcu_read_unlock();
+
+               mutex_lock(&event->mmap_mutex);
+               /*
+                * Check we didn't race with perf_event_set_output() which can
+                * swizzle the rb from under us while we were waiting to
+                * acquire mmap_mutex.
+                *
+                * If we find a different rb; ignore this event, a next
+                * iteration will no longer find it on the list. We have to
+                * still restart the iteration to make sure we're not now
+                * iterating the wrong list.
+                */
+               if (event->rb == rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* can't be last, we still have one */
+               }
                 mutex_unlock(&event->mmap_mutex);
+               put_event(event);
  
-               ring_buffer_put(rb);
-               free_uid(user);
+               /*
+                * Restart the iteration; either we're on the wrong list or
+                * destroyed its integrity by doing a deletion.
+                */
+               goto again;
         }
+       rcu_read_unlock();
+
+       /*
+        * It could be there's still a few 0-ref events on the list; they'll
+        * get cleaned up by free_event() -- they'll also still have their
+        * ref on the rb and will free it whenever they are done with it.
+        *
+        * Aside from that, this buffer is 'fully' detached and unmapped,
+        * undo the VM accounting.
+        */
+
+       atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+       vma->vm_mm->pinned_vm -= mmap_locked;
+       free_uid(mmap_user);
+
+       ring_buffer_put(rb); /* could be last */
  }
  
  static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3700,12 +3967,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                 return -EINVAL;
  
         WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
         mutex_lock(&event->mmap_mutex);
         if (event->rb) {
-               if (event->rb->nr_pages == nr_pages)
-                       atomic_inc(&event->rb->refcount);
-               else
+               if (event->rb->nr_pages != nr_pages) {
                         ret = -EINVAL;
+                       goto unlock;
+               }
+
+               if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+                       /*
+                        * Raced against perf_mmap_close() through
+                        * perf_event_set_output(). Try again, hope for better
+                        * luck.
+                        */
+                       mutex_unlock(&event->mmap_mutex);
+                       goto again;
+               }
+
                 goto unlock;
         }
  
@@ -3746,19 +4025,27 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                 ret = -ENOMEM;
                 goto unlock;
         }
-       rcu_assign_pointer(event->rb, rb);
+
+       atomic_set(&rb->mmap_count, 1);
+       rb->mmap_locked = extra;
+       rb->mmap_user = get_current_user();
  
         atomic_long_add(user_extra, &user->locked_vm);
-       event->mmap_locked = extra;
-       event->mmap_user = get_current_user();
-       vma->vm_mm->pinned_vm += event->mmap_locked;
+       vma->vm_mm->pinned_vm += extra;
+
+       ring_buffer_attach(event, rb);
+       rcu_assign_pointer(event->rb, rb);
  
  unlock:
         if (!ret)
                 atomic_inc(&event->mmap_count);
         mutex_unlock(&event->mmap_mutex);
  
-       vma->vm_flags |= VM_RESERVED;
+       /*
+        * Since pinned accounting is per vm we cannot allow fork() to copy our
+        * vma.
+        */
+       vma->vm_flags |= VM_DONTCOPY | VM_RESERVED;
         vma->vm_ops = &perf_mmap_vmops;
  
         return ret;
@@ -3786,7 +4073,7 @@ static const struct file_operations perf_fops = {
         .read                   = perf_read,
         .poll                   = perf_poll,
         .unlocked_ioctl         = perf_ioctl,
-       .compat_ioctl           = perf_ioctl,
+       .compat_ioctl           = perf_compat_ioctl,
         .mmap                   = perf_mmap,
         .fasync                 = perf_fasync,
  };
@@ -3798,12 +4085,20 @@ static const struct file_operations perf_fops = {
   * to user-space before waking everybody up.
   */
  
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+       /* only the parent has fasync state; an event with a parent uses the parent's */
+       if (event->parent)
+               event = event->parent;
+       return &event->fasync;  /* callers dereference this or pass it to kill_fasync() */
+}
+
  void perf_event_wakeup(struct perf_event *event)
  {
         ring_buffer_wakeup(event);
  
         if (event->pending_kill) {
-               kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+               kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
                 event->pending_kill = 0;
         }
  }
@@ -3812,6 +4107,13 @@ static void perf_pending_event(struct irq_work *entry)
  {
         struct perf_event *event = container_of(entry,
                         struct perf_event, pending);
+       int rctx;
+
+       rctx = perf_swevent_get_recursion_context();
+       /*
+        * If we 'fail' here, that's OK, it means recursion is already disabled
+        * and we won't recurse 'further'.
+        */
  
         if (event->pending_disable) {
                 event->pending_disable = 0;
@@ -3822,6 +4124,9 @@ static void perf_pending_event(struct irq_work *entry)
                 event->pending_wakeup = 0;
                 perf_event_wakeup(event);
         }
+
+       if (rctx >= 0)
+               perf_swevent_put_recursion_context(rctx);
  }
  
  /*
@@ -4281,7 +4586,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
         rcu_read_lock();
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->active_pmu != pmu)
+               if (cpuctx->unique_pmu != pmu)
                         goto next;
                 perf_event_task_ctx(&cpuctx->ctx, task_event);
  
@@ -4427,7 +4732,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
         rcu_read_lock();
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->active_pmu != pmu)
+               if (cpuctx->unique_pmu != pmu)
                         goto next;
                 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
  
@@ -4623,7 +4928,7 @@ got_name:
         rcu_read_lock();
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->active_pmu != pmu)
+               if (cpuctx->unique_pmu != pmu)
                         goto next;
                 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
                                         vma->vm_flags & VM_EXEC);
@@ -4770,7 +5075,7 @@ static int __perf_event_overflow(struct perf_event *event,
         else
                 perf_event_output(event, data, regs);
  
-       if (event->fasync && event->pending_kill) {
+       if (*perf_event_fasync(event) && event->pending_kill) {
                 event->pending_wakeup = 1;
                 irq_work_queue(&event->pending);
         }
@@ -5111,7 +5416,6 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
         int err = 0;
  
         mutex_lock(&swhash->hlist_mutex);
-
         if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
                 struct swevent_hlist *hlist;
  
@@ -5222,6 +5526,10 @@ static int perf_tp_filter_match(struct perf_event *event,
  {
         void *record = data->raw->data;
  
+       /* only top level events have filters set */
+       if (event->parent)
+               event = event->parent;
+
         if (likely(!event->filter) || filter_match_preds(event->filter, record))
                 return 1;
         return 0;
@@ -5645,8 +5953,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
  
                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  
-               if (cpuctx->active_pmu == old_pmu)
-                       cpuctx->active_pmu = pmu;
+               if (cpuctx->unique_pmu == old_pmu)
+                       cpuctx->unique_pmu = pmu;
         }
  }
  
@@ -5781,7 +6089,7 @@ skip_type:
                 cpuctx->ctx.pmu = pmu;
                 cpuctx->jiffies_interval = 1;
                 INIT_LIST_HEAD(&cpuctx->rotation_list);
-               cpuctx->active_pmu = pmu;
+               cpuctx->unique_pmu = pmu;
         }
  
  got_cpu_context:
@@ -5963,8 +6271,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         event->overflow_handler = overflow_handler;
         event->overflow_handler_context = context;
  
-       if (attr->disabled)
-               event->state = PERF_EVENT_STATE_OFF;
+       perf_event__state_init(event);
  
         pmu = NULL;
  
@@ -6123,6 +6430,8 @@ set:
         if (atomic_read(&event->mmap_count))
                 goto unlock;
  
+       old_rb = event->rb;
+
         if (output_event) {
                 /* get the rb we want to redirect to */
                 rb = ring_buffer_get(output_event);
@@ -6130,20 +6439,72 @@ set:
                         goto unlock;
         }
  
-       old_rb = event->rb;
-       rcu_assign_pointer(event->rb, rb);
         if (old_rb)
                 ring_buffer_detach(event, old_rb);
+
+       if (rb)
+               ring_buffer_attach(event, rb);
+
+       rcu_assign_pointer(event->rb, rb);
+
+       if (old_rb) {
+               ring_buffer_put(old_rb);
+               /*
+                * We detached the old rb before setting the new one (so that
+                * the new rb could be attached), so a wakeup may have been
+                * missed in between. Provide it now.
+                */
+               wake_up_all(&event->waitq);
+       }
+
         ret = 0;
  unlock:
         mutex_unlock(&event->mmap_mutex);
  
-       if (old_rb)
-               ring_buffer_put(old_rb);
  out:
         return ret;
  }
  
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+       if (b < a)              /* always take the lower-addressed mutex first */
+               swap(a, b);
+
+       mutex_lock(a);
+       mutex_lock_nested(b, SINGLE_DEPTH_NESTING);     /* nested-lock annotation for the 2nd mutex */
+}
+
+/*
+ * Lock group_leader->ctx and @ctx together, retrying if the leader moves;
+ * like perf_event_ctx_lock_nested(), but returns with two ctx mutexes held.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+                            struct perf_event_context *ctx)
+{
+       struct perf_event_context *gctx;
+
+again:
+       rcu_read_lock();
+       gctx = ACCESS_ONCE(group_leader->ctx);
+       if (!atomic_inc_not_zero(&gctx->refcount)) {    /* refcount already zero: re-read ->ctx */
+               rcu_read_unlock();
+               goto again;
+       }
+       rcu_read_unlock();
+
+       mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+       if (group_leader->ctx != gctx) {        /* ->ctx changed meanwhile: undo and retry */
+               mutex_unlock(&ctx->mutex);
+               mutex_unlock(&gctx->mutex);
+               put_ctx(gctx);
+               goto again;
+       }
+
+       return gctx;    /* both mutexes held; the gctx reference taken above is kept */
+}
+
  /**
   * sys_perf_event_open - open a performance event, associate it to a task/cpu
   *
@@ -6159,7 +6520,7 @@ SYSCALL_DEFINE5(perf_event_open,
         struct perf_event *group_leader = NULL, *output_event = NULL;
         struct perf_event *event, *sibling;
         struct perf_event_attr attr;
-       struct perf_event_context *ctx;
+       struct perf_event_context *ctx, *uninitialized_var(gctx);
         struct file *event_file = NULL;
         struct file *group_file = NULL;
         struct task_struct *task = NULL;
@@ -6185,6 +6546,9 @@ SYSCALL_DEFINE5(perf_event_open,
         if (attr.freq) {
                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
                         return -EINVAL;
+       } else {
+               if (attr.sample_period & (1ULL << 63))
+                       return -EINVAL;
         }
  
         /*
@@ -6328,23 +6692,58 @@ SYSCALL_DEFINE5(perf_event_open,
         }
  
         if (move_group) {
-               struct perf_event_context *gctx = group_leader->ctx;
+               gctx = __perf_event_ctx_lock_double(group_leader, ctx);
  
-               mutex_lock(&gctx->mutex);
-               perf_remove_from_context(group_leader);
+               /*
+                * Check if we raced against another sys_perf_event_open() call
+                * moving the software group underneath us.
+                */
+               if (!(group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+                       /*
+                        * If someone moved the group out from under us, check
+                        * if this new event wound up on the same ctx; if so,
+                        * it's the regular !move_group case, otherwise fail.
+                        */
+                       if (gctx != ctx) {
+                               err = -EINVAL;
+                               goto err_locked;
+                       } else {
+                               perf_event_ctx_unlock(group_leader, gctx);
+                               move_group = 0;
+                       }
+               }
+
+               /*
+                * See perf_event_ctx_lock() for comments on the details
+                * of swizzling perf_event::ctx.
+                */
+               perf_remove_from_context(group_leader, false);
+
+               /*
+                * Removing an event from its context leaves it disabled.
+                * What we want here is the event in its initial startup
+                * state, ready to be added to the new context.
+                */
+               perf_event__state_init(group_leader);
                 list_for_each_entry(sibling, &group_leader->sibling_list,
                                     group_entry) {
-                       perf_remove_from_context(sibling);
+                       perf_remove_from_context(sibling, false);
+                       perf_event__state_init(sibling);
                         put_ctx(gctx);
                 }
-               mutex_unlock(&gctx->mutex);
-               put_ctx(gctx);
+       } else {
+               mutex_lock(&ctx->mutex);
         }
  
         WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
  
         if (move_group) {
+               /*
+                * Wait for everybody to stop referencing the events through
+                * the old lists, before installing it on new lists.
+                */
+               synchronize_rcu();
+
                 perf_install_in_context(ctx, group_leader, cpu);
                 get_ctx(ctx);
                 list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -6357,6 +6756,11 @@ SYSCALL_DEFINE5(perf_event_open,
         perf_install_in_context(ctx, event, cpu);
         ++ctx->generation;
         perf_unpin_context(ctx);
+
+       if (move_group) {
+               perf_event_ctx_unlock(group_leader, gctx);
+               put_ctx(gctx);
+       }
         mutex_unlock(&ctx->mutex);
  
         event->owner = current;
@@ -6381,11 +6785,21 @@ SYSCALL_DEFINE5(perf_event_open,
         fd_install(event_fd, event_file);
         return event_fd;
  
+err_locked:
+       if (move_group)
+               perf_event_ctx_unlock(group_leader, gctx);
+       mutex_unlock(&ctx->mutex);
+       fput(event_file);
  err_context:
         perf_unpin_context(ctx);
         put_ctx(ctx);
  err_alloc:
-       free_event(event);
+       /*
+        * If event_file is set, the fput() above will have called ->release()
+        * and that will take care of freeing the event.
+        */
+       if (!event_file)
+               free_event(event);
  err_task:
         if (task)
                 put_task_struct(task);
@@ -6486,13 +6900,7 @@ __perf_event_exit_task(struct perf_event *child_event,
                          struct perf_event_context *child_ctx,
                          struct task_struct *child)
  {
-       if (child_event->parent) {
-               raw_spin_lock_irq(&child_ctx->lock);
-               perf_group_detach(child_event);
-               raw_spin_unlock_irq(&child_ctx->lock);
-       }
-
-       perf_remove_from_context(child_event);
+       perf_remove_from_context(child_event, !!child_event->parent);
  
         /*
          * It can happen that the parent exits first, and has events
@@ -6865,7 +7273,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
                 ret = inherit_task_group(event, parent, parent_ctx,
                                          child, ctxn, &inherited_all);
                 if (ret)
-                       break;
+                       goto out_unlock;
         }
  
         /*
@@ -6881,7 +7289,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
                 ret = inherit_task_group(event, parent, parent_ctx,
                                          child, ctxn, &inherited_all);
                 if (ret)
-                       break;
+                       goto out_unlock;
         }
  
         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
@@ -6909,6 +7317,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
         }
  
         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+out_unlock:
         mutex_unlock(&parent_ctx->mutex);
  
         perf_unpin_context(parent_ctx);
@@ -6930,8 +7339,10 @@ int perf_event_init_task(struct task_struct *child)
  
         for_each_task_context_nr(ctxn) {
                 ret = perf_event_init_context(child, ctxn);
-               if (ret)
+               if (ret) {
+                       perf_event_free_task(child);
                         return ret;
+               }
         }
  
         return 0;
@@ -6976,15 +7387,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
  
  static void __perf_event_exit_context(void *__info)
  {
+       struct remove_event re = { .detach_group = false };     /* re.event is filled in by the loop below */
         struct perf_event_context *ctx = __info;
-       struct perf_event *event, *tmp;
  
         perf_pmu_rotate_stop(ctx->pmu);
  
-       list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-               __perf_remove_from_context(event);
-       list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
-               __perf_remove_from_context(event);
+       rcu_read_lock();
+       list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)        /* re.event is the cursor */
+               __perf_remove_from_context(&re);
+       rcu_read_unlock();
  }
  
  static void perf_event_exit_cpu_context(int cpu)
@@ -7006,12 +7417,6 @@ static void perf_event_exit_cpu_context(int cpu)
  
  static void perf_event_exit_cpu(int cpu)
  {
-       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
-       mutex_lock(&swhash->hlist_mutex);
-       swevent_hlist_release(swhash);
-       mutex_unlock(&swhash->hlist_mutex);
-
         perf_event_exit_cpu_context(cpu);
  }
  #else