ptrace: use fsuid, fsgid, effective creds for fs access checks

[pandora-kernel.git] / kernel / events / core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c

index f475286..bc94278 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -665,6 +665,76 @@ static void put_ctx(struct perf_event_context *ctx)
         }
  }
  
+/*
+ * Because of perf_event::ctx migration in sys_perf_event_open::move_group we
+ * need some magic.
+ *
+ * Those places that change perf_event::ctx will hold both
+ * perf_event_context::mutex of the 'old' and 'new' ctx value.
+ *
+ * Lock ordering is by mutex address. There is one other site where
+ * perf_event_context::mutex nests and that is put_event(). But remember that
+ * that is a parent<->child context relation, and migration does not affect
+ * children, therefore these two orderings should not interact.
+ *
+ * The change in perf_event::ctx does not affect children (as claimed above)
+ * because the sys_perf_event_open() case will install a new event and break
+ * the ctx parent<->child relation.
+ *
+ * The places that change perf_event::ctx will issue:
+ *
+ *   perf_remove_from_context();
+ *   synchronize_rcu();
+ *   perf_install_in_context();
+ *
+ * to effect the change. The remove_from_context() + synchronize_rcu() should
+ * quiesce the event, after which we can install it in the new location. This
+ * means that only external vectors (perf_fops, prctl) can perturb the event
+ * while in transit. Therefore all such accessors should also acquire
+ * perf_event_context::mutex to serialize against this.
+ *
+ * However; because event->ctx can change while we're waiting to acquire
+ * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
+ * function.
+ *
+ * Lock order:
+ *     task_struct::perf_event_mutex
+ *       perf_event_context::mutex
+ *         perf_event_context::lock
+ *         perf_event::child_mutex
+ *         perf_event::mmap_mutex
+ *         mmap_sem
+ *
+ * perf_event_ctx_lock() returns the context it locked; the caller then holds
+ * that context's mutex and a reference on it, and hands both back through
+ * perf_event_ctx_unlock().
+ */
+static struct perf_event_context *perf_event_ctx_lock(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+again:
+       /*
+        * Sample event->ctx under RCU and pin it with a reference; if the
+        * refcount has already dropped to zero, re-read event->ctx and retry.
+        */
+       rcu_read_lock();
+       ctx = ACCESS_ONCE(event->ctx);
+       if (!atomic_inc_not_zero(&ctx->refcount)) {
+               rcu_read_unlock();
+               goto again;
+       }
+       rcu_read_unlock();
+
+       mutex_lock(&ctx->mutex);
+       /*
+        * event->ctx may have changed while we waited for the mutex; if so,
+        * release this context again and start over with the current one.
+        */
+       if (event->ctx != ctx) {
+               mutex_unlock(&ctx->mutex);
+               put_ctx(ctx);
+               goto again;
+       }
+
+       return ctx;
+}
+
+/*
+ * Counterpart of perf_event_ctx_lock(): release @ctx->mutex and then call
+ * put_ctx() on @ctx. @event is accepted for symmetry but is not used here.
+ */
+static void perf_event_ctx_unlock(struct perf_event *event,
+                                 struct perf_event_context *ctx)
+{
+       mutex_unlock(&ctx->mutex);
+       put_ctx(ctx);
+}
+
  static void unclone_ctx(struct perf_event_context *ctx)
  {
         if (ctx->parent_ctx) {
@@ -1325,7 +1395,7 @@ static int __perf_event_disable(void *info)
   * is the current context on this CPU and preemption is disabled,
   * hence we can't get into perf_event_task_sched_out for this context.
   */
-void perf_event_disable(struct perf_event *event)
+static void _perf_event_disable(struct perf_event *event)
  {
         struct perf_event_context *ctx = event->ctx;
         struct task_struct *task = ctx->task;
@@ -1367,6 +1437,19 @@ retry:
         raw_spin_unlock_irq(&ctx->lock);
  }
  
+/*
+ * Locked wrapper: runs _perf_event_disable() on @event between
+ * perf_event_ctx_lock() and perf_event_ctx_unlock().
+ *
+ * Strictly speaking kernel users cannot create groups and therefore this
+ * interface does not need the perf_event_ctx_lock() magic.
+ */
+void perf_event_disable(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+       ctx = perf_event_ctx_lock(event);
+       _perf_event_disable(event);
+       perf_event_ctx_unlock(event, ctx);
+}
+
+
  static void perf_set_shadow_time(struct perf_event *event,
                                  struct perf_event_context *ctx,
                                  u64 tstamp)
@@ -1813,7 +1896,7 @@ unlock:
   * perf_event_for_each_child or perf_event_for_each as described
   * for perf_event_disable.
   */
-void perf_event_enable(struct perf_event *event)
+static void _perf_event_enable(struct perf_event *event)
  {
         struct perf_event_context *ctx = event->ctx;
         struct task_struct *task = ctx->task;
@@ -1870,7 +1953,19 @@ out:
         raw_spin_unlock_irq(&ctx->lock);
  }
  
-int perf_event_refresh(struct perf_event *event, int refresh)
+/*
+ * Locked wrapper: runs _perf_event_enable() on @event between
+ * perf_event_ctx_lock() and perf_event_ctx_unlock().
+ *
+ * See perf_event_disable();
+ */
+void perf_event_enable(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+       ctx = perf_event_ctx_lock(event);
+       _perf_event_enable(event);
+       perf_event_ctx_unlock(event, ctx);
+}
+
+
+static int _perf_event_refresh(struct perf_event *event, int refresh)
  {
         /*
          * not supported on inherited events
@@ -1879,10 +1974,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
                 return -EINVAL;
  
         atomic_add(refresh, &event->event_limit);
-       perf_event_enable(event);
+       _perf_event_enable(event);
  
         return 0;
  }
+
+/*
+ * Locked wrapper: runs _perf_event_refresh() on @event between
+ * perf_event_ctx_lock() and perf_event_ctx_unlock(), and returns whatever
+ * _perf_event_refresh() returned.
+ *
+ * See perf_event_disable()
+ */
+int perf_event_refresh(struct perf_event *event, int refresh)
+{
+       struct perf_event_context *ctx;
+       int ret;
+
+       ctx = perf_event_ctx_lock(event);
+       ret = _perf_event_refresh(event, refresh);
+       perf_event_ctx_unlock(event, ctx);
+
+       return ret;
+}
  EXPORT_SYMBOL_GPL(perf_event_refresh);
  
  static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2894,7 +3004,7 @@ find_lively_task_by_vpid(pid_t vpid)
  
         /* Reuse ptrace permission checks for now. */
         err = -EACCES;
-       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+       if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
                 goto errout;
  
         return task;
@@ -3110,7 +3220,16 @@ static void put_event(struct perf_event *event)
         rcu_read_unlock();
  
         if (owner) {
-               mutex_lock(&owner->perf_event_mutex);
+               /*
+                * If we're here through perf_event_exit_task() we're already
+                * holding ctx->mutex which would be an inversion wrt. the
+                * normal lock order.
+                *
+                * However we can safely take this lock because it's the child
+                * ctx->mutex.
+                */
+               mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
+
                 /*
                  * We have to re-check the event->owner field, if it is cleared
                  * we raced with perf_event_exit_task(), acquiring the mutex
@@ -3162,12 +3281,13 @@ static int perf_event_read_group(struct perf_event *event,
                                    u64 read_format, char __user *buf)
  {
         struct perf_event *leader = event->group_leader, *sub;
-       int n = 0, size = 0, ret = -EFAULT;
         struct perf_event_context *ctx = leader->ctx;
-       u64 values[5];
+       int n = 0, size = 0, ret;
         u64 count, enabled, running;
+       u64 values[5];
+
+       lockdep_assert_held(&ctx->mutex);
  
-       mutex_lock(&ctx->mutex);
         count = perf_event_read_value(leader, &enabled, &running);
  
         values[n++] = 1 + leader->nr_siblings;
@@ -3182,7 +3302,7 @@ static int perf_event_read_group(struct perf_event *event,
         size = n * sizeof(u64);
  
         if (copy_to_user(buf, values, size))
-               goto unlock;
+               return -EFAULT;
  
         ret = size;
  
@@ -3196,14 +3316,11 @@ static int perf_event_read_group(struct perf_event *event,
                 size = n * sizeof(u64);
  
                 if (copy_to_user(buf + ret, values, size)) {
-                       ret = -EFAULT;
-                       goto unlock;
+                       return -EFAULT;
                 }
  
                 ret += size;
         }
-unlock:
-       mutex_unlock(&ctx->mutex);
  
         return ret;
  }
@@ -3262,8 +3379,14 @@ static ssize_t
  perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
  {
         struct perf_event *event = file->private_data;
+       struct perf_event_context *ctx;
+       int ret;
  
-       return perf_read_hw(event, buf, count);
+       ctx = perf_event_ctx_lock(event);
+       ret = perf_read_hw(event, buf, count);
+       perf_event_ctx_unlock(event, ctx);
+
+       return ret;
  }
  
  static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -3287,7 +3410,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
         return events;
  }
  
-static void perf_event_reset(struct perf_event *event)
+static void _perf_event_reset(struct perf_event *event)
  {
         (void)perf_event_read(event);
         local64_set(&event->count, 0);
@@ -3306,6 +3429,7 @@ static void perf_event_for_each_child(struct perf_event *event,
         struct perf_event *child;
  
         WARN_ON_ONCE(event->ctx->parent_ctx);
+
         mutex_lock(&event->child_mutex);
         func(event);
         list_for_each_entry(child, &event->child_list, child_list)
@@ -3319,15 +3443,14 @@ static void perf_event_for_each(struct perf_event *event,
         struct perf_event_context *ctx = event->ctx;
         struct perf_event *sibling;
  
-       WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
+       lockdep_assert_held(&ctx->mutex);
+
         event = event->group_leader;
  
         perf_event_for_each_child(event, func);
         func(event);
         list_for_each_entry(sibling, &event->sibling_list, group_entry)
-               perf_event_for_each_child(event, func);
-       mutex_unlock(&ctx->mutex);
+               perf_event_for_each_child(sibling, func);
  }
  
  static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3386,25 +3509,24 @@ static int perf_event_set_output(struct perf_event *event,
                                  struct perf_event *output_event);
  static int perf_event_set_filter(struct perf_event *event, void __user *arg);
  
-static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
  {
-       struct perf_event *event = file->private_data;
         void (*func)(struct perf_event *);
         u32 flags = arg;
  
         switch (cmd) {
         case PERF_EVENT_IOC_ENABLE:
-               func = perf_event_enable;
+               func = _perf_event_enable;
                 break;
         case PERF_EVENT_IOC_DISABLE:
-               func = perf_event_disable;
+               func = _perf_event_disable;
                 break;
         case PERF_EVENT_IOC_RESET:
-               func = perf_event_reset;
+               func = _perf_event_reset;
                 break;
  
         case PERF_EVENT_IOC_REFRESH:
-               return perf_event_refresh(event, arg);
+               return _perf_event_refresh(event, arg);
  
         case PERF_EVENT_IOC_PERIOD:
                 return perf_event_period(event, (u64 __user *)arg);
@@ -3445,6 +3567,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
         return 0;
  }
  
+/*
+ * ioctl entry point: fetch the event from file->private_data and run
+ * _perf_ioctl() on it between perf_event_ctx_lock() and
+ * perf_event_ctx_unlock(), so the command handlers execute with the event's
+ * context mutex held. Returns the result of _perf_ioctl().
+ */
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct perf_event *event = file->private_data;
+       struct perf_event_context *ctx;
+       long ret;
+
+       ctx = perf_event_ctx_lock(event);
+       ret = _perf_ioctl(event, cmd, arg);
+       perf_event_ctx_unlock(event, ctx);
+
+       return ret;
+}
+
+
  #ifdef CONFIG_COMPAT
  static long perf_compat_ioctl(struct file *file, unsigned int cmd,
                                 unsigned long arg)
@@ -3466,11 +3601,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
  
  int perf_event_task_enable(void)
  {
+       struct perf_event_context *ctx;
         struct perf_event *event;
  
         mutex_lock(&current->perf_event_mutex);
-       list_for_each_entry(event, &current->perf_event_list, owner_entry)
-               perf_event_for_each_child(event, perf_event_enable);
+       list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+               ctx = perf_event_ctx_lock(event);
+               perf_event_for_each_child(event, _perf_event_enable);
+               perf_event_ctx_unlock(event, ctx);
+       }
         mutex_unlock(&current->perf_event_mutex);
  
         return 0;
@@ -3478,11 +3617,15 @@ int perf_event_task_enable(void)
  
  int perf_event_task_disable(void)
  {
+       struct perf_event_context *ctx;
         struct perf_event *event;
  
         mutex_lock(&current->perf_event_mutex);
-       list_for_each_entry(event, &current->perf_event_list, owner_entry)
-               perf_event_for_each_child(event, perf_event_disable);
+       list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+               ctx = perf_event_ctx_lock(event);
+               perf_event_for_each_child(event, _perf_event_disable);
+               perf_event_ctx_unlock(event, ctx);
+       }
         mutex_unlock(&current->perf_event_mutex);
  
         return 0;
@@ -4958,9 +5101,6 @@ struct swevent_htable {
  
         /* Recursion avoidance in each contexts */
         int                             recursion[PERF_NR_CONTEXTS];
-
-       /* Keeps track of cpu being initialized/exited */
-       bool                            online;
  };
  
  static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5203,14 +5343,8 @@ static int perf_swevent_add(struct perf_event *event, int flags)
         hwc->state = !(flags & PERF_EF_START);
  
         head = find_swevent_head(swhash, event);
-       if (!head) {
-               /*
-                * We can race with cpu hotplug code. Do not
-                * WARN if the cpu just got unplugged.
-                */
-               WARN_ON_ONCE(swhash->online);
+       if (WARN_ON_ONCE(!head))
                 return -EINVAL;
-       }
  
         hlist_add_head_rcu(&event->hlist_entry, head);
  
@@ -5282,7 +5416,6 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
         int err = 0;
  
         mutex_lock(&swhash->hlist_mutex);
-
         if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
                 struct swevent_hlist *hlist;
  
@@ -5393,6 +5526,10 @@ static int perf_tp_filter_match(struct perf_event *event,
  {
         void *record = data->raw->data;
  
+       /* only top level events have filters set */
+       if (event->parent)
+               event = event->parent;
+
         if (likely(!event->filter) || filter_match_preds(event->filter, record))
                 return 1;
         return 0;
@@ -6328,6 +6465,46 @@ out:
         return ret;
  }
  
+/*
+ * Take two mutexes in address order: the pointers are swapped so that the
+ * lower address is locked first, and the second mutex is taken with
+ * mutex_lock_nested(SINGLE_DEPTH_NESTING).
+ *
+ * NOTE(review): a == b is not special-cased here, so passing the same mutex
+ * twice would try to lock it a second time -- confirm callers never do that.
+ */
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+       if (b < a)
+               swap(a, b);
+
+       mutex_lock(a);
+       mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
+/*
+ * Variation on perf_event_ctx_lock(), except we take two context
+ * mutexes.
+ *
+ * Pins the context @group_leader currently points at, then locks its mutex
+ * together with @ctx->mutex via mutex_lock_double(). Returns the group
+ * leader's context; on return both mutexes are held and a reference on the
+ * returned context has been taken. No reference is taken on @ctx here.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+                            struct perf_event_context *ctx)
+{
+       struct perf_event_context *gctx;
+
+again:
+       /* Sample group_leader->ctx under RCU and pin it with a reference. */
+       rcu_read_lock();
+       gctx = ACCESS_ONCE(group_leader->ctx);
+       if (!atomic_inc_not_zero(&gctx->refcount)) {
+               rcu_read_unlock();
+               goto again;
+       }
+       rcu_read_unlock();
+
+       mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+       /*
+        * group_leader->ctx changed while we were acquiring the mutexes:
+        * release both, drop the stale context and retry.
+        */
+       if (group_leader->ctx != gctx) {
+               mutex_unlock(&ctx->mutex);
+               mutex_unlock(&gctx->mutex);
+               put_ctx(gctx);
+               goto again;
+       }
+
+       return gctx;
+}
+
  /**
   * sys_perf_event_open - open a performance event, associate it to a task/cpu
   *
@@ -6343,7 +6520,7 @@ SYSCALL_DEFINE5(perf_event_open,
         struct perf_event *group_leader = NULL, *output_event = NULL;
         struct perf_event *event, *sibling;
         struct perf_event_attr attr;
-       struct perf_event_context *ctx;
+       struct perf_event_context *ctx, *uninitialized_var(gctx);
         struct file *event_file = NULL;
         struct file *group_file = NULL;
         struct task_struct *task = NULL;
@@ -6515,9 +6692,31 @@ SYSCALL_DEFINE5(perf_event_open,
         }
  
         if (move_group) {
-               struct perf_event_context *gctx = group_leader->ctx;
+               gctx = __perf_event_ctx_lock_double(group_leader, ctx);
  
-               mutex_lock(&gctx->mutex);
+               /*
+                * Check if we raced against another sys_perf_event_open() call
+                * moving the software group underneath us.
+                */
+               if (!(group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+                       /*
+                        * If someone moved the group out from under us, check
+                        * if this new event wound up on the same ctx, if so
+                        * it's the regular !move_group case, otherwise fail.
+                        */
+                       if (gctx != ctx) {
+                               err = -EINVAL;
+                               goto err_locked;
+                       } else {
+                               perf_event_ctx_unlock(group_leader, gctx);
+                               move_group = 0;
+                       }
+               }
+
+               /*
+                * See perf_event_ctx_lock() for comments on the details
+                * of swizzling perf_event::ctx.
+                */
                 perf_remove_from_context(group_leader, false);
  
                 /*
@@ -6532,14 +6731,19 @@ SYSCALL_DEFINE5(perf_event_open,
                         perf_event__state_init(sibling);
                         put_ctx(gctx);
                 }
-               mutex_unlock(&gctx->mutex);
-               put_ctx(gctx);
+       } else {
+               mutex_lock(&ctx->mutex);
         }
  
         WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
  
         if (move_group) {
+               /*
+                * Wait for everybody to stop referencing the events through
+                * the old lists, before installing it on new lists.
+                */
+               synchronize_rcu();
+
                 perf_install_in_context(ctx, group_leader, cpu);
                 get_ctx(ctx);
                 list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -6552,6 +6756,11 @@ SYSCALL_DEFINE5(perf_event_open,
         perf_install_in_context(ctx, event, cpu);
         ++ctx->generation;
         perf_unpin_context(ctx);
+
+       if (move_group) {
+               perf_event_ctx_unlock(group_leader, gctx);
+               put_ctx(gctx);
+       }
         mutex_unlock(&ctx->mutex);
  
         event->owner = current;
@@ -6576,11 +6785,21 @@ SYSCALL_DEFINE5(perf_event_open,
         fd_install(event_fd, event_file);
         return event_fd;
  
+err_locked:
+       if (move_group)
+               perf_event_ctx_unlock(group_leader, gctx);
+       mutex_unlock(&ctx->mutex);
+       fput(event_file);
  err_context:
         perf_unpin_context(ctx);
         put_ctx(ctx);
  err_alloc:
-       free_event(event);
+       /*
+        * If event_file is set, the fput() above will have called ->release()
+        * and that will take care of freeing the event.
+        */
+       if (!event_file)
+               free_event(event);
  err_task:
         if (task)
                 put_task_struct(task);
@@ -7054,7 +7273,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
                 ret = inherit_task_group(event, parent, parent_ctx,
                                          child, ctxn, &inherited_all);
                 if (ret)
-                       break;
+                       goto out_unlock;
         }
  
         /*
@@ -7070,7 +7289,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
                 ret = inherit_task_group(event, parent, parent_ctx,
                                          child, ctxn, &inherited_all);
                 if (ret)
-                       break;
+                       goto out_unlock;
         }
  
         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
@@ -7098,6 +7317,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
         }
  
         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+out_unlock:
         mutex_unlock(&parent_ctx->mutex);
  
         perf_unpin_context(parent_ctx);
@@ -7145,7 +7365,6 @@ static void __cpuinit perf_event_init_cpu(int cpu)
         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
         mutex_lock(&swhash->hlist_mutex);
-       swhash->online = true;
         if (swhash->hlist_refcount > 0) {
                 struct swevent_hlist *hlist;
  
@@ -7198,14 +7417,7 @@ static void perf_event_exit_cpu_context(int cpu)
  
  static void perf_event_exit_cpu(int cpu)
  {
-       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
         perf_event_exit_cpu_context(cpu);
-
-       mutex_lock(&swhash->hlist_mutex);
-       swhash->online = false;
-       swevent_hlist_release(swhash);
-       mutex_unlock(&swhash->hlist_mutex);
  }
  #else
  static inline void perf_event_exit_cpu(int cpu) { }