X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=blobdiff_plain;f=kernel%2Fevents%2Fcore.c;h=42770953a0e301126b18ce0b237771d8968ebb2a;hp=d3b9df5962c25bdbd3ca324756474366ff8c6d68;hb=f3022d3db336e281ec070b876c3f0600a712a2f4;hpb=64b3dcc35e8dcaa3032aa6aba416d05f41ead959 diff --git a/kernel/events/core.c b/kernel/events/core.c index d3b9df5962c2..42770953a0e3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "internal.h" @@ -185,9 +186,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); - void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -245,9 +243,9 @@ perf_cgroup_match(struct perf_event *event) return !event->cgrp || event->cgrp == cpuctx->cgrp; } -static inline void perf_get_cgroup(struct perf_event *event) +static inline bool perf_tryget_cgroup(struct perf_event *event) { - css_get(&event->cgrp->css); + return css_tryget(&event->cgrp->css); } static inline void perf_put_cgroup(struct perf_event *event) @@ -363,6 +361,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + continue; /* ensure we process each cpuctx once */ /* * perf_cgroup_events says at least one @@ -386,9 +386,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode) if (mode & PERF_CGROUP_SWIN) { WARN_ON_ONCE(cpuctx->cgrp); - /* set cgrp before ctxsw in to - * allow event_filter_match() to not - * have to pass task around + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around */ cpuctx->cgrp = perf_cgroup_from_task(task); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); @@ -476,7 +477,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, event->cgrp = cgrp; /* must be done before we fput() the file */ - perf_get_cgroup(event); + if (!perf_tryget_cgroup(event)) { + event->cgrp = NULL; + ret = -ENOENT; + goto out; + } /* * all events in a group must monitor @@ -714,8 +719,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; - rcu_read_lock(); retry: + /* + * One of the few rules of preemptible RCU is that one cannot do + * rcu_read_unlock() while holding a scheduler (or nested) lock when + * part of the read side critical section was preemptible -- see + * rcu_read_unlock_special(). + * + * Since ctx->lock nests under rq->lock we must ensure the entire read + * side critical section is non-preemptible. + */ + preempt_disable(); + rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* @@ -731,6 +746,8 @@ retry: raw_spin_lock_irqsave(&ctx->lock, *flags); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock_irqrestore(&ctx->lock, *flags); + rcu_read_unlock(); + preempt_enable(); goto retry; } @@ -740,6 +757,7 @@ retry: } } rcu_read_unlock(); + preempt_enable(); return ctx; } @@ -889,6 +907,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +/* + * Initialize event state based on the perf_event_attr::disabled. + */ +static inline void perf_event__state_init(struct perf_event *event) +{ + event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : + PERF_EVENT_STATE_INACTIVE; +} + /* * Called at perf_event creation and when events are attached/detached from a * group. @@ -1154,6 +1181,11 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +struct remove_event { + struct perf_event *event; + bool detach_group; +}; + /* * Cross CPU call to remove a performance event * @@ -1162,12 +1194,15 @@ group_sched_out(struct perf_event *group_event, */ static int __perf_remove_from_context(void *info) { - struct perf_event *event = info; + struct remove_event *re = info; + struct perf_event *event = re->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); + if (re->detach_group) + perf_group_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && cpuctx->task_ctx == ctx) { ctx->is_active = 0; @@ -1192,10 +1227,14 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event, bool detach_group) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct remove_event re = { + .event = event, + .detach_group = detach_group, + }; lockdep_assert_held(&ctx->mutex); @@ -1204,12 +1243,12 @@ static void perf_remove_from_context(struct perf_event *event) * Per cpu events are removed via an smp call and * the removal is always successful. */ - cpu_function_call(event->cpu, __perf_remove_from_context, event); + cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; } retry: - if (!task_function_call(task, __perf_remove_from_context, event)) + if (!task_function_call(task, __perf_remove_from_context, &re)) return; raw_spin_lock_irq(&ctx->lock); @@ -1226,6 +1265,8 @@ retry: * Since the task isn't running, its safe to remove the event, us * holding the ctx->lock ensures the task won't get scheduled in. */ + if (detach_group) + perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1643,6 +1684,16 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -1687,7 +1738,16 @@ static int __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - if (WARN_ON_ONCE(!ctx->is_active)) + /* + * There's a time window between 'ctx->is_active' check + * in perf_event_enable function and this place having: + * - IRQs on + * - ctx->lock unlocked + * + * where the task could be killed and 'ctx' deactivated + * by perf_event_exit_task. + */ + if (!ctx->is_active) return -EINVAL; raw_spin_lock(&ctx->lock); @@ -2939,6 +2999,7 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2962,8 +3023,22 @@ static void free_event(struct perf_event *event) } if (event->rb) { - ring_buffer_put(event->rb); - event->rb = NULL; + struct ring_buffer *rb; + + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + rb = event->rb; + if (rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* could be last */ + } + mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) @@ -2996,10 +3071,7 @@ int perf_event_release_kernel(struct perf_event *event) * to trigger the AB-BA case. */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - raw_spin_unlock_irq(&ctx->lock); - perf_remove_from_context(event); + perf_remove_from_context(event, true); mutex_unlock(&ctx->mutex); free_event(event); @@ -3011,12 +3083,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ -static int perf_release(struct inode *inode, struct file *file) +static void put_event(struct perf_event *event) { - struct perf_event *event = file->private_data; struct task_struct *owner; - file->private_data = NULL; + if (!atomic_long_dec_and_test(&event->refcount)) + return; rcu_read_lock(); owner = ACCESS_ONCE(event->owner); @@ -3051,7 +3123,13 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(owner); } - return perf_event_release_kernel(event); + perf_event_release_kernel(event); +} + +static int perf_release(struct inode *inode, struct file *file) +{ + put_event(file->private_data); + return 0; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) @@ -3195,30 +3273,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) unsigned int events = POLL_HUP; /* - * Race between perf_event_set_output() and perf_poll(): perf_poll() - * grabs the rb reference but perf_event_set_output() overrides it. - * Here is the timeline for two threads T1, T2: - * t0: T1, rb = rcu_dereference(event->rb) - * t1: T2, old_rb = event->rb - * t2: T2, event->rb = new rb - * t3: T2, ring_buffer_detach(old_rb) - * t4: T1, ring_buffer_attach(rb1) - * t5: T1, poll_wait(event->waitq) - * - * To avoid this problem, we grab mmap_mutex in perf_poll() - * thereby ensuring that the assignment of the new ring buffer - * and the detachment of the old buffer appear atomic to perf_poll() + * Pin the event->rb by taking event->mmap_mutex; otherwise + * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (rb) { - ring_buffer_attach(event, rb); + rb = event->rb; + if (rb) events = atomic_xchg(&rb->poll, 0); - } - rcu_read_unlock(); - mutex_unlock(&event->mmap_mutex); poll_wait(file, &event->waitq, wait); @@ -3304,7 +3365,7 @@ unlock: static const struct file_operations perf_fops; -static struct perf_event *perf_fget_light(int fd, int *fput_needed) +static struct file *perf_fget_light(int fd, int *fput_needed) { struct file *file; @@ -3318,7 +3379,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) return ERR_PTR(-EBADF); } - return file->private_data; + return file; } static int perf_event_set_output(struct perf_event *event, @@ -3350,19 +3411,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { + struct file *output_file = NULL; struct perf_event *output_event = NULL; int fput_needed = 0; int ret; if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); + output_file = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_file)) + return PTR_ERR(output_file); + output_event = output_file->private_data; } ret = perf_event_set_output(event, output_event); if (output_event) - fput_light(output_event->filp, fput_needed); + fput_light(output_file, fput_needed); return ret; } @@ -3382,6 +3445,25 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +#ifdef CONFIG_COMPAT +static long perf_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): + /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ + if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { + cmd &= ~IOCSIZE_MASK; + cmd |= sizeof(void *) << IOCSIZE_SHIFT; + } + break; + } + return perf_ioctl(file, cmd, arg); +} +#else +# define perf_compat_ioctl NULL +#endif + int perf_event_task_enable(void) { struct perf_event *event; @@ -3530,16 +3612,12 @@ static void ring_buffer_attach(struct perf_event *event, return; spin_lock_irqsave(&rb->event_lock, flags); - if (!list_empty(&event->rb_entry)) - goto unlock; - - list_add(&event->rb_entry, &rb->event_list); -unlock: + if (list_empty(&event->rb_entry)) + list_add(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } -static void ring_buffer_detach(struct perf_event *event, - struct ring_buffer *rb) +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) { unsigned long flags; @@ -3558,8 +3636,9 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { - wake_up_all(&event->waitq); + if (rb) { + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) + wake_up_all(&event->waitq); } rcu_read_unlock(); } @@ -3589,18 +3668,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) static void ring_buffer_put(struct ring_buffer *rb) { - struct perf_event *event, *n; - unsigned long flags; - if (!atomic_dec_and_test(&rb->refcount)) return; - spin_lock_irqsave(&rb->event_lock, flags); - list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - } - spin_unlock_irqrestore(&rb->event_lock, flags); + WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } @@ -3610,26 +3681,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); + atomic_inc(&event->rb->mmap_count); } +/* + * A buffer can be mmap()ed multiple times; either directly through the same + * event, or through other events by use of perf_event_set_output(). + * + * In order to undo the VM accounting done by perf_mmap() we need to destroy + * the buffer here, where we still have a VM context. This means we need + * to detach all events redirecting to us. + */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->rb); - struct user_struct *user = event->mmap_user; - struct ring_buffer *rb = event->rb; + struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); + + atomic_dec(&rb->mmap_count); + + if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + return; + + /* Detach current event from the buffer. */ + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + mutex_unlock(&event->mmap_mutex); + + /* If there's still other mmap()s of this buffer, we're done. */ + if (atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); /* can't be last */ + return; + } + + /* + * No other mmap()s, detach from all other events that might redirect + * into the now unreachable buffer. Somewhat complicated by the + * fact that rb::event_lock otherwise nests inside mmap_mutex. + */ +again: + rcu_read_lock(); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + if (!atomic_long_inc_not_zero(&event->refcount)) { + /* + * This event is en-route to free_event() which will + * detach it and remove it from the list. + */ + continue; + } + rcu_read_unlock(); - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->pinned_vm -= event->mmap_locked; - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); + mutex_lock(&event->mmap_mutex); + /* + * Check we didn't race with perf_event_set_output() which can + * swizzle the rb from under us while we were waiting to + * acquire mmap_mutex. + * + * If we find a different rb; ignore this event, a next + * iteration will no longer find it on the list. We have to + * still restart the iteration to make sure we're not now + * iterating the wrong list. + */ + if (event->rb == rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* can't be last, we still have one */ + } mutex_unlock(&event->mmap_mutex); + put_event(event); - ring_buffer_put(rb); - free_uid(user); + /* + * Restart the iteration; either we're on the wrong list or + * destroyed its integrity by doing a deletion. + */ + goto again; } + rcu_read_unlock(); + + /* + * It could be there's still a few 0-ref events on the list; they'll + * get cleaned up by free_event() -- they'll also still have their + * ref on the rb and will free it whenever they are done with it. + * + * Aside from that, this buffer is 'fully' detached and unmapped, + * undo the VM accounting. + */ + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + + ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { @@ -3679,12 +3824,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); +again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages == nr_pages) - atomic_inc(&event->rb->refcount); - else + if (event->rb->nr_pages != nr_pages) { ret = -EINVAL; + goto unlock; + } + + if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Raced against perf_mmap_close() through + * perf_event_set_output(). Try again, hope for better + * luck. + */ + mutex_unlock(&event->mmap_mutex); + goto again; + } + goto unlock; } @@ -3725,19 +3882,27 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->rb, rb); + + atomic_set(&rb->mmap_count, 1); + rb->mmap_locked = extra; + rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->pinned_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += extra; + + ring_buffer_attach(event, rb); + rcu_assign_pointer(event->rb, rb); unlock: if (!ret) atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_RESERVED; + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. + */ + vma->vm_flags |= VM_DONTCOPY | VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; return ret; @@ -3765,7 +3930,7 @@ static const struct file_operations perf_fops = { .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, + .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; @@ -3791,6 +3956,13 @@ static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); + int rctx; + + rctx = perf_swevent_get_recursion_context(); + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ if (event->pending_disable) { event->pending_disable = 0; @@ -3801,6 +3973,9 @@ static void perf_pending_event(struct irq_work *entry) event->pending_wakeup = 0; perf_event_wakeup(event); } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } /* @@ -4260,7 +4435,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); @@ -4406,7 +4581,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_comm_ctx(&cpuctx->ctx, comm_event); @@ -4602,7 +4777,7 @@ got_name: rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); @@ -4775,6 +4950,9 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; + + /* Keeps track of cpu being initialized/exited */ + bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -5017,8 +5195,14 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (WARN_ON_ONCE(!head)) + if (!head) { + /* + * We can race with cpu hotplug code. Do not + * WARN if the cpu just got unplugged. + */ + WARN_ON_ONCE(swhash->online); return -EINVAL; + } hlist_add_head_rcu(&event->hlist_entry, head); @@ -5152,7 +5336,7 @@ static void sw_perf_event_destroy(struct perf_event *event) static int perf_swevent_init(struct perf_event *event) { - int event_id = event->attr.config; + u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; @@ -5624,8 +5808,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - if (cpuctx->active_pmu == old_pmu) - cpuctx->active_pmu = pmu; + if (cpuctx->unique_pmu == old_pmu) + cpuctx->unique_pmu = pmu; } } @@ -5744,6 +5928,7 @@ skip_type: if (pmu->pmu_cpu_context) goto got_cpu_context; + ret = -ENOMEM; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) goto free_dev; @@ -5759,7 +5944,7 @@ skip_type: cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; INIT_LIST_HEAD(&cpuctx->rotation_list); - cpuctx->active_pmu = pmu; + cpuctx->unique_pmu = pmu; } got_cpu_context: @@ -5908,6 +6093,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->mmap_mutex); + atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -5940,8 +6126,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->overflow_handler = overflow_handler; event->overflow_handler_context = context; - if (attr->disabled) - event->state = PERF_EVENT_STATE_OFF; + perf_event__state_init(event); pmu = NULL; @@ -6100,6 +6285,8 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; + old_rb = event->rb; + if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6107,16 +6294,28 @@ set: goto unlock; } - old_rb = event->rb; - rcu_assign_pointer(event->rb, rb); if (old_rb) ring_buffer_detach(event, old_rb); + + if (rb) + ring_buffer_attach(event, rb); + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } + ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_rb) - ring_buffer_put(old_rb); out: return ret; } @@ -6162,6 +6361,9 @@ SYSCALL_DEFINE5(perf_event_open, if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; + } else { + if (attr.sample_period & (1ULL << 63)) + return -EINVAL; } /* @@ -6178,12 +6380,12 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); + group_file = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_file)) { + err = PTR_ERR(group_file); goto err_fd; } - group_file = group_leader->filp; + group_leader = group_file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6308,17 +6510,24 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); + perf_remove_from_context(group_leader, false); + + /* + * Removing from the context ends up with disabled + * event. What we want here is event in the initial + * startup state, ready to be add into new context. + */ + perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling); + perf_remove_from_context(sibling, false); + perf_event__state_init(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); put_ctx(gctx); } - event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -6408,7 +6617,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_free; } - event->filp = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); @@ -6457,7 +6665,7 @@ static void sync_child_event(struct perf_event *child_event, * Release the parent event, if this was the last * reference to it. */ - fput(parent_event->filp); + put_event(parent_event); } static void @@ -6465,13 +6673,7 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); + perf_remove_from_context(child_event, !!child_event->parent); /* * It can happen that the parent exits first, and has events @@ -6533,9 +6735,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * __perf_event_exit_task() * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -6603,7 +6804,7 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - fput(parent->filp); + put_event(parent); perf_group_detach(event); list_del_event(event, ctx); @@ -6683,6 +6884,12 @@ inherit_event(struct perf_event *parent_event, NULL, NULL); if (IS_ERR(child_event)) return child_event; + + if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + free_event(child_event); + return NULL; + } + get_ctx(child_ctx); /* @@ -6723,14 +6930,6 @@ inherit_event(struct perf_event *parent_event, add_event_to_ctx(child_event, child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - /* * Link this into the parent event's child list */ @@ -6788,7 +6987,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * child. */ - child_ctx = alloc_perf_context(event->pmu, child); + child_ctx = alloc_perf_context(parent_ctx->pmu, child); if (!child_ctx) return -ENOMEM; @@ -6912,8 +7111,10 @@ int perf_event_init_task(struct task_struct *child) for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); - if (ret) + if (ret) { + perf_event_free_task(child); return ret; + } } return 0; @@ -6936,6 +7137,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); + swhash->online = true; if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -6958,15 +7160,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { + struct remove_event re = { .detach_group = false }; struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; perf_pmu_rotate_stop(ctx->pmu); - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_remove_from_context(event); + rcu_read_lock(); + list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) + __perf_remove_from_context(&re); + rcu_read_unlock(); } static void perf_event_exit_cpu_context(int cpu) @@ -6990,11 +7192,12 @@ static void perf_event_exit_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + perf_event_exit_cpu_context(cpu); + mutex_lock(&swhash->hlist_mutex); + swhash->online = false; swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { }