tracing/filters: Add __field_ext() to TRACE_EVENT
[pandora-kernel.git] kernel/perf_counter.c
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7530588..b0b20a0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
 static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
+static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
@@ -1103,7 +1104,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx,
                __perf_counter_sync_stat(counter, next_counter);
 
                counter = list_next_entry(counter, event_entry);
-               next_counter = list_next_entry(counter, event_entry);
+               next_counter = list_next_entry(next_counter, event_entry);
        }
 }
 
@@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter)
                        atomic_dec(&nr_mmap_counters);
                if (counter->attr.comm)
                        atomic_dec(&nr_comm_counters);
+               if (counter->attr.task)
+                       atomic_dec(&nr_task_counters);
        }
 
        if (counter->destroy)
@@ -1688,6 +1691,18 @@ static int perf_release(struct inode *inode, struct file *file)
        return 0;
 }
 
+static u64 perf_counter_read_tree(struct perf_counter *counter)
+{
+       struct perf_counter *child;
+       u64 total = 0;
+
+       total += perf_counter_read(counter);
+       list_for_each_entry(child, &counter->child_list, child_list)
+               total += perf_counter_read(child);
+
+       return total;
+}
+
 /*
  * Read the performance counter - simple non blocking version for now
  */
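
perf_counter_read_tree() above folds the values of all inherited child counters into the parent's reading, so a read on an inheriting counter reports the whole task tree. Below is a minimal user-space sketch of that summing behaviour, assuming a flat NULL-terminated chain in place of the kernel's child_list; struct toy_counter and toy_read_tree() are hypothetical stand-ins, not kernel code.

/* Illustrative sketch only: a parent counter's visible value is its own
 * count plus the counts of every counter inherited by its children, as
 * perf_counter_read_tree() does above. */
#include <stdint.h>
#include <stdio.h>

struct toy_counter {
	uint64_t count;			/* this counter's own value */
	struct toy_counter *child;	/* next inherited counter, NULL-terminated */
};

static uint64_t toy_read_tree(const struct toy_counter *c)
{
	uint64_t total = c->count;

	for (c = c->child; c; c = c->child)
		total += c->count;

	return total;
}

int main(void)
{
	struct toy_counter inherited2 = { .count = 7,   .child = NULL };
	struct toy_counter inherited1 = { .count = 5,   .child = &inherited2 };
	struct toy_counter parent     = { .count = 100, .child = &inherited1 };

	printf("%llu\n", (unsigned long long)toy_read_tree(&parent));	/* 112 */
	return 0;
}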
@@ -1707,7 +1722,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 
        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
-       values[0] = perf_counter_read(counter);
+       values[0] = perf_counter_read_tree(counter);
        n = 1;
        if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[n++] = counter->total_time_enabled +
@@ -2030,7 +2045,7 @@ fail:
 
 static void perf_mmap_free_page(unsigned long addr)
 {
-       struct page *page = virt_to_page(addr);
+       struct page *page = virt_to_page((void *)addr);
 
        page->mapping = NULL;
        __free_page(page);
@@ -2699,6 +2714,18 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
                        header.size += sizeof(u64);
        }
 
+       if (sample_type & PERF_SAMPLE_RAW) {
+               int size = sizeof(u32);
+
+               if (data->raw)
+                       size += data->raw->size;
+               else
+                       size += sizeof(u32);
+
+               WARN_ON_ONCE(size & (sizeof(u64)-1));
+               header.size += size;
+       }
+
        ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
        if (ret)
                return;
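
The PERF_SAMPLE_RAW sizing above reserves a leading u32 length word followed by either the raw payload or a dummy u32, and warns if the total is not u64-aligned (the tracepoint side is expected to pad its records so that this holds). A self-contained sketch of that arithmetic; raw_block_size() is a hypothetical helper, not a kernel function.

/* Illustrative sketch only: space taken by a PERF_SAMPLE_RAW block. */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t raw_block_size(uint32_t payload_size, int have_payload)
{
	uint32_t size = sizeof(uint32_t);	/* leading size word */

	if (have_payload)
		size += payload_size;		/* payload padded so 4 + payload is a multiple of 8 */
	else
		size += sizeof(uint32_t);	/* dummy u32 keeps u64 alignment */

	assert((size & (sizeof(uint64_t) - 1)) == 0);
	return size;
}

int main(void)
{
	printf("%" PRIu32 "\n", raw_block_size(12, 1));	/* 16 bytes in the sample */
	printf("%" PRIu32 "\n", raw_block_size(0, 0));	/* 8 bytes: size word + padding */
	return 0;
}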
@@ -2762,6 +2789,22 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
                }
        }
 
+       if (sample_type & PERF_SAMPLE_RAW) {
+               if (data->raw) {
+                       perf_output_put(&handle, data->raw->size);
+                       perf_output_copy(&handle, data->raw->data, data->raw->size);
+               } else {
+                       struct {
+                               u32     size;
+                               u32     data;
+                       } raw = {
+                               .size = sizeof(u32),
+                               .data = 0,
+                       };
+                       perf_output_put(&handle, raw);
+               }
+       }
+
        perf_output_end(&handle);
 }
 
@@ -2819,48 +2862,56 @@ perf_counter_read_event(struct perf_counter *counter,
 }
 
 /*
- * fork tracking
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
  */
 
-struct perf_fork_event {
-       struct task_struct      *task;
+struct perf_task_event {
+       struct task_struct              *task;
+       struct perf_counter_context     *task_ctx;
 
        struct {
                struct perf_event_header        header;
 
                u32                             pid;
                u32                             ppid;
+               u32                             tid;
+               u32                             ptid;
        } event;
 };
 
-static void perf_counter_fork_output(struct perf_counter *counter,
-                                    struct perf_fork_event *fork_event)
+static void perf_counter_task_output(struct perf_counter *counter,
+                                    struct perf_task_event *task_event)
 {
        struct perf_output_handle handle;
-       int size = fork_event->event.header.size;
-       struct task_struct *task = fork_event->task;
+       int size = task_event->event.header.size;
+       struct task_struct *task = task_event->task;
        int ret = perf_output_begin(&handle, counter, size, 0, 0);
 
        if (ret)
                return;
 
-       fork_event->event.pid = perf_counter_pid(counter, task);
-       fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+       task_event->event.pid = perf_counter_pid(counter, task);
+       task_event->event.ppid = perf_counter_pid(counter, task->real_parent);
 
-       perf_output_put(&handle, fork_event->event);
+       task_event->event.tid = perf_counter_tid(counter, task);
+       task_event->event.ptid = perf_counter_tid(counter, task->real_parent);
+
+       perf_output_put(&handle, task_event->event);
        perf_output_end(&handle);
 }
 
-static int perf_counter_fork_match(struct perf_counter *counter)
+static int perf_counter_task_match(struct perf_counter *counter)
 {
-       if (counter->attr.comm || counter->attr.mmap)
+       if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
                return 1;
 
        return 0;
 }
 
-static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
-                                 struct perf_fork_event *fork_event)
+static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+                                 struct perf_task_event *task_event)
 {
        struct perf_counter *counter;
 
@@ -2869,54 +2920,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
 
        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_counter_fork_match(counter))
-                       perf_counter_fork_output(counter, fork_event);
+               if (perf_counter_task_match(counter))
+                       perf_counter_task_output(counter, task_event);
        }
        rcu_read_unlock();
 }
 
-static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+static void perf_counter_task_event(struct perf_task_event *task_event)
 {
        struct perf_cpu_context *cpuctx;
-       struct perf_counter_context *ctx;
+       struct perf_counter_context *ctx = task_event->task_ctx;
 
        cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+       perf_counter_task_ctx(&cpuctx->ctx, task_event);
        put_cpu_var(perf_cpu_context);
 
        rcu_read_lock();
-       /*
-        * doesn't really matter which of the child contexts the
-        * events ends up in.
-        */
-       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (!ctx)
+               ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
        if (ctx)
-               perf_counter_fork_ctx(ctx, fork_event);
+               perf_counter_task_ctx(ctx, task_event);
        rcu_read_unlock();
 }
 
-void perf_counter_fork(struct task_struct *task)
+static void perf_counter_task(struct task_struct *task,
+                             struct perf_counter_context *task_ctx,
+                             int new)
 {
-       struct perf_fork_event fork_event;
+       struct perf_task_event task_event;
 
        if (!atomic_read(&nr_comm_counters) &&
-           !atomic_read(&nr_mmap_counters))
+           !atomic_read(&nr_mmap_counters) &&
+           !atomic_read(&nr_task_counters))
                return;
 
-       fork_event = (struct perf_fork_event){
-               .task   = task,
-               .event  = {
+       task_event = (struct perf_task_event){
+               .task     = task,
+               .task_ctx = task_ctx,
+               .event    = {
                        .header = {
-                               .type = PERF_EVENT_FORK,
+                               .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
                                .misc = 0,
-                               .size = sizeof(fork_event.event),
+                               .size = sizeof(task_event.event),
                        },
                        /* .pid  */
                        /* .ppid */
+                       /* .tid  */
+                       /* .ptid */
                },
        };
 
-       perf_counter_fork_event(&fork_event);
+       perf_counter_task_event(&task_event);
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+       perf_counter_task(task, NULL, 1);
 }
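
For reference, the fork/exit records generated above consist of a perf_event_header followed by pid, ppid, tid and ptid, with header.type set to PERF_EVENT_FORK or PERF_EVENT_EXIT and header.size covering the whole 24-byte record. A hedged reader-side sketch of that layout; the toy_* declarations are hypothetical and merely mirror the anonymous struct inside perf_task_event.

/* Illustrative reader-side sketch only; mirrors the task event layout above. */
#include <stdint.h>
#include <stdio.h>

struct toy_event_header {
	uint32_t type;			/* PERF_EVENT_FORK or PERF_EVENT_EXIT */
	uint16_t misc;
	uint16_t size;			/* total record size, 24 bytes here */
};

struct toy_task_record {
	struct toy_event_header header;
	uint32_t pid;			/* task's group leader, from perf_counter_pid() */
	uint32_t ppid;			/* real parent's group leader */
	uint32_t tid;			/* task's thread id, from perf_counter_tid() */
	uint32_t ptid;			/* real parent's thread id */
};

int main(void)
{
	printf("%zu\n", sizeof(struct toy_task_record));	/* 24 */
	return 0;
}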
 
 /*
@@ -3217,7 +3276,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
                u64                             stream_id;
        } throttle_event = {
                .header = {
-                       .type = PERF_EVENT_THROTTLE + 1,
+                       .type = PERF_EVENT_THROTTLE,
                        .misc = 0,
                        .size = sizeof(throttle_event),
                },
@@ -3226,6 +3285,9 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
                .stream_id      = counter->id,
        };
 
+       if (enable)
+               throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
+
        ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
        if (ret)
                return;
@@ -3302,87 +3364,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
  * Generic software counter infrastructure
  */
 
-static void perf_swcounter_update(struct perf_counter *counter)
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
 {
        struct hw_perf_counter *hwc = &counter->hw;
-       u64 prev, now;
-       s64 delta;
+       u64 period = hwc->last_period;
+       u64 nr, offset;
+       s64 old, val;
+
+       hwc->last_period = hwc->sample_period;
 
 again:
-       prev = atomic64_read(&hwc->prev_count);
-       now = atomic64_read(&hwc->count);
-       if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
-               goto again;
+       old = val = atomic64_read(&hwc->period_left);
+       if (val < 0)
+               return 0;
 
-       delta = now - prev;
+       nr = div64_u64(period + val, period);
+       offset = nr * period;
+       val -= offset;
+       if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+               goto again;
 
-       atomic64_add(delta, &counter->count);
-       atomic64_sub(delta, &hwc->period_left);
+       return nr;
 }
 
-static void perf_swcounter_set_period(struct perf_counter *counter)
+static void perf_swcounter_overflow(struct perf_counter *counter,
+                                   int nmi, struct perf_sample_data *data)
 {
        struct hw_perf_counter *hwc = &counter->hw;
-       s64 left = atomic64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
+       u64 overflow;
 
-       if (unlikely(left <= -period)) {
-               left = period;
-               atomic64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-       }
+       data->period = counter->hw.last_period;
+       overflow = perf_swcounter_set_period(counter);
 
-       if (unlikely(left <= 0)) {
-               left += period;
-               atomic64_add(period, &hwc->period_left);
-               hwc->last_period = period;
-       }
+       if (hwc->interrupts == MAX_INTERRUPTS)
+               return;
 
-       atomic64_set(&hwc->prev_count, -left);
-       atomic64_set(&hwc->count, -left);
+       for (; overflow; overflow--) {
+               if (perf_counter_overflow(counter, nmi, data)) {
+                       /*
+                        * We inhibit the overflow from happening when
+                        * hwc->interrupts == MAX_INTERRUPTS.
+                        */
+                       break;
+               }
+       }
 }
 
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
 {
-       enum hrtimer_restart ret = HRTIMER_RESTART;
-       struct perf_sample_data data;
-       struct perf_counter *counter;
-       u64 period;
-
-       counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
-       counter->pmu->read(counter);
-
-       data.addr = 0;
-       data.regs = get_irq_regs();
        /*
-        * In case we exclude kernel IPs or are somehow not in interrupt
-        * context, provide the next best thing, the user IP.
+        * Nothing to do, we already reset hwc->interrupts.
         */
-       if ((counter->attr.exclude_kernel || !data.regs) &&
-                       !counter->attr.exclude_user)
-               data.regs = task_pt_regs(current);
+}
 
-       if (data.regs) {
-               if (perf_counter_overflow(counter, 0, &data))
-                       ret = HRTIMER_NORESTART;
-       }
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+                              int nmi, struct perf_sample_data *data)
+{
+       struct hw_perf_counter *hwc = &counter->hw;
 
-       period = max_t(u64, 10000, counter->hw.sample_period);
-       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+       atomic64_add(nr, &counter->count);
 
-       return ret;
-}
+       if (!hwc->sample_period)
+               return;
 
-static void perf_swcounter_overflow(struct perf_counter *counter,
-                                   int nmi, struct perf_sample_data *data)
-{
-       data->period = counter->hw.last_period;
+       if (!data->regs)
+               return;
 
-       perf_swcounter_update(counter);
-       perf_swcounter_set_period(counter);
-       if (perf_counter_overflow(counter, nmi, data))
-               /* soft-disable the counter */
-               ;
+       if (!atomic64_add_negative(nr, &hwc->period_left))
+               perf_swcounter_overflow(counter, nmi, data);
 }
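
The reworked software counter path above keeps hwc->period_left negative between samples: perf_swcounter_add() adds the event count and only drops into the overflow path once the value reaches zero or above, and perf_swcounter_set_period() then subtracts whole periods and returns how many elapsed. A self-contained user-space model of that accounting, single-threaded so the atomic cmpxchg retry loop is left out; all toy_* names are hypothetical.

/* Illustrative sketch only: models the period_left bookkeeping above. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct toy_hw {
	int64_t  period_left;	/* kept below zero between overflows */
	uint64_t period;	/* sampling period */
	uint64_t overflows;	/* samples that would have been generated */
};

/* Mirrors perf_swcounter_set_period(): pull period_left back below zero
 * and return the number of whole periods that elapsed. */
static uint64_t toy_set_period(struct toy_hw *hw)
{
	int64_t val = hw->period_left;
	uint64_t nr;

	if (val < 0)
		return 0;

	nr = (hw->period + (uint64_t)val) / hw->period;
	hw->period_left = val - (int64_t)(nr * hw->period);

	return nr;
}

/* Simplified perf_swcounter_add(): accumulate, overflow on sign change. */
static void toy_add(struct toy_hw *hw, uint64_t nr_events)
{
	hw->period_left += (int64_t)nr_events;
	if (hw->period_left >= 0)	/* atomic64_add_negative() returned false */
		hw->overflows += toy_set_period(hw);
}

int main(void)
{
	struct toy_hw hw = { .period_left = -100, .period = 100 };

	toy_add(&hw, 60);	/* period_left: -40, no sample           */
	toy_add(&hw, 250);	/* period_left: 210 -> 3 samples, now -90 */

	printf("overflows=%" PRIu64 " period_left=%" PRId64 "\n",
	       hw.overflows, hw.period_left);
	return 0;
}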
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3446,15 +3502,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
        return 1;
 }
 
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-                              int nmi, struct perf_sample_data *data)
-{
-       int neg = atomic64_add_negative(nr, &counter->hw.count);
-
-       if (counter->hw.sample_period && !neg && data->regs)
-               perf_swcounter_overflow(counter, nmi, data);
-}
-
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
                                     enum perf_type_id type,
                                     u32 event, u64 nr, int nmi,
@@ -3533,26 +3580,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 
 static void perf_swcounter_read(struct perf_counter *counter)
 {
-       perf_swcounter_update(counter);
 }
 
 static int perf_swcounter_enable(struct perf_counter *counter)
 {
-       perf_swcounter_set_period(counter);
+       struct hw_perf_counter *hwc = &counter->hw;
+
+       if (hwc->sample_period) {
+               hwc->last_period = hwc->sample_period;
+               perf_swcounter_set_period(counter);
+       }
        return 0;
 }
 
 static void perf_swcounter_disable(struct perf_counter *counter)
 {
-       perf_swcounter_update(counter);
 }
 
 static const struct pmu perf_ops_generic = {
        .enable         = perf_swcounter_enable,
        .disable        = perf_swcounter_disable,
        .read           = perf_swcounter_read,
+       .unthrottle     = perf_swcounter_unthrottle,
 };
 
+/*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       struct perf_sample_data data;
+       struct perf_counter *counter;
+       u64 period;
+
+       counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+       counter->pmu->read(counter);
+
+       data.addr = 0;
+       data.regs = get_irq_regs();
+       /*
+        * In case we exclude kernel IPs or are somehow not in interrupt
+        * context, provide the next best thing, the user IP.
+        */
+       if ((counter->attr.exclude_kernel || !data.regs) &&
+                       !counter->attr.exclude_user)
+               data.regs = task_pt_regs(current);
+
+       if (data.regs) {
+               if (perf_counter_overflow(counter, 0, &data))
+                       ret = HRTIMER_NORESTART;
+       }
+
+       period = max_t(u64, 10000, counter->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+       return ret;
+}
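
The hrtimer callback above, which drives the hrtimer-based software clock counters, re-arms itself with the sampling period expressed in nanoseconds but never less than 10000ns, so a very small sample_period cannot flood the machine with timer interrupts. A trivial sketch of that clamp; toy_hrtimer_interval_ns() is a hypothetical helper.

/* Illustrative sketch only: effective re-arm interval used above. */
#include <stdint.h>
#include <stdio.h>

static uint64_t toy_hrtimer_interval_ns(uint64_t sample_period_ns)
{
	/* mirrors max_t(u64, 10000, counter->hw.sample_period) */
	return sample_period_ns < 10000 ? 10000 : sample_period_ns;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)toy_hrtimer_interval_ns(4000));	/* 10000 */
	printf("%llu\n", (unsigned long long)toy_hrtimer_interval_ns(250000));	/* 250000 */
	return 0;
}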
+
 /*
  * Software counter: cpu wall time clock
  */
@@ -3670,17 +3756,24 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+                         int entry_size)
 {
+       struct perf_raw_record raw = {
+               .size = entry_size,
+               .data = record,
+       };
+
        struct perf_sample_data data = {
                .regs = get_irq_regs(),
-               .addr = 0,
+               .addr = addr,
+               .raw = &raw,
        };
 
        if (!data.regs)
                data.regs = task_pt_regs(current);
 
-       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
@@ -3694,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
+       /*
+        * Raw tracepoint data is a severe data leak, only allow root to
+        * have these.
+        */
+       if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+                       !capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
        if (ftrace_profile_enable(counter->attr.config))
                return NULL;
 
@@ -3872,6 +3973,8 @@ done:
                        atomic_inc(&nr_mmap_counters);
                if (counter->attr.comm)
                        atomic_inc(&nr_comm_counters);
+               if (counter->attr.task)
+                       atomic_inc(&nr_task_counters);
        }
 
        return counter;
@@ -4233,8 +4336,10 @@ void perf_counter_exit_task(struct task_struct *child)
        struct perf_counter_context *child_ctx;
        unsigned long flags;
 
-       if (likely(!child->perf_counter_ctxp))
+       if (likely(!child->perf_counter_ctxp)) {
+               perf_counter_task(child, NULL, 0);
                return;
+       }
 
        local_irq_save(flags);
        /*
@@ -4259,8 +4364,14 @@ void perf_counter_exit_task(struct task_struct *child)
         * the counters from it.
         */
        unclone_ctx(child_ctx);
-       spin_unlock(&child_ctx->lock);
-       local_irq_restore(flags);
+       spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+       /*
+        * Report the task dead after unscheduling the counters so that we
+        * won't get any samples after PERF_EVENT_EXIT. We can however still
+        * get a few PERF_EVENT_READ events.
+        */
+       perf_counter_task(child, child_ctx, 0);
 
        /*
         * We can recurse on the same lock type through: