perf_counter: Initialize ->oncpu properly
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2f410ea..367299f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,6 +46,7 @@ static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
+int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
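
The new sysctl_perf_counter_limit knob caps how many NMIs per second a single counter may take before the generic throttling code further down kicks in. For orientation, a hedged sketch of how such a knob is usually wired up in kernel/sysctl.c follows; the procname and table placement are assumptions, not part of this diff:

	#include <linux/sysctl.h>

	extern int sysctl_perf_counter_limit;

	/* Sketch only: the exact procname is an assumption. */
	static struct ctl_table perf_limit_table[] = {
		{
			.ctl_name	= CTL_UNNUMBERED,
			.procname	= "perf_counter_int_limit",
			.data		= &sysctl_perf_counter_limit,
			.maxlen		= sizeof(sysctl_perf_counter_limit),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec,
		},
		{ }
	};
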
@@ -111,6 +112,10 @@ static void put_ctx(struct perf_counter_context *ctx)
        }
 }
 
+/*
+ * Add a counter to the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -130,13 +135,11 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
        list_add_rcu(&counter->event_entry, &ctx->event_list);
        ctx->nr_counters++;
-       if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-               ctx->nr_enabled++;
 }
 
 /*
  * Remove a counter from the lists for its context.
- * Must be called with counter->mutex and ctx->mutex held.
+ * Must be called with ctx->mutex and ctx->lock held.
  */
 static void
 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
@@ -146,8 +149,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
        if (list_empty(&counter->list_entry))
                return;
        ctx->nr_counters--;
-       if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-               ctx->nr_enabled--;
 
        list_del_init(&counter->list_entry);
        list_del_rcu(&counter->event_entry);
@@ -276,7 +277,7 @@ static void __perf_counter_remove_from_context(void *info)
 /*
  * Remove the counter from a task's (or a CPU's) list of counters.
  *
- * Must be called with counter->mutex and ctx->mutex held.
+ * Must be called with ctx->mutex held.
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
@@ -402,7 +403,6 @@ static void __perf_counter_disable(void *info)
                else
                        counter_sched_out(counter, cpuctx, ctx);
                counter->state = PERF_COUNTER_STATE_OFF;
-               ctx->nr_enabled--;
        }
 
        spin_unlock_irqrestore(&ctx->lock, flags);
@@ -444,7 +444,6 @@ static void perf_counter_disable(struct perf_counter *counter)
        if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
                update_counter_times(counter);
                counter->state = PERF_COUNTER_STATE_OFF;
-               ctx->nr_enabled--;
        }
 
        spin_unlock_irq(&ctx->lock);
@@ -593,6 +592,8 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 
 /*
  * Cross CPU call to install and enable a performance counter
+ *
+ * Must be called with ctx->mutex held.
  */
 static void __perf_install_in_context(void *info)
 {
@@ -753,7 +754,6 @@ static void __perf_counter_enable(void *info)
                goto unlock;
        counter->state = PERF_COUNTER_STATE_INACTIVE;
        counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
-       ctx->nr_enabled++;
 
        /*
         * If the counter is in a group and isn't the group leader,
@@ -844,7 +844,6 @@ static void perf_counter_enable(struct perf_counter *counter)
                counter->state = PERF_COUNTER_STATE_INACTIVE;
                counter->tstamp_enabled =
                        ctx->time - counter->total_time_enabled;
-               ctx->nr_enabled++;
        }
  out:
        spin_unlock_irq(&ctx->lock);
@@ -904,8 +903,7 @@ static int context_equiv(struct perf_counter_context *ctx1,
                         struct perf_counter_context *ctx2)
 {
        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-               && ctx1->parent_gen == ctx2->parent_gen
-               && ctx1->nr_enabled == ctx2->nr_enabled;
+               && ctx1->parent_gen == ctx2->parent_gen;
 }
 
 /*
@@ -927,14 +925,13 @@ void perf_counter_task_sched_out(struct task_struct *task,
        struct perf_counter_context *next_ctx;
        struct pt_regs *regs;
 
+       regs = task_pt_regs(task);
+       perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
        if (likely(!ctx || !cpuctx->task_ctx))
                return;
 
        update_context_time(ctx);
-
-       regs = task_pt_regs(task);
-       perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
-
        next_ctx = next->perf_counter_ctxp;
        if (next_ctx && context_equiv(ctx, next_ctx)) {
                task->perf_counter_ctxp = next_ctx;
@@ -1070,89 +1067,15 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
        __perf_counter_sched_in(ctx, cpuctx, cpu);
 }
 
-int perf_counter_task_disable(void)
-{
-       struct task_struct *curr = current;
-       struct perf_counter_context *ctx = curr->perf_counter_ctxp;
-       struct perf_counter *counter;
-       unsigned long flags;
-
-       if (!ctx || !ctx->nr_counters)
-               return 0;
-
-       local_irq_save(flags);
-
-       __perf_counter_task_sched_out(ctx);
-
-       spin_lock(&ctx->lock);
-
-       /*
-        * Disable all the counters:
-        */
-       perf_disable();
-
-       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-               if (counter->state != PERF_COUNTER_STATE_ERROR) {
-                       update_group_times(counter);
-                       counter->state = PERF_COUNTER_STATE_OFF;
-               }
-       }
-
-       perf_enable();
-
-       spin_unlock_irqrestore(&ctx->lock, flags);
-
-       return 0;
-}
-
-int perf_counter_task_enable(void)
-{
-       struct task_struct *curr = current;
-       struct perf_counter_context *ctx = curr->perf_counter_ctxp;
-       struct perf_counter *counter;
-       unsigned long flags;
-       int cpu;
-
-       if (!ctx || !ctx->nr_counters)
-               return 0;
-
-       local_irq_save(flags);
-       cpu = smp_processor_id();
-
-       __perf_counter_task_sched_out(ctx);
-
-       spin_lock(&ctx->lock);
-
-       /*
-        * Disable all the counters:
-        */
-       perf_disable();
-
-       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-               if (counter->state > PERF_COUNTER_STATE_OFF)
-                       continue;
-               counter->state = PERF_COUNTER_STATE_INACTIVE;
-               counter->tstamp_enabled =
-                       ctx->time - counter->total_time_enabled;
-               counter->hw_event.disabled = 0;
-       }
-       perf_enable();
-
-       spin_unlock(&ctx->lock);
-
-       perf_counter_task_sched_in(curr, cpu);
-
-       local_irq_restore(flags);
-
-       return 0;
-}
+#define MAX_INTERRUPTS (~0ULL)
 
+static void perf_log_throttle(struct perf_counter *counter, int enable);
 static void perf_log_period(struct perf_counter *counter, u64 period);
 
 static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
        struct perf_counter *counter;
-       u64 irq_period;
+       u64 interrupts, irq_period;
        u64 events, period;
        s64 delta;
 
@@ -1161,10 +1084,19 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
                if (counter->state != PERF_COUNTER_STATE_ACTIVE)
                        continue;
 
+               interrupts = counter->hw.interrupts;
+               counter->hw.interrupts = 0;
+
+               if (interrupts == MAX_INTERRUPTS) {
+                       perf_log_throttle(counter, 1);
+                       counter->pmu->unthrottle(counter);
+                       interrupts = 2*sysctl_perf_counter_limit/HZ;
+               }
+
                if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
                        continue;
 
-               events = HZ * counter->hw.interrupts * counter->hw.irq_period;
+               events = HZ * interrupts * counter->hw.irq_period;
                period = div64_u64(events, counter->hw_event.irq_freq);
 
                delta = (s64)(1 + period - counter->hw.irq_period);
@@ -1178,7 +1110,6 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
                perf_log_period(counter, irq_period);
 
                counter->hw.irq_period = irq_period;
-               counter->hw.interrupts = 0;
        }
        spin_unlock(&ctx->lock);
 }
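
To make the frequency adjustment above concrete, here is a small user-space model of the arithmetic; HZ, the interrupt count and the target frequency are illustrative values, and the final halving of delta mirrors the unchanged code just past this hunk:

	#include <stdio.h>
	#include <stdint.h>

	/* Model of the irq_period recalculation in perf_adjust_freq(). */
	int main(void)
	{
		const uint64_t HZ = 1000;      /* example tick rate */
		uint64_t interrupts = 10;      /* interrupts seen during the last tick */
		uint64_t irq_period = 10000;   /* current events per interrupt */
		uint64_t irq_freq   = 1000;    /* requested interrupts per second */

		/* Estimated event rate: interrupts/tick * ticks/sec * events/interrupt. */
		uint64_t events = HZ * interrupts * irq_period;
		/* Period that would hit the requested interrupt frequency. */
		uint64_t period = events / irq_freq;

		int64_t delta = (int64_t)(1 + period - irq_period);
		delta >>= 1;                   /* step half-way toward the target */
		irq_period += delta;

		printf("new irq_period: %llu\n", (unsigned long long)irq_period);
		return 0;
	}
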
@@ -1407,13 +1338,14 @@ static int perf_release(struct inode *inode, struct file *file)
        file->private_data = NULL;
 
        mutex_lock(&ctx->mutex);
-       mutex_lock(&counter->mutex);
-
        perf_counter_remove_from_context(counter);
-
-       mutex_unlock(&counter->mutex);
        mutex_unlock(&ctx->mutex);
 
+       mutex_lock(&counter->owner->perf_counter_mutex);
+       list_del_init(&counter->owner_entry);
+       mutex_unlock(&counter->owner->perf_counter_mutex);
+       put_task_struct(counter->owner);
+
        free_counter(counter);
        put_context(ctx);
 
@@ -1437,7 +1369,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
        if (counter->state == PERF_COUNTER_STATE_ERROR)
                return 0;
 
-       mutex_lock(&counter->mutex);
+       mutex_lock(&counter->child_mutex);
        values[0] = perf_counter_read(counter);
        n = 1;
        if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -1446,7 +1378,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
        if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[n++] = counter->total_time_running +
                        atomic64_read(&counter->child_total_time_running);
-       mutex_unlock(&counter->mutex);
+       mutex_unlock(&counter->child_mutex);
 
        if (count < n * sizeof(u64))
                return -EINVAL;
@@ -1496,13 +1428,13 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_counter *sibling;
 
-       spin_lock_irq(&ctx->lock);
+       mutex_lock(&ctx->mutex);
        counter = counter->group_leader;
 
        func(counter);
        list_for_each_entry(sibling, &counter->sibling_list, list_entry)
                func(sibling);
-       spin_unlock_irq(&ctx->lock);
+       mutex_unlock(&ctx->mutex);
 }
 
 static void perf_counter_for_each_child(struct perf_counter *counter,
@@ -1510,11 +1442,11 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 {
        struct perf_counter *child;
 
-       mutex_lock(&counter->mutex);
+       mutex_lock(&counter->child_mutex);
        func(counter);
        list_for_each_entry(child, &counter->child_list, child_list)
                func(child);
-       mutex_unlock(&counter->mutex);
+       mutex_unlock(&counter->child_mutex);
 }
 
 static void perf_counter_for_each(struct perf_counter *counter,
@@ -1522,11 +1454,11 @@ static void perf_counter_for_each(struct perf_counter *counter,
 {
        struct perf_counter *child;
 
-       mutex_lock(&counter->mutex);
+       mutex_lock(&counter->child_mutex);
        perf_counter_for_each_sibling(counter, func);
        list_for_each_entry(child, &counter->child_list, child_list)
                perf_counter_for_each_sibling(child, func);
-       mutex_unlock(&counter->mutex);
+       mutex_unlock(&counter->child_mutex);
 }
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1560,6 +1492,30 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        return 0;
 }
 
+int perf_counter_task_enable(void)
+{
+       struct perf_counter *counter;
+
+       mutex_lock(&current->perf_counter_mutex);
+       list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+               perf_counter_for_each_child(counter, perf_counter_enable);
+       mutex_unlock(&current->perf_counter_mutex);
+
+       return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+       struct perf_counter *counter;
+
+       mutex_lock(&current->perf_counter_mutex);
+       list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+               perf_counter_for_each_child(counter, perf_counter_disable);
+       mutex_unlock(&current->perf_counter_mutex);
+
+       return 0;
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
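
perf_counter_task_enable() and perf_counter_task_disable() are now thin wrappers that walk the per-task owner list under perf_counter_mutex; they are still reached from the prctl() path. Below is a minimal user-space sketch of toggling all counters owned by the current task; the option values match the PR_TASK_PERF_COUNTERS_{DISABLE,ENABLE} definitions of this era but should be treated as assumptions:

	#include <stdio.h>
	#include <sys/prctl.h>

	/* Assumed prctl option values for this kernel generation. */
	#define PR_TASK_PERF_COUNTERS_DISABLE	31
	#define PR_TASK_PERF_COUNTERS_ENABLE	32

	int main(void)
	{
		/* Switch off every counter this task owns ... */
		if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0))
			perror("prctl(disable)");

		/* ... run the uninstrumented part of the workload here ... */

		/* ... and switch them back on. */
		if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0))
			perror("prctl(enable)");

		return 0;
	}
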
@@ -1759,6 +1715,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
        user_extra = nr_pages + 1;
        user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+
+       /*
+        * Increase the limit linearly with more CPUs:
+        */
+       user_lock_limit *= num_online_cpus();
+
        user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
        extra = 0;
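
The per-user mlock budget for counter buffers now scales linearly with the number of online CPUs. A quick worked example, assuming 4 KiB pages, the default of 512 'free' KiB per user and a four-CPU machine:

	#include <stdio.h>

	int main(void)
	{
		int sysctl_perf_counter_mlock = 512;	/* 'free' kb per user (default) */
		int page_shift = 12;			/* 4 KiB pages */
		int online_cpus = 4;			/* example machine */

		/* KiB -> pages, then scaled with the CPU count. */
		long user_lock_limit = sysctl_perf_counter_mlock >> (page_shift - 10);
		user_lock_limit *= online_cpus;

		printf("user_lock_limit: %ld pages\n", user_lock_limit);	/* 512 */
		return 0;
	}
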
@@ -2593,6 +2555,35 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
        perf_output_end(&handle);
 }
 
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_counter *counter, int enable)
+{
+       struct perf_output_handle handle;
+       int ret;
+
+       struct {
+               struct perf_event_header        header;
+               u64                             time;
+       } throttle_event = {
+               .header = {
+                       .type = PERF_EVENT_THROTTLE + 1,
+                       .misc = 0,
+                       .size = sizeof(throttle_event),
+               },
+               .time = sched_clock(),
+       };
+
+       ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
+       if (ret)
+               return;
+
+       perf_output_put(&handle, throttle_event);
+       perf_output_end(&handle);
+}
+
 /*
  * Generic counter overflow handling.
  */
@@ -2601,9 +2592,19 @@ int perf_counter_overflow(struct perf_counter *counter,
                          int nmi, struct pt_regs *regs, u64 addr)
 {
        int events = atomic_read(&counter->event_limit);
+       int throttle = counter->pmu->unthrottle != NULL;
        int ret = 0;
 
-       counter->hw.interrupts++;
+       if (!throttle) {
+               counter->hw.interrupts++;
+       } else if (counter->hw.interrupts != MAX_INTERRUPTS) {
+               counter->hw.interrupts++;
+               if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
+                       counter->hw.interrupts = MAX_INTERRUPTS;
+                       perf_log_throttle(counter, 0);
+                       ret = 1;
+               }
+       }
 
        /*
         * XXX event_limit might not quite work as expected on inherited
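
With the default limit of 100000 and HZ=1000, the condition above throttles a counter once it has taken more than 100 interrupts since the last timer tick; perf_adjust_freq() later unthrottles it and pretends it saw 2*limit/HZ = 200 interrupts so the frequency code slows it down. A tiny model of the trigger, using those assumed example values:

	#include <stdio.h>
	#include <stdint.h>

	#define MAX_INTERRUPTS (~0ULL)

	int main(void)
	{
		const uint64_t HZ = 1000;
		const uint64_t sysctl_perf_counter_limit = 100000;
		uint64_t interrupts = 0;

		/* Take interrupts until the throttle condition fires. */
		while (interrupts != MAX_INTERRUPTS) {
			interrupts++;
			if (HZ * interrupts > sysctl_perf_counter_limit) {
				printf("throttled after %llu interrupts in one tick\n",
				       (unsigned long long)interrupts);
				interrupts = MAX_INTERRUPTS;
			}
		}
		return 0;
	}
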
@@ -3106,7 +3107,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
        if (!group_leader)
                group_leader = counter;
 
-       mutex_init(&counter->mutex);
+       mutex_init(&counter->child_mutex);
+       INIT_LIST_HEAD(&counter->child_list);
+
        INIT_LIST_HEAD(&counter->list_entry);
        INIT_LIST_HEAD(&counter->event_entry);
        INIT_LIST_HEAD(&counter->sibling_list);
@@ -3114,13 +3117,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
        mutex_init(&counter->mmap_mutex);
 
-       INIT_LIST_HEAD(&counter->child_list);
-
        counter->cpu                    = cpu;
        counter->hw_event               = *hw_event;
        counter->group_leader           = group_leader;
        counter->pmu                    = NULL;
        counter->ctx                    = ctx;
+       counter->oncpu                  = -1;
+
        get_ctx(ctx);
 
        counter->state = PERF_COUNTER_STATE_INACTIVE;
@@ -3270,6 +3273,12 @@ SYSCALL_DEFINE5(perf_counter_open,
        perf_install_in_context(ctx, counter, cpu);
        mutex_unlock(&ctx->mutex);
 
+       counter->owner = current;
+       get_task_struct(current);
+       mutex_lock(&current->perf_counter_mutex);
+       list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+       mutex_unlock(&current->perf_counter_mutex);
+
        fput_light(counter_file, fput_needed2);
 
 out_fput:
@@ -3346,10 +3355,9 @@ inherit_counter(struct perf_counter *parent_counter,
        /*
         * Link this into the parent counter's child list
         */
-       mutex_lock(&parent_counter->mutex);
+       mutex_lock(&parent_counter->child_mutex);
        list_add_tail(&child_counter->child_list, &parent_counter->child_list);
-
-       mutex_unlock(&parent_counter->mutex);
+       mutex_unlock(&parent_counter->child_mutex);
 
        return child_counter;
 }
@@ -3396,9 +3404,9 @@ static void sync_child_counter(struct perf_counter *child_counter,
        /*
         * Remove this counter from the parent's list
         */
-       mutex_lock(&parent_counter->mutex);
+       mutex_lock(&parent_counter->child_mutex);
        list_del_init(&child_counter->child_list);
-       mutex_unlock(&parent_counter->mutex);
+       mutex_unlock(&parent_counter->child_mutex);
 
        /*
         * Release the parent counter, if this was the last
@@ -3414,16 +3422,8 @@ __perf_counter_exit_task(struct task_struct *child,
 {
        struct perf_counter *parent_counter;
 
-       /*
-        * Protect against concurrent operations on child_counter
-        * due its fd getting closed, etc.
-        */
-       mutex_lock(&child_counter->mutex);
-
        update_counter_times(child_counter);
-       list_del_counter(child_counter, child_ctx);
-
-       mutex_unlock(&child_counter->mutex);
+       perf_counter_remove_from_context(child_counter);
 
        parent_counter = child_counter->parent;
        /*
@@ -3451,8 +3451,6 @@ void perf_counter_exit_task(struct task_struct *child)
        struct perf_counter_context *child_ctx;
        unsigned long flags;
 
-       WARN_ON_ONCE(child != current);
-
        child_ctx = child->perf_counter_ctxp;
 
        if (likely(!child_ctx))
@@ -3486,15 +3484,23 @@ again:
 /*
  * Initialize the perf_counter context in task_struct
  */
-void perf_counter_init_task(struct task_struct *child)
+int perf_counter_init_task(struct task_struct *child)
 {
        struct perf_counter_context *child_ctx, *parent_ctx;
        struct perf_counter *counter;
        struct task_struct *parent = current;
        int inherited_all = 1;
+       int ret = 0;
 
        child->perf_counter_ctxp = NULL;
 
+       mutex_init(&child->perf_counter_mutex);
+       INIT_LIST_HEAD(&child->perf_counter_list);
+
+       parent_ctx = parent->perf_counter_ctxp;
+       if (likely(!parent_ctx || !parent_ctx->nr_counters))
+               return 0;
+
        /*
         * This is executed from the parent task context, so inherit
         * counters that have been marked for cloning.
@@ -3503,11 +3509,7 @@ void perf_counter_init_task(struct task_struct *child)
 
        child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
        if (!child_ctx)
-               return;
-
-       parent_ctx = parent->perf_counter_ctxp;
-       if (likely(!parent_ctx || !parent_ctx->nr_counters))
-               return;
+               return -ENOMEM;
 
        __perf_counter_init_context(child_ctx, child);
        child->perf_counter_ctxp = child_ctx;
@@ -3531,8 +3533,9 @@ void perf_counter_init_task(struct task_struct *child)
                        continue;
                }
 
-               if (inherit_group(counter, parent,
-                                 parent_ctx, child, child_ctx)) {
+               ret = inherit_group(counter, parent, parent_ctx,
+                                            child, child_ctx);
+               if (ret) {
                        inherited_all = 0;
                        break;
                }
@@ -3554,6 +3557,8 @@ void perf_counter_init_task(struct task_struct *child)
        }
 
        mutex_unlock(&parent_ctx->mutex);
+
+       return ret;
 }
 
 static void __cpuinit perf_counter_init_cpu(int cpu)
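
Since perf_counter_init_task() can now fail with -ENOMEM, its caller in copy_process() has to check the return value and unwind. A hedged sketch of that call site; the error label is an assumption, not part of this diff:

	/* In kernel/fork.c::copy_process(), sketch only: */
	retval = perf_counter_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;
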