int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
+int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
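The new limit is only useful if it is runtime-tunable; the companion hunk registering it in the sysctl table is not shown here, but would look roughly like the sketch below (the procname and the proc_dointvec handler are assumptions, not quoted from this patch):

	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "perf_counter_int_limit",
		.data		= &sysctl_perf_counter_limit,
		.maxlen		= sizeof(sysctl_perf_counter_limit),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},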
/*
* Lock for (sysadmin-configurable) counter reservations:
}
}
+/*
+ * Add a counter to the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
list_add_rcu(&counter->event_entry, &ctx->event_list);
ctx->nr_counters++;
- if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
- ctx->nr_enabled++;
}
/*
* Remove a counter from the lists for its context.
- * Must be called with counter->mutex and ctx->mutex held.
+ * Must be called with ctx->mutex and ctx->lock held.
*/
static void
list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
if (list_empty(&counter->list_entry))
return;
ctx->nr_counters--;
- if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
- ctx->nr_enabled--;
list_del_init(&counter->list_entry);
list_del_rcu(&counter->event_entry);
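Since additions use list_add_rcu() and removals list_del_rcu(), readers may traverse ctx->event_list without taking ctx->lock; a minimal read-side sketch (the visit() callback is hypothetical):

	struct perf_counter *counter;

	rcu_read_lock();
	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry)
		visit(counter);	/* hypothetical per-counter work */
	rcu_read_unlock();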
/*
* Remove the counter from a task's (or a CPU's) list of counters.
*
- * Must be called with counter->mutex and ctx->mutex held.
+ * Must be called with ctx->mutex held.
*
* CPU counters are removed with a smp call. For task counters we only
* call when the task is on a CPU.
else
counter_sched_out(counter, cpuctx, ctx);
counter->state = PERF_COUNTER_STATE_OFF;
- ctx->nr_enabled--;
}
spin_unlock_irqrestore(&ctx->lock, flags);
if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
update_counter_times(counter);
counter->state = PERF_COUNTER_STATE_OFF;
- ctx->nr_enabled--;
}
spin_unlock_irq(&ctx->lock);
/*
* Cross CPU call to install and enable a performance counter
+ *
+ * Must be called with ctx->mutex held
*/
static void __perf_install_in_context(void *info)
{
goto unlock;
counter->state = PERF_COUNTER_STATE_INACTIVE;
counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
- ctx->nr_enabled++;
/*
* If the counter is in a group and isn't the group leader,
counter->state = PERF_COUNTER_STATE_INACTIVE;
counter->tstamp_enabled =
ctx->time - counter->total_time_enabled;
- ctx->nr_enabled++;
}
out:
spin_unlock_irq(&ctx->lock);
struct perf_counter_context *ctx2)
{
return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
- && ctx1->parent_gen == ctx2->parent_gen
- && ctx1->nr_enabled == ctx2->nr_enabled;
+ && ctx1->parent_gen == ctx2->parent_gen;
}
/*
struct perf_counter_context *next_ctx;
struct pt_regs *regs;
+ regs = task_pt_regs(task);
+ perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
if (likely(!ctx || !cpuctx->task_ctx))
return;
update_context_time(ctx);
-
- regs = task_pt_regs(task);
- perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
-
next_ctx = next->perf_counter_ctxp;
if (next_ctx && context_equiv(ctx, next_ctx)) {
task->perf_counter_ctxp = next_ctx;
__perf_counter_sched_in(ctx, cpuctx, cpu);
}
-int perf_counter_task_disable(void)
-{
- struct task_struct *curr = current;
- struct perf_counter_context *ctx = curr->perf_counter_ctxp;
- struct perf_counter *counter;
- unsigned long flags;
-
- if (!ctx || !ctx->nr_counters)
- return 0;
-
- local_irq_save(flags);
-
- __perf_counter_task_sched_out(ctx);
-
- spin_lock(&ctx->lock);
-
- /*
- * Disable all the counters:
- */
- perf_disable();
-
- list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- if (counter->state != PERF_COUNTER_STATE_ERROR) {
- update_group_times(counter);
- counter->state = PERF_COUNTER_STATE_OFF;
- }
- }
-
- perf_enable();
-
- spin_unlock_irqrestore(&ctx->lock, flags);
-
- return 0;
-}
-
-int perf_counter_task_enable(void)
-{
- struct task_struct *curr = current;
- struct perf_counter_context *ctx = curr->perf_counter_ctxp;
- struct perf_counter *counter;
- unsigned long flags;
- int cpu;
-
- if (!ctx || !ctx->nr_counters)
- return 0;
-
- local_irq_save(flags);
- cpu = smp_processor_id();
-
- __perf_counter_task_sched_out(ctx);
-
- spin_lock(&ctx->lock);
-
- /*
- * Disable all the counters:
- */
- perf_disable();
-
- list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- if (counter->state > PERF_COUNTER_STATE_OFF)
- continue;
- counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->tstamp_enabled =
- ctx->time - counter->total_time_enabled;
- counter->hw_event.disabled = 0;
- }
- perf_enable();
-
- spin_unlock(&ctx->lock);
-
- perf_counter_task_sched_in(curr, cpu);
-
- local_irq_restore(flags);
-
- return 0;
-}
+#define MAX_INTERRUPTS (~0ULL)
+static void perf_log_throttle(struct perf_counter *counter, int enable);
static void perf_log_period(struct perf_counter *counter, u64 period);
static void perf_adjust_freq(struct perf_counter_context *ctx)
{
struct perf_counter *counter;
- u64 irq_period;
+ u64 interrupts, irq_period;
u64 events, period;
s64 delta;
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
continue;
+ interrupts = counter->hw.interrupts;
+ counter->hw.interrupts = 0;
+
+ if (interrupts == MAX_INTERRUPTS) {
+ perf_log_throttle(counter, 1);
+ counter->pmu->unthrottle(counter);
+ interrupts = 2*sysctl_perf_counter_limit/HZ;
+ }
+
if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
continue;
- events = HZ * counter->hw.interrupts * counter->hw.irq_period;
+ events = HZ * interrupts * counter->hw.irq_period;
period = div64_u64(events, counter->hw_event.irq_freq);
delta = (s64)(1 + period - counter->hw.irq_period);
perf_log_period(counter, irq_period);
counter->hw.irq_period = irq_period;
- counter->hw.interrupts = 0;
}
spin_unlock(&ctx->lock);
}
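Worked numbers for the unthrottle path above, assuming the default sysctl_perf_counter_limit of 100000 and HZ = 1000: a freshly unthrottled counter is credited interrupts = 2 * 100000 / 1000 = 200 for this tick, double the 100-per-tick budget, so the frequency-adjustment step computes a larger irq_period and the counter is less likely to hit the throttle again on the next tick.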
file->private_data = NULL;
mutex_lock(&ctx->mutex);
- mutex_lock(&counter->mutex);
-
perf_counter_remove_from_context(counter);
-
- mutex_unlock(&counter->mutex);
mutex_unlock(&ctx->mutex);
+ mutex_lock(&counter->owner->perf_counter_mutex);
+ list_del_init(&counter->owner_entry);
+ mutex_unlock(&counter->owner->perf_counter_mutex);
+ put_task_struct(counter->owner);
+
free_counter(counter);
put_context(ctx);
if (counter->state == PERF_COUNTER_STATE_ERROR)
return 0;
- mutex_lock(&counter->mutex);
+ mutex_lock(&counter->child_mutex);
values[0] = perf_counter_read(counter);
n = 1;
if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = counter->total_time_running +
atomic64_read(&counter->child_total_time_running);
- mutex_unlock(&counter->mutex);
+ mutex_unlock(&counter->child_mutex);
if (count < n * sizeof(u64))
return -EINVAL;
struct perf_counter_context *ctx = counter->ctx;
struct perf_counter *sibling;
- spin_lock_irq(&ctx->lock);
+ mutex_lock(&ctx->mutex);
counter = counter->group_leader;
func(counter);
list_for_each_entry(sibling, &counter->sibling_list, list_entry)
func(sibling);
- spin_unlock_irq(&ctx->lock);
+ mutex_unlock(&ctx->mutex);
}
static void perf_counter_for_each_child(struct perf_counter *counter,
{
struct perf_counter *child;
- mutex_lock(&counter->mutex);
+ mutex_lock(&counter->child_mutex);
func(counter);
list_for_each_entry(child, &counter->child_list, child_list)
func(child);
- mutex_unlock(&counter->mutex);
+ mutex_unlock(&counter->child_mutex);
}
static void perf_counter_for_each(struct perf_counter *counter,
{
struct perf_counter *child;
- mutex_lock(&counter->mutex);
+ mutex_lock(&counter->child_mutex);
perf_counter_for_each_sibling(counter, func);
list_for_each_entry(child, &counter->child_list, child_list)
perf_counter_for_each_sibling(child, func);
- mutex_unlock(&counter->mutex);
+ mutex_unlock(&counter->child_mutex);
}
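These three iteration helpers back the ioctl path; a sketch of the dispatch they are written for (the PERF_IOC_FLAG_GROUP test and the exact case labels are assumptions about the surrounding perf_ioctl() body, which this hunk elides):

	void (*func)(struct perf_counter *);

	switch (cmd) {
	case PERF_COUNTER_IOC_ENABLE:
		func = perf_counter_enable;
		break;
	case PERF_COUNTER_IOC_DISABLE:
		func = perf_counter_disable;
		break;
	default:
		return -ENOTTY;
	}

	if (arg & PERF_IOC_FLAG_GROUP)
		perf_counter_for_each(counter, func);
	else
		perf_counter_for_each_child(counter, func);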
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return 0;
}
+int perf_counter_task_enable(void)
+{
+ struct perf_counter *counter;
+
+ mutex_lock(&current->perf_counter_mutex);
+ list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+ perf_counter_for_each_child(counter, perf_counter_enable);
+ mutex_unlock(&current->perf_counter_mutex);
+
+ return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+ struct perf_counter *counter;
+
+ mutex_lock(&current->perf_counter_mutex);
+ list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+ perf_counter_for_each_child(counter, perf_counter_disable);
+ mutex_unlock(&current->perf_counter_mutex);
+
+ return 0;
+}
+
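Both entry points are reached from prctl(); from userspace the pair can bracket a region that should not be profiled. A sketch, assuming the perf_counter-era prctl names (later kernels call them PR_TASK_PERF_EVENTS_DISABLE/ENABLE):

	#include <sys/prctl.h>

	static void run_unprofiled(void (*fn)(void))
	{
		/* PR_TASK_PERF_COUNTERS_* names are assumed for this era */
		prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
		fn();
		prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
	}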
/*
* Callers need to ensure there can be no nesting of this function, otherwise
* the seqlock logic goes bad. We can not serialize this because the arch
user_extra = nr_pages + 1;
user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+
+ /*
+ * Increase the limit linearly with more CPUs:
+ */
+ user_lock_limit *= num_online_cpus();
+
user_locked = atomic_long_read(&user->locked_vm) + user_extra;
extra = 0;
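Worked numbers for the scaled limit, assuming the default sysctl_perf_counter_mlock of 512 KB and 4 KB pages (PAGE_SHIFT = 12): user_lock_limit starts at 512 >> 2 = 128 pages per user, and on, say, an 8-CPU machine becomes 1024 pages (4 MB), enough for one ring buffer per CPU before locked_vm accounting kicks in.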
perf_output_end(&handle);
}
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_counter *counter, int enable)
+{
+ struct perf_output_handle handle;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 time;
+ } throttle_event = {
+ .header = {
+ .type = PERF_EVENT_THROTTLE + enable, /* unthrottle = THROTTLE + 1 */
+ .misc = 0,
+ .size = sizeof(throttle_event),
+ },
+ .time = sched_clock(),
+ };
+
+ ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, throttle_event);
+ perf_output_end(&handle);
+}
+
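On the read side a profiler draining the mmap buffer can recognize these records by header type; a hedged consumer sketch (next_record() and note_throttle() are hypothetical helpers, and treating THROTTLE + 1 as the unthrottle type mirrors the encoding used above):

	struct perf_event_header *hdr = next_record(buf);	/* hypothetical */

	if (hdr->type == PERF_EVENT_THROTTLE ||
	    hdr->type == PERF_EVENT_THROTTLE + 1) {
		u64 time = *(u64 *)(hdr + 1);	/* layout of throttle_event */
		note_throttle(time, hdr->type != PERF_EVENT_THROTTLE);
	}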
/*
* Generic counter overflow handling.
*/
int nmi, struct pt_regs *regs, u64 addr)
{
int events = atomic_read(&counter->event_limit);
+ int throttle = counter->pmu->unthrottle != NULL;
int ret = 0;
- counter->hw.interrupts++;
+ if (!throttle) {
+ counter->hw.interrupts++;
+ } else if (counter->hw.interrupts != MAX_INTERRUPTS) {
+ counter->hw.interrupts++;
+ if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
+ counter->hw.interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(counter, 0);
+ ret = 1;
+ }
+ }
/*
* XXX event_limit might not quite work as expected on inherited
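The threshold arithmetic: hw.interrupts counts overflows since the last timer tick, so HZ * hw.interrupts estimates the per-second rate. With the default limit of 100000 and HZ = 1000 that allows 100 overflows per tick; on the 101st the counter is marked MAX_INTERRUPTS, a throttle record is logged, and the non-zero return value tells the arch overflow handler to stop the counter until perf_adjust_freq() unthrottles it.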
if (!group_leader)
group_leader = counter;
- mutex_init(&counter->mutex);
+ mutex_init(&counter->child_mutex);
+ INIT_LIST_HEAD(&counter->child_list);
+
INIT_LIST_HEAD(&counter->list_entry);
INIT_LIST_HEAD(&counter->event_entry);
INIT_LIST_HEAD(&counter->sibling_list);
mutex_init(&counter->mmap_mutex);
- INIT_LIST_HEAD(&counter->child_list);
-
counter->cpu = cpu;
counter->hw_event = *hw_event;
counter->group_leader = group_leader;
counter->pmu = NULL;
counter->ctx = ctx;
+ counter->oncpu = -1;
+
get_ctx(ctx);
counter->state = PERF_COUNTER_STATE_INACTIVE;
perf_install_in_context(ctx, counter, cpu);
mutex_unlock(&ctx->mutex);
+ counter->owner = current;
+ get_task_struct(current);
+ mutex_lock(&current->perf_counter_mutex);
+ list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+ mutex_unlock(&current->perf_counter_mutex);
+
fput_light(counter_file, fput_needed2);
out_fput:
/*
* Link this into the parent counter's child list
*/
- mutex_lock(&parent_counter->mutex);
+ mutex_lock(&parent_counter->child_mutex);
list_add_tail(&child_counter->child_list, &parent_counter->child_list);
-
- mutex_unlock(&parent_counter->mutex);
+ mutex_unlock(&parent_counter->child_mutex);
return child_counter;
}
/*
* Remove this counter from the parent's list
*/
- mutex_lock(&parent_counter->mutex);
+ mutex_lock(&parent_counter->child_mutex);
list_del_init(&child_counter->child_list);
- mutex_unlock(&parent_counter->mutex);
+ mutex_unlock(&parent_counter->child_mutex);
/*
* Release the parent counter, if this was the last
{
struct perf_counter *parent_counter;
- /*
- * Protect against concurrent operations on child_counter
- * due its fd getting closed, etc.
- */
- mutex_lock(&child_counter->mutex);
-
update_counter_times(child_counter);
- list_del_counter(child_counter, child_ctx);
-
- mutex_unlock(&child_counter->mutex);
+ perf_counter_remove_from_context(child_counter);
parent_counter = child_counter->parent;
/*
struct perf_counter_context *child_ctx;
unsigned long flags;
- WARN_ON_ONCE(child != current);
-
child_ctx = child->perf_counter_ctxp;
if (likely(!child_ctx))
/*
* Initialize the perf_counter context in task_struct
*/
-void perf_counter_init_task(struct task_struct *child)
+int perf_counter_init_task(struct task_struct *child)
{
struct perf_counter_context *child_ctx, *parent_ctx;
struct perf_counter *counter;
struct task_struct *parent = current;
int inherited_all = 1;
+ int ret = 0;
child->perf_counter_ctxp = NULL;
+ mutex_init(&child->perf_counter_mutex);
+ INIT_LIST_HEAD(&child->perf_counter_list);
+
+ parent_ctx = parent->perf_counter_ctxp;
+ if (likely(!parent_ctx || !parent_ctx->nr_counters))
+ return 0;
+
/*
* This is executed from the parent task context, so inherit
* counters that have been marked for cloning.
child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
if (!child_ctx)
- return;
-
- parent_ctx = parent->perf_counter_ctxp;
- if (likely(!parent_ctx || !parent_ctx->nr_counters))
- return;
+ return -ENOMEM;
__perf_counter_init_context(child_ctx, child);
child->perf_counter_ctxp = child_ctx;
continue;
}
- if (inherit_group(counter, parent,
- parent_ctx, child, child_ctx)) {
+ ret = inherit_group(counter, parent, parent_ctx,
+ child, child_ctx);
+ if (ret) {
inherited_all = 0;
break;
}
}
mutex_unlock(&parent_ctx->mutex);
+
+ return ret;
}
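Because perf_counter_init_task() can now fail, the fork path must propagate the error; the matching caller change in copy_process() would look like this sketch (the cleanup label is an assumption about kernel/fork.c, which this diff does not show):

	retval = perf_counter_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;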
static void __cpuinit perf_counter_init_cpu(int cpu)