perf counters: hw driver API
arch/x86/kernel/cpu/perf_counter.c
/*
 * Performance counter x86 architecture code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>

#include <asm/intel_arch_perfmon.h>
#include <asm/apic.h>

static bool perf_counters_initialized __read_mostly;

/*
 * Number of (generic) HW counters:
 */
static int nr_hw_counters __read_mostly;
static u32 perf_counter_mask __read_mostly;

/* No support for fixed function counters yet */

#define MAX_HW_COUNTERS         8

struct cpu_hw_counters {
        struct perf_counter     *counters[MAX_HW_COUNTERS];
        unsigned long           used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
};

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
const int intel_perfmon_event_map[] =
{
  [PERF_COUNT_CYCLES]                   = 0x003c,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]             = 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
};

const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
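
/*
 * The encodings above are the Intel "architectural" events, in the
 * PERFEVTSEL layout: bits 0-7 are the event-select code, bits 8-15 the
 * unit mask. E.g. 0x412e is event 0x2E with umask 0x41 (LLC Misses),
 * while 0x4f2e is the same event with umask 0x4F (LLC References).
 */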

/*
 * Setup the hardware configuration for a given hw_event_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
        struct perf_counter_hw_event *hw_event = &counter->hw_event;
        struct hw_perf_counter *hwc = &counter->hw;

        if (unlikely(!perf_counters_initialized))
                return -EINVAL;

        /*
         * Count user events, and generate PMC IRQs:
         * (keep 'enabled' bit clear for now)
         */
        hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;

        /*
         * If privileged enough, count OS events too, and allow
         * NMI events as well:
         */
        hwc->nmi = 0;
        if (capable(CAP_SYS_ADMIN)) {
                hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
                if (hw_event->nmi)
                        hwc->nmi = 1;
        }

        hwc->config_base        = MSR_ARCH_PERFMON_EVENTSEL0;
        hwc->counter_base       = MSR_ARCH_PERFMON_PERFCTR0;

        hwc->irq_period         = hw_event->irq_period;
        /*
         * Intel PMCs cannot be accessed sanely above 32-bit width,
         * so we install an artificial maximum period of 2^31-1
         * regardless of the generic counter period:
         */
        if (!hwc->irq_period)
                hwc->irq_period = 0x7FFFFFFF;

        hwc->next_count = -(s32)hwc->irq_period;
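        /*
         * I.e. the counter gets programmed with the negative period
         * (the hardware sign-extends the 32-bit MSR write), so it
         * counts up and overflows, raising a PMC interrupt, after
         * irq_period events: for irq_period == 100000 the value
         * written is (u32)-100000 == 0xfffe7960.
         */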

        /*
         * Raw event types provide the config directly in the event structure
         */
        if (hw_event->raw) {
                hwc->config |= hw_event->type;
        } else {
                if (hw_event->type >= max_intel_perfmon_events)
                        return -EINVAL;
                /*
                 * The generic map:
                 */
                hwc->config |= intel_perfmon_event_map[hw_event->type];
        }
        counter->wakeup_pending = 0;

        return 0;
}

void hw_perf_enable_all(void)
{
        wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
}

void hw_perf_restore_ctrl(u64 ctrl)
{
        wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
}
EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl);

u64 hw_perf_disable_all(void)
{
        u64 ctrl;

        rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
        wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
        return ctrl;
}
EXPORT_SYMBOL_GPL(hw_perf_disable_all);
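
/*
 * Sketch of the intended pairing (the generic counter code saves and
 * globally disables the PMU around counter rescheduling, then restores):
 *
 *      u64 ctrl = hw_perf_disable_all();
 *      ...  install / remove counters  ...
 *      hw_perf_restore_ctrl(ctrl);
 */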

static inline void
__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
{
        wrmsr(hwc->config_base + idx, hwc->config, 0);
}

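/*
 * Debug state: the value most recently written into each counter by
 * __hw_perf_counter_set_period(), reported by perf_counter_print_debug():
 */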
static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);

static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
{
        per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;

        wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
}

static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
{
        wrmsr(hwc->config_base + idx,
              hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
}

static void x86_perf_counter_enable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        int idx = hwc->idx;

        /* Try to get the previous counter again */
        if (test_and_set_bit(idx, cpuc->used)) {
                idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
                set_bit(idx, cpuc->used);
                hwc->idx = idx;
        }

        perf_counters_lapic_init(hwc->nmi);

        __x86_perf_counter_disable(hwc, idx);

        cpuc->counters[idx] = counter;

        __hw_perf_counter_set_period(hwc, idx);
        __x86_perf_counter_enable(hwc, idx);
}

#ifdef CONFIG_X86_64
static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
{
        atomic64_set(&counter->count, val);
}

static inline u64 atomic64_counter_read(struct perf_counter *counter)
{
        return atomic64_read(&counter->count);
}
#else
/*
 * Todo: add proper atomic64_t support to 32-bit x86:
 */
static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
{
        u32 *val32 = (void *)&val64;

        atomic_set(counter->count32 + 0, *(val32 + 0));
        atomic_set(counter->count32 + 1, *(val32 + 1));
}

static inline u64 atomic64_counter_read(struct perf_counter *counter)
{
        return atomic_read(counter->count32 + 0) |
                (u64) atomic_read(counter->count32 + 1) << 32;
}
#endif
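
/*
 * Note: the 32-bit fallback above is two independent 32-bit atomics, so
 * a concurrent reader can observe a torn 64-bit value; that is what the
 * "proper atomic64_t support" Todo refers to.
 */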

static void __hw_perf_save_counter(struct perf_counter *counter,
                                   struct hw_perf_counter *hwc, int idx)
{
        s64 raw = -1;
        s64 delta;

        /*
         * Get the raw hw counter value:
         */
        rdmsrl(hwc->counter_base + idx, raw);

        /*
         * Rebase it to zero (it started counting at -irq_period),
         * to see the delta since ->prev_count:
         */
        delta = (s64)hwc->irq_period + (s64)(s32)raw;
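
        /*
         * Example: with irq_period == 100000 and 70000 events counted
         * since the last set_period, the sign-extended raw value is
         * -30000 and delta == 100000 + (-30000) == 70000. The loop
         * below then leaves prev_count alone, and the final
         * delta -= irq_period yields next_count == -30000, so the
         * counter resumes exactly where it stopped.
         */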

        atomic64_counter_set(counter, hwc->prev_count + delta);

        /*
         * Adjust the ->prev_count offset - if we went beyond
         * irq_period units, then we got an IRQ and the counter
         * was set back to -irq_period:
         */
        while (delta >= (s64)hwc->irq_period) {
                hwc->prev_count += hwc->irq_period;
                delta -= (s64)hwc->irq_period;
        }

        /*
         * Calculate the next raw counter value we'll write into
         * the counter at the next sched-in time:
         */
        delta -= (s64)hwc->irq_period;

        hwc->next_count = (s32)delta;
}

void perf_counter_print_debug(void)
{
        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
        int cpu, idx;

        if (!nr_hw_counters)
                return;

        local_irq_disable();

        cpu = smp_processor_id();

        rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
        rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);

        printk(KERN_INFO "\n");
        printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
        printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
        printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);

        for (idx = 0; idx < nr_hw_counters; idx++) {
                rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
                rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);

                next_count = per_cpu(prev_next_count[idx], cpu);

                printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
                printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
                printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
                        cpu, idx, next_count);
        }
        local_irq_enable();
}

static void x86_perf_counter_disable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        unsigned int idx = hwc->idx;

        __x86_perf_counter_disable(hwc, idx);

        clear_bit(idx, cpuc->used);
        cpuc->counters[idx] = NULL;
        __hw_perf_save_counter(counter, hwc, idx);
}

static void x86_perf_counter_read(struct perf_counter *counter)
{
        struct hw_perf_counter *hwc = &counter->hw;
        unsigned long addr = hwc->counter_base + hwc->idx;
        s64 offs, val = -1LL;
        s32 val32;

        /* Careful: NMI might modify the counter offset */
        do {
                offs = hwc->prev_count;
                rdmsrl(addr, val);
        } while (offs != hwc->prev_count);

        val32 = (s32) val;
        val =  (s64)hwc->irq_period + (s64)val32;
        atomic64_counter_set(counter, hwc->prev_count + val);
}

static void perf_store_irq_data(struct perf_counter *counter, u64 data)
{
        struct perf_data *irqdata = counter->irqdata;

        if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
                irqdata->overrun++;
        } else {
                u64 *p = (u64 *) &irqdata->data[irqdata->len];

                *p = data;
                irqdata->len += sizeof(u64);
        }
}
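
/*
 * perf_store_irq_data() appends raw u64 records to the counter's
 * irqdata buffer: the interrupted instruction pointer for
 * PERF_RECORD_IRQ, or (event type, counter value) pairs for
 * PERF_RECORD_GROUP - see perf_handle_group() and
 * __smp_perf_counter_interrupt() below.
 */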

/*
 * NMI-safe enable method:
 */
static void perf_save_and_restart(struct perf_counter *counter)
{
        struct hw_perf_counter *hwc = &counter->hw;
        int idx = hwc->idx;
        u64 pmc_ctrl;

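        /*
         * Save the EVTSEL enable bit, so we only re-enable the counter
         * below if it was actually running:
         */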
        rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);

        __hw_perf_save_counter(counter, hwc, idx);
        __hw_perf_counter_set_period(hwc, idx);

        if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
                __x86_perf_counter_enable(hwc, idx);
}

static void
perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
{
        struct perf_counter *counter, *group_leader = sibling->group_leader;
        int bit;

        /*
         * Store the counter's own type and value first:
         */
        perf_store_irq_data(sibling, sibling->hw_event.type);
        perf_store_irq_data(sibling, atomic64_counter_read(sibling));

        /*
         * Then store the type/value pairs of the siblings (if any):
         */
        list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
                if (!counter->active) {
                        /*
                         * If the counter is not in the overflown mask it has
                         * not been updated by the interrupt, so read it from
                         * the hardware. Do the same if it overflowed but has
                         * not been processed yet (its bit is still set in the
                         * status mask), and clear that status bit.
                         */
                        bit = counter->hw.idx;
                        if (!test_bit(bit, (unsigned long *) overflown) ||
                            test_bit(bit, (unsigned long *) status)) {
                                clear_bit(bit, (unsigned long *) status);
                                perf_save_and_restart(counter);
                        }
                }
                perf_store_irq_data(sibling, counter->hw_event.type);
                perf_store_irq_data(sibling, atomic64_counter_read(counter));
        }
}

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
        int bit, cpu = smp_processor_id();
        u64 ack, status, saved_global;
        struct cpu_hw_counters *cpuc;

        rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);

        /* Disable counters globally */
        wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
        ack_APIC_irq();

        cpuc = &per_cpu(cpu_hw_counters, cpu);

        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
        if (!status)
                goto out;

again:
        ack = status;
        for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
                struct perf_counter *counter = cpuc->counters[bit];

                clear_bit(bit, (unsigned long *) &status);
                if (!counter)
                        continue;

                perf_save_and_restart(counter);

                switch (counter->hw_event.record_type) {
                case PERF_RECORD_SIMPLE:
                        continue;
                case PERF_RECORD_IRQ:
                        perf_store_irq_data(counter, instruction_pointer(regs));
                        break;
                case PERF_RECORD_GROUP:
                        perf_handle_group(counter, &status, &ack);
                        break;
                }
                /*
                 * From NMI context we cannot call into the scheduler to
                 * do a task wakeup - but we mark these counters as
                 * wakeup_pending and initiate a wakeup callback:
                 */
                if (nmi) {
                        counter->wakeup_pending = 1;
                        set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
                } else {
                        wake_up(&counter->waitq);
                }
        }

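        /*
         * Acknowledge the overflows we handled: writing the bits to
         * GLOBAL_OVF_CTRL clears them in GLOBAL_STATUS.
         */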
        wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);

        /*
         * Repeat if there is more work to be done:
         */
        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
        if (status)
                goto again;
out:
        /*
         * Restore - do not reenable when global enable is off:
         */
        wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
}

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
        irq_enter();
#ifdef CONFIG_X86_64
        add_pda(apic_perf_irqs, 1);
#else
        per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++;
#endif
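        /*
         * The local APIC masks the LVTPC entry when a counter interrupt
         * is delivered, so re-arm the vector before handling it:
         */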
        apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        __smp_perf_counter_interrupt(regs, 0);

        irq_exit();
}

/*
 * Deferred wakeup handler: performs the wakeups that NMI context could
 * not do itself (it only set wakeup_pending and TIF_PERF_COUNTERS):
 */
void perf_counter_notify(struct pt_regs *regs)
{
        struct cpu_hw_counters *cpuc;
        unsigned long flags;
        int bit, cpu;

        local_irq_save(flags);
        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);

        for_each_bit(bit, cpuc->used, nr_hw_counters) {
                struct perf_counter *counter = cpuc->counters[bit];

                if (!counter)
                        continue;

                if (counter->wakeup_pending) {
                        counter->wakeup_pending = 0;
                        wake_up(&counter->waitq);
                }
        }

        local_irq_restore(flags);
}

void __cpuinit perf_counters_lapic_init(int nmi)
{
        u32 apic_val;

        if (!perf_counters_initialized)
                return;
        /*
         * Enable the performance counter vector in the APIC LVT:
         */
        apic_val = apic_read(APIC_LVTERR);

        apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
        if (nmi)
                apic_write(APIC_LVTPC, APIC_DM_NMI);
        else
                apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        apic_write(APIC_LVTERR, apic_val);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
                         unsigned long cmd, void *__args)
{
        struct die_args *args = __args;
        struct pt_regs *regs;

        if (likely(cmd != DIE_NMI_IPI))
                return NOTIFY_DONE;

        regs = args->regs;

        apic_write(APIC_LVTPC, APIC_DM_NMI);
        __smp_perf_counter_interrupt(regs, 1);

        return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
        .notifier_call          = perf_counter_nmi_handler
};

void __init init_hw_perf_counters(void)
{
        union cpuid10_eax eax;
        unsigned int unused;
        unsigned int ebx;

        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return;

        /*
         * Check whether the Architectural PerfMon supports
         * Branch Misses Retired Event or not.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
                return;

        printk(KERN_INFO "Intel Performance Monitoring support detected.\n");

        printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
        printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
        nr_hw_counters = eax.split.num_counters;
        if (nr_hw_counters > MAX_HW_COUNTERS) {
                nr_hw_counters = MAX_HW_COUNTERS;
                WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
                        nr_hw_counters, MAX_HW_COUNTERS);
        }
        perf_counter_mask = (1 << nr_hw_counters) - 1;
        perf_max_counters = nr_hw_counters;

        printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
        printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);

        perf_counters_lapic_init(0);
        register_die_notifier(&perf_counter_nmi_notifier);

        perf_counters_initialized = true;
}

static struct hw_perf_counter_ops x86_perf_counter_ops = {
        .hw_perf_counter_enable         = x86_perf_counter_enable,
        .hw_perf_counter_disable        = x86_perf_counter_disable,
        .hw_perf_counter_read           = x86_perf_counter_read,
};
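
/*
 * This ops vector ties the x86 code to the generic perf counter layer:
 * hw_perf_counter_init() below hands it back for every successfully
 * initialized counter, and the core then drives the counter through the
 * enable/disable/read methods above.
 */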

struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
{
        int err;

        err = __hw_perf_counter_init(counter);
        if (err)
                return NULL;

        return &x86_perf_counter_ops;
}