arch/x86/kernel/cpu/perf_counter.c  (pandora-kernel.git, commit 2d3681bbb5225429608b70811c18d97aca780344)
1 /*
2  * Performance counter x86 architecture code
3  *
4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6  *  Copyright(C) 2009 Jaswinder Singh Rajput
7  *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
8  *
9  *  For licencing details see kernel-base/COPYING
10  */
11
12 #include <linux/perf_counter.h>
13 #include <linux/capability.h>
14 #include <linux/notifier.h>
15 #include <linux/hardirq.h>
16 #include <linux/kprobes.h>
17 #include <linux/module.h>
18 #include <linux/kdebug.h>
19 #include <linux/sched.h>
20 #include <linux/uaccess.h>
21
22 #include <asm/apic.h>
23 #include <asm/stacktrace.h>
24 #include <asm/nmi.h>
25
26 static u64 perf_counter_mask __read_mostly;
27
28 struct cpu_hw_counters {
29         struct perf_counter     *counters[X86_PMC_IDX_MAX];
30         unsigned long           used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
31         unsigned long           active[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32         unsigned long           interrupts;
33         u64                     throttle_ctrl;
34         int                     enabled;
35 };
36
37 /*
38  * struct x86_pmu - generic x86 pmu
39  */
40 struct x86_pmu {
41         const char      *name;
42         int             version;
43         int             (*handle_irq)(struct pt_regs *, int);
44         u64             (*save_disable_all)(void);
45         void            (*restore_all)(u64);
46         void            (*enable)(struct hw_perf_counter *, int);
47         void            (*disable)(struct hw_perf_counter *, int);
48         unsigned        eventsel;
49         unsigned        perfctr;
50         u64             (*event_map)(int);
51         u64             (*raw_event)(u64);
52         int             max_events;
53         int             num_counters;
54         int             num_counters_fixed;
55         int             counter_bits;
56         u64             counter_mask;
57 };
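/*
 * There is one statically initialized template of this per supported
 * vendor (intel_pmu and amd_pmu further down); intel_pmu_init() and
 * amd_pmu_init() copy the matching one into x86_pmu, and the Intel path
 * additionally fills in the CPUID-probed counter geometry.
 */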
58
59 static struct x86_pmu x86_pmu __read_mostly;
60
61 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
62         .enabled = 1,
63 };
64
65 /*
66  * Intel PerfMon v3. Used on Core2 and later.
67  */
68 static const u64 intel_perfmon_event_map[] =
69 {
70   [PERF_COUNT_CPU_CYCLES]               = 0x003c,
71   [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
72   [PERF_COUNT_CACHE_REFERENCES]         = 0x4f2e,
73   [PERF_COUNT_CACHE_MISSES]             = 0x412e,
74   [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
75   [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
76   [PERF_COUNT_BUS_CYCLES]               = 0x013c,
77 };
78
79 static u64 intel_pmu_event_map(int event)
80 {
81         return intel_perfmon_event_map[event];
82 }
83
84 static u64 intel_pmu_raw_event(u64 event)
85 {
86 #define CORE_EVNTSEL_EVENT_MASK         0x000000FFULL
87 #define CORE_EVNTSEL_UNIT_MASK          0x0000FF00ULL
88 #define CORE_EVNTSEL_COUNTER_MASK       0xFF000000ULL
89
90 #define CORE_EVNTSEL_MASK               \
91         (CORE_EVNTSEL_EVENT_MASK |      \
92          CORE_EVNTSEL_UNIT_MASK  |      \
93          CORE_EVNTSEL_COUNTER_MASK)
94
95         return event & CORE_EVNTSEL_MASK;
96 }
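/*
 * For example, a raw config of 0x412e (event select 0x2e, unit mask 0x41,
 * the same encoding intel_perfmon_event_map[] uses for
 * PERF_COUNT_CACHE_MISSES) passes through unchanged, while bits outside
 * the three masks, such as the enable and interrupt control bits, are
 * masked off; __hw_perf_counter_init() manages those bits itself.
 */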
97
98 /*
99  * AMD Performance Monitor K7 and later.
100  */
101 static const u64 amd_perfmon_event_map[] =
102 {
103   [PERF_COUNT_CPU_CYCLES]               = 0x0076,
104   [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
105   [PERF_COUNT_CACHE_REFERENCES]         = 0x0080,
106   [PERF_COUNT_CACHE_MISSES]             = 0x0081,
107   [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
108   [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
109 };
110
111 static u64 amd_pmu_event_map(int event)
112 {
113         return amd_perfmon_event_map[event];
114 }
115
116 static u64 amd_pmu_raw_event(u64 event)
117 {
118 #define K7_EVNTSEL_EVENT_MASK   0x7000000FFULL
119 #define K7_EVNTSEL_UNIT_MASK    0x00000FF00ULL
120 #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
121
122 #define K7_EVNTSEL_MASK                 \
123         (K7_EVNTSEL_EVENT_MASK |        \
124          K7_EVNTSEL_UNIT_MASK  |        \
125          K7_EVNTSEL_COUNTER_MASK)
126
127         return event & K7_EVNTSEL_MASK;
128 }
129
130 /*
131  * Propagate counter elapsed time into the generic counter.
132  * Can only be executed on the CPU where the counter is active.
133  * Updates counter->count and hwc->period_left in place.
134  */
135 static void
136 x86_perf_counter_update(struct perf_counter *counter,
137                         struct hw_perf_counter *hwc, int idx)
138 {
139         u64 prev_raw_count, new_raw_count, delta;
140
141         /*
142          * Careful: an NMI might modify the previous counter value.
143          *
144          * Our tactic to handle this is to first atomically read and
145          * exchange a new raw count - then add that new-prev delta
146          * count to the generic counter atomically:
147          */
148 again:
149         prev_raw_count = atomic64_read(&hwc->prev_count);
150         rdmsrl(hwc->counter_base + idx, new_raw_count);
151
152         if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
153                                         new_raw_count) != prev_raw_count)
154                 goto again;
155
156         /*
157          * Now we have the new raw value and have updated the prev
158          * timestamp already. We can now calculate the elapsed delta
159          * (counter-)time and add that to the generic counter.
160          *
161          * Careful, not all hw sign-extends above the physical width
162          * of the count, so we do that by clipping the delta to 32 bits:
163          */
164         delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
165
166         atomic64_add(delta, &counter->count);
167         atomic64_sub(delta, &hwc->period_left);
168 }
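/*
 * The 32-bit clipping also makes counter wrap-around come out right:
 * e.g. prev_raw_count == 0xfffffff0 and new_raw_count == 0x00000010
 * yields (s32)0x10 - (s32)0xfffffff0 == 0x20, i.e. 32 events, instead
 * of a huge bogus delta.
 */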
169
170 static atomic_t num_counters;
171 static DEFINE_MUTEX(pmc_reserve_mutex);
172
173 static bool reserve_pmc_hardware(void)
174 {
175         int i;
176
177         if (nmi_watchdog == NMI_LOCAL_APIC)
178                 disable_lapic_nmi_watchdog();
179
180         for (i = 0; i < x86_pmu.num_counters; i++) {
181                 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
182                         goto perfctr_fail;
183         }
184
185         for (i = 0; i < x86_pmu.num_counters; i++) {
186                 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
187                         goto eventsel_fail;
188         }
189
190         return true;
191
192 eventsel_fail:
193         for (i--; i >= 0; i--)
194                 release_evntsel_nmi(x86_pmu.eventsel + i);
195
196         i = x86_pmu.num_counters;
197
198 perfctr_fail:
199         for (i--; i >= 0; i--)
200                 release_perfctr_nmi(x86_pmu.perfctr + i);
201
202         if (nmi_watchdog == NMI_LOCAL_APIC)
203                 enable_lapic_nmi_watchdog();
204
205         return false;
206 }
207
208 static void release_pmc_hardware(void)
209 {
210         int i;
211
212         for (i = 0; i < x86_pmu.num_counters; i++) {
213                 release_perfctr_nmi(x86_pmu.perfctr + i);
214                 release_evntsel_nmi(x86_pmu.eventsel + i);
215         }
216
217         if (nmi_watchdog == NMI_LOCAL_APIC)
218                 enable_lapic_nmi_watchdog();
219 }
220
221 static void hw_perf_counter_destroy(struct perf_counter *counter)
222 {
223         if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
224                 release_pmc_hardware();
225                 mutex_unlock(&pmc_reserve_mutex);
226         }
227 }
228
229 static inline int x86_pmu_initialized(void)
230 {
231         return x86_pmu.handle_irq != NULL;
232 }
233
234 /*
235  * Set up the hardware configuration for a given hw_event_type
236  */
237 static int __hw_perf_counter_init(struct perf_counter *counter)
238 {
239         struct perf_counter_hw_event *hw_event = &counter->hw_event;
240         struct hw_perf_counter *hwc = &counter->hw;
241         int err;
242
243         /* AMD support is temporarily disabled: */
244         if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
245                 return -ENOSYS;
246
247         if (!x86_pmu_initialized())
248                 return -ENODEV;
249
250         err = 0;
251         if (!atomic_inc_not_zero(&num_counters)) {
252                 mutex_lock(&pmc_reserve_mutex);
253                 if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
254                         err = -EBUSY;
255                 else
256                         atomic_inc(&num_counters);
257                 mutex_unlock(&pmc_reserve_mutex);
258         }
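        /*
         * The first counter to be initialized reserves the PMC MSRs
         * (and takes them away from the lapic NMI watchdog) under
         * pmc_reserve_mutex; later counters only bump num_counters.
         * The matching release happens in hw_perf_counter_destroy()
         * when the last counter is torn down.
         */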
259         if (err)
260                 return err;
261
262         /*
263          * Generate PMC IRQs:
264          * (keep 'enabled' bit clear for now)
265          */
266         hwc->config = ARCH_PERFMON_EVENTSEL_INT;
267
268         /*
269          * Count user and OS events unless requested not to.
270          */
271         if (!hw_event->exclude_user)
272                 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
273         if (!hw_event->exclude_kernel)
274                 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
275
276         /*
277          * If privileged enough, allow NMI events:
278          */
279         hwc->nmi = 0;
280         if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
281                 hwc->nmi = 1;
282
283         hwc->irq_period         = hw_event->irq_period;
284         /*
285          * Intel PMCs cannot be accessed sanely above 32 bit width,
286          * so we install an artificial 1<<31 period regardless of
287          * the generic counter period:
288          */
289         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
290                 if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
291                         hwc->irq_period = 0x7FFFFFFF;
292
293         atomic64_set(&hwc->period_left, hwc->irq_period);
294
295         /*
296          * Raw event types provide the config in the event structure
297          */
298         if (perf_event_raw(hw_event)) {
299                 hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
300         } else {
301                 if (perf_event_id(hw_event) >= x86_pmu.max_events)
302                         return -EINVAL;
303                 /*
304                  * The generic map:
305                  */
306                 hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
307         }
308
309         counter->destroy = hw_perf_counter_destroy;
310
311         return 0;
312 }
313
314 static u64 intel_pmu_save_disable_all(void)
315 {
316         u64 ctrl;
317
318         rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
319         wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
320
321         return ctrl;
322 }
323
324 static u64 amd_pmu_save_disable_all(void)
325 {
326         struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
327         int enabled, idx;
328
329         enabled = cpuc->enabled;
330         cpuc->enabled = 0;
331         /*
332          * ensure we write the disable before we start disabling the
333          * counters proper, so that amd_pmu_enable_counter() does the
334          * right thing.
335          */
336         barrier();
337
338         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
339                 u64 val;
340
341                 if (!test_bit(idx, cpuc->active))
342                         continue;
343                 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
344                 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
345                         continue;
346                 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
347                 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
348         }
349
350         return enabled;
351 }
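/*
 * Unlike the Intel path above there is no global control MSR on AMD,
 * so "disable all" clears the enable bit of every active event-select
 * register individually.  The saved cpuc->enabled flag is what
 * amd_pmu_restore_all() and amd_pmu_enable_counter() key off.
 */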
352
353 u64 hw_perf_save_disable(void)
354 {
355         if (!x86_pmu_initialized())
356                 return 0;
357         return x86_pmu.save_disable_all();
358 }
359 /*
360  * Exported because of ACPI idle
361  */
362 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
363
364 static void intel_pmu_restore_all(u64 ctrl)
365 {
366         wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
367 }
368
369 static void amd_pmu_restore_all(u64 ctrl)
370 {
371         struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
372         int idx;
373
374         cpuc->enabled = ctrl;
375         barrier();
376         if (!ctrl)
377                 return;
378
379         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
380                 u64 val;
381
382                 if (!test_bit(idx, cpuc->active))
383                         continue;
384                 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
385                 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
386                         continue;
387                 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
388                 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
389         }
390 }
391
392 void hw_perf_restore(u64 ctrl)
393 {
394         if (!x86_pmu_initialized())
395                 return;
396         x86_pmu.restore_all(ctrl);
397 }
398 /*
399  * Exported because of ACPI idle
400  */
401 EXPORT_SYMBOL_GPL(hw_perf_restore);
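/*
 * A minimal usage sketch for the two exports above, assuming a caller
 * (e.g. the ACPI idle code mentioned in the comments) that needs the
 * PMU quiescent across a critical section:
 *
 *	u64 ctrl = hw_perf_save_disable();
 *	... enter and leave the low-power / critical section ...
 *	hw_perf_restore(ctrl);
 */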
402
403 static inline u64 intel_pmu_get_status(u64 mask)
404 {
405         u64 status;
406
407         rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
408
409         return status;
410 }
411
412 static inline void intel_pmu_ack_status(u64 ack)
413 {
414         wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
415 }
416
417 static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
418 {
419         int err;
420         err = checking_wrmsrl(hwc->config_base + idx,
421                               hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
422 }
423
424 static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
425 {
426         int err;
427         err = checking_wrmsrl(hwc->config_base + idx,
428                               hwc->config);
429 }
430
431 static inline void
432 intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
433 {
434         int idx = __idx - X86_PMC_IDX_FIXED;
435         u64 ctrl_val, mask;
436         int err;
437
438         mask = 0xfULL << (idx * 4);
439
440         rdmsrl(hwc->config_base, ctrl_val);
441         ctrl_val &= ~mask;
442         err = checking_wrmsrl(hwc->config_base, ctrl_val);
443 }
444
445 static inline void
446 intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
447 {
448         if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
449                 intel_pmu_disable_fixed(hwc, idx);
450                 return;
451         }
452
453         x86_pmu_disable_counter(hwc, idx);
454 }
455
456 static inline void
457 amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
458 {
459         x86_pmu_disable_counter(hwc, idx);
460 }
461
462 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
463
464 /*
465  * Set the next IRQ period, based on the hwc->period_left value.
466  * To be called with the counter disabled in hw:
467  */
468 static void
469 x86_perf_counter_set_period(struct perf_counter *counter,
470                              struct hw_perf_counter *hwc, int idx)
471 {
472         s64 left = atomic64_read(&hwc->period_left);
473         s64 period = hwc->irq_period;
474         int err;
475
476         /*
477          * If we are way outside a reasonable range then just skip forward:
478          */
479         if (unlikely(left <= -period)) {
480                 left = period;
481                 atomic64_set(&hwc->period_left, left);
482         }
483
484         if (unlikely(left <= 0)) {
485                 left += period;
486                 atomic64_set(&hwc->period_left, left);
487         }
488
489         per_cpu(prev_left[idx], smp_processor_id()) = left;
490
491         /*
492          * The hw counter starts counting from this counter offset;
493          * mark it to be able to extract future deltas:
494          */
495         atomic64_set(&hwc->prev_count, (u64)-left);
496
497         err = checking_wrmsrl(hwc->counter_base + idx,
498                              (u64)(-left) & x86_pmu.counter_mask);
499 }
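/*
 * Example: with hwc->irq_period == 100000 and a freshly scheduled
 * counter, left == 100000 and the PMC is written with (u64)-100000
 * (clipped to x86_pmu.counter_mask), so the hardware counts up,
 * overflows and raises its interrupt after exactly 100000 events.
 */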
500
501 static inline void
502 intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
503 {
504         int idx = __idx - X86_PMC_IDX_FIXED;
505         u64 ctrl_val, bits, mask;
506         int err;
507
508         /*
509          * Enable IRQ generation (0x8),
510          * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
511          * if requested:
512          */
513         bits = 0x8ULL;
514         if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
515                 bits |= 0x2;
516         if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
517                 bits |= 0x1;
518         bits <<= (idx * 4);
519         mask = 0xfULL << (idx * 4);
520
521         rdmsrl(hwc->config_base, ctrl_val);
522         ctrl_val &= ~mask;
523         ctrl_val |= bits;
524         err = checking_wrmsrl(hwc->config_base, ctrl_val);
525 }
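/*
 * Example: for fixed counter 1 (__idx == X86_PMC_IDX_FIXED + 1) with
 * both USR and OS counting requested, bits == 0xb (PMI + ring-3 +
 * ring-0) and the code above rewrites control nibble 7:4 of
 * MSR_ARCH_PERFMON_FIXED_CTR_CTRL to 0xb, leaving the nibbles of the
 * other fixed counters untouched.
 */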
526
527 static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
528 {
529         if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
530                 intel_pmu_enable_fixed(hwc, idx);
531                 return;
532         }
533
534         x86_pmu_enable_counter(hwc, idx);
535 }
536
537 static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
538 {
539         struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
540
541         if (cpuc->enabled)
542                 x86_pmu_enable_counter(hwc, idx);
543         else
544                 x86_pmu_disable_counter(hwc, idx);
545 }
546
547 static int
548 fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
549 {
550         unsigned int event;
551
552         if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
553                 return -1;
554
555         if (unlikely(hwc->nmi))
556                 return -1;
557
558         event = hwc->config & ARCH_PERFMON_EVENT_MASK;
559
560         if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
561                 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
562         if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
563                 return X86_PMC_IDX_FIXED_CPU_CYCLES;
564         if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
565                 return X86_PMC_IDX_FIXED_BUS_CYCLES;
566
567         return -1;
568 }
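/*
 * In other words only the instruction, cpu-cycle and bus-cycle events
 * are eligible for an Intel fixed-function counter, and only when they
 * do not need NMI delivery; everything else (and all of AMD) falls
 * back to the generic PMCs in x86_pmu_enable() below.
 */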
569
570 /*
571  * Find a PMC slot for the freshly enabled / scheduled in counter:
572  */
573 static int x86_pmu_enable(struct perf_counter *counter)
574 {
575         struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
576         struct hw_perf_counter *hwc = &counter->hw;
577         int idx;
578
579         idx = fixed_mode_idx(counter, hwc);
580         if (idx >= 0) {
581                 /*
582                  * Try to get the fixed counter, if that is already taken
583                  * then try to get a generic counter:
584                  */
585                 if (test_and_set_bit(idx, cpuc->used))
586                         goto try_generic;
587
588                 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
589                 /*
590                  * We set it so that counter_base + idx in wrmsr/rdmsr maps to
591                  * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
592                  */
593                 hwc->counter_base =
594                         MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
595                 hwc->idx = idx;
596         } else {
597                 idx = hwc->idx;
598                 /* Try to get the previous generic counter again */
599                 if (test_and_set_bit(idx, cpuc->used)) {
600 try_generic:
601                         idx = find_first_zero_bit(cpuc->used,
602                                                   x86_pmu.num_counters);
603                         if (idx == x86_pmu.num_counters)
604                                 return -EAGAIN;
605
606                         set_bit(idx, cpuc->used);
607                         hwc->idx = idx;
608                 }
609                 hwc->config_base  = x86_pmu.eventsel;
610                 hwc->counter_base = x86_pmu.perfctr;
611         }
612
613         perf_counters_lapic_init(hwc->nmi);
614
615         x86_pmu.disable(hwc, idx);
616
617         cpuc->counters[idx] = counter;
618         set_bit(idx, cpuc->active);
619
620         x86_perf_counter_set_period(counter, hwc, idx);
621         x86_pmu.enable(hwc, idx);
622
623         return 0;
624 }
625
626 void perf_counter_print_debug(void)
627 {
628         u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
629         struct cpu_hw_counters *cpuc;
630         int cpu, idx;
631
632         if (!x86_pmu.num_counters)
633                 return;
634
635         local_irq_disable();
636
637         cpu = smp_processor_id();
638         cpuc = &per_cpu(cpu_hw_counters, cpu);
639
640         if (x86_pmu.version >= 2) {
641                 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
642                 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
643                 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
644                 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
645
646                 pr_info("\n");
647                 pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
648                 pr_info("CPU#%d: status:     %016llx\n", cpu, status);
649                 pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
650                 pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
651         }
652         pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
653
654         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
655                 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
656                 rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
657
658                 prev_left = per_cpu(prev_left[idx], cpu);
659
660                 pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
661                         cpu, idx, pmc_ctrl);
662                 pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
663                         cpu, idx, pmc_count);
664                 pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
665                         cpu, idx, prev_left);
666         }
667         for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
668                 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
669
670                 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
671                         cpu, idx, pmc_count);
672         }
673         local_irq_enable();
674 }
675
676 static void x86_pmu_disable(struct perf_counter *counter)
677 {
678         struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
679         struct hw_perf_counter *hwc = &counter->hw;
680         int idx = hwc->idx;
681
682         /*
683          * Must be done before we disable, otherwise the nmi handler
684          * could re-enable it:
685          */
686         clear_bit(idx, cpuc->active);
687         x86_pmu.disable(hwc, idx);
688
689         /*
690          * Make sure the cleared pointer becomes visible before we
691          * (potentially) free the counter:
692          */
693         barrier();
694
695         /*
696          * Drain the remaining delta count out of a counter
697          * that we are disabling:
698          */
699         x86_perf_counter_update(counter, hwc, idx);
700         cpuc->counters[idx] = NULL;
701         clear_bit(idx, cpuc->used);
702 }
703
704 /*
705  * Save and restart an expired counter. Called by NMI contexts,
706  * so it has to be careful about preempting normal counter ops:
707  */
708 static void intel_pmu_save_and_restart(struct perf_counter *counter)
709 {
710         struct hw_perf_counter *hwc = &counter->hw;
711         int idx = hwc->idx;
712
713         x86_perf_counter_update(counter, hwc, idx);
714         x86_perf_counter_set_period(counter, hwc, idx);
715
716         if (counter->state == PERF_COUNTER_STATE_ACTIVE)
717                 intel_pmu_enable_counter(hwc, idx);
718 }
719
720 /*
721  * Maximum interrupt frequency of 100KHz per CPU
722  */
723 #define PERFMON_MAX_INTERRUPTS (100000/HZ)
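/*
 * The limit is really "interrupts per unthrottle period": the 100KHz
 * figure above assumes perf_counter_unthrottle() is called once per
 * timer tick, so with HZ == 1000 at most 100 PMU interrupts are
 * serviced per tick before the PMU is left disabled.
 */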
724
725 /*
726  * This handler is triggered by the local APIC, so the APIC IRQ handling
727  * rules apply:
728  */
729 static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
730 {
731         int bit, cpu = smp_processor_id();
732         u64 ack, status;
733         struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
734         int ret = 0;
735
736         cpuc->throttle_ctrl = intel_pmu_save_disable_all();
737
738         status = intel_pmu_get_status(cpuc->throttle_ctrl);
739         if (!status)
740                 goto out;
741
742         ret = 1;
743 again:
744         inc_irq_stat(apic_perf_irqs);
745         ack = status;
746         for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
747                 struct perf_counter *counter = cpuc->counters[bit];
748
749                 clear_bit(bit, (unsigned long *) &status);
750                 if (!test_bit(bit, cpuc->active))
751                         continue;
752
753                 intel_pmu_save_and_restart(counter);
754                 if (perf_counter_overflow(counter, nmi, regs, 0))
755                         intel_pmu_disable_counter(&counter->hw, bit);
756         }
757
758         intel_pmu_ack_status(ack);
759
760         /*
761          * Repeat if there is more work to be done:
762          */
763         status = intel_pmu_get_status(cpuc->throttle_ctrl);
764         if (status)
765                 goto again;
766 out:
767         /*
768          * Restore - do not reenable when global enable is off or throttled:
769          */
770         if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
771                 intel_pmu_restore_all(cpuc->throttle_ctrl);
772
773         return ret;
774 }
775
776 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; }
777
778 void perf_counter_unthrottle(void)
779 {
780         struct cpu_hw_counters *cpuc;
781
782         if (!x86_pmu_initialized())
783                 return;
784
785         if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
786                 return;
787
788         cpuc = &__get_cpu_var(cpu_hw_counters);
789         if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
790                 if (printk_ratelimit())
791                         printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
792                 hw_perf_restore(cpuc->throttle_ctrl);
793         }
794         cpuc->interrupts = 0;
795 }
796
797 void smp_perf_counter_interrupt(struct pt_regs *regs)
798 {
799         irq_enter();
800         apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
801         ack_APIC_irq();
802         x86_pmu.handle_irq(regs, 0);
803         irq_exit();
804 }
805
806 void smp_perf_pending_interrupt(struct pt_regs *regs)
807 {
808         irq_enter();
809         ack_APIC_irq();
810         inc_irq_stat(apic_pending_irqs);
811         perf_counter_do_pending();
812         irq_exit();
813 }
814
815 void set_perf_counter_pending(void)
816 {
817         apic->send_IPI_self(LOCAL_PENDING_VECTOR);
818 }
819
820 void perf_counters_lapic_init(int nmi)
821 {
822         u32 apic_val;
823
824         if (!x86_pmu_initialized())
825                 return;
826
827         /*
828          * Enable the performance counter vector in the APIC LVT:
829          */
830         apic_val = apic_read(APIC_LVTERR);
831
832         apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
833         if (nmi)
834                 apic_write(APIC_LVTPC, APIC_DM_NMI);
835         else
836                 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
837         apic_write(APIC_LVTERR, apic_val);
838 }
839
840 static int __kprobes
841 perf_counter_nmi_handler(struct notifier_block *self,
842                          unsigned long cmd, void *__args)
843 {
844         struct die_args *args = __args;
845         struct pt_regs *regs;
846         int ret;
847
848         switch (cmd) {
849         case DIE_NMI:
850         case DIE_NMI_IPI:
851                 break;
852
853         default:
854                 return NOTIFY_DONE;
855         }
856
857         regs = args->regs;
858
859         apic_write(APIC_LVTPC, APIC_DM_NMI);
860         ret = x86_pmu.handle_irq(regs, 1);
861
862         return ret ? NOTIFY_STOP : NOTIFY_OK;
863 }
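/*
 * Returning NOTIFY_STOP whenever at least one counter was handled stops
 * the remaining die-notifier processing for this NMI, so other users of
 * the chain do not also try to handle an NMI that was ours.
 */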
864
865 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
866         .notifier_call          = perf_counter_nmi_handler,
867         .next                   = NULL,
868         .priority               = 1
869 };
870
871 static struct x86_pmu intel_pmu = {
872         .name                   = "Intel",
873         .handle_irq             = intel_pmu_handle_irq,
874         .save_disable_all       = intel_pmu_save_disable_all,
875         .restore_all            = intel_pmu_restore_all,
876         .enable                 = intel_pmu_enable_counter,
877         .disable                = intel_pmu_disable_counter,
878         .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
879         .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
880         .event_map              = intel_pmu_event_map,
881         .raw_event              = intel_pmu_raw_event,
882         .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
883 };
884
885 static struct x86_pmu amd_pmu = {
886         .name                   = "AMD",
887         .handle_irq             = amd_pmu_handle_irq,
888         .save_disable_all       = amd_pmu_save_disable_all,
889         .restore_all            = amd_pmu_restore_all,
890         .enable                 = amd_pmu_enable_counter,
891         .disable                = amd_pmu_disable_counter,
892         .eventsel               = MSR_K7_EVNTSEL0,
893         .perfctr                = MSR_K7_PERFCTR0,
894         .event_map              = amd_pmu_event_map,
895         .raw_event              = amd_pmu_raw_event,
896         .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
897         .num_counters           = 4,
898         .counter_bits           = 48,
899         .counter_mask           = (1ULL << 48) - 1,
900 };
901
902 static int intel_pmu_init(void)
903 {
904         union cpuid10_edx edx;
905         union cpuid10_eax eax;
906         unsigned int unused;
907         unsigned int ebx;
908         int version;
909
910         if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
911                 return -ENODEV;
912
913         /*
914          * Check whether the Architectural PerfMon supports
915          * Branch Misses Retired Event or not.
916          */
917         cpuid(10, &eax.full, &ebx, &unused, &edx.full);
918         if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
919                 return -ENODEV;
920
921         version = eax.split.version_id;
922         if (version < 2)
923                 return -ENODEV;
924
925         x86_pmu = intel_pmu;
926         x86_pmu.version = version;
927         x86_pmu.num_counters = eax.split.num_counters;
928         x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
929         x86_pmu.counter_bits = eax.split.bit_width;
930         x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
931
932         return 0;
933 }
934
935 static int amd_pmu_init(void)
936 {
937         x86_pmu = amd_pmu;
938         return 0;
939 }
940
941 void __init init_hw_perf_counters(void)
942 {
943         int err;
944
945         switch (boot_cpu_data.x86_vendor) {
946         case X86_VENDOR_INTEL:
947                 err = intel_pmu_init();
948                 break;
949         case X86_VENDOR_AMD:
950                 err = amd_pmu_init();
951                 break;
952         default:
953                 return;
954         }
955         if (err != 0)
956                 return;
957
958         pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
959         pr_info("... version:         %d\n", x86_pmu.version);
960         pr_info("... bit width:       %d\n", x86_pmu.counter_bits);
961
962         pr_info("... num counters:    %d\n", x86_pmu.num_counters);
963         if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
964                 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
965                 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
966                      x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
967         }
968         perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
969         perf_max_counters = x86_pmu.num_counters;
970
971         pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
972
973         if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
974                 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
975                 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
976                      x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
977         }
978         pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);
979
980         perf_counter_mask |=
981                 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
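        /*
         * Example: 4 generic plus 3 fixed counters gives
         * perf_counter_mask == 0x000000070000000f, assuming
         * X86_PMC_IDX_FIXED is 32 as on this kernel.
         */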
982
983         pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
984
985         perf_counters_lapic_init(0);
986         register_die_notifier(&perf_counter_nmi_notifier);
987 }
988
989 static inline void x86_pmu_read(struct perf_counter *counter)
990 {
991         x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
992 }
993
994 static const struct pmu pmu = {
995         .enable         = x86_pmu_enable,
996         .disable        = x86_pmu_disable,
997         .read           = x86_pmu_read,
998 };
999
1000 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1001 {
1002         int err;
1003
1004         err = __hw_perf_counter_init(counter);
1005         if (err)
1006                 return ERR_PTR(err);
1007
1008         return &pmu;
1009 }
1010
1011 /*
1012  * callchain support
1013  */
1014
1015 static inline
1016 void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1017 {
1018         if (entry->nr < MAX_STACK_DEPTH)
1019                 entry->ip[entry->nr++] = ip;
1020 }
1021
1022 static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1023 static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1024
1025
1026 static void
1027 backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1028 {
1029         /* Ignore warnings */
1030 }
1031
1032 static void backtrace_warning(void *data, char *msg)
1033 {
1034         /* Ignore warnings */
1035 }
1036
1037 static int backtrace_stack(void *data, char *name)
1038 {
1039         /* Don't bother with IRQ stacks for now */
1040         return -1;
1041 }
1042
1043 static void backtrace_address(void *data, unsigned long addr, int reliable)
1044 {
1045         struct perf_callchain_entry *entry = data;
1046
1047         if (reliable)
1048                 callchain_store(entry, addr);
1049 }
1050
1051 static const struct stacktrace_ops backtrace_ops = {
1052         .warning                = backtrace_warning,
1053         .warning_symbol         = backtrace_warning_symbol,
1054         .stack                  = backtrace_stack,
1055         .address                = backtrace_address,
1056 };
1057
1058 static void
1059 perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1060 {
1061         unsigned long bp;
1062         char *stack;
1063         int nr = entry->nr;
1064
1065         callchain_store(entry, instruction_pointer(regs));
1066
1067         stack = ((char *)regs + sizeof(struct pt_regs));
1068 #ifdef CONFIG_FRAME_POINTER
1069         bp = frame_pointer(regs);
1070 #else
1071         bp = 0;
1072 #endif
1073
1074         dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1075
1076         entry->kernel = entry->nr - nr;
1077 }
1078
1079
1080 struct stack_frame {
1081         const void __user       *next_fp;
1082         unsigned long           return_address;
1083 };
1084
1085 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1086 {
1087         int ret;
1088
1089         if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1090                 return 0;
1091
1092         ret = 1;
1093         pagefault_disable();
1094         if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1095                 ret = 0;
1096         pagefault_enable();
1097
1098         return ret;
1099 }
1100
1101 static void
1102 perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1103 {
1104         struct stack_frame frame;
1105         const void __user *fp;
1106         int nr = entry->nr;
1107
1108         regs = (struct pt_regs *)current->thread.sp0 - 1;
1109         fp   = (void __user *)regs->bp;
1110
1111         callchain_store(entry, regs->ip);
1112
1113         while (entry->nr < MAX_STACK_DEPTH) {
1114                 frame.next_fp        = NULL;
1115                 frame.return_address = 0;
1116
1117                 if (!copy_stack_frame(fp, &frame))
1118                         break;
1119
1120                 if ((unsigned long)fp < user_stack_pointer(regs))
1121                         break;
1122
1123                 callchain_store(entry, frame.return_address);
1124                 fp = frame.next_fp;
1125         }
1126
1127         entry->user = entry->nr - nr;
1128 }
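/*
 * This user-space walk only works for code compiled with frame
 * pointers: it follows the (next_fp, return_address) pairs saved at
 * the base of each frame and stops at the first frame that cannot be
 * copied safely or that points below the current stack pointer.
 */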
1129
1130 static void
1131 perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1132 {
1133         int is_user;
1134
1135         if (!regs)
1136                 return;
1137
1138         is_user = user_mode(regs);
1139
1140         if (!current || current->pid == 0)
1141                 return;
1142
1143         if (is_user && current->state != TASK_RUNNING)
1144                 return;
1145
1146         if (!is_user)
1147                 perf_callchain_kernel(regs, entry);
1148
1149         if (current->mm)
1150                 perf_callchain_user(regs, entry);
1151 }
1152
1153 struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1154 {
1155         struct perf_callchain_entry *entry;
1156
1157         if (in_nmi())
1158                 entry = &__get_cpu_var(nmi_entry);
1159         else
1160                 entry = &__get_cpu_var(irq_entry);
1161
1162         entry->nr = 0;
1163         entry->hv = 0;
1164         entry->kernel = 0;
1165         entry->user = 0;
1166
1167         perf_do_callchain(regs, entry);
1168
1169         return entry;
1170 }