perf_counter, x86: rework pmc_amd_save_disable_all() and pmc_amd_restore_all()
[pandora-kernel.git] arch/x86/kernel/cpu/perf_counter.c
/*
 * Performance counter x86 architecture code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2009 Jaswinder Singh Rajput
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>

static bool perf_counters_initialized __read_mostly;

/*
 * Number of (generic) HW counters:
 */
static int nr_counters_generic __read_mostly;
static u64 perf_counter_mask __read_mostly;
static u64 counter_value_mask __read_mostly;
static int counter_value_bits __read_mostly;

static int nr_counters_fixed __read_mostly;

struct cpu_hw_counters {
        struct perf_counter     *counters[X86_PMC_IDX_MAX];
        unsigned long           used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           interrupts;
        u64                     throttle_ctrl;
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        int                     enabled;
};

/*
 * struct pmc_x86_ops - performance counter x86 ops
 */
struct pmc_x86_ops {
        u64             (*save_disable_all)(void);
        void            (*restore_all)(u64);
        u64             (*get_status)(u64);
        void            (*ack_status)(u64);
        void            (*enable)(int, u64);
        void            (*disable)(int, u64);
        unsigned        eventsel;
        unsigned        perfctr;
        u64             (*event_map)(int);
        u64             (*raw_event)(u64);
        int             max_events;
};

static struct pmc_x86_ops *pmc_ops __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
        .enabled = 1,
};

static __read_mostly int intel_perfmon_version;

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const u64 intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]               = 0x003c,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]             = 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
  [PERF_COUNT_BUS_CYCLES]               = 0x013c,
};

static u64 pmc_intel_event_map(int event)
{
        return intel_perfmon_event_map[event];
}

static u64 pmc_intel_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK         0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK          0x0000FF00ULL
#define CORE_EVNTSEL_COUNTER_MASK       0xFF000000ULL

#define CORE_EVNTSEL_MASK               \
        (CORE_EVNTSEL_EVENT_MASK |      \
         CORE_EVNTSEL_UNIT_MASK  |      \
         CORE_EVNTSEL_COUNTER_MASK)

        return event & CORE_EVNTSEL_MASK;
}

/*
 * AMD Performance Monitor K7 and later.
 */
static const u64 amd_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]               = 0x0076,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x0080,
  [PERF_COUNT_CACHE_MISSES]             = 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
};

static u64 pmc_amd_event_map(int event)
{
        return amd_perfmon_event_map[event];
}

static u64 pmc_amd_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK   0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK    0x00000FF00ULL
#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL

#define K7_EVNTSEL_MASK                 \
        (K7_EVNTSEL_EVENT_MASK |        \
         K7_EVNTSEL_UNIT_MASK  |        \
         K7_EVNTSEL_COUNTER_MASK)

        return event & K7_EVNTSEL_MASK;
}

/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 */
static void
x86_perf_counter_update(struct perf_counter *counter,
                        struct hw_perf_counter *hwc, int idx)
{
        u64 prev_raw_count, new_raw_count, delta;

        /*
         * Careful: an NMI might modify the previous counter value.
         *
         * Our tactic to handle this is to first atomically read and
         * exchange a new raw count - then add that new-prev delta
         * count to the generic counter atomically:
         */
again:
        prev_raw_count = atomic64_read(&hwc->prev_count);
        rdmsrl(hwc->counter_base + idx, new_raw_count);

        if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
                                        new_raw_count) != prev_raw_count)
                goto again;

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (counter-)time and add that to the generic counter.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count, so we do that by clipping the delta to 32 bits:
         */
        delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);

        atomic64_add(delta, &counter->count);
        atomic64_sub(delta, &hwc->period_left);
}

static atomic_t num_counters;
static DEFINE_MUTEX(pmc_reserve_mutex);

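/*
 * Grab the perfctr and eventsel MSRs of all generic counters from the
 * NMI-watchdog reservation layer, temporarily disabling the lapic NMI
 * watchdog while they are in use. Rolls back any partial reservation
 * and returns false if another user already holds them.
 */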
static bool reserve_pmc_hardware(void)
{
        int i;

        if (nmi_watchdog == NMI_LOCAL_APIC)
                disable_lapic_nmi_watchdog();

        for (i = 0; i < nr_counters_generic; i++) {
                if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
                        goto perfctr_fail;
        }

        for (i = 0; i < nr_counters_generic; i++) {
                if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
                        goto eventsel_fail;
        }

        return true;

eventsel_fail:
        for (i--; i >= 0; i--)
                release_evntsel_nmi(pmc_ops->eventsel + i);

        i = nr_counters_generic;

perfctr_fail:
        for (i--; i >= 0; i--)
                release_perfctr_nmi(pmc_ops->perfctr + i);

        if (nmi_watchdog == NMI_LOCAL_APIC)
                enable_lapic_nmi_watchdog();

        return false;
}

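/*
 * Drop the MSR reservations taken in reserve_pmc_hardware() and give
 * the lapic NMI watchdog back.
 */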
static void release_pmc_hardware(void)
{
        int i;

        for (i = 0; i < nr_counters_generic; i++) {
                release_perfctr_nmi(pmc_ops->perfctr + i);
                release_evntsel_nmi(pmc_ops->eventsel + i);
        }

        if (nmi_watchdog == NMI_LOCAL_APIC)
                enable_lapic_nmi_watchdog();
}

static void hw_perf_counter_destroy(struct perf_counter *counter)
{
        if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
                release_pmc_hardware();
                mutex_unlock(&pmc_reserve_mutex);
        }
}

/*
 * Setup the hardware configuration for a given hw_event_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
        struct perf_counter_hw_event *hw_event = &counter->hw_event;
        struct hw_perf_counter *hwc = &counter->hw;
        int err;

        if (unlikely(!perf_counters_initialized))
                return -EINVAL;

        err = 0;
        if (!atomic_inc_not_zero(&num_counters)) {
                mutex_lock(&pmc_reserve_mutex);
                if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
                        err = -EBUSY;
                else
                        atomic_inc(&num_counters);
                mutex_unlock(&pmc_reserve_mutex);
        }
        if (err)
                return err;

        /*
         * Generate PMC IRQs:
         * (keep 'enabled' bit clear for now)
         */
        hwc->config = ARCH_PERFMON_EVENTSEL_INT;

        /*
         * Count user and OS events unless requested not to.
         */
        if (!hw_event->exclude_user)
                hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
        if (!hw_event->exclude_kernel)
                hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

        /*
         * If privileged enough, allow NMI events:
         */
        hwc->nmi = 0;
        if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
                hwc->nmi = 1;

        hwc->irq_period         = hw_event->irq_period;
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
         * the generic counter period:
         */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
                        hwc->irq_period = 0x7FFFFFFF;

        atomic64_set(&hwc->period_left, hwc->irq_period);

        /*
         * Raw event types provide the config directly in the event structure:
         */
        if (perf_event_raw(hw_event)) {
                hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
        } else {
                if (perf_event_id(hw_event) >= pmc_ops->max_events)
                        return -EINVAL;
                /*
                 * The generic map:
                 */
                hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
        }

        counter->destroy = hw_perf_counter_destroy;

        return 0;
}

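/*
 * Globally disable all counters by clearing MSR_CORE_PERF_GLOBAL_CTRL
 * and return its previous value so hw_perf_restore() can re-enable
 * them later.
 */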
static u64 pmc_intel_save_disable_all(void)
{
        u64 ctrl;

        rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);

        return ctrl;
}

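/*
 * AMD CPUs have no global control MSR, so mark this CPU disabled and
 * then clear the enable bit of every active counter's evntsel
 * individually. Returns the previous per-cpu 'enabled' state.
 */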
static u64 pmc_amd_save_disable_all(void)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int enabled, idx;

        enabled = cpuc->enabled;
        cpuc->enabled = 0;
        /*
         * Ensure we write the disable before we start disabling the
         * individual counters, so that pmc_amd_enable() does the right thing.
         */
        barrier();

        for (idx = 0; idx < nr_counters_generic; idx++) {
                u64 val;

                if (!test_bit(idx, cpuc->active_mask))
                        continue;
                rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
                if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
                        continue;
                val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
                wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
        }

        return enabled;
}

u64 hw_perf_save_disable(void)
{
        if (unlikely(!perf_counters_initialized))
                return 0;

        return pmc_ops->save_disable_all();
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_save_disable);

static void pmc_intel_restore_all(u64 ctrl)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
}

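/*
 * AMD counterpart of pmc_intel_restore_all(): mark the CPU enabled
 * again and set the enable bit of every active counter, unless the
 * saved state says the counters were globally disabled.
 */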
static void pmc_amd_restore_all(u64 ctrl)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int idx;

        cpuc->enabled = ctrl;
        barrier();
        if (!ctrl)
                return;

        for (idx = 0; idx < nr_counters_generic; idx++) {
                u64 val;

                if (!test_bit(idx, cpuc->active_mask))
                        continue;
                rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
                if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
                        continue;
                val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
                wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
        }
}

void hw_perf_restore(u64 ctrl)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->restore_all(ctrl);
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_restore);
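/*
 * The exported pair is meant to be used roughly like this, e.g. around
 * an ACPI idle state (a sketch, not taken from an actual caller):
 *
 *      u64 ctrl = hw_perf_save_disable();
 *      ...enter the idle state...
 *      hw_perf_restore(ctrl);
 */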

static u64 pmc_intel_get_status(u64 mask)
{
        u64 status;

        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

        return status;
}

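/*
 * AMD has no global overflow status register: derive one by reading
 * each requested counter and reporting those whose top implemented bit
 * is clear, i.e. counters that wrapped past the (negative) offset they
 * were programmed with.
 */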
static u64 pmc_amd_get_status(u64 mask)
{
        u64 status = 0;
        int idx;

        for (idx = 0; idx < nr_counters_generic; idx++) {
                s64 val;

                if (!(mask & (1 << idx)))
                        continue;

                rdmsrl(MSR_K7_PERFCTR0 + idx, val);
                val <<= (64 - counter_value_bits);
                if (val >= 0)
                        status |= (1 << idx);
        }

        return status;
}

static u64 hw_perf_get_status(u64 mask)
{
        if (unlikely(!perf_counters_initialized))
                return 0;

        return pmc_ops->get_status(mask);
}

static void pmc_intel_ack_status(u64 ack)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static void pmc_amd_ack_status(u64 ack)
{
}

static void hw_perf_ack_status(u64 ack)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->ack_status(ack);
}

static void pmc_intel_enable(int idx, u64 config)
{
        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
                        config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}

static void pmc_amd_enable(int idx, u64 config)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

        set_bit(idx, cpuc->active_mask);
        if (cpuc->enabled)
                config |= ARCH_PERFMON_EVENTSEL0_ENABLE;

        wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
}

static void hw_perf_enable(int idx, u64 config)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->enable(idx, config);
}

static void pmc_intel_disable(int idx, u64 config)
{
        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
}

static void pmc_amd_disable(int idx, u64 config)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

        clear_bit(idx, cpuc->active_mask);
        wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
}

static void hw_perf_disable(int idx, u64 config)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->disable(idx, config);
}

static inline void
__pmc_fixed_disable(struct perf_counter *counter,
                    struct hw_perf_counter *hwc, unsigned int __idx)
{
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
        int err;

        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
__pmc_generic_disable(struct perf_counter *counter,
                           struct hw_perf_counter *hwc, unsigned int idx)
{
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
                __pmc_fixed_disable(counter, hwc, idx);
        else
                hw_perf_disable(idx, hwc->config);
}

static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static void
__hw_perf_counter_set_period(struct perf_counter *counter,
                             struct hw_perf_counter *hwc, int idx)
{
        s64 left = atomic64_read(&hwc->period_left);
        s64 period = hwc->irq_period;
        int err;

        /*
         * If we are way outside a reasonable range then just skip forward:
         */
        if (unlikely(left <= -period)) {
                left = period;
                atomic64_set(&hwc->period_left, left);
        }

        if (unlikely(left <= 0)) {
                left += period;
                atomic64_set(&hwc->period_left, left);
        }

        per_cpu(prev_left[idx], smp_processor_id()) = left;

        /*
         * The hw counter starts counting from this counter offset,
         * mark it to be able to extract future deltas:
         */
        atomic64_set(&hwc->prev_count, (u64)-left);

        err = checking_wrmsrl(hwc->counter_base + idx,
                             (u64)(-left) & counter_value_mask);
}

static inline void
__pmc_fixed_enable(struct perf_counter *counter,
                   struct hw_perf_counter *hwc, unsigned int __idx)
{
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, bits, mask;
        int err;

        /*
         * Enable IRQ generation (0x8),
         * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
         * if requested:
         */
        bits = 0x8ULL;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
                bits |= 0x2;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
                bits |= 0x1;
        bits <<= (idx * 4);
        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        ctrl_val |= bits;
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static void
__pmc_generic_enable(struct perf_counter *counter,
                          struct hw_perf_counter *hwc, int idx)
{
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
                __pmc_fixed_enable(counter, hwc, idx);
        else
                hw_perf_enable(idx, hwc->config);
}

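/*
 * Map a generic hardware event onto one of the Intel fixed-purpose
 * counters (instructions, cpu-cycles, bus-cycles) if possible;
 * return -1 to fall back to a generic counter.
 */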
static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
        unsigned int event;

        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                return -1;

        if (unlikely(hwc->nmi))
                return -1;

        event = hwc->config & ARCH_PERFMON_EVENT_MASK;

        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
                return X86_PMC_IDX_FIXED_INSTRUCTIONS;
        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
                return X86_PMC_IDX_FIXED_CPU_CYCLES;
        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
                return X86_PMC_IDX_FIXED_BUS_CYCLES;

        return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int pmc_generic_enable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        int idx;

        idx = fixed_mode_idx(counter, hwc);
        if (idx >= 0) {
                /*
                 * Try to get the fixed counter, if that is already taken
                 * then try to get a generic counter:
                 */
                if (test_and_set_bit(idx, cpuc->used))
                        goto try_generic;

                hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
                /*
                 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
                 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
                 */
                hwc->counter_base =
                        MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
                hwc->idx = idx;
        } else {
                idx = hwc->idx;
                /* Try to get the previous generic counter again */
                if (test_and_set_bit(idx, cpuc->used)) {
try_generic:
                        idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
                        if (idx == nr_counters_generic)
                                return -EAGAIN;

                        set_bit(idx, cpuc->used);
                        hwc->idx = idx;
                }
                hwc->config_base  = pmc_ops->eventsel;
                hwc->counter_base = pmc_ops->perfctr;
        }

        perf_counters_lapic_init(hwc->nmi);

        __pmc_generic_disable(counter, hwc, idx);

        cpuc->counters[idx] = counter;
        /*
         * Make it visible before enabling the hw:
         */
        smp_wmb();

        __hw_perf_counter_set_period(counter, hwc, idx);
        __pmc_generic_enable(counter, hwc, idx);

        return 0;
}

void perf_counter_print_debug(void)
{
        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
        struct cpu_hw_counters *cpuc;
        int cpu, idx;

        if (!nr_counters_generic)
                return;

        local_irq_disable();

        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);

        if (intel_perfmon_version >= 2) {
                rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
                rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
                rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

                pr_info("\n");
                pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
                pr_info("CPU#%d: status:     %016llx\n", cpu, status);
                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
        }
        pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);

        for (idx = 0; idx < nr_counters_generic; idx++) {
                rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
                rdmsrl(pmc_ops->perfctr  + idx, pmc_count);

                prev_left = per_cpu(prev_left[idx], cpu);

                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
                pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
                pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
                        cpu, idx, prev_left);
        }
        for (idx = 0; idx < nr_counters_fixed; idx++) {
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
        }
        local_irq_enable();
}

static void pmc_generic_disable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        unsigned int idx = hwc->idx;

        __pmc_generic_disable(counter, hwc, idx);

        clear_bit(idx, cpuc->used);
        cpuc->counters[idx] = NULL;
        /*
         * Make sure the cleared pointer becomes visible before we
         * (potentially) free the counter:
         */
        smp_wmb();

        /*
         * Drain the remaining delta count out of a counter
         * that we are disabling:
         */
        x86_perf_counter_update(counter, hwc, idx);
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static void perf_save_and_restart(struct perf_counter *counter)
{
        struct hw_perf_counter *hwc = &counter->hw;
        int idx = hwc->idx;

        x86_perf_counter_update(counter, hwc, idx);
        __hw_perf_counter_set_period(counter, hwc, idx);

        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
                __pmc_generic_enable(counter, hwc, idx);
}

/*
 * Maximum interrupt frequency of 100KHz per CPU
 */
#define PERFMON_MAX_INTERRUPTS (100000/HZ)

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
        int bit, cpu = smp_processor_id();
        u64 ack, status;
        struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
        int ret = 0;

        cpuc->throttle_ctrl = hw_perf_save_disable();

        status = hw_perf_get_status(cpuc->throttle_ctrl);
        if (!status)
                goto out;

        ret = 1;
again:
        inc_irq_stat(apic_perf_irqs);
        ack = status;
        for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_counter *counter = cpuc->counters[bit];

                clear_bit(bit, (unsigned long *) &status);
                if (!counter)
                        continue;

                perf_save_and_restart(counter);
                if (perf_counter_overflow(counter, nmi, regs, 0))
                        __pmc_generic_disable(counter, &counter->hw, bit);
        }

        hw_perf_ack_status(ack);

        /*
         * Repeat if there is more work to be done:
         */
        status = hw_perf_get_status(cpuc->throttle_ctrl);
        if (status)
                goto again;
out:
        /*
         * Restore - do not reenable when global enable is off or throttled:
         */
        if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
                hw_perf_restore(cpuc->throttle_ctrl);

        return ret;
}

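/*
 * Reset the interrupt throttle: if this CPU hit PERFMON_MAX_INTERRUPTS
 * and was left disabled by the interrupt handler, re-enable its
 * counters and clear the per-cpu interrupt count.
 */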
void perf_counter_unthrottle(void)
{
        struct cpu_hw_counters *cpuc;

        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return;

        if (unlikely(!perf_counters_initialized))
                return;

        cpuc = &__get_cpu_var(cpu_hw_counters);
        if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
                if (printk_ratelimit())
                        printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
                hw_perf_restore(cpuc->throttle_ctrl);
        }
        cpuc->interrupts = 0;
}

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
        irq_enter();
        apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        ack_APIC_irq();
        __smp_perf_counter_interrupt(regs, 0);
        irq_exit();
}

void smp_perf_pending_interrupt(struct pt_regs *regs)
{
        irq_enter();
        ack_APIC_irq();
        inc_irq_stat(apic_pending_irqs);
        perf_counter_do_pending();
        irq_exit();
}

void set_perf_counter_pending(void)
{
        apic->send_IPI_self(LOCAL_PENDING_VECTOR);
}

void perf_counters_lapic_init(int nmi)
{
        u32 apic_val;

        if (!perf_counters_initialized)
                return;
        /*
         * Enable the performance counter vector in the APIC LVT:
         */
        apic_val = apic_read(APIC_LVTERR);

        apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
        if (nmi)
                apic_write(APIC_LVTPC, APIC_DM_NMI);
        else
                apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        apic_write(APIC_LVTERR, apic_val);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
                         unsigned long cmd, void *__args)
{
        struct die_args *args = __args;
        struct pt_regs *regs;
        int ret;

        switch (cmd) {
        case DIE_NMI:
        case DIE_NMI_IPI:
                break;

        default:
                return NOTIFY_DONE;
        }

        regs = args->regs;

        apic_write(APIC_LVTPC, APIC_DM_NMI);
        ret = __smp_perf_counter_interrupt(regs, 1);

        return ret ? NOTIFY_STOP : NOTIFY_OK;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
        .notifier_call          = perf_counter_nmi_handler,
        .next                   = NULL,
        .priority               = 1
};

static struct pmc_x86_ops pmc_intel_ops = {
        .save_disable_all       = pmc_intel_save_disable_all,
        .restore_all            = pmc_intel_restore_all,
        .get_status             = pmc_intel_get_status,
        .ack_status             = pmc_intel_ack_status,
        .enable                 = pmc_intel_enable,
        .disable                = pmc_intel_disable,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
        .event_map              = pmc_intel_event_map,
        .raw_event              = pmc_intel_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
};

static struct pmc_x86_ops pmc_amd_ops = {
        .save_disable_all       = pmc_amd_save_disable_all,
        .restore_all            = pmc_amd_restore_all,
        .get_status             = pmc_amd_get_status,
        .ack_status             = pmc_amd_ack_status,
        .enable                 = pmc_amd_enable,
        .disable                = pmc_amd_disable,
        .eventsel               = MSR_K7_EVNTSEL0,
        .perfctr                = MSR_K7_PERFCTR0,
        .event_map              = pmc_amd_event_map,
        .raw_event              = pmc_amd_raw_event,
        .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
};

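/*
 * Probe architectural perfmon via CPUID leaf 0xA and fill in the global
 * counter parameters; returns NULL when version 2+ perfmon (with a
 * global control MSR) is not available.
 */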
static struct pmc_x86_ops *pmc_intel_init(void)
{
        union cpuid10_edx edx;
        union cpuid10_eax eax;
        unsigned int unused;
        unsigned int ebx;

        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return NULL;

        /*
         * Check whether the Architectural PerfMon supports
         * Branch Misses Retired Event or not.
         */
        cpuid(10, &eax.full, &ebx, &unused, &edx.full);
        if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
                return NULL;

        intel_perfmon_version = eax.split.version_id;
        if (intel_perfmon_version < 2)
                return NULL;

        pr_info("Intel Performance Monitoring support detected.\n");
        pr_info("... version:         %d\n", intel_perfmon_version);
        pr_info("... bit width:       %d\n", eax.split.bit_width);
        pr_info("... mask length:     %d\n", eax.split.mask_length);

        nr_counters_generic = eax.split.num_counters;
        nr_counters_fixed = edx.split.num_counters_fixed;
        counter_value_mask = (1ULL << eax.split.bit_width) - 1;

        return &pmc_intel_ops;
}

static struct pmc_x86_ops *pmc_amd_init(void)
{
        nr_counters_generic = 4;
        nr_counters_fixed = 0;
        counter_value_mask = 0x0000FFFFFFFFFFFFULL;
        counter_value_bits = 48;

        pr_info("AMD Performance Monitoring support detected.\n");

        return &pmc_amd_ops;
}

void __init init_hw_perf_counters(void)
{
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_INTEL:
                pmc_ops = pmc_intel_init();
                break;
        case X86_VENDOR_AMD:
                pmc_ops = pmc_amd_init();
                break;
        default:
                return;
        }
        if (!pmc_ops)
                return;

        pr_info("... num counters:    %d\n", nr_counters_generic);
        if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
                nr_counters_generic = X86_PMC_MAX_GENERIC;
                WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
                        nr_counters_generic, X86_PMC_MAX_GENERIC);
        }
        perf_counter_mask = (1 << nr_counters_generic) - 1;
        perf_max_counters = nr_counters_generic;

        pr_info("... value mask:      %016Lx\n", counter_value_mask);

        if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
                nr_counters_fixed = X86_PMC_MAX_FIXED;
                WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
                        nr_counters_fixed, X86_PMC_MAX_FIXED);
        }
        pr_info("... fixed counters:  %d\n", nr_counters_fixed);

        perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;

        pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
        perf_counters_initialized = true;

        perf_counters_lapic_init(0);
        register_die_notifier(&perf_counter_nmi_notifier);
}

static void pmc_generic_read(struct perf_counter *counter)
{
        x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

static const struct hw_perf_counter_ops x86_perf_counter_ops = {
        .enable         = pmc_generic_enable,
        .disable        = pmc_generic_disable,
        .read           = pmc_generic_read,
};

const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
        int err;

        err = __hw_perf_counter_init(counter);
        if (err)
                return ERR_PTR(err);

        return &x86_perf_counter_ops;
}

/*
 * callchain support
 */

static inline
void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
{
        if (entry->nr < MAX_STACK_DEPTH)
                entry->ip[entry->nr++] = ip;
}

static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);

static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
        /* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
        /* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
        /* Don't bother with IRQ stacks for now */
        return -1;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
        struct perf_callchain_entry *entry = data;

        if (reliable)
                callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
        .warning                = backtrace_warning,
        .warning_symbol         = backtrace_warning_symbol,
        .stack                  = backtrace_stack,
        .address                = backtrace_address,
};

static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
        unsigned long bp;
        char *stack;
        int nr = entry->nr;

        callchain_store(entry, instruction_pointer(regs));

        stack = ((char *)regs + sizeof(struct pt_regs));
#ifdef CONFIG_FRAME_POINTER
        bp = frame_pointer(regs);
#else
        bp = 0;
#endif

        dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);

        entry->kernel = entry->nr - nr;
}

struct stack_frame {
        const void __user       *next_fp;
        unsigned long           return_address;
};

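/*
 * Copy one user-space stack frame, with page faults disabled so this
 * is safe from NMI/IRQ context; returns 0 if the frame is not
 * accessible.
 */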
static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
{
        int ret;

        if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
                return 0;

        ret = 1;
        pagefault_disable();
        if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
                ret = 0;
        pagefault_enable();

        return ret;
}

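/*
 * Walk the user stack by following the saved frame-pointer chain,
 * recording one return address per frame until the chain ends or
 * MAX_STACK_DEPTH entries have been stored.
 */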
static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
        struct stack_frame frame;
        const void __user *fp;
        int nr = entry->nr;

        regs = (struct pt_regs *)current->thread.sp0 - 1;
        fp   = (void __user *)regs->bp;

        callchain_store(entry, regs->ip);

        while (entry->nr < MAX_STACK_DEPTH) {
                frame.next_fp        = NULL;
                frame.return_address = 0;

                if (!copy_stack_frame(fp, &frame))
                        break;

                if ((unsigned long)fp < user_stack_pointer(regs))
                        break;

                callchain_store(entry, frame.return_address);
                fp = frame.next_fp;
        }

        entry->user = entry->nr - nr;
}

static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
        int is_user;

        if (!regs)
                return;

        is_user = user_mode(regs);

        if (!current || current->pid == 0)
                return;

        if (is_user && current->state != TASK_RUNNING)
                return;

        if (!is_user)
                perf_callchain_kernel(regs, entry);

        if (current->mm)
                perf_callchain_user(regs, entry);
}

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
        struct perf_callchain_entry *entry;

        if (in_nmi())
                entry = &__get_cpu_var(nmi_entry);
        else
                entry = &__get_cpu_var(irq_entry);

        entry->nr = 0;
        entry->hv = 0;
        entry->kernel = 0;
        entry->user = 0;

        perf_do_callchain(regs, entry);

        return entry;
}