1 /*
2  * Performance events x86 architecture code
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2009 Jaswinder Singh Rajput
7  *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9  *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10  *  Copyright (C) 2009 Google, Inc., Stephane Eranian
11  *
12  *  For licencing details see kernel-base/COPYING
13  */
14
15 #include <linux/perf_event.h>
16 #include <linux/capability.h>
17 #include <linux/notifier.h>
18 #include <linux/hardirq.h>
19 #include <linux/kprobes.h>
20 #include <linux/module.h>
21 #include <linux/kdebug.h>
22 #include <linux/sched.h>
23 #include <linux/uaccess.h>
24 #include <linux/slab.h>
25 #include <linux/highmem.h>
26 #include <linux/cpu.h>
27 #include <linux/bitops.h>
28
29 #include <asm/apic.h>
30 #include <asm/stacktrace.h>
31 #include <asm/nmi.h>
32 #include <asm/compat.h>
33 #include <asm/smp.h>
34 #include <asm/alternative.h>
35
36 #if 0
37 #undef wrmsrl
38 #define wrmsrl(msr, val)                                        \
39 do {                                                            \
40         trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
41                         (unsigned long)(val));                  \
42         native_write_msr((msr), (u32)((u64)(val)),              \
43                         (u32)((u64)(val) >> 32));               \
44 } while (0)
45 #endif
46
47 /*
48  *          |   NHM/WSM    |      SNB     |
49  * register -------------------------------
50  *          |  HT  | no HT |  HT  | no HT |
51  *-----------------------------------------
52  * offcore  | core | core  | cpu  | core  |
53  * lbr_sel  | core | core  | cpu  | core  |
54  * ld_lat   | cpu  | core  | cpu  | core  |
55  *-----------------------------------------
56  *
57  * Given that there is a small number of shared regs,
58  * we can pre-allocate their slot in the per-cpu
59  * per-core reg tables.
60  */
61 enum extra_reg_type {
62         EXTRA_REG_NONE  = -1,   /* not used */
63
64         EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
65         EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
66
67         EXTRA_REG_MAX           /* number of entries needed */
68 };
69
70 /*
71  * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
72  */
73 static unsigned long
74 copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
75 {
76         unsigned long offset, addr = (unsigned long)from;
77         unsigned long size, len = 0;
78         struct page *page;
79         void *map;
80         int ret;
81
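        /*
         * Copy one page at a time: in IRQ/NMI context we must not take a
         * page fault, so pin each user page with __get_user_pages_fast()
         * (which never sleeps) and copy through a kmap_atomic() mapping.
         * If a page is not present the copy stops short (best effort).
         */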
82         do {
83                 ret = __get_user_pages_fast(addr, 1, 0, &page);
84                 if (!ret)
85                         break;
86
87                 offset = addr & (PAGE_SIZE - 1);
88                 size = min(PAGE_SIZE - offset, n - len);
89
90                 map = kmap_atomic(page);
91                 memcpy(to, map+offset, size);
92                 kunmap_atomic(map);
93                 put_page(page);
94
95                 len  += size;
96                 to   += size;
97                 addr += size;
98
99         } while (len < n);
100
101         return len;
102 }
103
104 struct event_constraint {
105         union {
106                 unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
107                 u64             idxmsk64;
108         };
109         u64     code;
110         u64     cmask;
111         int     weight;
112 };
113
114 struct amd_nb {
115         int nb_id;  /* NorthBridge id */
116         int refcnt; /* reference count */
117         struct perf_event *owners[X86_PMC_IDX_MAX];
118         struct event_constraint event_constraints[X86_PMC_IDX_MAX];
119 };
120
121 struct intel_percore;
122
123 #define MAX_LBR_ENTRIES         16
124
125 struct cpu_hw_events {
126         /*
127          * Generic x86 PMC bits
128          */
129         struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
130         unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
131         unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
132         int                     enabled;
133
134         int                     n_events;
135         int                     n_added;
136         int                     n_txn;
137         int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
138         u64                     tags[X86_PMC_IDX_MAX];
139         struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
140
141         unsigned int            group_flag;
142
143         /*
144          * Intel DebugStore bits
145          */
146         struct debug_store      *ds;
147         u64                     pebs_enabled;
148
149         /*
150          * Intel LBR bits
151          */
152         int                             lbr_users;
153         void                            *lbr_context;
154         struct perf_branch_stack        lbr_stack;
155         struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
156
157         /*
158          * manage shared (per-core, per-cpu) registers
159          * used on Intel NHM/WSM/SNB
160          */
161         struct intel_shared_regs        *shared_regs;
162
163         /*
164          * AMD specific bits
165          */
166         struct amd_nb           *amd_nb;
167 };
168
169 #define __EVENT_CONSTRAINT(c, n, m, w) {\
170         { .idxmsk64 = (n) },            \
171         .code = (c),                    \
172         .cmask = (m),                   \
173         .weight = (w),                  \
174 }
175
176 #define EVENT_CONSTRAINT(c, n, m)       \
177         __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
178
179 /*
180  * Constraint on the Event code.
181  */
182 #define INTEL_EVENT_CONSTRAINT(c, n)    \
183         EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
184
185 /*
186  * Constraint on the Event code + UMask + fixed-mask
187  *
188  * filter mask to validate fixed counter events.
189  * the following filters disqualify for fixed counters:
190  *  - inv
191  *  - edge
192  *  - cnt-mask
193  *  The other filters are supported by fixed counters.
194  *  The any-thread option is supported starting with v3.
195  */
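/* fixed counters occupy index bits 32 and up (X86_PMC_IDX_FIXED) in the mask */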
196 #define FIXED_EVENT_CONSTRAINT(c, n)    \
197         EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
198
199 /*
200  * Constraint on the Event code + UMask
201  */
202 #define INTEL_UEVENT_CONSTRAINT(c, n)   \
203         EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
204
205 #define EVENT_CONSTRAINT_END            \
206         EVENT_CONSTRAINT(0, 0, 0)
207
208 #define for_each_event_constraint(e, c) \
209         for ((e) = (c); (e)->weight; (e)++)
210
211 /*
212  * Per register state.
213  */
214 struct er_account {
215         raw_spinlock_t          lock;   /* per-core: protect structure */
216         u64                     config; /* extra MSR config */
217         u64                     reg;    /* extra MSR number */
218         atomic_t                ref;    /* reference count */
219 };
220
221 /*
222  * Extra registers for specific events.
223  *
224  * Some events need large masks and require external MSRs.
225  * Those extra MSRs end up being shared for all events on
226  * a PMU and sometimes between PMU of sibling HT threads.
227  * In either case, the kernel needs to handle conflicting
228  * accesses to those extra, shared, regs. The data structure
229  * to manage those registers is stored in cpu_hw_event.
230  */
231 struct extra_reg {
232         unsigned int            event;
233         unsigned int            msr;
234         u64                     config_mask;
235         u64                     valid_mask;
236         int                     idx;  /* per_xxx->regs[] reg index */
237 };
238
239 #define EVENT_EXTRA_REG(e, ms, m, vm, i) {      \
240         .event = (e),           \
241         .msr = (ms),            \
242         .config_mask = (m),     \
243         .valid_mask = (vm),     \
244         .idx = EXTRA_REG_##i    \
245         }
246
247 #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)      \
248         EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
249
250 #define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
251
252 union perf_capabilities {
253         struct {
254                 u64     lbr_format    : 6;
255                 u64     pebs_trap     : 1;
256                 u64     pebs_arch_reg : 1;
257                 u64     pebs_format   : 4;
258                 u64     smm_freeze    : 1;
259         };
260         u64     capabilities;
261 };
262
263 /*
264  * struct x86_pmu - generic x86 pmu
265  */
266 struct x86_pmu {
267         /*
268          * Generic x86 PMC bits
269          */
270         const char      *name;
271         int             version;
272         int             (*handle_irq)(struct pt_regs *);
273         void            (*disable_all)(void);
274         void            (*enable_all)(int added);
275         void            (*enable)(struct perf_event *);
276         void            (*disable)(struct perf_event *);
277         int             (*hw_config)(struct perf_event *event);
278         int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
279         unsigned        eventsel;
280         unsigned        perfctr;
281         u64             (*event_map)(int);
282         int             max_events;
283         int             num_counters;
284         int             num_counters_fixed;
285         int             cntval_bits;
286         u64             cntval_mask;
287         int             apic;
288         u64             max_period;
289         struct event_constraint *
290                         (*get_event_constraints)(struct cpu_hw_events *cpuc,
291                                                  struct perf_event *event);
292
293         void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
294                                                  struct perf_event *event);
295         struct event_constraint *event_constraints;
296         void            (*quirks)(void);
297         int             perfctr_second_write;
298
299         int             (*cpu_prepare)(int cpu);
300         void            (*cpu_starting)(int cpu);
301         void            (*cpu_dying)(int cpu);
302         void            (*cpu_dead)(int cpu);
303
304         /*
305          * Intel Arch Perfmon v2+
306          */
307         u64                     intel_ctrl;
308         union perf_capabilities intel_cap;
309
310         /*
311          * Intel DebugStore bits
312          */
313         int             bts, pebs;
314         int             bts_active, pebs_active;
315         int             pebs_record_size;
316         void            (*drain_pebs)(struct pt_regs *regs);
317         struct event_constraint *pebs_constraints;
318
319         /*
320          * Intel LBR
321          */
322         unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
323         int             lbr_nr;                    /* hardware stack size */
324
325         /*
326          * Extra registers for events
327          */
328         struct extra_reg *extra_regs;
329         unsigned int er_flags;
330 };
331
332 #define ERF_NO_HT_SHARING       1
333 #define ERF_HAS_RSP_1           2
334
335 static struct x86_pmu x86_pmu __read_mostly;
336
337 static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
338         .enabled = 1,
339 };
340
341 static int x86_perf_event_set_period(struct perf_event *event);
342
343 /*
344  * Generalized hw caching related hw_event table, filled
345  * in on a per model basis. A value of 0 means
346  * 'not supported', -1 means 'hw_event makes no sense on
347  * this CPU', any other value means the raw hw_event
348  * ID.
349  */
350
351 #define C(x) PERF_COUNT_HW_CACHE_##x
352
353 static u64 __read_mostly hw_cache_event_ids
354                                 [PERF_COUNT_HW_CACHE_MAX]
355                                 [PERF_COUNT_HW_CACHE_OP_MAX]
356                                 [PERF_COUNT_HW_CACHE_RESULT_MAX];
357 static u64 __read_mostly hw_cache_extra_regs
358                                 [PERF_COUNT_HW_CACHE_MAX]
359                                 [PERF_COUNT_HW_CACHE_OP_MAX]
360                                 [PERF_COUNT_HW_CACHE_RESULT_MAX];
361
362 /*
363  * Propagate event elapsed time into the generic event.
364  * Can only be executed on the CPU where the event is active.
365  * Returns the delta events processed.
366  */
367 static u64
368 x86_perf_event_update(struct perf_event *event)
369 {
370         struct hw_perf_event *hwc = &event->hw;
371         int shift = 64 - x86_pmu.cntval_bits;
372         u64 prev_raw_count, new_raw_count;
373         int idx = hwc->idx;
374         s64 delta;
375
376         if (idx == X86_PMC_IDX_FIXED_BTS)
377                 return 0;
378
379         /*
380          * Careful: an NMI might modify the previous event value.
381          *
382          * Our tactic to handle this is to first atomically read and
383          * exchange a new raw count - then add that new-prev delta
384          * count to the generic event atomically:
385          */
386 again:
387         prev_raw_count = local64_read(&hwc->prev_count);
388         rdmsrl(hwc->event_base, new_raw_count);
389
390         if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
391                                         new_raw_count) != prev_raw_count)
392                 goto again;
393
394         /*
395          * Now we have the new raw value and have updated the prev
396          * timestamp already. We can now calculate the elapsed delta
397          * (event-)time and add that to the generic event.
398          *
399          * Careful, not all hw sign-extends above the physical width
400          * of the count.
401          */
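        /*
         * Shift both counts up so only the low cntval_bits take part in
         * the subtraction, then arithmetic-shift the difference back down;
         * this yields the correct delta even across a counter wrap.
         */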
402         delta = (new_raw_count << shift) - (prev_raw_count << shift);
403         delta >>= shift;
404
405         local64_add(delta, &event->count);
406         local64_sub(delta, &hwc->period_left);
407
408         return new_raw_count;
409 }
410
411 static inline int x86_pmu_addr_offset(int index)
412 {
413         int offset;
414
415         /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
416         alternative_io(ASM_NOP2,
417                        "shll $1, %%eax",
418                        X86_FEATURE_PERFCTR_CORE,
419                        "=a" (offset),
420                        "a"  (index));
421
422         return offset;
423 }
424
425 static inline unsigned int x86_pmu_config_addr(int index)
426 {
427         return x86_pmu.eventsel + x86_pmu_addr_offset(index);
428 }
429
430 static inline unsigned int x86_pmu_event_addr(int index)
431 {
432         return x86_pmu.perfctr + x86_pmu_addr_offset(index);
433 }
434
435 /*
436  * Find and validate any extra registers to set up.
437  */
438 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
439 {
440         struct hw_perf_event_extra *reg;
441         struct extra_reg *er;
442
443         reg = &event->hw.extra_reg;
444
445         if (!x86_pmu.extra_regs)
446                 return 0;
447
448         for (er = x86_pmu.extra_regs; er->msr; er++) {
449                 if (er->event != (config & er->config_mask))
450                         continue;
451                 if (event->attr.config1 & ~er->valid_mask)
452                         return -EINVAL;
453
454                 reg->idx = er->idx;
455                 reg->config = event->attr.config1;
456                 reg->reg = er->msr;
457                 break;
458         }
459         return 0;
460 }
461
462 static atomic_t active_events;
463 static DEFINE_MUTEX(pmc_reserve_mutex);
464
465 #ifdef CONFIG_X86_LOCAL_APIC
466
467 static bool reserve_pmc_hardware(void)
468 {
469         int i;
470
471         for (i = 0; i < x86_pmu.num_counters; i++) {
472                 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
473                         goto perfctr_fail;
474         }
475
476         for (i = 0; i < x86_pmu.num_counters; i++) {
477                 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
478                         goto eventsel_fail;
479         }
480
481         return true;
482
483 eventsel_fail:
484         for (i--; i >= 0; i--)
485                 release_evntsel_nmi(x86_pmu_config_addr(i));
486
487         i = x86_pmu.num_counters;
488
489 perfctr_fail:
490         for (i--; i >= 0; i--)
491                 release_perfctr_nmi(x86_pmu_event_addr(i));
492
493         return false;
494 }
495
496 static void release_pmc_hardware(void)
497 {
498         int i;
499
500         for (i = 0; i < x86_pmu.num_counters; i++) {
501                 release_perfctr_nmi(x86_pmu_event_addr(i));
502                 release_evntsel_nmi(x86_pmu_config_addr(i));
503         }
504 }
505
506 #else
507
508 static bool reserve_pmc_hardware(void) { return true; }
509 static void release_pmc_hardware(void) {}
510
511 #endif
512
513 static bool check_hw_exists(void)
514 {
515         u64 val, val_new = 0;
516         int i, reg, ret = 0;
517
518         /*
519          * Check to see if the BIOS enabled any of the counters; if so,
520          * complain and bail.
521          */
522         for (i = 0; i < x86_pmu.num_counters; i++) {
523                 reg = x86_pmu_config_addr(i);
524                 ret = rdmsrl_safe(reg, &val);
525                 if (ret)
526                         goto msr_fail;
527                 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
528                         goto bios_fail;
529         }
530
531         if (x86_pmu.num_counters_fixed) {
532                 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
533                 ret = rdmsrl_safe(reg, &val);
534                 if (ret)
535                         goto msr_fail;
536                 for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
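                        /*
                         * Each fixed counter owns a 4-bit field in this MSR;
                         * non-zero enable bits (0-1) mean the BIOS left the
                         * counter running.
                         */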
537                         if (val & (0x03 << i*4))
538                                 goto bios_fail;
539                 }
540         }
541
542         /*
543          * Now write a value and read it back to see if it matches;
544          * this is needed to detect certain hardware emulators (qemu/kvm)
545          * that don't trap on the MSR access and always return 0s.
546          */
547         val = 0xabcdUL;
548         ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
549         ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
550         if (ret || val != val_new)
551                 goto msr_fail;
552
553         return true;
554
555 bios_fail:
556         /*
557          * We still allow the PMU driver to operate:
558          */
559         printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
560         printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
561
562         return true;
563
564 msr_fail:
565         printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
566
567         return false;
568 }
569
570 static void reserve_ds_buffers(void);
571 static void release_ds_buffers(void);
572
573 static void hw_perf_event_destroy(struct perf_event *event)
574 {
575         if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
576                 release_pmc_hardware();
577                 release_ds_buffers();
578                 mutex_unlock(&pmc_reserve_mutex);
579         }
580 }
581
582 static inline int x86_pmu_initialized(void)
583 {
584         return x86_pmu.handle_irq != NULL;
585 }
586
587 static inline int
588 set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
589 {
590         struct perf_event_attr *attr = &event->attr;
591         unsigned int cache_type, cache_op, cache_result;
592         u64 config, val;
593
594         config = attr->config;
595
596         cache_type = (config >>  0) & 0xff;
597         if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
598                 return -EINVAL;
599
600         cache_op = (config >>  8) & 0xff;
601         if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
602                 return -EINVAL;
603
604         cache_result = (config >> 16) & 0xff;
605         if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
606                 return -EINVAL;
607
608         val = hw_cache_event_ids[cache_type][cache_op][cache_result];
609
610         if (val == 0)
611                 return -ENOENT;
612
613         if (val == -1)
614                 return -EINVAL;
615
616         hwc->config |= val;
617         attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
618         return x86_pmu_extra_regs(val, event);
619 }
620
621 static int x86_setup_perfctr(struct perf_event *event)
622 {
623         struct perf_event_attr *attr = &event->attr;
624         struct hw_perf_event *hwc = &event->hw;
625         u64 config;
626
627         if (!is_sampling_event(event)) {
628                 hwc->sample_period = x86_pmu.max_period;
629                 hwc->last_period = hwc->sample_period;
630                 local64_set(&hwc->period_left, hwc->sample_period);
631         } else {
632                 /*
633                  * If we have a PMU initialized but no APIC
634                  * interrupts, we cannot sample hardware
635                  * events (user-space has to fall back and
636                  * sample via a hrtimer based software event):
637                  */
638                 if (!x86_pmu.apic)
639                         return -EOPNOTSUPP;
640         }
641
642         /*
643          * Do not allow config1 (extended registers) to propagate,
644          * there's no sane user-space generalization yet:
645          */
646         if (attr->type == PERF_TYPE_RAW)
647                 return 0;
648
649         if (attr->type == PERF_TYPE_HW_CACHE)
650                 return set_ext_hw_attr(hwc, event);
651
652         if (attr->config >= x86_pmu.max_events)
653                 return -EINVAL;
654
655         /*
656          * The generic map:
657          */
658         config = x86_pmu.event_map(attr->config);
659
660         if (config == 0)
661                 return -ENOENT;
662
663         if (config == -1LL)
664                 return -EINVAL;
665
666         /*
667          * Branch tracing:
668          */
669         if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
670             !attr->freq && hwc->sample_period == 1) {
671                 /* BTS is not supported by this architecture. */
672                 if (!x86_pmu.bts_active)
673                         return -EOPNOTSUPP;
674
675                 /* BTS is currently only allowed for user-mode. */
676                 if (!attr->exclude_kernel)
677                         return -EOPNOTSUPP;
678         }
679
680         hwc->config |= config;
681
682         return 0;
683 }
684
685 static int x86_pmu_hw_config(struct perf_event *event)
686 {
687         if (event->attr.precise_ip) {
688                 int precise = 0;
689
690                 /* Support for constant skid */
691                 if (x86_pmu.pebs_active) {
692                         precise++;
693
694                         /* Support for IP fixup */
695                         if (x86_pmu.lbr_nr)
696                                 precise++;
697                 }
698
699                 if (event->attr.precise_ip > precise)
700                         return -EOPNOTSUPP;
701         }
702
703         /*
704          * Generate PMC IRQs:
705          * (keep 'enabled' bit clear for now)
706          */
707         event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
708
709         /*
710          * Count user and OS events unless requested not to
711          */
712         if (!event->attr.exclude_user)
713                 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
714         if (!event->attr.exclude_kernel)
715                 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
716
717         if (event->attr.type == PERF_TYPE_RAW)
718                 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
719
720         return x86_setup_perfctr(event);
721 }
722
723 /*
724  * Setup the hardware configuration for a given attr_type
725  */
726 static int __x86_pmu_event_init(struct perf_event *event)
727 {
728         int err;
729
730         if (!x86_pmu_initialized())
731                 return -ENODEV;
732
733         err = 0;
734         if (!atomic_inc_not_zero(&active_events)) {
735                 mutex_lock(&pmc_reserve_mutex);
736                 if (atomic_read(&active_events) == 0) {
737                         if (!reserve_pmc_hardware())
738                                 err = -EBUSY;
739                         else
740                                 reserve_ds_buffers();
741                 }
742                 if (!err)
743                         atomic_inc(&active_events);
744                 mutex_unlock(&pmc_reserve_mutex);
745         }
746         if (err)
747                 return err;
748
749         event->destroy = hw_perf_event_destroy;
750
751         event->hw.idx = -1;
752         event->hw.last_cpu = -1;
753         event->hw.last_tag = ~0ULL;
754
755         /* mark unused */
756         event->hw.extra_reg.idx = EXTRA_REG_NONE;
757
758         return x86_pmu.hw_config(event);
759 }
760
761 static void x86_pmu_disable_all(void)
762 {
763         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
764         int idx;
765
766         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
767                 u64 val;
768
769                 if (!test_bit(idx, cpuc->active_mask))
770                         continue;
771                 rdmsrl(x86_pmu_config_addr(idx), val);
772                 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
773                         continue;
774                 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
775                 wrmsrl(x86_pmu_config_addr(idx), val);
776         }
777 }
778
779 static void x86_pmu_disable(struct pmu *pmu)
780 {
781         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
782
783         if (!x86_pmu_initialized())
784                 return;
785
786         if (!cpuc->enabled)
787                 return;
788
789         cpuc->n_added = 0;
790         cpuc->enabled = 0;
791         barrier();
792
793         x86_pmu.disable_all();
794 }
795
796 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
797                                           u64 enable_mask)
798 {
799         if (hwc->extra_reg.reg)
800                 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
801         wrmsrl(hwc->config_base, hwc->config | enable_mask);
802 }
803
804 static void x86_pmu_enable_all(int added)
805 {
806         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
807         int idx;
808
809         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
810                 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
811
812                 if (!test_bit(idx, cpuc->active_mask))
813                         continue;
814
815                 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
816         }
817 }
818
819 static struct pmu pmu;
820
821 static inline int is_x86_event(struct perf_event *event)
822 {
823         return event->pmu == &pmu;
824 }
825
826 static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
827 {
828         struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
829         unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
830         int i, j, w, wmax, num = 0;
831         struct hw_perf_event *hwc;
832
833         bitmap_zero(used_mask, X86_PMC_IDX_MAX);
834
835         for (i = 0; i < n; i++) {
836                 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
837                 constraints[i] = c;
838         }
839
840         /*
841          * fastpath, try to reuse previous register
842          */
843         for (i = 0; i < n; i++) {
844                 hwc = &cpuc->event_list[i]->hw;
845                 c = constraints[i];
846
847                 /* never assigned */
848                 if (hwc->idx == -1)
849                         break;
850
851                 /* constraint still honored */
852                 if (!test_bit(hwc->idx, c->idxmsk))
853                         break;
854
855                 /* not already used */
856                 if (test_bit(hwc->idx, used_mask))
857                         break;
858
859                 __set_bit(hwc->idx, used_mask);
860                 if (assign)
861                         assign[i] = hwc->idx;
862         }
863         if (i == n)
864                 goto done;
865
866         /*
867          * begin slow path
868          */
869
870         bitmap_zero(used_mask, X86_PMC_IDX_MAX);
871
872         /*
873          * weight = number of possible counters
874          *
875          * 1    = most constrained, only works on one counter
876          * wmax = least constrained, works on any counter
877          *
878          * assign events to counters starting with most
879          * constrained events.
880          */
881         wmax = x86_pmu.num_counters;
882
883         /*
884          * when fixed event counters are present,
885          * wmax is incremented by 1 to account
886          * for one more choice
887          */
888         if (x86_pmu.num_counters_fixed)
889                 wmax++;
890
891         for (w = 1, num = n; num && w <= wmax; w++) {
892                 /* for each event */
893                 for (i = 0; num && i < n; i++) {
894                         c = constraints[i];
895                         hwc = &cpuc->event_list[i]->hw;
896
897                         if (c->weight != w)
898                                 continue;
899
900                         for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
901                                 if (!test_bit(j, used_mask))
902                                         break;
903                         }
904
905                         if (j == X86_PMC_IDX_MAX)
906                                 break;
907
908                         __set_bit(j, used_mask);
909
910                         if (assign)
911                                 assign[i] = j;
912                         num--;
913                 }
914         }
915 done:
916         /*
917          * scheduling failed or is just a simulation,
918          * free resources if necessary
919          */
920         if (!assign || num) {
921                 for (i = 0; i < n; i++) {
922                         if (x86_pmu.put_event_constraints)
923                                 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
924                 }
925         }
926         return num ? -ENOSPC : 0;
927 }
928
929 /*
930  * dogrp: true if must collect siblings events (group)
931  * returns total number of events and error code
932  * dogrp: true if we must collect sibling events (group)
933 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
934 {
935         struct perf_event *event;
936         int n, max_count;
937
938         max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
939
940         /* current number of events already accepted */
941         n = cpuc->n_events;
942
943         if (is_x86_event(leader)) {
944                 if (n >= max_count)
945                         return -ENOSPC;
946                 cpuc->event_list[n] = leader;
947                 n++;
948         }
949         if (!dogrp)
950                 return n;
951
952         list_for_each_entry(event, &leader->sibling_list, group_entry) {
953                 if (!is_x86_event(event) ||
954                     event->state <= PERF_EVENT_STATE_OFF)
955                         continue;
956
957                 if (n >= max_count)
958                         return -ENOSPC;
959
960                 cpuc->event_list[n] = event;
961                 n++;
962         }
963         return n;
964 }
965
966 static inline void x86_assign_hw_event(struct perf_event *event,
967                                 struct cpu_hw_events *cpuc, int i)
968 {
969         struct hw_perf_event *hwc = &event->hw;
970
971         hwc->idx = cpuc->assign[i];
972         hwc->last_cpu = smp_processor_id();
973         hwc->last_tag = ++cpuc->tags[i];
974
975         if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
976                 hwc->config_base = 0;
977                 hwc->event_base = 0;
978         } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
979                 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
980                 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
981         } else {
982                 hwc->config_base = x86_pmu_config_addr(hwc->idx);
983                 hwc->event_base  = x86_pmu_event_addr(hwc->idx);
984         }
985 }
986
987 static inline int match_prev_assignment(struct hw_perf_event *hwc,
988                                         struct cpu_hw_events *cpuc,
989                                         int i)
990 {
991         return hwc->idx == cpuc->assign[i] &&
992                 hwc->last_cpu == smp_processor_id() &&
993                 hwc->last_tag == cpuc->tags[i];
994 }
995
996 static void x86_pmu_start(struct perf_event *event, int flags);
997 static void x86_pmu_stop(struct perf_event *event, int flags);
998
999 static void x86_pmu_enable(struct pmu *pmu)
1000 {
1001         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1002         struct perf_event *event;
1003         struct hw_perf_event *hwc;
1004         int i, added = cpuc->n_added;
1005
1006         if (!x86_pmu_initialized())
1007                 return;
1008
1009         if (cpuc->enabled)
1010                 return;
1011
1012         if (cpuc->n_added) {
1013                 int n_running = cpuc->n_events - cpuc->n_added;
1014                 /*
1015                  * apply assignment obtained either from
1016                  * hw_perf_group_sched_in() or x86_pmu_enable()
1017                  *
1018                  * step1: save events moving to new counters
1019                  * step2: reprogram moved events into new counters
1020                  */
1021                 for (i = 0; i < n_running; i++) {
1022                         event = cpuc->event_list[i];
1023                         hwc = &event->hw;
1024
1025                         /*
1026                          * we can avoid reprogramming the counter if:
1027                          * - assigned same counter as last time
1028                          * - running on same CPU as last time
1029                          * - no other event has used the counter since
1030                          */
1031                         if (hwc->idx == -1 ||
1032                             match_prev_assignment(hwc, cpuc, i))
1033                                 continue;
1034
1035                         /*
1036                          * Ensure we don't accidentally enable a stopped
1037                          * counter simply because we rescheduled.
1038                          */
1039                         if (hwc->state & PERF_HES_STOPPED)
1040                                 hwc->state |= PERF_HES_ARCH;
1041
1042                         x86_pmu_stop(event, PERF_EF_UPDATE);
1043                 }
1044
1045                 for (i = 0; i < cpuc->n_events; i++) {
1046                         event = cpuc->event_list[i];
1047                         hwc = &event->hw;
1048
1049                         if (!match_prev_assignment(hwc, cpuc, i))
1050                                 x86_assign_hw_event(event, cpuc, i);
1051                         else if (i < n_running)
1052                                 continue;
1053
1054                         if (hwc->state & PERF_HES_ARCH)
1055                                 continue;
1056
1057                         x86_pmu_start(event, PERF_EF_RELOAD);
1058                 }
1059                 cpuc->n_added = 0;
1060                 perf_events_lapic_init();
1061         }
1062
1063         cpuc->enabled = 1;
1064         barrier();
1065
1066         x86_pmu.enable_all(added);
1067 }
1068
1069 static inline void x86_pmu_disable_event(struct perf_event *event)
1070 {
1071         struct hw_perf_event *hwc = &event->hw;
1072
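        /*
         * hwc->config never carries ARCH_PERFMON_EVENTSEL_ENABLE (that bit
         * is OR-ed in only at enable time), so this write stops the counter.
         */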
1073         wrmsrl(hwc->config_base, hwc->config);
1074 }
1075
1076 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1077
1078 /*
1079  * Set the next IRQ period, based on the hwc->period_left value.
1080  * To be called with the event disabled in hw:
1081  */
1082 static int
1083 x86_perf_event_set_period(struct perf_event *event)
1084 {
1085         struct hw_perf_event *hwc = &event->hw;
1086         s64 left = local64_read(&hwc->period_left);
1087         s64 period = hwc->sample_period;
1088         int ret = 0, idx = hwc->idx;
1089
1090         if (idx == X86_PMC_IDX_FIXED_BTS)
1091                 return 0;
1092
1093         /*
1094          * If we are way outside a reasonable range then just skip forward:
1095          */
1096         if (unlikely(left <= -period)) {
1097                 left = period;
1098                 local64_set(&hwc->period_left, left);
1099                 hwc->last_period = period;
1100                 ret = 1;
1101         }
1102
1103         if (unlikely(left <= 0)) {
1104                 left += period;
1105                 local64_set(&hwc->period_left, left);
1106                 hwc->last_period = period;
1107                 ret = 1;
1108         }
1109         /*
1110          * Quirk: certain CPUs don't like it if just 1 hw_event is left:
1111          */
1112         if (unlikely(left < 2))
1113                 left = 2;
1114
1115         if (left > x86_pmu.max_period)
1116                 left = x86_pmu.max_period;
1117
1118         per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1119
1120         /*
1121          * The hw event starts counting from this event offset;
1122          * mark it to be able to extract future deltas:
1123          */
1124         local64_set(&hwc->prev_count, (u64)-left);
1125
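        /*
         * Program the counter with -left (masked to the counter width) so
         * that it overflows, and raises a PMI, after 'left' more events.
         */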
1126         wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1127
1128         /*
1129          * Due to an erratum on certain CPUs we need
1130          * a second write to be sure the register
1131          * is updated properly
1132          */
1133         if (x86_pmu.perfctr_second_write) {
1134                 wrmsrl(hwc->event_base,
1135                         (u64)(-left) & x86_pmu.cntval_mask);
1136         }
1137
1138         perf_event_update_userpage(event);
1139
1140         return ret;
1141 }
1142
1143 static void x86_pmu_enable_event(struct perf_event *event)
1144 {
1145         if (__this_cpu_read(cpu_hw_events.enabled))
1146                 __x86_pmu_enable_event(&event->hw,
1147                                        ARCH_PERFMON_EVENTSEL_ENABLE);
1148 }
1149
1150 /*
1151  * Add a single event to the PMU.
1152  *
1153  * The event is added to the group of enabled events
1154  * but only if it can be scheduled with existing events.
1155  */
1156 static int x86_pmu_add(struct perf_event *event, int flags)
1157 {
1158         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1159         struct hw_perf_event *hwc;
1160         int assign[X86_PMC_IDX_MAX];
1161         int n, n0, ret;
1162
1163         hwc = &event->hw;
1164
1165         perf_pmu_disable(event->pmu);
1166         n0 = cpuc->n_events;
1167         ret = n = collect_events(cpuc, event, false);
1168         if (ret < 0)
1169                 goto out;
1170
1171         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1172         if (!(flags & PERF_EF_START))
1173                 hwc->state |= PERF_HES_ARCH;
1174
1175         /*
1176          * If group events scheduling transaction was started,
1177          * skip the schedulability test here, it will be performed
1178          * at commit time (->commit_txn) as a whole
1179          */
1180         if (cpuc->group_flag & PERF_EVENT_TXN)
1181                 goto done_collect;
1182
1183         ret = x86_pmu.schedule_events(cpuc, n, assign);
1184         if (ret)
1185                 goto out;
1186         /*
1187          * copy the new assignment now that we know it is possible;
1188          * it will be used by hw_perf_enable()
1189          */
1190         memcpy(cpuc->assign, assign, n*sizeof(int));
1191
1192 done_collect:
1193         cpuc->n_events = n;
1194         cpuc->n_added += n - n0;
1195         cpuc->n_txn += n - n0;
1196
1197         ret = 0;
1198 out:
1199         perf_pmu_enable(event->pmu);
1200         return ret;
1201 }
1202
1203 static void x86_pmu_start(struct perf_event *event, int flags)
1204 {
1205         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1206         int idx = event->hw.idx;
1207
1208         if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1209                 return;
1210
1211         if (WARN_ON_ONCE(idx == -1))
1212                 return;
1213
1214         if (flags & PERF_EF_RELOAD) {
1215                 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1216                 x86_perf_event_set_period(event);
1217         }
1218
1219         event->hw.state = 0;
1220
1221         cpuc->events[idx] = event;
1222         __set_bit(idx, cpuc->active_mask);
1223         __set_bit(idx, cpuc->running);
1224         x86_pmu.enable(event);
1225         perf_event_update_userpage(event);
1226 }
1227
1228 void perf_event_print_debug(void)
1229 {
1230         u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1231         u64 pebs;
1232         struct cpu_hw_events *cpuc;
1233         unsigned long flags;
1234         int cpu, idx;
1235
1236         if (!x86_pmu.num_counters)
1237                 return;
1238
1239         local_irq_save(flags);
1240
1241         cpu = smp_processor_id();
1242         cpuc = &per_cpu(cpu_hw_events, cpu);
1243
1244         if (x86_pmu.version >= 2) {
1245                 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1246                 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1247                 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1248                 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1249                 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1250
1251                 pr_info("\n");
1252                 pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
1253                 pr_info("CPU#%d: status:     %016llx\n", cpu, status);
1254                 pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
1255                 pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1256                 pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1257         }
1258         pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1259
1260         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1261                 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1262                 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1263
1264                 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1265
1266                 pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
1267                         cpu, idx, pmc_ctrl);
1268                 pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
1269                         cpu, idx, pmc_count);
1270                 pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1271                         cpu, idx, prev_left);
1272         }
1273         for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1274                 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1275
1276                 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1277                         cpu, idx, pmc_count);
1278         }
1279         local_irq_restore(flags);
1280 }
1281
1282 static void x86_pmu_stop(struct perf_event *event, int flags)
1283 {
1284         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1285         struct hw_perf_event *hwc = &event->hw;
1286
1287         if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1288                 x86_pmu.disable(event);
1289                 cpuc->events[hwc->idx] = NULL;
1290                 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1291                 hwc->state |= PERF_HES_STOPPED;
1292         }
1293
1294         if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1295                 /*
1296                  * Drain the remaining delta count out of an event
1297                  * that we are disabling:
1298                  */
1299                 x86_perf_event_update(event);
1300                 hwc->state |= PERF_HES_UPTODATE;
1301         }
1302 }
1303
1304 static void x86_pmu_del(struct perf_event *event, int flags)
1305 {
1306         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1307         int i;
1308
1309         /*
1310          * If we're called during a txn, we don't need to do anything.
1311          * The events never got scheduled and ->cancel_txn will truncate
1312          * the event_list.
1313          */
1314         if (cpuc->group_flag & PERF_EVENT_TXN)
1315                 return;
1316
1317         x86_pmu_stop(event, PERF_EF_UPDATE);
1318
1319         for (i = 0; i < cpuc->n_events; i++) {
1320                 if (event == cpuc->event_list[i]) {
1321
1322                         if (x86_pmu.put_event_constraints)
1323                                 x86_pmu.put_event_constraints(cpuc, event);
1324
1325                         while (++i < cpuc->n_events)
1326                                 cpuc->event_list[i-1] = cpuc->event_list[i];
1327
1328                         --cpuc->n_events;
1329                         break;
1330                 }
1331         }
1332         perf_event_update_userpage(event);
1333 }
1334
1335 static int x86_pmu_handle_irq(struct pt_regs *regs)
1336 {
1337         struct perf_sample_data data;
1338         struct cpu_hw_events *cpuc;
1339         struct perf_event *event;
1340         int idx, handled = 0;
1341         u64 val;
1342
1343         perf_sample_data_init(&data, 0);
1344
1345         cpuc = &__get_cpu_var(cpu_hw_events);
1346
1347         /*
1348          * Some chipsets need to unmask the LVTPC in a particular spot
1349          * inside the nmi handler.  As a result, the unmasking was pushed
1350          * into all the nmi handlers.
1351          *
1352          * This generic handler doesn't seem to have any issues where the
1353          * unmasking occurs so it was left at the top.
1354          */
1355         apic_write(APIC_LVTPC, APIC_DM_NMI);
1356
1357         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1358                 if (!test_bit(idx, cpuc->active_mask)) {
1359                         /*
1360                          * Though we deactivated the counter, some CPUs
1361                          * might still deliver spurious interrupts that
1362                          * are still in flight. Catch them:
1363                          */
1364                         if (__test_and_clear_bit(idx, cpuc->running))
1365                                 handled++;
1366                         continue;
1367                 }
1368
1369                 event = cpuc->events[idx];
1370
1371                 val = x86_perf_event_update(event);
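                /*
                 * Counters are programmed with a negative value, so while
                 * the top bit of the counter width is still set the counter
                 * has not overflowed yet and there is nothing to handle.
                 */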
1372                 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1373                         continue;
1374
1375                 /*
1376                  * event overflow
1377                  */
1378                 handled++;
1379                 data.period     = event->hw.last_period;
1380
1381                 if (!x86_perf_event_set_period(event))
1382                         continue;
1383
1384                 if (perf_event_overflow(event, &data, regs))
1385                         x86_pmu_stop(event, 0);
1386         }
1387
1388         if (handled)
1389                 inc_irq_stat(apic_perf_irqs);
1390
1391         return handled;
1392 }
1393
1394 void perf_events_lapic_init(void)
1395 {
1396         if (!x86_pmu.apic || !x86_pmu_initialized())
1397                 return;
1398
1399         /*
1400          * Always use NMI for PMU
1401          */
1402         apic_write(APIC_LVTPC, APIC_DM_NMI);
1403 }
1404
1405 struct pmu_nmi_state {
1406         unsigned int    marked;
1407         int             handled;
1408 };
1409
1410 static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1411
1412 static int __kprobes
1413 perf_event_nmi_handler(struct notifier_block *self,
1414                          unsigned long cmd, void *__args)
1415 {
1416         struct die_args *args = __args;
1417         unsigned int this_nmi;
1418         int handled;
1419
1420         if (!atomic_read(&active_events))
1421                 return NOTIFY_DONE;
1422
1423         switch (cmd) {
1424         case DIE_NMI:
1425                 break;
1426         case DIE_NMIUNKNOWN:
1427                 this_nmi = percpu_read(irq_stat.__nmi_count);
1428                 if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1429                         /* let the kernel handle the unknown nmi */
1430                         return NOTIFY_DONE;
1431                 /*
1432                  * This one is a PMU back-to-back nmi. Two events
1433                  * trigger 'simultaneously' raising two back-to-back
1434                  * NMIs. If the first NMI handles both, the latter
1435                  * will be empty and daze the CPU. So, we drop it to
1436                  * avoid false-positive 'unknown nmi' messages.
1437                  */
1438                 return NOTIFY_STOP;
1439         default:
1440                 return NOTIFY_DONE;
1441         }
1442
1443         handled = x86_pmu.handle_irq(args->regs);
1444         if (!handled)
1445                 return NOTIFY_DONE;
1446
1447         this_nmi = percpu_read(irq_stat.__nmi_count);
1448         if ((handled > 1) ||
1449                 /* the next nmi could be a back-to-back nmi */
1450             ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1451              (__this_cpu_read(pmu_nmi.handled) > 1))) {
1452                 /*
1453                  * We could have two subsequent back-to-back nmis: The
1454                  * first handles more than one counter, the 2nd
1455                  * handles only one counter and the 3rd handles no
1456                  * counter.
1457                  *
1458                  * This is the 2nd nmi because the previous was
1459                  * handling more than one counter. We will mark the
1460                  * next (3rd) and then drop it if unhandled.
1461                  */
1462                 __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1463                 __this_cpu_write(pmu_nmi.handled, handled);
1464         }
1465
1466         return NOTIFY_STOP;
1467 }
1468
1469 static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1470         .notifier_call          = perf_event_nmi_handler,
1471         .next                   = NULL,
1472         .priority               = NMI_LOCAL_LOW_PRIOR,
1473 };
1474
1475 static struct event_constraint unconstrained;
1476 static struct event_constraint emptyconstraint;
1477
1478 static struct event_constraint *
1479 x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1480 {
1481         struct event_constraint *c;
1482
1483         if (x86_pmu.event_constraints) {
1484                 for_each_event_constraint(c, x86_pmu.event_constraints) {
1485                         if ((event->hw.config & c->cmask) == c->code)
1486                                 return c;
1487                 }
1488         }
1489
1490         return &unconstrained;
1491 }
1492
1493 #include "perf_event_amd.c"
1494 #include "perf_event_p6.c"
1495 #include "perf_event_p4.c"
1496 #include "perf_event_intel_lbr.c"
1497 #include "perf_event_intel_ds.c"
1498 #include "perf_event_intel.c"
1499
1500 static int __cpuinit
1501 x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1502 {
1503         unsigned int cpu = (long)hcpu;
1504         int ret = NOTIFY_OK;
1505
1506         switch (action & ~CPU_TASKS_FROZEN) {
1507         case CPU_UP_PREPARE:
1508                 if (x86_pmu.cpu_prepare)
1509                         ret = x86_pmu.cpu_prepare(cpu);
1510                 break;
1511
1512         case CPU_STARTING:
1513                 if (x86_pmu.cpu_starting)
1514                         x86_pmu.cpu_starting(cpu);
1515                 break;
1516
1517         case CPU_DYING:
1518                 if (x86_pmu.cpu_dying)
1519                         x86_pmu.cpu_dying(cpu);
1520                 break;
1521
1522         case CPU_UP_CANCELED:
1523         case CPU_DEAD:
1524                 if (x86_pmu.cpu_dead)
1525                         x86_pmu.cpu_dead(cpu);
1526                 break;
1527
1528         default:
1529                 break;
1530         }
1531
1532         return ret;
1533 }
1534
1535 static void __init pmu_check_apic(void)
1536 {
1537         if (cpu_has_apic)
1538                 return;
1539
1540         x86_pmu.apic = 0;
1541         pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1542         pr_info("no hardware sampling interrupt available.\n");
1543 }
1544
1545 static int __init init_hw_perf_events(void)
1546 {
1547         struct event_constraint *c;
1548         int err;
1549
1550         pr_info("Performance Events: ");
1551
1552         switch (boot_cpu_data.x86_vendor) {
1553         case X86_VENDOR_INTEL:
1554                 err = intel_pmu_init();
1555                 break;
1556         case X86_VENDOR_AMD:
1557                 err = amd_pmu_init();
1558                 break;
1559         default:
1560                 return 0;
1561         }
1562         if (err != 0) {
1563                 pr_cont("no PMU driver, software events only.\n");
1564                 return 0;
1565         }
1566
1567         pmu_check_apic();
1568
1569         /* sanity check that the hardware exists or is emulated */
1570         if (!check_hw_exists())
1571                 return 0;
1572
1573         pr_cont("%s PMU driver.\n", x86_pmu.name);
1574
1575         if (x86_pmu.quirks)
1576                 x86_pmu.quirks();
1577
1578         if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1579                 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1580                      x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1581                 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1582         }
1583         x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1584
1585         if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1586                 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1587                      x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1588                 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1589         }
1590
1591         x86_pmu.intel_ctrl |=
1592                 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1593
1594         perf_events_lapic_init();
1595         register_die_notifier(&perf_event_nmi_notifier);
1596
1597         unconstrained = (struct event_constraint)
1598                 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1599                                    0, x86_pmu.num_counters);
1600
1601         if (x86_pmu.event_constraints) {
1602                 for_each_event_constraint(c, x86_pmu.event_constraints) {
1603                         if (c->cmask != X86_RAW_EVENT_MASK)
1604                                 continue;
1605
1606                         c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1607                         c->weight += x86_pmu.num_counters;
1608                 }
1609         }
1610
1611         pr_info("... version:                %d\n",     x86_pmu.version);
1612         pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
1613         pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
1614         pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
1615         pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
1616         pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
1617         pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
1618
1619         perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1620         perf_cpu_notifier(x86_pmu_notifier);
1621
1622         return 0;
1623 }
1624 early_initcall(init_hw_perf_events);
1625
1626 static inline void x86_pmu_read(struct perf_event *event)
1627 {
1628         x86_perf_event_update(event);
1629 }
1630
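/*
 * Group scheduling transactions: the generic perf core (roughly,
 * group_sched_in() in kernel/events/core.c) brackets the insertion of
 * a whole event group in a transaction, along the lines of:
 *
 *      pmu->start_txn(pmu);
 *      pmu->add(leader, PERF_EF_START);     - per-event add, test skipped
 *      pmu->add(sibling, PERF_EF_START);    - (see PERF_EVENT_TXN)
 *      if (pmu->commit_txn(pmu))            - schedulability test, as a whole
 *              pmu->cancel_txn(pmu);
 */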
1631 /*
1632  * Start a group event scheduling transaction.
1633  * Set the flag so that pmu::add() does not perform the
1634  * schedulability test; it will be performed at commit time.
1635  */
1636 static void x86_pmu_start_txn(struct pmu *pmu)
1637 {
1638         perf_pmu_disable(pmu);
1639         __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1640         __this_cpu_write(cpu_hw_events.n_txn, 0);
1641 }
1642
1643 /*
1644  * Cancel a group event scheduling transaction.
1645  * Clear the flag again so that subsequent pmu::add() calls perform
1646  * the schedulability test, and undo the events collected so far.
1647  */
1648 static void x86_pmu_cancel_txn(struct pmu *pmu)
1649 {
1650         __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1651         /*
1652          * Truncate the collected events.
1653          */
1654         __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1655         __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1656         perf_pmu_enable(pmu);
1657 }
1658
1659 /*
1660  * Commit a group event scheduling transaction.
1661  * Perform the group schedulability test as a whole.
1662  * Return 0 on success.
1663  */
1664 static int x86_pmu_commit_txn(struct pmu *pmu)
1665 {
1666         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1667         int assign[X86_PMC_IDX_MAX];
1668         int n, ret;
1669
1670         n = cpuc->n_events;
1671
1672         if (!x86_pmu_initialized())
1673                 return -EAGAIN;
1674
1675         ret = x86_pmu.schedule_events(cpuc, n, assign);
1676         if (ret)
1677                 return ret;
1678
1679         /*
1680          * Copy the new assignment now that we know it is possible;
1681          * it will be used by x86_pmu_enable().
1682          */
1683         memcpy(cpuc->assign, assign, n*sizeof(int));
1684
1685         cpuc->group_flag &= ~PERF_EVENT_TXN;
1686         perf_pmu_enable(pmu);
1687         return 0;
1688 }
1689 /*
1690  * A fake_cpuc is used to validate event groups. Due to
1691  * the extra reg logic, we also need to allocate a fake
1692  * per-core and per-cpu structure. Otherwise, group events
1693  * using an extra reg may conflict without the kernel being
1694  * able to catch this when the last event gets added to
1695  * the group.
1696  */
1697 static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1698 {
1699         kfree(cpuc->shared_regs);
1700         kfree(cpuc);
1701 }
1702
1703 static struct cpu_hw_events *allocate_fake_cpuc(void)
1704 {
1705         struct cpu_hw_events *cpuc;
1706         int cpu = raw_smp_processor_id();
1707
1708         cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1709         if (!cpuc)
1710                 return ERR_PTR(-ENOMEM);
1711
1712         /* only needed if we have extra_regs */
1713         if (x86_pmu.extra_regs) {
1714                 cpuc->shared_regs = allocate_shared_regs(cpu);
1715                 if (!cpuc->shared_regs)
1716                         goto error;
1717         }
1718         return cpuc;
1719 error:
1720         free_fake_cpuc(cpuc);
1721         return ERR_PTR(-ENOMEM);
1722 }
1723
1724 /*
1725  * validate that we can schedule this event
1726  */
1727 static int validate_event(struct perf_event *event)
1728 {
1729         struct cpu_hw_events *fake_cpuc;
1730         struct event_constraint *c;
1731         int ret = 0;
1732
1733         fake_cpuc = allocate_fake_cpuc();
1734         if (IS_ERR(fake_cpuc))
1735                 return PTR_ERR(fake_cpuc);
1736
1737         c = x86_pmu.get_event_constraints(fake_cpuc, event);
1738
1739         if (!c || !c->weight)
1740                 ret = -ENOSPC;
1741
1742         if (x86_pmu.put_event_constraints)
1743                 x86_pmu.put_event_constraints(fake_cpuc, event);
1744
1745         free_fake_cpuc(fake_cpuc);
1746
1747         return ret;
1748 }
1749
1750 /*
1751  * validate a single event group
1752  *
1753  * validation includes checking that:
1754  *      - the events are compatible with each other
1755  *      - the events do not compete for the same counter
1756  *      - the number of events <= the number of counters
1757  *
1758  * validation ensures the group could be loaded onto the
1759  * PMU if it were the only group available.
1760  */
1761 static int validate_group(struct perf_event *event)
1762 {
1763         struct perf_event *leader = event->group_leader;
1764         struct cpu_hw_events *fake_cpuc;
1765         int ret = -ENOSPC, n;
1766
1767         fake_cpuc = allocate_fake_cpuc();
1768         if (IS_ERR(fake_cpuc))
1769                 return PTR_ERR(fake_cpuc);
1770         /*
1771          * The event is not yet connected to its
1772          * siblings; therefore we must first collect
1773          * the existing siblings, then add the new event,
1774          * before we can simulate the scheduling.
1775          */
1776         n = collect_events(fake_cpuc, leader, true);
1777         if (n < 0)
1778                 goto out;
1779
1780         fake_cpuc->n_events = n;
1781         n = collect_events(fake_cpuc, event, false);
1782         if (n < 0)
1783                 goto out;
1784
1785         fake_cpuc->n_events = n;
1786
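        /*
         * A NULL assign array: we only want to know whether the group
         * is schedulable at all, not which counters it would land on.
         */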
1787         ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1788
1789 out:
1790         free_fake_cpuc(fake_cpuc);
1791         return ret;
1792 }
1793
1794 static int x86_pmu_event_init(struct perf_event *event)
1795 {
1796         struct pmu *tmp;
1797         int err;
1798
1799         switch (event->attr.type) {
1800         case PERF_TYPE_RAW:
1801         case PERF_TYPE_HARDWARE:
1802         case PERF_TYPE_HW_CACHE:
1803                 break;
1804
1805         default:
1806                 return -ENOENT;
1807         }
1808
1809         err = __x86_pmu_event_init(event);
1810         if (!err) {
1811                 /*
1812                  * We temporarily connect the event to its pmu
1813                  * such that validate_group() can classify
1814                  * it as an x86 event using is_x86_event().
1815                  */
1816                 tmp = event->pmu;
1817                 event->pmu = &pmu;
1818
1819                 if (event->group_leader != event)
1820                         err = validate_group(event);
1821                 else
1822                         err = validate_event(event);
1823
1824                 event->pmu = tmp;
1825         }
1826         if (err) {
1827                 if (event->destroy)
1828                         event->destroy(event);
1829         }
1830
1831         return err;
1832 }
1833
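/*
 * The x86 PMU as seen by the generic perf core; registered by
 * init_hw_perf_events() above via perf_pmu_register(&pmu, "cpu",
 * PERF_TYPE_RAW) and matched by x86_pmu_event_init() for the RAW,
 * HARDWARE and HW_CACHE event types.
 */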
1834 static struct pmu pmu = {
1835         .pmu_enable     = x86_pmu_enable,
1836         .pmu_disable    = x86_pmu_disable,
1837
1838         .event_init     = x86_pmu_event_init,
1839
1840         .add            = x86_pmu_add,
1841         .del            = x86_pmu_del,
1842         .start          = x86_pmu_start,
1843         .stop           = x86_pmu_stop,
1844         .read           = x86_pmu_read,
1845
1846         .start_txn      = x86_pmu_start_txn,
1847         .cancel_txn     = x86_pmu_cancel_txn,
1848         .commit_txn     = x86_pmu_commit_txn,
1849 };
1850
1851 /*
1852  * callchain support
1853  */
1854
1855 static int backtrace_stack(void *data, char *name)
1856 {
1857         return 0;
1858 }
1859
1860 static void backtrace_address(void *data, unsigned long addr, int reliable)
1861 {
1862         struct perf_callchain_entry *entry = data;
1863
1864         perf_callchain_store(entry, addr);
1865 }
1866
1867 static const struct stacktrace_ops backtrace_ops = {
1868         .stack                  = backtrace_stack,
1869         .address                = backtrace_address,
1870         .walk_stack             = print_context_stack_bp,
1871 };
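/*
 * dump_trace() walks the kernel stack(s) and hands every return address
 * it finds to .address(), which stores it in the perf_callchain_entry;
 * a .stack() callback that returns 0 simply lets the walk continue
 * across stack boundaries.
 */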
1872
1873 void
1874 perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1875 {
1876         if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1877                 /* TODO: We don't support guest OS callchains yet */
1878                 return;
1879         }
1880
1881         perf_callchain_store(entry, regs->ip);
1882
1883         dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1884 }
1885
1886 #ifdef CONFIG_COMPAT
1887 static inline int
1888 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1889 {
1890         /* 32-bit process in 64-bit kernel. */
1891         struct stack_frame_ia32 frame;
1892         const void __user *fp;
1893
1894         if (!test_thread_flag(TIF_IA32))
1895                 return 0;
1896
1897         fp = compat_ptr(regs->bp);
1898         while (entry->nr < PERF_MAX_STACK_DEPTH) {
1899                 unsigned long bytes;
1900                 frame.next_frame     = 0;
1901                 frame.return_address = 0;
1902
1903                 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1904                 if (bytes != sizeof(frame))
1905                         break;
1906
1907                 if (fp < compat_ptr(regs->sp))
1908                         break;
1909
1910                 perf_callchain_store(entry, frame.return_address);
1911                 fp = compat_ptr(frame.next_frame);
1912         }
1913         return 1;
1914 }
1915 #else
1916 static inline int
1917 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1918 {
1919         return 0;
1920 }
1921 #endif
1922
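/*
 * User-space callchain: follow the frame-pointer chain of the
 * interrupted task. The walk below stops at PERF_MAX_STACK_DEPTH
 * entries, when copy_from_user_nmi() cannot read a complete frame, or
 * when the frame pointer drops below the user stack pointer.
 */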
1923 void
1924 perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1925 {
1926         struct stack_frame frame;
1927         const void __user *fp;
1928
1929         if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1930                 /* TODO: We don't support guest OS callchains yet */
1931                 return;
1932         }
1933
1934         fp = (void __user *)regs->bp;
1935
1936         perf_callchain_store(entry, regs->ip);
1937
1938         if (perf_callchain_user32(regs, entry))
1939                 return;
1940
1941         while (entry->nr < PERF_MAX_STACK_DEPTH) {
1942                 unsigned long bytes;
1943                 frame.next_frame     = NULL;
1944                 frame.return_address = 0;
1945
1946                 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1947                 if (bytes != sizeof(frame))
1948                         break;
1949
1950                 if ((unsigned long)fp < regs->sp)
1951                         break;
1952
1953                 perf_callchain_store(entry, frame.return_address);
1954                 fp = frame.next_frame;
1955         }
1956 }
1957
1958 unsigned long perf_instruction_pointer(struct pt_regs *regs)
1959 {
1960         unsigned long ip;
1961
1962         if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1963                 ip = perf_guest_cbs->get_guest_ip();
1964         else
1965                 ip = instruction_pointer(regs);
1966
1967         return ip;
1968 }
1969
1970 unsigned long perf_misc_flags(struct pt_regs *regs)
1971 {
1972         int misc = 0;
1973
1974         if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1975                 if (perf_guest_cbs->is_user_mode())
1976                         misc |= PERF_RECORD_MISC_GUEST_USER;
1977                 else
1978                         misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1979         } else {
1980                 if (user_mode(regs))
1981                         misc |= PERF_RECORD_MISC_USER;
1982                 else
1983                         misc |= PERF_RECORD_MISC_KERNEL;
1984         }
1985
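        /*
         * PERF_EFLAGS_EXACT is set by the PEBS handling (see
         * perf_event_intel_ds.c) when the sampled IP could be corrected
         * to point at the exact instruction that triggered the event.
         */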
1986         if (regs->flags & PERF_EFLAGS_EXACT)
1987                 misc |= PERF_RECORD_MISC_EXACT_IP;
1988
1989         return misc;
1990 }