1 /*
2  *  linux/arch/i386/nmi.c
3  *
4  *  NMI watchdog support on APIC systems
5  *
6  *  Started by Ingo Molnar <mingo@redhat.com>
7  *
8  *  Fixes:
9  *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
10  *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
11  *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
12  *  Pavel Machek and
13  *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
14  */
15
16 #include <linux/config.h>
17 #include <linux/delay.h>
18 #include <linux/interrupt.h>
19 #include <linux/module.h>
20 #include <linux/nmi.h>
21 #include <linux/sysdev.h>
22 #include <linux/sysctl.h>
23 #include <linux/percpu.h>
24 #include <linux/dmi.h>
25 #include <linux/kprobes.h>
26
27 #include <asm/smp.h>
28 #include <asm/nmi.h>
29 #include <asm/kdebug.h>
30 #include <asm/intel_arch_perfmon.h>
31
32 #include "mach_traps.h"
33
34 /* perfctr_nmi_owner tracks the ownership of the perfctr registers;
35  * evntsel_nmi_owner tracks the ownership of the event selection registers.
36  * - different performance counters / event selection registers may be
37  *   reserved by different subsystems; this reservation system just tries
38  *   to coordinate things a little
39  */
40 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
41 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
42
43 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
44  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
45  */
46 #define NMI_MAX_COUNTER_BITS 66
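/* Illustrative derivation (editorial note; MSR values assumed from the usual
 * i386 msr.h, not stated in this file): MSR_P4_CRU_ESCR5 (0x3e1) minus
 * MSR_P4_BSU_ESCR0 (0x3a0) is 0x41 == 65, so 66 bit positions (0..65) are
 * enough to cover every possible offset.
 */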
47
48 /* nmi_active:
49  * >0: the lapic NMI watchdog is active, but can be disabled
50  * <0: the lapic NMI watchdog has not been set up, and cannot
51  *     be enabled
52  *  0: the lapic NMI watchdog is disabled, but can be enabled
53  */
54 atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
55
56 unsigned int nmi_watchdog = NMI_DEFAULT;
57 static unsigned int nmi_hz = HZ;
58
59 struct nmi_watchdog_ctlblk {
60         int enabled;
61         u64 check_bit;
62         unsigned int cccr_msr;
63         unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
64         unsigned int evntsel_msr;  /* the MSR to select the events to handle */
65 };
66 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
67
68 /* local prototypes */
69 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
70
71 extern void show_registers(struct pt_regs *regs);
72 extern int unknown_nmi_panic;
73
74 /* converts an msr to an appropriate reservation bit */
75 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
76 {
77         /* returns the bit offset of the performance counter register */
78         switch (boot_cpu_data.x86_vendor) {
79         case X86_VENDOR_AMD:
80                 return (msr - MSR_K7_PERFCTR0);
81         case X86_VENDOR_INTEL:
82                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
83                         return (msr - MSR_ARCH_PERFMON_PERFCTR0);
84
85                 switch (boot_cpu_data.x86) {
86                 case 6:
87                         return (msr - MSR_P6_PERFCTR0);
88                 case 15:
89                         return (msr - MSR_P4_BPU_PERFCTR0);
90                 }
91         }
92         return 0;
93 }
94
95 /* converts an msr to an appropriate reservation bit */
96 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
97 {
98         /* returns the bit offset of the event selection register */
99         switch (boot_cpu_data.x86_vendor) {
100         case X86_VENDOR_AMD:
101                 return (msr - MSR_K7_EVNTSEL0);
102         case X86_VENDOR_INTEL:
103                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
104                         return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
105
106                 switch (boot_cpu_data.x86) {
107                 case 6:
108                         return (msr - MSR_P6_EVNTSEL0);
109                 case 15:
110                         return (msr - MSR_P4_BSU_ESCR0);
111                 }
112         }
113         return 0;
114 }
115
116 /* checks whether a counter bit is available (hack for oprofile) */
117 int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
118 {
119         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
120
121         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
122 }
123
124 /* checks an msr for availability */
125 int avail_to_resrv_perfctr_nmi(unsigned int msr)
126 {
127         unsigned int counter;
128
129         counter = nmi_perfctr_msr_to_bit(msr);
130         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
131
132         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
133 }
134
135 int reserve_perfctr_nmi(unsigned int msr)
136 {
137         unsigned int counter;
138
139         counter = nmi_perfctr_msr_to_bit(msr);
140         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
141
142         if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
143                 return 1;
144         return 0;
145 }
146
147 void release_perfctr_nmi(unsigned int msr)
148 {
149         unsigned int counter;
150
151         counter = nmi_perfctr_msr_to_bit(msr);
152         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
153
154         clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
155 }
156
157 int reserve_evntsel_nmi(unsigned int msr)
158 {
159         unsigned int counter;
160
161         counter = nmi_evntsel_msr_to_bit(msr);
162         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
163
164         if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
165                 return 1;
166         return 0;
167 }
168
169 void release_evntsel_nmi(unsigned int msr)
170 {
171         unsigned int counter;
172
173         counter = nmi_evntsel_msr_to_bit(msr);
174         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
175
176         clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
177 }
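/*
 * Illustrative usage sketch (editorial addition; it mirrors what the
 * setup_*_watchdog() routines below and oprofile do): reserve both MSRs
 * before touching the hardware, back out the perfctr reservation if the
 * evntsel one fails, and release in reverse order on teardown:
 *
 *	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
 *		return 0;
 *	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
 *		release_perfctr_nmi(MSR_K7_PERFCTR0);
 *		return 0;
 *	}
 *	... program the counter and event select ...
 *	release_evntsel_nmi(MSR_K7_EVNTSEL0);
 *	release_perfctr_nmi(MSR_K7_PERFCTR0);
 */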
178
179 static __cpuinit inline int nmi_known_cpu(void)
180 {
181         switch (boot_cpu_data.x86_vendor) {
182         case X86_VENDOR_AMD:
183                 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
184         case X86_VENDOR_INTEL:
185                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
186                         return 1;
187                 else
188                         return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
189         }
190         return 0;
191 }
192
193 #ifdef CONFIG_SMP
194 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
195  * the CPU is idle. To make sure the NMI watchdog really ticks on all
196  * CPUs during the test make them busy.
197  */
198 static __init void nmi_cpu_busy(void *data)
199 {
200         volatile int *endflag = data;
201         local_irq_enable_in_hardirq();
202         /* Intentionally don't use cpu_relax here. This is
203            to make sure that the performance counter really ticks,
204            even if there is a simulator or similar that catches the
205            pause instruction. On a real HT machine this is fine because
206            all other CPUs are busy with "useless" delay loops and don't
207            care if they get somewhat fewer cycles. */
208         while (*endflag == 0)
209                 barrier();
210 }
211 #endif
212
213 static int __init check_nmi_watchdog(void)
214 {
215         volatile int endflag = 0;
216         unsigned int *prev_nmi_count;
217         int cpu;
218
219         /* Enable NMI watchdog for newer systems.
220            Actually it should be safe for most systems before 2004 too except
221            for some IBM systems that corrupt registers when an NMI happens
222            during SMM. Unfortunately we don't have more exact information
223            on these, so we use this coarse check. */
224         if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004)
225                 nmi_watchdog = NMI_LOCAL_APIC;
226
227         if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
228                 return 0;
229
230         if (!atomic_read(&nmi_active))
231                 return 0;
232
233         prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
234         if (!prev_nmi_count)
235                 return -1;
236
237         printk(KERN_INFO "Testing NMI watchdog ... ");
238
239         if (nmi_watchdog == NMI_LOCAL_APIC)
240                 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
241
242         for_each_possible_cpu(cpu)
243                 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
244         local_irq_enable();
245         mdelay((10*1000)/nmi_hz); // wait 10 ticks
246
247         for_each_possible_cpu(cpu) {
248 #ifdef CONFIG_SMP
249                 /* Check cpu_callin_map here because that is set
250                    after the timer is started. */
251                 if (!cpu_isset(cpu, cpu_callin_map))
252                         continue;
253 #endif
254                 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
255                         continue;
256                 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
257                         printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
258                                 cpu,
259                                 prev_nmi_count[cpu],
260                                 nmi_count(cpu));
261                         per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
262                         atomic_dec(&nmi_active);
263                 }
264         }
265         if (!atomic_read(&nmi_active)) {
266                 kfree(prev_nmi_count);
267                 atomic_set(&nmi_active, -1);
268                 return -1;
269         }
270         endflag = 1;
271         printk("OK.\n");
272
273         /* now that we know it works we can reduce NMI frequency to
274            something more reasonable; makes a difference in some configs */
275         if (nmi_watchdog == NMI_LOCAL_APIC) {
276                 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
277
278                 nmi_hz = 1;
279                 /*
280                  * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
281                  * are writable, with higher bits sign extending from bit 31.
282                  * So we can only program the counter with a 31 bit value;
283                  * bit 31 is then set and the sign extension makes bits 32
284                  * and above 1 as well.  Find the appropriate nmi_hz.
285                  */
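		/*
		 * Worked example (editorial, illustrative only): on a
		 * 3.2 GHz CPU, (u64)cpu_khz * 1000 == 3,200,000,000 which
		 * exceeds 0x7fffffff, so count == 3200000000 / 2147483647
		 * == 1 and nmi_hz becomes 2, i.e. the counter is reloaded
		 * twice per second instead of once.
		 */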
286                 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
287                         ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
288                         u64 count = (u64)cpu_khz * 1000;
289                         do_div(count, 0x7fffffffUL);
290                         nmi_hz = count + 1;
291                 }
292         }
293
294         kfree(prev_nmi_count);
295         return 0;
296 }
297 /* This needs to happen later in boot so counters are working */
298 late_initcall(check_nmi_watchdog);
299
300 static int __init setup_nmi_watchdog(char *str)
301 {
302         int nmi;
303
304         get_option(&str, &nmi);
305
306         if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
307                 return 0;
308         /*
309          * If any other x86 CPU has a local APIC, then
310          * please test the NMI stuff there and send me the
311          * missing bits. Right now Intel P6/P4 and AMD K7 only.
312          */
313         if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
314                 return 0;  /* no lapic support */
315         nmi_watchdog = nmi;
316         return 1;
317 }
318
319 __setup("nmi_watchdog=", setup_nmi_watchdog);
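/*
 * Editorial usage note (values assumed from asm/nmi.h and the boot
 * parameter documentation of this era): booting with "nmi_watchdog=1"
 * selects the IO-APIC watchdog (NMI_IO_APIC), "nmi_watchdog=2" the
 * local APIC watchdog (NMI_LOCAL_APIC), and "nmi_watchdog=0" disables it.
 */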
320
321 static void disable_lapic_nmi_watchdog(void)
322 {
323         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
324
325         if (atomic_read(&nmi_active) <= 0)
326                 return;
327
328         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
329
330         BUG_ON(atomic_read(&nmi_active) != 0);
331 }
332
333 static void enable_lapic_nmi_watchdog(void)
334 {
335         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
336
337         /* are we already enabled */
338         if (atomic_read(&nmi_active) != 0)
339                 return;
340
341         /* are we lapic aware */
342         if (nmi_known_cpu() <= 0)
343                 return;
344
345         on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
346         touch_nmi_watchdog();
347 }
348
349 void disable_timer_nmi_watchdog(void)
350 {
351         BUG_ON(nmi_watchdog != NMI_IO_APIC);
352
353         if (atomic_read(&nmi_active) <= 0)
354                 return;
355
356         disable_irq(0);
357         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
358
359         BUG_ON(atomic_read(&nmi_active) != 0);
360 }
361
362 void enable_timer_nmi_watchdog(void)
363 {
364         BUG_ON(nmi_watchdog != NMI_IO_APIC);
365
366         if (atomic_read(&nmi_active) == 0) {
367                 touch_nmi_watchdog();
368                 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
369                 enable_irq(0);
370         }
371 }
372
373 #ifdef CONFIG_PM
374
375 static int nmi_pm_active; /* nmi_active before suspend */
376
377 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
378 {
379         /* only CPU0 goes here, other CPUs should be offline */
380         nmi_pm_active = atomic_read(&nmi_active);
381         stop_apic_nmi_watchdog(NULL);
382         BUG_ON(atomic_read(&nmi_active) != 0);
383         return 0;
384 }
385
386 static int lapic_nmi_resume(struct sys_device *dev)
387 {
388         /* only CPU0 goes here, other CPUs should be offline */
389         if (nmi_pm_active > 0) {
390                 setup_apic_nmi_watchdog(NULL);
391                 touch_nmi_watchdog();
392         }
393         return 0;
394 }
395
396
397 static struct sysdev_class nmi_sysclass = {
398         set_kset_name("lapic_nmi"),
399         .resume         = lapic_nmi_resume,
400         .suspend        = lapic_nmi_suspend,
401 };
402
403 static struct sys_device device_lapic_nmi = {
404         .id     = 0,
405         .cls    = &nmi_sysclass,
406 };
407
408 static int __init init_lapic_nmi_sysfs(void)
409 {
410         int error;
411
412         /* should really be a BUG_ON but because this is an
413          * init call, it just doesn't work.  -dcz
414          */
415         if (nmi_watchdog != NMI_LOCAL_APIC)
416                 return 0;
417
418         if ( atomic_read(&nmi_active) < 0 )
419                 return 0;
420
421         error = sysdev_class_register(&nmi_sysclass);
422         if (!error)
423                 error = sysdev_register(&device_lapic_nmi);
424         return error;
425 }
426 /* must come after the local APIC's device_initcall() */
427 late_initcall(init_lapic_nmi_sysfs);
428
429 #endif  /* CONFIG_PM */
430
431 /*
432  * Activate the NMI watchdog via the local APIC.
433  * Original code written by Keith Owens.
434  */
435
436 static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
437 {
438         u64 count = (u64)cpu_khz * 1000;
439
440         do_div(count, nmi_hz);
441         if(descr)
442                 Dprintk("setting %s to -0x%08Lx\n", descr, count);
443         wrmsrl(perfctr_msr, 0 - count);
444 }
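/*
 * Worked example (editorial, illustrative only): with cpu_khz == 2000000
 * (a 2 GHz CPU) and nmi_hz == 1, count == 2,000,000,000 and the perfctr
 * MSR is loaded with -count, so it overflows (and raises the watchdog NMI)
 * after roughly one second's worth of counted events.
 */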
445
446 /* Note that these events don't tick when the CPU idles. This means
447    the frequency varies with CPU load. */
448
449 #define K7_EVNTSEL_ENABLE       (1 << 22)
450 #define K7_EVNTSEL_INT          (1 << 20)
451 #define K7_EVNTSEL_OS           (1 << 17)
452 #define K7_EVNTSEL_USR          (1 << 16)
453 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
454 #define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
455
456 static int setup_k7_watchdog(void)
457 {
458         unsigned int perfctr_msr, evntsel_msr;
459         unsigned int evntsel;
460         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
461
462         perfctr_msr = MSR_K7_PERFCTR0;
463         evntsel_msr = MSR_K7_EVNTSEL0;
464         if (!reserve_perfctr_nmi(perfctr_msr))
465                 goto fail;
466
467         if (!reserve_evntsel_nmi(evntsel_msr))
468                 goto fail1;
469
470         wrmsrl(perfctr_msr, 0UL);
471
472         evntsel = K7_EVNTSEL_INT
473                 | K7_EVNTSEL_OS
474                 | K7_EVNTSEL_USR
475                 | K7_NMI_EVENT;
476
477         /* setup the timer */
478         wrmsr(evntsel_msr, evntsel, 0);
479         write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
480         apic_write(APIC_LVTPC, APIC_DM_NMI);
481         evntsel |= K7_EVNTSEL_ENABLE;
482         wrmsr(evntsel_msr, evntsel, 0);
483
484         wd->perfctr_msr = perfctr_msr;
485         wd->evntsel_msr = evntsel_msr;
486         wd->cccr_msr = 0;  //unused
487         wd->check_bit = 1ULL<<63;
488         return 1;
489 fail1:
490         release_perfctr_nmi(perfctr_msr);
491 fail:
492         return 0;
493 }
494
495 static void stop_k7_watchdog(void)
496 {
497         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
498
499         wrmsr(wd->evntsel_msr, 0, 0);
500
501         release_evntsel_nmi(wd->evntsel_msr);
502         release_perfctr_nmi(wd->perfctr_msr);
503 }
504
505 #define P6_EVNTSEL0_ENABLE      (1 << 22)
506 #define P6_EVNTSEL_INT          (1 << 20)
507 #define P6_EVNTSEL_OS           (1 << 17)
508 #define P6_EVNTSEL_USR          (1 << 16)
509 #define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
510 #define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED
511
512 static int setup_p6_watchdog(void)
513 {
514         unsigned int perfctr_msr, evntsel_msr;
515         unsigned int evntsel;
516         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
517
518         perfctr_msr = MSR_P6_PERFCTR0;
519         evntsel_msr = MSR_P6_EVNTSEL0;
520         if (!reserve_perfctr_nmi(perfctr_msr))
521                 goto fail;
522
523         if (!reserve_evntsel_nmi(evntsel_msr))
524                 goto fail1;
525
526         wrmsrl(perfctr_msr, 0UL);
527
528         evntsel = P6_EVNTSEL_INT
529                 | P6_EVNTSEL_OS
530                 | P6_EVNTSEL_USR
531                 | P6_NMI_EVENT;
532
533         /* setup the timer */
534         wrmsr(evntsel_msr, evntsel, 0);
535         write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
536         apic_write(APIC_LVTPC, APIC_DM_NMI);
537         evntsel |= P6_EVNTSEL0_ENABLE;
538         wrmsr(evntsel_msr, evntsel, 0);
539
540         wd->perfctr_msr = perfctr_msr;
541         wd->evntsel_msr = evntsel_msr;
542         wd->cccr_msr = 0;  //unused
543         wd->check_bit = 1ULL<<39;
544         return 1;
545 fail1:
546         release_perfctr_nmi(perfctr_msr);
547 fail:
548         return 0;
549 }
550
551 static void stop_p6_watchdog(void)
552 {
553         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
554
555         wrmsr(wd->evntsel_msr, 0, 0);
556
557         release_evntsel_nmi(wd->evntsel_msr);
558         release_perfctr_nmi(wd->perfctr_msr);
559 }
560
561 /* Note that these events don't tick when the CPU idles. This means
562    the frequency varies with CPU load. */
563
564 #define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
565 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
566 #define P4_ESCR_OS              (1<<3)
567 #define P4_ESCR_USR             (1<<2)
568 #define P4_CCCR_OVF_PMI0        (1<<26)
569 #define P4_CCCR_OVF_PMI1        (1<<27)
570 #define P4_CCCR_THRESHOLD(N)    ((N)<<20)
571 #define P4_CCCR_COMPLEMENT      (1<<19)
572 #define P4_CCCR_COMPARE         (1<<18)
573 #define P4_CCCR_REQUIRED        (3<<16)
574 #define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
575 #define P4_CCCR_ENABLE          (1<<12)
576 #define P4_CCCR_OVF             (1<<31)
577 /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
578    CRU_ESCR0 (with any non-null event selector) through a complemented
579    max threshold. [IA32-Vol3, Section 14.9.9] */
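/* Editorial note (illustrative only): with the defines above, that filter
 * works out to
 *	P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | P4_CCCR_COMPARE
 *		| P4_CCCR_REQUIRED == 0x00ff0000
 * so the comparison is true on every cycle regardless of the ESCR event,
 * and IQ_COUNTER0 simply counts cycles, i.e. behaves like a clock.
 */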
580
581 static int setup_p4_watchdog(void)
582 {
583         unsigned int perfctr_msr, evntsel_msr, cccr_msr;
584         unsigned int evntsel, cccr_val;
585         unsigned int misc_enable, dummy;
586         unsigned int ht_num;
587         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
588
589         rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
590         if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
591                 return 0;
592
593 #ifdef CONFIG_SMP
594         /* detect which hyperthread we are on */
595         if (smp_num_siblings == 2) {
596                 unsigned int ebx, apicid;
597
598                 ebx = cpuid_ebx(1);
599                 apicid = (ebx >> 24) & 0xff;
600                 ht_num = apicid & 1;
601         } else
602 #endif
603                 ht_num = 0;
604
605         /* performance counters are shared resources, so
606          * assign each hyperthread its own set
607          * (re-using the ESCR0 register seems safe
608          * and keeps the cccr_val the same)
609          */
610         if (!ht_num) {
611                 /* logical cpu 0 */
612                 perfctr_msr = MSR_P4_IQ_PERFCTR0;
613                 evntsel_msr = MSR_P4_CRU_ESCR0;
614                 cccr_msr = MSR_P4_IQ_CCCR0;
615                 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
616         } else {
617                 /* logical cpu 1 */
618                 perfctr_msr = MSR_P4_IQ_PERFCTR1;
619                 evntsel_msr = MSR_P4_CRU_ESCR0;
620                 cccr_msr = MSR_P4_IQ_CCCR1;
621                 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
622         }
623
624         if (!reserve_perfctr_nmi(perfctr_msr))
625                 goto fail;
626
627         if (!reserve_evntsel_nmi(evntsel_msr))
628                 goto fail1;
629
630         evntsel = P4_ESCR_EVENT_SELECT(0x3F)
631                 | P4_ESCR_OS
632                 | P4_ESCR_USR;
633
634         cccr_val |= P4_CCCR_THRESHOLD(15)
635                  | P4_CCCR_COMPLEMENT
636                  | P4_CCCR_COMPARE
637                  | P4_CCCR_REQUIRED;
638
639         wrmsr(evntsel_msr, evntsel, 0);
640         wrmsr(cccr_msr, cccr_val, 0);
641         write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
642         apic_write(APIC_LVTPC, APIC_DM_NMI);
643         cccr_val |= P4_CCCR_ENABLE;
644         wrmsr(cccr_msr, cccr_val, 0);
645         wd->perfctr_msr = perfctr_msr;
646         wd->evntsel_msr = evntsel_msr;
647         wd->cccr_msr = cccr_msr;
648         wd->check_bit = 1ULL<<39;
649         return 1;
650 fail1:
651         release_perfctr_nmi(perfctr_msr);
652 fail:
653         return 0;
654 }
655
656 static void stop_p4_watchdog(void)
657 {
658         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
659
660         wrmsr(wd->cccr_msr, 0, 0);
661         wrmsr(wd->evntsel_msr, 0, 0);
662
663         release_evntsel_nmi(wd->evntsel_msr);
664         release_perfctr_nmi(wd->perfctr_msr);
665 }
666
667 #define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
668 #define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
669
670 static int setup_intel_arch_watchdog(void)
671 {
672         unsigned int ebx;
673         union cpuid10_eax eax;
674         unsigned int unused;
675         unsigned int perfctr_msr, evntsel_msr;
676         unsigned int evntsel;
677         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
678
679         /*
680          * Check whether the Architectural PerfMon supports
681          * Unhalted Core Cycles Event or not.
682          * NOTE: Corresponding bit = 0 in ebx indicates event present.
683          */
684         cpuid(10, &(eax.full), &ebx, &unused, &unused);
685         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
686             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
687                 goto fail;
688
689         perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
690         evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
691
692         if (!reserve_perfctr_nmi(perfctr_msr))
693                 goto fail;
694
695         if (!reserve_evntsel_nmi(evntsel_msr))
696                 goto fail1;
697
698         wrmsrl(perfctr_msr, 0UL);
699
700         evntsel = ARCH_PERFMON_EVENTSEL_INT
701                 | ARCH_PERFMON_EVENTSEL_OS
702                 | ARCH_PERFMON_EVENTSEL_USR
703                 | ARCH_PERFMON_NMI_EVENT_SEL
704                 | ARCH_PERFMON_NMI_EVENT_UMASK;
705
706         /* setup the timer */
707         wrmsr(evntsel_msr, evntsel, 0);
708         write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
709         apic_write(APIC_LVTPC, APIC_DM_NMI);
710         evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
711         wrmsr(evntsel_msr, evntsel, 0);
712
713         wd->perfctr_msr = perfctr_msr;
714         wd->evntsel_msr = evntsel_msr;
715         wd->cccr_msr = 0;  //unused
716         wd->check_bit = 1ULL << (eax.split.bit_width - 1);
717         return 1;
718 fail1:
719         release_perfctr_nmi(perfctr_msr);
720 fail:
721         return 0;
722 }
723
724 static void stop_intel_arch_watchdog(void)
725 {
726         unsigned int ebx;
727         union cpuid10_eax eax;
728         unsigned int unused;
729         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
730
731         /*
732          * Check whether the Architectural PerfMon supports
733          * Unhalted Core Cycles Event or not.
734          * NOTE: Corresponding bit = 0 in ebx indicates event present.
735          */
736         cpuid(10, &(eax.full), &ebx, &unused, &unused);
737         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
738             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
739                 return;
740
741         wrmsr(wd->evntsel_msr, 0, 0);
742         release_evntsel_nmi(wd->evntsel_msr);
743         release_perfctr_nmi(wd->perfctr_msr);
744 }
745
746 void setup_apic_nmi_watchdog (void *unused)
747 {
748         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
749
750         /* only support LOCAL and IO APICs for now */
751         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
752             (nmi_watchdog != NMI_IO_APIC))
753                 return;
754
755         if (wd->enabled == 1)
756                 return;
757
758         /* cheap hack to support suspend/resume: */
759         /* if cpu0 is not active, neither should the other cpus be */
760         if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
761                 return;
762
763         if (nmi_watchdog == NMI_LOCAL_APIC) {
764                 switch (boot_cpu_data.x86_vendor) {
765                 case X86_VENDOR_AMD:
766                         if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
767                                 return;
768                         if (!setup_k7_watchdog())
769                                 return;
770                         break;
771                 case X86_VENDOR_INTEL:
772                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
773                                 if (!setup_intel_arch_watchdog())
774                                         return;
775                                 break;
776                         }
777                         switch (boot_cpu_data.x86) {
778                         case 6:
779                                 if (boot_cpu_data.x86_model > 0xd)
780                                         return;
781
782                                 if (!setup_p6_watchdog())
783                                         return;
784                                 break;
785                         case 15:
786                                 if (boot_cpu_data.x86_model > 0x4)
787                                         return;
788
789                                 if (!setup_p4_watchdog())
790                                         return;
791                                 break;
792                         default:
793                                 return;
794                         }
795                         break;
796                 default:
797                         return;
798                 }
799         }
800         wd->enabled = 1;
801         atomic_inc(&nmi_active);
802 }
803
804 void stop_apic_nmi_watchdog(void *unused)
805 {
806         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
807
808         /* only support LOCAL and IO APICs for now */
809         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
810             (nmi_watchdog != NMI_IO_APIC))
811                 return;
812
813         if (wd->enabled == 0)
814                 return;
815
816         if (nmi_watchdog == NMI_LOCAL_APIC) {
817                 switch (boot_cpu_data.x86_vendor) {
818                 case X86_VENDOR_AMD:
819                         stop_k7_watchdog();
820                         break;
821                 case X86_VENDOR_INTEL:
822                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
823                                 stop_intel_arch_watchdog();
824                                 break;
825                         }
826                         switch (boot_cpu_data.x86) {
827                         case 6:
828                                 if (boot_cpu_data.x86_model > 0xd)
829                                         break;
830                                 stop_p6_watchdog();
831                                 break;
832                         case 15:
833                                 if (boot_cpu_data.x86_model > 0x4)
834                                         break;
835                                 stop_p4_watchdog();
836                                 break;
837                         }
838                         break;
839                 default:
840                         return;
841                 }
842         }
843         wd->enabled = 0;
844         atomic_dec(&nmi_active);
845 }
846
847 /*
848  * the best way to detect whether a CPU has a 'hard lockup' problem
849  * is to check its local APIC timer IRQ counts. If they are not
850  * changing then that CPU has some problem.
851  *
852  * as these watchdog NMI IRQs are generated on every CPU, we only
853  * have to check the current processor.
854  *
855  * since NMIs don't listen to _any_ locks, we have to be extremely
856  * careful not to rely on unsafe variables. The printk might lock
857  * up though, so we have to break up any console locks first ...
858  * [when there will be more tty-related locks, break them up
859  *  here too!]
860  */
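/*
 * Editorial note (illustrative only): with the per-CPU book-keeping below,
 * a CPU is declared stuck once alert_counter[cpu] reaches 5*nmi_hz
 * consecutive watchdog NMIs during which apic_timer_irqs did not change;
 * since the watchdog fires nmi_hz times per second, that is roughly a
 * 5 second window before die_nmi() is called.
 */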
861
862 static unsigned int
863         last_irq_sums [NR_CPUS],
864         alert_counter [NR_CPUS];
865
866 void touch_nmi_watchdog (void)
867 {
868         int i;
869
870         /*
871          * Just reset the alert counters, (other CPUs might be
872          * spinning on locks we hold):
873          */
874         for_each_possible_cpu(i)
875                 alert_counter[i] = 0;
876
877         /*
878          * Tickle the softlockup detector too:
879          */
880         touch_softlockup_watchdog();
881 }
882 EXPORT_SYMBOL(touch_nmi_watchdog);
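/*
 * Editorial usage note: code that legitimately keeps a CPU busy for a long
 * time with interrupts off (e.g. a long polling loop in a driver) is
 * expected to call touch_nmi_watchdog() periodically so its alert counter
 * is reset before nmi_watchdog_tick() decides the CPU is stuck.
 */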
883
884 extern void die_nmi(struct pt_regs *, const char *msg);
885
886 __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
887 {
888
889         /*
890          * Since current_thread_info() is always on the stack, and we
891          * always switch the stack NMI-atomically, it's safe to use
892          * smp_processor_id().
893          */
894         unsigned int sum;
895         int touched = 0;
896         int cpu = smp_processor_id();
897         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
898         u64 dummy;
899         int rc=0;
900
901         /* check for other users first */
902         if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
903                         == NOTIFY_STOP) {
904                 rc = 1;
905                 touched = 1;
906         }
907
908         sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
909
910         /* if the apic timer isn't firing, this cpu isn't doing much */
911         if (!touched && last_irq_sums[cpu] == sum) {
912                 /*
913                  * Ayiee, looks like this CPU is stuck ...
914                  * wait a few IRQs (5 seconds) before doing the oops ...
915                  */
916                 alert_counter[cpu]++;
917                 if (alert_counter[cpu] == 5*nmi_hz)
918                         /*
919                          * die_nmi will return ONLY if NOTIFY_STOP happens..
920                          */
921                         die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
922         } else {
923                 last_irq_sums[cpu] = sum;
924                 alert_counter[cpu] = 0;
925         }
926         /* see if the nmi watchdog went off */
927         if (wd->enabled) {
928                 if (nmi_watchdog == NMI_LOCAL_APIC) {
929                         rdmsrl(wd->perfctr_msr, dummy);
930                         if (dummy & wd->check_bit){
931                                 /* this wasn't a watchdog timer interrupt */
932                                 goto done;
933                         }
934
935                         /* only Intel P4 uses the cccr msr */
936                         if (wd->cccr_msr != 0) {
937                                 /*
938                                  * P4 quirks:
939                                  * - An overflown perfctr will assert its interrupt
940                                  *   until the OVF flag in its CCCR is cleared.
941                                  * - LVTPC is masked on interrupt and must be
942                                  *   unmasked by the LVTPC handler.
943                                  */
944                                 rdmsrl(wd->cccr_msr, dummy);
945                                 dummy &= ~P4_CCCR_OVF;
946                                 wrmsrl(wd->cccr_msr, dummy);
947                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
948                         }
949                         else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
950                                  wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
951                                 /* P6 based Pentium M needs to re-unmask
952                                  * the apic vector but it doesn't hurt
953                                  * other P6 variants.
954                                  * ArchPerfmon/Core Duo also needs this */
955                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
956                         }
957                         /* start the cycle over again */
958                         write_watchdog_counter(wd->perfctr_msr, NULL);
959                         rc = 1;
960                 } else if (nmi_watchdog == NMI_IO_APIC) {
961                         /* don't know how to accurately check for this.
962                          * just assume it was a watchdog timer interrupt;
963                          * this matches the old behaviour.
964                          */
965                         rc = 1;
966                 }
967         }
968 done:
969         return rc;
970 }
971
972 int do_nmi_callback(struct pt_regs * regs, int cpu)
973 {
974 #ifdef CONFIG_SYSCTL
975         if (unknown_nmi_panic)
976                 return unknown_nmi_panic_callback(regs, cpu);
977 #endif
978         return 0;
979 }
980
981 #ifdef CONFIG_SYSCTL
982
983 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
984 {
985         unsigned char reason = get_nmi_reason();
986         char buf[64];
987
988         sprintf(buf, "NMI received for unknown reason %02x\n", reason);
989         die_nmi(regs, buf);
990         return 0;
991 }
992
993 /*
994  * proc handler for /proc/sys/kernel/nmi
995  */
996 int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
997                         void __user *buffer, size_t *length, loff_t *ppos)
998 {
999         int old_state;
1000
1001         nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
1002         old_state = nmi_watchdog_enabled;
1003         proc_dointvec(table, write, file, buffer, length, ppos);
1004         if (!!old_state == !!nmi_watchdog_enabled)
1005                 return 0;
1006
1007         if (atomic_read(&nmi_active) < 0) {
1008                 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
1009                 return -EIO;
1010         }
1011
1012         if (nmi_watchdog == NMI_DEFAULT) {
1013                 if (nmi_known_cpu() > 0)
1014                         nmi_watchdog = NMI_LOCAL_APIC;
1015                 else
1016                         nmi_watchdog = NMI_IO_APIC;
1017         }
1018
1019         if (nmi_watchdog == NMI_LOCAL_APIC) {
1020                 if (nmi_watchdog_enabled)
1021                         enable_lapic_nmi_watchdog();
1022                 else
1023                         disable_lapic_nmi_watchdog();
1024         } else {
1025                 printk( KERN_WARNING
1026                         "NMI watchdog doesn't know what hardware to touch\n");
1027                 return -EIO;
1028         }
1029         return 0;
1030 }
1031
1032 #endif
1033
1034 EXPORT_SYMBOL(nmi_active);
1035 EXPORT_SYMBOL(nmi_watchdog);
1036 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1037 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1038 EXPORT_SYMBOL(reserve_perfctr_nmi);
1039 EXPORT_SYMBOL(release_perfctr_nmi);
1040 EXPORT_SYMBOL(reserve_evntsel_nmi);
1041 EXPORT_SYMBOL(release_evntsel_nmi);
1042 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1043 EXPORT_SYMBOL(enable_timer_nmi_watchdog);