[PATCH] x86: Enable NMI watchdog for AMD Family 0x10 CPUs
[pandora-kernel.git] arch/i386/kernel/nmi.c
/*
 *  linux/arch/i386/nmi.c
 *
 *  NMI watchdog support on APIC systems
 *
 *  Started by Ingo Molnar <mingo@redhat.com>
 *
 *  Fixes:
 *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
 */

#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/dmi.h>
#include <linux/kprobes.h>
#include <linux/cpumask.h>

#include <asm/smp.h>
#include <asm/nmi.h>
#include <asm/kdebug.h>
#include <asm/intel_arch_perfmon.h>

#include "mach_traps.h"

int unknown_nmi_panic;
int nmi_watchdog_enabled;

/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
 * evntsel_nmi_owner tracks the ownership of the event selection
 * - different performance counters/event selections may be reserved by
 *   different subsystems; this reservation system just tries to coordinate
 *   things a little.
 */
static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);

static cpumask_t backtrace_mask = CPU_MASK_NONE;

/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
 * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
 */
#define NMI_MAX_COUNTER_BITS 66

/* nmi_active:
 * >0: the lapic NMI watchdog is active, but can be disabled
 * <0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 *  0: the lapic NMI watchdog is disabled, but can be enabled
 */
atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */

unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;

struct nmi_watchdog_ctlblk {
        int enabled;
        u64 check_bit;
        unsigned int cccr_msr;
        unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
        unsigned int evntsel_msr;  /* the MSR to select the events to handle */
};
static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);

/* local prototypes */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);

extern void show_registers(struct pt_regs *regs);
extern int unknown_nmi_panic;

/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
        /* returns the bit offset of the performance counter register */
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return (msr - MSR_K7_PERFCTR0);
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return (msr - MSR_ARCH_PERFMON_PERFCTR0);

                switch (boot_cpu_data.x86) {
                case 6:
                        return (msr - MSR_P6_PERFCTR0);
                case 15:
                        return (msr - MSR_P4_BPU_PERFCTR0);
                }
        }
        return 0;
}
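
/*
 * Example: on an AMD CPU, MSR_K7_PERFCTR0 maps to bit 0 and
 * MSR_K7_PERFCTR0 + 3 maps to bit 3 of the per-CPU perfctr_nmi_owner
 * mask; on an Intel CPU with architectural perfmon,
 * MSR_ARCH_PERFMON_PERFCTR0 likewise maps to bit 0.  The evntsel
 * variant below follows the same scheme for the event-select registers.
 */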

/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
        /* returns the bit offset of the event selection register */
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return (msr - MSR_K7_EVNTSEL0);
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return (msr - MSR_ARCH_PERFMON_EVENTSEL0);

                switch (boot_cpu_data.x86) {
                case 6:
                        return (msr - MSR_P6_EVNTSEL0);
                case 15:
                        return (msr - MSR_P4_BSU_ESCR0);
                }
        }
        return 0;
}

/* checks a counter bit for availability (hack for oprofile) */
int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
{
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}

/* checks an MSR for availability */
int avail_to_resrv_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}

int reserve_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
                return 1;
        return 0;
}

void release_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
}

int reserve_evntsel_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_evntsel_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
                return 1;
        return 0;
}

void release_evntsel_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_evntsel_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
}
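
/*
 * A minimal usage sketch (hypothetical caller, e.g. a profiling driver;
 * not code from this file): reserve the counter and its event-select
 * register as a pair, and release them again in reverse order:
 *
 *	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
 *		return -EBUSY;
 *	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
 *		release_perfctr_nmi(MSR_K7_PERFCTR0);
 *		return -EBUSY;
 *	}
 *	...
 *	release_evntsel_nmi(MSR_K7_EVNTSEL0);
 *	release_perfctr_nmi(MSR_K7_PERFCTR0);
 *
 * The setup_*_watchdog() functions below follow exactly this pattern.
 */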

static __cpuinit inline int nmi_known_cpu(void)
{
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
                        || (boot_cpu_data.x86 == 16));
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return 1;
                else
                        return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
        }
        return 0;
}
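
/*
 * For AMD this now covers family 6 (K7), family 15 (K8) and family 16,
 * i.e. Family 0x10, which is what this patch adds; for Intel it covers
 * the P6 family (6), Pentium 4 (15) and anything advertising
 * architectural perfmon.
 */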

static int endflag __initdata = 0;

#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
 * the CPU is idle. To make sure the NMI watchdog really ticks on all
 * CPUs during the test make them busy.
 */
static __init void nmi_cpu_busy(void *data)
{
        local_irq_enable_in_hardirq();
        /* Intentionally don't use cpu_relax here. This is
           to make sure that the performance counter really ticks,
           even if there is a simulator or similar that catches the
           pause instruction. On a real HT machine this is fine because
           all other CPUs are busy with "useless" delay loops and don't
           care if they get somewhat less cycles. */
        while (endflag == 0)
                mb();
}
#endif

static unsigned int adjust_for_32bit_ctr(unsigned int hz)
{
        u64 counter_val;
        unsigned int retval = hz;

        /*
         * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
         * are writable, with the higher bits sign extending from bit 31.
         * So we can only program the counter with 31-bit values; bit 31
         * must be set so that bits 32 and above sign extend to 1 as well.
         * Find the appropriate nmi_hz.
         */
        counter_val = (u64)cpu_khz * 1000;
        do_div(counter_val, retval);
        if (counter_val > 0x7fffffffULL) {
                u64 count = (u64)cpu_khz * 1000;
                do_div(count, 0x7fffffffUL);
                retval = count + 1;
        }
        return retval;
}
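
/*
 * Worked example: on a 3 GHz CPU (cpu_khz == 3000000) with nmi_hz == 1,
 * the desired period is 3,000,000,000 cycles, which does not fit in
 * 31 bits (max 0x7fffffff = 2,147,483,647).  3,000,000,000 / 0x7fffffff
 * truncates to 1, so the function returns 2 and the watchdog fires
 * twice per second with a per-period count of 1,500,000,000, which fits.
 */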

static int __init check_nmi_watchdog(void)
{
        unsigned int *prev_nmi_count;
        int cpu;

        /* Enable the NMI watchdog by default on newer systems.
           It is probably safe on most older systems too, but let's be careful.
           IBM ThinkPads use INT10 inside SMM and that allows early NMIs inside
           SMM, which hangs the system. Disable the watchdog on all ThinkPads. */
        if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
                !dmi_name_in_vendors("ThinkPad"))
                nmi_watchdog = NMI_LOCAL_APIC;

        if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
                return 0;

        if (!atomic_read(&nmi_active))
                return 0;

        prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
        if (!prev_nmi_count)
                return -1;

        printk(KERN_INFO "Testing NMI watchdog ... ");

        if (nmi_watchdog == NMI_LOCAL_APIC)
                smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);

        for_each_possible_cpu(cpu)
                prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
        local_irq_enable();
        mdelay((10*1000)/nmi_hz); // wait 10 ticks

        for_each_possible_cpu(cpu) {
#ifdef CONFIG_SMP
                /* Check cpu_callin_map here because that is set
                   after the timer is started. */
                if (!cpu_isset(cpu, cpu_callin_map))
                        continue;
#endif
                if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
                        continue;
                if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
                        printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
                                cpu,
                                prev_nmi_count[cpu],
                                nmi_count(cpu));
                        per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
                        atomic_dec(&nmi_active);
                }
        }
        if (!atomic_read(&nmi_active)) {
                kfree(prev_nmi_count);
                atomic_set(&nmi_active, -1);
                return -1;
        }
        endflag = 1;
        printk("OK.\n");

        /* now that we know it works we can reduce NMI frequency to
           something more reasonable; makes a difference in some configs */
        if (nmi_watchdog == NMI_LOCAL_APIC) {
                struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

                nmi_hz = 1;

                if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
                    wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
                        nmi_hz = adjust_for_32bit_ctr(nmi_hz);
                }
        }

        kfree(prev_nmi_count);
        return 0;
}
/* This needs to happen later in boot so counters are working */
late_initcall(check_nmi_watchdog);

static int __init setup_nmi_watchdog(char *str)
{
        int nmi;

        get_option(&str, &nmi);

        if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
                return 0;

        nmi_watchdog = nmi;
        return 1;
}

__setup("nmi_watchdog=", setup_nmi_watchdog);
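
/*
 * Usage: the watchdog mode can be forced on the kernel command line,
 * e.g. "nmi_watchdog=N" where N is one of the NMI_* values accepted
 * above (NMI_NONE, NMI_IO_APIC or NMI_LOCAL_APIC); anything outside
 * that range is rejected and the built-in default is kept.
 */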

static void disable_lapic_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

        if (atomic_read(&nmi_active) <= 0)
                return;

        on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

        BUG_ON(atomic_read(&nmi_active) != 0);
}

static void enable_lapic_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

        /* are we already enabled */
        if (atomic_read(&nmi_active) != 0)
                return;

        /* are we lapic aware */
        if (nmi_known_cpu() <= 0)
                return;

        on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
        touch_nmi_watchdog();
}

void disable_timer_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_IO_APIC);

        if (atomic_read(&nmi_active) <= 0)
                return;

        disable_irq(0);
        on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

        BUG_ON(atomic_read(&nmi_active) != 0);
}

void enable_timer_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_IO_APIC);

        if (atomic_read(&nmi_active) == 0) {
                touch_nmi_watchdog();
                on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
                enable_irq(0);
        }
}

static void __acpi_nmi_disable(void *__unused)
{
        apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
}

/*
 * Disable timer based NMIs on all CPUs:
 */
void acpi_nmi_disable(void)
{
        if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
                on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
}

static void __acpi_nmi_enable(void *__unused)
{
        apic_write_around(APIC_LVT0, APIC_DM_NMI);
}

/*
 * Enable timer based NMIs on all CPUs:
 */
void acpi_nmi_enable(void)
{
        if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
                on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
}

#ifdef CONFIG_PM

static int nmi_pm_active; /* nmi_active before suspend */

static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
        /* only CPU0 goes here, other CPUs should be offline */
        nmi_pm_active = atomic_read(&nmi_active);
        stop_apic_nmi_watchdog(NULL);
        BUG_ON(atomic_read(&nmi_active) != 0);
        return 0;
}

static int lapic_nmi_resume(struct sys_device *dev)
{
        /* only CPU0 goes here, other CPUs should be offline */
        if (nmi_pm_active > 0) {
                setup_apic_nmi_watchdog(NULL);
                touch_nmi_watchdog();
        }
        return 0;
}


static struct sysdev_class nmi_sysclass = {
        set_kset_name("lapic_nmi"),
        .resume         = lapic_nmi_resume,
        .suspend        = lapic_nmi_suspend,
};

static struct sys_device device_lapic_nmi = {
        .id     = 0,
        .cls    = &nmi_sysclass,
};

static int __init init_lapic_nmi_sysfs(void)
{
        int error;

        /* should really be a BUG_ON, but because this is an
         * init call it just doesn't work.  -dcz
         */
        if (nmi_watchdog != NMI_LOCAL_APIC)
                return 0;

        if (atomic_read(&nmi_active) < 0)
                return 0;

        error = sysdev_class_register(&nmi_sysclass);
        if (!error)
                error = sysdev_register(&device_lapic_nmi);
        return error;
}
/* must come after the local APIC's device_initcall() */
late_initcall(init_lapic_nmi_sysfs);

#endif  /* CONFIG_PM */

/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
{
        u64 count = (u64)cpu_khz * 1000;

        do_div(count, nmi_hz);
        if (descr)
                Dprintk("setting %s to -0x%08Lx\n", descr, count);
        wrmsrl(perfctr_msr, 0 - count);
}

static void write_watchdog_counter32(unsigned int perfctr_msr,
                const char *descr)
{
        u64 count = (u64)cpu_khz * 1000;

        do_div(count, nmi_hz);
        if (descr)
                Dprintk("setting %s to -0x%08Lx\n", descr, count);
        wrmsr(perfctr_msr, (u32)(-count), 0);
}

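/*
 * Example: with cpu_khz == 2000000 (a 2 GHz CPU) and nmi_hz == 1000,
 * count is 2,000,000, so the counter is programmed to -2,000,000 and
 * overflows (raising the NMI) after two million unhalted cycles,
 * i.e. roughly once per millisecond at full load.
 */
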
/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define K7_EVNTSEL_ENABLE       (1 << 22)
#define K7_EVNTSEL_INT          (1 << 20)
#define K7_EVNTSEL_OS           (1 << 17)
#define K7_EVNTSEL_USR          (1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
#define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING

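/*
 * With the flags above, setup_k7_watchdog() below programs the event
 * select register to 0x130076 (INT | OS | USR | event 0x76) and then
 * ORs in K7_EVNTSEL_ENABLE, giving 0x530076, once the counter has been
 * armed.
 */
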
static int setup_k7_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        perfctr_msr = MSR_K7_PERFCTR0;
        evntsel_msr = MSR_K7_EVNTSEL0;
        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = K7_EVNTSEL_INT
                | K7_EVNTSEL_OS
                | K7_EVNTSEL_USR
                | K7_NMI_EVENT;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= K7_EVNTSEL_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  //unused
        wd->check_bit = 1ULL<<63;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_k7_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

#define P6_EVNTSEL0_ENABLE      (1 << 22)
#define P6_EVNTSEL_INT          (1 << 20)
#define P6_EVNTSEL_OS           (1 << 17)
#define P6_EVNTSEL_USR          (1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
#define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED

static int setup_p6_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        perfctr_msr = MSR_P6_PERFCTR0;
        evntsel_msr = MSR_P6_EVNTSEL0;
        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = P6_EVNTSEL_INT
                | P6_EVNTSEL_OS
                | P6_EVNTSEL_USR
                | P6_NMI_EVENT;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        nmi_hz = adjust_for_32bit_ctr(nmi_hz);
        write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= P6_EVNTSEL0_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  //unused
        wd->check_bit = 1ULL<<39;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_p6_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
#define P4_ESCR_OS              (1<<3)
#define P4_ESCR_USR             (1<<2)
#define P4_CCCR_OVF_PMI0        (1<<26)
#define P4_CCCR_OVF_PMI1        (1<<27)
#define P4_CCCR_THRESHOLD(N)    ((N)<<20)
#define P4_CCCR_COMPLEMENT      (1<<19)
#define P4_CCCR_COMPARE         (1<<18)
#define P4_CCCR_REQUIRED        (3<<16)
#define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
#define P4_CCCR_ENABLE          (1<<12)
#define P4_CCCR_OVF             (1<<31)
/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
   CRU_ESCR0 (with any non-null event selector) through a complemented
   max threshold. [IA32-Vol3, Section 14.9.9] */

static int setup_p4_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr, cccr_msr;
        unsigned int evntsel, cccr_val;
        unsigned int misc_enable, dummy;
        unsigned int ht_num;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
        if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
                return 0;

#ifdef CONFIG_SMP
        /* detect which hyperthread we are on */
        if (smp_num_siblings == 2) {
                unsigned int ebx, apicid;

                ebx = cpuid_ebx(1);
                apicid = (ebx >> 24) & 0xff;
                ht_num = apicid & 1;
        } else
#endif
                ht_num = 0;

        /* performance counters are shared resources;
         * assign each hyperthread its own set
         * (re-using the ESCR0 register seems safe
         * and keeps the cccr_val the same)
         */
        if (!ht_num) {
                /* logical cpu 0 */
                perfctr_msr = MSR_P4_IQ_PERFCTR0;
                evntsel_msr = MSR_P4_CRU_ESCR0;
                cccr_msr = MSR_P4_IQ_CCCR0;
                cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
        } else {
                /* logical cpu 1 */
                perfctr_msr = MSR_P4_IQ_PERFCTR1;
                evntsel_msr = MSR_P4_CRU_ESCR0;
                cccr_msr = MSR_P4_IQ_CCCR1;
                cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
        }

        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        evntsel = P4_ESCR_EVENT_SELECT(0x3F)
                | P4_ESCR_OS
                | P4_ESCR_USR;

        cccr_val |= P4_CCCR_THRESHOLD(15)
                 | P4_CCCR_COMPLEMENT
                 | P4_CCCR_COMPARE
                 | P4_CCCR_REQUIRED;

        wrmsr(evntsel_msr, evntsel, 0);
        wrmsr(cccr_msr, cccr_val, 0);
        write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        cccr_val |= P4_CCCR_ENABLE;
        wrmsr(cccr_msr, cccr_val, 0);
        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = cccr_msr;
        wd->check_bit = 1ULL<<39;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}
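
/*
 * For the logical-cpu-0 case above, the resulting register values are:
 * evntsel (CRU_ESCR0) = 0x7e00000c (event select 0x3F, OS and USR set)
 * and cccr (IQ_CCCR0) = 0x04ff8000 before enabling, 0x04ff9000 once
 * P4_CCCR_ENABLE is ORed in.
 */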

static void stop_p4_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->cccr_msr, 0, 0);
        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

#define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
#define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK

static int setup_intel_arch_watchdog(void)
{
        unsigned int ebx;
        union cpuid10_eax eax;
        unsigned int unused;
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /*
         * Check whether the Architectural PerfMon supports
         * Unhalted Core Cycles Event or not.
         * NOTE: Corresponding bit = 0 in ebx indicates event present.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
            (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
                goto fail;

        perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
        evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;

        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = ARCH_PERFMON_EVENTSEL_INT
                | ARCH_PERFMON_EVENTSEL_OS
                | ARCH_PERFMON_EVENTSEL_USR
                | ARCH_PERFMON_NMI_EVENT_SEL
                | ARCH_PERFMON_NMI_EVENT_UMASK;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        nmi_hz = adjust_for_32bit_ctr(nmi_hz);
        write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  //unused
        wd->check_bit = 1ULL << (eax.split.bit_width - 1);
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}
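
/*
 * Here check_bit is derived from CPUID leaf 0xA rather than hard-coded:
 * e.g. a reported counter bit_width of 40 yields check_bit = 1ULL << 39,
 * the same bit the P6 and P4 paths above hard-code.
 */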

static void stop_intel_arch_watchdog(void)
{
        unsigned int ebx;
        union cpuid10_eax eax;
        unsigned int unused;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /*
         * Check whether the Architectural PerfMon supports
         * Unhalted Core Cycles Event or not.
         * NOTE: Corresponding bit = 0 in ebx indicates event present.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
            (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
                return;

        wrmsr(wd->evntsel_msr, 0, 0);
        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

void setup_apic_nmi_watchdog (void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /* only support LOCAL and IO APICs for now */
        if ((nmi_watchdog != NMI_LOCAL_APIC) &&
            (nmi_watchdog != NMI_IO_APIC))
                return;

        if (wd->enabled == 1)
                return;

        /* cheap hack to support suspend/resume:
         * if cpu0 is not active, neither should the other cpus be */
        if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
                return;

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                switch (boot_cpu_data.x86_vendor) {
                case X86_VENDOR_AMD:
                        if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
                                boot_cpu_data.x86 != 16)
                                return;
                        if (!setup_k7_watchdog())
                                return;
                        break;
                case X86_VENDOR_INTEL:
                        if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                                if (!setup_intel_arch_watchdog())
                                        return;
                                break;
                        }
                        switch (boot_cpu_data.x86) {
                        case 6:
                                if (boot_cpu_data.x86_model > 0xd)
                                        return;

                                if (!setup_p6_watchdog())
                                        return;
                                break;
                        case 15:
                                if (boot_cpu_data.x86_model > 0x4)
                                        return;

                                if (!setup_p4_watchdog())
                                        return;
                                break;
                        default:
                                return;
                        }
                        break;
                default:
                        return;
                }
        }
        wd->enabled = 1;
        atomic_inc(&nmi_active);
}

void stop_apic_nmi_watchdog(void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /* only support LOCAL and IO APICs for now */
        if ((nmi_watchdog != NMI_LOCAL_APIC) &&
            (nmi_watchdog != NMI_IO_APIC))
                return;

        if (wd->enabled == 0)
                return;

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                switch (boot_cpu_data.x86_vendor) {
                case X86_VENDOR_AMD:
                        stop_k7_watchdog();
                        break;
                case X86_VENDOR_INTEL:
                        if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                                stop_intel_arch_watchdog();
                                break;
                        }
                        switch (boot_cpu_data.x86) {
                        case 6:
                                if (boot_cpu_data.x86_model > 0xd)
                                        break;
                                stop_p6_watchdog();
                                break;
                        case 15:
                                if (boot_cpu_data.x86_model > 0x4)
                                        break;
                                stop_p4_watchdog();
                                break;
                        }
                        break;
                default:
                        return;
                }
        }
        wd->enabled = 0;
        atomic_dec(&nmi_active);
}

/*
 * The best way to detect whether a CPU has a 'hard lockup' problem
 * is to check its local APIC timer IRQ counts. If they are not
 * changing then that CPU has some problem.
 *
 * As these watchdog NMI IRQs are generated on every CPU, we only
 * have to check the current processor.
 *
 * Since NMIs don't listen to _any_ locks, we have to be extremely
 * careful not to rely on unsafe variables. The printk might lock
 * up though, so we have to break up any console locks first ...
 * [when there are more tty-related locks, break them up
 *  here too!]
 */

static unsigned int
        last_irq_sums [NR_CPUS],
        alert_counter [NR_CPUS];

void touch_nmi_watchdog (void)
{
        if (nmi_watchdog > 0) {
                unsigned cpu;

                /*
                 * Just reset the alert counters, (other CPUs might be
                 * spinning on locks we hold):
                 */
                for_each_present_cpu (cpu)
                        alert_counter[cpu] = 0;
        }

        /*
         * Tickle the softlockup detector too:
         */
        touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
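
/*
 * A minimal usage sketch (hypothetical driver code, not from this file):
 * code that legitimately keeps a CPU busy for a long time, for example
 * polling slow hardware with interrupts off, should poke the watchdog
 * from its loop so the stuck-CPU detection below does not fire:
 *
 *	while (!(readb(status_reg) & READY)) {
 *		touch_nmi_watchdog();
 *		udelay(10);
 *	}
 */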

extern void die_nmi(struct pt_regs *, const char *msg);

__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{

        /*
         * Since current_thread_info()-> is always on the stack, and we
         * always switch the stack NMI-atomically, it's safe to use
         * smp_processor_id().
         */
        unsigned int sum;
        int touched = 0;
        int cpu = smp_processor_id();
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
        u64 dummy;
        int rc = 0;

        /* check for other users first */
        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
                        == NOTIFY_STOP) {
                rc = 1;
                touched = 1;
        }

        if (cpu_isset(cpu, backtrace_mask)) {
                static DEFINE_SPINLOCK(lock);   /* Serialise the printks */

                spin_lock(&lock);
                printk("NMI backtrace for cpu %d\n", cpu);
                dump_stack();
                spin_unlock(&lock);
                cpu_clear(cpu, backtrace_mask);
        }

        sum = per_cpu(irq_stat, cpu).apic_timer_irqs;

        /* if the apic timer isn't firing, this cpu isn't doing much */
        if (!touched && last_irq_sums[cpu] == sum) {
                /*
                 * Ayiee, looks like this CPU is stuck ...
                 * wait a few IRQs (5 seconds) before doing the oops ...
                 */
                alert_counter[cpu]++;
                if (alert_counter[cpu] == 5*nmi_hz)
                        /*
                         * die_nmi will return ONLY if NOTIFY_STOP happens..
                         */
                        die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
        } else {
                last_irq_sums[cpu] = sum;
                alert_counter[cpu] = 0;
        }
        /* see if the nmi watchdog went off */
        if (wd->enabled) {
                if (nmi_watchdog == NMI_LOCAL_APIC) {
                        rdmsrl(wd->perfctr_msr, dummy);
                        if (dummy & wd->check_bit) {
                                /* this wasn't a watchdog timer interrupt */
                                goto done;
                        }

                        /* only Intel P4 uses the cccr msr */
                        if (wd->cccr_msr != 0) {
                                /*
                                 * P4 quirks:
                                 * - An overflown perfctr will assert its interrupt
                                 *   until the OVF flag in its CCCR is cleared.
                                 * - LVTPC is masked on interrupt and must be
                                 *   unmasked by the LVTPC handler.
                                 */
                                rdmsrl(wd->cccr_msr, dummy);
                                dummy &= ~P4_CCCR_OVF;
                                wrmsrl(wd->cccr_msr, dummy);
                                apic_write(APIC_LVTPC, APIC_DM_NMI);
                                /* start the cycle over again */
                                write_watchdog_counter(wd->perfctr_msr, NULL);
                        }
                        else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
                                 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
                                /* P6-based Pentium M needs to re-unmask
                                 * the apic vector, but it doesn't hurt
                                 * other P6 variants.
                                 * ArchPerfmon/Core Duo also needs this */
                                apic_write(APIC_LVTPC, APIC_DM_NMI);
                                /* P6/ARCH_PERFMON has 32 bit counter write */
                                write_watchdog_counter32(wd->perfctr_msr, NULL);
                        } else {
                                /* start the cycle over again */
                                write_watchdog_counter(wd->perfctr_msr, NULL);
                        }
                        rc = 1;
                } else if (nmi_watchdog == NMI_IO_APIC) {
                        /* We don't know how to accurately check for this,
                         * so just assume it was a watchdog timer interrupt.
                         * This matches the old behaviour.
                         */
                        rc = 1;
                }
        }
done:
        return rc;
}
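
/*
 * Timing example: alert_counter must reach 5*nmi_hz consecutive NMIs
 * with apic_timer_irqs unchanged, i.e. roughly five seconds of apparent
 * inactivity at whatever rate check_nmi_watchdog() settled on, before
 * die_nmi() is called.
 */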

int do_nmi_callback(struct pt_regs * regs, int cpu)
{
#ifdef CONFIG_SYSCTL
        if (unknown_nmi_panic)
                return unknown_nmi_panic_callback(regs, cpu);
#endif
        return 0;
}

#ifdef CONFIG_SYSCTL

static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
        unsigned char reason = get_nmi_reason();
        char buf[64];

        sprintf(buf, "NMI received for unknown reason %02x\n", reason);
        die_nmi(regs, buf);
        return 0;
}

/*
 * proc handler for /proc/sys/kernel/nmi
 */
int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
                        void __user *buffer, size_t *length, loff_t *ppos)
{
        int old_state;

        nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
        old_state = nmi_watchdog_enabled;
        proc_dointvec(table, write, file, buffer, length, ppos);
        if (!!old_state == !!nmi_watchdog_enabled)
                return 0;

        if (atomic_read(&nmi_active) < 0) {
                printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
                return -EIO;
        }

        if (nmi_watchdog == NMI_DEFAULT) {
                if (nmi_known_cpu() > 0)
                        nmi_watchdog = NMI_LOCAL_APIC;
                else
                        nmi_watchdog = NMI_IO_APIC;
        }

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                if (nmi_watchdog_enabled)
                        enable_lapic_nmi_watchdog();
                else
                        disable_lapic_nmi_watchdog();
        } else {
                printk(KERN_WARNING
                        "NMI watchdog doesn't know what hardware to touch\n");
                return -EIO;
        }
        return 0;
}
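
/*
 * Runtime usage example (assuming the sysctl is exposed as described in
 * the comment above): writing 0 to the proc file disables the lapic
 * watchdog on all CPUs via disable_lapic_nmi_watchdog() when
 * NMI_LOCAL_APIC is in use, and writing 1 re-enables it; if nmi_active
 * is negative the watchdog was never successfully set up and -EIO is
 * returned instead.
 */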

#endif

void __trigger_all_cpu_backtrace(void)
{
        int i;

        backtrace_mask = cpu_online_map;
        /* Wait for up to 10 seconds for all CPUs to do the backtrace */
        for (i = 0; i < 10 * 1000; i++) {
                if (cpus_empty(backtrace_mask))
                        break;
                mdelay(1);
        }
}
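
/*
 * The protocol here is simple: fill backtrace_mask with every online
 * CPU, let each CPU clear its own bit from nmi_watchdog_tick() once it
 * has printed its stack, and give up after ~10 seconds on CPUs whose
 * NMIs never arrive (e.g. with the watchdog disabled).
 */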

EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
EXPORT_SYMBOL(reserve_perfctr_nmi);
EXPORT_SYMBOL(release_perfctr_nmi);
EXPORT_SYMBOL(reserve_evntsel_nmi);
EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);