1 /*
2  *  linux/arch/i386/kernel/nmi.c
3  *
4  *  NMI watchdog support on APIC systems
5  *
6  *  Started by Ingo Molnar <mingo@redhat.com>
7  *
8  *  Fixes:
9  *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
10  *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
11  *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
12  *  Pavel Machek and
13  *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
14  */
15
16 #include <linux/delay.h>
17 #include <linux/interrupt.h>
18 #include <linux/module.h>
19 #include <linux/nmi.h>
20 #include <linux/sysdev.h>
21 #include <linux/sysctl.h>
22 #include <linux/percpu.h>
23 #include <linux/dmi.h>
24 #include <linux/kprobes.h>
25 #include <linux/cpumask.h>
26 #include <linux/kernel_stat.h>
27
28 #include <asm/smp.h>
29 #include <asm/nmi.h>
30 #include <asm/kdebug.h>
31 #include <asm/intel_arch_perfmon.h>
32
33 #include "mach_traps.h"
34
35 int unknown_nmi_panic;
36 int nmi_watchdog_enabled;
37
38 /* perfctr_nmi_owner tracks the ownership of the perfctr registers;
39  * evntsel_nmi_owner tracks the ownership of the event selection MSRs.
40  * Different performance counters / event selection MSRs may be reserved
41  * by different subsystems; this reservation system just tries to
42  * coordinate things a little.
43  */
44 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
45 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
46
47 static cpumask_t backtrace_mask = CPU_MASK_NONE;
48
49 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
50  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
51  */
52 #define NMI_MAX_COUNTER_BITS 66
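/*
 * Sizing note: NMI_MAX_COUNTER_BITS covers the largest evntsel bit offset
 * (the P4 ESCR MSRs span a wide range), which is why evntsel_nmi_owner
 * above is an array of three 32-bit longs.  The perfctr MSR offsets used
 * below all stay under 32, so a single long suffices for perfctr_nmi_owner.
 */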
53
54 /* nmi_active:
55  * >0: the lapic NMI watchdog is active, but can be disabled
56  * <0: the lapic NMI watchdog has not been set up, and cannot
57  *     be enabled
58  *  0: the lapic NMI watchdog is disabled, but can be enabled
59  */
60 atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
61
62 unsigned int nmi_watchdog = NMI_DEFAULT;
63 static unsigned int nmi_hz = HZ;
64
65 struct nmi_watchdog_ctlblk {
66         int enabled;
67         u64 check_bit;
68         unsigned int cccr_msr;
69         unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
70         unsigned int evntsel_msr;  /* the MSR to select the events to handle */
71 };
72 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
73
74 /* local prototypes */
75 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
76
77 extern void show_registers(struct pt_regs *regs);
78 extern int unknown_nmi_panic;
79
80 /* converts an msr to an appropriate reservation bit */
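/* For example, on AMD this maps MSR_K7_PERFCTR0..MSR_K7_PERFCTR3 to bits
 * 0..3 of perfctr_nmi_owner, since the K7 counter MSRs are contiguous;
 * the Intel branches below do the same base-relative calculation. */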
81 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
82 {
83         /* returns the bit offset of the performance counter register */
84         switch (boot_cpu_data.x86_vendor) {
85         case X86_VENDOR_AMD:
86                 return (msr - MSR_K7_PERFCTR0);
87         case X86_VENDOR_INTEL:
88                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
89                         return (msr - MSR_ARCH_PERFMON_PERFCTR0);
90
91                 switch (boot_cpu_data.x86) {
92                 case 6:
93                         return (msr - MSR_P6_PERFCTR0);
94                 case 15:
95                         return (msr - MSR_P4_BPU_PERFCTR0);
96                 }
97         }
98         return 0;
99 }
100
101 /* converts an msr to an appropriate reservation bit */
102 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
103 {
104         /* returns the bit offset of the event selection register */
105         switch (boot_cpu_data.x86_vendor) {
106         case X86_VENDOR_AMD:
107                 return (msr - MSR_K7_EVNTSEL0);
108         case X86_VENDOR_INTEL:
109                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
110                         return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
111
112                 switch (boot_cpu_data.x86) {
113                 case 6:
114                         return (msr - MSR_P6_EVNTSEL0);
115                 case 15:
116                         return (msr - MSR_P4_BSU_ESCR0);
117                 }
118         }
119         return 0;
120 }
121
122 /* checks whether a counter bit is available (hack for oprofile) */
123 int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
124 {
125         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
128 }
129
130 /* checks whether an msr is available for reservation */
131 int avail_to_resrv_perfctr_nmi(unsigned int msr)
132 {
133         unsigned int counter;
134
135         counter = nmi_perfctr_msr_to_bit(msr);
136         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
137
138         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
139 }
140
141 int reserve_perfctr_nmi(unsigned int msr)
142 {
143         unsigned int counter;
144
145         counter = nmi_perfctr_msr_to_bit(msr);
146         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
147
148         if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
149                 return 1;
150         return 0;
151 }
152
153 void release_perfctr_nmi(unsigned int msr)
154 {
155         unsigned int counter;
156
157         counter = nmi_perfctr_msr_to_bit(msr);
158         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
159
160         clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
161 }
162
163 int reserve_evntsel_nmi(unsigned int msr)
164 {
165         unsigned int counter;
166
167         counter = nmi_evntsel_msr_to_bit(msr);
168         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
169
170         if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
171                 return 1;
172         return 0;
173 }
174
175 void release_evntsel_nmi(unsigned int msr)
176 {
177         unsigned int counter;
178
179         counter = nmi_evntsel_msr_to_bit(msr);
180         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
181
182         clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
183 }
184
185 static __cpuinit inline int nmi_known_cpu(void)
186 {
187         switch (boot_cpu_data.x86_vendor) {
188         case X86_VENDOR_AMD:
189                 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
190                         || (boot_cpu_data.x86 == 16));
191         case X86_VENDOR_INTEL:
192                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
193                         return 1;
194                 else
195                         return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
196         }
197         return 0;
198 }
199
200 static int endflag __initdata = 0;
201
202 #ifdef CONFIG_SMP
203 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
204  * the CPU is idle. To make sure the NMI watchdog really ticks on all
205  * CPUs during the test make them busy.
206  */
207 static __init void nmi_cpu_busy(void *data)
208 {
209         local_irq_enable_in_hardirq();
210         /* Intentionally don't use cpu_relax here. This is
211            to make sure that the performance counter really ticks,
212            even if there is a simulator or similar that catches the
213            pause instruction. On a real HT machine this is fine because
214            all other CPUs are busy with "useless" delay loops and don't
215            care if they get somewhat fewer cycles. */
216         while (endflag == 0)
217                 mb();
218 }
219 #endif
220
221 static unsigned int adjust_for_32bit_ctr(unsigned int hz)
222 {
223         u64 counter_val;
224         unsigned int retval = hz;
225
226         /*
227          * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
228          * are writable, with higher bits sign extending from bit 31.
229          * So, we can only program the counter with 31 bit values; bit 31
230          * must be set so that bits 32..63 read back as all ones.
231          * Find the appropriate nmi_hz
232          */
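        /*
         * Rough example: with cpu_khz around 3000000 (a ~3 GHz CPU) and
         * nmi_hz == 1, the per-tick count of ~3e9 cycles exceeds
         * 0x7fffffff, so retval is bumped to 2 NMIs per second to keep
         * each programmed value within 31 bits.
         */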
233         counter_val = (u64)cpu_khz * 1000;
234         do_div(counter_val, retval);
235         if (counter_val > 0x7fffffffULL) {
236                 u64 count = (u64)cpu_khz * 1000;
237                 do_div(count, 0x7fffffffUL);
238                 retval = count + 1;
239         }
240         return retval;
241 }
242
243 static int __init check_nmi_watchdog(void)
244 {
245         unsigned int *prev_nmi_count;
246         int cpu;
247
248         /* Enable the NMI watchdog by default on newer systems.
249            Probably safe on most older systems too, but let's be careful.
250            IBM ThinkPads use INT10 inside SMM, and an early NMI arriving inside
251            SMM hangs the system, so disable the watchdog on all ThinkPads. */
252         if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
253                 !dmi_name_in_vendors("ThinkPad"))
254                 nmi_watchdog = NMI_LOCAL_APIC;
255
256         if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
257                 return 0;
258
259         if (!atomic_read(&nmi_active))
260                 return 0;
261
262         prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
263         if (!prev_nmi_count)
264                 return -1;
265
266         printk(KERN_INFO "Testing NMI watchdog ... ");
267
268         if (nmi_watchdog == NMI_LOCAL_APIC)
269                 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
270
271         for_each_possible_cpu(cpu)
272                 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
273         local_irq_enable();
274                 mdelay((10*1000)/nmi_hz); /* wait 10 ticks */
275
276         for_each_possible_cpu(cpu) {
277 #ifdef CONFIG_SMP
278                 /* Check cpu_callin_map here because that is set
279                    after the timer is started. */
280                 if (!cpu_isset(cpu, cpu_callin_map))
281                         continue;
282 #endif
283                 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
284                         continue;
285                 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
286                         printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
287                                 cpu,
288                                 prev_nmi_count[cpu],
289                                 nmi_count(cpu));
290                         per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
291                         atomic_dec(&nmi_active);
292                 }
293         }
294         endflag = 1;    /* let the nmi_cpu_busy() loops on the other CPUs exit */
295         if (!atomic_read(&nmi_active)) {
296                 kfree(prev_nmi_count);
297                 atomic_set(&nmi_active, -1);
298                 return -1;
299         }
300         printk("OK.\n");
301
302         /* now that we know it works we can reduce NMI frequency to
303            something more reasonable; makes a difference in some configs */
304         if (nmi_watchdog == NMI_LOCAL_APIC) {
305                 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
306
307                 nmi_hz = 1;
308
309                 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
310                     wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
311                         nmi_hz = adjust_for_32bit_ctr(nmi_hz);
312                 }
313         }
314
315         kfree(prev_nmi_count);
316         return 0;
317 }
318 /* This needs to happen later in boot so counters are working */
319 late_initcall(check_nmi_watchdog);
320
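/*
 * Parser for the "nmi_watchdog=" boot parameter (registered via __setup()
 * below).  The value must be one of the NMI_* constants from <asm/nmi.h>;
 * conventionally (see Documentation/nmi_watchdog.txt) nmi_watchdog=1
 * selects the IO-APIC based watchdog, nmi_watchdog=2 the local APIC one,
 * and nmi_watchdog=0 disables it.
 */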
321 static int __init setup_nmi_watchdog(char *str)
322 {
323         int nmi;
324
325         get_option(&str, &nmi);
326
327         if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
328                 return 0;
329
330         nmi_watchdog = nmi;
331         return 1;
332 }
333
334 __setup("nmi_watchdog=", setup_nmi_watchdog);
335
336 static void disable_lapic_nmi_watchdog(void)
337 {
338         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
339
340         if (atomic_read(&nmi_active) <= 0)
341                 return;
342
343         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
344
345         BUG_ON(atomic_read(&nmi_active) != 0);
346 }
347
348 static void enable_lapic_nmi_watchdog(void)
349 {
350         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
351
352         /* are we already enabled */
353         if (atomic_read(&nmi_active) != 0)
354                 return;
355
356         /* are we lapic aware */
357         if (nmi_known_cpu() <= 0)
358                 return;
359
360         on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
361         touch_nmi_watchdog();
362 }
363
364 void disable_timer_nmi_watchdog(void)
365 {
366         BUG_ON(nmi_watchdog != NMI_IO_APIC);
367
368         if (atomic_read(&nmi_active) <= 0)
369                 return;
370
371         disable_irq(0);
372         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
373
374         BUG_ON(atomic_read(&nmi_active) != 0);
375 }
376
377 void enable_timer_nmi_watchdog(void)
378 {
379         BUG_ON(nmi_watchdog != NMI_IO_APIC);
380
381         if (atomic_read(&nmi_active) == 0) {
382                 touch_nmi_watchdog();
383                 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
384                 enable_irq(0);
385         }
386 }
387
388 static void __acpi_nmi_disable(void *__unused)
389 {
390         apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
391 }
392
393 /*
394  * Disable timer based NMIs on all CPUs:
395  */
396 void acpi_nmi_disable(void)
397 {
398         if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
399                 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
400 }
401
402 static void __acpi_nmi_enable(void *__unused)
403 {
404         apic_write_around(APIC_LVT0, APIC_DM_NMI);
405 }
406
407 /*
408  * Enable timer based NMIs on all CPUs:
409  */
410 void acpi_nmi_enable(void)
411 {
412         if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
413                 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
414 }
415
416 #ifdef CONFIG_PM
417
418 static int nmi_pm_active; /* nmi_active before suspend */
419
420 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
421 {
422         /* only CPU0 goes here, other CPUs should be offline */
423         nmi_pm_active = atomic_read(&nmi_active);
424         stop_apic_nmi_watchdog(NULL);
425         BUG_ON(atomic_read(&nmi_active) != 0);
426         return 0;
427 }
428
429 static int lapic_nmi_resume(struct sys_device *dev)
430 {
431         /* only CPU0 goes here, other CPUs should be offline */
432         if (nmi_pm_active > 0) {
433                 setup_apic_nmi_watchdog(NULL);
434                 touch_nmi_watchdog();
435         }
436         return 0;
437 }
438
439
440 static struct sysdev_class nmi_sysclass = {
441         set_kset_name("lapic_nmi"),
442         .resume         = lapic_nmi_resume,
443         .suspend        = lapic_nmi_suspend,
444 };
445
446 static struct sys_device device_lapic_nmi = {
447         .id     = 0,
448         .cls    = &nmi_sysclass,
449 };
450
451 static int __init init_lapic_nmi_sysfs(void)
452 {
453         int error;
454
455         /* should really be a BUG_ON but because this is an
456          * init call, it just doesn't work.  -dcz
457          */
458         if (nmi_watchdog != NMI_LOCAL_APIC)
459                 return 0;
460
461         if (atomic_read(&nmi_active) < 0)
462                 return 0;
463
464         error = sysdev_class_register(&nmi_sysclass);
465         if (!error)
466                 error = sysdev_register(&device_lapic_nmi);
467         return error;
468 }
469 /* must come after the local APIC's device_initcall() */
470 late_initcall(init_lapic_nmi_sysfs);
471
472 #endif  /* CONFIG_PM */
473
474 /*
475  * Activate the NMI watchdog via the local APIC.
476  * Original code written by Keith Owens.
477  */
478
479 static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
480 {
481         u64 count = (u64)cpu_khz * 1000;
482
483         do_div(count, nmi_hz);
484         if (descr)
485                 Dprintk("setting %s to -0x%08Lx\n", descr, count);
486         wrmsrl(perfctr_msr, 0 - count);
487 }
488
489 static void write_watchdog_counter32(unsigned int perfctr_msr,
490                 const char *descr)
491 {
492         u64 count = (u64)cpu_khz * 1000;
493
494         do_div(count, nmi_hz);
495         if (descr)
496                 Dprintk("setting %s to -0x%08Lx\n", descr, count);
497         wrmsr(perfctr_msr, (u32)(-count), 0);
498 }
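/*
 * Both helpers above implement the usual perfctr watchdog trick: the
 * counter is loaded with -(cpu_khz * 1000 / nmi_hz), so after roughly
 * that many event ticks it overflows, the overflow raises a performance
 * monitoring interrupt, and LVTPC (programmed to APIC_DM_NMI in the
 * setup routines below) delivers it as an NMI about nmi_hz times per
 * second.
 */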
499
500 /* Note that these events don't tick when the CPU idles. This means
501    the frequency varies with CPU load. */
502
503 #define K7_EVNTSEL_ENABLE       (1 << 22)
504 #define K7_EVNTSEL_INT          (1 << 20)
505 #define K7_EVNTSEL_OS           (1 << 17)
506 #define K7_EVNTSEL_USR          (1 << 16)
507 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
508 #define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
509
510 static int setup_k7_watchdog(void)
511 {
512         unsigned int perfctr_msr, evntsel_msr;
513         unsigned int evntsel;
514         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
515
516         perfctr_msr = MSR_K7_PERFCTR0;
517         evntsel_msr = MSR_K7_EVNTSEL0;
518         if (!reserve_perfctr_nmi(perfctr_msr))
519                 goto fail;
520
521         if (!reserve_evntsel_nmi(evntsel_msr))
522                 goto fail1;
523
524         wrmsrl(perfctr_msr, 0UL);
525
526         evntsel = K7_EVNTSEL_INT
527                 | K7_EVNTSEL_OS
528                 | K7_EVNTSEL_USR
529                 | K7_NMI_EVENT;
530
531         /* setup the timer */
532         wrmsr(evntsel_msr, evntsel, 0);
533         write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
534         apic_write(APIC_LVTPC, APIC_DM_NMI);
535         evntsel |= K7_EVNTSEL_ENABLE;
536         wrmsr(evntsel_msr, evntsel, 0);
537
538         wd->perfctr_msr = perfctr_msr;
539         wd->evntsel_msr = evntsel_msr;
540         wd->cccr_msr = 0;  /* unused */
541         wd->check_bit = 1ULL<<63;
542         return 1;
543 fail1:
544         release_perfctr_nmi(perfctr_msr);
545 fail:
546         return 0;
547 }
548
549 static void stop_k7_watchdog(void)
550 {
551         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
552
553         wrmsr(wd->evntsel_msr, 0, 0);
554
555         release_evntsel_nmi(wd->evntsel_msr);
556         release_perfctr_nmi(wd->perfctr_msr);
557 }
558
559 #define P6_EVNTSEL0_ENABLE      (1 << 22)
560 #define P6_EVNTSEL_INT          (1 << 20)
561 #define P6_EVNTSEL_OS           (1 << 17)
562 #define P6_EVNTSEL_USR          (1 << 16)
563 #define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
564 #define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED
565
566 static int setup_p6_watchdog(void)
567 {
568         unsigned int perfctr_msr, evntsel_msr;
569         unsigned int evntsel;
570         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
571
572         perfctr_msr = MSR_P6_PERFCTR0;
573         evntsel_msr = MSR_P6_EVNTSEL0;
574         if (!reserve_perfctr_nmi(perfctr_msr))
575                 goto fail;
576
577         if (!reserve_evntsel_nmi(evntsel_msr))
578                 goto fail1;
579
580         wrmsrl(perfctr_msr, 0UL);
581
582         evntsel = P6_EVNTSEL_INT
583                 | P6_EVNTSEL_OS
584                 | P6_EVNTSEL_USR
585                 | P6_NMI_EVENT;
586
587         /* setup the timer */
588         wrmsr(evntsel_msr, evntsel, 0);
589         nmi_hz = adjust_for_32bit_ctr(nmi_hz);
590         write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
591         apic_write(APIC_LVTPC, APIC_DM_NMI);
592         evntsel |= P6_EVNTSEL0_ENABLE;
593         wrmsr(evntsel_msr, evntsel, 0);
594
595         wd->perfctr_msr = perfctr_msr;
596         wd->evntsel_msr = evntsel_msr;
597         wd->cccr_msr = 0;  /* unused */
598         wd->check_bit = 1ULL<<39;
599         return 1;
600 fail1:
601         release_perfctr_nmi(perfctr_msr);
602 fail:
603         return 0;
604 }
605
606 static void stop_p6_watchdog(void)
607 {
608         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
609
610         wrmsr(wd->evntsel_msr, 0, 0);
611
612         release_evntsel_nmi(wd->evntsel_msr);
613         release_perfctr_nmi(wd->perfctr_msr);
614 }
615
616 /* Note that these events don't tick when the CPU idles. This means
617    the frequency varies with CPU load. */
618
619 #define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
620 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
621 #define P4_ESCR_OS              (1<<3)
622 #define P4_ESCR_USR             (1<<2)
623 #define P4_CCCR_OVF_PMI0        (1<<26)
624 #define P4_CCCR_OVF_PMI1        (1<<27)
625 #define P4_CCCR_THRESHOLD(N)    ((N)<<20)
626 #define P4_CCCR_COMPLEMENT      (1<<19)
627 #define P4_CCCR_COMPARE         (1<<18)
628 #define P4_CCCR_REQUIRED        (3<<16)
629 #define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
630 #define P4_CCCR_ENABLE          (1<<12)
631 #define P4_CCCR_OVF             (1<<31)
632 /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
633    CRU_ESCR0 (with any non-null event selector) through a complemented
634    max threshold. [IA32-Vol3, Section 14.9.9] */
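/* Roughly: with COMPARE and COMPLEMENT set and a threshold of 15, the
   CCCR increments IQ_COUNTER0 on every cycle in which the ESCR event
   count is <= 15, which in practice is every cycle (see the SDM section
   cited above). */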
635
636 static int setup_p4_watchdog(void)
637 {
638         unsigned int perfctr_msr, evntsel_msr, cccr_msr;
639         unsigned int evntsel, cccr_val;
640         unsigned int misc_enable, dummy;
641         unsigned int ht_num;
642         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
643
644         rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
645         if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
646                 return 0;
647
648 #ifdef CONFIG_SMP
649         /* detect which hyperthread we are on */
650         if (smp_num_siblings == 2) {
651                 unsigned int ebx, apicid;
652
653                 ebx = cpuid_ebx(1);
654                 apicid = (ebx >> 24) & 0xff;
655                 ht_num = apicid & 1;
656         } else
657 #endif
658                 ht_num = 0;
659
660         /* performance counters are shared resources
661          * assign each hyperthread its own set
662          * (re-use the ESCR0 register, seems safe
663          * and keeps the cccr_val the same)
664          */
665         if (!ht_num) {
666                 /* logical cpu 0 */
667                 perfctr_msr = MSR_P4_IQ_PERFCTR0;
668                 evntsel_msr = MSR_P4_CRU_ESCR0;
669                 cccr_msr = MSR_P4_IQ_CCCR0;
670                 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
671         } else {
672                 /* logical cpu 1 */
673                 perfctr_msr = MSR_P4_IQ_PERFCTR1;
674                 evntsel_msr = MSR_P4_CRU_ESCR0;
675                 cccr_msr = MSR_P4_IQ_CCCR1;
676                 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
677         }
678
679         if (!reserve_perfctr_nmi(perfctr_msr))
680                 goto fail;
681
682         if (!reserve_evntsel_nmi(evntsel_msr))
683                 goto fail1;
684
685         evntsel = P4_ESCR_EVENT_SELECT(0x3F)
686                 | P4_ESCR_OS
687                 | P4_ESCR_USR;
688
689         cccr_val |= P4_CCCR_THRESHOLD(15)
690                  | P4_CCCR_COMPLEMENT
691                  | P4_CCCR_COMPARE
692                  | P4_CCCR_REQUIRED;
693
694         wrmsr(evntsel_msr, evntsel, 0);
695         wrmsr(cccr_msr, cccr_val, 0);
696         write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
697         apic_write(APIC_LVTPC, APIC_DM_NMI);
698         cccr_val |= P4_CCCR_ENABLE;
699         wrmsr(cccr_msr, cccr_val, 0);
700         wd->perfctr_msr = perfctr_msr;
701         wd->evntsel_msr = evntsel_msr;
702         wd->cccr_msr = cccr_msr;
703         wd->check_bit = 1ULL<<39;
704         return 1;
705 fail1:
706         release_perfctr_nmi(perfctr_msr);
707 fail:
708         return 0;
709 }
710
711 static void stop_p4_watchdog(void)
712 {
713         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
714
715         wrmsr(wd->cccr_msr, 0, 0);
716         wrmsr(wd->evntsel_msr, 0, 0);
717
718         release_evntsel_nmi(wd->evntsel_msr);
719         release_perfctr_nmi(wd->perfctr_msr);
720 }
721
722 #define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
723 #define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
724
725 static int setup_intel_arch_watchdog(void)
726 {
727         unsigned int ebx;
728         union cpuid10_eax eax;
729         unsigned int unused;
730         unsigned int perfctr_msr, evntsel_msr;
731         unsigned int evntsel;
732         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
733
734         /*
735          * Check whether the Architectural PerfMon supports
736          * Unhalted Core Cycles Event or not.
737          * NOTE: Corresponding bit = 0 in ebx indicates event present.
738          */
739         cpuid(10, &(eax.full), &ebx, &unused, &unused);
740         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
741             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
742                 goto fail;
743
744         perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
745         evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
746
747         if (!reserve_perfctr_nmi(perfctr_msr))
748                 goto fail;
749
750         if (!reserve_evntsel_nmi(evntsel_msr))
751                 goto fail1;
752
753         wrmsrl(perfctr_msr, 0UL);
754
755         evntsel = ARCH_PERFMON_EVENTSEL_INT
756                 | ARCH_PERFMON_EVENTSEL_OS
757                 | ARCH_PERFMON_EVENTSEL_USR
758                 | ARCH_PERFMON_NMI_EVENT_SEL
759                 | ARCH_PERFMON_NMI_EVENT_UMASK;
760
761         /* setup the timer */
762         wrmsr(evntsel_msr, evntsel, 0);
763         nmi_hz = adjust_for_32bit_ctr(nmi_hz);
764         write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
765         apic_write(APIC_LVTPC, APIC_DM_NMI);
766         evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
767         wrmsr(evntsel_msr, evntsel, 0);
768
769         wd->perfctr_msr = perfctr_msr;
770         wd->evntsel_msr = evntsel_msr;
771         wd->cccr_msr = 0;  /* unused */
772         wd->check_bit = 1ULL << (eax.split.bit_width - 1);
773         return 1;
774 fail1:
775         release_perfctr_nmi(perfctr_msr);
776 fail:
777         return 0;
778 }
779
780 static void stop_intel_arch_watchdog(void)
781 {
782         unsigned int ebx;
783         union cpuid10_eax eax;
784         unsigned int unused;
785         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
786
787         /*
788          * Check whether the Architectural PerfMon supports
789          * Unhalted Core Cycles Event or not.
790          * NOTE: Corresponding bit = 0 in ebx indicates event present.
791          */
792         cpuid(10, &(eax.full), &ebx, &unused, &unused);
793         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
794             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
795                 return;
796
797         wrmsr(wd->evntsel_msr, 0, 0);
798         release_evntsel_nmi(wd->evntsel_msr);
799         release_perfctr_nmi(wd->perfctr_msr);
800 }
801
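/*
 * Per-CPU setup entry point.  For NMI_LOCAL_APIC this picks the model
 * specific backend: K7-style counters on AMD families 6/15/16, the
 * architectural perfmon path on Intel CPUs that advertise it, P6
 * counters up to model 0xd and P4 counters up to model 4.  NMI_IO_APIC
 * needs no per-CPU MSR setup, so only the bookkeeping at the end runs.
 */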
802 void setup_apic_nmi_watchdog (void *unused)
803 {
804         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
805
806         /* only support LOCAL and IO APICs for now */
807         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
808             (nmi_watchdog != NMI_IO_APIC))
809                 return;
810
811         if (wd->enabled == 1)
812                 return;
813
814         /* cheap hack to support suspend/resume */
815         /* if cpu0 is not active neither should the other cpus */
816         if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
817                 return;
818
819         if (nmi_watchdog == NMI_LOCAL_APIC) {
820                 switch (boot_cpu_data.x86_vendor) {
821                 case X86_VENDOR_AMD:
822                         if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
823                                 boot_cpu_data.x86 != 16)
824                                 return;
825                         if (!setup_k7_watchdog())
826                                 return;
827                         break;
828                 case X86_VENDOR_INTEL:
829                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
830                                 if (!setup_intel_arch_watchdog())
831                                         return;
832                                 break;
833                         }
834                         switch (boot_cpu_data.x86) {
835                         case 6:
836                                 if (boot_cpu_data.x86_model > 0xd)
837                                         return;
838
839                                 if (!setup_p6_watchdog())
840                                         return;
841                                 break;
842                         case 15:
843                                 if (boot_cpu_data.x86_model > 0x4)
844                                         return;
845
846                                 if (!setup_p4_watchdog())
847                                         return;
848                                 break;
849                         default:
850                                 return;
851                         }
852                         break;
853                 default:
854                         return;
855                 }
856         }
857         wd->enabled = 1;
858         atomic_inc(&nmi_active);
859 }
860
861 void stop_apic_nmi_watchdog(void *unused)
862 {
863         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
864
865         /* only support LOCAL and IO APICs for now */
866         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
867             (nmi_watchdog != NMI_IO_APIC))
868                 return;
869
870         if (wd->enabled == 0)
871                 return;
872
873         if (nmi_watchdog == NMI_LOCAL_APIC) {
874                 switch (boot_cpu_data.x86_vendor) {
875                 case X86_VENDOR_AMD:
876                         stop_k7_watchdog();
877                         break;
878                 case X86_VENDOR_INTEL:
879                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
880                                 stop_intel_arch_watchdog();
881                                 break;
882                         }
883                         switch (boot_cpu_data.x86) {
884                         case 6:
885                                 if (boot_cpu_data.x86_model > 0xd)
886                                         break;
887                                 stop_p6_watchdog();
888                                 break;
889                         case 15:
890                                 if (boot_cpu_data.x86_model > 0x4)
891                                         break;
892                                 stop_p4_watchdog();
893                                 break;
894                         }
895                         break;
896                 default:
897                         return;
898                 }
899         }
900         wd->enabled = 0;
901         atomic_dec(&nmi_active);
902 }
903
904 /*
905  * the best way to detect whether a CPU has a 'hard lockup' problem
906  * is to check its local APIC timer IRQ counts. If they are not
907  * changing then that CPU has some problem.
908  *
909  * as these watchdog NMI IRQs are generated on every CPU, we only
910  * have to check the current processor.
911  *
912  * since NMIs don't listen to _any_ locks, we have to be extremely
913  * careful not to rely on unsafe variables. The printk might lock
914  * up though, so we have to break up any console locks first ...
915  * [when there will be more tty-related locks, break them up
916  *  here too!]
917  */
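/*
 * Concretely: every watchdog NMI sums this CPU's local APIC timer count
 * and its irq0 (PIT/HPET) count.  If the sum has not moved for 5*nmi_hz
 * consecutive NMIs (about five seconds), die_nmi() is called; any change
 * resets alert_counter.
 */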
918
919 static unsigned int
920         last_irq_sums [NR_CPUS],
921         alert_counter [NR_CPUS];
922
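/*
 * Meant for code that legitimately keeps interrupts off or spins for a
 * long time (long console writes, slow flash operations and the like),
 * so that the quiet period is not mistaken for a lockup.
 */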
923 void touch_nmi_watchdog (void)
924 {
925         if (nmi_watchdog > 0) {
926                 unsigned cpu;
927
928                 /*
929                  * Just reset the alert counters, (other CPUs might be
930                  * spinning on locks we hold):
931                  */
932                 for_each_present_cpu (cpu)
933                         alert_counter[cpu] = 0;
934         }
935
936         /*
937          * Tickle the softlockup detector too:
938          */
939         touch_softlockup_watchdog();
940 }
941 EXPORT_SYMBOL(touch_nmi_watchdog);
942
943 extern void die_nmi(struct pt_regs *, const char *msg);
944
945 __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
946 {
947
948         /*
949          * Since current_thread_info() is always on the stack, and we
950          * always switch the stack NMI-atomically, it's safe to use
951          * smp_processor_id().
952          */
953         unsigned int sum;
954         int touched = 0;
955         int cpu = smp_processor_id();
956         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
957         u64 dummy;
958         int rc = 0;
959
960         /* check for other users first */
961         if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
962                         == NOTIFY_STOP) {
963                 rc = 1;
964                 touched = 1;
965         }
966
967         if (cpu_isset(cpu, backtrace_mask)) {
968                 static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
969
970                 spin_lock(&lock);
971                 printk("NMI backtrace for cpu %d\n", cpu);
972                 dump_stack();
973                 spin_unlock(&lock);
974                 cpu_clear(cpu, backtrace_mask);
975         }
976
977         /*
978          * Take the local apic timer and PIT/HPET into account. We don't
979          * know which one is active, when we have highres/dyntick on
980          */
981         sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
982
983         /* if none of the timers is firing, this cpu isn't doing much */
984         if (!touched && last_irq_sums[cpu] == sum) {
985                 /*
986                  * Ayiee, looks like this CPU is stuck ...
987                  * wait a few IRQs (5 seconds) before doing the oops ...
988                  */
989                 alert_counter[cpu]++;
990                 if (alert_counter[cpu] == 5*nmi_hz)
991                         /*
992                          * die_nmi will return ONLY if NOTIFY_STOP happens..
993                          */
994                         die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
995         } else {
996                 last_irq_sums[cpu] = sum;
997                 alert_counter[cpu] = 0;
998         }
999         /* see if the nmi watchdog went off */
1000         if (wd->enabled) {
1001                 if (nmi_watchdog == NMI_LOCAL_APIC) {
1002                         rdmsrl(wd->perfctr_msr, dummy);
1003                         if (dummy & wd->check_bit) {
1004                                 /* this wasn't a watchdog timer interrupt */
1005                                 goto done;
1006                         }
1007
1008                         /* only Intel P4 uses the cccr msr */
1009                         if (wd->cccr_msr != 0) {
1010                                 /*
1011                                  * P4 quirks:
1012                                  * - An overflown perfctr will assert its interrupt
1013                                  *   until the OVF flag in its CCCR is cleared.
1014                                  * - LVTPC is masked on interrupt and must be
1015                                  *   unmasked by the LVTPC handler.
1016                                  */
1017                                 rdmsrl(wd->cccr_msr, dummy);
1018                                 dummy &= ~P4_CCCR_OVF;
1019                                 wrmsrl(wd->cccr_msr, dummy);
1020                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
1021                                 /* start the cycle over again */
1022                                 write_watchdog_counter(wd->perfctr_msr, NULL);
1023                         }
1024                         else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
1025                                  wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
1026                                 /* P6 based Pentium M needs to re-unmask
1027                                  * the apic vector but it doesn't hurt
1028                                  * other P6 variants.
1029                                  * ArchPerfmon/Core Duo also needs this */
1030                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
1031                                 /* P6/ARCH_PERFMON has 32 bit counter write */
1032                                 write_watchdog_counter32(wd->perfctr_msr, NULL);
1033                         } else {
1034                                 /* start the cycle over again */
1035                                 write_watchdog_counter(wd->perfctr_msr, NULL);
1036                         }
1037                         rc = 1;
1038                 } else if (nmi_watchdog == NMI_IO_APIC) {
1039                         /* don't know how to accurately check for this.
1040                          * just assume it was a watchdog timer interrupt.
1041                          * This matches the old behaviour.
1042                          */
1043                         rc = 1;
1044                 }
1045         }
1046 done:
1047         return rc;
1048 }
1049
1050 int do_nmi_callback(struct pt_regs * regs, int cpu)
1051 {
1052 #ifdef CONFIG_SYSCTL
1053         if (unknown_nmi_panic)
1054                 return unknown_nmi_panic_callback(regs, cpu);
1055 #endif
1056         return 0;
1057 }
1058
1059 #ifdef CONFIG_SYSCTL
1060
1061 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
1062 {
1063         unsigned char reason = get_nmi_reason();
1064         char buf[64];
1065
1066         sprintf(buf, "NMI received for unknown reason %02x\n", reason);
1067         die_nmi(regs, buf);
1068         return 0;
1069 }
1070
1071 /*
1072  * proc handler for /proc/sys/kernel/nmi
1073  */
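/*
 * Assuming the usual sysctl wiring in kernel/sysctl.c, this backs the
 * "nmi_watchdog" entry under /proc/sys/kernel: writing 1 enables the
 * watchdog (choosing NMI_LOCAL_APIC or NMI_IO_APIC if it is still
 * NMI_DEFAULT), writing 0 disables it, and a watchdog that failed its
 * boot-time test (nmi_active < 0) reports -EIO.
 */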
1074 int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1075                         void __user *buffer, size_t *length, loff_t *ppos)
1076 {
1077         int old_state;
1078
1079         nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
1080         old_state = nmi_watchdog_enabled;
1081         proc_dointvec(table, write, file, buffer, length, ppos);
1082         if (!!old_state == !!nmi_watchdog_enabled)
1083                 return 0;
1084
1085         if (atomic_read(&nmi_active) < 0) {
1086                 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
1087                 return -EIO;
1088         }
1089
1090         if (nmi_watchdog == NMI_DEFAULT) {
1091                 if (nmi_known_cpu() > 0)
1092                         nmi_watchdog = NMI_LOCAL_APIC;
1093                 else
1094                         nmi_watchdog = NMI_IO_APIC;
1095         }
1096
1097         if (nmi_watchdog == NMI_LOCAL_APIC) {
1098                 if (nmi_watchdog_enabled)
1099                         enable_lapic_nmi_watchdog();
1100                 else
1101                         disable_lapic_nmi_watchdog();
1102         } else {
1103                 printk( KERN_WARNING
1104                         "NMI watchdog doesn't know what hardware to touch\n");
1105                 return -EIO;
1106         }
1107         return 0;
1108 }
1109
1110 #endif
1111
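/*
 * Ask every online CPU for a backtrace: set its bit in backtrace_mask
 * and let the next watchdog NMI on that CPU notice the bit, dump the
 * stack and clear it (see nmi_watchdog_tick above), while we poll here
 * for up to ten seconds until the mask drains.
 */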
1112 void __trigger_all_cpu_backtrace(void)
1113 {
1114         int i;
1115
1116         backtrace_mask = cpu_online_map;
1117         /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1118         for (i = 0; i < 10 * 1000; i++) {
1119                 if (cpus_empty(backtrace_mask))
1120                         break;
1121                 mdelay(1);
1122         }
1123 }
1124
1125 EXPORT_SYMBOL(nmi_active);
1126 EXPORT_SYMBOL(nmi_watchdog);
1127 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1128 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1129 EXPORT_SYMBOL(reserve_perfctr_nmi);
1130 EXPORT_SYMBOL(release_perfctr_nmi);
1131 EXPORT_SYMBOL(reserve_evntsel_nmi);
1132 EXPORT_SYMBOL(release_evntsel_nmi);
1133 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1134 EXPORT_SYMBOL(enable_timer_nmi_watchdog);