/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227

static int mce_dont_init;
/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int mce_bootlog;
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from other kernel messages to avoid bogus bug
 * reports.
 */

struct mce_log mcelog = {
void mce_log(struct mce *mce)
        entry = rcu_dereference(mcelog.next);
        /* The rmb forces the compiler to reload next in each
           iteration. */
        /* When the buffer fills up, discard new entries; assume
           that the earlier errors are the more interesting ones. */
        if (entry >= MCE_LOG_LEN) {
                set_bit(MCE_OVERFLOW, &mcelog.flags);
        /* Old leftover entry. Skip it. */
        if (mcelog.entry[entry].finished) {
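                /* Reserve the slot lock-free: the cmpxchg() below succeeds
                   only for the CPU that still sees the old value of
                   mcelog.next, so concurrent loggers each claim a distinct
                   entry without taking any lock. */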
                if (cmpxchg(&mcelog.next, entry, next) == entry)

        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        mcelog.entry[entry].finished = 1;
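        /* test_and_set_bit() makes sure only the first event of a batch
           arms the user notification; mcheck_timer() clears the bit again
           after printing "Machine check events logged". */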
        if (!test_and_set_bit(0, &console_logged))

static void print_mce(struct mce *m)
        printk(KERN_EMERG "\n"
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
               "RIP%s %02x:<%016Lx> ",
               !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
        if (m->cs == __KERNEL_CS)
                print_symbol("{%s}", m->rip);
        printk(KERN_EMERG "TSC %Lx ", m->tsc);
                printk("ADDR %Lx ", m->addr);
                printk("MISC %Lx ", m->misc);
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
        for (i = 0; i < MCE_LOG_LEN; i++) {
                unsigned long tsc = mcelog.entry[i].tsc;
                if (time_before(tsc, start))
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)

        printk("Fake panic: %s\n", msg);
static int mce_available(struct cpuinfo_x86 *c)
        return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
               test_bit(X86_FEATURE_MCA, &c->x86_capability);
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->rip);
/*
 * The actual machine check handler
 */
void do_machine_check(struct pt_regs *regs, long error_code)
        struct mce m, panicm;
        int nowayout = (tolerant < 1);
        int panicm_found = 0;

        notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);

        memset(&m, 0, sizeof(struct mce));
        m.cpu = hard_smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        if (!(m.mcgstatus & MCG_STATUS_RIPV))

        for (i = 0; i < banks; i++) {
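                /* Each bank owns four consecutive MSRs starting at
                   MSR_IA32_MC0_CTL: CTL, STATUS, ADDR, MISC - hence the
                   i*4 stride in the rdmsrl()/wrmsrl() calls below. */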
                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)

                if (m.status & MCI_STATUS_EN) {
                        /* In theory _OVER could be a nowayout too, but
                           assume any overflowed errors were not fatal. */
                        nowayout |= !!(m.status & MCI_STATUS_PCC);
                        kill_it |= !!(m.status & MCI_STATUS_UC);

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
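                /* By convention in this file (see the mce_init() call
                   below), error_code -1 means "log errors left over from
                   before boot" and -2 means "just clear the banks, do not
                   log". */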
                if (error_code != -2)

                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
                   and that there is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {

                add_taint(TAINT_MACHINE_CHECK);

        /* Never do anything final in the polling timer */

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */

                mce_panic("Machine check", &panicm, mcestart);
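        /* The low two bits of a code segment selector hold the privilege
           level, so (panicm.cs & 3) != 0 means the interrupted code was
           running in user mode. */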
        if (m.mcgstatus & MCG_STATUS_RIPV)
                user_space = panicm.rip && (panicm.cs & 3);

        /* When the machine was in user space and the CPU didn't get
           confused it's normally not necessary to panic, unless you
           are paranoid (tolerant == 0).

           RED-PEN: could be more tolerant for MCEs in idle, but most
           likely they occur at boot anyway, where it is best to just
           halt the machine. */
        if ((!user_space && (panic_on_oops || tolerant < 2)) ||
            (unsigned)current->pid <= 1)
                mce_panic("Uncorrected machine check", &panicm, mcestart);

        /* do_exit takes an awful lot of locks and has a slight risk
           of deadlocking. If you don't want that, don't set
           tolerant >= 2. */

        /* Last thing done in the machine check exception to clear state. */
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
        if (mce_available(&current_cpu_data))
                do_machine_check(NULL, 0);

static void mcheck_timer(void *data)
        on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
        schedule_delayed_work(&mcheck_work, check_interval * HZ);
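        /* The work requeues itself above, so polling keeps running
           periodically without a dedicated kernel timer. */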
        /*
         * It's ok to read stale data here for notify_user and
         * console_logged, as we'll simply get the updated versions
         * on the next mcheck_timer execution, and atomic operations
         * on console_logged act as synchronization for notify_user
         * writes.
         */
        if (notify_user && console_logged) {
                clear_bit(0, &console_logged);
                printk(KERN_INFO "Machine check events logged\n");
static __init int periodic_mcheck_init(void)
                schedule_delayed_work(&mcheck_work, check_interval*HZ);
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (banks > NR_BANKS) {
                printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);

        /* Use accurate RIP reporting if available. */
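        /* MCG_CAP bit 9 (MCG_EXT_P) advertises extended machine check
           state MSRs; bits 23:16 (MCG_EXT_CNT) say how many exist. MCG_EIP
           is only trusted here when there are at least 9 of them. */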
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        /* Log the machine checks left over from the previous reset.
           This also clears all registers. */
        do_machine_check(NULL, mce_bootlog ? -1 : -2);

        set_in_cr4(X86_CR4_MCE);

        wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
                /* Disable GART TBL walk error reporting, which triggers
                   incorrectly with the IOMMU & 3ware & Cerberus. */
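                /* Bank 4 is the K8 northbridge MCA bank; bit 10 of its
                   control register enables GART TLB walk error reporting. */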
                clear_bit(10, &bank[4]);

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                mce_amd_feature_init(c);
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
        static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

        cpu_test_and_set(smp_processor_id(), mce_cpus) ||
/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
        unsigned long *cpu_tsc = (unsigned long *)data;
        rdtscll(cpu_tsc[smp_processor_id()]);
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        loff_t *off)
        unsigned long *cpu_tsc;
        static DECLARE_MUTEX(mce_read_sem);
        char __user *buf = ubuf;

        cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);

        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {

        for (i = 0; i < next; i++) {
                unsigned long start = jiffies;
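                /* Give the writer up to 2 jiffies to set ->finished; if it
                   never does (e.g. the logging CPU died mid-update), zap the
                   half-written entry rather than blocking here forever. */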
                while (!mcelog.entry[i].finished) {
                        if (!time_before(jiffies, start + 2)) {
                                memset(mcelog.entry + i, 0, sizeof(struct mce));
                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
                buf += sizeof(struct mce);

        memset(mcelog.entry, 0, next * sizeof(struct mce));

        /* Collect entries that were still getting written before the
           synchronize. */
        on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i,
                                            sizeof(struct mce));
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));

        return err ? -EFAULT : buf - ubuf;
static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd,
                     unsigned long arg)
        int __user *p = (int __user *)arg;

        if (!capable(CAP_SYS_ADMIN))

        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
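                /* Read-and-clear the flags atomically: retry until the
                   cmpxchg() sees the same value we just read, i.e. nobody
                   updated mcelog.flags in between. */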
                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
                return put_user(flags, p);

static struct file_operations mce_chrdev_ops = {

static struct miscdevice mce_log_device = {
/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)

/* mce=off disables machine check. Note you can reenable it later
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
        if (!strcmp(str, "off"))
        else if (!strcmp(str, "bootlog"))
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else
                printk("mce= argument %s ignored. Please use /sys\n", str);

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
        cancel_delayed_work(&mcheck_work);
        /* Timer race is harmless here */
        on_each_cpu(mce_init, NULL, 1, 1);
        schedule_delayed_work(&mcheck_work, check_interval*HZ);
static struct sysdev_class mce_sysclass = {
        .resume = mce_resume,
        set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
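/* Each ACCESSOR(name, var, hook) expands to a show_<name>/set_<name> pair
   plus the matching sysdev attribute; 'hook' (e.g. mce_restart()) runs
   after a successful write so the new value takes effect immediately. */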
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
                return sprintf(buf, "%lx\n", (unsigned long)var); \
        } \
        static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
                char *end; \
                unsigned long new = simple_strtoul(buf, &end, 0); \
                if (end == buf) return -EINVAL; \
                var = new; \
                start; \
                return end-buf; \
        } \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(tolerant, tolerant, )
ACCESSOR(check_interval, check_interval, mce_restart())
/* Per CPU sysdev init. All of the CPUs still share the same ctl banks. */
static __cpuinit int mce_create_device(unsigned int cpu)
        if (!mce_available(&cpu_data[cpu]))

        per_cpu(device_mce, cpu).id = cpu;
        per_cpu(device_mce, cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(device_mce, cpu));

        sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank0ctl);
        sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank1ctl);
        sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank2ctl);
        sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank3ctl);
        sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank4ctl);
        sysdev_create_file(&per_cpu(device_mce, cpu), &attr_tolerant);
        sysdev_create_file(&per_cpu(device_mce, cpu), &attr_check_interval);
#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank0ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank1ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank2ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank3ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank4ctl);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_tolerant);
        sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_check_interval);
        sysdev_unregister(&per_cpu(device_mce, cpu));
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
        unsigned int cpu = (unsigned long)hcpu;

                mce_create_device(cpu);
#ifdef CONFIG_HOTPLUG_CPU
                mce_remove_device(cpu);

static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
static __init int mce_init_device(void)
        if (!mce_available(&boot_cpu_data))

        err = sysdev_class_register(&mce_sysclass);

        for_each_online_cpu(i) {
                mce_create_device(i);

        register_cpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);

device_initcall(mce_init_device);