arch/x86/kernel/cpu/mcheck/mce_64.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  */
   7
   8 #include <linux/init.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/sched.h>
  12 #include <linux/string.h>
  13 #include <linux/rcupdate.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/sysdev.h>
  16 #include <linux/miscdevice.h>
  17 #include <linux/fs.h>
  18 #include <linux/capability.h>
  19 #include <linux/cpu.h>
  20 #include <linux/percpu.h>
  21 #include <linux/poll.h>
  22 #include <linux/thread_info.h>
  23 #include <linux/ctype.h>
  24 #include <linux/kmod.h>
  25 #include <linux/kdebug.h>
  26 #include <asm/processor.h>
  27 #include <asm/msr.h>
  28 #include <asm/mce.h>
  29 #include <asm/uaccess.h>
  30 #include <asm/smp.h>
  31 #include <asm/idle.h>
  32
/* Minor number under which the "mcelog" misc character device is registered. */
#define MISC_MCELOG_MINOR 227
/* Number of per-bank control words kept in bank[] and exported as bankNctl. */
#define NR_SYSFS_BANKS 6

/* Incremented on entry to do_machine_check() and decremented when it leaves. */
atomic_t mce_entry;

/* Set by "nomce" and "mce=off"; mcheck_init() then skips all setup. */
static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
/* Number of banks handled; read from MCG_CAP and capped in mce_init(). */
static int banks;
/* Control word written to each of the first NR_SYSFS_BANKS banks by
   mce_init(); do_machine_check() skips a bank whose word is zero. */
static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
/* Bit 0 is set by mce_log() and consumed by mce_notify_user(). */
static unsigned long notify_user;
/* MSR that mce_get_rip() reads the IP from; 0 when mce_init() selected none. */
static int rip_msr;
/* Non-zero: log machine checks left over from before boot; 0: drop them.
   Starts at -1 so mce_cpu_quirks() can tell that no choice was made yet. */
static int mce_bootlog = -1;
/* Incremented by every call to mce_log(). */
static atomic_t mce_events;

/* Path of the user mode helper started by mce_notify_user(); an empty
   string means no helper is run. */
static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

/* Woken by mce_notify_user(); mce_poll() registers waiters on it. */
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  59
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */

/* The global record buffer: mce_log() appends to it, mce_read() drains it. */
static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
  70
  71 void mce_log(struct mce *mce)
  72 {
  73         unsigned next, entry;
  74         atomic_inc(&mce_events);
  75         mce->finished = 0;
  76         wmb();
  77         for (;;) {
  78                 entry = rcu_dereference(mcelog.next);
  79                 for (;;) {
  80                         /* When the buffer fills up discard new entries. Assume
  81                            that the earlier errors are the more interesting. */
  82                         if (entry >= MCE_LOG_LEN) {
  83                                 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
  84                                 return;
  85                         }
  86                         /* Old left over entry. Skip. */
  87                         if (mcelog.entry[entry].finished) {
  88                                 entry++;
  89                                 continue;
  90                         }
  91                         break;
  92                 }
  93                 smp_rmb();
  94                 next = entry + 1;
  95                 if (cmpxchg(&mcelog.next, entry, next) == entry)
  96                         break;
  97         }
  98         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
  99         wmb();
 100         mcelog.entry[entry].finished = 1;
 101         wmb();
 102
 103         set_bit(0, &notify_user);
 104 }
 105
 106 static void print_mce(struct mce *m)
 107 {
 108         printk(KERN_EMERG "\n"
 109                KERN_EMERG "HARDWARE ERROR\n"
 110                KERN_EMERG
 111                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 112                m->cpu, m->mcgstatus, m->bank, m->status);
 113         if (m->ip) {
 114                 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
 115                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 116                        m->cs, m->ip);
 117                 if (m->cs == __KERNEL_CS)
 118                         print_symbol("{%s}", m->ip);
 119                 printk("\n");
 120         }
 121         printk(KERN_EMERG "TSC %Lx ", m->tsc);
 122         if (m->addr)
 123                 printk("ADDR %Lx ", m->addr);
 124         if (m->misc)
 125                 printk("MISC %Lx ", m->misc);
 126         printk("\n");
 127         printk(KERN_EMERG "This is not a software problem!\n");
 128         printk(KERN_EMERG "Run through mcelog --ascii to decode "
 129                "and contact your hardware vendor\n");
 130 }
 131
 132 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 133 {
 134         int i;
 135
 136         oops_begin();
 137         for (i = 0; i < MCE_LOG_LEN; i++) {
 138                 unsigned long tsc = mcelog.entry[i].tsc;
 139
 140                 if (time_before(tsc, start))
 141                         continue;
 142                 print_mce(&mcelog.entry[i]);
 143                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 144                         backup = NULL;
 145         }
 146         if (backup)
 147                 print_mce(backup);
 148         panic(msg);
 149 }
 150
 151 static int mce_available(struct cpuinfo_x86 *c)
 152 {
 153         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 154 }
 155
 156 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 157 {
 158         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 159                 m->ip = regs->ip;
 160                 m->cs = regs->cs;
 161         } else {
 162                 m->ip = 0;
 163                 m->cs = 0;
 164         }
 165         if (rip_msr) {
 166                 /* Assume the RIP in the MSR is exact. Is this true? */
 167                 m->mcgstatus |= MCG_STATUS_EIPV;
 168                 rdmsrl(rip_msr, m->ip);
 169                 m->cs = 0;
 170         }
 171 }
 172
 173 /*
 174  * The actual machine check handler
 175  */
 176 void do_machine_check(struct pt_regs * regs, long error_code)
 177 {
 178         struct mce m, panicm;
 179         u64 mcestart = 0;
 180         int i;
 181         int panicm_found = 0;
 182         /*
 183          * If no_way_out gets set, there is no safe way to recover from this
 184          * MCE.  If tolerant is cranked up, we'll try anyway.
 185          */
 186         int no_way_out = 0;
 187         /*
 188          * If kill_it gets set, there might be a way to recover from this
 189          * error.
 190          */
 191         int kill_it = 0;
 192
 193         atomic_inc(&mce_entry);
 194
 195         if ((regs
 196              && notify_die(DIE_NMI, "machine check", regs, error_code,
 197                            18, SIGKILL) == NOTIFY_STOP)
 198             || !banks)
 199                 goto out2;
 200
 201         memset(&m, 0, sizeof(struct mce));
 202         m.cpu = smp_processor_id();
 203         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 204         /* if the restart IP is not valid, we're done for */
 205         if (!(m.mcgstatus & MCG_STATUS_RIPV))
 206                 no_way_out = 1;
 207
 208         rdtscll(mcestart);
 209         barrier();
 210
 211         for (i = 0; i < banks; i++) {
 212                 if (i < NR_SYSFS_BANKS && !bank[i])
 213                         continue;
 214
 215                 m.misc = 0;
 216                 m.addr = 0;
 217                 m.bank = i;
 218                 m.tsc = 0;
 219
 220                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 221                 if ((m.status & MCI_STATUS_VAL) == 0)
 222                         continue;
 223
 224                 if (m.status & MCI_STATUS_EN) {
 225                         /* if PCC was set, there's no way out */
 226                         no_way_out |= !!(m.status & MCI_STATUS_PCC);
 227                         /*
 228                          * If this error was uncorrectable and there was
 229                          * an overflow, we're in trouble.  If no overflow,
 230                          * we might get away with just killing a task.
 231                          */
 232                         if (m.status & MCI_STATUS_UC) {
 233                                 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
 234                                         no_way_out = 1;
 235                                 kill_it = 1;
 236                         }
 237                 }
 238
 239                 if (m.status & MCI_STATUS_MISCV)
 240                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 241                 if (m.status & MCI_STATUS_ADDRV)
 242                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 243
 244                 mce_get_rip(&m, regs);
 245                 if (error_code >= 0)
 246                         rdtscll(m.tsc);
 247                 if (error_code != -2)
 248                         mce_log(&m);
 249
 250                 /* Did this bank cause the exception? */
 251                 /* Assume that the bank with uncorrectable errors did it,
 252                    and that there is only a single one. */
 253                 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
 254                         panicm = m;
 255                         panicm_found = 1;
 256                 }
 257
 258                 add_taint(TAINT_MACHINE_CHECK);
 259         }
 260
 261         /* Never do anything final in the polling timer */
 262         if (!regs)
 263                 goto out;
 264
 265         /* If we didn't find an uncorrectable error, pick
 266            the last one (shouldn't happen, just being safe). */
 267         if (!panicm_found)
 268                 panicm = m;
 269
 270         /*
 271          * If we have decided that we just CAN'T continue, and the user
 272          *  has not set tolerant to an insane level, give up and die.
 273          */
 274         if (no_way_out && tolerant < 3)
 275                 mce_panic("Machine check", &panicm, mcestart);
 276
 277         /*
 278          * If the error seems to be unrecoverable, something should be
 279          * done.  Try to kill as little as possible.  If we can kill just
 280          * one task, do that.  If the user has set the tolerance very
 281          * high, don't try to do anything at all.
 282          */
 283         if (kill_it && tolerant < 3) {
 284                 int user_space = 0;
 285
 286                 /*
 287                  * If the EIPV bit is set, it means the saved IP is the
 288                  * instruction which caused the MCE.
 289                  */
 290                 if (m.mcgstatus & MCG_STATUS_EIPV)
 291                         user_space = panicm.ip && (panicm.cs & 3);
 292
 293                 /*
 294                  * If we know that the error was in user space, send a
 295                  * SIGBUS.  Otherwise, panic if tolerance is low.
 296                  *
 297                  * do_exit() takes an awful lot of locks and has a slight
 298                  * risk of deadlocking.
 299                  */
 300                 if (user_space) {
 301                         do_exit(SIGBUS);
 302                 } else if (panic_on_oops || tolerant < 2) {
 303                         mce_panic("Uncorrected machine check",
 304                                 &panicm, mcestart);
 305                 }
 306         }
 307
 308         /* notify userspace ASAP */
 309         set_thread_flag(TIF_MCE_NOTIFY);
 310
 311  out:
 312         /* the last thing we do is clear state */
 313         for (i = 0; i < banks; i++)
 314                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 315         wrmsrl(MSR_IA32_MCG_STATUS, 0);
 316  out2:
 317         atomic_dec(&mce_entry);
 318 }
 319
 320 #ifdef CONFIG_X86_MCE_INTEL
 321 /***
 322  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 323  * @cpu: The CPU on which the event occurred.
 324  * @status: Event status information
 325  *
 326  * This function should be called by the thermal interrupt after the
 327  * event has been processed and the decision was made to log the event
 328  * further.
 329  *
 330  * The status parameter will be saved to the 'status' field of 'struct mce'
 331  * and historically has been the register value of the
 332  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 333  */
 334 void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 335 {
 336         struct mce m;
 337
 338         memset(&m, 0, sizeof(m));
 339         m.cpu = cpu;
 340         m.bank = MCE_THERMAL_BANK;
 341         m.status = status;
 342         rdtscll(m.tsc);
 343         mce_log(&m);
 344 }
 345 #endif /* CONFIG_X86_MCE_INTEL */
 346
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

/* Longest polling period in seconds; setting it to 0 keeps the poller
   from being scheduled by periodic_mcheck_init() and mce_restart(). */
static int check_interval = 5 * 60; /* 5 minutes */
/* Delay until the next poll; adjusted by mcheck_timer() after every run. */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
/* Delayed work item that runs mcheck_timer(). */
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
 357
 358 static void mcheck_check_cpu(void *info)
 359 {
 360         if (mce_available(&current_cpu_data))
 361                 do_machine_check(NULL, 0);
 362 }
 363
 364 static void mcheck_timer(struct work_struct *work)
 365 {
 366         on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 367
 368         /*
 369          * Alert userspace if needed.  If we logged an MCE, reduce the
 370          * polling interval, otherwise increase the polling interval.
 371          */
 372         if (mce_notify_user()) {
 373                 next_interval = max(next_interval/2, HZ/100);
 374         } else {
 375                 next_interval = min(next_interval * 2,
 376                                 (int)round_jiffies_relative(check_interval*HZ));
 377         }
 378
 379         schedule_delayed_work(&mcheck_work, next_interval);
 380 }
 381
 382 /*
 383  * This is only called from process context.  This is where we do
 384  * anything we need to alert userspace about new MCEs.  This is called
 385  * directly from the poller and also from entry.S and idle, thanks to
 386  * TIF_MCE_NOTIFY.
 387  */
 388 int mce_notify_user(void)
 389 {
 390         clear_thread_flag(TIF_MCE_NOTIFY);
 391         if (test_and_clear_bit(0, &notify_user)) {
 392                 static unsigned long last_print;
 393                 unsigned long now = jiffies;
 394
 395                 wake_up_interruptible(&mce_wait);
 396                 if (trigger[0])
 397                         call_usermodehelper(trigger, trigger_argv, NULL,
 398                                                 UMH_NO_WAIT);
 399
 400                 if (time_after_eq(now, last_print + (check_interval*HZ))) {
 401                         last_print = now;
 402                         printk(KERN_INFO "Machine check events logged\n");
 403                 }
 404
 405                 return 1;
 406         }
 407         return 0;
 408 }
 409
 410 /* see if the idle task needs to notify userspace */
 411 static int
 412 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
 413 {
 414         /* IDLE_END should be safe - interrupts are back on */
 415         if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
 416                 mce_notify_user();
 417
 418         return NOTIFY_OK;
 419 }
 420
 421 static struct notifier_block mce_idle_notifier = {
 422         .notifier_call = mce_idle_callback,
 423 };
 424
 425 static __init int periodic_mcheck_init(void)
 426 {
 427         next_interval = check_interval * HZ;
 428         if (next_interval)
 429                 schedule_delayed_work(&mcheck_work,
 430                                       round_jiffies_relative(next_interval));
 431         idle_notifier_register(&mce_idle_notifier);
 432         return 0;
 433 }
 434 __initcall(periodic_mcheck_init);
 435
 436
 437 /*
 438  * Initialize Machine Checks for a CPU.
 439  */
 440 static void mce_init(void *dummy)
 441 {
 442         u64 cap;
 443         int i;
 444
 445         rdmsrl(MSR_IA32_MCG_CAP, cap);
 446         banks = cap & 0xff;
 447         if (banks > MCE_EXTENDED_BANK) {
 448                 banks = MCE_EXTENDED_BANK;
 449                 printk(KERN_INFO "MCE: warning: using only %d banks\n",
 450                        MCE_EXTENDED_BANK);
 451         }
 452         /* Use accurate RIP reporting if available. */
 453         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 454                 rip_msr = MSR_IA32_MCG_EIP;
 455
 456         /* Log the machine checks left over from the previous reset.
 457            This also clears all registers */
 458         do_machine_check(NULL, mce_bootlog ? -1 : -2);
 459
 460         set_in_cr4(X86_CR4_MCE);
 461
 462         if (cap & MCG_CTL_P)
 463                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 464
 465         for (i = 0; i < banks; i++) {
 466                 if (i < NR_SYSFS_BANKS)
 467                         wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 468                 else
 469                         wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
 470
 471                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 472         }
 473 }
 474
 475 /* Add per CPU specific workarounds here */
 476 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 477 {
 478         /* This should be disabled by the BIOS, but isn't always */
 479         if (c->x86_vendor == X86_VENDOR_AMD) {
 480                 if(c->x86 == 15)
 481                         /* disable GART TBL walk error reporting, which trips off
 482                            incorrectly with the IOMMU & 3ware & Cerberus. */
 483                         clear_bit(10, &bank[4]);
 484                 if(c->x86 <= 17 && mce_bootlog < 0)
 485                         /* Lots of broken BIOS around that don't clear them
 486                            by default and leave crap in there. Don't log. */
 487                         mce_bootlog = 0;
 488         }
 489
 490 }
 491
 492 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 493 {
 494         switch (c->x86_vendor) {
 495         case X86_VENDOR_INTEL:
 496                 mce_intel_feature_init(c);
 497                 break;
 498         case X86_VENDOR_AMD:
 499                 mce_amd_feature_init(c);
 500                 break;
 501         default:
 502                 break;
 503         }
 504 }
 505
 506 /*
 507  * Called for each booted CPU to set up machine checks.
 508  * Must be called with preempt off.
 509  */
 510 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 511 {
 512         static cpumask_t mce_cpus = CPU_MASK_NONE;
 513
 514         mce_cpu_quirks(c);
 515
 516         if (mce_dont_init ||
 517             cpu_test_and_set(smp_processor_id(), mce_cpus) ||
 518             !mce_available(c))
 519                 return;
 520
 521         mce_init(NULL);
 522         mce_cpu_features(c);
 523 }
 524
/*
 * Character device to read and clear the MCE log.
 */

/* Protects open_count and open_exclu in mce_open() and mce_release(). */
static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */
 532
 533 static int mce_open(struct inode *inode, struct file *file)
 534 {
 535         spin_lock(&mce_state_lock);
 536
 537         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 538                 spin_unlock(&mce_state_lock);
 539                 return -EBUSY;
 540         }
 541
 542         if (file->f_flags & O_EXCL)
 543                 open_exclu = 1;
 544         open_count++;
 545
 546         spin_unlock(&mce_state_lock);
 547
 548         return nonseekable_open(inode, file);
 549 }
 550
 551 static int mce_release(struct inode *inode, struct file *file)
 552 {
 553         spin_lock(&mce_state_lock);
 554
 555         open_count--;
 556         open_exclu = 0;
 557
 558         spin_unlock(&mce_state_lock);
 559
 560         return 0;
 561 }
 562
 563 static void collect_tscs(void *data)
 564 {
 565         unsigned long *cpu_tsc = (unsigned long *)data;
 566
 567         rdtscll(cpu_tsc[smp_processor_id()]);
 568 }
 569
 570 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 571                         loff_t *off)
 572 {
 573         unsigned long *cpu_tsc;
 574         static DEFINE_MUTEX(mce_read_mutex);
 575         unsigned next;
 576         char __user *buf = ubuf;
 577         int i, err;
 578
 579         cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
 580         if (!cpu_tsc)
 581                 return -ENOMEM;
 582
 583         mutex_lock(&mce_read_mutex);
 584         next = rcu_dereference(mcelog.next);
 585
 586         /* Only supports full reads right now */
 587         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 588                 mutex_unlock(&mce_read_mutex);
 589                 kfree(cpu_tsc);
 590                 return -EINVAL;
 591         }
 592
 593         err = 0;
 594         for (i = 0; i < next; i++) {
 595                 unsigned long start = jiffies;
 596
 597                 while (!mcelog.entry[i].finished) {
 598                         if (time_after_eq(jiffies, start + 2)) {
 599                                 memset(mcelog.entry + i,0, sizeof(struct mce));
 600                                 goto timeout;
 601                         }
 602                         cpu_relax();
 603                 }
 604                 smp_rmb();
 605                 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
 606                 buf += sizeof(struct mce);
 607  timeout:
 608                 ;
 609         }
 610
 611         memset(mcelog.entry, 0, next * sizeof(struct mce));
 612         mcelog.next = 0;
 613
 614         synchronize_sched();
 615
 616         /*
 617          * Collect entries that were still getting written before the
 618          * synchronize.
 619          */
 620         on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
 621         for (i = next; i < MCE_LOG_LEN; i++) {
 622                 if (mcelog.entry[i].finished &&
 623                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
 624                         err |= copy_to_user(buf, mcelog.entry+i,
 625                                             sizeof(struct mce));
 626                         smp_rmb();
 627                         buf += sizeof(struct mce);
 628                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
 629                 }
 630         }
 631         mutex_unlock(&mce_read_mutex);
 632         kfree(cpu_tsc);
 633         return err ? -EFAULT : buf - ubuf;
 634 }
 635
 636 static unsigned int mce_poll(struct file *file, poll_table *wait)
 637 {
 638         poll_wait(file, &mce_wait, wait);
 639         if (rcu_dereference(mcelog.next))
 640                 return POLLIN | POLLRDNORM;
 641         return 0;
 642 }
 643
 644 static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 645 {
 646         int __user *p = (int __user *)arg;
 647
 648         if (!capable(CAP_SYS_ADMIN))
 649                 return -EPERM;
 650         switch (cmd) {
 651         case MCE_GET_RECORD_LEN:
 652                 return put_user(sizeof(struct mce), p);
 653         case MCE_GET_LOG_LEN:
 654                 return put_user(MCE_LOG_LEN, p);
 655         case MCE_GETCLEAR_FLAGS: {
 656                 unsigned flags;
 657
 658                 do {
 659                         flags = mcelog.flags;
 660                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 661                 return put_user(flags, p);
 662         }
 663         default:
 664                 return -ENOTTY;
 665         }
 666 }
 667
 668 static const struct file_operations mce_chrdev_ops = {
 669         .open = mce_open,
 670         .release = mce_release,
 671         .read = mce_read,
 672         .poll = mce_poll,
 673         .unlocked_ioctl = mce_ioctl,
 674 };
 675
 676 static struct miscdevice mce_log_device = {
 677         MISC_MCELOG_MINOR,
 678         "mcelog",
 679         &mce_chrdev_ops,
 680 };
 681
 682 static unsigned long old_cr4 __initdata;
 683
 684 void __init stop_mce(void)
 685 {
 686         old_cr4 = read_cr4();
 687         clear_in_cr4(X86_CR4_MCE);
 688 }
 689
 690 void __init restart_mce(void)
 691 {
 692         if (old_cr4 & X86_CR4_MCE)
 693                 set_in_cr4(X86_CR4_MCE);
 694 }
 695
 696 /*
 697  * Old style boot options parsing. Only for compatibility.
 698  */
 699 static int __init mcheck_disable(char *str)
 700 {
 701         mce_dont_init = 1;
 702         return 1;
 703 }
 704
 705 /* mce=off disables machine check. Note you can re-enable it later
 706    using sysfs.
 707    mce=TOLERANCELEVEL (number, see above)
 708    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 709    mce=nobootlog Don't log MCEs from before booting. */
 710 static int __init mcheck_enable(char *str)
 711 {
 712         if (!strcmp(str, "off"))
 713                 mce_dont_init = 1;
 714         else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
 715                 mce_bootlog = str[0] == 'b';
 716         else if (isdigit(str[0]))
 717                 get_option(&str, &tolerant);
 718         else
 719                 printk("mce= argument %s ignored. Please use /sys", str);
 720         return 1;
 721 }
 722
 723 __setup("nomce", mcheck_disable);
 724 __setup("mce=", mcheck_enable);
 725
 726 /*
 727  * Sysfs support
 728  */
 729
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug.
   Re-runs mce_init() on the current CPU and always returns 0. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}
 738
 739 /* Reinit MCEs after user configuration changes */
 740 static void mce_restart(void)
 741 {
 742         if (next_interval)
 743                 cancel_delayed_work(&mcheck_work);
 744         /* Timer race is harmless here */
 745         on_each_cpu(mce_init, NULL, 1, 1);
 746         next_interval = check_interval * HZ;
 747         if (next_interval)
 748                 schedule_delayed_work(&mcheck_work,
 749                                       round_jiffies_relative(next_interval));
 750 }
 751
/* The "machinecheck" sysdev class; resuming it re-runs mce_init(). */
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	.name = "machinecheck",
};

/* One sysdev per CPU, registered by mce_create_device(). */
DEFINE_PER_CPU(struct sys_device, device_mce);
 758
 759 /* Why are there no generic functions for this? */
 760 #define ACCESSOR(name, var, start) \
 761         static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
 762                 return sprintf(buf, "%lx\n", (unsigned long)var);       \
 763         }                                                               \
 764         static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
 765                 char *end;                                              \
 766                 unsigned long new = simple_strtoul(buf, &end, 0);       \
 767                 if (end == buf) return -EINVAL;                         \
 768                 var = new;                                              \
 769                 start;                                                  \
 770                 return end-buf;                                         \
 771         }                                                               \
 772         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 773
/*
 * TBD should generate these dynamically based on number of available banks.
 * Have only 6 control banks in sysfs until then.
 *
 * Writing any of these re-initializes machine checks via mce_restart().
 */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
 784
 785 static ssize_t show_trigger(struct sys_device *s, char *buf)
 786 {
 787         strcpy(buf, trigger);
 788         strcat(buf, "\n");
 789         return strlen(trigger) + 1;
 790 }
 791
 792 static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
 793 {
 794         char *p;
 795         int len;
 796         strncpy(trigger, buf, sizeof(trigger));
 797         trigger[sizeof(trigger)-1] = 0;
 798         len = strlen(trigger);
 799         p = strchr(trigger, '\n');
 800         if (*p) *p = 0;
 801         return len;
 802 }
 803
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
/* Changing the tolerance level needs no re-initialization. */
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
/* NULL-terminated list of the attribute files created for every CPU device. */
static struct sysdev_attribute *mce_attributes[] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
	&attr_tolerant, &attr_check_interval, &attr_trigger,
	NULL
};

/* CPUs whose sysdev and attribute files are currently registered. */
static cpumask_t mce_device_initialized = CPU_MASK_NONE;
 815
 816 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 817 static __cpuinit int mce_create_device(unsigned int cpu)
 818 {
 819         int err;
 820         int i;
 821
 822         if (!mce_available(&boot_cpu_data))
 823                 return -EIO;
 824
 825         memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 826         per_cpu(device_mce,cpu).id = cpu;
 827         per_cpu(device_mce,cpu).cls = &mce_sysclass;
 828
 829         err = sysdev_register(&per_cpu(device_mce,cpu));
 830         if (err)
 831                 return err;
 832
 833         for (i = 0; mce_attributes[i]; i++) {
 834                 err = sysdev_create_file(&per_cpu(device_mce,cpu),
 835                                          mce_attributes[i]);
 836                 if (err)
 837                         goto error;
 838         }
 839         cpu_set(cpu, mce_device_initialized);
 840
 841         return 0;
 842 error:
 843         while (i--) {
 844                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 845                                    mce_attributes[i]);
 846         }
 847         sysdev_unregister(&per_cpu(device_mce,cpu));
 848
 849         return err;
 850 }
 851
 852 static void mce_remove_device(unsigned int cpu)
 853 {
 854         int i;
 855
 856         if (!cpu_isset(cpu, mce_device_initialized))
 857                 return;
 858
 859         for (i = 0; mce_attributes[i]; i++)
 860                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 861                         mce_attributes[i]);
 862         sysdev_unregister(&per_cpu(device_mce,cpu));
 863         cpu_clear(cpu, mce_device_initialized);
 864 }
 865
 866 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 867 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 868                                       unsigned long action, void *hcpu)
 869 {
 870         unsigned int cpu = (unsigned long)hcpu;
 871
 872         switch (action) {
 873         case CPU_ONLINE:
 874         case CPU_ONLINE_FROZEN:
 875                 mce_create_device(cpu);
 876                 break;
 877         case CPU_DEAD:
 878         case CPU_DEAD_FROZEN:
 879                 mce_remove_device(cpu);
 880                 break;
 881         }
 882         return NOTIFY_OK;
 883 }
 884
 885 static struct notifier_block mce_cpu_notifier __cpuinitdata = {
 886         .notifier_call = mce_cpu_callback,
 887 };
 888
 889 static __init int mce_init_device(void)
 890 {
 891         int err;
 892         int i = 0;
 893
 894         if (!mce_available(&boot_cpu_data))
 895                 return -EIO;
 896         err = sysdev_class_register(&mce_sysclass);
 897         if (err)
 898                 return err;
 899
 900         for_each_online_cpu(i) {
 901                 err = mce_create_device(i);
 902                 if (err)
 903                         return err;
 904         }
 905
 906         register_hotcpu_notifier(&mce_cpu_notifier);
 907         misc_register(&mce_log_device);
 908         return err;
 909 }
 910
 911 device_initcall(mce_init_device);