arch/x86_64/kernel/mce.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  */
   7
   8 #include <linux/init.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/sched.h>
  12 #include <linux/string.h>
  13 #include <linux/rcupdate.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/sysdev.h>
  16 #include <linux/miscdevice.h>
  17 #include <linux/fs.h>
  18 #include <linux/capability.h>
  19 #include <linux/cpu.h>
  20 #include <linux/percpu.h>
  21 #include <linux/ctype.h>
  22 #include <asm/processor.h>
  23 #include <asm/msr.h>
  24 #include <asm/mce.h>
  25 #include <asm/kdebug.h>
  26 #include <asm/uaccess.h>
  27 #include <asm/smp.h>
  28
  29 #define MISC_MCELOG_MINOR 227
  30 #define NR_BANKS 6
  31
  32 atomic_t mce_entry;
  33
  34 static int mce_dont_init;
  35
  36 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
  37    3: never panic or exit (for testing only) */
  38 static int tolerant = 1;
  39 static int banks;
  40 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
  41 static unsigned long console_logged;
  42 static int notify_user;
  43 static int rip_msr;
  44 static int mce_bootlog = 1;
  45
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */
  51
  52 struct mce_log mcelog = {
  53         MCE_LOG_SIGNATURE,
  54         MCE_LOG_LEN,
  55 };
  56
  57 void mce_log(struct mce *mce)
  58 {
  59         unsigned next, entry;
  60         mce->finished = 0;
  61         wmb();
  62         for (;;) {
  63                 entry = rcu_dereference(mcelog.next);
  64                 /* The rmb forces the compiler to reload next in each
  65                     iteration */
  66                 rmb();
  67                 for (;;) {
  68                         /* When the buffer fills up discard new entries. Assume
  69                            that the earlier errors are the more interesting. */
  70                         if (entry >= MCE_LOG_LEN) {
  71                                 set_bit(MCE_OVERFLOW, &mcelog.flags);
  72                                 return;
  73                         }
  74                         /* Old left over entry. Skip. */
  75                         if (mcelog.entry[entry].finished) {
  76                                 entry++;
  77                                 continue;
  78                         }
  79                         break;
  80                 }
  81                 smp_rmb();
  82                 next = entry + 1;
  83                 if (cmpxchg(&mcelog.next, entry, next) == entry)
  84                         break;
  85         }
  86         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
  87         wmb();
  88         mcelog.entry[entry].finished = 1;
  89         wmb();
  90
  91         if (!test_and_set_bit(0, &console_logged))
  92                 notify_user = 1;
  93 }
  94
  95 static void print_mce(struct mce *m)
  96 {
  97         printk(KERN_EMERG "\n"
  98                KERN_EMERG "HARDWARE ERROR\n"
  99                KERN_EMERG
 100                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 101                m->cpu, m->mcgstatus, m->bank, m->status);
 102         if (m->rip) {
 103                 printk(KERN_EMERG
 104                        "RIP%s %02x:<%016Lx> ",
 105                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 106                        m->cs, m->rip);
 107                 if (m->cs == __KERNEL_CS)
 108                         print_symbol("{%s}", m->rip);
 109                 printk("\n");
 110         }
 111         printk(KERN_EMERG "TSC %Lx ", m->tsc);
 112         if (m->addr)
 113                 printk("ADDR %Lx ", m->addr);
 114         if (m->misc)
 115                 printk("MISC %Lx ", m->misc);
 116         printk("\n");
 117         printk(KERN_EMERG "This is not a software problem!\n");
 118         printk(KERN_EMERG
 119     "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 120 }
 121
 122 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 123 {
 124         int i;
 125         oops_begin();
 126         for (i = 0; i < MCE_LOG_LEN; i++) {
 127                 unsigned long tsc = mcelog.entry[i].tsc;
 128                 if (time_before(tsc, start))
 129                         continue;
 130                 print_mce(&mcelog.entry[i]);
 131                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 132                         backup = NULL;
 133         }
 134         if (backup)
 135                 print_mce(backup);
 136         if (tolerant >= 3)
 137                 printk("Fake panic: %s\n", msg);
 138         else
 139                 panic(msg);
 140 }
 141
 142 static int mce_available(struct cpuinfo_x86 *c)
 143 {
 144         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 145 }
 146
 147 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 148 {
 149         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 150                 m->rip = regs->rip;
 151                 m->cs = regs->cs;
 152         } else {
 153                 m->rip = 0;
 154                 m->cs = 0;
 155         }
 156         if (rip_msr) {
 157                 /* Assume the RIP in the MSR is exact. Is this true? */
 158                 m->mcgstatus |= MCG_STATUS_EIPV;
 159                 rdmsrl(rip_msr, m->rip);
 160                 m->cs = 0;
 161         }
 162 }
 163
 164 /*
 165  * The actual machine check handler
 166  */
 167
 168 void do_machine_check(struct pt_regs * regs, long error_code)
 169 {
 170         struct mce m, panicm;
 171         int nowayout = (tolerant < 1);
 172         int kill_it = 0;
 173         u64 mcestart = 0;
 174         int i;
 175         int panicm_found = 0;
 176
 177         atomic_inc(&mce_entry);
 178
 179         if (regs)
 180                 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
 181         if (!banks)
 182                 goto out2;
 183
 184         memset(&m, 0, sizeof(struct mce));
 185         m.cpu = smp_processor_id();
 186         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 187         if (!(m.mcgstatus & MCG_STATUS_RIPV))
 188                 kill_it = 1;
 189
 190         rdtscll(mcestart);
 191         barrier();
 192
 193         for (i = 0; i < banks; i++) {
 194                 if (!bank[i])
 195                         continue;
 196
 197                 m.misc = 0;
 198                 m.addr = 0;
 199                 m.bank = i;
 200                 m.tsc = 0;
 201
 202                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 203                 if ((m.status & MCI_STATUS_VAL) == 0)
 204                         continue;
 205
 206                 if (m.status & MCI_STATUS_EN) {
 207                         /* In theory _OVER could be a nowayout too, but
 208                            assume any overflowed errors were no fatal. */
 209                         nowayout |= !!(m.status & MCI_STATUS_PCC);
 210                         kill_it |= !!(m.status & MCI_STATUS_UC);
 211                 }
 212
 213                 if (m.status & MCI_STATUS_MISCV)
 214                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 215                 if (m.status & MCI_STATUS_ADDRV)
 216                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 217
 218                 mce_get_rip(&m, regs);
 219                 if (error_code >= 0)
 220                         rdtscll(m.tsc);
 221                 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
 222                 if (error_code != -2)
 223                         mce_log(&m);
 224
 225                 /* Did this bank cause the exception? */
 226                 /* Assume that the bank with uncorrectable errors did it,
 227                    and that there is only a single one. */
 228                 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
 229                         panicm = m;
 230                         panicm_found = 1;
 231                 }
 232
 233                 add_taint(TAINT_MACHINE_CHECK);
 234         }
 235
 236         /* Never do anything final in the polling timer */
 237         if (!regs)
 238                 goto out;
 239
 240         /* If we didn't find an uncorrectable error, pick
 241            the last one (shouldn't happen, just being safe). */
 242         if (!panicm_found)
 243                 panicm = m;
 244         if (nowayout)
 245                 mce_panic("Machine check", &panicm, mcestart);
 246         if (kill_it) {
 247                 int user_space = 0;
 248
 249                 if (m.mcgstatus & MCG_STATUS_RIPV)
 250                         user_space = panicm.rip && (panicm.cs & 3);
 251
 252                 /* When the machine was in user space and the CPU didn't get
 253                    confused it's normally not necessary to panic, unless you
 254                    are paranoid (tolerant == 0)
 255
 256                    RED-PEN could be more tolerant for MCEs in idle,
 257                    but most likely they occur at boot anyways, where
 258                    it is best to just halt the machine. */
 259                 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
 260                     (unsigned)current->pid <= 1)
 261                         mce_panic("Uncorrected machine check", &panicm, mcestart);
 262
 263                 /* do_exit takes an awful lot of locks and has as
 264                    slight risk of deadlocking. If you don't want that
 265                    don't set tolerant >= 2 */
 266                 if (tolerant < 3)
 267                         do_exit(SIGBUS);
 268         }
 269
 270  out:
 271         /* Last thing done in the machine check exception to clear state. */
 272         wrmsrl(MSR_IA32_MCG_STATUS, 0);
 273  out2:
 274         atomic_dec(&mce_entry);
 275 }
 276
 277 #ifdef CONFIG_X86_MCE_INTEL
 278 /***
 279  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 280  * @cpu: The CPU on which the event occured.
 281  * @status: Event status information
 282  *
 283  * This function should be called by the thermal interrupt after the
 284  * event has been processed and the decision was made to log the event
 285  * further.
 286  *
 287  * The status parameter will be saved to the 'status' field of 'struct mce'
 288  * and historically has been the register value of the
 289  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 290  */
 291 void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 292 {
 293         struct mce m;
 294
 295         memset(&m, 0, sizeof(m));
 296         m.cpu = cpu;
 297         m.bank = MCE_THERMAL_BANK;
 298         m.status = status;
 299         rdtscll(m.tsc);
 300         mce_log(&m);
 301 }
 302 #endif /* CONFIG_X86_MCE_INTEL */
 303
 304 /*
 305  * Periodic polling timer for "silent" machine check errors.
 306  */
 307
 308 static int check_interval = 5 * 60; /* 5 minutes */
 309 static void mcheck_timer(struct work_struct *work);
 310 static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
 311
 312 static void mcheck_check_cpu(void *info)
 313 {
 314         if (mce_available(&current_cpu_data))
 315                 do_machine_check(NULL, 0);
 316 }
 317
 318 static void mcheck_timer(struct work_struct *work)
 319 {
 320         on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 321         schedule_delayed_work(&mcheck_work, check_interval * HZ);
 322
 323         /*
 324          * It's ok to read stale data here for notify_user and
 325          * console_logged as we'll simply get the updated versions
 326          * on the next mcheck_timer execution and atomic operations
 327          * on console_logged act as synchronization for notify_user
 328          * writes.
 329          */
 330         if (notify_user && console_logged) {
 331                 notify_user = 0;
 332                 clear_bit(0, &console_logged);
 333                 printk(KERN_INFO "Machine check events logged\n");
 334         }
 335 }
 336
 337
 338 static __init int periodic_mcheck_init(void)
 339 {
 340         if (check_interval)
 341                 schedule_delayed_work(&mcheck_work, check_interval*HZ);
 342         return 0;
 343 }
 344 __initcall(periodic_mcheck_init);
 345
 346
 347 /*
 348  * Initialize Machine Checks for a CPU.
 349  */
 350 static void mce_init(void *dummy)
 351 {
 352         u64 cap;
 353         int i;
 354
 355         rdmsrl(MSR_IA32_MCG_CAP, cap);
 356         banks = cap & 0xff;
 357         if (banks > NR_BANKS) {
 358                 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
 359                 banks = NR_BANKS;
 360         }
 361         /* Use accurate RIP reporting if available. */
 362         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 363                 rip_msr = MSR_IA32_MCG_EIP;
 364
 365         /* Log the machine checks left over from the previous reset.
 366            This also clears all registers */
 367         do_machine_check(NULL, mce_bootlog ? -1 : -2);
 368
 369         set_in_cr4(X86_CR4_MCE);
 370
 371         if (cap & MCG_CTL_P)
 372                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 373
 374         for (i = 0; i < banks; i++) {
 375                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 376                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 377         }
 378 }
 379
 380 /* Add per CPU specific workarounds here */
 381 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 382 {
 383         /* This should be disabled by the BIOS, but isn't always */
 384         if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
 385                 /* disable GART TBL walk error reporting, which trips off
 386                    incorrectly with the IOMMU & 3ware & Cerberus. */
 387                 clear_bit(10, &bank[4]);
 388                 /* Lots of broken BIOS around that don't clear them
 389                    by default and leave crap in there. Don't log. */
 390                 mce_bootlog = 0;
 391         }
 392
 393 }
 394
 395 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 396 {
 397         switch (c->x86_vendor) {
 398         case X86_VENDOR_INTEL:
 399                 mce_intel_feature_init(c);
 400                 break;
 401         case X86_VENDOR_AMD:
 402                 mce_amd_feature_init(c);
 403                 break;
 404         default:
 405                 break;
 406         }
 407 }
 408
 409 /*
 410  * Called for each booted CPU to set up machine checks.
 411  * Must be called with preempt off.
 412  */
 413 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 414 {
 415         static cpumask_t mce_cpus = CPU_MASK_NONE;
 416
 417         mce_cpu_quirks(c);
 418
 419         if (mce_dont_init ||
 420             cpu_test_and_set(smp_processor_id(), mce_cpus) ||
 421             !mce_available(c))
 422                 return;
 423
 424         mce_init(NULL);
 425         mce_cpu_features(c);
 426 }
 427
 428 /*
 429  * Character device to read and clear the MCE log.
 430  */
 431
 432 static void collect_tscs(void *data)
 433 {
 434         unsigned long *cpu_tsc = (unsigned long *)data;
 435         rdtscll(cpu_tsc[smp_processor_id()]);
 436 }
 437
 438 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
 439 {
 440         unsigned long *cpu_tsc;
 441         static DECLARE_MUTEX(mce_read_sem);
 442         unsigned next;
 443         char __user *buf = ubuf;
 444         int i, err;
 445
 446         cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
 447         if (!cpu_tsc)
 448                 return -ENOMEM;
 449
 450         down(&mce_read_sem);
 451         next = rcu_dereference(mcelog.next);
 452
 453         /* Only supports full reads right now */
 454         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 455                 up(&mce_read_sem);
 456                 kfree(cpu_tsc);
 457                 return -EINVAL;
 458         }
 459
 460         err = 0;
 461         for (i = 0; i < next; i++) {
 462                 unsigned long start = jiffies;
 463                 while (!mcelog.entry[i].finished) {
 464                         if (!time_before(jiffies, start + 2)) {
 465                                 memset(mcelog.entry + i,0, sizeof(struct mce));
 466                                 continue;
 467                         }
 468                         cpu_relax();
 469                 }
 470                 smp_rmb();
 471                 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
 472                 buf += sizeof(struct mce);
 473         }
 474
 475         memset(mcelog.entry, 0, next * sizeof(struct mce));
 476         mcelog.next = 0;
 477
 478         synchronize_sched();
 479
 480         /* Collect entries that were still getting written before the synchronize. */
 481
 482         on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
 483         for (i = next; i < MCE_LOG_LEN; i++) {
 484                 if (mcelog.entry[i].finished &&
 485                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
 486                         err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
 487                         smp_rmb();
 488                         buf += sizeof(struct mce);
 489                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
 490                 }
 491         }
 492         up(&mce_read_sem);
 493         kfree(cpu_tsc);
 494         return err ? -EFAULT : buf - ubuf;
 495 }
 496
 497 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
 498 {
 499         int __user *p = (int __user *)arg;
 500         if (!capable(CAP_SYS_ADMIN))
 501                 return -EPERM;
 502         switch (cmd) {
 503         case MCE_GET_RECORD_LEN:
 504                 return put_user(sizeof(struct mce), p);
 505         case MCE_GET_LOG_LEN:
 506                 return put_user(MCE_LOG_LEN, p);
 507         case MCE_GETCLEAR_FLAGS: {
 508                 unsigned flags;
 509                 do {
 510                         flags = mcelog.flags;
 511                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 512                 return put_user(flags, p);
 513         }
 514         default:
 515                 return -ENOTTY;
 516         }
 517 }
 518
 519 static struct file_operations mce_chrdev_ops = {
 520         .read = mce_read,
 521         .ioctl = mce_ioctl,
 522 };
 523
 524 static struct miscdevice mce_log_device = {
 525         MISC_MCELOG_MINOR,
 526         "mcelog",
 527         &mce_chrdev_ops,
 528 };
 529
 530 /*
 531  * Old style boot options parsing. Only for compatibility.
 532  */
 533
 534 static int __init mcheck_disable(char *str)
 535 {
 536         mce_dont_init = 1;
 537         return 1;
 538 }
 539
 540 /* mce=off disables machine check. Note you can reenable it later
 541    using sysfs.
 542    mce=TOLERANCELEVEL (number, see above)
 543    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 544    mce=nobootlog Don't log MCEs from before booting. */
 545 static int __init mcheck_enable(char *str)
 546 {
 547         if (*str == '=')
 548                 str++;
 549         if (!strcmp(str, "off"))
 550                 mce_dont_init = 1;
 551         else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
 552                 mce_bootlog = str[0] == 'b';
 553         else if (isdigit(str[0]))
 554                 get_option(&str, &tolerant);
 555         else
 556                 printk("mce= argument %s ignored. Please use /sys", str);
 557         return 1;
 558 }
 559
 560 __setup("nomce", mcheck_disable);
 561 __setup("mce", mcheck_enable);
 562
 563 /*
 564  * Sysfs support
 565  */
 566
 567 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 568    Only one CPU is active at this time, the others get readded later using
 569    CPU hotplug. */
 570 static int mce_resume(struct sys_device *dev)
 571 {
 572         mce_init(NULL);
 573         return 0;
 574 }
 575
 576 /* Reinit MCEs after user configuration changes */
 577 static void mce_restart(void)
 578 {
 579         if (check_interval)
 580                 cancel_delayed_work(&mcheck_work);
 581         /* Timer race is harmless here */
 582         on_each_cpu(mce_init, NULL, 1, 1);
 583         if (check_interval)
 584                 schedule_delayed_work(&mcheck_work, check_interval*HZ);
 585 }
 586
 587 static struct sysdev_class mce_sysclass = {
 588         .resume = mce_resume,
 589         set_kset_name("machinecheck"),
 590 };
 591
 592 DEFINE_PER_CPU(struct sys_device, device_mce);
 593
 594 /* Why are there no generic functions for this? */
 595 #define ACCESSOR(name, var, start) \
 596         static ssize_t show_ ## name(struct sys_device *s, char *buf) {                    \
 597                 return sprintf(buf, "%lx\n", (unsigned long)var);                  \
 598         }                                                                          \
 599         static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
 600                 char *end;                                                         \
 601                 unsigned long new = simple_strtoul(buf, &end, 0);                  \
 602                 if (end == buf) return -EINVAL;                                    \
 603                 var = new;                                                         \
 604                 start;                                                             \
 605                 return end-buf;                                                    \
 606         }                                                                          \
 607         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 608
 609 ACCESSOR(bank0ctl,bank[0],mce_restart())
 610 ACCESSOR(bank1ctl,bank[1],mce_restart())
 611 ACCESSOR(bank2ctl,bank[2],mce_restart())
 612 ACCESSOR(bank3ctl,bank[3],mce_restart())
 613 ACCESSOR(bank4ctl,bank[4],mce_restart())
 614 ACCESSOR(bank5ctl,bank[5],mce_restart())
 615 static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
 616         &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
 617         &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
 618 ACCESSOR(tolerant,tolerant,)
 619 ACCESSOR(check_interval,check_interval,mce_restart())
 620
 621 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 622 static __cpuinit int mce_create_device(unsigned int cpu)
 623 {
 624         int err;
 625         int i;
 626         if (!mce_available(&cpu_data[cpu]))
 627                 return -EIO;
 628
 629         per_cpu(device_mce,cpu).id = cpu;
 630         per_cpu(device_mce,cpu).cls = &mce_sysclass;
 631
 632         err = sysdev_register(&per_cpu(device_mce,cpu));
 633
 634         if (!err) {
 635                 for (i = 0; i < banks; i++)
 636                         sysdev_create_file(&per_cpu(device_mce,cpu),
 637                                 bank_attributes[i]);
 638                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
 639                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 640         }
 641         return err;
 642 }
 643
 644 static void mce_remove_device(unsigned int cpu)
 645 {
 646         int i;
 647
 648         for (i = 0; i < banks; i++)
 649                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 650                         bank_attributes[i]);
 651         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
 652         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 653         sysdev_unregister(&per_cpu(device_mce,cpu));
 654         memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 655 }
 656
 657 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 658 static int
 659 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 660 {
 661         unsigned int cpu = (unsigned long)hcpu;
 662
 663         switch (action) {
 664         case CPU_ONLINE:
 665                 mce_create_device(cpu);
 666                 break;
 667         case CPU_DEAD:
 668                 mce_remove_device(cpu);
 669                 break;
 670         }
 671         return NOTIFY_OK;
 672 }
 673
 674 static struct notifier_block mce_cpu_notifier = {
 675         .notifier_call = mce_cpu_callback,
 676 };
 677
 678 static __init int mce_init_device(void)
 679 {
 680         int err;
 681         int i = 0;
 682
 683         if (!mce_available(&boot_cpu_data))
 684                 return -EIO;
 685         err = sysdev_class_register(&mce_sysclass);
 686
 687         for_each_online_cpu(i) {
 688                 mce_create_device(i);
 689         }
 690
 691         register_hotcpu_notifier(&mce_cpu_notifier);
 692         misc_register(&mce_log_device);
 693         return err;
 694 }
 695
 696 device_initcall(mce_init_device);