arch/x86/kernel/cpu/mcheck/mce_64.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  * Copyright 2008 Intel Corporation
   7  * Author: Andi Kleen
   8  */
   9
  10 #include <linux/init.h>
  11 #include <linux/types.h>
  12 #include <linux/kernel.h>
  13 #include <linux/sched.h>
  14 #include <linux/smp_lock.h>
  15 #include <linux/string.h>
  16 #include <linux/rcupdate.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/sysdev.h>
  19 #include <linux/miscdevice.h>
  20 #include <linux/fs.h>
  21 #include <linux/capability.h>
  22 #include <linux/cpu.h>
  23 #include <linux/percpu.h>
  24 #include <linux/poll.h>
  25 #include <linux/thread_info.h>
  26 #include <linux/ctype.h>
  27 #include <linux/kmod.h>
  28 #include <linux/kdebug.h>
  29 #include <linux/kobject.h>
  30 #include <linux/sysfs.h>
  31 #include <linux/ratelimit.h>
  32 #include <asm/processor.h>
  33 #include <asm/msr.h>
  34 #include <asm/mce.h>
  35 #include <asm/uaccess.h>
  36 #include <asm/smp.h>
  37 #include <asm/idle.h>
  38
  39 #define MISC_MCELOG_MINOR 227 /* minor number used for the mcelog misc device */
  40
  41 atomic_t mce_entry; /* incremented on entry to do_machine_check(), decremented on exit */
  42
  43 static int mce_dont_init; /* when set, mce_available() reports no MCE support */
  44
  45 /*
  46  * Tolerant levels:
  47  *   0: always panic on uncorrected errors, log corrected errors
  48  *   1: panic or SIGBUS on uncorrected errors, log corrected errors
  49  *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  50  *   3: never panic or SIGBUS, log all errors (for testing only)
  51  */
  52 static int tolerant = 1;
  53 static int banks; /* number of MCA banks in use, set by mce_cap_init() */
  54 static u64 *bank; /* per-bank value written to MCi_CTL by mce_init(); 0 makes the scanners skip the bank */
  55 static unsigned long notify_user; /* bit 0: set by mce_log(), consumed by mce_notify_user() */
  56 static int rip_msr; /* MSR to read the error IP from, or 0 when not available */
  57 static int mce_bootlog = -1; /* -1: not configured; set from mce=[no]bootlog or the AMD quirk */
  58 static atomic_t mce_events; /* bumped once per mce_log() call */
  59
  60 static char trigger[128]; /* program started via call_usermodehelper() on new events; empty = none */
  61 static char *trigger_argv[2] = { trigger, NULL };
  62
  63 static DECLARE_WAIT_QUEUE_HEAD(mce_wait); /* woken by mce_notify_user(), waited on in mce_poll() */
  64
  65 /* MCA banks polled by the periodic polling timer for corrected events */
  66 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  67         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  68 };
  69
  70 /* Do initial initialization of a struct mce */
  71 void mce_setup(struct mce *m)
  72 {
  73         memset(m, 0, sizeof(struct mce));
  74         m->cpu = smp_processor_id();
  75         rdtscll(m->tsc);
  76 }
  77
  78 /*
  79  * Lockless MCE logging infrastructure.
  80  * This avoids deadlocks on printk locks without having to break locks. Also
  81  * separate MCEs from kernel messages to avoid bogus bug reports.
  82  */
  83
     /*
      * The log buffer itself.  The two positional initializers fill the
      * first two members with the log signature and MCE_LOG_LEN (the bound
      * mce_log() uses for entry indices); all other members start zeroed.
      */
  84 static struct mce_log mcelog = {
  85         MCE_LOG_SIGNATURE,
  86         MCE_LOG_LEN,
  87 };
  88
  89 void mce_log(struct mce *mce)
  90 {
  91         unsigned next, entry;
  92         atomic_inc(&mce_events);
  93         mce->finished = 0;
  94         wmb();
  95         for (;;) {
  96                 entry = rcu_dereference(mcelog.next);
  97                 for (;;) {
  98                         /* When the buffer fills up discard new entries. Assume
  99                            that the earlier errors are the more interesting. */
 100                         if (entry >= MCE_LOG_LEN) {
 101                                 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
 102                                 return;
 103                         }
 104                         /* Old left over entry. Skip. */
 105                         if (mcelog.entry[entry].finished) {
 106                                 entry++;
 107                                 continue;
 108                         }
 109                         break;
 110                 }
 111                 smp_rmb();
 112                 next = entry + 1;
 113                 if (cmpxchg(&mcelog.next, entry, next) == entry)
 114                         break;
 115         }
 116         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 117         wmb();
 118         mcelog.entry[entry].finished = 1;
 119         wmb();
 120
 121         set_bit(0, &notify_user);
 122 }
 123
 124 static void print_mce(struct mce *m)
 125 {
 126         printk(KERN_EMERG "\n"
 127                KERN_EMERG "HARDWARE ERROR\n"
 128                KERN_EMERG
 129                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 130                m->cpu, m->mcgstatus, m->bank, m->status);
 131         if (m->ip) {
 132                 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
 133                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 134                        m->cs, m->ip);
 135                 if (m->cs == __KERNEL_CS)
 136                         print_symbol("{%s}", m->ip);
 137                 printk("\n");
 138         }
 139         printk(KERN_EMERG "TSC %llx ", m->tsc);
 140         if (m->addr)
 141                 printk("ADDR %llx ", m->addr);
 142         if (m->misc)
 143                 printk("MISC %llx ", m->misc);
 144         printk("\n");
 145         printk(KERN_EMERG "This is not a software problem!\n");
 146         printk(KERN_EMERG "Run through mcelog --ascii to decode "
 147                "and contact your hardware vendor\n");
 148 }
 149
 150 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 151 {
 152         int i;
 153
 154         oops_begin();
 155         for (i = 0; i < MCE_LOG_LEN; i++) {
 156                 unsigned long tsc = mcelog.entry[i].tsc;
 157
 158                 if (time_before(tsc, start))
 159                         continue;
 160                 print_mce(&mcelog.entry[i]);
 161                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 162                         backup = NULL;
 163         }
 164         if (backup)
 165                 print_mce(backup);
 166         panic(msg);
 167 }
 168
 169 int mce_available(struct cpuinfo_x86 *c)
 170 {
 171         if (mce_dont_init)
 172                 return 0;
 173         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 174 }
 175
 176 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 177 {
 178         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 179                 m->ip = regs->ip;
 180                 m->cs = regs->cs;
 181         } else {
 182                 m->ip = 0;
 183                 m->cs = 0;
 184         }
 185         if (rip_msr) {
 186                 /* Assume the RIP in the MSR is exact. Is this true? */
 187                 m->mcgstatus |= MCG_STATUS_EIPV;
 188                 rdmsrl(rip_msr, m->ip);
 189                 m->cs = 0;
 190         }
 191 }
 192
 193 /*
 194  * Poll for corrected events or events that happened before reset.
 195  * Those are just logged through /dev/mcelog.
 196  *
 197  * This is executed in standard interrupt context.
 198  */
 199 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 200 {
 201         struct mce m;
 202         int i;
 203
 204         mce_setup(&m);
 205
 206         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 207         for (i = 0; i < banks; i++) {
 208                 if (!bank[i] || !test_bit(i, *b))
 209                         continue;
 210
 211                 m.misc = 0;
 212                 m.addr = 0;
 213                 m.bank = i;
 214                 m.tsc = 0;
 215
 216                 barrier();
 217                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 218                 if (!(m.status & MCI_STATUS_VAL))
 219                         continue;
 220
 221                 /*
 222                  * Uncorrected events are handled by the exception handler
 223                  * when it is enabled. But when the exception is disabled log
 224                  * everything.
 225                  *
 226                  * TBD do the same check for MCI_STATUS_EN here?
 227                  */
 228                 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
 229                         continue;
 230
 231                 if (m.status & MCI_STATUS_MISCV)
 232                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 233                 if (m.status & MCI_STATUS_ADDRV)
 234                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 235
 236                 if (!(flags & MCP_TIMESTAMP))
 237                         m.tsc = 0;
 238                 /*
 239                  * Don't get the IP here because it's unlikely to
 240                  * have anything to do with the actual error location.
 241                  */
 242
 243                 mce_log(&m);
 244                 add_taint(TAINT_MACHINE_CHECK);
 245
 246                 /*
 247                  * Clear state for this bank.
 248                  */
 249                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 250         }
 251
 252         /*
 253          * Don't clear MCG_STATUS here because it's only defined for
 254          * exceptions.
 255          */
 256 }
 257
  258 /*
  259  * The actual machine check handler. This only handles real
  260  * exceptions when something got corrupted coming in through int 18.
  261  *
  262  * This is executed in NMI context not subject to normal locking rules. This
  263  * implies that most kernel services cannot be safely used. Don't even
  264  * think about putting a printk in there!
      *
      * @regs:       register state at the exception; handed to notify_die()
      *              and used by mce_get_rip() to record ip/cs
      * @error_code: passed through to notify_die() only
      *
      * Outline: scan all banks for valid, uncorrected errors, log the ones
      * that were enabled, then either panic, send SIGBUS to the current
      * task, or carry on, depending on the error state and on 'tolerant'.
      * Finally the status of the handled banks and MCG_STATUS are cleared.
  265  */
  266 void do_machine_check(struct pt_regs * regs, long error_code)
  267 {
  268         struct mce m, panicm;
  269         u64 mcestart = 0;
  270         int i;
  271         int panicm_found = 0;
  272         /*
  273          * If no_way_out gets set, there is no safe way to recover from this
  274          * MCE.  If tolerant is cranked up, we'll try anyway.
  275          */
  276         int no_way_out = 0;
  277         /*
  278          * If kill_it gets set, there might be a way to recover from this
  279          * error.
  280          */
  281         int kill_it = 0;
              /* Banks whose status register gets cleared at the end. */
  282         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
  283
  284         atomic_inc(&mce_entry);
  285
  286         if (notify_die(DIE_NMI, "machine check", regs, error_code,
  287                            18, SIGKILL) == NOTIFY_STOP)
  288                 goto out2;
  289         if (!banks)
  290                 goto out2;
  291
  292         mce_setup(&m);
  293
  294         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
  295         /* if the restart IP is not valid, we're done for */
  296         if (!(m.mcgstatus & MCG_STATUS_RIPV))
  297                 no_way_out = 1;
  298
              /* Start of this event; mce_panic() skips log entries older than this. */
  299         rdtscll(mcestart);
  300         barrier();
  301
  302         for (i = 0; i < banks; i++) {
  303                 __clear_bit(i, toclear);
  304                 if (!bank[i])
  305                         continue;
  306
  307                 m.misc = 0;
  308                 m.addr = 0;
  309                 m.bank = i;
  310
  311                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
  312                 if ((m.status & MCI_STATUS_VAL) == 0)
  313                         continue;
  314
  315                 /*
  316                  * Errors that are not uncorrected are handled by
  317                  * machine_check_poll. Leave them alone.
  318                  */
  319                 if ((m.status & MCI_STATUS_UC) == 0)
  320                         continue;
  321
  322                 /*
  323                  * Set taint even when machine check was not enabled.
  324                  */
  325                 add_taint(TAINT_MACHINE_CHECK);
  326
  327                 __set_bit(i, toclear);
  328
  329                 if (m.status & MCI_STATUS_EN) {
  330                         /* if PCC was set, there's no way out */
  331                         no_way_out |= !!(m.status & MCI_STATUS_PCC);
  332                         /*
  333                          * If this error was uncorrectable and there was
  334                          * an overflow, we're in trouble.  If no overflow,
  335                          * we might get away with just killing a task.
  336                          */
                              /* UC is always set here: non-UC banks were skipped above. */
  337                         if (m.status & MCI_STATUS_UC) {
  338                                 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
  339                                         no_way_out = 1;
  340                                 kill_it = 1;
  341                         }
  342                 } else {
  343                         /*
  344                          * Machine check event was not enabled. Clear, but
  345                          * ignore.
  346                          */
  347                         continue;
  348                 }
  349
  350                 if (m.status & MCI_STATUS_MISCV)
  351                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
  352                 if (m.status & MCI_STATUS_ADDRV)
  353                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
  354
  355                 mce_get_rip(&m, regs);
  356                 mce_log(&m);
  357
  358                 /* Did this bank cause the exception? */
  359                 /* Assume that the bank with uncorrectable errors did it,
  360                    and that there is only a single one. */
                      /* Both bits were already checked above, so every logged bank lands here. */
  361                 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
  362                         panicm = m;
  363                         panicm_found = 1;
  364                 }
  365         }
  366
  367         /* If we didn't find an uncorrectable error, pick
  368            the last one (shouldn't happen, just being safe). */
  369         if (!panicm_found)
  370                 panicm = m;
  371
  372         /*
  373          * If we have decided that we just CAN'T continue, and the user
  374          *  has not set tolerant to an insane level, give up and die.
  375          */
  376         if (no_way_out && tolerant < 3)
  377                 mce_panic("Machine check", &panicm, mcestart);
  378
  379         /*
  380          * If the error seems to be unrecoverable, something should be
  381          * done.  Try to kill as little as possible.  If we can kill just
  382          * one task, do that.  If the user has set the tolerance very
  383          * high, don't try to do anything at all.
  384          */
  385         if (kill_it && tolerant < 3) {
  386                 int user_space = 0;
  387
  388                 /*
  389                  * If the EIPV bit is set, it means the saved IP is the
  390                  * instruction which caused the MCE.
  391                  */
  392                 if (m.mcgstatus & MCG_STATUS_EIPV)
  393                         user_space = panicm.ip && (panicm.cs & 3);
  394
  395                 /*
  396                  * If we know that the error was in user space, send a
  397                  * SIGBUS.  Otherwise, panic if tolerance is low.
  398                  *
  399                  * force_sig() takes an awful lot of locks and has a slight
  400                  * risk of deadlocking.
  401                  */
  402                 if (user_space) {
  403                         force_sig(SIGBUS, current);
  404                 } else if (panic_on_oops || tolerant < 2) {
  405                         mce_panic("Uncorrected machine check",
  406                                 &panicm, mcestart);
  407                 }
  408         }
  409
  410         /* notify userspace ASAP */
  411         set_thread_flag(TIF_MCE_NOTIFY);
  412
  413         /* the last thing we do is clear state */
  414         for (i = 0; i < banks; i++) {
  415                 if (test_bit(i, toclear))
  416                         wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
  417         }
  418         wrmsrl(MSR_IA32_MCG_STATUS, 0);
  419  out2:
              /* Balances the atomic_inc() at the top on every exit path. */
  420         atomic_dec(&mce_entry);
  421 }
 422
 423 #ifdef CONFIG_X86_MCE_INTEL
  424 /**
  425  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
  427  * @status: Event status information
  428  *
  429  * This function should be called by the thermal interrupt after the
  430  * event has been processed and the decision was made to log the event
  431  * further.
  432  *
  433  * The status parameter will be saved to the 'status' field of 'struct mce'
  434  * and historically has been the register value of the
  435  * MSR_IA32_THERMAL_STATUS (Intel) msr.
      *
      * There is no CPU parameter: mce_setup() stamps the record with the
      * CPU this function is running on.  The record is filed under the
      * pseudo bank MCE_THERMAL_BANK.
  436  */
  437 void mce_log_therm_throt_event(__u64 status)
  438 {
  439         struct mce m;
  440
  441         mce_setup(&m);
  442         m.bank = MCE_THERMAL_BANK;
  443         m.status = status;
  444         mce_log(&m);
  445 }
 446 #endif /* CONFIG_X86_MCE_INTEL */
 447
 448 /*
 449  * Periodic polling timer for "silent" machine check errors.  If the
 450  * poller finds an MCE, poll 2x faster.  When the poller finds no more
 451  * errors, poll 2x slower (up to check_interval seconds).
 452  */
 453
 454 static int check_interval = 5 * 60; /* 5 minutes */
 455 static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 456 static void mcheck_timer(unsigned long);
 457 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 458
 459 static void mcheck_timer(unsigned long data)
 460 {
 461         struct timer_list *t = &per_cpu(mce_timer, data);
 462         int *n;
 463
 464         WARN_ON(smp_processor_id() != data);
 465
 466         if (mce_available(&current_cpu_data))
 467                 machine_check_poll(MCP_TIMESTAMP,
 468                                 &__get_cpu_var(mce_poll_banks));
 469
 470         /*
 471          * Alert userspace if needed.  If we logged an MCE, reduce the
 472          * polling interval, otherwise increase the polling interval.
 473          */
 474         n = &__get_cpu_var(next_interval);
 475         if (mce_notify_user()) {
 476                 *n = max(*n/2, HZ/100);
 477         } else {
 478                 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 479         }
 480
 481         t->expires = jiffies + *n;
 482         add_timer(t);
 483 }
 484
 485 static void mce_do_trigger(struct work_struct *work)
 486 {
 487         call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 488 }
 489
 490 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
 491
 492 /*
 493  * Notify the user(s) about new machine check events.
 494  * Can be called from interrupt context, but not from machine check/NMI
 495  * context.
 496  */
 497 int mce_notify_user(void)
 498 {
 499         /* Not more than two messages every minute */
 500         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 501
 502         clear_thread_flag(TIF_MCE_NOTIFY);
 503         if (test_and_clear_bit(0, &notify_user)) {
 504                 wake_up_interruptible(&mce_wait);
 505
 506                 /*
 507                  * There is no risk of missing notifications because
 508                  * work_pending is always cleared before the function is
 509                  * executed.
 510                  */
 511                 if (trigger[0] && !work_pending(&mce_trigger_work))
 512                         schedule_work(&mce_trigger_work);
 513
 514                 if (__ratelimit(&ratelimit))
 515                         printk(KERN_INFO "Machine check events logged\n");
 516
 517                 return 1;
 518         }
 519         return 0;
 520 }
 521
 522 /* see if the idle task needs to notify userspace */
 523 static int
 524 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
 525 {
 526         /* IDLE_END should be safe - interrupts are back on */
 527         if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
 528                 mce_notify_user();
 529
 530         return NOTIFY_OK;
 531 }
 532
 533 static struct notifier_block mce_idle_notifier = {
 534         .notifier_call = mce_idle_callback,
 535 };
 536
 537 static __init int periodic_mcheck_init(void)
 538 {
 539        idle_notifier_register(&mce_idle_notifier);
 540        return 0;
 541 }
 542 __initcall(periodic_mcheck_init);
 543
 544 /*
 545  * Initialize Machine Checks for a CPU.
 546  */
 547 static int mce_cap_init(void)
 548 {
 549         u64 cap;
 550         unsigned b;
 551
 552         rdmsrl(MSR_IA32_MCG_CAP, cap);
 553         b = cap & 0xff;
 554         if (b > MAX_NR_BANKS) {
 555                 printk(KERN_WARNING
 556                        "MCE: Using only %u machine check banks out of %u\n",
 557                         MAX_NR_BANKS, b);
 558                 b = MAX_NR_BANKS;
 559         }
 560
 561         /* Don't support asymmetric configurations today */
 562         WARN_ON(banks != 0 && b != banks);
 563         banks = b;
 564         if (!bank) {
 565                 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
 566                 if (!bank)
 567                         return -ENOMEM;
 568                 memset(bank, 0xff, banks * sizeof(u64));
 569         }
 570
 571         /* Use accurate RIP reporting if available. */
 572         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 573                 rip_msr = MSR_IA32_MCG_EIP;
 574
 575         return 0;
 576 }
 577
 578 static void mce_init(void *dummy)
 579 {
 580         u64 cap;
 581         int i;
 582         mce_banks_t all_banks;
 583
 584         /*
 585          * Log the machine checks left over from the previous reset.
 586          */
 587         bitmap_fill(all_banks, MAX_NR_BANKS);
 588         machine_check_poll(MCP_UC, &all_banks);
 589
 590         set_in_cr4(X86_CR4_MCE);
 591
 592         rdmsrl(MSR_IA32_MCG_CAP, cap);
 593         if (cap & MCG_CTL_P)
 594                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 595
 596         for (i = 0; i < banks; i++) {
 597                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 598                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 599         }
 600 }
 601
 602 /* Add per CPU specific workarounds here */
 603 static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 604 {
 605         /* This should be disabled by the BIOS, but isn't always */
 606         if (c->x86_vendor == X86_VENDOR_AMD) {
 607                 if (c->x86 == 15 && banks > 4)
 608                         /* disable GART TBL walk error reporting, which trips off
 609                            incorrectly with the IOMMU & 3ware & Cerberus. */
 610                         clear_bit(10, (unsigned long *)&bank[4]);
 611                 if(c->x86 <= 17 && mce_bootlog < 0)
 612                         /* Lots of broken BIOS around that don't clear them
 613                            by default and leave crap in there. Don't log. */
 614                         mce_bootlog = 0;
 615         }
 616
 617 }
 618
 619 static void mce_cpu_features(struct cpuinfo_x86 *c)
 620 {
 621         switch (c->x86_vendor) {
 622         case X86_VENDOR_INTEL:
 623                 mce_intel_feature_init(c);
 624                 break;
 625         case X86_VENDOR_AMD:
 626                 mce_amd_feature_init(c);
 627                 break;
 628         default:
 629                 break;
 630         }
 631 }
 632
 633 static void mce_init_timer(void)
 634 {
 635         struct timer_list *t = &__get_cpu_var(mce_timer);
 636         int *n = &__get_cpu_var(next_interval);
 637
 638         *n = check_interval * HZ;
 639         if (!*n)
 640                 return;
 641         setup_timer(t, mcheck_timer, smp_processor_id());
 642         t->expires = round_jiffies(jiffies + *n);
 643         add_timer(t);
 644 }
 645
 646 /*
 647  * Called for each booted CPU to set up machine checks.
 648  * Must be called with preempt off.
 649  */
 650 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 651 {
 652         if (!mce_available(c))
 653                 return;
 654
 655         if (mce_cap_init() < 0) {
 656                 mce_dont_init = 1;
 657                 return;
 658         }
 659         mce_cpu_quirks(c);
 660
 661         mce_init(NULL);
 662         mce_cpu_features(c);
 663         mce_init_timer();
 664 }
 665
 666 /*
 667  * Character device to read and clear the MCE log.
 668  */
 669
 670 static DEFINE_SPINLOCK(mce_state_lock);
 671 static int open_count;  /* #times opened */
 672 static int open_exclu;  /* already open exclusive? */
 673
 674 static int mce_open(struct inode *inode, struct file *file)
 675 {
 676         lock_kernel();
 677         spin_lock(&mce_state_lock);
 678
 679         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 680                 spin_unlock(&mce_state_lock);
 681                 unlock_kernel();
 682                 return -EBUSY;
 683         }
 684
 685         if (file->f_flags & O_EXCL)
 686                 open_exclu = 1;
 687         open_count++;
 688
 689         spin_unlock(&mce_state_lock);
 690         unlock_kernel();
 691
 692         return nonseekable_open(inode, file);
 693 }
 694
 695 static int mce_release(struct inode *inode, struct file *file)
 696 {
 697         spin_lock(&mce_state_lock);
 698
 699         open_count--;
 700         open_exclu = 0;
 701
 702         spin_unlock(&mce_state_lock);
 703
 704         return 0;
 705 }
 706
 707 static void collect_tscs(void *data)
 708 {
 709         unsigned long *cpu_tsc = (unsigned long *)data;
 710
 711         rdtscll(cpu_tsc[smp_processor_id()]);
 712 }
 713
  714 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
  715                         loff_t *off)
  716 {
              /*
               * Copy all logged records to user space and clear the log.
               *
               * Only whole-log reads are accepted: *off must be 0 and usize
               * must hold MCE_LOG_LEN records, otherwise -EINVAL is returned.
               * Returns the number of bytes copied, -ENOMEM if the TSC
               * scratch array cannot be allocated, or -EFAULT if any
               * copy_to_user() failed.
               */
  717         unsigned long *cpu_tsc;
  718         static DEFINE_MUTEX(mce_read_mutex);
  719         unsigned prev, next;
  720         char __user *buf = ubuf;
  721         int i, err;
  722
              /* One TSC slot per possible CPU id, filled in by collect_tscs(). */
  723         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
  724         if (!cpu_tsc)
  725                 return -ENOMEM;
  726
  727         mutex_lock(&mce_read_mutex);
  728         next = rcu_dereference(mcelog.next);
  729
  730         /* Only supports full reads right now */
  731         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
  732                 mutex_unlock(&mce_read_mutex);
  733                 kfree(cpu_tsc);
  734                 return -EINVAL;
  735         }
  736
  737         err = 0;
  738         prev = 0;
              /*
               * Copy entries [prev, next), then try to reset mcelog.next to 0.
               * If writers claimed more slots in the meantime the cmpxchg
               * fails and returns the new end, and the loop copies the rest.
               */
  739         do {
  740                 for (i = prev; i < next; i++) {
  741                         unsigned long start = jiffies;
  742
                              /*
                               * The slot is claimed but not yet marked finished.
                               * Wait for the writer; once two jiffies have
                               * passed, wipe the entry and skip it.
                               */
  743                         while (!mcelog.entry[i].finished) {
  744                                 if (time_after_eq(jiffies, start + 2)) {
  745                                         memset(mcelog.entry + i, 0,
  746                                                sizeof(struct mce));
  747                                         goto timeout;
  748                                 }
  749                                 cpu_relax();
  750                         }
  751                         smp_rmb();
  752                         err |= copy_to_user(buf, mcelog.entry + i,
  753                                             sizeof(struct mce));
  754                         buf += sizeof(struct mce);
  755 timeout:
  756                         ;
  757                 }
  758
  759                 memset(mcelog.entry + prev, 0,
  760                        (next - prev) * sizeof(struct mce));
  761                 prev = next;
  762                 next = cmpxchg(&mcelog.next, prev, 0);
  763         } while (next != prev);
  764
  765         synchronize_sched();
  766
  767         /*
  768          * Collect entries that were still getting written before the
  769          * synchronize.
  770          */
  771         on_each_cpu(collect_tscs, cpu_tsc, 1);
              /*
               * Take only finished entries stamped earlier than the TSC just
               * sampled on the CPU that logged them.
               */
  772         for (i = next; i < MCE_LOG_LEN; i++) {
  773                 if (mcelog.entry[i].finished &&
  774                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
  775                         err |= copy_to_user(buf, mcelog.entry+i,
  776                                             sizeof(struct mce));
  777                         smp_rmb();
  778                         buf += sizeof(struct mce);
  779                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
  780                 }
  781         }
  782         mutex_unlock(&mce_read_mutex);
  783         kfree(cpu_tsc);
  784         return err ? -EFAULT : buf - ubuf;
  785 }
 786
 787 static unsigned int mce_poll(struct file *file, poll_table *wait)
 788 {
 789         poll_wait(file, &mce_wait, wait);
 790         if (rcu_dereference(mcelog.next))
 791                 return POLLIN | POLLRDNORM;
 792         return 0;
 793 }
 794
 795 static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 796 {
 797         int __user *p = (int __user *)arg;
 798
 799         if (!capable(CAP_SYS_ADMIN))
 800                 return -EPERM;
 801         switch (cmd) {
 802         case MCE_GET_RECORD_LEN:
 803                 return put_user(sizeof(struct mce), p);
 804         case MCE_GET_LOG_LEN:
 805                 return put_user(MCE_LOG_LEN, p);
 806         case MCE_GETCLEAR_FLAGS: {
 807                 unsigned flags;
 808
 809                 do {
 810                         flags = mcelog.flags;
 811                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 812                 return put_user(flags, p);
 813         }
 814         default:
 815                 return -ENOTTY;
 816         }
 817 }
 818
  819 static const struct file_operations mce_chrdev_ops = { /* file operations of the mcelog device */
  820         .open = mce_open,
  821         .release = mce_release,
  822         .read = mce_read,
  823         .poll = mce_poll,
  824         .unlocked_ioctl = mce_ioctl,
  825 };
  826
      /* Misc device description; positional initializers: minor number, device name, file operations. */
  827 static struct miscdevice mce_log_device = {
  828         MISC_MCELOG_MINOR,
  829         "mcelog",
  830         &mce_chrdev_ops,
  831 };
 832
 833 /*
 834  * Old style boot options parsing. Only for compatibility.
 835  */
 836 static int __init mcheck_disable(char *str)
 837 {
 838         mce_dont_init = 1;
 839         return 1;
 840 }
 841
 842 /* mce=off disables machine check.
 843    mce=TOLERANCELEVEL (number, see above)
 844    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 845    mce=nobootlog Don't log MCEs from before booting. */
 846 static int __init mcheck_enable(char *str)
 847 {
 848         if (!strcmp(str, "off"))
 849                 mce_dont_init = 1;
 850         else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
 851                 mce_bootlog = str[0] == 'b';
 852         else if (isdigit(str[0]))
 853                 get_option(&str, &tolerant);
 854         else
 855                 printk("mce= argument %s ignored. Please use /sys", str);
 856         return 1;
 857 }
 858
  859 __setup("nomce", mcheck_disable); /* "nomce" on the command line -> mcheck_disable() */
  860 __setup("mce=", mcheck_enable); /* "mce=<arg>" -> mcheck_enable(<arg>) */
 861
 862 /*
 863  * Sysfs support
 864  */
 865
 866 /*
 867  * Disable machine checks on suspend and shutdown. We can't really handle
 868  * them later.
 869  */
 870 static int mce_disable(void)
 871 {
 872         int i;
 873
 874         for (i = 0; i < banks; i++)
 875                 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 876         return 0;
 877 }
 878
 879 static int mce_suspend(struct sys_device *dev, pm_message_t state)
 880 {
 881         return mce_disable();
 882 }
 883
 884 static int mce_shutdown(struct sys_device *dev)
 885 {
 886         return mce_disable();
 887 }
 888
 889 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 890    Only one CPU is active at this time, the others get readded later using
 891    CPU hotplug. */
 892 static int mce_resume(struct sys_device *dev)
 893 {
 894         mce_init(NULL);
 895         mce_cpu_features(&current_cpu_data);
 896         return 0;
 897 }
 898
 899 static void mce_cpu_restart(void *data)
 900 {
 901         del_timer_sync(&__get_cpu_var(mce_timer));
 902         if (mce_available(&current_cpu_data))
 903                 mce_init(NULL);
 904         mce_init_timer();
 905 }
 906
 907 /* Reinit MCEs after user configuration changes */
 908 static void mce_restart(void)
 909 {
 910         on_each_cpu(mce_cpu_restart, NULL, 1);
 911 }
 912
  913 static struct sysdev_class mce_sysclass = { /* "machinecheck" sysdev class with its power management hooks */
  914         .suspend = mce_suspend,
  915         .shutdown = mce_shutdown,
  916         .resume = mce_resume,
  917         .name = "machinecheck",
  918 };
  919
      /* One sys_device per CPU. */
  920 DEFINE_PER_CPU(struct sys_device, device_mce);
      /* Optional callback taking an action code and a CPU number; nothing in this part of the file assigns it. */
  921 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
 922
 923 /* Why are there no generic functions for this? */
 924 #define ACCESSOR(name, var, start) \
 925         static ssize_t show_ ## name(struct sys_device *s,              \
 926                                      struct sysdev_attribute *attr,     \
 927                                      char *buf) {                       \
 928                 return sprintf(buf, "%lx\n", (unsigned long)var);       \
 929         }                                                               \
 930         static ssize_t set_ ## name(struct sys_device *s,               \
 931                                     struct sysdev_attribute *attr,      \
 932                                     const char *buf, size_t siz) {      \
 933                 char *end;                                              \
 934                 unsigned long new = simple_strtoul(buf, &end, 0);       \
 935                 if (end == buf) return -EINVAL;                         \
 936                 var = new;                                              \
 937                 start;                                                  \
 938                 return end-buf;                                         \
 939         }                                                               \
 940         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 941
 942 static struct sysdev_attribute *bank_attrs;
 943
 944 static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
 945                          char *buf)
 946 {
 947         u64 b = bank[attr - bank_attrs];
 948         return sprintf(buf, "%llx\n", b);
 949 }
 950
 951 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
 952                         const char *buf, size_t siz)
 953 {
 954         char *end;
 955         u64 new = simple_strtoull(buf, &end, 0);
 956         if (end == buf)
 957                 return -EINVAL;
 958         bank[attr - bank_attrs] = new;
 959         mce_restart();
 960         return end-buf;
 961 }
 962
 963 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 964                                 char *buf)
 965 {
 966         strcpy(buf, trigger);
 967         strcat(buf, "\n");
 968         return strlen(trigger) + 1;
 969 }
 970
 971 static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 972                                 const char *buf,size_t siz)
 973 {
 974         char *p;
 975         int len;
 976         strncpy(trigger, buf, sizeof(trigger));
 977         trigger[sizeof(trigger)-1] = 0;
 978         len = strlen(trigger);
 979         p = strchr(trigger, '\n');
 980         if (*p) *p = 0;
 981         return len;
 982 }
 983
 984 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 985 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 986 ACCESSOR(check_interval,check_interval,mce_restart())
 987 static struct sysdev_attribute *mce_attributes[] = {
 988         &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 989         NULL
 990 };
 991
 992 static cpumask_var_t mce_device_initialized;
 993
 994 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 995 static __cpuinit int mce_create_device(unsigned int cpu)
 996 {
 997         int err;
 998         int i;
 999
1000         if (!mce_available(&boot_cpu_data))
1001                 return -EIO;
1002
1003         memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1004         per_cpu(device_mce,cpu).id = cpu;
1005         per_cpu(device_mce,cpu).cls = &mce_sysclass;
1006
1007         err = sysdev_register(&per_cpu(device_mce,cpu));
1008         if (err)
1009                 return err;
1010
1011         for (i = 0; mce_attributes[i]; i++) {
1012                 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1013                                          mce_attributes[i]);
1014                 if (err)
1015                         goto error;
1016         }
1017         for (i = 0; i < banks; i++) {
1018                 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1019                                         &bank_attrs[i]);
1020                 if (err)
1021                         goto error2;
1022         }
1023         cpumask_set_cpu(cpu, mce_device_initialized);
1024
1025         return 0;
1026 error2:
1027         while (--i >= 0) {
1028                 sysdev_remove_file(&per_cpu(device_mce, cpu),
1029                                         &bank_attrs[i]);
1030         }
1031 error:
1032         while (--i >= 0) {
1033                 sysdev_remove_file(&per_cpu(device_mce,cpu),
1034                                    mce_attributes[i]);
1035         }
1036         sysdev_unregister(&per_cpu(device_mce,cpu));
1037
1038         return err;
1039 }
1040
1041 static __cpuinit void mce_remove_device(unsigned int cpu)
1042 {
1043         int i;
1044
1045         if (!cpumask_test_cpu(cpu, mce_device_initialized))
1046                 return;
1047
1048         for (i = 0; mce_attributes[i]; i++)
1049                 sysdev_remove_file(&per_cpu(device_mce,cpu),
1050                         mce_attributes[i]);
1051         for (i = 0; i < banks; i++)
1052                 sysdev_remove_file(&per_cpu(device_mce, cpu),
1053                         &bank_attrs[i]);
1054         sysdev_unregister(&per_cpu(device_mce,cpu));
1055         cpumask_clear_cpu(cpu, mce_device_initialized);
1056 }
1057
1058 /* Make sure there are no machine checks on offlined CPUs. */
1059 static void mce_disable_cpu(void *h)
1060 {
1061         int i;
1062         unsigned long action = *(unsigned long *)h;
1063
1064         if (!mce_available(&current_cpu_data))
1065                 return;
1066         if (!(action & CPU_TASKS_FROZEN))
1067                 cmci_clear();
1068         for (i = 0; i < banks; i++)
1069                 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1070 }
1071
1072 static void mce_reenable_cpu(void *h)
1073 {
1074         int i;
1075         unsigned long action = *(unsigned long *)h;
1076
1077         if (!mce_available(&current_cpu_data))
1078                 return;
1079         if (!(action & CPU_TASKS_FROZEN))
1080                 cmci_reenable();
1081         for (i = 0; i < banks; i++)
1082                 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1083 }
1084
1085 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
1086 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1087                                       unsigned long action, void *hcpu)
1088 {
1089         unsigned int cpu = (unsigned long)hcpu;
1090         struct timer_list *t = &per_cpu(mce_timer, cpu);
1091
1092         switch (action) {
1093         case CPU_ONLINE:
1094         case CPU_ONLINE_FROZEN:
1095                 mce_create_device(cpu);
1096                 if (threshold_cpu_callback)
1097                         threshold_cpu_callback(action, cpu);
1098                 break;
1099         case CPU_DEAD:
1100         case CPU_DEAD_FROZEN:
1101                 if (threshold_cpu_callback)
1102                         threshold_cpu_callback(action, cpu);
1103                 mce_remove_device(cpu);
1104                 break;
1105         case CPU_DOWN_PREPARE:
1106         case CPU_DOWN_PREPARE_FROZEN:
1107                 del_timer_sync(t);
1108                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1109                 break;
1110         case CPU_DOWN_FAILED:
1111         case CPU_DOWN_FAILED_FROZEN:
1112                 t->expires = round_jiffies(jiffies +
1113                                                 __get_cpu_var(next_interval));
1114                 add_timer_on(t, cpu);
1115                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1116                 break;
1117         case CPU_POST_DEAD:
1118                 /* intentionally ignoring frozen here */
1119                 cmci_rediscover(cpu);
1120                 break;
1121         }
1122         return NOTIFY_OK;
1123 }
1124
1125 static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1126         .notifier_call = mce_cpu_callback,
1127 };
1128
1129 static __init int mce_init_banks(void)
1130 {
1131         int i;
1132
1133         bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1134                                 GFP_KERNEL);
1135         if (!bank_attrs)
1136                 return -ENOMEM;
1137
1138         for (i = 0; i < banks; i++) {
1139                 struct sysdev_attribute *a = &bank_attrs[i];
1140                 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1141                 if (!a->attr.name)
1142                         goto nomem;
1143                 a->attr.mode = 0644;
1144                 a->show = show_bank;
1145                 a->store = set_bank;
1146         }
1147         return 0;
1148
1149 nomem:
1150         while (--i >= 0)
1151                 kfree(bank_attrs[i].attr.name);
1152         kfree(bank_attrs);
1153         bank_attrs = NULL;
1154         return -ENOMEM;
1155 }
1156
1157 static __init int mce_init_device(void)
1158 {
1159         int err;
1160         int i = 0;
1161
1162         if (!mce_available(&boot_cpu_data))
1163                 return -EIO;
1164
1165         alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1166
1167         err = mce_init_banks();
1168         if (err)
1169                 return err;
1170
1171         err = sysdev_class_register(&mce_sysclass);
1172         if (err)
1173                 return err;
1174
1175         for_each_online_cpu(i) {
1176                 err = mce_create_device(i);
1177                 if (err)
1178                         return err;
1179         }
1180
1181         register_hotcpu_notifier(&mce_cpu_notifier);
1182         misc_register(&mce_log_device);
1183         return err;
1184 }
1185
1186 device_initcall(mce_init_device);