#include <asm/mce.h>
#include <asm/msr.h>
+#include "mce-internal.h"
#include "mce.h"
/* Handle unconfigured int18 (should never happen) */
atomic_t mce_entry;
+DEFINE_PER_CPU(unsigned, mce_exception_count); /* per-CPU counter, bumped on entry to the machine check exception handler */
+
/*
* Tolerant levels:
* 0: always panic on uncorrected errors, log corrected errors
/*
 * Initialise an MCE record for the CPU this runs on: zero it, then fill in
 * the CPU identity, the TSC and wall-clock timestamps and the MCG
 * capability register.
 */
void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
- m->cpu = smp_processor_id();
+ m->cpu = m->extcpu = smp_processor_id(); /* both fields get the same CPU number */
rdtscll(m->tsc);
+ /* We hope get_seconds stays lockless */
+ m->time = get_seconds();
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->cpuid = cpuid_eax(1);
+#ifdef CONFIG_SMP
+ m->socketid = cpu_data(m->extcpu).phys_proc_id; /* stays 0 (from the memset) when CONFIG_SMP is off */
+#endif
+ m->apicid = cpu_data(m->extcpu).initial_apicid;
+ rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}
DEFINE_PER_CPU(struct mce, injectm);
*/
static struct mce_log mcelog = {
- MCE_LOG_SIGNATURE,
- MCE_LOG_LEN,
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce), /* NOTE(review): presumably lets log readers check the record size - confirm */
};
void mce_log(struct mce *mce)
mcelog.entry[entry].finished = 1;
wmb();
+ mce->finished = 1;
set_bit(0, &notify_user);
}
KERN_EMERG "HARDWARE ERROR\n"
KERN_EMERG
"CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
- m->cpu, m->mcgstatus, m->bank, m->status);
+ m->extcpu, m->mcgstatus, m->bank, m->status);
if (m->ip) {
printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
if (m->misc)
printk("MISC %llx ", m->misc);
printk("\n");
+ printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid,
+ m->apicid);
printk(KERN_EMERG "This is not a software problem!\n");
printk(KERN_EMERG "Run through mcelog --ascii to decode "
"and contact your hardware vendor\n");
}
-static void mce_panic(char *msg, struct mce *backup, u64 start)
+/*
+ * Dump the machine check log to the console and panic.
+ *
+ * msg:   panic string, handed to panic().
+ * final: record to print last, after all logged ones; may be NULL.
+ * exp:   optional explanation, printed as "Machine check: <exp>";
+ *        may be NULL.
+ *
+ * Logged records are printed in two passes: corrected ones first, then
+ * uncorrected ones.  The second pass must skip corrected records,
+ * otherwise every corrected record would be printed twice.
+ */
+static void mce_panic(char *msg, struct mce *final, char *exp)
{
int i;
bust_spinlocks(1);
console_verbose();
+ /* First print corrected ones that are still unlogged */
for (i = 0; i < MCE_LOG_LEN; i++) {
- u64 tsc = mcelog.entry[i].tsc;
-
- if ((s64)(tsc - start) < 0)
+ struct mce *m = &mcelog.entry[i];
+ if ((m->status & MCI_STATUS_VAL) &&
+ !(m->status & MCI_STATUS_UC))
+ print_mce(m);
+ }
+ /* Now print uncorrected but with the final one last */
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+ if (!(m->status & MCI_STATUS_VAL))
continue;
- print_mce(&mcelog.entry[i]);
- if (backup && mcelog.entry[i].tsc == backup->tsc)
- backup = NULL;
+ /* Corrected entries were already printed by the first pass */
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ /* Hold back the entry matching final so it comes out last */
+ if (!final || memcmp(m, final, sizeof(struct mce)))
+ print_mce(m);
}
- if (backup)
- print_mce(backup);
+ if (final)
+ print_mce(final);
+ if (exp)
+ printk(KERN_EMERG "Machine check: %s\n", exp);
panic(msg);
}
}
}
+DEFINE_PER_CPU(unsigned, mce_poll_count); /* per-CPU counter, bumped each time the machine check poll routine runs */
+
/*
* Poll for corrected events or events that happened before reset.
* Those are just logged through /dev/mcelog.
struct mce m;
int i;
+ __get_cpu_var(mce_poll_count)++;
+
mce_setup(&m);
m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
}
EXPORT_SYMBOL_GPL(machine_check_poll);
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ *
+ * Reads each bank's status MSR into m->status and asks mce_severity()
+ * to grade it.  Returns 1 as soon as one bank grades at or above
+ * MCE_PANIC_SEVERITY, 0 if none does.  m->status is left holding the
+ * last status value that was read.  msg is handed through to
+ * mce_severity(); presumably it gets pointed at a description of the
+ * error - confirm against mce_severity().
+ */
+static int mce_no_way_out(struct mce *m, char **msg)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); /* one status MSR every 4 MSR numbers */
+ if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
+ return 1;
+ }
+ return 0;
+}
+
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
{
struct mce m, panicm;
int panicm_found = 0;
- u64 mcestart = 0;
int i;
/*
* If no_way_out gets set, there is no safe way to recover from this
*/
int kill_it = 0;
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ char *msg = "Unknown";
atomic_inc(&mce_entry);
+ __get_cpu_var(mce_exception_count)++;
+
if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
goto out;
mce_setup(&m);
m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ no_way_out = mce_no_way_out(&m, &msg);
- /* if the restart IP is not valid, we're done for */
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- no_way_out = 1;
-
- rdtscll(mcestart);
barrier();
for (i = 0; i < banks; i++) {
/*
* Non uncorrected errors are handled by machine_check_poll
- * Leave them alone.
+ * Leave them alone, unless this panics.
*/
- if ((m.status & MCI_STATUS_UC) == 0)
+ if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
continue;
/*
__set_bit(i, toclear);
if (m.status & MCI_STATUS_EN) {
- /* if PCC was set, there's no way out */
- no_way_out |= !!(m.status & MCI_STATUS_PCC);
/*
* If this error was uncorrectable and there was
* an overflow, we're in trouble. If no overflow,
* we might get away with just killing a task.
*/
- if (m.status & MCI_STATUS_UC) {
- if (tolerant < 1 || m.status & MCI_STATUS_OVER)
- no_way_out = 1;
+ if (m.status & MCI_STATUS_UC)
kill_it = 1;
- }
} else {
/*
* Machine check event was not enabled. Clear, but
* has not set tolerant to an insane level, give up and die.
*/
if (no_way_out && tolerant < 3)
- mce_panic("Machine check", &panicm, mcestart);
+ mce_panic("Machine check", &panicm, msg);
/*
* If the error seems to be unrecoverable, something should be
if (user_space) {
force_sig(SIGBUS, current);
} else if (panic_on_oops || tolerant < 2) {
- mce_panic("Uncorrected machine check",
- &panicm, mcestart);
+ mce_panic("Uncorrected machine check", &panicm, msg);
}
}