1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Per-CPU-family decode callbacks (dc/ic/nb), selected once in mce_amd_init(). */
static struct amd_decoder_ops *fam_ops;

/* Mask applied to the extended error code, MCi_STATUS[20:16]; family-dependent. */
static u8 xec_mask = 0xf;
/* Mask for the "associated core" field in the NB status high word
 * (narrowed to 0x3 for one family in mce_amd_init() — see nb_err_cpumask = 0x3). */
static u8 nb_err_cpumask = 0xf;

/* GART TLB errors (bank 4, xec 0x5) are dropped by amd_filter_mce() unless set. */
static bool report_gart_errors;
/* Optional EDAC callback for NB errors; registered via amd_register_ecc_decoder(). */
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
/*
 * Enable/disable reporting of NB GART TLB errors, which amd_filter_mce()
 * otherwise filters out (bank 4, xec 0x5).
 * NOTE(review): the function's braces are elided in this view.
 */
void amd_report_gart_errors(bool v)
	report_gart_errors = v;
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * Register a callback to receive decoded NB errors (node id, mce record,
 * NB config).  Presumably stores @f in nb_bus_decoder — cf. the WARN_ON
 * in amd_unregister_ecc_decoder() — but the body is elided in this view;
 * TODO confirm against the full file.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * Unregister the NB ECC decoder callback.  Warns if @f is not the callback
 * currently registered, then clears it unconditionally.
 * NOTE(review): braces/blank lines elided in this view.
 */
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
	WARN_ON(nb_bus_decoder != f);

	nb_bus_decoder = NULL;
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 * string representation for the different MCA reported error types, see F3x48
/* transaction type */
const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
EXPORT_SYMBOL_GPL(tt_msgs);

/* cache level (used via LL_MSG for the "Cache Level" field) */
const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
EXPORT_SYMBOL_GPL(ll_msgs);

/* memory transaction type */
/* NOTE(review): closing "};" of rrrr_msgs is elided in this view. */
const char *rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
EXPORT_SYMBOL_GPL(rrrr_msgs);

/* participating processor */
const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* timeout status (used via TO_MSG) */
const char *to_msgs[] = { "no timeout", "timed out" };
EXPORT_SYMBOL_GPL(to_msgs);

/* memory or I/O space (used via II_MSG) */
const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Extended-error-code descriptions for F10h northbridge MCEs; indexed as
 * f10h_nb_mce_desc[xec - offset] in f10h_nb_mce().
 * NOTE(review): at least one leading entry and the closing "};" are elided
 * in this view.
 */
static const char *f10h_nb_mce_desc[] = {
	"Protocol error (link, L3, probe filter, etc.)",
	"Parity error in NB-internal arrays",
	"Link Retry due to IO link transmission error",
	"L3 ECC data cache error",
	"ECC error in L3 cache tag",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
/*
 * Extended-error-code descriptions for F15h instruction-cache MCEs; indexed
 * with xec, xec-2 and xec-4 in f15h_ic_mce().
 * NOTE(review): closing "};" is elided in this view.
 */
static const char * const f15h_ic_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error", /* xec = 0xd */
	"patch RAM", /* xec = 010 */
/*
 * F12h data-cache MCE decoder: prints detail derived from the low 16-bit
 * error code @ec; the return value tells amd_decode_dc_mce() whether the
 * signature was recognized.
 * NOTE(review): the leading branch(es) and return statement are elided here.
 */
static bool f12h_dc_mce(u16 ec, u8 xec)
		pr_cont("during L1 linefill from L2.\n");
	else if (ll == LL_L1)
		pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
/*
 * F10h data-cache MCE decoder.  Handles the data-scrub signature
 * (r4 == GEN at cache level L1) itself and delegates everything else to
 * the F12h decoder.  r4 is the memory-transaction-type field, ec[7:4].
 * NOTE(review): some lines (braces, a return) are elided in this view.
 */
static bool f10h_dc_mce(u16 ec, u8 xec)
	u8 r4 = (ec >> 4) & 0xf;
	if (r4 == R4_GEN && ll == LL_L1) {
		pr_cont("during data scrub.\n");
	return f12h_dc_mce(ec, xec);
/*
 * K8 (family 0xf) data-cache MCE decoder: prints the system-linefill case
 * (condition elided in this view) and otherwise falls through to the F10h
 * decoder.
 */
static bool k8_dc_mce(u16 ec, u8 xec)
		pr_cont("during system linefill.\n");
	return f10h_dc_mce(ec, xec);
/*
 * F14h data-cache MCE decoder.  Splits on error type: a memory-error branch
 * (requires TT_DATA at level L1, then distinguishes load/store parity,
 * copyback and snoop-tag cases on r4) and a bus-error branch (system read
 * data error, requires MEM or IO space at level LG).  A TLB-reload message
 * is printed in a further (elided) branch.
 * NOTE(review): the if/switch headers and returns are elided in this view;
 * comments above are inferred only from the visible guards — confirm against
 * the full file.
 */
static bool f14h_dc_mce(u16 ec, u8 xec)
	u8 r4 = (ec >> 4) & 0xf;
	u8 tt = (ec >> 2) & 0x3;
		if (tt != TT_DATA || ll != LL_L1)
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			pr_cont("Copyback parity error on a tag miss.\n");
			pr_cont("Tag parity error during snoop.\n");
	} else if (BUS_ERROR(ec)) {
		if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
		pr_cont("System read data error on a ");
		pr_cont("TLB reload.\n");
/*
 * F15h data-cache MCE decoder.  Prints one message per extended error code
 * (data array, L2/NB linefill UC, STQ, SCB, tag, LDQ access errors) plus a
 * bus-error branch covering system linefill and the internal
 * livelock/deadlock condition (xec == 1 selects "livelock").
 * NOTE(review): the selecting switch/if structure is elided in this view.
 */
static bool f15h_dc_mce(u16 ec, u8 xec)
		pr_cont("Data Array access error.\n");
		pr_cont("UC error during a linefill from L2/NB.\n");
		pr_cont("STQ access error.\n");
		pr_cont("SCB access error.\n");
		pr_cont("Tag error.\n");
		pr_cont("LDQ access error.\n");
	} else if (BUS_ERROR(ec)) {
			pr_cont("during system linefill.\n");
			pr_cont(" Internal %s condition.\n",
				((xec == 1) ? "livelock" : "deadlock"));
/*
 * Top-level data-cache (bank 0) decoder: extracts the 16-bit error code and
 * the masked extended error code from MCi_STATUS, handles the TLB signature
 * common to all families inline, and defers the rest to the per-family
 * fam_ops->dc_mce hook.  Prints "Corrupted DC MCE info?" if nothing matched.
 * NOTE(review): the TLB branch header and braces are elided in this view.
 */
static void amd_decode_dc_mce(struct mce *m)
	u16 ec = m->status & 0xffff;
	u8 xec = (m->status >> 16) & xec_mask;

	pr_emerg(HW_ERR "Data Cache Error: ");

	/* TLB error signatures are the same across families */
		u8 tt = (ec >> 2) & 0x3;

		/* xec == 2: locked miss; other non-zero: multimatch; 0: parity */
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			((xec == 2) ? "locked miss"
				    : (xec ? "multimatch" : "parity")));
	} else if (fam_ops->dc_mce(ec, xec))
	pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
/*
 * K8 instruction-cache MCE decoder: prints the L2-linefill case and, at
 * cache level 1, distinguishes data-load parity, copyback/victim and tag
 * snoop errors (selection on r4 is elided in this view).
 */
static bool k8_ic_mce(u16 ec, u8 xec)
	u8 r4 = (ec >> 4) & 0xf;
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
			pr_cont("Parity error during data load.\n");
			pr_cont("Copyback Parity/Victim error.\n");
			pr_cont("Tag Snoop error.\n");
/*
 * F14h instruction-cache MCE decoder.  Only signatures with tt == 0 and
 * ll == 1 are accepted; then r4 selects between a data/tag-array parity
 * error on a tag hit and a snoop/victimization tag error.
 * NOTE(review): surrounding braces/returns and the r4 comparison preceding
 * the first pr_cont are elided in this view.
 */
static bool f14h_ic_mce(u16 ec, u8 xec)
	u8 tt = (ec >> 2) & 0x3;
	u8 r4 = (ec >> 4) & 0xf;
		if (tt != 0 || ll != 1)
			pr_cont("Data/tag array parity error for a tag hit.\n");
		else if (r4 == R4_SNOOP)
			pr_cont("Tag error during snoop/victimization.\n");
/*
 * F15h instruction-cache MCE decoder: looks the extended error code up in
 * f15h_ic_mce_desc[], with per-range offsets (xec, xec-2, xec-4) — the
 * range checks guarding each lookup are elided in this view; confirm the
 * bounds against the full file before touching the offsets.
 */
static bool f15h_ic_mce(u16 ec, u8 xec)
		pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
		pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
		pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
/*
 * Top-level instruction-cache (bank 1) decoder: handles the cross-family
 * TLB signature and the bus-error case inline (bit 58 of MCi_STATUS on
 * family 0xf selects the "system linefill" wording), then defers to the
 * per-family fam_ops->ic_mce hook.  Prints "Corrupted IC MCE info?" if
 * nothing matched.
 * NOTE(review): the TLB branch header and some braces are elided here.
 */
static void amd_decode_ic_mce(struct mce *m)
	u16 ec = m->status & 0xffff;
	u8 xec = (m->status >> 16) & xec_mask;

	pr_emerg(HW_ERR "Instruction Cache Error: ");

		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (fam_ops->ic_mce(ec, xec))
	pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
/*
 * Bus-unit (bank 2) decoder.  Dispatches on the extended error code:
 * write/victim data buffers, L2 cache tags (xec 0x2 + memory error), and a
 * composite xec == 0x0 case covering Page Descriptor Cache / Guest TLB,
 * NB read data ECC, data copyback and L2 data-access errors.
 * Prints "Corrupted BU MCE info?" when nothing matches.
 * NOTE(review): several branch headers and closing braces are elided in
 * this view; the xec dispatch structure above is inferred from the visible
 * guards only.
 */
static void amd_decode_bu_mce(struct mce *m)
	u32 ec = m->status & 0xffff;
	u32 xec = (m->status >> 16) & xec_mask;

	pr_emerg(HW_ERR "Bus Unit Error");

		pr_cont(" in the write data buffers.\n");
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
	else if (xec == 0x0) {
			pr_cont(": %s error in a Page Descriptor Cache or "
				"Guest TLB.\n", TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				RRRR_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 rrrr = (ec >> 4) & 0xf;

				pr_cont(": %s error during data copyback.\n",
			else if (rrrr <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", RRRR_MSG(ec));
	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
/*
 * Load/store (bank 3) decoder.  Family 0x14 has no LS bank, so seeing one
 * there is reported as a bug to be sent to LKML.  Otherwise only bus-error
 * signatures with r4 == DRD or DWR are accepted; anything else yields
 * "Corrupted LS MCE info?".
 * NOTE(review): braces, returns and the xec dispatch are elided in this view.
 */
static void amd_decode_ls_mce(struct mce *m)
	u16 ec = m->status & 0xffff;
	u8 xec = (m->status >> 16) & xec_mask;

	if (boot_cpu_data.x86 == 0x14) {
		pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
			 " please report on LKML.\n");
	pr_emerg(HW_ERR "Load Store Error");

		u8 r4 = (ec >> 4) & 0xf;

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
		pr_cont(" during %s.\n", RRRR_MSG(ec));
	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
/*
 * K8 northbridge MCE decoder, also reused by later families via their own
 * nb_mce hooks.  Covers HT link CRC errors, GART PTE walk errors,
 * unsupported atomic RMWs from an IO link, DRAM ECC (with a family-0x11
 * special case whose action is elided here) and DRAM address/control-signal
 * parity errors.
 * NOTE(review): the xec switch/case labels and returns are elided in this
 * view; the mapping above is inferred from the message order only.
 */
static bool k8_nb_mce(u16 ec, u8 xec)
		pr_cont("CRC error detected on HT link.\n");
		pr_cont("Invalid GART PTE entry during GART table walk.\n");
		pr_cont("Unsupported atomic RMW received from an IO link.\n");
		if (boot_cpu_data.x86 == 0x11)
		pr_cont("DRAM ECC error detected on the NB.\n");
		pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * F10h northbridge MCE decoder: first tries the K8 decoder; on a miss it
 * handles the GART-table-walk data error / DMA Exclusion Vector walk pair
 * and finally falls back to the f10h_nb_mce_desc[] lookup (offset handling
 * elided in this view).
 */
static bool f10h_nb_mce(u16 ec, u8 xec)
	if (k8_nb_mce(ec, xec))
		pr_cont("GART Table Walk data error.\n");
	else if (BUS_ERROR(ec))
		pr_cont("DMA Exclusion Vector Table Walk error.\n");
	pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
/*
 * No-op NB decoder installed for families whose northbridge needs no
 * family-specific decoding (used as fam_ops->nb_mce in mce_amd_init()).
 * NOTE(review): body elided in this view.
 */
static bool nb_noop_mce(u16 ec, u8 xec)
/*
 * Northbridge (bank 4) decoder, exported for use by EDAC drivers.
 * Reports the node, the originating core (two encodings: F10h rev D+ uses
 * the ERR_CPU_VAL flag plus a core number, older parts a core bitmask whose
 * highest set bit is reported), decodes the common xec signatures inline
 * (sync flood, HT master/target abort, watchdog, SVM DEV error), then defers
 * to fam_ops->nb_mce.  For families 0xf/0x10 with xec 0x0/0x8 the registered
 * nb_bus_decoder callback (DRAM ECC) is invoked as well.
 * NOTE(review): switch/case labels, braces and returns are elided in this
 * view; the xec-to-message mapping is inferred from message order only.
 */
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
	u8 xec = (m->status >> 16) & 0x1f;
	u16 ec = m->status & 0xffff;
	u32 nbsh = (u32)(m->status >> 32);

	pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);

	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
	 * value encoding has changed so interpret those differently
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
		if (nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
		u8 assoc_cpus = nbsh & nb_err_cpumask;

			pr_cont(", core: %d", fls(assoc_cpus) - 1);
		pr_cont("Sync error (sync packets on HT link detected).\n");
		pr_cont("HT Master abort.\n");
		pr_cont("HT Target abort.\n");
		pr_cont("NB Watchdog timeout.\n");
		pr_cont("SVM DMA Exclusion Vector error.\n");
	if (!fam_ops->nb_mce(ec, xec))
	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * FR (bank 5) decoder.  Families 0xf and 0x11 take an (elided) early path;
 * otherwise the single known signature — error code 0x0f0f — is reported as
 * a CPU watchdog timer expiry, anything else as corrupted FR info.
 */
static void amd_decode_fr_mce(struct mce *m)
	if (boot_cpu_data.x86 == 0xf ||
	    boot_cpu_data.x86 == 0x11)

	/* we have only one error signature so match all fields at once. */
	if ((m->status & 0xffff) == 0x0f0f) {
		pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
	pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
/*
 * Print the generic interpretation of the low 16-bit MCA error code @ec:
 * TLB errors get transaction + cache level, memory errors additionally the
 * memory-transaction type, bus errors the full set (r4, IO space, timeout,
 * level, participating processor).  Unknown codes are dumped raw.
 * NOTE(review): the leading TLB_ERROR() check and some braces are elided.
 */
static inline void amd_decode_err_code(u16 ec)
		pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
			 TT_MSG(ec), LL_MSG(ec));
	} else if (MEM_ERROR(ec)) {
		pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
	} else if (BUS_ERROR(ec)) {
		pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
			 "Participating Processor: %s\n",
			 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
		pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
 * Filter out unwanted MCE signatures here.
/*
 * Returns true for MCEs that should be dropped before decoding: currently
 * NB GART TLB errors (bank 4, xec 0x5) unless reporting was enabled via
 * amd_report_gart_errors().
 */
static bool amd_filter_mce(struct mce *m)
	u8 xec = (m->status >> 16) & 0x1f;

	 * NB GART TLB error reporting is disabled by default.
	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * Notifier callback on the x86 MCE decoder chain (see amd_mce_dec_nb).
 * Filters the record, prints the common MCi_STATUS summary (UC/corrected,
 * overflow, context-corrupt, ECC bits 46:45), dispatches on m->bank to the
 * per-bank decoders (0=DC, 1=IC, 2=BU, 3=LS, 4=NB via the node id from
 * amd_get_nb_id(), 5=FR), and finally prints the generic error-code
 * interpretation.
 * NOTE(review): the bank switch/case labels and return value are elided in
 * this view.
 */
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
	struct mce *m = (struct mce *)data;

	if (amd_filter_mce(m))
	pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);

	pr_cont("%sorrected error, other errors lost: %s, "
		"CPU context corrupt: %s",
		((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
		((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
		((m->status & MCI_STATUS_PCC) ? "yes" : "no"));

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
		pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
		amd_decode_dc_mce(m);
		amd_decode_ic_mce(m);
		amd_decode_bu_mce(m);
		amd_decode_ls_mce(m);
		node = amd_get_nb_id(m->extcpu);
		amd_decode_nb_mce(node, m, 0);
		amd_decode_fr_mce(m);
	amd_decode_err_code(m->status & 0xffff);
EXPORT_SYMBOL_GPL(amd_decode_mce);
/* Notifier registered on x86_mce_decoder_chain in mce_amd_init(). */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call = amd_decode_mce,
/*
 * Module init: bail out on non-AMD CPUs and on unsupported families
 * (supported: 0xf..0x12, 0x14 models <= 0xf, plus 0x15 per the switch
 * below), allocate fam_ops, install the per-family dc/ic/nb decode hooks,
 * and register on the MCE decoder notifier chain.
 * NOTE(review): case labels, returns, kzalloc NULL-check and some
 * assignments (e.g. xec_mask for F15h) are elided in this view; the
 * family for each hook group below is inferred from the handler names.
 */
static int __init mce_amd_init(void)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)

	if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
	    (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);

	switch (boot_cpu_data.x86) {
		/* family 0xf (K8) */
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = k8_nb_mce;

		/* family 0x10 */
		fam_ops->dc_mce = f10h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;

		/* presumably family 0x11 — case label elided */
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;

		/* family 0x12 */
		fam_ops->dc_mce = f12h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;

		/* family 0x14: only 2 bits of associated-core mask are valid */
		nb_err_cpumask = 0x3;
		fam_ops->dc_mce = f14h_dc_mce;
		fam_ops->ic_mce = f14h_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;

		/* family 0x15 (no nb_mce assignment visible here) */
		fam_ops->dc_mce = f15h_dc_mce;
		fam_ops->ic_mce = f15h_ic_mce;

		printk(KERN_WARNING "Huh? What family is that: %d?!\n",

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
early_initcall(mce_amd_init);
/*
 * Module exit: unhook from the MCE decoder chain.
 * NOTE(review): fam_ops is kzalloc'd in mce_amd_init(); confirm the full
 * file frees it here (a kfree may be elided in this view).
 */
static void __exit mce_amd_exit(void)
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);