i7core_edac: Add error insertion code for Nehalem
authorMauro Carvalho Chehab <mchehab@redhat.com>
Tue, 23 Jun 2009 01:48:28 +0000 (22:48 -0300)
committerMauro Carvalho Chehab <mchehab@redhat.com>
Mon, 10 May 2010 14:44:45 +0000 (11:44 -0300)
Implements set_inject_error() with the low-level code needed to inject
memory errors at Nehalem, and adds some sysfs nodes to allow error injection

The next patch will add an API for error injection.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
drivers/edac/i7core_edac.c

index 7ecf15e..b590f84 100644 (file)
@@ -20,7 +20,6 @@
  *     http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
  */
 
-
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pci.h>
        /* OFFSETS for Devices 4,5 and 6 Function 0 */
 
 #define MC_CHANNEL_ADDR_MATCH  0xf0
-
-#define MC_MASK_DIMM   (1 << 41)
-#define MC_MASK_RANK   (1 << 40)
-#define MC_MASK_BANK   (1 << 39)
-#define MC_MASK_PAGE   (1 << 38)
-#define MC_MASK_COL    (1 << 37)
+#define MC_CHANNEL_ERROR_MASK  0xf8
+#define MC_CHANNEL_ERROR_INJECT        0xfc
+  #define INJECT_ADDR_PARITY   0x10
+  #define INJECT_ECC           0x08
+  #define MASK_CACHELINE       0x06
+  #define MASK_FULL_CACHELINE  0x06
+  #define MASK_MSB32_CACHELINE 0x04
+  #define MASK_LSB32_CACHELINE 0x02
+  #define NO_MASK_CACHELINE    0x00
+  #define REPEAT_EN            0x01
 
 /*
  * i7core structs
@@ -84,10 +87,23 @@ struct i7core_info {
        u32     max_dod;
 };
 
+
+struct i7core_inject {
+       int     enable;
+
+       u32     section;
+       u32     type;
+       u32     eccmask;
+
+       /* Error address mask */
+       int channel, dimm, rank, bank, page, col;
+};
+
 struct i7core_pvt {
        struct pci_dev          *pci_mcr;       /* Dev 3:0 */
        struct pci_dev          *pci_ch[NUM_CHANS][NUM_FUNCS];
        struct i7core_info      info;
+       struct i7core_inject    inject;
 };
 
 /* Device name and register DID (Device ID) */
@@ -166,6 +182,7 @@ static inline int maxnumcol(struct i7core_pvt *pvt)
        return cols[((pvt->info.max_dod >> 9) & 0x3) << 12];
 }
 
+
 /****************************************************************************
                        Memory check routines
  ****************************************************************************/
@@ -199,6 +216,390 @@ static int get_dimm_config(struct mem_ctl_info *mci)
        return 0;
 }
 
+/****************************************************************************
+                       Error insertion routines
+ ****************************************************************************/
+
+/* The i7core has independent error injection features per channel.
+   However, to have a simpler code, we don't allow enabling error injection
+   on more than one channel.
+   Also, since a change at an inject parameter will be applied only at enable,
+   we're disabling error injection on all write calls to the sysfs nodes that
+   controls the error code injection.
+ */
+static void disable_inject(struct mem_ctl_info *mci)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+
+       pvt->inject.enable = 0;
+
+       pci_write_config_dword(pvt->pci_ch[pvt->inject.channel][0],
+                               MC_CHANNEL_ERROR_MASK, 0);
+}
+
+/*
+ * i7core inject inject.section
+ *
+ *     accept and store error injection inject.section value
+ *     bit 0 - refers to the lower 32-byte half cacheline
+ *     bit 1 - refers to the upper 32-byte half cacheline
+ */
+static ssize_t i7core_inject_section_store(struct mem_ctl_info *mci,
+                                          const char *data, size_t count)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       unsigned long value;
+       int rc;
+
+       if (pvt->inject.enable)
+                disable_inject(mci);
+
+       rc = strict_strtoul(data, 10, &value);
+       if ((rc < 0) || (value > 3))
+               return 0;
+
+       pvt->inject.section = (u32) value;
+       return count;
+}
+
+static ssize_t i7core_inject_section_show(struct mem_ctl_info *mci,
+                                             char *data)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       return sprintf(data, "0x%08x\n", pvt->inject.section);
+}
+
+/*
+ * i7core inject.type
+ *
+ *     accept and store error injection inject.section value
+ *     bit 0 - repeat enable - Enable error repetition
+ *     bit 1 - inject ECC error
+ *     bit 2 - inject parity error
+ */
+static ssize_t i7core_inject_type_store(struct mem_ctl_info *mci,
+                                       const char *data, size_t count)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       unsigned long value;
+       int rc;
+
+       if (pvt->inject.enable)
+                disable_inject(mci);
+
+       rc = strict_strtoul(data, 10, &value);
+       if ((rc < 0) || (value > 7))
+               return 0;
+
+       pvt->inject.type = (u32) value;
+       return count;
+}
+
+static ssize_t i7core_inject_type_show(struct mem_ctl_info *mci,
+                                             char *data)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       return sprintf(data, "0x%08x\n", pvt->inject.type);
+}
+
+/*
+ * i7core_inject_inject.eccmask_store
+ *
+ * The type of error (UE/CE) will depend on the inject.eccmask value:
+ *   Any bits set to a 1 will flip the corresponding ECC bit
+ *   Correctable errors can be injected by flipping 1 bit or the bits within
+ *   a symbol pair (2 consecutive aligned 8-bit pairs - i.e. 7:0 and 15:8 or
+ *   23:16 and 31:24). Flipping bits in two symbol pairs will cause an
+ *   uncorrectable error to be injected.
+ */
+static ssize_t i7core_inject_eccmask_store(struct mem_ctl_info *mci,
+                                       const char *data, size_t count)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       unsigned long value;
+       int rc;
+
+       if (pvt->inject.enable)
+                disable_inject(mci);
+
+       rc = strict_strtoul(data, 10, &value);
+       if (rc < 0)
+               return 0;
+
+       pvt->inject.eccmask = (u32) value;
+       return count;
+}
+
+static ssize_t i7core_inject_eccmask_show(struct mem_ctl_info *mci,
+                                             char *data)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       return sprintf(data, "0x%08x\n", pvt->inject.eccmask);
+}
+
+/*
+ * i7core_addrmatch
+ *
+ * The type of error (UE/CE) will depend on the inject.eccmask value:
+ *   Any bits set to a 1 will flip the corresponding ECC bit
+ *   Correctable errors can be injected by flipping 1 bit or the bits within
+ *   a symbol pair (2 consecutive aligned 8-bit pairs - i.e. 7:0 and 15:8 or
+ *   23:16 and 31:24). Flipping bits in two symbol pairs will cause an
+ *   uncorrectable error to be injected.
+ */
+static ssize_t i7core_inject_addrmatch_store(struct mem_ctl_info *mci,
+                                       const char *data, size_t count)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       char *cmd, *val;
+       long value;
+       int rc;
+
+       if (pvt->inject.enable)
+                disable_inject(mci);
+
+       do {
+               cmd = strsep((char **) &data, ":");
+               if (!cmd)
+                       break;
+               val = strsep((char **) &data, " \n\t");
+               if (!val)
+                       return cmd - data;
+
+               if (!strcasecmp(val,"any"))
+                       value = -1;
+               else {
+                       rc = strict_strtol(val, 10, &value);
+                       if ((rc < 0) || (value < 0))
+                               return cmd - data;
+               }
+
+               if (!strcasecmp(cmd,"channel")) {
+                       if (value < 3)
+                               pvt->inject.channel = value;
+                       else
+                               return cmd - data;
+               } else if (!strcasecmp(cmd,"dimm")) {
+                       if (value < 4)
+                               pvt->inject.dimm = value;
+                       else
+                               return cmd - data;
+               } else if (!strcasecmp(cmd,"rank")) {
+                       if (value < 4)
+                               pvt->inject.rank = value;
+                       else
+                               return cmd - data;
+               } else if (!strcasecmp(cmd,"bank")) {
+                       if (value < 4)
+                               pvt->inject.bank = value;
+                       else
+                               return cmd - data;
+               } else if (!strcasecmp(cmd,"page")) {
+                       if (value <= 0xffff)
+                               pvt->inject.page = value;
+                       else
+                               return cmd - data;
+               } else if (!strcasecmp(cmd,"col") ||
+                          !strcasecmp(cmd,"column")) {
+                       if (value <= 0x3fff)
+                               pvt->inject.col = value;
+                       else
+                               return cmd - data;
+               }
+       } while (1);
+
+       return count;
+}
+
+static ssize_t i7core_inject_addrmatch_show(struct mem_ctl_info *mci,
+                                             char *data)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       char channel[4], dimm[4], bank[4], rank[4], page[7], col[7];
+
+       if (pvt->inject.channel < 0)
+               sprintf(channel, "any");
+       else
+               sprintf(channel, "%d", pvt->inject.channel);
+       if (pvt->inject.dimm < 0)
+               sprintf(dimm, "any");
+       else
+               sprintf(dimm, "%d", pvt->inject.dimm);
+       if (pvt->inject.bank < 0)
+               sprintf(bank, "any");
+       else
+               sprintf(bank, "%d", pvt->inject.bank);
+       if (pvt->inject.rank < 0)
+               sprintf(rank, "any");
+       else
+               sprintf(rank, "%d", pvt->inject.rank);
+       if (pvt->inject.page < 0)
+               sprintf(page, "any");
+       else
+               sprintf(page, "0x%04x", pvt->inject.page);
+       if (pvt->inject.col < 0)
+               sprintf(col, "any");
+       else
+               sprintf(col, "0x%04x", pvt->inject.col);
+
+       return sprintf(data, "channel: %s\ndimm: %s\nbank: %s\n"
+                            "rank: %s\npage: %s\ncolumn: %s\n",
+                      channel, dimm, bank, rank, page, col);
+}
+
+/*
+ * This routine prepares the Memory Controller for error injection.
+ * The error will be injected when some process tries to write to the
+ * memory that matches the given criteria.
+ * The criteria can be set in terms of a mask where dimm, rank, bank, page
+ * and col can be specified.
+ * A -1 value for any of the mask items will make the MCU to ignore
+ * that matching criteria for error injection.
+ *
+ * It should be noticed that the error will only happen after a write operation
+ * on a memory that matches the condition. if REPEAT_EN is not enabled at
+ * inject mask, then it will produce just one error. Otherwise, it will repeat
+ * until the injectmask would be cleaned.
+ *
+ * FIXME: This routine assumes that MAXNUMDIMMS value of MC_MAX_DOD
+ *    is reliable enough to check if the MC is using the
+ *    three channels. However, this is not clear at the datasheet.
+ */
+static ssize_t i7core_inject_enable_store(struct mem_ctl_info *mci,
+                                      const char *data, size_t count)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       u32 injectmask;
+       u64 mask = 0;
+       int  rc;
+       long enable;
+
+       rc = strict_strtoul(data, 10, &enable);
+       if ((rc < 0))
+               return 0;
+
+       if (enable) {
+               pvt->inject.enable = 1;
+       } else {
+               disable_inject(mci);
+               return count;
+       }
+
+       /* Sets pvt->inject.dimm mask */
+       if (pvt->inject.dimm < 0)
+               mask |= 1l << 41;
+       else {
+               if (maxnumdimms(pvt) > 2)
+                       mask |= (pvt->inject.dimm & 0x3l) << 35;
+               else
+                       mask |= (pvt->inject.dimm & 0x1l) << 36;
+       }
+
+       /* Sets pvt->inject.rank mask */
+       if (pvt->inject.rank < 0)
+               mask |= 1l << 40;
+       else {
+               if (maxnumdimms(pvt) > 2)
+                       mask |= (pvt->inject.rank & 0x1l) << 34;
+               else
+                       mask |= (pvt->inject.rank & 0x3l) << 34;
+       }
+
+       /* Sets pvt->inject.bank mask */
+       if (pvt->inject.bank < 0)
+               mask |= 1l << 39;
+       else
+               mask |= (pvt->inject.bank & 0x15l) << 30;
+
+       /* Sets pvt->inject.page mask */
+       if (pvt->inject.page < 0)
+               mask |= 1l << 38;
+       else
+               mask |= (pvt->inject.page & 0xffffl) << 14;
+
+       /* Sets pvt->inject.column mask */
+       if (pvt->inject.col < 0)
+               mask |= 1l << 37;
+       else
+               mask |= (pvt->inject.col & 0x3fffl);
+
+       pci_write_config_qword(pvt->pci_ch[pvt->inject.channel][0],
+                              MC_CHANNEL_ADDR_MATCH, mask);
+
+       pci_write_config_dword(pvt->pci_ch[pvt->inject.channel][0],
+                              MC_CHANNEL_ERROR_MASK, pvt->inject.eccmask);
+
+       /*
+        * bit    0: REPEAT_EN
+        * bits 1-2: MASK_HALF_CACHELINE
+        * bit    3: INJECT_ECC
+        * bit    4: INJECT_ADDR_PARITY
+        */
+
+       injectmask = (pvt->inject.type & 1) &&
+                    (pvt->inject.section & 0x3) << 1 &&
+                    (pvt->inject.type & 0x6) << (3 - 1);
+
+       pci_write_config_dword(pvt->pci_ch[pvt->inject.channel][0],
+                              MC_CHANNEL_ERROR_MASK, injectmask);
+
+
+       debugf0("Error inject addr match 0x%016llx, ecc 0x%08x, inject 0x%08x\n",
+               mask, pvt->inject.eccmask, injectmask);
+
+       return count;
+}
+
+static ssize_t i7core_inject_enable_show(struct mem_ctl_info *mci,
+                                       char *data)
+{
+       struct i7core_pvt *pvt = mci->pvt_info;
+       return sprintf(data, "%d\n", pvt->inject.enable);
+}
+
+/*
+ * Sysfs struct
+ */
+static struct mcidev_sysfs_attribute i7core_inj_attrs[] = {
+
+       {
+               .attr = {
+                       .name = "inject_section",
+                       .mode = (S_IRUGO | S_IWUSR)
+               },
+               .show  = i7core_inject_section_show,
+               .store = i7core_inject_section_store,
+       }, {
+               .attr = {
+                       .name = "inject_type",
+                       .mode = (S_IRUGO | S_IWUSR)
+               },
+               .show  = i7core_inject_type_show,
+               .store = i7core_inject_type_store,
+       }, {
+               .attr = {
+                       .name = "inject_eccmask",
+                       .mode = (S_IRUGO | S_IWUSR)
+               },
+               .show  = i7core_inject_eccmask_show,
+               .store = i7core_inject_eccmask_store,
+       }, {
+               .attr = {
+                       .name = "inject_addrmatch",
+                       .mode = (S_IRUGO | S_IWUSR)
+               },
+               .show  = i7core_inject_addrmatch_show,
+               .store = i7core_inject_addrmatch_store,
+       }, {
+               .attr = {
+                       .name = "inject_enable",
+                       .mode = (S_IRUGO | S_IWUSR)
+               },
+               .show  = i7core_inject_enable_show,
+               .store = i7core_inject_enable_store,
+       },
+};
+
 /****************************************************************************
        Device initialization routines: put/get, init/exit
  ****************************************************************************/
@@ -322,10 +723,11 @@ static int __devinit i7core_probe(struct pci_dev *pdev,
 
        debugf0("MC: " __FILE__ ": %s(): mci = %p\n", __func__, mci);
 
-       mci->dev = &pdev->dev;  /* record ptr  to the generic device */
+       mci->dev = &pdev->dev;  /* record ptr to the generic device */
        dev_set_drvdata(mci->dev, mci);
 
        pvt = mci->pvt_info;
+
 //     pvt->system_address = pdev;     /* Record this device in our private */
 //     pvt->maxch = num_channels;
 //     pvt->maxdimmperch = num_dimms_per_channel;
@@ -343,6 +745,7 @@ static int __devinit i7core_probe(struct pci_dev *pdev,
        mci->ctl_name = i7core_devs[dev_idx].ctl_name;
        mci->dev_name = pci_name(pdev);
        mci->ctl_page_to_phys = NULL;
+       mci->mc_driver_sysfs_attributes = i7core_inj_attrs;
 
        /* add this new MC control structure to EDAC's list of MCs */
        if (edac_mc_add_mc(mci)) {
@@ -365,6 +768,14 @@ static int __devinit i7core_probe(struct pci_dev *pdev,
                        __func__);
        }
 
+       /* Default error mask is any memory */
+       pvt->inject.channel = -1;
+       pvt->inject.dimm = -1;
+       pvt->inject.rank = -1;
+       pvt->inject.bank = -1;
+       pvt->inject.page = -1;
+       pvt->inject.col = -1;
+
        return 0;
 
 fail1: