IB/ipath: Fix and recover TXE piobuf and PBC parity errors
authorBryan O'Sullivan <bos@pathscale.com>
Thu, 28 Sep 2006 16:00:18 +0000 (09:00 -0700)
committerRoland Dreier <rolandd@cisco.com>
Thu, 28 Sep 2006 18:17:03 +0000 (11:17 -0700)
We can sometimes trigger parity errors due to processor speculative
reads to our write-combined memory (mostly seen on Woodcrest).   Add a
stats counter for these.

Factored out the sendbuffererror buffer cancellation code so it can be
used in the new handling; suppress likely subsequent error messages if
within two jiffies of the cancellation.

Also restore 2 dropped TXE lines on hwe_bitsextant noticed while
debugging.

Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
drivers/infiniband/hw/ipath/ipath_common.h
drivers/infiniband/hw/ipath/ipath_iba6110.c
drivers/infiniband/hw/ipath/ipath_iba6120.c
drivers/infiniband/hw/ipath/ipath_intr.c
drivers/infiniband/hw/ipath/ipath_kernel.h

index 382956d..a9b109a 100644 (file)
@@ -141,8 +141,9 @@ struct infinipath_stats {
         * packets if ipath not configured, etc.)
         */
        __u64 sps_krdrops;
+       __u64 sps_txeparity; /* PIO buffer parity error, recovered */
        /* pad for future growth */
-       __u64 __sps_pad[46];
+       __u64 __sps_pad[45];
 };
 
 /*
index fd49c9c..9e4e8d4 100644 (file)
@@ -451,7 +451,10 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
         * make sure we get this much out, unless told to be quiet,
         * or it's occurred within the last 5 seconds
         */
-       if ((hwerrs & ~dd->ipath_lasthwerror) ||
+       if ((hwerrs & ~(dd->ipath_lasthwerror |
+                       ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                         INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                       << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
            (ipath_debug & __IPATH_VERBDBG))
                dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
                         "(cleared)\n", (unsigned long long) hwerrs);
@@ -464,6 +467,33 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
 
        ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
        if (ctrl & INFINIPATH_C_FREEZEMODE) {
+               /*
+                * parity errors in send memory are recoverable,
+                * just cancel the send (if indicated in * sendbuffererror),
+                * count the occurrence, unfreeze (if no other handled
+                * hardware error bits are set), and continue. They can
+                * occur if a processor speculative read is done to the PIO
+                * buffer while we are sending a packet, for example.
+                */
+               if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                              INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                             << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
+                       ipath_stats.sps_txeparity++;
+                       ipath_dbg("Recovering from TXE parity error (%llu), "
+                                 "hwerrstatus=%llx\n",
+                                 (unsigned long long) ipath_stats.sps_txeparity,
+                                 (unsigned long long) hwerrs);
+                       ipath_disarm_senderrbufs(dd);
+                       hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                                    INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                                   << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
+                       if (!hwerrs) { /* else leave in freeze mode */
+                               ipath_write_kreg(dd,
+                                                dd->ipath_kregs->kr_control,
+                                                dd->ipath_control);
+                               return;
+                       }
+               }
                if (hwerrs) {
                        /*
                         * if any set that we aren't ignoring; only
index 08a44dd..024b6aa 100644 (file)
@@ -370,7 +370,10 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
         * make sure we get this much out, unless told to be quiet,
         * or it's occurred within the last 5 seconds
         */
-       if ((hwerrs & ~dd->ipath_lasthwerror) ||
+       if ((hwerrs & ~(dd->ipath_lasthwerror |
+                       ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                         INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
            (ipath_debug & __IPATH_VERBDBG))
                dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
                         "(cleared)\n", (unsigned long long) hwerrs);
@@ -383,6 +386,33 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
 
        ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
        if (ctrl & INFINIPATH_C_FREEZEMODE) {
+               /*
+                * parity errors in send memory are recoverable,
+                * just cancel the send (if indicated in * sendbuffererror),
+                * count the occurrence, unfreeze (if no other handled
+                * hardware error bits are set), and continue. They can
+                * occur if a processor speculative read is done to the PIO
+                * buffer while we are sending a packet, for example.
+                */
+               if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                              INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                             << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
+                       ipath_stats.sps_txeparity++;
+                       ipath_dbg("Recovering from TXE parity error (%llu), "
+                                 "hwerrstatus=%llx\n",
+                                 (unsigned long long) ipath_stats.sps_txeparity,
+                                 (unsigned long long) hwerrs);
+                       ipath_disarm_senderrbufs(dd);
+                       hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                                    INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                                   << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
+                       if (!hwerrs) { /* else leave in freeze mode */
+                               ipath_write_kreg(dd,
+                                                dd->ipath_kregs->kr_control,
+                                                dd->ipath_control);
+                           return;
+                       }
+               }
                if (hwerrs) {
                        /*
                         * if any set that we aren't ignoring only make the
@@ -406,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                } else {
                        ipath_dbg("Clearing freezemode on ignored hardware "
                                  "error\n");
-                       ctrl &= ~INFINIPATH_C_FREEZEMODE;
                        ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                                        ctrl);
+                                        dd->ipath_control);
                }
        }
 
@@ -880,6 +909,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
        dd->ipath_hwe_bitsextant =
                (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
                 INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+               (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
                (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK <<
                 INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) |
                INFINIPATH_HWE_PCIE1PLLFAILED |
index f4d8aaf..6bee53c 100644 (file)
 #include "ipath_verbs.h"
 #include "ipath_common.h"
 
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
+{
+       u32 piobcnt;
+       unsigned long sbuf[4];
+       /*
+        * it's possible that sendbuffererror could have bits set; might
+        * have already done this as a result of hardware error handling
+        */
+       piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+       /* read these before writing errorclear */
+       sbuf[0] = ipath_read_kreg64(
+               dd, dd->ipath_kregs->kr_sendbuffererror);
+       sbuf[1] = ipath_read_kreg64(
+               dd, dd->ipath_kregs->kr_sendbuffererror + 1);
+       if (piobcnt > 128) {
+               sbuf[2] = ipath_read_kreg64(
+                       dd, dd->ipath_kregs->kr_sendbuffererror + 2);
+               sbuf[3] = ipath_read_kreg64(
+                       dd, dd->ipath_kregs->kr_sendbuffererror + 3);
+       }
+
+       if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+               int i;
+               if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG)) {
+                       __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
+                                         "SendbufErrs %lx %lx", sbuf[0],
+                                         sbuf[1]);
+                       if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+                               printk(" %lx %lx ", sbuf[2], sbuf[3]);
+                       printk("\n");
+               }
+
+               for (i = 0; i < piobcnt; i++)
+                       if (test_bit(i, sbuf))
+                               ipath_disarm_piobufs(dd, i, 1);
+               dd->ipath_lastcancel = jiffies+3; /* no armlaunch for a bit */
+       }
+}
+
+
 /* These are all rcv-related errors which we want to count for stats */
 #define E_SUM_PKTERRS \
        (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
 
 static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
 {
-       unsigned long sbuf[4];
        u64 ignore_this_time = 0;
-       u32 piobcnt;
-
-       /* if possible that sendbuffererror could be valid */
-       piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       /* read these before writing errorclear */
-       sbuf[0] = ipath_read_kreg64(
-               dd, dd->ipath_kregs->kr_sendbuffererror);
-       sbuf[1] = ipath_read_kreg64(
-               dd, dd->ipath_kregs->kr_sendbuffererror + 1);
-       if (piobcnt > 128) {
-               sbuf[2] = ipath_read_kreg64(
-                       dd, dd->ipath_kregs->kr_sendbuffererror + 2);
-               sbuf[3] = ipath_read_kreg64(
-                       dd, dd->ipath_kregs->kr_sendbuffererror + 3);
-       }
 
-       if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
-               int i;
-
-               ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]);
-               if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
-                       printk("%lx %lx ", sbuf[2], sbuf[3]);
-               for (i = 0; i < piobcnt; i++) {
-                       if (test_bit(i, sbuf)) {
-                               u32 __iomem *piobuf;
-                               if (i < dd->ipath_piobcnt2k)
-                                       piobuf = (u32 __iomem *)
-                                               (dd->ipath_pio2kbase +
-                                                i * dd->ipath_palign);
-                               else
-                                       piobuf = (u32 __iomem *)
-                                               (dd->ipath_pio4kbase +
-                                                (i - dd->ipath_piobcnt2k) *
-                                                dd->ipath_4kalign);
-
-                               ipath_cdbg(PKT,
-                                          "PIObuf[%u] @%p pbc is %x; ",
-                                          i, piobuf, readl(piobuf));
-
-                               ipath_disarm_piobufs(dd, i, 1);
-                       }
-               }
-               if (ipath_debug & __IPATH_PKTDBG)
-                       printk("\n");
-       }
+       ipath_disarm_senderrbufs(dd);
        if ((errs & E_SUM_LINK_PKTERRS) &&
            !(dd->ipath_flags & IPATH_LINKACTIVE)) {
                /*
@@ -554,6 +554,14 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
                        ~(INFINIPATH_E_HARDWARE |
                          INFINIPATH_E_IBSTATUSCHANGED);
        }
+
+       /* likely due to cancel, so suppress */
+       if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
+               dd->ipath_lastcancel > jiffies) {
+               ipath_dbg("Suppressed armlaunch/spktlen after error send cancel\n");
+               errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
+       }
+
        if (!errs)
                return 0;
 
index 02134cb..d7540b7 100644 (file)
@@ -427,6 +427,9 @@ struct ipath_devdata {
        unsigned long ipath_rcvctrl;
        /* shadow kr_sendctrl */
        unsigned long ipath_sendctrl;
+       /* ports waiting for PIOavail intr */
+       unsigned long ipath_portpiowait;
+       unsigned long ipath_lastcancel; /* to not count armlaunch after cancel */
 
        /* value we put in kr_rcvhdrcnt */
        u32 ipath_rcvhdrcnt;
@@ -490,8 +493,6 @@ struct ipath_devdata {
        u32 ipath_htwidth;
        /* HT speed (200,400,800,1000) from HT config */
        u32 ipath_htspeed;
-       /* ports waiting for PIOavail intr */
-       unsigned long ipath_portpiowait;
        /*
         * number of sequential ibcstatus change for polling active/quiet
         * (i.e., link not coming up).
@@ -585,6 +586,7 @@ int ipath_enable_wc(struct ipath_devdata *dd);
 void ipath_disable_wc(struct ipath_devdata *dd);
 int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp);
 void ipath_shutdown_device(struct ipath_devdata *);
+void ipath_disarm_senderrbufs(struct ipath_devdata *);
 
 struct file_operations;
 int ipath_cdev_init(int minor, char *name, struct file_operations *fops,