mlx4_core: Report thermal error events
author Jack Morgenstein <jackm@dev.mellanox.co.il>
Tue, 6 Mar 2012 13:50:49 +0000 (15:50 +0200)
committer Roland Dreier <roland@purestorage.com>
Mon, 12 Mar 2012 23:24:59 +0000 (16:24 -0700)
Print an error message when a thermal error async event is reported by the HW.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Dotan Barak <dotanb@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
drivers/net/ethernet/mellanox/mlx4/eq.c
drivers/net/ethernet/mellanox/mlx4/mlx4.h
include/linux/mlx4/device.h

index 8fa41f3..780b5ad 100644 (file)
@@ -79,7 +79,8 @@ enum {
                               (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT)          | \
                               (1ull << MLX4_EVENT_TYPE_CMD)                | \
                               (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL)       | \
-                              (1ull << MLX4_EVENT_TYPE_FLR_EVENT))
+                              (1ull << MLX4_EVENT_TYPE_FLR_EVENT)          | \
+                              (1ull << MLX4_EVENT_TYPE_FATAL_WARNING))
 
 static void eq_set_ci(struct mlx4_eq *eq, int req_not)
 {
@@ -443,6 +444,35 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
                        queue_work(priv->mfunc.master.comm_wq,
                                   &priv->mfunc.master.slave_flr_event_work);
                        break;
+
+               case MLX4_EVENT_TYPE_FATAL_WARNING:
+                       if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) {
+                               if (mlx4_is_master(dev))
+                                       for (i = 0; i < dev->num_slaves; i++) {
+                                               mlx4_dbg(dev, "%s: Sending "
+                                                       "MLX4_FATAL_WARNING_SUBTYPE_WARMING"
+                                                       " to slave: %d\n", __func__, i);
+                                               if (i == dev->caps.function)
+                                                       continue;
+                                               mlx4_slave_event(dev, i, eqe);
+                                       }
+                               mlx4_err(dev, "Temperature Threshold was reached! "
+                                       "Threshold: %d celsius degrees; "
+                                       "Current Temperature: %d\n",
+                                       be16_to_cpu(eqe->event.warming.warning_threshold),
+                                       be16_to_cpu(eqe->event.warming.current_temperature));
+                       } else
+                               mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), "
+                                         "subtype %02x on EQ %d at index %u. owner=%x, "
+                                         "nent=0x%x, slave=%x, ownership=%s\n",
+                                         eqe->type, eqe->subtype, eq->eqn,
+                                         eq->cons_index, eqe->owner, eq->nent,
+                                         eqe->slave_id,
+                                         !!(eqe->owner & 0x80) ^
+                                         !!(eq->cons_index & eq->nent) ? "HW" : "SW");
+
+                       break;
+
                case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
                case MLX4_EVENT_TYPE_ECC_DETECT:
                default:
index c92269f..ac2d606 100644 (file)
@@ -363,6 +363,10 @@ struct mlx4_eqe {
                struct {
                        __be32  slave_id;
                } __packed flr_event;
+               struct {
+                       __be16  current_temperature;
+                       __be16  warning_threshold;
+               } __packed warming;
        }                       event;
        u8                      slave_id;
        u8                      reserved3[2];
index 263d2ae..4b3fbf1 100644 (file)
@@ -133,6 +133,7 @@ enum mlx4_event {
        MLX4_EVENT_TYPE_CMD                = 0x0a,
        MLX4_EVENT_TYPE_VEP_UPDATE         = 0x19,
        MLX4_EVENT_TYPE_COMM_CHANNEL       = 0x18,
+       MLX4_EVENT_TYPE_FATAL_WARNING      = 0x1b,
        MLX4_EVENT_TYPE_FLR_EVENT          = 0x1c,
        MLX4_EVENT_TYPE_NONE               = 0xff,
 };
@@ -142,6 +143,10 @@ enum {
        MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4
 };
 
+enum {
+       MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0, /* eqe->subtype value of a MLX4_EVENT_TYPE_FATAL_WARNING event that mlx4_eq_int() reports as "Temperature Threshold was reached!" */
+};
+
 enum {
        MLX4_PERM_LOCAL_READ    = 1 << 10,
        MLX4_PERM_LOCAL_WRITE   = 1 << 11,