Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband
authorLinus Torvalds <torvalds@g5.osdl.org>
Fri, 29 Sep 2006 22:18:22 +0000 (15:18 -0700)
committerLinus Torvalds <torvalds@g5.osdl.org>
Fri, 29 Sep 2006 22:18:22 +0000 (15:18 -0700)
* 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband: (33 commits)
  IB/ipath: Fix lockdep error upon "ifconfig ibN down"
  IB/ipath: Fix races with ib_resize_cq()
  IB/ipath: Support new PCIE device, QLE7142
  IB/ipath: Set CPU affinity early
  IB/ipath: Fix EEPROM read when driver is compiled with -Os
  IB/ipath: Fix and recover TXE piobuf and PBC parity errors
  IB/ipath: Change HT CRC message to indicate how to resolve problem
  IB/ipath: Clean up module exit code
  IB/ipath: Call mtrr_del with correct arguments
  IB/ipath: Flush RWQEs if access error or invalid error seen
  IB/ipath: Improved support for PowerPC
  IB/ipath: Drop unnecessary "(void *)" casts
  IB/ipath: Support multiple simultaneous devices of different types
  IB/ipath: Fix mismatch in shifts and masks for printing debug info
  IB/ipath: Fix compiler warnings and errors on non-x86_64 systems
  IB/ipath: Print more informative parity error messages
  IB/ipath: Ensure that PD of MR matches PD of QP checking the Rkey
  IB/ipath: RC and UC should validate SLID and DLID
  IB/ipath: Only allow complete writes to flash
  IB/ipath: Count SRQs properly
  ...

38 files changed:
drivers/infiniband/hw/amso1100/c2_ae.c
drivers/infiniband/hw/amso1100/c2_alloc.c
drivers/infiniband/hw/amso1100/c2_cm.c
drivers/infiniband/hw/amso1100/c2_provider.c
drivers/infiniband/hw/amso1100/c2_rnic.c
drivers/infiniband/hw/ipath/ipath_common.h
drivers/infiniband/hw/ipath/ipath_cq.c
drivers/infiniband/hw/ipath/ipath_driver.c
drivers/infiniband/hw/ipath/ipath_eeprom.c
drivers/infiniband/hw/ipath/ipath_file_ops.c
drivers/infiniband/hw/ipath/ipath_fs.c
drivers/infiniband/hw/ipath/ipath_iba6110.c
drivers/infiniband/hw/ipath/ipath_iba6120.c
drivers/infiniband/hw/ipath/ipath_init_chip.c
drivers/infiniband/hw/ipath/ipath_intr.c
drivers/infiniband/hw/ipath/ipath_kernel.h
drivers/infiniband/hw/ipath/ipath_keys.c
drivers/infiniband/hw/ipath/ipath_mad.c
drivers/infiniband/hw/ipath/ipath_mr.c
drivers/infiniband/hw/ipath/ipath_qp.c
drivers/infiniband/hw/ipath/ipath_rc.c
drivers/infiniband/hw/ipath/ipath_registers.h
drivers/infiniband/hw/ipath/ipath_ruc.c
drivers/infiniband/hw/ipath/ipath_srq.c
drivers/infiniband/hw/ipath/ipath_sysfs.c
drivers/infiniband/hw/ipath/ipath_uc.c
drivers/infiniband/hw/ipath/ipath_ud.c
drivers/infiniband/hw/ipath/ipath_user_pages.c
drivers/infiniband/hw/ipath/ipath_verbs.c
drivers/infiniband/hw/ipath/ipath_verbs.h
drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
drivers/infiniband/ulp/iser/Kconfig
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_initiator.c
drivers/infiniband/ulp/iser/iser_memory.c
drivers/infiniband/ulp/iser/iser_verbs.c

index 08f46c8..3aae497 100644 (file)
@@ -197,7 +197,7 @@ void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
                        "resource=%x, qp_state=%s\n",
                        __FUNCTION__,
                        to_event_str(event_id),
-                       be64_to_cpu(wr->ae.ae_generic.user_context),
+                       (unsigned long long) be64_to_cpu(wr->ae.ae_generic.user_context),
                        be32_to_cpu(wr->ae.ae_generic.resource_type),
                        be32_to_cpu(wr->ae.ae_generic.resource),
                        to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
index 1d25299..028a60b 100644 (file)
@@ -115,7 +115,7 @@ u16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
                            ((unsigned long) &(head->shared_ptr[mqsp]) -
                             (unsigned long) head);
                pr_debug("%s addr %p dma_addr %llx\n", __FUNCTION__,
-                        &(head->shared_ptr[mqsp]), (u64)*dma_addr);
+                        &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
                return &(head->shared_ptr[mqsp]);
        }
        return NULL;
index 485254e..75b93e9 100644 (file)
@@ -302,7 +302,7 @@ int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
        vq_req = vq_req_alloc(c2dev);
        if (!vq_req) {
                err = -ENOMEM;
-               goto bail1;
+               goto bail0;
        }
        vq_req->qp = qp;
        vq_req->cm_id = cm_id;
@@ -311,7 +311,7 @@ int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
        wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
        if (!wr) {
                err = -ENOMEM;
-               goto bail2;
+               goto bail1;
        }
 
        /* Build the WR */
@@ -331,7 +331,7 @@ int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
        /* Validate private_data length */
        if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
                err = -EINVAL;
-               goto bail2;
+               goto bail1;
        }
 
        if (iw_param->private_data) {
@@ -348,19 +348,19 @@ int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
        err = vq_send_wr(c2dev, (union c2wr *) wr);
        if (err) {
                vq_req_put(c2dev, vq_req);
-               goto bail2;
+               goto bail1;
        }
 
        /* Wait for reply from adapter */
        err = vq_wait_for_reply(c2dev, vq_req);
        if (err)
-               goto bail2;
+               goto bail1;
 
        /* Check that reply is present */
        reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
        if (!reply) {
                err = -ENOMEM;
-               goto bail2;
+               goto bail1;
        }
 
        err = c2_errno(reply);
@@ -368,9 +368,8 @@ int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
 
        if (!err)
                c2_set_qp_state(qp, C2_QP_STATE_RTS);
- bail2:
-       kfree(wr);
  bail1:
+       kfree(wr);
        vq_req_free(c2dev, vq_req);
  bail0:
        if (err) {
index dd6af55..da98d9f 100644 (file)
@@ -390,14 +390,18 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
        }
 
        mr = kmalloc(sizeof(*mr), GFP_KERNEL);
-       if (!mr)
+       if (!mr) {
+               vfree(page_list);
                return ERR_PTR(-ENOMEM);
+       }
 
        mr->pd = to_c2pd(ib_pd);
        pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
                "*iova_start %llx, first pa %llx, last pa %llx\n",
                __FUNCTION__, page_shift, pbl_depth, total_len,
-               *iova_start, page_list[0], page_list[pbl_depth-1]);
+               (unsigned long long) *iova_start,
+               (unsigned long long) page_list[0],
+               (unsigned long long) page_list[pbl_depth-1]);
        err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
                                         (1 << page_shift), pbl_depth,
                                         total_len, 0, iova_start,
index f49a32b..e37c568 100644 (file)
@@ -527,7 +527,7 @@ int c2_rnic_init(struct c2_dev *c2dev)
                                                DMA_FROM_DEVICE);
        pci_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
        pr_debug("%s rep_vq va %p dma %llx\n", __FUNCTION__, q1_pages,
-                (u64)c2dev->rep_vq.host_dma);
+                (unsigned long long) c2dev->rep_vq.host_dma);
        c2_mq_rep_init(&c2dev->rep_vq,
                   1,
                   qsize,
@@ -550,7 +550,7 @@ int c2_rnic_init(struct c2_dev *c2dev)
                                                DMA_FROM_DEVICE);
        pci_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
        pr_debug("%s aeq va %p dma %llx\n", __FUNCTION__, q1_pages,
-                (u64)c2dev->rep_vq.host_dma);
+                (unsigned long long) c2dev->rep_vq.host_dma);
        c2_mq_rep_init(&c2dev->aeq,
                       2,
                       qsize,
index f577905..54139d3 100644 (file)
@@ -141,8 +141,9 @@ struct infinipath_stats {
         * packets if ipath not configured, etc.)
         */
        __u64 sps_krdrops;
+       __u64 sps_txeparity; /* PIO buffer parity error, recovered */
        /* pad for future growth */
-       __u64 __sps_pad[46];
+       __u64 __sps_pad[45];
 };
 
 /*
@@ -185,6 +186,9 @@ typedef enum _ipath_ureg {
 #define IPATH_RUNTIME_PCIE     0x2
 #define IPATH_RUNTIME_FORCE_WC_ORDER   0x4
 #define IPATH_RUNTIME_RCVHDR_COPY      0x8
+#define IPATH_RUNTIME_MASTER   0x10
+#define IPATH_RUNTIME_PBC_REWRITE 0x20
+#define IPATH_RUNTIME_LOOSE_DMA_ALIGN 0x40
 
 /*
  * This structure is returned by ipath_userinit() immediately after
@@ -202,7 +206,8 @@ struct ipath_base_info {
        /* version of software, for feature checking. */
        __u32 spi_sw_version;
        /* InfiniPath port assigned, goes into sent packets */
-       __u32 spi_port;
+       __u16 spi_port;
+       __u16 spi_subport;
        /*
         * IB MTU, packets IB data must be less than this.
         * The MTU is in bytes, and will be a multiple of 4 bytes.
@@ -218,7 +223,7 @@ struct ipath_base_info {
        __u32 spi_tidcnt;
        /* size of the TID Eager list in infinipath, in entries */
        __u32 spi_tidegrcnt;
-       /* size of a single receive header queue entry. */
+       /* size of a single receive header queue entry in words. */
        __u32 spi_rcvhdrent_size;
        /*
         * Count of receive header queue entries allocated.
@@ -310,6 +315,12 @@ struct ipath_base_info {
        __u32 spi_filler_for_align;
        /* address of readonly memory copy of the rcvhdrq tail register. */
        __u64 spi_rcvhdr_tailaddr;
+
+       /* shared memory pages for subports if IPATH_RUNTIME_MASTER is set */
+       __u64 spi_subport_uregbase;
+       __u64 spi_subport_rcvegrbuf;
+       __u64 spi_subport_rcvhdr_base;
+
 } __attribute__ ((aligned(8)));
 
 
@@ -328,12 +339,12 @@ struct ipath_base_info {
 
 /*
  * Minor version differences are always compatible
- * a within a major version, however if if user software is larger
+ * a within a major version, however if user software is larger
  * than driver software, some new features and/or structure fields
  * may not be implemented; the user code must deal with this if it
- * cares, or it must abort after initialization reports the difference
+ * cares, or it must abort after initialization reports the difference.
  */
-#define IPATH_USER_SWMINOR 2
+#define IPATH_USER_SWMINOR 3
 
 #define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
 
@@ -379,7 +390,16 @@ struct ipath_user_info {
         */
        __u32 spu_rcvhdrsize;
 
-       __u64 spu_unused; /* kept for compatible layout */
+       /*
+        * If two or more processes wish to share a port, each process
+        * must set the spu_subport_cnt and spu_subport_id to the same
+        * values.  The only restriction on the spu_subport_id is that
+        * it be unique for a given node.
+        */
+       __u16 spu_subport_cnt;
+       __u16 spu_subport_id;
+
+       __u32 spu_unused; /* kept for compatible layout */
 
        /*
         * address of struct base_info to write to
@@ -392,19 +412,25 @@ struct ipath_user_info {
 
 #define IPATH_CMD_MIN          16
 
-#define IPATH_CMD_USER_INIT    16      /* set up userspace */
+#define __IPATH_CMD_USER_INIT  16      /* old set up userspace (for old user code) */
 #define IPATH_CMD_PORT_INFO    17      /* find out what resources we got */
 #define IPATH_CMD_RECV_CTRL    18      /* control receipt of packets */
 #define IPATH_CMD_TID_UPDATE   19      /* update expected TID entries */
 #define IPATH_CMD_TID_FREE     20      /* free expected TID entries */
 #define IPATH_CMD_SET_PART_KEY 21      /* add partition key */
+#define IPATH_CMD_SLAVE_INFO   22      /* return info on slave processes */
+#define IPATH_CMD_ASSIGN_PORT  23      /* allocate HCA and port */
+#define IPATH_CMD_USER_INIT    24      /* set up userspace */
 
-#define IPATH_CMD_MAX          21
+#define IPATH_CMD_MAX          24
 
 struct ipath_port_info {
        __u32 num_active;       /* number of active units */
        __u32 unit;             /* unit (chip) assigned to caller */
-       __u32 port;             /* port on unit assigned to caller */
+       __u16 port;             /* port on unit assigned to caller */
+       __u16 subport;          /* subport on unit assigned to caller */
+       __u16 num_ports;        /* number of ports available on unit */
+       __u16 num_subports;     /* number of subport slaves opened on port */
 };
 
 struct ipath_tid_info {
@@ -435,6 +461,8 @@ struct ipath_cmd {
                __u32 recv_ctrl;
                /* partition key to set */
                __u16 part_key;
+               /* user address of __u32 bitmask of active slaves */
+               __u64 slave_mask_addr;
        } cmd;
 };
 
@@ -596,6 +624,10 @@ struct infinipath_counters {
 
 /* K_PktFlags bits */
 #define INFINIPATH_KPF_INTR 0x1
+#define INFINIPATH_KPF_SUBPORT_MASK 0x3
+#define INFINIPATH_KPF_SUBPORT_SHIFT 1
+
+#define INFINIPATH_MAX_SUBPORT 4
 
 /* SendPIO per-buffer control */
 #define INFINIPATH_SP_TEST    0x40
@@ -610,7 +642,7 @@ struct ipath_header {
        /*
         * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
         * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
-        * Port 3, TID 11, offset 14.
+        * Port 4, TID 11, offset 13.
         */
        __le32 ver_port_tid_offset;
        __le16 chksum;
index 049221b..87462e0 100644 (file)
@@ -46,7 +46,7 @@
  */
 void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
 {
-       struct ipath_cq_wc *wc = cq->queue;
+       struct ipath_cq_wc *wc;
        unsigned long flags;
        u32 head;
        u32 next;
@@ -57,6 +57,7 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
         * Note that the head pointer might be writable by user processes.
         * Take care to verify it is a sane value.
         */
+       wc = cq->queue;
        head = wc->head;
        if (head >= (unsigned) cq->ibcq.cqe) {
                head = cq->ibcq.cqe;
@@ -109,21 +110,27 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
 int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 {
        struct ipath_cq *cq = to_icq(ibcq);
-       struct ipath_cq_wc *wc = cq->queue;
+       struct ipath_cq_wc *wc;
        unsigned long flags;
        int npolled;
+       u32 tail;
 
        spin_lock_irqsave(&cq->lock, flags);
 
+       wc = cq->queue;
+       tail = wc->tail;
+       if (tail > (u32) cq->ibcq.cqe)
+               tail = (u32) cq->ibcq.cqe;
        for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
-               if (wc->tail == wc->head)
+               if (tail == wc->head)
                        break;
-               *entry = wc->queue[wc->tail];
-               if (wc->tail >= cq->ibcq.cqe)
-                       wc->tail = 0;
+               *entry = wc->queue[tail];
+               if (tail >= cq->ibcq.cqe)
+                       tail = 0;
                else
-                       wc->tail++;
+                       tail++;
        }
+       wc->tail = tail;
 
        spin_unlock_irqrestore(&cq->lock, flags);
 
@@ -177,11 +184,6 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
                goto done;
        }
 
-       if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
-               ret = ERR_PTR(-ENOMEM);
-               goto done;
-       }
-
        /* Allocate the completion queue structure. */
        cq = kmalloc(sizeof(*cq), GFP_KERNEL);
        if (!cq) {
@@ -237,6 +239,16 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
        } else
                cq->ip = NULL;
 
+       spin_lock(&dev->n_cqs_lock);
+       if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
+               spin_unlock(&dev->n_cqs_lock);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_wc;
+       }
+
+       dev->n_cqs_allocated++;
+       spin_unlock(&dev->n_cqs_lock);
+
        /*
         * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
         * The number of entries should be >= the number requested or return
@@ -253,7 +265,6 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
 
        ret = &cq->ibcq;
 
-       dev->n_cqs_allocated++;
        goto done;
 
 bail_wc:
@@ -280,7 +291,9 @@ int ipath_destroy_cq(struct ib_cq *ibcq)
        struct ipath_cq *cq = to_icq(ibcq);
 
        tasklet_kill(&cq->comptask);
+       spin_lock(&dev->n_cqs_lock);
        dev->n_cqs_allocated--;
+       spin_unlock(&dev->n_cqs_lock);
        if (cq->ip)
                kref_put(&cq->ip->ref, ipath_release_mmap_info);
        else
@@ -316,10 +329,16 @@ int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
        return 0;
 }
 
+/**
+ * ipath_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Returns 0 for success.
+ */
 int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 {
        struct ipath_cq *cq = to_icq(ibcq);
-       struct ipath_cq_wc *old_wc = cq->queue;
+       struct ipath_cq_wc *old_wc;
        struct ipath_cq_wc *wc;
        u32 head, tail, n;
        int ret;
@@ -355,6 +374,7 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
         * Make sure head and tail are sane since they
         * might be user writable.
         */
+       old_wc = cq->queue;
        head = old_wc->head;
        if (head > (u32) cq->ibcq.cqe)
                head = (u32) cq->ibcq.cqe;
index 2108466..12cefa6 100644 (file)
@@ -95,16 +95,6 @@ const char *ipath_ibcstatus_str[] = {
        "RecovIdle",
 };
 
-/*
- * These variables are initialized in the chip-specific files
- * but are defined here.
- */
-u16 ipath_gpio_sda_num, ipath_gpio_scl_num;
-u64 ipath_gpio_sda, ipath_gpio_scl;
-u64 infinipath_i_bitsextant;
-ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
-u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
-
 static void __devexit ipath_remove_one(struct pci_dev *);
 static int __devinit ipath_init_one(struct pci_dev *,
                                    const struct pci_device_id *);
@@ -527,28 +517,146 @@ bail:
        return ret;
 }
 
+static void __devexit cleanup_device(struct ipath_devdata *dd)
+{
+       int port;
+
+       ipath_shutdown_device(dd);
+
+       if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+               /* can't do anything more with chip; needs re-init */
+               *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+               if (dd->ipath_kregbase) {
+                       /*
+                        * if we haven't already cleaned up before these are
+                        * to ensure any register reads/writes "fail" until
+                        * re-init
+                        */
+                       dd->ipath_kregbase = NULL;
+                       dd->ipath_uregbase = 0;
+                       dd->ipath_sregbase = 0;
+                       dd->ipath_cregbase = 0;
+                       dd->ipath_kregsize = 0;
+               }
+               ipath_disable_wc(dd);
+       }
+
+       if (dd->ipath_pioavailregs_dma) {
+               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+                                 (void *) dd->ipath_pioavailregs_dma,
+                                 dd->ipath_pioavailregs_phys);
+               dd->ipath_pioavailregs_dma = NULL;
+       }
+       if (dd->ipath_dummy_hdrq) {
+               dma_free_coherent(&dd->pcidev->dev,
+                       dd->ipath_pd[0]->port_rcvhdrq_size,
+                       dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
+               dd->ipath_dummy_hdrq = NULL;
+       }
+
+       if (dd->ipath_pageshadow) {
+               struct page **tmpp = dd->ipath_pageshadow;
+               dma_addr_t *tmpd = dd->ipath_physshadow;
+               int i, cnt = 0;
+
+               ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
+                          "locked\n");
+               for (port = 0; port < dd->ipath_cfgports; port++) {
+                       int port_tidbase = port * dd->ipath_rcvtidcnt;
+                       int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+                       for (i = port_tidbase; i < maxtid; i++) {
+                               if (!tmpp[i])
+                                       continue;
+                               pci_unmap_page(dd->pcidev, tmpd[i],
+                                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
+                               ipath_release_user_pages(&tmpp[i], 1);
+                               tmpp[i] = NULL;
+                               cnt++;
+                       }
+               }
+               if (cnt) {
+                       ipath_stats.sps_pageunlocks += cnt;
+                       ipath_cdbg(VERBOSE, "There were still %u expTID "
+                                  "entries locked\n", cnt);
+               }
+               if (ipath_stats.sps_pagelocks ||
+                   ipath_stats.sps_pageunlocks)
+                       ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
+                                  "unlocked via ipath_m{un}lock\n",
+                                  (unsigned long long)
+                                  ipath_stats.sps_pagelocks,
+                                  (unsigned long long)
+                                  ipath_stats.sps_pageunlocks);
+
+               ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
+                          dd->ipath_pageshadow);
+               vfree(dd->ipath_pageshadow);
+               dd->ipath_pageshadow = NULL;
+       }
+
+       /*
+        * free any resources still in use (usually just kernel ports)
+        * at unload; we do for portcnt, not cfgports, because cfgports
+        * could have changed while we were loaded.
+        */
+       for (port = 0; port < dd->ipath_portcnt; port++) {
+               struct ipath_portdata *pd = dd->ipath_pd[port];
+               dd->ipath_pd[port] = NULL;
+               ipath_free_pddata(dd, pd);
+       }
+       kfree(dd->ipath_pd);
+       /*
+        * debuggability, in case some cleanup path tries to use it
+        * after this
+        */
+       dd->ipath_pd = NULL;
+}
+
 static void __devexit ipath_remove_one(struct pci_dev *pdev)
 {
-       struct ipath_devdata *dd;
+       struct ipath_devdata *dd = pci_get_drvdata(pdev);
 
-       ipath_cdbg(VERBOSE, "removing, pdev=%p\n", pdev);
-       if (!pdev)
-               return;
+       ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
+
+       if (dd->verbs_dev)
+               ipath_unregister_ib_device(dd->verbs_dev);
 
-       dd = pci_get_drvdata(pdev);
-       ipath_unregister_ib_device(dd->verbs_dev);
        ipath_diag_remove(dd);
        ipath_user_remove(dd);
        ipathfs_remove_device(dd);
        ipath_device_remove_group(&pdev->dev, dd);
+
        ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
                   "unit %u\n", dd, (u32) dd->ipath_unit);
-       if (dd->ipath_kregbase) {
-               ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n",
-                          dd->ipath_kregbase);
-               iounmap((volatile void __iomem *) dd->ipath_kregbase);
-               dd->ipath_kregbase = NULL;
-       }
+
+       cleanup_device(dd);
+
+       /*
+        * turn off rcv, send, and interrupts for all ports, all drivers
+        * should also hard reset the chip here?
+        * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+        * for all versions of the driver, if they were allocated
+        */
+       if (pdev->irq) {
+               ipath_cdbg(VERBOSE,
+                          "unit %u free_irq of irq %x\n",
+                          dd->ipath_unit, pdev->irq);
+               free_irq(pdev->irq, dd);
+       } else
+               ipath_dbg("irq is 0, not doing free_irq "
+                         "for unit %u\n", dd->ipath_unit);
+       /*
+        * we check for NULL here, because it's outside
+        * the kregbase check, and we need to call it
+        * after the free_irq.  Thus it's possible that
+        * the function pointers were never initialized.
+        */
+       if (dd->ipath_f_cleanup)
+               /* clean up chip-specific stuff */
+               dd->ipath_f_cleanup(dd);
+
+       ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
+       iounmap((volatile void __iomem *) dd->ipath_kregbase);
        pci_release_regions(pdev);
        ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
        pci_disable_device(pdev);
@@ -760,8 +868,8 @@ static void get_rhf_errstring(u32 err, char *msg, size_t len)
 static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum,
                                     int err)
 {
-       return dd->ipath_port0_skbs ?
-               (void *)dd->ipath_port0_skbs[bufnum]->data : NULL;
+       return dd->ipath_port0_skbinfo ?
+               (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
 }
 
 /**
@@ -783,31 +891,34 @@ struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
         */
 
        /*
-        * We need 4 extra bytes for unaligned transfer copying
+        * We need 2 extra bytes for ipath_ether data sent in the
+        * key header.  In order to keep everything dword aligned,
+        * we'll reserve 4 bytes.
         */
+       len = dd->ipath_ibmaxlen + 4;
+
        if (dd->ipath_flags & IPATH_4BYTE_TID) {
-               /* we need a 4KB multiple alignment, and there is no way
+               /* We need a 2KB multiple alignment, and there is no way
                 * to do it except to allocate extra and then skb_reserve
                 * enough to bring it up to the right alignment.
                 */
-               len = dd->ipath_ibmaxlen + 4 + (1 << 11) - 1;
+               len += 2047;
        }
-       else
-               len = dd->ipath_ibmaxlen + 4;
+
        skb = __dev_alloc_skb(len, gfp_mask);
        if (!skb) {
                ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
                              len);
                goto bail;
        }
+
+       skb_reserve(skb, 4);
+
        if (dd->ipath_flags & IPATH_4BYTE_TID) {
-               u32 una = ((1 << 11) - 1) & (unsigned long)(skb->data + 4);
+               u32 una = (unsigned long)skb->data & 2047;
                if (una)
-                       skb_reserve(skb, 4 + (1 << 11) - una);
-               else
-                       skb_reserve(skb, 4);
-       } else
-               skb_reserve(skb, 4);
+                       skb_reserve(skb, 2048 - una);
+       }
 
 bail:
        return skb;
@@ -1326,6 +1437,9 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd,
                                      "for port %u rcvhdrqtailaddr failed\n",
                                      pd->port_port);
                        ret = -ENOMEM;
+                       dma_free_coherent(&dd->pcidev->dev, amt,
+                                         pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
+                       pd->port_rcvhdrq = NULL;
                        goto bail;
                }
                pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
@@ -1347,12 +1461,13 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd,
                ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
                           "hdrtailaddr@%p %llx physical\n",
                           pd->port_port, pd->port_rcvhdrq,
-                          pd->port_rcvhdrq_phys, pd->port_rcvhdrtail_kvaddr,
-                          (unsigned long long)pd->port_rcvhdrqtailaddr_phys);
+                          (unsigned long long) pd->port_rcvhdrq_phys,
+                          pd->port_rcvhdrtail_kvaddr, (unsigned long long)
+                          pd->port_rcvhdrqtailaddr_phys);
 
        /* clear for security and sanity on each use */
        memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
-       memset((void *)pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+       memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
 
        /*
         * tell chip each time we init it, even if we are re-using previous
@@ -1805,7 +1920,7 @@ void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
                pd->port_rcvhdrq = NULL;
                if (pd->port_rcvhdrtail_kvaddr) {
                        dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                        (void *)pd->port_rcvhdrtail_kvaddr,
+                                        pd->port_rcvhdrtail_kvaddr,
                                         pd->port_rcvhdrqtailaddr_phys);
                        pd->port_rcvhdrtail_kvaddr = NULL;
                }
@@ -1824,24 +1939,32 @@ void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
                        dma_free_coherent(&dd->pcidev->dev, size,
                                base, pd->port_rcvegrbuf_phys[e]);
                }
-               vfree(pd->port_rcvegrbuf);
+               kfree(pd->port_rcvegrbuf);
                pd->port_rcvegrbuf = NULL;
-               vfree(pd->port_rcvegrbuf_phys);
+               kfree(pd->port_rcvegrbuf_phys);
                pd->port_rcvegrbuf_phys = NULL;
                pd->port_rcvegrbuf_chunks = 0;
-       } else if (pd->port_port == 0 && dd->ipath_port0_skbs) {
+       } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
                unsigned e;
-               struct sk_buff **skbs = dd->ipath_port0_skbs;
+               struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
 
-               dd->ipath_port0_skbs = NULL;
-               ipath_cdbg(VERBOSE, "free closed port %d ipath_port0_skbs "
-                          "@ %p\n", pd->port_port, skbs);
+               dd->ipath_port0_skbinfo = NULL;
+               ipath_cdbg(VERBOSE, "free closed port %d "
+                          "ipath_port0_skbinfo @ %p\n", pd->port_port,
+                          skbinfo);
                for (e = 0; e < dd->ipath_rcvegrcnt; e++)
-                       if (skbs[e])
-                               dev_kfree_skb(skbs[e]);
-               vfree(skbs);
+               if (skbinfo[e].skb) {
+                       pci_unmap_single(dd->pcidev, skbinfo[e].phys,
+                                        dd->ipath_ibmaxlen,
+                                        PCI_DMA_FROMDEVICE);
+                       dev_kfree_skb(skbinfo[e].skb);
+               }
+               vfree(skbinfo);
        }
        kfree(pd->port_tid_pg_list);
+       vfree(pd->subport_uregbase);
+       vfree(pd->subport_rcvegrbuf);
+       vfree(pd->subport_rcvhdr_base);
        kfree(pd);
 }
 
@@ -1907,150 +2030,12 @@ bail:
        return ret;
 }
 
-static void cleanup_device(struct ipath_devdata *dd)
-{
-       int port;
-
-       ipath_shutdown_device(dd);
-
-       if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
-               /* can't do anything more with chip; needs re-init */
-               *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
-               if (dd->ipath_kregbase) {
-                       /*
-                        * if we haven't already cleaned up before these are
-                        * to ensure any register reads/writes "fail" until
-                        * re-init
-                        */
-                       dd->ipath_kregbase = NULL;
-                       dd->ipath_uregbase = 0;
-                       dd->ipath_sregbase = 0;
-                       dd->ipath_cregbase = 0;
-                       dd->ipath_kregsize = 0;
-               }
-               ipath_disable_wc(dd);
-       }
-
-       if (dd->ipath_pioavailregs_dma) {
-               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                 (void *) dd->ipath_pioavailregs_dma,
-                                 dd->ipath_pioavailregs_phys);
-               dd->ipath_pioavailregs_dma = NULL;
-       }
-       if (dd->ipath_dummy_hdrq) {
-               dma_free_coherent(&dd->pcidev->dev,
-                       dd->ipath_pd[0]->port_rcvhdrq_size,
-                       dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
-               dd->ipath_dummy_hdrq = NULL;
-       }
-
-       if (dd->ipath_pageshadow) {
-               struct page **tmpp = dd->ipath_pageshadow;
-               int i, cnt = 0;
-
-               ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
-                          "locked\n");
-               for (port = 0; port < dd->ipath_cfgports; port++) {
-                       int port_tidbase = port * dd->ipath_rcvtidcnt;
-                       int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
-                       for (i = port_tidbase; i < maxtid; i++) {
-                               if (!tmpp[i])
-                                       continue;
-                               ipath_release_user_pages(&tmpp[i], 1);
-                               tmpp[i] = NULL;
-                               cnt++;
-                       }
-               }
-               if (cnt) {
-                       ipath_stats.sps_pageunlocks += cnt;
-                       ipath_cdbg(VERBOSE, "There were still %u expTID "
-                                  "entries locked\n", cnt);
-               }
-               if (ipath_stats.sps_pagelocks ||
-                   ipath_stats.sps_pageunlocks)
-                       ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
-                                  "unlocked via ipath_m{un}lock\n",
-                                  (unsigned long long)
-                                  ipath_stats.sps_pagelocks,
-                                  (unsigned long long)
-                                  ipath_stats.sps_pageunlocks);
-
-               ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
-                          dd->ipath_pageshadow);
-               vfree(dd->ipath_pageshadow);
-               dd->ipath_pageshadow = NULL;
-       }
-
-       /*
-        * free any resources still in use (usually just kernel ports)
-        * at unload; we do for portcnt, not cfgports, because cfgports
-        * could have changed while we were loaded.
-        */
-       for (port = 0; port < dd->ipath_portcnt; port++) {
-               struct ipath_portdata *pd = dd->ipath_pd[port];
-               dd->ipath_pd[port] = NULL;
-               ipath_free_pddata(dd, pd);
-       }
-       kfree(dd->ipath_pd);
-       /*
-        * debuggability, in case some cleanup path tries to use it
-        * after this
-        */
-       dd->ipath_pd = NULL;
-}
-
 static void __exit infinipath_cleanup(void)
 {
-       struct ipath_devdata *dd, *tmp;
-       unsigned long flags;
-
-       ipath_diagpkt_remove();
-
        ipath_exit_ipathfs();
 
        ipath_driver_remove_group(&ipath_driver.driver);
 
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       /*
-        * turn off rcv, send, and interrupts for all ports, all drivers
-        * should also hard reset the chip here?
-        * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
-        * for all versions of the driver, if they were allocated
-        */
-       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
-               spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-               if (dd->ipath_kregbase)
-                       cleanup_device(dd);
-
-               if (dd->pcidev) {
-                       if (dd->pcidev->irq) {
-                               ipath_cdbg(VERBOSE,
-                                          "unit %u free_irq of irq %x\n",
-                                          dd->ipath_unit, dd->pcidev->irq);
-                               free_irq(dd->pcidev->irq, dd);
-                       } else
-                               ipath_dbg("irq is 0, not doing free_irq "
-                                         "for unit %u\n", dd->ipath_unit);
-
-                       /*
-                        * we check for NULL here, because it's outside
-                        * the kregbase check, and we need to call it
-                        * after the free_irq.  Thus it's possible that
-                        * the function pointers were never initialized.
-                        */
-                       if (dd->ipath_f_cleanup)
-                               /* clean up chip-specific stuff */
-                               dd->ipath_f_cleanup(dd);
-
-                       dd->pcidev = NULL;
-               }
-               spin_lock_irqsave(&ipath_devs_lock, flags);
-       }
-
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
        ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
        pci_unregister_driver(&ipath_driver);
 
index 3313356..a4019a6 100644 (file)
@@ -100,9 +100,9 @@ static int i2c_gpio_set(struct ipath_devdata *dd,
        gpioval = &dd->ipath_gpio_out;
        read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
        if (line == i2c_line_scl)
-               mask = ipath_gpio_scl;
+               mask = dd->ipath_gpio_scl;
        else
-               mask = ipath_gpio_sda;
+               mask = dd->ipath_gpio_sda;
 
        if (new_line_state == i2c_line_high)
                /* tri-state the output rather than force high */
@@ -119,12 +119,12 @@ static int i2c_gpio_set(struct ipath_devdata *dd,
                write_val = 0x0UL;
 
        if (line == i2c_line_scl) {
-               write_val <<= ipath_gpio_scl_num;
-               *gpioval = *gpioval & ~(1UL << ipath_gpio_scl_num);
+               write_val <<= dd->ipath_gpio_scl_num;
+               *gpioval = *gpioval & ~(1UL << dd->ipath_gpio_scl_num);
                *gpioval |= write_val;
        } else {
-               write_val <<= ipath_gpio_sda_num;
-               *gpioval = *gpioval & ~(1UL << ipath_gpio_sda_num);
+               write_val <<= dd->ipath_gpio_sda_num;
+               *gpioval = *gpioval & ~(1UL << dd->ipath_gpio_sda_num);
                *gpioval |= write_val;
        }
        ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
@@ -157,9 +157,9 @@ static int i2c_gpio_get(struct ipath_devdata *dd,
        read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
        /* config line to be an input */
        if (line == i2c_line_scl)
-               mask = ipath_gpio_scl;
+               mask = dd->ipath_gpio_scl;
        else
-               mask = ipath_gpio_sda;
+               mask = dd->ipath_gpio_sda;
        write_val = read_val & ~mask;
        ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, write_val);
        read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
@@ -187,6 +187,7 @@ bail:
 static void i2c_wait_for_writes(struct ipath_devdata *dd)
 {
        (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+       rmb();
 }
 
 static void scl_out(struct ipath_devdata *dd, u8 bit)
index 29930e2..a9ddc69 100644 (file)
 #include "ipath_kernel.h"
 #include "ipath_common.h"
 
+/*
+ * mmap64 doesn't allow all 64 bits for 32-bit applications
+ * so only use the low 43 bits.
+ */
+#define MMAP64_MASK    0x7FFFFFFFFFFUL
+
 static int ipath_open(struct inode *, struct file *);
 static int ipath_close(struct inode *, struct file *);
 static ssize_t ipath_write(struct file *, const char __user *, size_t,
@@ -57,18 +63,35 @@ static struct file_operations ipath_file_ops = {
        .mmap = ipath_mmap
 };
 
-static int ipath_get_base_info(struct ipath_portdata *pd,
+static int ipath_get_base_info(struct file *fp,
                               void __user *ubase, size_t ubase_size)
 {
+       struct ipath_portdata *pd = port_fp(fp);
        int ret = 0;
        struct ipath_base_info *kinfo = NULL;
        struct ipath_devdata *dd = pd->port_dd;
+       unsigned subport_cnt;
+       int shared, master;
+       size_t sz;
+
+       subport_cnt = pd->port_subport_cnt;
+       if (!subport_cnt) {
+               shared = 0;
+               master = 0;
+               subport_cnt = 1;
+       } else {
+               shared = 1;
+               master = !subport_fp(fp);
+       }
 
-       if (ubase_size < sizeof(*kinfo)) {
+       sz = sizeof(*kinfo);
+       /* If port sharing is not requested, allow the old size structure */
+       if (!shared)
+               sz -= 3 * sizeof(u64);
+       if (ubase_size < sz) {
                ipath_cdbg(PROC,
-                          "Base size %lu, need %lu (version mismatch?)\n",
-                          (unsigned long) ubase_size,
-                          (unsigned long) sizeof(*kinfo));
+                          "Base size %zu, need %zu (version mismatch?)\n",
+                          ubase_size, sz);
                ret = -EINVAL;
                goto bail;
        }
@@ -95,7 +118,9 @@ static int ipath_get_base_info(struct ipath_portdata *pd,
        kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
        kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
                pd->port_rcvegrbuf_chunks;
-       kinfo->spi_tidcnt = dd->ipath_rcvtidcnt;
+       kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
+       if (master)
+               kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
        /*
         * for this use, may be ipath_cfgports summed over all chips that
         * are are configured and present
@@ -118,31 +143,75 @@ static int ipath_get_base_info(struct ipath_portdata *pd,
         * page_address() macro worked, but in 2.6.11, even that returns the
         * full 64 bit address (upper bits all 1's).  So far, using the
         * physical addresses (or chip offsets, for chip mapping) works, but
-        * no doubt some future kernel release will chang that, and we'll be
-        * on to yet another method of dealing with this
+        * no doubt some future kernel release will change that, and we'll be
+        * on to yet another method of dealing with this.
         */
        kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
-       kinfo->spi_rcvhdr_tailaddr = (u64)pd->port_rcvhdrqtailaddr_phys;
+       kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
        kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
        kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
        kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
                (void *) dd->ipath_statusp -
                (void *) dd->ipath_pioavailregs_dma;
-       kinfo->spi_piobufbase = (u64) pd->port_piobufs;
-       kinfo->__spi_uregbase =
-               dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
+       if (!shared) {
+               kinfo->spi_piocnt = dd->ipath_pbufsport;
+               kinfo->spi_piobufbase = (u64) pd->port_piobufs;
+               kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
+                       dd->ipath_palign * pd->port_port;
+       } else if (master) {
+               kinfo->spi_piocnt = (dd->ipath_pbufsport / subport_cnt) +
+                                   (dd->ipath_pbufsport % subport_cnt);
+               /* Master's PIO buffers are after all the slave's */
+               kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+                       dd->ipath_palign *
+                       (dd->ipath_pbufsport - kinfo->spi_piocnt);
+               kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
+                       dd->ipath_palign * pd->port_port;
+       } else {
+               unsigned slave = subport_fp(fp) - 1;
+
+               kinfo->spi_piocnt = dd->ipath_pbufsport / subport_cnt;
+               kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+                       dd->ipath_palign * kinfo->spi_piocnt * slave;
+               kinfo->__spi_uregbase = ((u64) pd->subport_uregbase +
+                       PAGE_SIZE * slave) & MMAP64_MASK;
 
-       kinfo->spi_pioindex = dd->ipath_pbufsport * (pd->port_port - 1);
-       kinfo->spi_piocnt = dd->ipath_pbufsport;
+               kinfo->spi_rcvhdr_base = ((u64) pd->subport_rcvhdr_base +
+                       pd->port_rcvhdrq_size * slave) & MMAP64_MASK;
+               kinfo->spi_rcvhdr_tailaddr =
+                       (u64) pd->port_rcvhdrqtailaddr_phys & MMAP64_MASK;
+               kinfo->spi_rcv_egrbufs = ((u64) pd->subport_rcvegrbuf +
+                       dd->ipath_rcvegrcnt * dd->ipath_rcvegrbufsize * slave) &
+                       MMAP64_MASK;
+       }
+
+       kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->ipath_piobufbase) /
+               dd->ipath_palign;
        kinfo->spi_pioalign = dd->ipath_palign;
 
        kinfo->spi_qpair = IPATH_KD_QP;
        kinfo->spi_piosize = dd->ipath_ibmaxlen;
        kinfo->spi_mtu = dd->ipath_ibmaxlen;    /* maxlen, not ibmtu */
        kinfo->spi_port = pd->port_port;
+       kinfo->spi_subport = subport_fp(fp);
        kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
        kinfo->spi_hw_version = dd->ipath_revision;
 
+       if (master) {
+               kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
+               kinfo->spi_subport_uregbase =
+                       (u64) pd->subport_uregbase & MMAP64_MASK;
+               kinfo->spi_subport_rcvegrbuf =
+                       (u64) pd->subport_rcvegrbuf & MMAP64_MASK;
+               kinfo->spi_subport_rcvhdr_base =
+                       (u64) pd->subport_rcvhdr_base & MMAP64_MASK;
+               ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
+                       kinfo->spi_port, kinfo->spi_runtime_flags,
+                       (unsigned long long) kinfo->spi_subport_uregbase,
+                       (unsigned long long) kinfo->spi_subport_rcvegrbuf,
+                       (unsigned long long) kinfo->spi_subport_rcvhdr_base);
+       }
+
        if (copy_to_user(ubase, kinfo, sizeof(*kinfo)))
                ret = -EFAULT;
 
@@ -154,6 +223,7 @@ bail:
 /**
  * ipath_tid_update - update a port TID
  * @pd: the port
+ * @fp: the ipath device file
  * @ti: the TID information
  *
  * The new implementation as of Oct 2004 is that the driver assigns
@@ -176,11 +246,11 @@ bail:
  * virtually contiguous pages, that should change to improve
  * performance.
  */
-static int ipath_tid_update(struct ipath_portdata *pd,
+static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
                            const struct ipath_tid_info *ti)
 {
        int ret = 0, ntids;
-       u32 tid, porttid, cnt, i, tidcnt;
+       u32 tid, porttid, cnt, i, tidcnt, tidoff;
        u16 *tidlist;
        struct ipath_devdata *dd = pd->port_dd;
        u64 physaddr;
@@ -188,6 +258,7 @@ static int ipath_tid_update(struct ipath_portdata *pd,
        u64 __iomem *tidbase;
        unsigned long tidmap[8];
        struct page **pagep = NULL;
+       unsigned subport = subport_fp(fp);
 
        if (!dd->ipath_pageshadow) {
                ret = -ENOMEM;
@@ -204,20 +275,34 @@ static int ipath_tid_update(struct ipath_portdata *pd,
                ret = -EFAULT;
                goto done;
        }
-       tidcnt = dd->ipath_rcvtidcnt;
-       if (cnt >= tidcnt) {
+       porttid = pd->port_port * dd->ipath_rcvtidcnt;
+       if (!pd->port_subport_cnt) {
+               tidcnt = dd->ipath_rcvtidcnt;
+               tid = pd->port_tidcursor;
+               tidoff = 0;
+       } else if (!subport) {
+               tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+                        (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+               tidoff = dd->ipath_rcvtidcnt - tidcnt;
+               porttid += tidoff;
+               tid = tidcursor_fp(fp);
+       } else {
+               tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+               tidoff = tidcnt * (subport - 1);
+               porttid += tidoff;
+               tid = tidcursor_fp(fp);
+       }
+       if (cnt > tidcnt) {
                /* make sure it all fits in port_tid_pg_list */
                dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
                         "TIDs, only trying max (%u)\n", cnt, tidcnt);
                cnt = tidcnt;
        }
-       pagep = (struct page **)pd->port_tid_pg_list;
-       tidlist = (u16 *) (&pagep[cnt]);
+       pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
+       tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
 
        memset(tidmap, 0, sizeof(tidmap));
-       tid = pd->port_tidcursor;
        /* before decrement; chip actual # */
-       porttid = pd->port_port * tidcnt;
        ntids = tidcnt;
        tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
                                   dd->ipath_rcvtidbase +
@@ -274,16 +359,19 @@ static int ipath_tid_update(struct ipath_portdata *pd,
                        ret = -ENOMEM;
                        break;
                }
-               tidlist[i] = tid;
+               tidlist[i] = tid + tidoff;
                ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
-                          "vaddr %lx\n", i, tid, vaddr);
+                          "vaddr %lx\n", i, tid + tidoff, vaddr);
                /* we "know" system pages and TID pages are same size */
                dd->ipath_pageshadow[porttid + tid] = pagep[i];
+               dd->ipath_physshadow[porttid + tid] = ipath_map_page(
+                       dd->pcidev, pagep[i], 0, PAGE_SIZE,
+                       PCI_DMA_FROMDEVICE);
                /*
                 * don't need atomic or it's overhead
                 */
                __set_bit(tid, tidmap);
-               physaddr = page_to_phys(pagep[i]);
+               physaddr = dd->ipath_physshadow[porttid + tid];
                ipath_stats.sps_pagelocks++;
                ipath_cdbg(VERBOSE,
                           "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
@@ -317,6 +405,9 @@ static int ipath_tid_update(struct ipath_portdata *pd,
                                           tid);
                                dd->ipath_f_put_tid(dd, &tidbase[tid], 1,
                                                    dd->ipath_tidinvalid);
+                               pci_unmap_page(dd->pcidev,
+                                       dd->ipath_physshadow[porttid + tid],
+                                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
                                dd->ipath_pageshadow[porttid + tid] = NULL;
                                ipath_stats.sps_pageunlocks++;
                        }
@@ -341,7 +432,10 @@ static int ipath_tid_update(struct ipath_portdata *pd,
                }
                if (tid == tidcnt)
                        tid = 0;
-               pd->port_tidcursor = tid;
+               if (!pd->port_subport_cnt)
+                       pd->port_tidcursor = tid;
+               else
+                       tidcursor_fp(fp) = tid;
        }
 
 done:
@@ -354,6 +448,7 @@ done:
 /**
  * ipath_tid_free - free a port TID
  * @pd: the port
+ * @subport: the subport
  * @ti: the TID info
  *
  * right now we are unlocking one page at a time, but since
@@ -367,7 +462,7 @@ done:
  * they pass in to us.
  */
 
-static int ipath_tid_free(struct ipath_portdata *pd,
+static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
                          const struct ipath_tid_info *ti)
 {
        int ret = 0;
@@ -388,11 +483,20 @@ static int ipath_tid_free(struct ipath_portdata *pd,
        }
 
        porttid = pd->port_port * dd->ipath_rcvtidcnt;
+       if (!pd->port_subport_cnt)
+               tidcnt = dd->ipath_rcvtidcnt;
+       else if (!subport) {
+               tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+                        (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+               porttid += dd->ipath_rcvtidcnt - tidcnt;
+       } else {
+               tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+               porttid += tidcnt * (subport - 1);
+       }
        tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
                                   dd->ipath_rcvtidbase +
                                   porttid * sizeof(*tidbase));
 
-       tidcnt = dd->ipath_rcvtidcnt;
        limit = sizeof(tidmap) * BITS_PER_BYTE;
        if (limit > tidcnt)
                /* just in case size changes in future */
@@ -417,6 +521,9 @@ static int ipath_tid_free(struct ipath_portdata *pd,
                                   pd->port_pid, tid);
                        dd->ipath_f_put_tid(dd, &tidbase[tid], 1,
                                            dd->ipath_tidinvalid);
+                       pci_unmap_page(dd->pcidev,
+                               dd->ipath_physshadow[porttid + tid],
+                               PAGE_SIZE, PCI_DMA_FROMDEVICE);
                        ipath_release_user_pages(
                                &dd->ipath_pageshadow[porttid + tid], 1);
                        dd->ipath_pageshadow[porttid + tid] = NULL;
@@ -581,20 +688,24 @@ bail:
 /**
  * ipath_manage_rcvq - manage a port's receive queue
  * @pd: the port
+ * @subport: the subport
  * @start_stop: action to carry out
  *
  * start_stop == 0 disables receive on the port, for use in queue
  * overflow conditions.  start_stop==1 re-enables, to be used to
  * re-init the software copy of the head register
  */
-static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop)
+static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
+                            int start_stop)
 {
        struct ipath_devdata *dd = pd->port_dd;
        u64 tval;
 
-       ipath_cdbg(PROC, "%sabling rcv for unit %u port %u\n",
+       ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
                   start_stop ? "en" : "dis", dd->ipath_unit,
-                  pd->port_port);
+                  pd->port_port, subport);
+       if (subport)
+               goto bail;
        /* atomically clear receive enable port. */
        if (start_stop) {
                /*
@@ -609,7 +720,7 @@ static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop)
                 * updated and correct itself, even in the face of software
                 * bugs.
                 */
-               *pd->port_rcvhdrtail_kvaddr = 0;
+               *(volatile u64 *)pd->port_rcvhdrtail_kvaddr = 0;
                set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
                        &dd->ipath_rcvctrl);
        } else
@@ -630,6 +741,7 @@ static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop)
                tval = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
        }
        /* always; new head should be equal to new tail; see above */
+bail:
        return 0;
 }
 
@@ -687,6 +799,36 @@ static void ipath_clean_part_key(struct ipath_portdata *pd,
        }
 }
 
+/*
+ * Initialize the port data with the receive buffer sizes
+ * so this can be done while the master port is locked.
+ * Otherwise, there is a race with a slave opening the port
+ * and seeing these fields uninitialized.
+ */
+static void init_user_egr_sizes(struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       unsigned egrperchunk, egrcnt, size;
+
+       /*
+        * to avoid wasting a lot of memory, we allocate 32KB chunks of
+        * physically contiguous memory, advance through it until used up
+        * and then allocate more.  Of course, we need memory to store those
+        * extra pointers, now.  Started out with 256KB, but under heavy
+        * memory pressure (creating large files and then copying them over
+        * NFS while doing lots of MPI jobs), we hit some allocation
+        * failures, even though we can sleep...  (2.6.10) Still get
+        * failures at 64K.  32K is the lowest we can go without wasting
+        * additional memory.
+        */
+       size = 0x8000;
+       egrperchunk = size / dd->ipath_rcvegrbufsize;
+       egrcnt = dd->ipath_rcvegrcnt;
+       pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
+       pd->port_rcvegrbufs_perchunk = egrperchunk;
+       pd->port_rcvegrbuf_size = size;
+}
+
 /**
  * ipath_create_user_egr - allocate eager TID buffers
  * @pd: the port to allocate TID buffers for
@@ -702,7 +844,7 @@ static void ipath_clean_part_key(struct ipath_portdata *pd,
 static int ipath_create_user_egr(struct ipath_portdata *pd)
 {
        struct ipath_devdata *dd = pd->port_dd;
-       unsigned e, egrcnt, alloced, egrperchunk, chunk, egrsize, egroff;
+       unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
        size_t size;
        int ret;
        gfp_t gfp_flags;
@@ -722,31 +864,18 @@ static int ipath_create_user_egr(struct ipath_portdata *pd)
        ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
                   "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
 
-       /*
-        * to avoid wasting a lot of memory, we allocate 32KB chunks of
-        * physically contiguous memory, advance through it until used up
-        * and then allocate more.  Of course, we need memory to store those
-        * extra pointers, now.  Started out with 256KB, but under heavy
-        * memory pressure (creating large files and then copying them over
-        * NFS while doing lots of MPI jobs), we hit some allocation
-        * failures, even though we can sleep...  (2.6.10) Still get
-        * failures at 64K.  32K is the lowest we can go without wasting
-        * additional memory.
-        */
-       size = 0x8000;
-       alloced = ALIGN(egrsize * egrcnt, size);
-       egrperchunk = size / egrsize;
-       chunk = (egrcnt + egrperchunk - 1) / egrperchunk;
-       pd->port_rcvegrbuf_chunks = chunk;
-       pd->port_rcvegrbufs_perchunk = egrperchunk;
-       pd->port_rcvegrbuf_size = size;
-       pd->port_rcvegrbuf = vmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]));
+       chunk = pd->port_rcvegrbuf_chunks;
+       egrperchunk = pd->port_rcvegrbufs_perchunk;
+       size = pd->port_rcvegrbuf_size;
+       pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
+                                    GFP_KERNEL);
        if (!pd->port_rcvegrbuf) {
                ret = -ENOMEM;
                goto bail;
        }
        pd->port_rcvegrbuf_phys =
-               vmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]));
+               kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
+                       GFP_KERNEL);
        if (!pd->port_rcvegrbuf_phys) {
                ret = -ENOMEM;
                goto bail_rcvegrbuf;
@@ -791,105 +920,23 @@ bail_rcvegrbuf_phys:
                                  pd->port_rcvegrbuf_phys[e]);
 
        }
-       vfree(pd->port_rcvegrbuf_phys);
+       kfree(pd->port_rcvegrbuf_phys);
        pd->port_rcvegrbuf_phys = NULL;
 bail_rcvegrbuf:
-       vfree(pd->port_rcvegrbuf);
+       kfree(pd->port_rcvegrbuf);
        pd->port_rcvegrbuf = NULL;
 bail:
        return ret;
 }
 
-static int ipath_do_user_init(struct ipath_portdata *pd,
-                             const struct ipath_user_info *uinfo)
-{
-       int ret = 0;
-       struct ipath_devdata *dd = pd->port_dd;
-       u32 head32;
-
-       /* for now, if major version is different, bail */
-       if ((uinfo->spu_userversion >> 16) != IPATH_USER_SWMAJOR) {
-               dev_info(&dd->pcidev->dev,
-                        "User major version %d not same as driver "
-                        "major %d\n", uinfo->spu_userversion >> 16,
-                        IPATH_USER_SWMAJOR);
-               ret = -ENODEV;
-               goto done;
-       }
-
-       if ((uinfo->spu_userversion & 0xffff) != IPATH_USER_SWMINOR)
-               ipath_dbg("User minor version %d not same as driver "
-                         "minor %d\n", uinfo->spu_userversion & 0xffff,
-                         IPATH_USER_SWMINOR);
-
-       if (uinfo->spu_rcvhdrsize) {
-               ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
-               if (ret)
-                       goto done;
-       }
-
-       /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
-
-       /* for right now, kernel piobufs are at end, so port 1 is at 0 */
-       pd->port_piobufs = dd->ipath_piobufbase +
-               dd->ipath_pbufsport * (pd->port_port -
-                                      1) * dd->ipath_palign;
-       ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n",
-                  pd->port_port, pd->port_piobufs);
-
-       /*
-        * Now allocate the rcvhdr Q and eager TIDs; skip the TID
-        * array for time being.  If pd->port_port > chip-supported,
-        * we need to do extra stuff here to handle by handling overflow
-        * through port 0, someday
-        */
-       ret = ipath_create_rcvhdrq(dd, pd);
-       if (!ret)
-               ret = ipath_create_user_egr(pd);
-       if (ret)
-               goto done;
-
-       /*
-        * set the eager head register for this port to the current values
-        * of the tail pointers, since we don't know if they were
-        * updated on last use of the port.
-        */
-       head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
-       ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
-       dd->ipath_lastegrheads[pd->port_port] = -1;
-       dd->ipath_lastrcvhdrqtails[pd->port_port] = -1;
-       ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
-               pd->port_port, head32);
-       pd->port_tidcursor = 0; /* start at beginning after open */
-       /*
-        * now enable the port; the tail registers will be written to memory
-        * by the chip as soon as it sees the write to
-        * dd->ipath_kregs->kr_rcvctrl.  The update only happens on
-        * transition from 0 to 1, so clear it first, then set it as part of
-        * enabling the port.  This will (very briefly) affect any other
-        * open ports, but it shouldn't be long enough to be an issue.
-        * We explictly set the in-memory copy to 0 beforehand, so we don't
-        * have to wait to be sure the DMA update has happened.
-        */
-       *pd->port_rcvhdrtail_kvaddr = 0ULL;
-       set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
-               &dd->ipath_rcvctrl);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-done:
-       return ret;
-}
-
 
 /* common code for the mappings on dma_alloc_coherent mem */
 static int ipath_mmap_mem(struct vm_area_struct *vma,
-                            struct ipath_portdata *pd, unsigned len,
-                            int write_ok, dma_addr_t addr, char *what)
+       struct ipath_portdata *pd, unsigned len, int write_ok,
+       void *kvaddr, char *what)
 {
        struct ipath_devdata *dd = pd->port_dd;
-       unsigned pfn = (unsigned long)addr >> PAGE_SHIFT;
+       unsigned long pfn;
        int ret;
 
        if ((vma->vm_end - vma->vm_start) > len) {
@@ -912,17 +959,17 @@ static int ipath_mmap_mem(struct vm_area_struct *vma,
                vma->vm_flags &= ~VM_MAYWRITE;
        }
 
+       pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
        ret = remap_pfn_range(vma, vma->vm_start, pfn,
                              len, vma->vm_page_prot);
        if (ret)
-               dev_info(&dd->pcidev->dev,
-                        "%s port%u mmap of %lx, %x bytes r%c failed: %d\n",
-                        what, pd->port_port, (unsigned long)addr, len,
-                        write_ok?'w':'o', ret);
+               dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
+                        "bytes r%c failed: %d\n", what, pd->port_port,
+                        pfn, len, write_ok?'w':'o', ret);
        else
-               ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes r%c\n",
-                       what, pd->port_port, (unsigned long)addr, len,
-                        write_ok?'w':'o');
+               ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
+                          "r%c\n", what, pd->port_port, pfn, len,
+                          write_ok?'w':'o');
 bail:
        return ret;
 }
@@ -957,7 +1004,8 @@ static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
 
 static int mmap_piobufs(struct vm_area_struct *vma,
                        struct ipath_devdata *dd,
-                       struct ipath_portdata *pd)
+                       struct ipath_portdata *pd,
+                       unsigned piobufs, unsigned piocnt)
 {
        unsigned long phys;
        int ret;
@@ -968,16 +1016,15 @@ static int mmap_piobufs(struct vm_area_struct *vma,
         * process data, and catches users who might try to read the i/o
         * space due to a bug.
         */
-       if ((vma->vm_end - vma->vm_start) >
-           (dd->ipath_pbufsport * dd->ipath_palign)) {
+       if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
                dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
                         "reqlen %lx > PAGE\n",
                         vma->vm_end - vma->vm_start);
-               ret = -EFAULT;
+               ret = -EINVAL;
                goto bail;
        }
 
-       phys = dd->ipath_physaddr + pd->port_piobufs;
+       phys = dd->ipath_physaddr + piobufs;
 
        /*
         * Don't mark this as non-cached, or we don't get the
@@ -1011,7 +1058,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma,
        struct ipath_devdata *dd = pd->port_dd;
        unsigned long start, size;
        size_t total_size, i;
-       dma_addr_t *phys;
+       unsigned long pfn;
        int ret;
 
        size = pd->port_rcvegrbuf_size;
@@ -1021,7 +1068,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma,
                         "reqlen %lx > actual %lx\n",
                         vma->vm_end - vma->vm_start,
                         (unsigned long) total_size);
-               ret = -EFAULT;
+               ret = -EINVAL;
                goto bail;
        }
 
@@ -1035,11 +1082,11 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma,
        vma->vm_flags &= ~VM_MAYWRITE;
 
        start = vma->vm_start;
-       phys = pd->port_rcvegrbuf_phys;
 
        for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
-               ret = remap_pfn_range(vma, start, phys[i] >> PAGE_SHIFT,
-                                     size, vma->vm_page_prot);
+               pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
+               ret = remap_pfn_range(vma, start, pfn, size,
+                                     vma->vm_page_prot);
                if (ret < 0)
                        goto bail;
        }
@@ -1049,6 +1096,122 @@ bail:
        return ret;
 }
 
+/*
+ * ipath_file_vma_nopage - handle a VMA page fault.
+ */
+static struct page *ipath_file_vma_nopage(struct vm_area_struct *vma,
+                                         unsigned long address, int *type)
+{
+       unsigned long offset = address - vma->vm_start;
+       struct page *page = NOPAGE_SIGBUS;
+       void *pageptr;
+
+       /*
+        * Convert the vmalloc address into a struct page.
+        */
+       pageptr = (void *)(offset + (vma->vm_pgoff << PAGE_SHIFT));
+       page = vmalloc_to_page(pageptr);
+       if (!page)
+               goto out;
+
+       /* Increment the reference count. */
+       get_page(page);
+       if (type)
+               *type = VM_FAULT_MINOR;
+out:
+       return page;
+}
+
+static struct vm_operations_struct ipath_file_vm_ops = {
+       .nopage = ipath_file_vma_nopage,
+};
+
+static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
+                      struct ipath_portdata *pd, unsigned subport)
+{
+       unsigned long len;
+       struct ipath_devdata *dd;
+       void *addr;
+       size_t size;
+       int ret;
+
+       /* If the port is not shared, all addresses should be physical */
+       if (!pd->port_subport_cnt) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       dd = pd->port_dd;
+       size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+
+       /*
+        * Master has all the slave uregbase, rcvhdrq, and
+        * rcvegrbufs mmapped.
+        */
+       if (subport == 0) {
+               unsigned num_slaves = pd->port_subport_cnt - 1;
+
+               if (pgaddr == ((u64) pd->subport_uregbase & MMAP64_MASK)) {
+                       addr = pd->subport_uregbase;
+                       size = PAGE_SIZE * num_slaves;
+               } else if (pgaddr == ((u64) pd->subport_rcvhdr_base &
+                                     MMAP64_MASK)) {
+                       addr = pd->subport_rcvhdr_base;
+                       size = pd->port_rcvhdrq_size * num_slaves;
+               } else if (pgaddr == ((u64) pd->subport_rcvegrbuf &
+                                     MMAP64_MASK)) {
+                       addr = pd->subport_rcvegrbuf;
+                       size *= num_slaves;
+               } else {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+       } else if (pgaddr == (((u64) pd->subport_uregbase +
+                              PAGE_SIZE * (subport - 1)) & MMAP64_MASK)) {
+               addr = pd->subport_uregbase + PAGE_SIZE * (subport - 1);
+               size = PAGE_SIZE;
+       } else if (pgaddr == (((u64) pd->subport_rcvhdr_base +
+                              pd->port_rcvhdrq_size * (subport - 1)) &
+                             MMAP64_MASK)) {
+               addr = pd->subport_rcvhdr_base +
+                       pd->port_rcvhdrq_size * (subport - 1);
+               size = pd->port_rcvhdrq_size;
+       } else if (pgaddr == (((u64) pd->subport_rcvegrbuf +
+                              size * (subport - 1)) & MMAP64_MASK)) {
+               addr = pd->subport_rcvegrbuf + size * (subport - 1);
+               /* rcvegrbufs are read-only on the slave */
+               if (vma->vm_flags & VM_WRITE) {
+                       dev_info(&dd->pcidev->dev,
+                                "Can't map eager buffers as "
+                                "writable (flags=%lx)\n", vma->vm_flags);
+                       ret = -EPERM;
+                       goto bail;
+               }
+               /*
+                * Don't allow permission to later change to writeable
+                * with mprotect.
+                */
+               vma->vm_flags &= ~VM_MAYWRITE;
+       } else {
+               ret = -EINVAL;
+               goto bail;
+       }
+       len = vma->vm_end - vma->vm_start;
+       if (len > size) {
+               ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
+       vma->vm_ops = &ipath_file_vm_ops;
+       vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+       ret = 0;
+
+bail:
+       return ret;
+}
+
 /**
  * ipath_mmap - mmap various structures into user space
  * @fp: the file pointer
@@ -1064,73 +1227,99 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
        struct ipath_portdata *pd;
        struct ipath_devdata *dd;
        u64 pgaddr, ureg;
+       unsigned piobufs, piocnt;
        int ret;
 
        pd = port_fp(fp);
+       if (!pd) {
+               ret = -EINVAL;
+               goto bail;
+       }
        dd = pd->port_dd;
 
        /*
         * This is the ipath_do_user_init() code, mapping the shared buffers
         * into the user process. The address referred to by vm_pgoff is the
-        * virtual, not physical, address; we only do one mmap for each
-        * space mapped.
+        * file offset passed via mmap().  For shared ports, this is the
+        * kernel vmalloc() address of the pages to share with the master.
+        * For non-shared or master ports, this is a physical address.
+        * We only do one mmap for each space mapped.
         */
        pgaddr = vma->vm_pgoff << PAGE_SHIFT;
 
        /*
-        * Must fit in 40 bits for our hardware; some checked elsewhere,
-        * but we'll be paranoid.  Check for 0 is mostly in case one of the
-        * allocations failed, but user called mmap anyway.   We want to catch
-        * that before it can match.
+        * Check for 0 in case one of the allocations failed, but user
+        * called mmap anyway.
         */
-       if (!pgaddr || pgaddr >= (1ULL<<40))  {
-               ipath_dev_err(dd, "Bad phys addr %llx, start %lx, end %lx\n",
-                       (unsigned long long)pgaddr, vma->vm_start, vma->vm_end);
-               return -EINVAL;
+       if (!pgaddr)  {
+               ret = -EINVAL;
+               goto bail;
        }
 
-       /* just the offset of the port user registers, not physical addr */
-       ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
-
-       ipath_cdbg(MM, "ushare: pgaddr %llx vm_start=%lx, vmlen %lx\n",
+       ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
                   (unsigned long long) pgaddr, vma->vm_start,
-                  vma->vm_end - vma->vm_start);
+                  vma->vm_end - vma->vm_start, dd->ipath_unit,
+                  pd->port_port, subport_fp(fp));
 
-       if (vma->vm_start & (PAGE_SIZE-1)) {
-               ipath_dev_err(dd,
-                       "vm_start not aligned: %lx, end=%lx phys %lx\n",
-                       vma->vm_start, vma->vm_end, (unsigned long)pgaddr);
-               ret = -EINVAL;
+       /*
+        * Physical addresses must fit in 40 bits for our hardware.
+        * Check for kernel virtual addresses first, anything else must
+        * match a HW or memory address.
+        */
+       if (pgaddr >= (1ULL<<40)) {
+               ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
+               goto bail;
        }
-       else if (pgaddr == ureg)
+
+       if (!pd->port_subport_cnt) {
+               /* port is not shared */
+               ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
+               piocnt = dd->ipath_pbufsport;
+               piobufs = pd->port_piobufs;
+       } else if (!subport_fp(fp)) {
+               /* caller is the master */
+               ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
+               piocnt = (dd->ipath_pbufsport / pd->port_subport_cnt) +
+                        (dd->ipath_pbufsport % pd->port_subport_cnt);
+               piobufs = pd->port_piobufs +
+                       dd->ipath_palign * (dd->ipath_pbufsport - piocnt);
+       } else {
+               unsigned slave = subport_fp(fp) - 1;
+
+               /* caller is a slave */
+               ureg = 0;
+               piocnt = dd->ipath_pbufsport / pd->port_subport_cnt;
+               piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
+       }
+
+       if (pgaddr == ureg)
                ret = mmap_ureg(vma, dd, ureg);
-       else if (pgaddr == pd->port_piobufs)
-               ret = mmap_piobufs(vma, dd, pd);
-       else if (pgaddr == (u64) pd->port_rcvegr_phys)
+       else if (pgaddr == piobufs)
+               ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
+       else if (pgaddr == dd->ipath_pioavailregs_phys)
+               /* in-memory copy of pioavail registers */
+               ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+                                    (void *) dd->ipath_pioavailregs_dma,
+                                    "pioavail registers");
+       else if (subport_fp(fp))
+               /* Subports don't mmap the physical receive buffers */
+               ret = -EINVAL;
+       else if (pgaddr == pd->port_rcvegr_phys)
                ret = mmap_rcvegrbufs(vma, pd);
-       else if (pgaddr == (u64) pd->port_rcvhdrq_phys) {
+       else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
                /*
                 * The rcvhdrq itself; readonly except on HT (so have
                 * to allow writable mapping), multiple pages, contiguous
                 * from an i/o perspective.
                 */
-               unsigned total_size =
-                       ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize
-                          * sizeof(u32), PAGE_SIZE);
-               ret = ipath_mmap_mem(vma, pd, total_size, 1,
-                                    pd->port_rcvhdrq_phys,
+               ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
+                                    pd->port_rcvhdrq,
                                     "rcvhdrq");
-       }
-       else if (pgaddr == (u64)pd->port_rcvhdrqtailaddr_phys)
+       else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
                /* in-memory copy of rcvhdrq tail register */
                ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
-                                    pd->port_rcvhdrqtailaddr_phys,
+                                    pd->port_rcvhdrtail_kvaddr,
                                     "rcvhdrq tail");
-       else if (pgaddr == dd->ipath_pioavailregs_phys)
-               /* in-memory copy of pioavail registers */
-               ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
-                                    dd->ipath_pioavailregs_phys,
-                                    "pioavail registers");
        else
                ret = -EINVAL;
 
@@ -1138,9 +1327,10 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
 
        if (ret < 0)
                dev_info(&dd->pcidev->dev,
-                        "Failure %d on addr %lx, off %lx\n",
-                        -ret, vma->vm_start, vma->vm_pgoff);
-
+                        "Failure %d on off %llx len %lx\n",
+                        -ret, (unsigned long long)pgaddr,
+                        vma->vm_end - vma->vm_start);
+bail:
        return ret;
 }
 
@@ -1154,6 +1344,8 @@ static unsigned int ipath_poll(struct file *fp,
        struct ipath_devdata *dd;
 
        pd = port_fp(fp);
+       if (!pd)
+               goto bail;
        dd = pd->port_dd;
 
        bit = pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT;
@@ -1176,7 +1368,7 @@ static unsigned int ipath_poll(struct file *fp,
 
        if (tail == head) {
                set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
-               if(dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
+               if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
                        (void)ipath_write_ureg(dd, ur_rcvhdrhead,
                                               dd->ipath_rhdrhead_intr_off
                                               | head, pd->port_port);
@@ -1200,18 +1392,80 @@ static unsigned int ipath_poll(struct file *fp,
        ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
                         dd->ipath_rcvctrl);
 
+bail:
        return pollflag;
 }
 
+static int init_subports(struct ipath_devdata *dd,
+                        struct ipath_portdata *pd,
+                        const struct ipath_user_info *uinfo)
+{
+       int ret = 0;
+       unsigned num_slaves;
+       size_t size;
+
+       /* Old user binaries don't know about subports */
+       if ((uinfo->spu_userversion & 0xffff) != IPATH_USER_SWMINOR)
+               goto bail;
+       /*
+        * If the user is requesting zero or one port,
+        * skip the subport allocation.
+        */
+       if (uinfo->spu_subport_cnt <= 1)
+               goto bail;
+       if (uinfo->spu_subport_cnt > 4) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       num_slaves = uinfo->spu_subport_cnt - 1;
+       pd->subport_uregbase = vmalloc(PAGE_SIZE * num_slaves);
+       if (!pd->subport_uregbase) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       /* Note: pd->port_rcvhdrq_size isn't initialized yet. */
+       size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+                    sizeof(u32), PAGE_SIZE) * num_slaves;
+       pd->subport_rcvhdr_base = vmalloc(size);
+       if (!pd->subport_rcvhdr_base) {
+               ret = -ENOMEM;
+               goto bail_ureg;
+       }
+
+       pd->subport_rcvegrbuf = vmalloc(pd->port_rcvegrbuf_chunks *
+                                       pd->port_rcvegrbuf_size *
+                                       num_slaves);
+       if (!pd->subport_rcvegrbuf) {
+               ret = -ENOMEM;
+               goto bail_rhdr;
+       }
+
+       pd->port_subport_cnt = uinfo->spu_subport_cnt;
+       pd->port_subport_id = uinfo->spu_subport_id;
+       pd->active_slaves = 1;
+       goto bail;
+
+bail_rhdr:
+       vfree(pd->subport_rcvhdr_base);
+bail_ureg:
+       vfree(pd->subport_uregbase);
+       pd->subport_uregbase = NULL;
+bail:
+       return ret;
+}
+
 static int try_alloc_port(struct ipath_devdata *dd, int port,
-                         struct file *fp)
+                         struct file *fp,
+                         const struct ipath_user_info *uinfo)
 {
+       struct ipath_portdata *pd;
        int ret;
 
-       if (!dd->ipath_pd[port]) {
-               void *p, *ptmp;
+       if (!(pd = dd->ipath_pd[port])) {
+               void *ptmp;
 
-               p = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
+               pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
 
                /*
                 * Allocate memory for use in ipath_tid_update() just once
@@ -1221,34 +1475,36 @@ static int try_alloc_port(struct ipath_devdata *dd, int port,
                ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
                               dd->ipath_rcvtidcnt * sizeof(struct page **),
                               GFP_KERNEL);
-               if (!p || !ptmp) {
+               if (!pd || !ptmp) {
                        ipath_dev_err(dd, "Unable to allocate portdata "
                                      "memory, failing open\n");
                        ret = -ENOMEM;
-                       kfree(p);
+                       kfree(pd);
                        kfree(ptmp);
                        goto bail;
                }
-               dd->ipath_pd[port] = p;
+               dd->ipath_pd[port] = pd;
                dd->ipath_pd[port]->port_port = port;
                dd->ipath_pd[port]->port_dd = dd;
                dd->ipath_pd[port]->port_tid_pg_list = ptmp;
                init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
        }
-       if (!dd->ipath_pd[port]->port_cnt) {
-               dd->ipath_pd[port]->port_cnt = 1;
-               fp->private_data = (void *) dd->ipath_pd[port];
+       if (!pd->port_cnt) {
+               pd->userversion = uinfo->spu_userversion;
+               init_user_egr_sizes(pd);
+               if ((ret = init_subports(dd, pd, uinfo)) != 0)
+                       goto bail;
                ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
                           current->comm, current->pid, dd->ipath_unit,
                           port);
-               dd->ipath_pd[port]->port_pid = current->pid;
-               strncpy(dd->ipath_pd[port]->port_comm, current->comm,
-                       sizeof(dd->ipath_pd[port]->port_comm));
+               pd->port_cnt = 1;
+               port_fp(fp) = pd;
+               pd->port_pid = current->pid;
+               strncpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
                ipath_stats.sps_ports++;
                ret = 0;
-               goto bail;
-       }
-       ret = -EBUSY;
+       } else
+               ret = -EBUSY;
 
 bail:
        return ret;
@@ -1264,7 +1520,8 @@ static inline int usable(struct ipath_devdata *dd)
                                     | IPATH_LINKUNK));
 }
 
-static int find_free_port(int unit, struct file *fp)
+static int find_free_port(int unit, struct file *fp,
+                         const struct ipath_user_info *uinfo)
 {
        struct ipath_devdata *dd = ipath_lookup(unit);
        int ret, i;
@@ -1279,8 +1536,8 @@ static int find_free_port(int unit, struct file *fp)
                goto bail;
        }
 
-       for (i = 0; i < dd->ipath_cfgports; i++) {
-               ret = try_alloc_port(dd, i, fp);
+       for (i = 1; i < dd->ipath_cfgports; i++) {
+               ret = try_alloc_port(dd, i, fp, uinfo);
                if (ret != -EBUSY)
                        goto bail;
        }
@@ -1290,13 +1547,14 @@ bail:
        return ret;
 }
 
-static int find_best_unit(struct file *fp)
+static int find_best_unit(struct file *fp,
+                         const struct ipath_user_info *uinfo)
 {
        int ret = 0, i, prefunit = -1, devmax;
        int maxofallports, npresent, nup;
        int ndev;
 
-       (void) ipath_count_units(&npresent, &nup, &maxofallports);
+       devmax = ipath_count_units(&npresent, &nup, &maxofallports);
 
        /*
         * This code is present to allow a knowledgeable person to
@@ -1343,8 +1601,6 @@ static int find_best_unit(struct file *fp)
 
        if (prefunit != -1)
                devmax = prefunit + 1;
-       else
-               devmax = ipath_count_units(NULL, NULL, NULL);
 recheck:
        for (i = 1; i < maxofallports; i++) {
                for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
@@ -1359,7 +1615,7 @@ recheck:
                                 * next.
                                 */
                                continue;
-                       ret = try_alloc_port(dd, i, fp);
+                       ret = try_alloc_port(dd, i, fp, uinfo);
                        if (!ret)
                                goto done;
                }
@@ -1395,22 +1651,183 @@ done:
        return ret;
 }
 
+static int find_shared_port(struct file *fp,
+                           const struct ipath_user_info *uinfo)
+{
+       int devmax, ndev, i;
+       int ret = 0;
+
+       devmax = ipath_count_units(NULL, NULL, NULL);
+
+       for (ndev = 0; ndev < devmax; ndev++) {
+               struct ipath_devdata *dd = ipath_lookup(ndev);
+
+               if (!dd)
+                       continue;
+               for (i = 1; i < dd->ipath_cfgports; i++) {
+                       struct ipath_portdata *pd = dd->ipath_pd[i];
+
+                       /* Skip ports which are not yet open */
+                       if (!pd || !pd->port_cnt)
+                               continue;
+                       /* Skip port if it doesn't match the requested one */
+                       if (pd->port_subport_id != uinfo->spu_subport_id)
+                               continue;
+                       /* Verify the sharing process matches the master */
+                       if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
+                           pd->userversion != uinfo->spu_userversion ||
+                           pd->port_cnt >= pd->port_subport_cnt) {
+                               ret = -EINVAL;
+                               goto done;
+                       }
+                       port_fp(fp) = pd;
+                       subport_fp(fp) = pd->port_cnt++;
+                       tidcursor_fp(fp) = 0;
+                       pd->active_slaves |= 1 << subport_fp(fp);
+                       ipath_cdbg(PROC,
+                                  "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
+                                  current->comm, current->pid,
+                                  subport_fp(fp),
+                                  pd->port_comm, pd->port_pid,
+                                  dd->ipath_unit, pd->port_port);
+                       ret = 1;
+                       goto done;
+               }
+       }
+
+done:
+       return ret;
+}
+
 static int ipath_open(struct inode *in, struct file *fp)
 {
-       int ret, user_minor;
+       /* The real work is performed later in ipath_assign_port() */
+       fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
+       return fp->private_data ? 0 : -ENOMEM;
+}
+
+
+/* Get port early, so can set affinity prior to memory allocation */
+static int ipath_assign_port(struct file *fp,
+                             const struct ipath_user_info *uinfo)
+{
+       int ret;
+       int i_minor;
+       unsigned swminor;
+
+       /* Check to be sure we haven't already initialized this file */
+       if (port_fp(fp)) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       /* for now, if major version is different, bail */
+       if ((uinfo->spu_userversion >> 16) != IPATH_USER_SWMAJOR) {
+               ipath_dbg("User major version %d not same as driver "
+                         "major %d\n", uinfo->spu_userversion >> 16,
+                         IPATH_USER_SWMAJOR);
+               ret = -ENODEV;
+               goto done;
+       }
+
+       swminor = uinfo->spu_userversion & 0xffff;
+       if (swminor != IPATH_USER_SWMINOR)
+               ipath_dbg("User minor version %d not same as driver "
+                         "minor %d\n", swminor, IPATH_USER_SWMINOR);
 
        mutex_lock(&ipath_mutex);
 
-       user_minor = iminor(in) - IPATH_USER_MINOR_BASE;
+       if (swminor == IPATH_USER_SWMINOR && uinfo->spu_subport_cnt &&
+           (ret = find_shared_port(fp, uinfo))) {
+               mutex_unlock(&ipath_mutex);
+               if (ret > 0)
+                       ret = 0;
+               goto done;
+       }
+
+       i_minor = iminor(fp->f_dentry->d_inode) - IPATH_USER_MINOR_BASE;
        ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
-                  (long)in->i_rdev, user_minor);
+                  (long)fp->f_dentry->d_inode->i_rdev, i_minor);
 
-       if (user_minor)
-               ret = find_free_port(user_minor - 1, fp);
+       if (i_minor)
+               ret = find_free_port(i_minor - 1, fp, uinfo);
        else
-               ret = find_best_unit(fp);
+               ret = find_best_unit(fp, uinfo);
 
        mutex_unlock(&ipath_mutex);
+
+done:
+       return ret;
+}
+
+
+static int ipath_do_user_init(struct file *fp,
+                             const struct ipath_user_info *uinfo)
+{
+       int ret;
+       struct ipath_portdata *pd;
+       struct ipath_devdata *dd;
+       u32 head32;
+
+       pd = port_fp(fp);
+       dd = pd->port_dd;
+
+       if (uinfo->spu_rcvhdrsize) {
+               ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
+               if (ret)
+                       goto done;
+       }
+
+       /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
+
+       /* for right now, kernel piobufs are at end, so port 1 is at 0 */
+       pd->port_piobufs = dd->ipath_piobufbase +
+               dd->ipath_pbufsport * (pd->port_port - 1) * dd->ipath_palign;
+       ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n",
+                  pd->port_port, pd->port_piobufs);
+
+       /*
+        * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+        * array for time being.  If pd->port_port > chip-supported,
+        * we need to do extra stuff here to handle by handling overflow
+        * through port 0, someday
+        */
+       ret = ipath_create_rcvhdrq(dd, pd);
+       if (!ret)
+               ret = ipath_create_user_egr(pd);
+       if (ret)
+               goto done;
+
+       /*
+        * set the eager head register for this port to the current values
+        * of the tail pointers, since we don't know if they were
+        * updated on last use of the port.
+        */
+       head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
+       ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
+       dd->ipath_lastegrheads[pd->port_port] = -1;
+       dd->ipath_lastrcvhdrqtails[pd->port_port] = -1;
+       ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
+               pd->port_port, head32);
+       pd->port_tidcursor = 0; /* start at beginning after open */
+       /*
+        * now enable the port; the tail registers will be written to memory
+        * by the chip as soon as it sees the write to
+        * dd->ipath_kregs->kr_rcvctrl.  The update only happens on
+        * transition from 0 to 1, so clear it first, then set it as part of
+        * enabling the port.  This will (very briefly) affect any other
+        * open ports, but it shouldn't be long enough to be an issue.
+        * We explictly set the in-memory copy to 0 beforehand, so we don't
+        * have to wait to be sure the DMA update has happened.
+        */
+       *(volatile u64 *)pd->port_rcvhdrtail_kvaddr = 0ULL;
+       set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
+               &dd->ipath_rcvctrl);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+done:
        return ret;
 }
 
@@ -1433,6 +1850,8 @@ static void unlock_expected_tids(struct ipath_portdata *pd)
                if (!dd->ipath_pageshadow[i])
                        continue;
 
+               pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
+                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
                ipath_release_user_pages_on_close(&dd->ipath_pageshadow[i],
                                                  1);
                dd->ipath_pageshadow[i] = NULL;
@@ -1453,6 +1872,7 @@ static void unlock_expected_tids(struct ipath_portdata *pd)
 static int ipath_close(struct inode *in, struct file *fp)
 {
        int ret = 0;
+       struct ipath_filedata *fd;
        struct ipath_portdata *pd;
        struct ipath_devdata *dd;
        unsigned port;
@@ -1462,9 +1882,24 @@ static int ipath_close(struct inode *in, struct file *fp)
 
        mutex_lock(&ipath_mutex);
 
-       pd = port_fp(fp);
-       port = pd->port_port;
+       fd = (struct ipath_filedata *) fp->private_data;
        fp->private_data = NULL;
+       pd = fd->pd;
+       if (!pd) {
+               mutex_unlock(&ipath_mutex);
+               goto bail;
+       }
+       if (--pd->port_cnt) {
+               /*
+                * XXX If the master closes the port before the slave(s),
+                * revoke the mmap for the eager receive queue so
+                * the slave(s) don't wait for receive data forever.
+                */
+               pd->active_slaves &= ~(1 << fd->subport);
+               mutex_unlock(&ipath_mutex);
+               goto bail;
+       }
+       port = pd->port_port;
        dd = pd->port_dd;
 
        if (pd->port_hdrqfull) {
@@ -1503,8 +1938,6 @@ static int ipath_close(struct inode *in, struct file *fp)
 
                /* clean up the pkeys for this port user */
                ipath_clean_part_key(pd, dd);
-
-
                /*
                 * be paranoid, and never write 0's to these, just use an
                 * unused part of the port 0 tail page.  Of course,
@@ -1523,39 +1956,49 @@ static int ipath_close(struct inode *in, struct file *fp)
                i = dd->ipath_pbufsport * (port - 1);
                ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport);
 
+               dd->ipath_f_clear_tids(dd, pd->port_port);
+
                if (dd->ipath_pageshadow)
                        unlock_expected_tids(pd);
                ipath_stats.sps_ports--;
                ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
                           pd->port_comm, pd->port_pid,
                           dd->ipath_unit, port);
-
-               dd->ipath_f_clear_tids(dd, pd->port_port);
        }
 
-       pd->port_cnt = 0;
        pd->port_pid = 0;
-
        dd->ipath_pd[pd->port_port] = NULL; /* before releasing mutex */
        mutex_unlock(&ipath_mutex);
        ipath_free_pddata(dd, pd); /* after releasing the mutex */
 
+bail:
+       kfree(fd);
        return ret;
 }
 
-static int ipath_port_info(struct ipath_portdata *pd,
+static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
                           struct ipath_port_info __user *uinfo)
 {
        struct ipath_port_info info;
        int nup;
        int ret;
+       size_t sz;
 
        (void) ipath_count_units(NULL, &nup, NULL);
        info.num_active = nup;
        info.unit = pd->port_dd->ipath_unit;
        info.port = pd->port_port;
+       info.subport = subport;
+       /* Don't return new fields if old library opened the port. */
+       if ((pd->userversion & 0xffff) == IPATH_USER_SWMINOR) {
+               /* Number of user ports available for this device. */
+               info.num_ports = pd->port_dd->ipath_cfgports - 1;
+               info.num_subports = pd->port_subport_cnt;
+               sz = sizeof(info);
+       } else
+               sz = sizeof(info) - 2 * sizeof(u16);
 
-       if (copy_to_user(uinfo, &info, sizeof(info))) {
+       if (copy_to_user(uinfo, &info, sz)) {
                ret = -EFAULT;
                goto bail;
        }
@@ -1565,6 +2008,16 @@ bail:
        return ret;
 }
 
+static int ipath_get_slave_info(struct ipath_portdata *pd,
+                               void __user *slave_mask_addr)
+{
+       int ret = 0;
+
+       if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
+               ret = -EFAULT;
+       return ret;
+}
+
 static ssize_t ipath_write(struct file *fp, const char __user *data,
                           size_t count, loff_t *off)
 {
@@ -1591,6 +2044,8 @@ static ssize_t ipath_write(struct file *fp, const char __user *data,
        consumed = sizeof(cmd.type);
 
        switch (cmd.type) {
+       case IPATH_CMD_ASSIGN_PORT:
+       case __IPATH_CMD_USER_INIT:
        case IPATH_CMD_USER_INIT:
                copy = sizeof(cmd.cmd.user_info);
                dest = &cmd.cmd.user_info;
@@ -1617,6 +2072,11 @@ static ssize_t ipath_write(struct file *fp, const char __user *data,
                dest = &cmd.cmd.part_key;
                src = &ucmd->cmd.part_key;
                break;
+       case IPATH_CMD_SLAVE_INFO:
+               copy = sizeof(cmd.cmd.slave_mask_addr);
+               dest = &cmd.cmd.slave_mask_addr;
+               src = &ucmd->cmd.slave_mask_addr;
+               break;
        default:
                ret = -EINVAL;
                goto bail;
@@ -1634,34 +2094,55 @@ static ssize_t ipath_write(struct file *fp, const char __user *data,
 
        consumed += copy;
        pd = port_fp(fp);
+       if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
+               cmd.type != IPATH_CMD_ASSIGN_PORT) {
+               ret = -EINVAL;
+               goto bail;
+       }
 
        switch (cmd.type) {
+       case IPATH_CMD_ASSIGN_PORT:
+               ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+               if (ret)
+                       goto bail;
+               break;
+       case __IPATH_CMD_USER_INIT:
+               /* backwards compatibility, get port first */
+               ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+               if (ret)
+                       goto bail;
+               /* and fall through to current version. */
        case IPATH_CMD_USER_INIT:
-               ret = ipath_do_user_init(pd, &cmd.cmd.user_info);
-               if (ret < 0)
+               ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
+               if (ret)
                        goto bail;
                ret = ipath_get_base_info(
-                       pd, (void __user *) (unsigned long)
+                       fp, (void __user *) (unsigned long)
                        cmd.cmd.user_info.spu_base_info,
                        cmd.cmd.user_info.spu_base_info_size);
                break;
        case IPATH_CMD_RECV_CTRL:
-               ret = ipath_manage_rcvq(pd, cmd.cmd.recv_ctrl);
+               ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
                break;
        case IPATH_CMD_PORT_INFO:
-               ret = ipath_port_info(pd,
+               ret = ipath_port_info(pd, subport_fp(fp),
                                      (struct ipath_port_info __user *)
                                      (unsigned long) cmd.cmd.port_info);
                break;
        case IPATH_CMD_TID_UPDATE:
-               ret = ipath_tid_update(pd, &cmd.cmd.tid_info);
+               ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
                break;
        case IPATH_CMD_TID_FREE:
-               ret = ipath_tid_free(pd, &cmd.cmd.tid_info);
+               ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
                break;
        case IPATH_CMD_SET_PART_KEY:
                ret = ipath_set_part_key(pd, cmd.cmd.part_key);
                break;
+       case IPATH_CMD_SLAVE_INFO:
+               ret = ipath_get_slave_info(pd,
+                                          (void __user *) (unsigned long)
+                                          cmd.cmd.slave_mask_addr);
+               break;
        }
 
        if (ret >= 0)
@@ -1858,4 +2339,3 @@ void ipath_user_remove(struct ipath_devdata *dd)
 bail:
        return;
 }
-
index c8a8af0..a507d0b 100644 (file)
@@ -356,19 +356,16 @@ static ssize_t flash_write(struct file *file, const char __user *buf,
 
        pos = *ppos;
 
-       if ( pos < 0) {
+       if (pos != 0) {
                ret = -EINVAL;
                goto bail;
        }
 
-       if (pos >= sizeof(struct ipath_flash)) {
-               ret = 0;
+       if (count != sizeof(struct ipath_flash)) {
+               ret = -EINVAL;
                goto bail;
        }
 
-       if (count > sizeof(struct ipath_flash) - pos)
-               count = sizeof(struct ipath_flash) - pos;
-
        tmp = kmalloc(count, GFP_KERNEL);
        if (!tmp) {
                ret = -ENOMEM;
index 5c9b509..9e4e8d4 100644 (file)
@@ -252,8 +252,8 @@ static const struct ipath_cregs ipath_ht_cregs = {
 };
 
 /* kr_intstatus, kr_intclear, kr_intmask bits */
-#define INFINIPATH_I_RCVURG_MASK 0x1FF
-#define INFINIPATH_I_RCVAVAIL_MASK 0x1FF
+#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
 
 /* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
 #define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
@@ -338,7 +338,7 @@ static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
        if (crcbits) {
                u16 ctrl0, ctrl1;
                snprintf(bitsmsg, sizeof bitsmsg,
-                        "[HT%s lane %s CRC (%llx); ignore till reload]",
+                        "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
                         !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
                         "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
                                    ? "1 (B)" : "0+1 (A+B)"),
@@ -389,17 +389,28 @@ static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
                                     _IPATH_HTLINK1_CRCBITS)));
 }
 
+/* 6110 specific hardware errors... */
+static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
+       INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
+       INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
+       INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
+       INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
+       INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
+       INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
+       INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
+       INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
+};
+
 /**
- * ipath_ht_handle_hwerrors - display hardware errors
+ * ipath_ht_handle_hwerrors - display hardware errors.
  * @dd: the infinipath device
  * @msg: the output buffer
  * @msgl: the size of the output buffer
  *
- * Use same msg buffer as regular errors to avoid
- * excessive stack use.  Most hardware errors are catastrophic, but for
- * right now, we'll print them and continue.
- * We reuse the same message buffer as ipath_handle_errors() to avoid
- * excessive stack usage.
+ * Use same msg buffer as regular errors to avoid excessive stack
+ * use.  Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue.  We reuse the same message buffer as
+ * ipath_handle_errors() to avoid excessive stack usage.
  */
 static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                                     size_t msgl)
@@ -440,19 +451,49 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
         * make sure we get this much out, unless told to be quiet,
         * or it's occurred within the last 5 seconds
         */
-       if ((hwerrs & ~dd->ipath_lasthwerror) ||
+       if ((hwerrs & ~(dd->ipath_lasthwerror |
+                       ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                         INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                       << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
            (ipath_debug & __IPATH_VERBDBG))
                dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
                         "(cleared)\n", (unsigned long long) hwerrs);
        dd->ipath_lasthwerror |= hwerrs;
 
-       if (hwerrs & ~infinipath_hwe_bitsextant)
+       if (hwerrs & ~dd->ipath_hwe_bitsextant)
                ipath_dev_err(dd, "hwerror interrupt with unknown errors "
                              "%llx set\n", (unsigned long long)
-                             (hwerrs & ~infinipath_hwe_bitsextant));
+                             (hwerrs & ~dd->ipath_hwe_bitsextant));
 
        ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
        if (ctrl & INFINIPATH_C_FREEZEMODE) {
+               /*
+                * parity errors in send memory are recoverable,
+                * just cancel the send (if indicated in * sendbuffererror),
+                * count the occurrence, unfreeze (if no other handled
+                * hardware error bits are set), and continue. They can
+                * occur if a processor speculative read is done to the PIO
+                * buffer while we are sending a packet, for example.
+                */
+               if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                              INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                             << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
+                       ipath_stats.sps_txeparity++;
+                       ipath_dbg("Recovering from TXE parity error (%llu), "
+                                 "hwerrstatus=%llx\n",
+                                 (unsigned long long) ipath_stats.sps_txeparity,
+                                 (unsigned long long) hwerrs);
+                       ipath_disarm_senderrbufs(dd);
+                       hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                                    INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                                   << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
+                       if (!hwerrs) { /* else leave in freeze mode */
+                               ipath_write_kreg(dd,
+                                                dd->ipath_kregs->kr_control,
+                                                dd->ipath_control);
+                               return;
+                       }
+               }
                if (hwerrs) {
                        /*
                         * if any set that we aren't ignoring; only
@@ -499,44 +540,16 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                         bits);
                strlcat(msg, bitsmsg, msgl);
        }
-       if (hwerrs & (INFINIPATH_HWE_RXEMEMPARITYERR_MASK
-                     << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)) {
-               bits = (u32) ((hwerrs >>
-                              INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) &
-                             INFINIPATH_HWE_RXEMEMPARITYERR_MASK);
-               snprintf(bitsmsg, sizeof bitsmsg, "[RXE Parity Errs %x] ",
-                        bits);
-               strlcat(msg, bitsmsg, msgl);
-       }
-       if (hwerrs & (INFINIPATH_HWE_TXEMEMPARITYERR_MASK
-                     << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
-               bits = (u32) ((hwerrs >>
-                              INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) &
-                             INFINIPATH_HWE_TXEMEMPARITYERR_MASK);
-               snprintf(bitsmsg, sizeof bitsmsg, "[TXE Parity Errs %x] ",
-                        bits);
-               strlcat(msg, bitsmsg, msgl);
-       }
-       if (hwerrs & INFINIPATH_HWE_IBCBUSTOSPCPARITYERR)
-               strlcat(msg, "[IB2IPATH Parity]", msgl);
-       if (hwerrs & INFINIPATH_HWE_IBCBUSFRSPCPARITYERR)
-               strlcat(msg, "[IPATH2IB Parity]", msgl);
-       if (hwerrs & INFINIPATH_HWE_HTCBUSIREQPARITYERR)
-               strlcat(msg, "[HTC Ireq Parity]", msgl);
-       if (hwerrs & INFINIPATH_HWE_HTCBUSTREQPARITYERR)
-               strlcat(msg, "[HTC Treq Parity]", msgl);
-       if (hwerrs & INFINIPATH_HWE_HTCBUSTRESPPARITYERR)
-               strlcat(msg, "[HTC Tresp Parity]", msgl);
+
+       ipath_format_hwerrors(hwerrs,
+                             ipath_6110_hwerror_msgs,
+                             sizeof(ipath_6110_hwerror_msgs) /
+                             sizeof(ipath_6110_hwerror_msgs[0]),
+                             msg, msgl);
 
        if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
                hwerr_crcbits(dd, hwerrs, msg, msgl);
 
-       if (hwerrs & INFINIPATH_HWE_HTCMISCERR5)
-               strlcat(msg, "[HT core Misc5]", msgl);
-       if (hwerrs & INFINIPATH_HWE_HTCMISCERR6)
-               strlcat(msg, "[HT core Misc6]", msgl);
-       if (hwerrs & INFINIPATH_HWE_HTCMISCERR7)
-               strlcat(msg, "[HT core Misc7]", msgl);
        if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
                strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
                        msgl);
@@ -573,11 +586,6 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                                 dd->ipath_hwerrmask);
        }
 
-       if (hwerrs & INFINIPATH_HWE_RXDSYNCMEMPARITYERR)
-               strlcat(msg, "[Rx Dsync]", msgl);
-       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED)
-               strlcat(msg, "[SerDes PLL]", msgl);
-
        ipath_dev_err(dd, "%s hardware error\n", msg);
        if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
                /*
@@ -1080,21 +1088,21 @@ static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
        ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
 }
 
-static void ipath_init_ht_variables(void)
+static void ipath_init_ht_variables(struct ipath_devdata *dd)
 {
-       ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
-       ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
-       ipath_gpio_sda = IPATH_GPIO_SDA;
-       ipath_gpio_scl = IPATH_GPIO_SCL;
+       dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+       dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+       dd->ipath_gpio_sda = IPATH_GPIO_SDA;
+       dd->ipath_gpio_scl = IPATH_GPIO_SCL;
 
-       infinipath_i_bitsextant =
+       dd->ipath_i_bitsextant =
                (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
                (INFINIPATH_I_RCVAVAIL_MASK <<
                 INFINIPATH_I_RCVAVAIL_SHIFT) |
                INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
                INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
 
-       infinipath_e_bitsextant =
+       dd->ipath_e_bitsextant =
                INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
                INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
                INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
@@ -1112,7 +1120,7 @@ static void ipath_init_ht_variables(void)
                INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
                INFINIPATH_E_HARDWARE;
 
-       infinipath_hwe_bitsextant =
+       dd->ipath_hwe_bitsextant =
                (INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
                 INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
                (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
@@ -1141,8 +1149,8 @@ static void ipath_init_ht_variables(void)
                INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
                INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
 
-       infinipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
-       infinipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+       dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+       dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
 }
 
 /**
@@ -1607,5 +1615,5 @@ void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
         * do very early init that is needed before ipath_f_bus is
         * called
         */
-       ipath_init_ht_variables();
+       ipath_init_ht_variables(dd);
 }
index d86516d..a72ab9d 100644 (file)
@@ -263,8 +263,8 @@ static const struct ipath_cregs ipath_pe_cregs = {
 };
 
 /* kr_intstatus, kr_intclear, kr_intmask bits */
-#define INFINIPATH_I_RCVURG_MASK 0x1F
-#define INFINIPATH_I_RCVAVAIL_MASK 0x1F
+#define INFINIPATH_I_RCVURG_MASK ((1U<<5)-1)
+#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<5)-1)
 
 /* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
 #define INFINIPATH_HWE_PCIEMEMPARITYERR_MASK  0x000000000000003fULL
@@ -294,6 +294,33 @@ static const struct ipath_cregs ipath_pe_cregs = {
 #define IPATH_GPIO_SCL (1ULL << \
        (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
 
+/*
+ * Rev2 silicon allows suppressing check for ArmLaunch errors.
+ * this can speed up short packet sends on systems that do
+ * not guaranteee write-order.
+ */
+#define INFINIPATH_XGXS_SUPPRESS_ARMLAUNCH_ERR (1ULL<<63)
+
+/* 6120 specific hardware errors... */
+static const struct ipath_hwerror_msgs ipath_6120_hwerror_msgs[] = {
+       INFINIPATH_HWE_MSG(PCIEPOISONEDTLP, "PCIe Poisoned TLP"),
+       INFINIPATH_HWE_MSG(PCIECPLTIMEOUT, "PCIe completion timeout"),
+       /*
+        * In practice, it's unlikely wthat we'll see PCIe PLL, or bus
+        * parity or memory parity error failures, because most likely we
+        * won't be able to talk to the core of the chip.  Nonetheless, we
+        * might see them, if they are in parts of the PCIe core that aren't
+        * essential.
+        */
+       INFINIPATH_HWE_MSG(PCIE1PLLFAILED, "PCIePLL1"),
+       INFINIPATH_HWE_MSG(PCIE0PLLFAILED, "PCIePLL0"),
+       INFINIPATH_HWE_MSG(PCIEBUSPARITYXTLH, "PCIe XTLH core parity"),
+       INFINIPATH_HWE_MSG(PCIEBUSPARITYXADM, "PCIe ADM TX core parity"),
+       INFINIPATH_HWE_MSG(PCIEBUSPARITYRADM, "PCIe ADM RX core parity"),
+       INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
+       INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
+};
+
 /**
  * ipath_pe_handle_hwerrors - display hardware errors.
  * @dd: the infinipath device
@@ -343,19 +370,49 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
         * make sure we get this much out, unless told to be quiet,
         * or it's occurred within the last 5 seconds
         */
-       if ((hwerrs & ~dd->ipath_lasthwerror) ||
+       if ((hwerrs & ~(dd->ipath_lasthwerror |
+                       ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                         INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
            (ipath_debug & __IPATH_VERBDBG))
                dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
                         "(cleared)\n", (unsigned long long) hwerrs);
        dd->ipath_lasthwerror |= hwerrs;
 
-       if (hwerrs & ~infinipath_hwe_bitsextant)
+       if (hwerrs & ~dd->ipath_hwe_bitsextant)
                ipath_dev_err(dd, "hwerror interrupt with unknown errors "
                              "%llx set\n", (unsigned long long)
-                             (hwerrs & ~infinipath_hwe_bitsextant));
+                             (hwerrs & ~dd->ipath_hwe_bitsextant));
 
        ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
        if (ctrl & INFINIPATH_C_FREEZEMODE) {
+               /*
+                * parity errors in send memory are recoverable,
+                * just cancel the send (if indicated in * sendbuffererror),
+                * count the occurrence, unfreeze (if no other handled
+                * hardware error bits are set), and continue. They can
+                * occur if a processor speculative read is done to the PIO
+                * buffer while we are sending a packet, for example.
+                */
+               if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                              INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                             << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
+                       ipath_stats.sps_txeparity++;
+                       ipath_dbg("Recovering from TXE parity error (%llu), "
+                                 "hwerrstatus=%llx\n",
+                                 (unsigned long long) ipath_stats.sps_txeparity,
+                                 (unsigned long long) hwerrs);
+                       ipath_disarm_senderrbufs(dd);
+                       hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
+                                    INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
+                                   << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
+                       if (!hwerrs) { /* else leave in freeze mode */
+                               ipath_write_kreg(dd,
+                                                dd->ipath_kregs->kr_control,
+                                                dd->ipath_control);
+                           return;
+                       }
+               }
                if (hwerrs) {
                        /*
                         * if any set that we aren't ignoring only make the
@@ -379,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                } else {
                        ipath_dbg("Clearing freezemode on ignored hardware "
                                  "error\n");
-                       ctrl &= ~INFINIPATH_C_FREEZEMODE;
                        ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                                        ctrl);
+                                        dd->ipath_control);
                }
        }
 
@@ -396,24 +452,13 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
                                 dd->ipath_hwerrmask);
        }
-       if (hwerrs & (INFINIPATH_HWE_RXEMEMPARITYERR_MASK
-                     << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)) {
-               bits = (u32) ((hwerrs >>
-                              INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) &
-                             INFINIPATH_HWE_RXEMEMPARITYERR_MASK);
-               snprintf(bitsmsg, sizeof bitsmsg, "[RXE Parity Errs %x] ",
-                        bits);
-               strlcat(msg, bitsmsg, msgl);
-       }
-       if (hwerrs & (INFINIPATH_HWE_TXEMEMPARITYERR_MASK
-                     << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
-               bits = (u32) ((hwerrs >>
-                              INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) &
-                             INFINIPATH_HWE_TXEMEMPARITYERR_MASK);
-               snprintf(bitsmsg, sizeof bitsmsg, "[TXE Parity Errs %x] ",
-                        bits);
-               strlcat(msg, bitsmsg, msgl);
-       }
+
+       ipath_format_hwerrors(hwerrs,
+                             ipath_6120_hwerror_msgs,
+                             sizeof(ipath_6120_hwerror_msgs)/
+                             sizeof(ipath_6120_hwerror_msgs[0]),
+                             msg, msgl);
+
        if (hwerrs & (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK
                      << INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT)) {
                bits = (u32) ((hwerrs >>
@@ -423,10 +468,6 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                         "[PCIe Mem Parity Errs %x] ", bits);
                strlcat(msg, bitsmsg, msgl);
        }
-       if (hwerrs & INFINIPATH_HWE_IBCBUSTOSPCPARITYERR)
-               strlcat(msg, "[IB2IPATH Parity]", msgl);
-       if (hwerrs & INFINIPATH_HWE_IBCBUSFRSPCPARITYERR)
-               strlcat(msg, "[IPATH2IB Parity]", msgl);
 
 #define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |       \
                         INFINIPATH_HWE_COREPLL_RFSLIP )
@@ -452,34 +493,6 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                                 dd->ipath_hwerrmask);
        }
 
-       if (hwerrs & INFINIPATH_HWE_PCIEPOISONEDTLP)
-               strlcat(msg, "[PCIe Poisoned TLP]", msgl);
-       if (hwerrs & INFINIPATH_HWE_PCIECPLTIMEOUT)
-               strlcat(msg, "[PCIe completion timeout]", msgl);
-
-       /*
-        * In practice, it's unlikely wthat we'll see PCIe PLL, or bus
-        * parity or memory parity error failures, because most likely we
-        * won't be able to talk to the core of the chip.  Nonetheless, we
-        * might see them, if they are in parts of the PCIe core that aren't
-        * essential.
-        */
-       if (hwerrs & INFINIPATH_HWE_PCIE1PLLFAILED)
-               strlcat(msg, "[PCIePLL1]", msgl);
-       if (hwerrs & INFINIPATH_HWE_PCIE0PLLFAILED)
-               strlcat(msg, "[PCIePLL0]", msgl);
-       if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYXTLH)
-               strlcat(msg, "[PCIe XTLH core parity]", msgl);
-       if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYXADM)
-               strlcat(msg, "[PCIe ADM TX core parity]", msgl);
-       if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYRADM)
-               strlcat(msg, "[PCIe ADM RX core parity]", msgl);
-
-       if (hwerrs & INFINIPATH_HWE_RXDSYNCMEMPARITYERR)
-               strlcat(msg, "[Rx Dsync]", msgl);
-       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED)
-               strlcat(msg, "[SerDes PLL]", msgl);
-
        ipath_dev_err(dd, "%s hardware error\n", msg);
        if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) {
                /*
@@ -525,6 +538,9 @@ static int ipath_pe_boardname(struct ipath_devdata *dd, char *name,
        case 5:
                n = "InfiniPath_QMH7140";
                break;
+       case 6:
+               n = "InfiniPath_QLE7142";
+               break;
        default:
                ipath_dev_err(dd,
                              "Don't yet know about board with ID %u\n",
@@ -571,9 +587,12 @@ static void ipath_pe_init_hwerrors(struct ipath_devdata *dd)
        if (!dd->ipath_boardrev)        // no PLL for Emulator
                val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
 
-       /* workaround bug 9460 in internal interface bus parity checking */
-       val &= ~INFINIPATH_HWE_PCIEBUSPARITYRADM;
-
+       if (dd->ipath_minrev < 2) {
+               /* workaround bug 9460 in internal interface bus parity
+                * checking. Fixed (HW bug 9490) in Rev2.
+                */
+               val &= ~INFINIPATH_HWE_PCIEBUSPARITYRADM;
+       }
        dd->ipath_hwerrmask = val;
 }
 
@@ -583,8 +602,8 @@ static void ipath_pe_init_hwerrors(struct ipath_devdata *dd)
  */
 static int ipath_pe_bringup_serdes(struct ipath_devdata *dd)
 {
-       u64 val, tmp, config1;
-       int ret = 0, change = 0;
+       u64 val, tmp, config1, prev_val;
+       int ret = 0;
 
        ipath_dbg("Trying to bringup serdes\n");
 
@@ -641,6 +660,7 @@ static int ipath_pe_bringup_serdes(struct ipath_devdata *dd)
        val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
 
        val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+       prev_val = val;
        if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) &
             INFINIPATH_XGXS_MDIOADDR_MASK) != 3) {
                val &=
@@ -648,11 +668,9 @@ static int ipath_pe_bringup_serdes(struct ipath_devdata *dd)
                          INFINIPATH_XGXS_MDIOADDR_SHIFT);
                /* MDIO address 3 */
                val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT;
-               change = 1;
        }
        if (val & INFINIPATH_XGXS_RESET) {
                val &= ~INFINIPATH_XGXS_RESET;
-               change = 1;
        }
        if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
             INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) {
@@ -661,9 +679,19 @@ static int ipath_pe_bringup_serdes(struct ipath_devdata *dd)
                         INFINIPATH_XGXS_RX_POL_SHIFT);
                val |= dd->ipath_rx_pol_inv <<
                        INFINIPATH_XGXS_RX_POL_SHIFT;
-               change = 1;
        }
-       if (change)
+       if (dd->ipath_minrev >= 2) {
+               /* Rev 2. can tolerate multiple writes to PBC, and
+                * allowing them can provide lower latency on some
+                * CPUs, but this feature is off by default, only
+                * turned on by setting D63 of XGXSconfig reg.
+                * May want to make this conditional more
+                * fine-grained in future. This is not exactly
+                * related to XGXS, but where the bit ended up.
+                */
+               val |= INFINIPATH_XGXS_SUPPRESS_ARMLAUNCH_ERR;
+       }
+       if (val != prev_val)
                ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
 
        val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
@@ -717,9 +745,25 @@ static void ipath_pe_quiet_serdes(struct ipath_devdata *dd)
        ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
 }
 
-/* this is not yet needed on this chip, so just return 0. */
 static int ipath_pe_intconfig(struct ipath_devdata *dd)
 {
+       u64 val;
+       u32 chiprev;
+
+       /*
+        * If the chip supports added error indication via GPIO pins,
+        * enable interrupts on those bits so the interrupt routine
+        * can count the events. Also set flag so interrupt routine
+        * can know they are expected.
+        */
+       chiprev = dd->ipath_revision >> INFINIPATH_R_CHIPREVMINOR_SHIFT;
+       if ((chiprev & INFINIPATH_R_CHIPREVMINOR_MASK) > 1) {
+               /* Rev2+ reports extra errors via internal GPIO pins */
+               dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
+               val |= IPATH_GPIO_ERRINTR_MASK;
+               ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val);
+       }
        return 0;
 }
 
@@ -853,21 +897,23 @@ static int ipath_setup_pe_config(struct ipath_devdata *dd,
        return 0;
 }
 
-static void ipath_init_pe_variables(void)
+static void ipath_init_pe_variables(struct ipath_devdata *dd)
 {
        /*
         * bits for selecting i2c direction and values,
         * used for I2C serial flash
         */
-       ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
-       ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
-       ipath_gpio_sda = IPATH_GPIO_SDA;
-       ipath_gpio_scl = IPATH_GPIO_SCL;
+       dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+       dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+       dd->ipath_gpio_sda = IPATH_GPIO_SDA;
+       dd->ipath_gpio_scl = IPATH_GPIO_SCL;
 
        /* variables for sanity checking interrupt and errors */
-       infinipath_hwe_bitsextant =
+       dd->ipath_hwe_bitsextant =
                (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
                 INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+               (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
                (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK <<
                 INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) |
                INFINIPATH_HWE_PCIE1PLLFAILED |
@@ -883,13 +929,13 @@ static void ipath_init_pe_variables(void)
                INFINIPATH_HWE_SERDESPLLFAILED |
                INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
                INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
-       infinipath_i_bitsextant =
+       dd->ipath_i_bitsextant =
                (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
                (INFINIPATH_I_RCVAVAIL_MASK <<
                 INFINIPATH_I_RCVAVAIL_SHIFT) |
                INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
                INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
-       infinipath_e_bitsextant =
+       dd->ipath_e_bitsextant =
                INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
                INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
                INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
@@ -907,8 +953,8 @@ static void ipath_init_pe_variables(void)
                INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
                INFINIPATH_E_HARDWARE;
 
-       infinipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
-       infinipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+       dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+       dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
 }
 
 /* setup the MSI stuff again after a reset.  I'd like to just call
@@ -1082,6 +1128,45 @@ static void ipath_pe_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr,
        mmiowb();
        spin_unlock_irqrestore(&dd->ipath_tid_lock, flags);
 }
+/**
+ * ipath_pe_put_tid_2 - write a TID in chip, Revision 2 or higher
+ * @dd: the infinipath device
+ * @tidptr: pointer to the expected TID (in chip) to udpate
+ * @tidtype: 0 for eager, 1 for expected
+ * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for selection of the
+ * appropriate "flavor". The static calls in cleanup just use the
+ * revision-agnostic form, as they are not performance critical.
+ */
+static void ipath_pe_put_tid_2(struct ipath_devdata *dd, u64 __iomem *tidptr,
+                            u32 type, unsigned long pa)
+{
+       u32 __iomem *tidp32 = (u32 __iomem *)tidptr;
+
+       if (pa != dd->ipath_tidinvalid) {
+               if (pa & ((1U << 11) - 1)) {
+                       dev_info(&dd->pcidev->dev, "BUG: physaddr %lx "
+                                "not 2KB aligned!\n", pa);
+                       return;
+               }
+               pa >>= 11;
+               /* paranoia check */
+               if (pa & (7<<29))
+                       ipath_dev_err(dd,
+                                     "BUG: Physical page address 0x%lx "
+                                     "has bits set in 31-29\n", pa);
+
+               if (type == 0)
+                       pa |= dd->ipath_tidtemplate;
+               else /* for now, always full 4KB page */
+                       pa |= 2 << 29;
+       }
+       if (dd->ipath_kregbase)
+               writel(pa, tidp32);
+       mmiowb();
+}
+
 
 /**
  * ipath_pe_clear_tid - clear all TID entries for a port, expected and eager
@@ -1203,7 +1288,7 @@ int __attribute__((weak)) ipath_unordered_wc(void)
 
 /**
  * ipath_init_pe_get_base_info - set chip-specific flags for user code
- * @dd: the infinipath device
+ * @pd: the infinipath port
  * @kbase: ipath_base_info pointer
  *
  * We set the PCIE flag because the lower bandwidth on PCIe vs
@@ -1212,6 +1297,7 @@ int __attribute__((weak)) ipath_unordered_wc(void)
 static int ipath_pe_get_base_info(struct ipath_portdata *pd, void *kbase)
 {
        struct ipath_base_info *kinfo = kbase;
+       struct ipath_devdata *dd;
 
        if (ipath_unordered_wc()) {
                kinfo->spi_runtime_flags |= IPATH_RUNTIME_FORCE_WC_ORDER;
@@ -1220,8 +1306,20 @@ static int ipath_pe_get_base_info(struct ipath_portdata *pd, void *kbase)
        else
                ipath_cdbg(PROC, "Not Intel processor, WC ordered\n");
 
-       kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE;
+       if (pd == NULL)
+               goto done;
 
+       dd = pd->port_dd;
+
+       if (dd != NULL && dd->ipath_minrev >= 2) {
+               ipath_cdbg(PROC, "IBA6120 Rev2, allow multiple PBC write\n");
+               kinfo->spi_runtime_flags |= IPATH_RUNTIME_PBC_REWRITE;
+               ipath_cdbg(PROC, "IBA6120 Rev2, allow loose DMA alignment\n");
+               kinfo->spi_runtime_flags |= IPATH_RUNTIME_LOOSE_DMA_ALIGN;
+       }
+
+done:
+       kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE;
        return 0;
 }
 
@@ -1244,7 +1342,10 @@ void ipath_init_iba6120_funcs(struct ipath_devdata *dd)
        dd->ipath_f_quiet_serdes = ipath_pe_quiet_serdes;
        dd->ipath_f_bringup_serdes = ipath_pe_bringup_serdes;
        dd->ipath_f_clear_tids = ipath_pe_clear_tids;
-       dd->ipath_f_put_tid = ipath_pe_put_tid;
+       if (dd->ipath_minrev >= 2)
+               dd->ipath_f_put_tid = ipath_pe_put_tid_2;
+       else
+               dd->ipath_f_put_tid = ipath_pe_put_tid;
        dd->ipath_f_cleanup = ipath_setup_pe_cleanup;
        dd->ipath_f_setextled = ipath_setup_pe_setextled;
        dd->ipath_f_get_base_info = ipath_pe_get_base_info;
@@ -1259,6 +1360,6 @@ void ipath_init_iba6120_funcs(struct ipath_devdata *dd)
        dd->ipath_kregs = &ipath_pe_kregs;
        dd->ipath_cregs = &ipath_pe_cregs;
 
-       ipath_init_pe_variables();
+       ipath_init_pe_variables(dd);
 }
 
index 44669dc..d819cca 100644 (file)
@@ -88,13 +88,13 @@ MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
 static int create_port0_egr(struct ipath_devdata *dd)
 {
        unsigned e, egrcnt;
-       struct sk_buff **skbs;
+       struct ipath_skbinfo *skbinfo;
        int ret;
 
        egrcnt = dd->ipath_rcvegrcnt;
 
-       skbs = vmalloc(sizeof(*dd->ipath_port0_skbs) * egrcnt);
-       if (skbs == NULL) {
+       skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
+       if (skbinfo == NULL) {
                ipath_dev_err(dd, "allocation error for eager TID "
                              "skb array\n");
                ret = -ENOMEM;
@@ -109,13 +109,13 @@ static int create_port0_egr(struct ipath_devdata *dd)
                 * 4 bytes so that the data buffer stays word aligned.
                 * See ipath_kreceive() for more details.
                 */
-               skbs[e] = ipath_alloc_skb(dd, GFP_KERNEL);
-               if (!skbs[e]) {
+               skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
+               if (!skbinfo[e].skb) {
                        ipath_dev_err(dd, "SKB allocation error for "
                                      "eager TID %u\n", e);
                        while (e != 0)
-                               dev_kfree_skb(skbs[--e]);
-                       vfree(skbs);
+                               dev_kfree_skb(skbinfo[--e].skb);
+                       vfree(skbinfo);
                        ret = -ENOMEM;
                        goto bail;
                }
@@ -124,14 +124,17 @@ static int create_port0_egr(struct ipath_devdata *dd)
         * After loop above, so we can test non-NULL to see if ready
         * to use at receive, etc.
         */
-       dd->ipath_port0_skbs = skbs;
+       dd->ipath_port0_skbinfo = skbinfo;
 
        for (e = 0; e < egrcnt; e++) {
-               unsigned long phys =
-                       virt_to_phys(dd->ipath_port0_skbs[e]->data);
+               dd->ipath_port0_skbinfo[e].phys =
+                 ipath_map_single(dd->pcidev,
+                                  dd->ipath_port0_skbinfo[e].skb->data,
+                                  dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
                dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
                                    ((char __iomem *) dd->ipath_kregbase +
-                                    dd->ipath_rcvegrbase), 0, phys);
+                                    dd->ipath_rcvegrbase), 0,
+                                   dd->ipath_port0_skbinfo[e].phys);
        }
 
        ret = 0;
@@ -432,16 +435,33 @@ done:
  */
 static void init_shadow_tids(struct ipath_devdata *dd)
 {
-       dd->ipath_pageshadow = (struct page **)
-               vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+       struct page **pages;
+       dma_addr_t *addrs;
+
+       pages = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
                        sizeof(struct page *));
-       if (!dd->ipath_pageshadow)
+       if (!pages) {
                ipath_dev_err(dd, "failed to allocate shadow page * "
                              "array, no expected sends!\n");
-       else
-               memset(dd->ipath_pageshadow, 0,
-                      dd->ipath_cfgports * dd->ipath_rcvtidcnt *
-                      sizeof(struct page *));
+               dd->ipath_pageshadow = NULL;
+               return;
+       }
+
+       addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+                       sizeof(dma_addr_t));
+       if (!addrs) {
+               ipath_dev_err(dd, "failed to allocate shadow dma handle "
+                             "array, no expected sends!\n");
+               vfree(dd->ipath_pageshadow);
+               dd->ipath_pageshadow = NULL;
+               return;
+       }
+
+       memset(pages, 0, dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+              sizeof(struct page *));
+
+       dd->ipath_pageshadow = pages;
+       dd->ipath_physshadow = addrs;
 }
 
 static void enable_chip(struct ipath_devdata *dd,
index 49bf7bb..6bee53c 100644 (file)
 #include "ipath_verbs.h"
 #include "ipath_common.h"
 
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
+{
+       u32 piobcnt;
+       unsigned long sbuf[4];
+       /*
+        * it's possible that sendbuffererror could have bits set; might
+        * have already done this as a result of hardware error handling
+        */
+       piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+       /* read these before writing errorclear */
+       sbuf[0] = ipath_read_kreg64(
+               dd, dd->ipath_kregs->kr_sendbuffererror);
+       sbuf[1] = ipath_read_kreg64(
+               dd, dd->ipath_kregs->kr_sendbuffererror + 1);
+       if (piobcnt > 128) {
+               sbuf[2] = ipath_read_kreg64(
+                       dd, dd->ipath_kregs->kr_sendbuffererror + 2);
+               sbuf[3] = ipath_read_kreg64(
+                       dd, dd->ipath_kregs->kr_sendbuffererror + 3);
+       }
+
+       if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+               int i;
+               if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG)) {
+                       __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
+                                         "SendbufErrs %lx %lx", sbuf[0],
+                                         sbuf[1]);
+                       if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+                               printk(" %lx %lx ", sbuf[2], sbuf[3]);
+                       printk("\n");
+               }
+
+               for (i = 0; i < piobcnt; i++)
+                       if (test_bit(i, sbuf))
+                               ipath_disarm_piobufs(dd, i, 1);
+               dd->ipath_lastcancel = jiffies+3; /* no armlaunch for a bit */
+       }
+}
+
+
 /* These are all rcv-related errors which we want to count for stats */
 #define E_SUM_PKTERRS \
        (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
 
 static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
 {
-       unsigned long sbuf[4];
        u64 ignore_this_time = 0;
-       u32 piobcnt;
 
-       /* if possible that sendbuffererror could be valid */
-       piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       /* read these before writing errorclear */
-       sbuf[0] = ipath_read_kreg64(
-               dd, dd->ipath_kregs->kr_sendbuffererror);
-       sbuf[1] = ipath_read_kreg64(
-               dd, dd->ipath_kregs->kr_sendbuffererror + 1);
-       if (piobcnt > 128) {
-               sbuf[2] = ipath_read_kreg64(
-                       dd, dd->ipath_kregs->kr_sendbuffererror + 2);
-               sbuf[3] = ipath_read_kreg64(
-                       dd, dd->ipath_kregs->kr_sendbuffererror + 3);
-       }
-
-       if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
-               int i;
-
-               ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]);
-               if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
-                       printk("%lx %lx ", sbuf[2], sbuf[3]);
-               for (i = 0; i < piobcnt; i++) {
-                       if (test_bit(i, sbuf)) {
-                               u32 __iomem *piobuf;
-                               if (i < dd->ipath_piobcnt2k)
-                                       piobuf = (u32 __iomem *)
-                                               (dd->ipath_pio2kbase +
-                                                i * dd->ipath_palign);
-                               else
-                                       piobuf = (u32 __iomem *)
-                                               (dd->ipath_pio4kbase +
-                                                (i - dd->ipath_piobcnt2k) *
-                                                dd->ipath_4kalign);
-
-                               ipath_cdbg(PKT,
-                                          "PIObuf[%u] @%p pbc is %x; ",
-                                          i, piobuf, readl(piobuf));
-
-                               ipath_disarm_piobufs(dd, i, 1);
-                       }
-               }
-               if (ipath_debug & __IPATH_PKTDBG)
-                       printk("\n");
-       }
+       ipath_disarm_senderrbufs(dd);
        if ((errs & E_SUM_LINK_PKTERRS) &&
            !(dd->ipath_flags & IPATH_LINKACTIVE)) {
                /*
@@ -132,6 +132,82 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
        return ignore_this_time;
 }
 
+/* generic hw error messages... */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
+       { \
+               .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a <<    \
+                         INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ),   \
+               .msg = "TXE " #a " Memory Parity"            \
+       }
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
+       { \
+               .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a <<    \
+                         INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ),   \
+               .msg = "RXE " #a " Memory Parity"            \
+       }
+
+static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
+       INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
+       INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
+
+       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
+       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
+       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
+
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
+};
+
+/**
+ * ipath_format_hwmsg - format a single hwerror message
+ * @msg message buffer
+ * @msgl length of message buffer
+ * @hwmsg message to add to message buffer
+ */
+static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+       strlcat(msg, "[", msgl);
+       strlcat(msg, hwmsg, msgl);
+       strlcat(msg, "]", msgl);
+}
+
+/**
+ * ipath_format_hwerrors - format hardware error messages for display
+ * @hwerrs hardware errors bit vector
+ * @hwerrmsgs hardware error descriptions
+ * @nhwerrmsgs number of hwerrmsgs
+ * @msg message buffer
+ * @msgl message buffer length
+ */
+void ipath_format_hwerrors(u64 hwerrs,
+                          const struct ipath_hwerror_msgs *hwerrmsgs,
+                          size_t nhwerrmsgs,
+                          char *msg, size_t msgl)
+{
+       int i;
+       const int glen =
+           sizeof(ipath_generic_hwerror_msgs) /
+           sizeof(ipath_generic_hwerror_msgs[0]);
+
+       for (i=0; i<glen; i++) {
+               if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
+                       ipath_format_hwmsg(msg, msgl,
+                                          ipath_generic_hwerror_msgs[i].msg);
+               }
+       }
+
+       for (i=0; i<nhwerrmsgs; i++) {
+               if (hwerrs & hwerrmsgs[i].mask) {
+                       ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+               }
+       }
+}
+
 /* return the strings for the most common link states */
 static char *ib_linkstate(u32 linkstate)
 {
@@ -404,10 +480,10 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
                dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
        }
 
-       if (!noprint && (errs & ~infinipath_e_bitsextant))
+       if (!noprint && (errs & ~dd->ipath_e_bitsextant))
                ipath_dev_err(dd, "error interrupt with unknown errors "
                              "%llx set\n", (unsigned long long)
-                             (errs & ~infinipath_e_bitsextant));
+                             (errs & ~dd->ipath_e_bitsextant));
 
        if (errs & E_SUM_ERRS)
                ignore_this_time = handle_e_sum_errs(dd, errs);
@@ -478,6 +554,14 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
                        ~(INFINIPATH_E_HARDWARE |
                          INFINIPATH_E_IBSTATUSCHANGED);
        }
+
+       /* likely due to cancel, so suppress */
+       if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
+               dd->ipath_lastcancel > jiffies) {
+               ipath_dbg("Suppressed armlaunch/spktlen after error send cancel\n");
+               errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
+       }
+
        if (!errs)
                return 0;
 
@@ -529,7 +613,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
                                 * don't report same point multiple times,
                                 * except kernel
                                 */
-                               tl = (u32) * pd->port_rcvhdrtail_kvaddr;
+                               tl = *(u64 *) pd->port_rcvhdrtail_kvaddr;
                                if (tl == dd->ipath_lastrcvhdrqtails[i])
                                        continue;
                                hd = ipath_read_ureg32(dd, ur_rcvhdrhead,
@@ -729,9 +813,9 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat)
        int rcvdint = 0;
 
        portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) &
-                infinipath_i_rcvavail_mask)
+                dd->ipath_i_rcvavail_mask)
                | ((istat >> INFINIPATH_I_RCVURG_SHIFT) &
-                  infinipath_i_rcvurg_mask);
+                  dd->ipath_i_rcvurg_mask);
        for (i = 1; i < dd->ipath_cfgports; i++) {
                struct ipath_portdata *pd = dd->ipath_pd[i];
                if (portr & (1 << i) && pd && pd->port_cnt &&
@@ -808,7 +892,7 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs)
        if (oldhead != curtail) {
                if (dd->ipath_flags & IPATH_GPIO_INTR) {
                        ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
-                                        (u64) (1 << 2));
+                                        (u64) (1 << IPATH_GPIO_PORT0_BIT));
                        istat = port0rbits | INFINIPATH_I_GPIO;
                }
                else
@@ -838,10 +922,10 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs)
        if (unexpected)
                unexpected = 0;
 
-       if (unlikely(istat & ~infinipath_i_bitsextant))
+       if (unlikely(istat & ~dd->ipath_i_bitsextant))
                ipath_dev_err(dd,
                              "interrupt with unknown interrupts %x set\n",
-                             istat & (u32) ~ infinipath_i_bitsextant);
+                             istat & (u32) ~ dd->ipath_i_bitsextant);
        else
                ipath_cdbg(VERBOSE, "intr stat=0x%x\n", istat);
 
@@ -867,26 +951,80 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs)
 
        if (istat & INFINIPATH_I_GPIO) {
                /*
-                * Packets are available in the port 0 rcv queue.
-                * Eventually this needs to be generalized to check
-                * IPATH_GPIO_INTR, and the specific GPIO bit, if
-                * GPIO interrupts are used for anything else.
+                * GPIO interrupts fall in two broad classes:
+                * GPIO_2 indicates (on some HT4xx boards) that a packet
+                *        has arrived for Port 0. Checking for this
+                *        is controlled by flag IPATH_GPIO_INTR.
+                * GPIO_3..5 on IBA6120 Rev2 chips indicate errors
+                *        that we need to count. Checking for this
+                *        is controlled by flag IPATH_GPIO_ERRINTRS.
                 */
-               if (unlikely(!(dd->ipath_flags & IPATH_GPIO_INTR))) {
-                       u32 gpiostatus;
-                       gpiostatus = ipath_read_kreg32(
-                               dd, dd->ipath_kregs->kr_gpio_status);
-                       ipath_dbg("Unexpected GPIO interrupt bits %x\n",
-                                 gpiostatus);
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
-                                        gpiostatus);
+               u32 gpiostatus;
+               u32 to_clear = 0;
+
+               gpiostatus = ipath_read_kreg32(
+                       dd, dd->ipath_kregs->kr_gpio_status);
+               /* First the error-counter case.
+                */
+               if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
+                   (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
+                       /* want to clear the bits we see asserted. */
+                       to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
+
+                       /*
+                        * Count appropriately, clear bits out of our copy,
+                        * as they have been "handled".
+                        */
+                       if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
+                               ipath_dbg("FlowCtl on UnsupVL\n");
+                               dd->ipath_rxfc_unsupvl_errs++;
+                       }
+                       if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
+                               ipath_dbg("Overrun Threshold exceeded\n");
+                               dd->ipath_overrun_thresh_errs++;
+                       }
+                       if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
+                               ipath_dbg("Local Link Integrity error\n");
+                               dd->ipath_lli_errs++;
+                       }
+                       gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
                }
-               else {
-                       /* Clear GPIO status bit 2 */
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
-                                       (u64) (1 << 2));
+               /* Now the Port0 Receive case */
+               if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
+                   (dd->ipath_flags & IPATH_GPIO_INTR)) {
+                       /*
+                        * GPIO status bit 2 is set, and we expected it.
+                        * clear it and indicate in p0bits.
+                        * This probably only happens if a Port0 pkt
+                        * arrives at _just_ the wrong time, and we
+                        * handle that by seting chk0rcv;
+                        */
+                       to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
+                       gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
                        chk0rcv = 1;
                }
+               if (unlikely(gpiostatus)) {
+                       /*
+                        * Some unexpected bits remain. If they could have
+                        * caused the interrupt, complain and clear.
+                        * MEA: this is almost certainly non-ideal.
+                        * we should look into auto-disable of unexpected
+                        * GPIO interrupts, possibly on a "three strikes"
+                        * basis.
+                        */
+                       u32 mask;
+                       mask = ipath_read_kreg32(
+                               dd, dd->ipath_kregs->kr_gpio_mask);
+                       if (mask & gpiostatus) {
+                               ipath_dbg("Unexpected GPIO IRQ bits %x\n",
+                                 gpiostatus & mask);
+                               to_clear |= (gpiostatus & mask);
+                       }
+               }
+               if (to_clear) {
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
+                                       (u64) to_clear);
+               }
        }
        chk0rcv |= istat & port0rbits;
 
@@ -911,9 +1049,9 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs)
                istat &= ~port0rbits;
        }
 
-       if (istat & ((infinipath_i_rcvavail_mask <<
+       if (istat & ((dd->ipath_i_rcvavail_mask <<
                      INFINIPATH_I_RCVAVAIL_SHIFT)
-                    | (infinipath_i_rcvurg_mask <<
+                    | (dd->ipath_i_rcvurg_mask <<
                        INFINIPATH_I_RCVURG_SHIFT)))
                handle_urcv(dd, istat);
 
index a8a5627..d7540b7 100644 (file)
@@ -39,6 +39,8 @@
  */
 
 #include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <asm/io.h>
 
 #include "ipath_common.h"
@@ -62,7 +64,7 @@ struct ipath_portdata {
        /* rcvhdrq base, needs mmap before useful */
        void *port_rcvhdrq;
        /* kernel virtual address where hdrqtail is updated */
-       volatile __le64 *port_rcvhdrtail_kvaddr;
+       void *port_rcvhdrtail_kvaddr;
        /*
         * temp buffer for expected send setup, allocated at open, instead
         * of each setup call
@@ -79,8 +81,8 @@ struct ipath_portdata {
        dma_addr_t port_rcvhdrq_phys;
        dma_addr_t port_rcvhdrqtailaddr_phys;
        /*
-        * number of opens on this instance (0 or 1; ignoring forks, dup,
-        * etc. for now)
+        * number of opens (including slave subports) on this instance
+        * (ignoring forks, dup, etc. for now)
         */
        int port_cnt;
        /*
@@ -89,6 +91,10 @@ struct ipath_portdata {
         */
        /* instead of calculating it */
        unsigned port_port;
+       /* non-zero if port is being shared. */
+       u16 port_subport_cnt;
+       /* non-zero if port is being shared. */
+       u16 port_subport_id;
        /* chip offset of PIO buffers for this port */
        u32 port_piobufs;
        /* how many alloc_pages() chunks in port_rcvegrbuf_pages */
@@ -121,6 +127,16 @@ struct ipath_portdata {
        u16 port_pkeys[4];
        /* so file ops can get at unit */
        struct ipath_devdata *port_dd;
+       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+       void *subport_uregbase;
+       /* An array of pages for the eager receive buffers * N */
+       void *subport_rcvegrbuf;
+       /* An array of pages for the eager header queue entries * N */
+       void *subport_rcvhdr_base;
+       /* The version of the library which opened this port */
+       u32 userversion;
+       /* Bitmask of active slaves */
+       u32 active_slaves;
 };
 
 struct sk_buff;
@@ -132,6 +148,11 @@ struct _ipath_layer {
        void *l_arg;
 };
 
+struct ipath_skbinfo {
+       struct sk_buff *skb;
+       dma_addr_t phys;
+};
+
 struct ipath_devdata {
        struct list_head ipath_list;
 
@@ -154,7 +175,7 @@ struct ipath_devdata {
        /* ipath_cfgports pointers */
        struct ipath_portdata **ipath_pd;
        /* sk_buffs used by port 0 eager receive queue */
-       struct sk_buff **ipath_port0_skbs;
+       struct ipath_skbinfo *ipath_port0_skbinfo;
        /* kvirt address of 1st 2k pio buffer */
        void __iomem *ipath_pio2kbase;
        /* kvirt address of 1st 4k pio buffer */
@@ -315,12 +336,16 @@ struct ipath_devdata {
        u8 ipath_ht_slave_off;
        /* for write combining settings */
        unsigned long ipath_wc_cookie;
+       unsigned long ipath_wc_base;
+       unsigned long ipath_wc_len;
        /* ref count for each pkey */
        atomic_t ipath_pkeyrefs[4];
        /* shadow copy of all exptids physaddr; used only by funcsim */
        u64 *ipath_tidsimshadow;
        /* shadow copy of struct page *'s for exp tid pages */
        struct page **ipath_pageshadow;
+       /* shadow copy of dma handles for exp tid pages */
+       dma_addr_t *ipath_physshadow;
        /* lock to workaround chip bug 9437 */
        spinlock_t ipath_tid_lock;
 
@@ -402,6 +427,9 @@ struct ipath_devdata {
        unsigned long ipath_rcvctrl;
        /* shadow kr_sendctrl */
        unsigned long ipath_sendctrl;
+       /* ports waiting for PIOavail intr */
+       unsigned long ipath_portpiowait;
+       unsigned long ipath_lastcancel; /* to not count armlaunch after cancel */
 
        /* value we put in kr_rcvhdrcnt */
        u32 ipath_rcvhdrcnt;
@@ -465,8 +493,6 @@ struct ipath_devdata {
        u32 ipath_htwidth;
        /* HT speed (200,400,800,1000) from HT config */
        u32 ipath_htspeed;
-       /* ports waiting for PIOavail intr */
-       unsigned long ipath_portpiowait;
        /*
         * number of sequential ibcstatus change for polling active/quiet
         * (i.e., link not coming up).
@@ -510,8 +536,47 @@ struct ipath_devdata {
        u32 ipath_lli_counter;
        /* local link integrity errors */
        u32 ipath_lli_errors;
+       /*
+        * Above counts only cases where _successive_ LocalLinkIntegrity
+        * errors were seen in the receive headers of kern-packets.
+        * Below are the three (monotonically increasing) counters
+        * maintained via GPIO interrupts on iba6120-rev2.
+        */
+       u32 ipath_rxfc_unsupvl_errs;
+       u32 ipath_overrun_thresh_errs;
+       u32 ipath_lli_errs;
+
+       /*
+        * Not all devices managed by a driver instance are the same
+        * type, so these fields must be per-device.
+        */
+       u64 ipath_i_bitsextant;
+       ipath_err_t ipath_e_bitsextant;
+       ipath_err_t ipath_hwe_bitsextant;
+
+       /*
+        * Below should be computable from number of ports,
+        * since they are never modified.
+        */
+       u32 ipath_i_rcvavail_mask;
+       u32 ipath_i_rcvurg_mask;
+
+       /*
+        * Register bits for selecting i2c direction and values, used for
+        * I2C serial flash.
+        */
+       u16 ipath_gpio_sda_num;
+       u16 ipath_gpio_scl_num;
+       u64 ipath_gpio_sda;
+       u64 ipath_gpio_scl;
 };
 
+/* Private data for file operations */
+struct ipath_filedata {
+       struct ipath_portdata *pd;
+       unsigned subport;
+       unsigned tidcursor;
+};
 extern struct list_head ipath_dev_list;
 extern spinlock_t ipath_devs_lock;
 extern struct ipath_devdata *ipath_lookup(int unit);
@@ -521,6 +586,7 @@ int ipath_enable_wc(struct ipath_devdata *dd);
 void ipath_disable_wc(struct ipath_devdata *dd);
 int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp);
 void ipath_shutdown_device(struct ipath_devdata *);
+void ipath_disarm_senderrbufs(struct ipath_devdata *);
 
 struct file_operations;
 int ipath_cdev_init(int minor, char *name, struct file_operations *fops,
@@ -572,7 +638,11 @@ int ipath_set_lid(struct ipath_devdata *, u32, u8);
 int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
 
 /* for use in system calls, where we want to know device type, etc. */
-#define port_fp(fp) ((struct ipath_portdata *) (fp)->private_data)
+#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
+#define subport_fp(fp) \
+       ((struct ipath_filedata *)(fp)->private_data)->subport
+#define tidcursor_fp(fp) \
+       ((struct ipath_filedata *)(fp)->private_data)->tidcursor
 
 /*
  * values for ipath_flags
@@ -612,6 +682,15 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
                /* can miss port0 rx interrupts */
 #define IPATH_POLL_RX_INTR  0x40000
 #define IPATH_DISABLED      0x80000 /* administratively disabled */
+               /* Use GPIO interrupts for new counters */
+#define IPATH_GPIO_ERRINTRS 0x100000
+
+/* Bits in GPIO for the added interrupts */
+#define IPATH_GPIO_PORT0_BIT 2
+#define IPATH_GPIO_RXUVL_BIT 3
+#define IPATH_GPIO_OVRUN_BIT 4
+#define IPATH_GPIO_LLI_BIT 5
+#define IPATH_GPIO_ERRINTR_MASK 0x38
 
 /* portdata flag bit offsets */
                /* waiting for a packet to arrive */
@@ -798,6 +877,13 @@ void ipath_exit_ipathfs(void);
 int ipathfs_add_device(struct ipath_devdata *);
 int ipathfs_remove_device(struct ipath_devdata *);
 
+/*
+ * dma_addr wrappers - all 0's invalid for hw
+ */
+dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
+                         size_t, int);
+dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
+
 /*
  * Flush write combining store buffers (if present) and perform a write
  * barrier.
@@ -855,4 +941,20 @@ extern struct mutex ipath_mutex;
 
 #endif /* _IPATH_DEBUGGING */
 
+/*
+ * this is used for formatting hw error messages...
+ */
+struct ipath_hwerror_msgs {
+       u64 mask;
+       const char *msg;
+};
+
+#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
+
+/* in ipath_intr.c... */
+void ipath_format_hwerrors(u64 hwerrs,
+                          const struct ipath_hwerror_msgs *hwerrmsgs,
+                          size_t nhwerrmsgs,
+                          char *msg, size_t lmsg);
+
 #endif                         /* _IPATH_KERNEL_H */
index ba1b932..9a6cbd0 100644 (file)
@@ -118,9 +118,10 @@ void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
  * Check the IB SGE for validity and initialize our internal version
  * of it.
  */
-int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
                  struct ib_sge *sge, int acc)
 {
+       struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
        struct ipath_mregion *mr;
        unsigned n, m;
        size_t off;
@@ -140,7 +141,8 @@ int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
                goto bail;
        }
        mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
-       if (unlikely(mr == NULL || mr->lkey != sge->lkey)) {
+       if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
+                    qp->ibqp.pd != mr->pd)) {
                ret = 0;
                goto bail;
        }
@@ -188,9 +190,10 @@ bail:
  *
  * Return 1 if successful, otherwise 0.
  */
-int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
                  u32 len, u64 vaddr, u32 rkey, int acc)
 {
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        struct ipath_lkey_table *rkt = &dev->lk_table;
        struct ipath_sge *sge = &ss->sge;
        struct ipath_mregion *mr;
@@ -214,7 +217,8 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
        }
 
        mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
-       if (unlikely(mr == NULL || mr->lkey != rkey)) {
+       if (unlikely(mr == NULL || mr->lkey != rkey ||
+                    qp->ibqp.pd != mr->pd)) {
                ret = 0;
                goto bail;
        }
index 72d1db8..25908b0 100644 (file)
@@ -87,7 +87,8 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp,
        struct ipath_devdata *dd = to_idev(ibdev)->dd;
        u32 vendor, majrev, minrev;
 
-       if (smp->attr_mod)
+       /* GUID 0 is illegal */
+       if (smp->attr_mod || (dd->ipath_guid == 0))
                smp->status |= IB_SMP_INVALID_FIELD;
 
        nip->base_version = 1;
@@ -131,10 +132,15 @@ static int recv_subn_get_guidinfo(struct ib_smp *smp,
         * We only support one GUID for now.  If this changes, the
         * portinfo.guid_cap field needs to be updated too.
         */
-       if (startgx == 0)
-               /* The first is a copy of the read-only HW GUID. */
-               *p = to_idev(ibdev)->dd->ipath_guid;
-       else
+       if (startgx == 0) {
+               __be64 g = to_idev(ibdev)->dd->ipath_guid;
+               if (g == 0)
+                       /* GUID 0 is illegal */
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               else
+                       /* The first is a copy of the read-only HW GUID. */
+                       *p = g;
+       } else
                smp->status |= IB_SMP_INVALID_FIELD;
 
        return reply(smp);
index b36f6fb..a0673c1 100644 (file)
@@ -138,6 +138,7 @@ struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
                goto bail;
        }
 
+       mr->mr.pd = pd;
        mr->mr.user_base = *iova_start;
        mr->mr.iova = *iova_start;
        mr->mr.length = 0;
@@ -197,6 +198,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
                goto bail;
        }
 
+       mr->mr.pd = pd;
        mr->mr.user_base = region->user_base;
        mr->mr.iova = region->virt_base;
        mr->mr.length = region->length;
@@ -289,6 +291,7 @@ struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
         * Resources are allocated but no valid mapping (RKEY can't be
         * used).
         */
+       fmr->mr.pd = pd;
        fmr->mr.user_base = 0;
        fmr->mr.iova = 0;
        fmr->mr.length = 0;
index 224b0f4..46c1c89 100644 (file)
@@ -335,6 +335,7 @@ static void ipath_reset_qp(struct ipath_qp *qp)
        qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
        qp->r_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
        qp->r_nak_state = 0;
+       qp->r_wrid_valid = 0;
        qp->s_rnr_timeout = 0;
        qp->s_head = 0;
        qp->s_tail = 0;
@@ -342,6 +343,7 @@ static void ipath_reset_qp(struct ipath_qp *qp)
        qp->s_last = 0;
        qp->s_ssn = 1;
        qp->s_lsn = 0;
+       qp->s_wait_credit = 0;
        if (qp->r_rq.wq) {
                qp->r_rq.wq->head = 0;
                qp->r_rq.wq->tail = 0;
@@ -352,12 +354,13 @@ static void ipath_reset_qp(struct ipath_qp *qp)
 /**
  * ipath_error_qp - put a QP into an error state
  * @qp: the QP to put into an error state
+ * @err: the receive completion error to signal if a RWQE is active
  *
  * Flushes both send and receive work queues.
  * QP s_lock should be held and interrupts disabled.
  */
 
-void ipath_error_qp(struct ipath_qp *qp)
+void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
 {
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        struct ib_wc wc;
@@ -373,7 +376,6 @@ void ipath_error_qp(struct ipath_qp *qp)
                list_del_init(&qp->piowait);
        spin_unlock(&dev->pending_lock);
 
-       wc.status = IB_WC_WR_FLUSH_ERR;
        wc.vendor_err = 0;
        wc.byte_len = 0;
        wc.imm_data = 0;
@@ -385,6 +387,12 @@ void ipath_error_qp(struct ipath_qp *qp)
        wc.sl = 0;
        wc.dlid_path_bits = 0;
        wc.port_num = 0;
+       if (qp->r_wrid_valid) {
+               qp->r_wrid_valid = 0;
+               wc.status = err;
+               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
+       }
+       wc.status = IB_WC_WR_FLUSH_ERR;
 
        while (qp->s_last != qp->s_head) {
                struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
@@ -501,7 +509,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                break;
 
        case IB_QPS_ERR:
-               ipath_error_qp(qp);
+               ipath_error_qp(qp, IB_WC_GENERAL_ERR);
                break;
 
        default:
@@ -516,7 +524,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                qp->remote_qpn = attr->dest_qp_num;
 
        if (attr_mask & IB_QP_SQ_PSN) {
-               qp->s_next_psn = attr->sq_psn;
+               qp->s_psn = qp->s_next_psn = attr->sq_psn;
                qp->s_last_psn = qp->s_next_psn - 1;
        }
 
index a086540..a504cf6 100644 (file)
@@ -201,6 +201,18 @@ int ipath_make_rc_req(struct ipath_qp *qp,
            qp->s_rnr_timeout)
                goto done;
 
+       /* Limit the number of packets sent without an ACK. */
+       if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
+               qp->s_wait_credit = 1;
+               dev->n_rc_stalls++;
+               spin_lock(&dev->pending_lock);
+               if (list_empty(&qp->timerwait))
+                       list_add_tail(&qp->timerwait,
+                                     &dev->pending[dev->pending_index]);
+               spin_unlock(&dev->pending_lock);
+               goto done;
+       }
+
        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
        hwords = 5;
        bth0 = 0;
@@ -221,7 +233,7 @@ int ipath_make_rc_req(struct ipath_qp *qp,
                        /* Check if send work queue is empty. */
                        if (qp->s_tail == qp->s_head)
                                goto done;
-                       qp->s_psn = wqe->psn = qp->s_next_psn;
+                       wqe->psn = qp->s_next_psn;
                        newreq = 1;
                }
                /*
@@ -393,12 +405,6 @@ int ipath_make_rc_req(struct ipath_qp *qp,
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
-                       /*
-                        * Request an ACK every 1/2 MB to avoid retransmit
-                        * timeouts.
-                        */
-                       if (((wqe->length - len) % (512 * 1024)) == 0)
-                               bth2 |= 1 << 31;
                        len = pmtu;
                        break;
                }
@@ -435,12 +441,6 @@ int ipath_make_rc_req(struct ipath_qp *qp,
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
-                       /*
-                        * Request an ACK every 1/2 MB to avoid retransmit
-                        * timeouts.
-                        */
-                       if (((wqe->length - len) % (512 * 1024)) == 0)
-                               bth2 |= 1 << 31;
                        len = pmtu;
                        break;
                }
@@ -498,6 +498,8 @@ int ipath_make_rc_req(struct ipath_qp *qp,
                 */
                goto done;
        }
+       if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
+               bth2 |= 1 << 31;        /* Request ACK. */
        qp->s_len -= len;
        qp->s_hdrwords = hwords;
        qp->s_cur_sge = ss;
@@ -737,6 +739,15 @@ bail:
        return;
 }
 
+static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
+{
+       if (qp->s_wait_credit) {
+               qp->s_wait_credit = 0;
+               tasklet_hi_schedule(&qp->s_task);
+       }
+       qp->s_last_psn = psn;
+}
+
 /**
  * do_rc_ack - process an incoming RC ACK
  * @qp: the QP the ACK came in on
@@ -805,7 +816,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                         * The last valid PSN seen is the previous
                         * request's.
                         */
-                       qp->s_last_psn = wqe->psn - 1;
+                       update_last_psn(qp, wqe->psn - 1);
                        /* Retry this request. */
                        ipath_restart_rc(qp, wqe->psn, &wc);
                        /*
@@ -864,7 +875,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                ipath_get_credit(qp, aeth);
                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
                qp->s_retry = qp->s_retry_cnt;
-               qp->s_last_psn = psn;
+               update_last_psn(qp, psn);
                ret = 1;
                goto bail;
 
@@ -883,7 +894,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                        goto bail;
 
                /* The last valid PSN is the previous PSN. */
-               qp->s_last_psn = psn - 1;
+               update_last_psn(qp, psn - 1);
 
                dev->n_rc_resends += (int)qp->s_psn - (int)psn;
 
@@ -898,7 +909,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
        case 3:         /* NAK */
                /* The last valid PSN seen is the previous request's. */
                if (qp->s_last != qp->s_tail)
-                       qp->s_last_psn = wqe->psn - 1;
+                       update_last_psn(qp, wqe->psn - 1);
                switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
                        IPATH_AETH_CREDIT_MASK) {
                case 0: /* PSN sequence error */
@@ -1071,7 +1082,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                 * since we don't want s_sge modified.
                 */
                qp->s_len -= pmtu;
-               qp->s_last_psn = psn;
+               update_last_psn(qp, psn);
                spin_unlock_irqrestore(&qp->s_lock, flags);
                ipath_copy_sge(&qp->s_sge, data, pmtu);
                goto bail;
@@ -1223,7 +1234,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
                         * Address range must be a subset of the original
                         * request and start on pmtu boundaries.
                         */
-                       ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
+                       ok = ipath_rkey_ok(qp, &qp->s_rdma_sge,
                                           qp->s_rdma_len, vaddr, rkey,
                                           IB_ACCESS_REMOTE_READ);
                        if (unlikely(!ok)) {
@@ -1282,6 +1293,14 @@ done:
        return 1;
 }
 
+static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
+{
+       spin_lock_irq(&qp->s_lock);
+       qp->state = IB_QPS_ERR;
+       ipath_error_qp(qp, err);
+       spin_unlock_irq(&qp->s_lock);
+}
+
 /**
  * ipath_rc_rcv - process an incoming RC packet
  * @dev: the device this packet came in on
@@ -1309,6 +1328,10 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        struct ib_reth *reth;
        int header_in_data;
 
+       /* Validate the SLID. See Ch. 9.6.1.5 */
+       if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+               goto done;
+
        /* Check for GRH */
        if (!has_grh) {
                ohdr = &hdr->u.oth;
@@ -1370,8 +1393,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                 */
                if (qp->r_ack_state >= OP(COMPARE_SWAP))
                        goto send_ack;
-               /* XXX Flush WQEs */
-               qp->state = IB_QPS_ERR;
+               ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR);
                qp->r_ack_state = OP(SEND_ONLY);
                qp->r_nak_state = IB_NAK_INVALID_REQUEST;
                qp->r_ack_psn = qp->r_psn;
@@ -1477,9 +1499,9 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        goto nack_inv;
                ipath_copy_sge(&qp->r_sge, data, tlen);
                qp->r_msn++;
-               if (opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_ONLY))
+               if (!qp->r_wrid_valid)
                        break;
+               qp->r_wrid_valid = 0;
                wc.wr_id = qp->r_wr_id;
                wc.status = IB_WC_SUCCESS;
                wc.opcode = IB_WC_RECV;
@@ -1517,7 +1539,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        int ok;
 
                        /* Check rkey & NAK */
-                       ok = ipath_rkey_ok(dev, &qp->r_sge,
+                       ok = ipath_rkey_ok(qp, &qp->r_sge,
                                           qp->r_len, vaddr, rkey,
                                           IB_ACCESS_REMOTE_WRITE);
                        if (unlikely(!ok))
@@ -1559,7 +1581,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        int ok;
 
                        /* Check rkey & NAK */
-                       ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
+                       ok = ipath_rkey_ok(qp, &qp->s_rdma_sge,
                                           qp->s_rdma_len, vaddr, rkey,
                                           IB_ACCESS_REMOTE_READ);
                        if (unlikely(!ok)) {
@@ -1618,7 +1640,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        goto nack_inv;
                rkey = be32_to_cpu(ateth->rkey);
                /* Check rkey & NAK */
-               if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge,
+               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
                                            sizeof(u64), vaddr, rkey,
                                            IB_ACCESS_REMOTE_ATOMIC)))
                        goto nack_acc;
@@ -1670,8 +1692,7 @@ nack_acc:
         * is pending though.
         */
        if (qp->r_ack_state < OP(COMPARE_SWAP)) {
-               /* XXX Flush WQEs */
-               qp->state = IB_QPS_ERR;
+               ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR);
                qp->r_ack_state = OP(RDMA_WRITE_ONLY);
                qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
                qp->r_ack_psn = qp->r_psn;
index 6e23b3d..dffc760 100644 (file)
 #define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
 #define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
 #define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
-#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
-#define INFINIPATH_HWE_MEMBISTFAILED        0x0040000000000000ULL
 #define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
 #define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF  0x1ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC  0x2ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
+/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF   0x01ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ  0x02ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x04ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID   0x08ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF  0x10ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO  0x40ULL
+/* waldo specific -- find the rest in ipath_6110.c */
+#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
+/* monty specific -- find the rest in ipath_6120.c */
+#define INFINIPATH_HWE_MEMBISTFAILED   0x0040000000000000ULL
 
 /* kr_hwdiagctrl bits */
 #define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
 
 /* combination link status states that we use with some frequency */
 #define IPATH_IBSTATE_MASK ((INFINIPATH_IBCS_LINKTRAININGSTATE_MASK \
-               << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \
+               << INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | \
                (INFINIPATH_IBCS_LINKSTATE_MASK \
-               <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT))
+               <<INFINIPATH_IBCS_LINKSTATE_SHIFT))
 #define IPATH_IBSTATE_INIT ((INFINIPATH_IBCS_L_STATE_INIT \
                << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \
                (INFINIPATH_IBCS_LT_STATE_LINKUP \
 
 typedef u64 ipath_err_t;
 
+/* The following change with the type of device, so
+ * need to be part of the ipath_devdata struct, or
+ * we could have problems plugging in devices of
+ * different types (e.g. one HT, one PCIE)
+ * in one system, to be managed by one driver.
+ * On the other hand, this file is may also be included
+ * by other code, so leave the declarations here
+ * temporarily. Minor footprint issue if common-model
+ * linker used, none if C89+ linker used.
+ */
+
 /* mask of defined bits for various registers */
 extern u64 infinipath_i_bitsextant;
 extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
@@ -309,13 +334,6 @@ extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
 /* masks that are different in various chips, or only exist in some chips */
 extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
 
-/*
- * register bits for selecting i2c direction and values, used for I2C serial
- * flash
- */
-extern u16 ipath_gpio_sda_num, ipath_gpio_scl_num;
-extern u64 ipath_gpio_sda, ipath_gpio_scl;
-
 /*
  * These are the infinipath general register numbers (not offsets).
  * The kernel registers are used directly, those beyond the kernel
index 5c1da2d..f753051 100644 (file)
@@ -108,7 +108,6 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp)
 
 static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe)
 {
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        int user = to_ipd(qp->ibqp.pd)->user;
        int i, j, ret;
        struct ib_wc wc;
@@ -119,8 +118,7 @@ static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe)
                        continue;
                /* Check LKEY */
                if ((user && wqe->sg_list[i].lkey == 0) ||
-                   !ipath_lkey_ok(&dev->lk_table,
-                                  &qp->r_sg_list[j], &wqe->sg_list[i],
+                   !ipath_lkey_ok(qp, &qp->r_sg_list[j], &wqe->sg_list[i],
                                   IB_ACCESS_LOCAL_WRITE))
                        goto bad_lkey;
                qp->r_len += wqe->sg_list[i].length;
@@ -231,6 +229,7 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
                }
        }
        spin_unlock_irqrestore(&rq->lock, flags);
+       qp->r_wrid_valid = 1;
 
 bail:
        return ret;
@@ -326,7 +325,7 @@ again:
        case IB_WR_RDMA_WRITE:
                if (wqe->length == 0)
                        break;
-               if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, wqe->length,
+               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
                                            wqe->wr.wr.rdma.remote_addr,
                                            wqe->wr.wr.rdma.rkey,
                                            IB_ACCESS_REMOTE_WRITE))) {
@@ -350,7 +349,7 @@ again:
                break;
 
        case IB_WR_RDMA_READ:
-               if (unlikely(!ipath_rkey_ok(dev, &sqp->s_sge, wqe->length,
+               if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
                                            wqe->wr.wr.rdma.remote_addr,
                                            wqe->wr.wr.rdma.rkey,
                                            IB_ACCESS_REMOTE_READ)))
@@ -365,7 +364,7 @@ again:
 
        case IB_WR_ATOMIC_CMP_AND_SWP:
        case IB_WR_ATOMIC_FETCH_AND_ADD:
-               if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, sizeof(u64),
+               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
                                            wqe->wr.wr.rdma.remote_addr,
                                            wqe->wr.wr.rdma.rkey,
                                            IB_ACCESS_REMOTE_ATOMIC)))
@@ -575,8 +574,7 @@ int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr)
                }
                if (wr->sg_list[i].length == 0)
                        continue;
-               if (!ipath_lkey_ok(&to_idev(qp->ibqp.device)->lk_table,
-                                  &wqe->sg_list[j], &wr->sg_list[i],
+               if (!ipath_lkey_ok(qp, &wqe->sg_list[j], &wr->sg_list[i],
                                   acc)) {
                        spin_unlock_irqrestore(&qp->s_lock, flags);
                        ret = -EINVAL;
index 941e866..9403350 100644 (file)
@@ -104,11 +104,6 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
        u32 sz;
        struct ib_srq *ret;
 
-       if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
-               ret = ERR_PTR(-ENOMEM);
-               goto done;
-       }
-
        if (srq_init_attr->attr.max_wr == 0) {
                ret = ERR_PTR(-EINVAL);
                goto done;
@@ -180,10 +175,17 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
        spin_lock_init(&srq->rq.lock);
        srq->rq.wq->head = 0;
        srq->rq.wq->tail = 0;
-       srq->rq.max_sge = srq_init_attr->attr.max_sge;
        srq->limit = srq_init_attr->attr.srq_limit;
 
-       dev->n_srqs_allocated++;
+       spin_lock(&dev->n_srqs_lock);
+       if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
+               spin_unlock(&dev->n_srqs_lock);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_wq;
+       }
+
+       dev->n_srqs_allocated++;
+       spin_unlock(&dev->n_srqs_lock);
 
        ret = &srq->ibsrq;
        goto done;
@@ -351,8 +353,13 @@ int ipath_destroy_srq(struct ib_srq *ibsrq)
        struct ipath_srq *srq = to_isrq(ibsrq);
        struct ipath_ibdev *dev = to_idev(ibsrq->device);
 
+       spin_lock(&dev->n_srqs_lock);
        dev->n_srqs_allocated--;
-       vfree(srq->rq.wq);
+       spin_unlock(&dev->n_srqs_lock);
+       if (srq->ip)
+               kref_put(&srq->ip->ref, ipath_release_mmap_info);
+       else
+               vfree(srq->rq.wq);
        kfree(srq);
 
        return 0;
index e299148..182de34 100644 (file)
@@ -257,7 +257,7 @@ static ssize_t store_guid(struct device *dev,
        struct ipath_devdata *dd = dev_get_drvdata(dev);
        ssize_t ret;
        unsigned short guid[8];
-       __be64 nguid;
+       __be64 new_guid;
        u8 *ng;
        int i;
 
@@ -266,7 +266,7 @@ static ssize_t store_guid(struct device *dev,
                   &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
                goto invalid;
 
-       ng = (u8 *) &nguid;
+       ng = (u8 *) &new_guid;
 
        for (i = 0; i < 8; i++) {
                if (guid[i] > 0xff)
@@ -274,7 +274,10 @@ static ssize_t store_guid(struct device *dev,
                ng[i] = guid[i];
        }
 
-       dd->ipath_guid = nguid;
+       if (new_guid == 0)
+               goto invalid;
+
+       dd->ipath_guid = new_guid;
        dd->ipath_nguid = 1;
 
        ret = strlen(buf);
@@ -297,6 +300,16 @@ static ssize_t show_nguid(struct device *dev,
        return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
 }
 
+static ssize_t show_nports(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       /* Return the number of user ports available. */
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
+}
+
 static ssize_t show_serial(struct device *dev,
                           struct device_attribute *attr,
                           char *buf)
@@ -608,6 +621,7 @@ static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
 static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
 static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
 static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
+static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
 static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
 static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
 static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
@@ -623,6 +637,7 @@ static struct attribute *dev_attributes[] = {
        &dev_attr_mlid.attr,
        &dev_attr_mtu.attr,
        &dev_attr_nguid.attr,
+       &dev_attr_nports.attr,
        &dev_attr_serial.attr,
        &dev_attr_status.attr,
        &dev_attr_status_str.attr,
index 0fd3cde..e636cfd 100644 (file)
@@ -246,6 +246,10 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        struct ib_reth *reth;
        int header_in_data;
 
+       /* Validate the SLID. See Ch. 9.6.1.5 */
+       if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+               goto done;
+
        /* Check for GRH */
        if (!has_grh) {
                ohdr = &hdr->u.oth;
@@ -440,7 +444,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        int ok;
 
                        /* Check rkey */
-                       ok = ipath_rkey_ok(dev, &qp->r_sge, qp->r_len,
+                       ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
                                           vaddr, rkey,
                                           IB_ACCESS_REMOTE_WRITE);
                        if (unlikely(!ok)) {
index 6991d1d..49f1102 100644 (file)
@@ -39,7 +39,6 @@
 static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
                    u32 *lengthp, struct ipath_sge_state *ss)
 {
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        int user = to_ipd(qp->ibqp.pd)->user;
        int i, j, ret;
        struct ib_wc wc;
@@ -50,8 +49,7 @@ static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
                        continue;
                /* Check LKEY */
                if ((user && wqe->sg_list[i].lkey == 0) ||
-                   !ipath_lkey_ok(&dev->lk_table,
-                                  j ? &ss->sg_list[j - 1] : &ss->sge,
+                   !ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
                                   &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
                        goto bad_lkey;
                *lengthp += wqe->sg_list[i].length;
@@ -343,7 +341,7 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr)
 
                if (wr->sg_list[i].length == 0)
                        continue;
-               if (!ipath_lkey_ok(&dev->lk_table, ss.num_sge ?
+               if (!ipath_lkey_ok(qp, ss.num_sge ?
                                   sg_list + ss.num_sge - 1 : &ss.sge,
                                   &wr->sg_list[i], 0)) {
                        ret = -EINVAL;
index e32fca9..413754b 100644 (file)
@@ -89,6 +89,62 @@ bail:
        return ret;
 }
 
+/**
+ * ipath_map_page - a safety wrapper around pci_map_page()
+ *
+ * A dma_addr of all 0's is interpreted by the chip as "disabled".
+ * Unfortunately, it can also be a valid dma_addr returned on some
+ * architectures.
+ *
+ * The powerpc iommu assigns dma_addrs in ascending order, so we don't
+ * have to bother with retries or mapping a dummy page to insure we
+ * don't just get the same mapping again.
+ *
+ * I'm sure we won't be so lucky with other iommu's, so FIXME.
+ */
+dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
+       unsigned long offset, size_t size, int direction)
+{
+       dma_addr_t phys;
+
+       phys = pci_map_page(hwdev, page, offset, size, direction);
+
+       if (phys == 0) {
+               pci_unmap_page(hwdev, phys, size, direction);
+               phys = pci_map_page(hwdev, page, offset, size, direction);
+               /*
+                * FIXME: If we get 0 again, we should keep this page,
+                * map another, then free the 0 page.
+                */
+       }
+
+       return phys;
+}
+
+/**
+ * ipath_map_single - a safety wrapper around pci_map_single()
+ *
+ * Same idea as ipath_map_page().
+ */
+dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
+       int direction)
+{
+       dma_addr_t phys;
+
+       phys = pci_map_single(hwdev, ptr, size, direction);
+
+       if (phys == 0) {
+               pci_unmap_single(hwdev, phys, size, direction);
+               phys = pci_map_single(hwdev, ptr, size, direction);
+               /*
+                * FIXME: If we get 0 again, we should keep this page,
+                * map another, then free the 0 page.
+                */
+       }
+
+       return phys;
+}
+
 /**
  * ipath_get_user_pages - lock user pages into memory
  * @start_page: the start page
index b8381c5..42eaed8 100644 (file)
@@ -898,7 +898,8 @@ int ipath_get_counters(struct ipath_devdata *dd,
                ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) +
                ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) +
                ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) +
-               ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt);
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt) +
+               dd->ipath_rxfc_unsupvl_errs;
        cntrs->port_rcv_remphys_errors =
                ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt);
        cntrs->port_xmit_discards =
@@ -911,8 +912,10 @@ int ipath_get_counters(struct ipath_devdata *dd,
                ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
        cntrs->port_rcv_packets =
                ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
-       cntrs->local_link_integrity_errors = dd->ipath_lli_errors;
-       cntrs->excessive_buffer_overrun_errors = 0; /* XXX */
+       cntrs->local_link_integrity_errors =
+               (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+               dd->ipath_lli_errs : dd->ipath_lli_errors;
+       cntrs->excessive_buffer_overrun_errors = dd->ipath_overrun_thresh_errs;
 
        ret = 0;
 
@@ -1199,6 +1202,7 @@ static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
        struct ipath_ah *ah;
        struct ib_ah *ret;
        struct ipath_ibdev *dev = to_idev(pd->device);
+       unsigned long flags;
 
        /* A multicast address requires a GRH (see ch. 8.4.1). */
        if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
@@ -1225,16 +1229,16 @@ static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
                goto bail;
        }
 
-       spin_lock(&dev->n_ahs_lock);
+       spin_lock_irqsave(&dev->n_ahs_lock, flags);
        if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
-               spin_unlock(&dev->n_ahs_lock);
+               spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
                kfree(ah);
                ret = ERR_PTR(-ENOMEM);
                goto bail;
        }
 
        dev->n_ahs_allocated++;
-       spin_unlock(&dev->n_ahs_lock);
+       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
 
        /* ib_create_ah() will initialize ah->ibah. */
        ah->attr = *ah_attr;
@@ -1255,10 +1259,11 @@ static int ipath_destroy_ah(struct ib_ah *ibah)
 {
        struct ipath_ibdev *dev = to_idev(ibah->device);
        struct ipath_ah *ah = to_iah(ibah);
+       unsigned long flags;
 
-       spin_lock(&dev->n_ahs_lock);
+       spin_lock_irqsave(&dev->n_ahs_lock, flags);
        dev->n_ahs_allocated--;
-       spin_unlock(&dev->n_ahs_lock);
+       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
 
        kfree(ah);
 
@@ -1380,11 +1385,13 @@ static int enable_timer(struct ipath_devdata *dd)
         * processing.
         */
        if (dd->ipath_flags & IPATH_GPIO_INTR) {
+               u64 val;
                ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
                                 0x2074076542310ULL);
                /* Enable GPIO bit 2 interrupt */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-                                (u64) (1 << 2));
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
+               val |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
+               ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val);
        }
 
        init_timer(&dd->verbs_timer);
@@ -1399,8 +1406,17 @@ static int enable_timer(struct ipath_devdata *dd)
 static int disable_timer(struct ipath_devdata *dd)
 {
        /* Disable GPIO bit 2 interrupt */
-       if (dd->ipath_flags & IPATH_GPIO_INTR)
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, 0);
+       if (dd->ipath_flags & IPATH_GPIO_INTR) {
+                u64 val;
+                /* Disable GPIO bit 2 interrupt */
+                val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
+                val &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
+                ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val);
+               /*
+                * We might want to undo changes to debugportselect,
+                * but how?
+                */
+       }
 
        del_timer_sync(&dd->verbs_timer);
 
@@ -1683,6 +1699,7 @@ static ssize_t show_stats(struct class_device *cdev, char *buf)
                      "RC OTH NAKs %d\n"
                      "RC timeouts %d\n"
                      "RC RDMA dup %d\n"
+                     "RC stalls   %d\n"
                      "piobuf wait %d\n"
                      "no piobuf   %d\n"
                      "PKT drops   %d\n"
@@ -1690,7 +1707,7 @@ static ssize_t show_stats(struct class_device *cdev, char *buf)
                      dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
                      dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
                      dev->n_other_naks, dev->n_timeouts,
-                     dev->n_rdma_dup_busy, dev->n_piowait,
+                     dev->n_rdma_dup_busy, dev->n_rc_stalls, dev->n_piowait,
                      dev->n_no_piobuf, dev->n_pkt_drops, dev->n_wqe_errs);
        for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
                const struct ipath_opcode_stats *si = &dev->opstats[i];
index 09bbb3f..8039f6e 100644 (file)
@@ -220,6 +220,7 @@ struct ipath_segarray {
 };
 
 struct ipath_mregion {
+       struct ib_pd *pd;       /* shares refcnt of ibmr.pd */
        u64 user_base;          /* User's address for this region */
        u64 iova;               /* IB start address of this region */
        size_t length;
@@ -364,12 +365,14 @@ struct ipath_qp {
        u8 r_min_rnr_timer;     /* retry timeout value for RNR NAKs */
        u8 r_reuse_sge;         /* for UC receive errors */
        u8 r_sge_inx;           /* current index into sg_list */
+       u8 r_wrid_valid;        /* r_wrid set but CQ entry not yet made */
        u8 qp_access_flags;
        u8 s_max_sge;           /* size of s_wq->sg_list */
        u8 s_retry_cnt;         /* number of times to retry */
        u8 s_rnr_retry_cnt;
        u8 s_retry;             /* requester retry counter */
        u8 s_rnr_retry;         /* requester RNR retry counter */
+       u8 s_wait_credit;       /* limit number of unacked packets sent */
        u8 s_pkey_index;        /* PKEY index to use */
        u8 timeout;             /* Timeout for this QP */
        enum ib_mtu path_mtu;
@@ -393,6 +396,8 @@ struct ipath_qp {
 #define IPATH_S_BUSY           0
 #define IPATH_S_SIGNAL_REQ_WR  1
 
+#define IPATH_PSN_CREDIT       2048
+
 /*
  * Since struct ipath_swqe is not a fixed size, we can't simply index into
  * struct ipath_qp.s_wq.  This function does the array index computation.
@@ -521,6 +526,7 @@ struct ipath_ibdev {
        u32 n_rnr_naks;
        u32 n_other_naks;
        u32 n_timeouts;
+       u32 n_rc_stalls;
        u32 n_pkt_drops;
        u32 n_vl15_dropped;
        u32 n_wqe_errs;
@@ -634,6 +640,8 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 
 int ipath_destroy_qp(struct ib_qp *ibqp);
 
+void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
+
 int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                    int attr_mask, struct ib_udata *udata);
 
@@ -653,12 +661,6 @@ int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords,
 
 void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
 
-int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
-                 u32 len, u64 vaddr, u32 rkey, int acc);
-
-int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
-                 struct ib_sge *sge, int acc);
-
 void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
 
 void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
@@ -683,10 +685,10 @@ int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
 
 void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
 
-int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
                  struct ib_sge *sge, int acc);
 
-int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
                  u32 len, u64 vaddr, u32 rkey, int acc);
 
 int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
index 036fde6..0095bb7 100644 (file)
 #include "ipath_kernel.h"
 
 /**
- * ipath_unordered_wc - indicate whether write combining is ordered
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
  *
- * PowerPC systems (at least those in the 970 processor family)
- * write partially filled store buffers in address order, but will write
- * completely filled store buffers in "random" order, and therefore must
- * have serialization for correctness with current InfiniPath chips.
+ * Nothing to do on PowerPC, so just return without error.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+       return 0;
+}
+
+/**
+ * ipath_unordered_wc - indicate whether write combining is unordered
  *
+ * Because our performance depends on our ability to do write
+ * combining mmio writes in the most efficient way, we need to
+ * know if we are on a processor that may reorder stores when
+ * write combining.
  */
 int ipath_unordered_wc(void)
 {
index f8f9e2e..04696e6 100644 (file)
@@ -123,6 +123,8 @@ int ipath_enable_wc(struct ipath_devdata *dd)
                        ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, "
                                   "cookie is %d\n", cookie);
                        dd->ipath_wc_cookie = cookie;
+                       dd->ipath_wc_base = (unsigned long) pioaddr;
+                       dd->ipath_wc_len = (unsigned long) piolen;
                }
        }
 
@@ -136,9 +138,16 @@ int ipath_enable_wc(struct ipath_devdata *dd)
 void ipath_disable_wc(struct ipath_devdata *dd)
 {
        if (dd->ipath_wc_cookie) {
+               int r;
                ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n");
-               mtrr_del(dd->ipath_wc_cookie, 0, 0);
-               dd->ipath_wc_cookie = 0;
+               r = mtrr_del(dd->ipath_wc_cookie, dd->ipath_wc_base,
+                            dd->ipath_wc_len);
+               if (r < 0)
+                       dev_info(&dd->pcidev->dev,
+                                "mtrr_del(%lx, %lx, %lx) failed: %d\n",
+                                dd->ipath_wc_cookie, dd->ipath_wc_base,
+                                dd->ipath_wc_len, r);
+               dd->ipath_wc_cookie = 0; /* even on failure */
        }
 }
 
index 365a1b5..aecbb90 100644 (file)
@@ -1,11 +1,12 @@
 config INFINIBAND_ISER
-       tristate "ISCSI RDMA Protocol"
+       tristate "iSCSI Extensions for RDMA (iSER)"
        depends on INFINIBAND && SCSI && INET
        select SCSI_ISCSI_ATTRS
        ---help---
-         Support for the ISCSI RDMA Protocol over InfiniBand.  This
-         allows you to access storage devices that speak ISER/ISCSI
-         over InfiniBand.
+         Support for the iSCSI Extensions for RDMA (iSER) Protocol
+          over InfiniBand. This allows you to access storage devices
+          that speak iSCSI over iSER over InfiniBand.
 
-         The ISER protocol is defined by IETF.
-         See <http://www.ietf.org/>.
+         The iSER protocol is defined by IETF.
+         See <http://www.ietf.org/internet-drafts/draft-ietf-ips-iser-05.txt>
+         and <http://www.infinibandta.org/members/spec/iser_annex_060418.pdf>
index 2a14fe2..eb6f98d 100644 (file)
@@ -317,6 +317,8 @@ iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn)
        struct iscsi_iser_conn *iser_conn = conn->dd_data;
 
        iscsi_conn_teardown(cls_conn);
+       if (iser_conn->ib_conn)
+               iser_conn->ib_conn->iser_conn = NULL;
        kfree(iser_conn);
 }
 
index 2cf9ae0..9c53916 100644 (file)
@@ -192,7 +192,7 @@ struct iser_regd_buf {
 
 struct iser_dto {
        struct iscsi_iser_cmd_task *ctask;
-       struct iscsi_iser_conn     *conn;
+       struct iser_conn *ib_conn;
        int                        notify_enable;
 
        /* vector of registered buffers */
@@ -355,4 +355,11 @@ int  iser_post_send(struct iser_desc *tx_desc);
 
 int iser_conn_state_comp(struct iser_conn *ib_conn,
                         enum iser_ib_conn_state comp);
+
+int iser_dma_map_task_data(struct iscsi_iser_cmd_task *iser_ctask,
+                           struct iser_data_buf       *data,
+                           enum   iser_data_dir       iser_dir,
+                           enum   dma_data_direction  dma_dir);
+
+void iser_dma_unmap_task_data(struct iscsi_iser_cmd_task *iser_ctask);
 #endif
index ccf56f6..9b3d79c 100644 (file)
@@ -66,42 +66,6 @@ static void iser_dto_add_regd_buff(struct iser_dto *dto,
        dto->regd_vector_len++;
 }
 
-static int iser_dma_map_task_data(struct iscsi_iser_cmd_task *iser_ctask,
-                                 struct iser_data_buf       *data,
-                                 enum   iser_data_dir       iser_dir,
-                                 enum   dma_data_direction  dma_dir)
-{
-       struct device *dma_device;
-
-       iser_ctask->dir[iser_dir] = 1;
-       dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;
-
-       data->dma_nents = dma_map_sg(dma_device, data->buf, data->size, dma_dir);
-       if (data->dma_nents == 0) {
-               iser_err("dma_map_sg failed!!!\n");
-               return -EINVAL;
-       }
-       return 0;
-}
-
-static void iser_dma_unmap_task_data(struct iscsi_iser_cmd_task *iser_ctask)
-{
-       struct device  *dma_device;
-       struct iser_data_buf *data;
-
-       dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;
-
-       if (iser_ctask->dir[ISER_DIR_IN]) {
-               data = &iser_ctask->data[ISER_DIR_IN];
-               dma_unmap_sg(dma_device, data->buf, data->size, DMA_FROM_DEVICE);
-       }
-
-       if (iser_ctask->dir[ISER_DIR_OUT]) {
-               data = &iser_ctask->data[ISER_DIR_OUT];
-               dma_unmap_sg(dma_device, data->buf, data->size, DMA_TO_DEVICE);
-       }
-}
-
 /* Register user buffer memory and initialize passive rdma
  *  dto descriptor. Total data size is stored in
  *  iser_ctask->data[ISER_DIR_IN].data_len
@@ -249,7 +213,7 @@ static int iser_post_receive_control(struct iscsi_conn *conn)
        }
 
        recv_dto = &rx_desc->dto;
-       recv_dto->conn          = iser_conn;
+       recv_dto->ib_conn = iser_conn->ib_conn;
        recv_dto->regd_vector_len = 0;
 
        regd_hdr = &rx_desc->hdr_regd_buf;
@@ -296,7 +260,7 @@ static void iser_create_send_desc(struct iscsi_iser_conn *iser_conn,
        regd_hdr->virt_addr  = tx_desc; /* == &tx_desc->iser_header */
        regd_hdr->data_size  = ISER_TOTAL_HEADERS_LEN;
 
-       send_dto->conn          = iser_conn;
+       send_dto->ib_conn         = iser_conn->ib_conn;
        send_dto->notify_enable   = 1;
        send_dto->regd_vector_len = 0;
 
@@ -588,7 +552,7 @@ void iser_rcv_completion(struct iser_desc *rx_desc,
                         unsigned long dto_xfer_len)
 {
        struct iser_dto        *dto = &rx_desc->dto;
-       struct iscsi_iser_conn *conn = dto->conn;
+       struct iscsi_iser_conn *conn = dto->ib_conn->iser_conn;
        struct iscsi_session *session = conn->iscsi_conn->session;
        struct iscsi_cmd_task *ctask;
        struct iscsi_iser_cmd_task *iser_ctask;
@@ -641,7 +605,8 @@ void iser_rcv_completion(struct iser_desc *rx_desc,
 void iser_snd_completion(struct iser_desc *tx_desc)
 {
        struct iser_dto        *dto = &tx_desc->dto;
-       struct iscsi_iser_conn *iser_conn = dto->conn;
+       struct iser_conn       *ib_conn = dto->ib_conn;
+       struct iscsi_iser_conn *iser_conn = ib_conn->iser_conn;
        struct iscsi_conn      *conn = iser_conn->iscsi_conn;
        struct iscsi_mgmt_task *mtask;
 
@@ -652,7 +617,7 @@ void iser_snd_completion(struct iser_desc *tx_desc)
        if (tx_desc->type == ISCSI_TX_DATAOUT)
                kmem_cache_free(ig.desc_cache, tx_desc);
 
-       atomic_dec(&iser_conn->ib_conn->post_send_buf_count);
+       atomic_dec(&ib_conn->post_send_buf_count);
 
        write_lock(conn->recv_lock);
        if (conn->suspend_tx) {
@@ -698,14 +663,19 @@ void iser_ctask_rdma_init(struct iscsi_iser_cmd_task *iser_ctask)
 void iser_ctask_rdma_finalize(struct iscsi_iser_cmd_task *iser_ctask)
 {
        int deferred;
+       int is_rdma_aligned = 1;
 
        /* if we were reading, copy back to unaligned sglist,
         * anyway dma_unmap and free the copy
         */
-       if (iser_ctask->data_copy[ISER_DIR_IN].copy_buf != NULL)
+       if (iser_ctask->data_copy[ISER_DIR_IN].copy_buf != NULL) {
+               is_rdma_aligned = 0;
                iser_finalize_rdma_unaligned_sg(iser_ctask, ISER_DIR_IN);
-       if (iser_ctask->data_copy[ISER_DIR_OUT].copy_buf != NULL)
+       }
+       if (iser_ctask->data_copy[ISER_DIR_OUT].copy_buf != NULL) {
+               is_rdma_aligned = 0;
                iser_finalize_rdma_unaligned_sg(iser_ctask, ISER_DIR_OUT);
+       }
 
        if (iser_ctask->dir[ISER_DIR_IN]) {
                deferred = iser_regd_buff_release
@@ -725,7 +695,9 @@ void iser_ctask_rdma_finalize(struct iscsi_iser_cmd_task *iser_ctask)
                }
        }
 
-       iser_dma_unmap_task_data(iser_ctask);
+       /* if the data was unaligned, it was already unmapped and then copied */
+       if (is_rdma_aligned)
+               iser_dma_unmap_task_data(iser_ctask);
 }
 
 void iser_dto_buffs_release(struct iser_dto *dto)
index d0b03f4..0606744 100644 (file)
@@ -369,6 +369,44 @@ static void iser_page_vec_build(struct iser_data_buf *data,
        }
 }
 
+int iser_dma_map_task_data(struct iscsi_iser_cmd_task *iser_ctask,
+                           struct iser_data_buf       *data,
+                           enum   iser_data_dir       iser_dir,
+                           enum   dma_data_direction  dma_dir)
+{
+       struct device *dma_device;
+
+       iser_ctask->dir[iser_dir] = 1;
+       dma_device =
+               iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;
+
+       data->dma_nents = dma_map_sg(dma_device, data->buf, data->size, dma_dir);
+       if (data->dma_nents == 0) {
+               iser_err("dma_map_sg failed!!!\n");
+               return -EINVAL;
+       }
+       return 0;
+}
+
+void iser_dma_unmap_task_data(struct iscsi_iser_cmd_task *iser_ctask)
+{
+       struct device  *dma_device;
+       struct iser_data_buf *data;
+
+       dma_device =
+               iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device;
+
+       if (iser_ctask->dir[ISER_DIR_IN]) {
+               data = &iser_ctask->data[ISER_DIR_IN];
+               dma_unmap_sg(dma_device, data->buf, data->size, DMA_FROM_DEVICE);
+       }
+
+       if (iser_ctask->dir[ISER_DIR_OUT]) {
+               data = &iser_ctask->data[ISER_DIR_OUT];
+               dma_unmap_sg(dma_device, data->buf, data->size, DMA_TO_DEVICE);
+       }
+}
+
 /**
  * iser_reg_rdma_mem - Registers memory intended for RDMA,
  * obtaining rkey and va
@@ -394,6 +432,10 @@ int iser_reg_rdma_mem(struct iscsi_iser_cmd_task *iser_ctask,
                iser_err("rdma alignment violation %d/%d aligned\n",
                         aligned_len, mem->size);
                iser_data_buf_dump(mem);
+
+               /* unmap the command data before accessing it */
+               iser_dma_unmap_task_data(iser_ctask);
+
                /* allocate copy buf, if we are writing, copy the */
                /* unaligned scatterlist, dma map the copy        */
                if (iser_start_rdma_unaligned_sg(iser_ctask, cmd_dir) != 0)
index ecdca7f..18a0000 100644 (file)
@@ -571,6 +571,8 @@ void iser_conn_release(struct iser_conn *ib_conn)
        /* on EVENT_ADDR_ERROR there's no device yet for this conn */
        if (device != NULL)
                iser_device_try_release(device);
+       if (ib_conn->iser_conn)
+               ib_conn->iser_conn->ib_conn = NULL;
        kfree(ib_conn);
 }
 
@@ -694,7 +696,7 @@ int iser_post_recv(struct iser_desc *rx_desc)
        struct iser_dto   *recv_dto = &rx_desc->dto;
 
        /* Retrieve conn */
-       ib_conn = recv_dto->conn->ib_conn;
+       ib_conn = recv_dto->ib_conn;
 
        iser_dto_to_iov(recv_dto, iov, 2);
 
@@ -727,7 +729,7 @@ int iser_post_send(struct iser_desc *tx_desc)
        struct iser_conn  *ib_conn;
        struct iser_dto   *dto = &tx_desc->dto;
 
-       ib_conn = dto->conn->ib_conn;
+       ib_conn = dto->ib_conn;
 
        iser_dto_to_iov(dto, iov, MAX_REGD_BUF_VECTOR_LEN);
 
@@ -774,7 +776,7 @@ static void iser_comp_error_worker(void *data)
 static void iser_handle_comp_error(struct iser_desc *desc)
 {
        struct iser_dto  *dto     = &desc->dto;
-       struct iser_conn *ib_conn = dto->conn->ib_conn;
+       struct iser_conn *ib_conn = dto->ib_conn;
 
        iser_dto_buffs_release(dto);