IB/srp: rework mapping engine to use multiple FMR entries
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 6f8ee0c..9ce129a 100644
@@ -444,6 +444,17 @@ static bool srp_change_state(struct srp_target_port *target,
        return changed;
 }
 
+static void srp_free_req_data(struct srp_target_port *target)
+{
+       struct srp_request *req;
+       int i;
+
+       for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) {
+               kfree(req->fmr_list);
+               kfree(req->map_page);
+       }
+}
+
 static void srp_remove_work(struct work_struct *work)
 {
        struct srp_target_port *target =
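
Note: the new helper assumes the reworked per-request bookkeeping. A sketch of the srp_request fields involved, reconstructed from how this patch allocates and frees them (the authoritative definition lives in ib_srp.h, which is not part of this excerpt):

struct srp_request {
        struct list_head        list;
        struct scsi_cmnd       *scmnd;
        struct srp_iu          *cmd;
        struct ib_pool_fmr    **fmr_list;       /* one slot per sg entry */
        u64                    *map_page;       /* pages of the FMR being built */
        short                   nfmr;           /* pool FMRs currently held */
        short                   index;
};
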
@@ -460,6 +471,7 @@ static void srp_remove_work(struct work_struct *work)
        scsi_remove_host(target->scsi_host);
        ib_destroy_cm_id(target->cm_id);
        srp_free_target_ib(target);
+       srp_free_req_data(target);
        scsi_host_put(target->scsi_host);
 }
 
@@ -523,18 +535,20 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
                           struct srp_target_port *target,
                           struct srp_request *req)
 {
+       struct ib_device *ibdev = target->srp_host->srp_dev->dev;
+       struct ib_pool_fmr **pfmr;
+
        if (!scsi_sglist(scmnd) ||
            (scmnd->sc_data_direction != DMA_TO_DEVICE &&
             scmnd->sc_data_direction != DMA_FROM_DEVICE))
                return;
 
-       if (req->fmr) {
-               ib_fmr_pool_unmap(req->fmr);
-               req->fmr = NULL;
-       }
+       pfmr = req->fmr_list;
+       while (req->nfmr--)
+               ib_fmr_pool_unmap(*pfmr++);
 
-       ib_dma_unmap_sg(target->srp_host->srp_dev->dev, scsi_sglist(scmnd),
-                       scsi_sg_count(scmnd), scmnd->sc_data_direction);
+       ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd),
+                       scmnd->sc_data_direction);
 }
 
 static void srp_remove_req(struct srp_target_port *target,
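
Note: the pointer-walking unmap loop above is terse; an equivalent indexed form (a sketch, not part of the patch) shows what it does:

        int i;

        for (i = 0; i < req->nfmr; i++)
                ib_fmr_pool_unmap(req->fmr_list[i]);
        /* The while (req->nfmr--) form leaves nfmr at -1; that is harmless
         * because srp_map_data() re-seeds it before the request is reused.
         */
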
@@ -633,95 +647,152 @@ err:
        return ret;
 }
 
-static int srp_map_fmr(struct srp_target_port *target, struct scatterlist *scat,
-                      int sg_cnt, struct srp_request *req,
-                      struct srp_direct_buf *buf)
+static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
+                        unsigned int dma_len, u32 rkey)
 {
-       u64 io_addr = 0;
-       u64 *dma_pages;
-       u32 len;
-       int page_cnt;
-       int i, j;
-       int ret;
-       struct srp_device *dev = target->srp_host->srp_dev;
-       struct ib_device *ibdev = dev->dev;
-       struct scatterlist *sg;
+       struct srp_direct_buf *desc = state->desc;
 
-       if (!dev->fmr_pool)
-               return -ENODEV;
+       desc->va = cpu_to_be64(dma_addr);
+       desc->key = cpu_to_be32(rkey);
+       desc->len = cpu_to_be32(dma_len);
 
-       if (ib_sg_dma_address(ibdev, &scat[0]) & ~dev->fmr_page_mask)
-               return -EINVAL;
+       state->total_len += dma_len;
+       state->desc++;
+       state->ndesc++;
+}
 
-       len = page_cnt = 0;
-       scsi_for_each_sg(req->scmnd, sg, sg_cnt, i) {
-               unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
+static int srp_map_finish_fmr(struct srp_map_state *state,
+                             struct srp_target_port *target)
+{
+       struct srp_device *dev = target->srp_host->srp_dev;
+       struct ib_pool_fmr *fmr;
+       u64 io_addr = 0;
 
-               if (ib_sg_dma_address(ibdev, sg) & ~dev->fmr_page_mask) {
-                       if (i > 0)
-                               return -EINVAL;
-                       else
-                               ++page_cnt;
-               }
-               if ((ib_sg_dma_address(ibdev, sg) + dma_len) &
-                   ~dev->fmr_page_mask) {
-                       if (i < sg_cnt - 1)
-                               return -EINVAL;
-                       else
-                               ++page_cnt;
-               }
+       if (!state->npages)
+               return 0;
 
-               len += dma_len;
+       if (state->npages == 1) {
+               srp_map_desc(state, state->base_dma_addr, state->fmr_len,
+                            target->rkey);
+               state->npages = state->fmr_len = 0;
+               return 0;
        }
 
-       page_cnt += len >> dev->fmr_page_shift;
-       if (page_cnt > SRP_FMR_SIZE)
-               return -ENOMEM;
+       fmr = ib_fmr_pool_map_phys(dev->fmr_pool, state->pages,
+                                  state->npages, io_addr);
+       if (IS_ERR(fmr))
+               return PTR_ERR(fmr);
 
-       dma_pages = kmalloc(sizeof (u64) * page_cnt, GFP_ATOMIC);
-       if (!dma_pages)
-               return -ENOMEM;
+       *state->next_fmr++ = fmr;
+       state->nfmr++;
 
-       page_cnt = 0;
-       scsi_for_each_sg(req->scmnd, sg, sg_cnt, i) {
-               unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
+       srp_map_desc(state, 0, state->fmr_len, fmr->fmr->rkey);
+       state->npages = state->fmr_len = 0;
+       return 0;
+}
+
+static void srp_map_update_start(struct srp_map_state *state,
+                                struct scatterlist *sg, int sg_index,
+                                dma_addr_t dma_addr)
+{
+       state->unmapped_sg = sg;
+       state->unmapped_index = sg_index;
+       state->unmapped_addr = dma_addr;
+}
 
-               for (j = 0; j < dma_len; j += dev->fmr_page_size)
-                       dma_pages[page_cnt++] =
-                               (ib_sg_dma_address(ibdev, sg) &
-                                dev->fmr_page_mask) + j;
+static int srp_map_sg_entry(struct srp_map_state *state,
+                           struct srp_target_port *target,
+                           struct scatterlist *sg, int sg_index,
+                           int use_fmr)
+{
+       struct srp_device *dev = target->srp_host->srp_dev;
+       struct ib_device *ibdev = dev->dev;
+       dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
+       unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
+       unsigned int len;
+       int ret;
+
+       if (!dma_len)
+               return 0;
+
+       if (use_fmr == SRP_MAP_NO_FMR) {
+               /* Once we're in direct map mode for a request, we don't
+                * go back to FMR mode, so no need to update anything
+                * other than the descriptor.
+                */
+               srp_map_desc(state, dma_addr, dma_len, target->rkey);
+               return 0;
        }
 
-       req->fmr = ib_fmr_pool_map_phys(dev->fmr_pool,
-                                       dma_pages, page_cnt, io_addr);
-       if (IS_ERR(req->fmr)) {
-               ret = PTR_ERR(req->fmr);
-               req->fmr = NULL;
-               goto out;
+       /* If we start at an offset into the FMR page, don't merge into
+        * the current FMR. Finish it out, and use the kernel's MR for this
+        * sg entry. This works around potential bugs on some SRP targets
+        * that were never fully diagnosed but went away once the initiator
+        * stopped using FMR on such page fragments.
+        */
+       if (dma_addr & ~dev->fmr_page_mask || dma_len > dev->fmr_max_size) {
+               ret = srp_map_finish_fmr(state, target);
+               if (ret)
+                       return ret;
+
+               srp_map_desc(state, dma_addr, dma_len, target->rkey);
+               srp_map_update_start(state, NULL, 0, 0);
+               return 0;
        }
 
-       buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, &scat[0]) &
-                              ~dev->fmr_page_mask);
-       buf->key = cpu_to_be32(req->fmr->fmr->rkey);
-       buf->len = cpu_to_be32(len);
+       /* If this is the first sg to go into the FMR, save our position.
+        * We need to know the first unmapped entry, its index, and the
+        * first unmapped address within that entry to be able to restart
+        * mapping after an error.
+        */
+       if (!state->unmapped_sg)
+               srp_map_update_start(state, sg, sg_index, dma_addr);
 
-       ret = 0;
+       while (dma_len) {
+               if (state->npages == SRP_FMR_SIZE) {
+                       ret = srp_map_finish_fmr(state, target);
+                       if (ret)
+                               return ret;
 
-out:
-       kfree(dma_pages);
+                       srp_map_update_start(state, sg, sg_index, dma_addr);
+               }
+
+               len = min_t(unsigned int, dma_len, dev->fmr_page_size);
 
+               if (!state->npages)
+                       state->base_dma_addr = dma_addr;
+               state->pages[state->npages++] = dma_addr;
+               state->fmr_len += len;
+               dma_addr += len;
+               dma_len -= len;
+       }
+
+       /* If the last entry of the FMR wasn't a full page, then we need to
+        * close it out and start a new one -- we can only merge at page
+        * boundaries.
+        */
+       ret = 0;
+       if (len != dev->fmr_page_size) {
+               ret = srp_map_finish_fmr(state, target);
+               if (!ret)
+                       srp_map_update_start(state, NULL, 0, 0);
+       }
        return ret;
 }
 
 static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,
                        struct srp_request *req)
 {
-       struct scatterlist *scat;
+       struct scatterlist *scat, *sg;
        struct srp_cmd *cmd = req->cmd->buf;
-       int len, nents, count;
-       u8 fmt = SRP_DATA_DESC_DIRECT;
+       int i, len, nents, count, use_fmr;
        struct srp_device *dev;
        struct ib_device *ibdev;
+       struct srp_map_state state;
+       struct srp_indirect_buf *indirect_hdr;
+       dma_addr_t indirect_addr;
+       u32 table_len;
+       u8 fmt;
 
        if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
                return sizeof (struct srp_cmd);
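
Note: all of the helpers above thread their progress through a struct srp_map_state. A sketch of the fields they rely on, inferred from their use in this hunk (the real definition is in ib_srp.h):

struct srp_map_state {
        struct ib_pool_fmr    **next_fmr;       /* next free slot in req->fmr_list */
        struct srp_direct_buf  *desc;           /* next descriptor table entry to fill */
        u64                    *pages;          /* page list for the FMR under construction */
        dma_addr_t              base_dma_addr;  /* start address of the current FMR */
        u32                     fmr_len;        /* bytes covered by the pages collected */
        u32                     total_len;      /* running sum of all descriptor lengths */
        unsigned int            npages;         /* pages collected so far */
        unsigned int            nfmr;           /* pool FMRs taken for this request */
        unsigned int            ndesc;          /* descriptors emitted so far */
        struct scatterlist     *unmapped_sg;    /* restart point if FMR mapping fails */
        int                     unmapped_index;
        dma_addr_t              unmapped_addr;
};
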
@@ -741,6 +812,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,
        ibdev = dev->dev;
 
        count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction);
+       if (unlikely(count == 0))
+               return -EIO;
 
        fmt = SRP_DATA_DESC_DIRECT;
        len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf);
@@ -757,49 +830,80 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,
                buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
                buf->key = cpu_to_be32(target->rkey);
                buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
-       } else if (srp_map_fmr(target, scat, count, req,
-                              (void *) cmd->add_data)) {
-               /*
-                * FMR mapping failed, and the scatterlist has more
-                * than one entry.  Generate an indirect memory
-                * descriptor.
-                */
-               struct srp_indirect_buf *buf = (void *) cmd->add_data;
-               struct scatterlist *sg;
-               u32 datalen = 0;
-               int i;
-
-               fmt = SRP_DATA_DESC_INDIRECT;
-               len = sizeof (struct srp_cmd) +
-                       sizeof (struct srp_indirect_buf) +
-                       count * sizeof (struct srp_direct_buf);
-
-               scsi_for_each_sg(scmnd, sg, count, i) {
-                       unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
-
-                       buf->desc_list[i].va  =
-                               cpu_to_be64(ib_sg_dma_address(ibdev, sg));
-                       buf->desc_list[i].key =
-                               cpu_to_be32(target->rkey);
-                       buf->desc_list[i].len = cpu_to_be32(dma_len);
-                       datalen += dma_len;
+
+               req->nfmr = 0;
+               goto map_complete;
+       }
+
+       /* We have more than one scatter/gather entry, so build our indirect
+        * descriptor table, trying to merge as many entries with FMR as we
+        * can.
+        */
+       indirect_hdr = (void *) cmd->add_data;
+
+       memset(&state, 0, sizeof(state));
+       state.desc      = indirect_hdr->desc_list;
+       state.pages     = req->map_page;
+       state.next_fmr  = req->fmr_list;
+
+       use_fmr = dev->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR;
+
+       for_each_sg(scat, sg, count, i) {
+               if (srp_map_sg_entry(&state, target, sg, i, use_fmr)) {
+                       /* FMR mapping failed, so backtrack to the first
+                        * unmapped entry and continue on without using FMR.
+                        */
+                       dma_addr_t dma_addr;
+                       unsigned int dma_len;
+
+backtrack:
+                       sg = state.unmapped_sg;
+                       i = state.unmapped_index;
+
+                       dma_addr = ib_sg_dma_address(ibdev, sg);
+                       dma_len = ib_sg_dma_len(ibdev, sg);
+                       dma_len -= (state.unmapped_addr - dma_addr);
+                       dma_addr = state.unmapped_addr;
+                       use_fmr = SRP_MAP_NO_FMR;
+                       srp_map_desc(&state, dma_addr, dma_len, target->rkey);
                }
+       }
 
-               if (scmnd->sc_data_direction == DMA_TO_DEVICE)
-                       cmd->data_out_desc_cnt = count;
-               else
-                       cmd->data_in_desc_cnt = count;
+       if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(&state, target))
+               goto backtrack;
 
-               buf->table_desc.va  =
-                       cpu_to_be64(req->cmd->dma + sizeof *cmd + sizeof *buf);
-               buf->table_desc.key =
-                       cpu_to_be32(target->rkey);
-               buf->table_desc.len =
-                       cpu_to_be32(count * sizeof (struct srp_direct_buf));
+       /* We've mapped the request; now fill in the command buffer. */
+       req->nfmr = state.nfmr;
+       if (state.ndesc == 1) {
+               /* FMR mapping was able to collapse this to one entry,
+                * so use a direct descriptor.
+                */
+               struct srp_direct_buf *buf = (void *) cmd->add_data;
 
-               buf->len = cpu_to_be32(datalen);
+               *buf = indirect_hdr->desc_list[0];
+               goto map_complete;
        }
 
+       table_len = state.ndesc * sizeof (struct srp_direct_buf);
+
+       fmt = SRP_DATA_DESC_INDIRECT;
+       len = sizeof (struct srp_cmd) + sizeof (struct srp_indirect_buf);
+       len += table_len;
+
+       indirect_addr = req->cmd->dma + sizeof *cmd + sizeof *indirect_hdr;
+
+       indirect_hdr->table_desc.va = cpu_to_be64(indirect_addr);
+       indirect_hdr->table_desc.key = cpu_to_be32(target->rkey);
+       indirect_hdr->table_desc.len = cpu_to_be32(table_len);
+       indirect_hdr->len = cpu_to_be32(state.total_len);
+
+       if (scmnd->sc_data_direction == DMA_TO_DEVICE)
+               cmd->data_out_desc_cnt = state.ndesc;
+       else
+               cmd->data_in_desc_cnt = state.ndesc;
+
+map_complete:
        if (scmnd->sc_data_direction == DMA_TO_DEVICE)
                cmd->buf_fmt = fmt << 4;
        else
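
Note: for reference, the on-the-wire descriptor formats that srp_map_desc() and the indirect-header code fill in, as defined in <scsi/srp.h> (all fields big-endian):

struct srp_direct_buf {
        __be64  va;     /* target-visible virtual address */
        __be32  key;    /* rkey covering that range */
        __be32  len;    /* length of the segment in bytes */
};

struct srp_indirect_buf {
        struct srp_direct_buf   table_desc;     /* where/how to fetch desc_list */
        __be32                  len;            /* total data length */
        struct srp_direct_buf   desc_list[0];   /* the table itself, sent inline */
} __attribute__((packed));
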
@@ -1947,8 +2051,7 @@ static ssize_t srp_create_target(struct device *dev,
                container_of(dev, struct srp_host, dev);
        struct Scsi_Host *target_host;
        struct srp_target_port *target;
-       int ret;
-       int i;
+       int i, ret;
 
        target_host = scsi_host_alloc(&srp_template,
                                      sizeof (struct srp_target_port));
@@ -1968,14 +2071,6 @@ static ssize_t srp_create_target(struct device *dev,
        target->rkey            = host->srp_dev->mr->rkey;
        target->cmd_sg_cnt      = cmd_sg_entries;
 
-       spin_lock_init(&target->lock);
-       INIT_LIST_HEAD(&target->free_tx);
-       INIT_LIST_HEAD(&target->free_reqs);
-       for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
-               target->req_ring[i].index = i;
-               list_add_tail(&target->req_ring[i].list, &target->free_reqs);
-       }
-
        ret = srp_parse_options(buf, target);
        if (ret)
                goto err;
@@ -1985,6 +2080,23 @@ static ssize_t srp_create_target(struct device *dev,
                             sizeof (struct srp_indirect_buf) +
                             target->cmd_sg_cnt * sizeof (struct srp_direct_buf);
 
+       spin_lock_init(&target->lock);
+       INIT_LIST_HEAD(&target->free_tx);
+       INIT_LIST_HEAD(&target->free_reqs);
+       for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
+               struct srp_request *req = &target->req_ring[i];
+
+               req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *),
+                                       GFP_KERNEL);
+               req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *),
+                                       GFP_KERNEL);
+               if (!req->fmr_list || !req->map_page)
+                       goto err_free_mem;
+
+               req->index = i;
+               list_add_tail(&req->list, &target->free_reqs);
+       }
+
        ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid);
 
        shost_printk(KERN_DEBUG, target->scsi_host, PFX
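
Note: bailing out to err_free_mem with a partially built ring is safe, since srp_free_req_data() walks all SRP_CMD_SQ_SIZE entries and kfree(NULL) is a no-op. As a rough cost check, a sketch with illustrative values (64-bit pointers, cmd_sg_cnt of 255, 64 pages per FMR, 62 requests; the real constants live in ib_srp.h):

        size_t per_req = 255 * sizeof(void *)   /* fmr_list */
                       + 64 * sizeof(void *);   /* map_page */
        size_t total   = 62 * per_req;          /* 158224 bytes, ~155 KiB per target */
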
@@ -1998,11 +2110,11 @@ static ssize_t srp_create_target(struct device *dev,
 
        ret = srp_create_target_ib(target);
        if (ret)
-               goto err;
+               goto err_free_mem;
 
        ret = srp_new_cm_id(target);
        if (ret)
-               goto err_free;
+               goto err_free_ib;
 
        target->qp_in_error = 0;
        ret = srp_connect_target(target);
@@ -2024,9 +2136,12 @@ err_disconnect:
 err_cm_id:
        ib_destroy_cm_id(target->cm_id);
 
-err_free:
+err_free_ib:
        srp_free_target_ib(target);
 
+err_free_mem:
+       srp_free_req_data(target);
+
 err:
        scsi_host_put(target_host);
 
@@ -2099,7 +2214,7 @@ static void srp_add_one(struct ib_device *device)
        struct ib_device_attr *dev_attr;
        struct ib_fmr_pool_param fmr_param;
        struct srp_host *host;
-       int s, e, p;
+       int fmr_page_shift, s, e, p;
 
        dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
        if (!dev_attr)
@@ -2117,12 +2232,13 @@ static void srp_add_one(struct ib_device *device)
 
        /*
         * Use the smallest page size supported by the HCA, down to a
-        * minimum of 512 bytes (which is the smallest sector that a
-        * SCSI command will ever carry).
+        * minimum of 4096 bytes. We're unlikely to build large sglists
+        * out of smaller entries.
         */
-       srp_dev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
-       srp_dev->fmr_page_size  = 1 << srp_dev->fmr_page_shift;
-       srp_dev->fmr_page_mask  = ~((u64) srp_dev->fmr_page_size - 1);
+       fmr_page_shift          = max(12, ffs(dev_attr->page_size_cap) - 1);
+       srp_dev->fmr_page_size  = 1 << fmr_page_shift;
+       srp_dev->fmr_page_mask  = ~((u64) srp_dev->fmr_page_size - 1);
+       srp_dev->fmr_max_size   = srp_dev->fmr_page_size * SRP_FMR_SIZE;
 
        INIT_LIST_HEAD(&srp_dev->dev_list);
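
Note: a worked example of the page-size math above, assuming the HCA's page_size_cap reports 4 KiB as its smallest supported size (values illustrative):

        fmr_page_shift = max(12, ffs(0x1000) - 1);      /* ffs(0x1000) == 13, so shift is 12 */
        srp_dev->fmr_page_size = 1 << fmr_page_shift;   /* 4096 */
        srp_dev->fmr_page_mask = ~((u64) 4096 - 1);     /* 0xfffffffffffff000 */
        srp_dev->fmr_max_size  = 4096 * SRP_FMR_SIZE;   /* e.g. 256 KiB if SRP_FMR_SIZE is 64 */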
 
@@ -2143,7 +2259,7 @@ static void srp_add_one(struct ib_device *device)
        fmr_param.dirty_watermark   = SRP_FMR_DIRTY_SIZE;
        fmr_param.cache             = 1;
        fmr_param.max_pages_per_fmr = SRP_FMR_SIZE;
-       fmr_param.page_shift        = srp_dev->fmr_page_shift;
+       fmr_param.page_shift        = fmr_page_shift;
        fmr_param.access            = (IB_ACCESS_LOCAL_WRITE |
                                       IB_ACCESS_REMOTE_WRITE |
                                       IB_ACCESS_REMOTE_READ);
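
Note: these parameters feed ib_create_fmr_pool() just below this hunk. A sketch of that pre-existing call site (not shown in the diff), assuming the driver treats the pool as optional:

        srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param);
        if (IS_ERR(srp_dev->fmr_pool))
                srp_dev->fmr_pool = NULL;       /* srp_map_data() falls back to SRP_MAP_NO_FMR */
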
@@ -2223,6 +2339,7 @@ static void srp_remove_one(struct ib_device *device)
                        srp_disconnect_target(target);
                        ib_destroy_cm_id(target->cm_id);
                        srp_free_target_ib(target);
+                       srp_free_req_data(target);
                        scsi_host_put(target->scsi_host);
                }