xprtrdma: Free the pd if ib_query_qp() fails
pandora-kernel.git: net/sunrpc/xprtrdma/verbs.c
1 /*
2  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the BSD-type
8  * license below:
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  *      Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *
17  *      Redistributions in binary form must reproduce the above
18  *      copyright notice, this list of conditions and the following
19  *      disclaimer in the documentation and/or other materials provided
20  *      with the distribution.
21  *
22  *      Neither the name of the Network Appliance, Inc. nor the names of
23  *      its contributors may be used to endorse or promote products
24  *      derived from this software without specific prior written
25  *      permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  */
39
40 /*
41  * verbs.c
42  *
43  * Encapsulates the major functions managing:
44  *  o adapters
45  *  o endpoints
46  *  o connections
47  *  o buffer memory
48  */
49
50 #include <linux/interrupt.h>
51 #include <linux/pci.h>  /* for Tavor hack below */
52 #include <linux/slab.h>
53
54 #include "xprt_rdma.h"
55
56 /*
57  * Globals/Macros
58  */
59
60 #ifdef RPC_DEBUG
61 # define RPCDBG_FACILITY        RPCDBG_TRANS
62 #endif
63
64 /*
65  * internal functions
66  */
67
68 /*
69  * handle replies in tasklet context, using a single, global list
70  * rdma tasklet function -- just turn around and call the func
71  * for all replies on the list
72  */
73
74 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
75 static LIST_HEAD(rpcrdma_tasklets_g);
76
77 static void
78 rpcrdma_run_tasklet(unsigned long data)
79 {
80         struct rpcrdma_rep *rep;
81         void (*func)(struct rpcrdma_rep *);
82         unsigned long flags;
83
84         data = data;    /* the tasklet data argument is unused */
85         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
86         while (!list_empty(&rpcrdma_tasklets_g)) {
87                 rep = list_entry(rpcrdma_tasklets_g.next,
88                                  struct rpcrdma_rep, rr_list);
89                 list_del(&rep->rr_list);
90                 func = rep->rr_func;
91                 rep->rr_func = NULL;
92                 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
93
94                 if (func)
95                         func(rep);
96                 else
97                         rpcrdma_recv_buffer_put(rep);
98
99                 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
100         }
101         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102 }
103
104 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105
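/*
 * Queue a reply for deferred processing. The caller is the CQ
 * completion upcall, which typically runs in interrupt context;
 * the tasklet declared above then handles the reply in softirq
 * context.
 */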
106 static inline void
107 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
108 {
109         unsigned long flags;
110
111         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
112         list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
113         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
114         tasklet_schedule(&rpcrdma_tasklet_g);
115 }
116
117 static void
118 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
119 {
120         struct rpcrdma_ep *ep = context;
121
122         dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
123                 __func__, event->event, event->device->name, context);
124         if (ep->rep_connected == 1) {
125                 ep->rep_connected = -EIO;
126                 ep->rep_func(ep);
127                 wake_up_all(&ep->rep_connect_wait);
128         }
129 }
130
131 static void
132 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
133 {
134         struct rpcrdma_ep *ep = context;
135
136         dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
137                 __func__, event->event, event->device->name, context);
138         if (ep->rep_connected == 1) {
139                 ep->rep_connected = -EIO;
140                 ep->rep_func(ep);
141                 wake_up_all(&ep->rep_connect_wait);
142         }
143 }
144
145 static inline
146 void rpcrdma_event_process(struct ib_wc *wc)
147 {
148         struct rpcrdma_mw *frmr;
149         struct rpcrdma_rep *rep =
150                         (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
151
152         dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
153                 __func__, rep, wc->status, wc->opcode, wc->byte_len);
154
155         if (!rep) /* send or bind completion that we don't care about */
156                 return;
157
158         if (wc->status != IB_WC_SUCCESS) {
159                 dprintk("RPC:       %s: WC opcode %d status %X, connection lost\n",
160                         __func__, wc->opcode, wc->status);
161                 rep->rr_len = ~0U;
162                 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
163                         rpcrdma_schedule_tasklet(rep);
164                 return;
165         }
166
167         switch (wc->opcode) {
168         case IB_WC_FAST_REG_MR:
169                 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
170                 frmr->r.frmr.state = FRMR_IS_VALID;
171                 break;
172         case IB_WC_LOCAL_INV:
173                 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
174                 frmr->r.frmr.state = FRMR_IS_INVALID;
175                 break;
176         case IB_WC_RECV:
177                 rep->rr_len = wc->byte_len;
178                 ib_dma_sync_single_for_cpu(
179                         rdmab_to_ia(rep->rr_buffer)->ri_id->device,
180                         rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
181                 /* Keep (only) the most recent credits, after checking validity */
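                /* The 16-byte minimum covers the fixed portion of the
                 * RPC/RDMA header (xid, vers, credit, proc), so rm_credit
                 * can be read safely below.
                 */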
182                 if (rep->rr_len >= 16) {
183                         struct rpcrdma_msg *p =
184                                         (struct rpcrdma_msg *) rep->rr_base;
185                         unsigned int credits = ntohl(p->rm_credit);
186                         if (credits == 0) {
187                                 dprintk("RPC:       %s: server"
188                                         " dropped credits to 0!\n", __func__);
189                                 /* don't deadlock */
190                                 credits = 1;
191                         } else if (credits > rep->rr_buffer->rb_max_requests) {
192                                 dprintk("RPC:       %s: server"
193                                         " over-crediting: %d (%d)\n",
194                                         __func__, credits,
195                                         rep->rr_buffer->rb_max_requests);
196                                 credits = rep->rr_buffer->rb_max_requests;
197                         }
198                         atomic_set(&rep->rr_buffer->rb_credits, credits);
199                 }
200                 /* fall through */
201         case IB_WC_BIND_MW:
202                 rpcrdma_schedule_tasklet(rep);
203                 break;
204         default:
205                 dprintk("RPC:       %s: unexpected WC event %X\n",
206                         __func__, wc->opcode);
207                 break;
208         }
209 }
210
211 static inline int
212 rpcrdma_cq_poll(struct ib_cq *cq)
213 {
214         struct ib_wc wc;
215         int rc;
216
217         for (;;) {
218                 rc = ib_poll_cq(cq, 1, &wc);
219                 if (rc < 0) {
220                         dprintk("RPC:       %s: ib_poll_cq failed %i\n",
221                                 __func__, rc);
222                         return rc;
223                 }
224                 if (rc == 0)
225                         break;
226
227                 rpcrdma_event_process(&wc);
228         }
229
230         return 0;
231 }
232
233 /*
234  * rpcrdma_cq_event_upcall
235  *
236  * It is reentrant, but processes events one at a time in order to
237  * maintain the ordering of receives needed to track server credits.
238  * ordering of receives to keep server credits.
239  *
240  * It is the responsibility of the scheduled tasklet to return
241  * recv buffers to the pool. NOTE: this affects synchronization of
242  * connection shutdown. That is, the structures required for
243  * the completion of the reply handler must remain intact until
244  * all memory has been reclaimed.
245  *
246  * Note that send events are suppressed and do not result in an upcall.
247  */
248 static void
249 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
250 {
251         int rc;
252
253         rc = rpcrdma_cq_poll(cq);
254         if (rc)
255                 return;
256
257         rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
258         if (rc) {
259                 dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
260                         __func__, rc);
261                 return;
262         }
263
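        /* Poll once more after re-arming: a completion that arrived after
         * the final poll above but before the CQ was re-armed would not
         * generate another upcall, so drain it here.
         */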
264         rpcrdma_cq_poll(cq);
265 }
266
267 #ifdef RPC_DEBUG
268 static const char * const conn[] = {
269         "address resolved",
270         "address error",
271         "route resolved",
272         "route error",
273         "connect request",
274         "connect response",
275         "connect error",
276         "unreachable",
277         "rejected",
278         "established",
279         "disconnected",
280         "device removal"
281 };
282 #endif
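/*
 * The table above indexes RDMA_CM_EVENT_* values in their definition
 * order (RDMA_CM_EVENT_ADDR_RESOLVED through RDMA_CM_EVENT_DEVICE_REMOVAL)
 * and must be kept in sync with <rdma/rdma_cm.h>.
 */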
283
284 static int
285 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
286 {
287         struct rpcrdma_xprt *xprt = id->context;
288         struct rpcrdma_ia *ia = &xprt->rx_ia;
289         struct rpcrdma_ep *ep = &xprt->rx_ep;
290 #ifdef RPC_DEBUG
291         struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
292 #endif
293         struct ib_qp_attr attr;
294         struct ib_qp_init_attr iattr;
295         int connstate = 0;
296
297         switch (event->event) {
298         case RDMA_CM_EVENT_ADDR_RESOLVED:
299         case RDMA_CM_EVENT_ROUTE_RESOLVED:
300                 ia->ri_async_rc = 0;
301                 complete(&ia->ri_done);
302                 break;
303         case RDMA_CM_EVENT_ADDR_ERROR:
304                 ia->ri_async_rc = -EHOSTUNREACH;
305                 dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
306                         __func__, ep);
307                 complete(&ia->ri_done);
308                 break;
309         case RDMA_CM_EVENT_ROUTE_ERROR:
310                 ia->ri_async_rc = -ENETUNREACH;
311                 dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
312                         __func__, ep);
313                 complete(&ia->ri_done);
314                 break;
315         case RDMA_CM_EVENT_ESTABLISHED:
316                 connstate = 1;
317                 ib_query_qp(ia->ri_id->qp, &attr,
318                         IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
319                         &iattr);
320                 dprintk("RPC:       %s: %d responder resources"
321                         " (%d initiator)\n",
322                         __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
323                 goto connected;
324         case RDMA_CM_EVENT_CONNECT_ERROR:
325                 connstate = -ENOTCONN;
326                 goto connected;
327         case RDMA_CM_EVENT_UNREACHABLE:
328                 connstate = -ENETDOWN;
329                 goto connected;
330         case RDMA_CM_EVENT_REJECTED:
331                 connstate = -ECONNREFUSED;
332                 goto connected;
333         case RDMA_CM_EVENT_DISCONNECTED:
334                 connstate = -ECONNABORTED;
335                 goto connected;
336         case RDMA_CM_EVENT_DEVICE_REMOVAL:
337                 connstate = -ENODEV;
338 connected:
339                 dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
340                         __func__,
341                         (event->event < ARRAY_SIZE(conn)) ? conn[event->event] :
342                                                 "unknown connection error",
343                         &addr->sin_addr.s_addr,
344                         ntohs(addr->sin_port),
345                         ep, event->event);
346                 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
347                 dprintk("RPC:       %s: %sconnected\n",
348                                         __func__, connstate > 0 ? "" : "dis");
349                 ep->rep_connected = connstate;
350                 ep->rep_func(ep);
351                 wake_up_all(&ep->rep_connect_wait);
352                 break;
353         default:
354                 dprintk("RPC:       %s: unexpected CM event %d\n",
355                         __func__, event->event);
356                 break;
357         }
358
359 #ifdef RPC_DEBUG
360         if (connstate == 1) {
361                 int ird = attr.max_dest_rd_atomic;
362                 int tird = ep->rep_remote_cma.responder_resources;
363                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
364                         "on %s, memreg %d slots %d ird %d%s\n",
365                         &addr->sin_addr.s_addr,
366                         ntohs(addr->sin_port),
367                         ia->ri_id->device->name,
368                         ia->ri_memreg_strategy,
369                         xprt->rx_buf.rb_max_requests,
370                         ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
371         } else if (connstate < 0) {
372                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
373                         &addr->sin_addr.s_addr,
374                         ntohs(addr->sin_port),
375                         connstate);
376         }
377 #endif
378
379         return 0;
380 }
381
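/*
 * Resolve an address and route for a new rdma_cm_id. Both steps are
 * asynchronous: the CM event handler above stores its result in
 * ia->ri_async_rc and completes ia->ri_done, while ri_async_rc is
 * primed with -ETIMEDOUT in case the wait expires first.
 */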
382 static struct rdma_cm_id *
383 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
384                         struct rpcrdma_ia *ia, struct sockaddr *addr)
385 {
386         struct rdma_cm_id *id;
387         int rc;
388
389         init_completion(&ia->ri_done);
390
391         id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
392         if (IS_ERR(id)) {
393                 rc = PTR_ERR(id);
394                 dprintk("RPC:       %s: rdma_create_id() failed %i\n",
395                         __func__, rc);
396                 return id;
397         }
398
399         ia->ri_async_rc = -ETIMEDOUT;
400         rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
401         if (rc) {
402                 dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
403                         __func__, rc);
404                 goto out;
405         }
406         wait_for_completion_interruptible_timeout(&ia->ri_done,
407                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
408         rc = ia->ri_async_rc;
409         if (rc)
410                 goto out;
411
412         ia->ri_async_rc = -ETIMEDOUT;
413         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
414         if (rc) {
415                 dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
416                         __func__, rc);
417                 goto out;
418         }
419         wait_for_completion_interruptible_timeout(&ia->ri_done,
420                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
421         rc = ia->ri_async_rc;
422         if (rc)
423                 goto out;
424
425         return id;
426
427 out:
428         rdma_destroy_id(id);
429         return ERR_PTR(rc);
430 }
431
432 /*
433  * Drain any cq, prior to teardown.
434  */
435 static void
436 rpcrdma_clean_cq(struct ib_cq *cq)
437 {
438         struct ib_wc wc;
439         int count = 0;
440
441         while (1 == ib_poll_cq(cq, 1, &wc))
442                 ++count;
443
444         if (count)
445                 dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
446                         __func__, count, wc.opcode);
447 }
448
449 /*
450  * Exported functions.
451  */
452
453 /*
454  * Open and initialize an Interface Adapter.
455  *  o initializes fields of struct rpcrdma_ia, including
456  *    interface and provider attributes and protection zone.
457  */
458 int
459 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
460 {
461         int rc, mem_priv;
462         struct ib_device_attr devattr;
463         struct rpcrdma_ia *ia = &xprt->rx_ia;
464
465         ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
466         if (IS_ERR(ia->ri_id)) {
467                 rc = PTR_ERR(ia->ri_id);
468                 goto out1;
469         }
470
471         ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
472         if (IS_ERR(ia->ri_pd)) {
473                 rc = PTR_ERR(ia->ri_pd);
474                 dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
475                         __func__, rc);
476                 goto out2;
477         }
478
479         /*
480          * Query the device to determine if the requested memory
481          * registration strategy is supported. If it isn't, set the
482          * strategy to a globally supported model.
483          */
484         rc = ib_query_device(ia->ri_id->device, &devattr);
485         if (rc) {
486                 dprintk("RPC:       %s: ib_query_device failed %d\n",
487                         __func__, rc);
488                 goto out3;
489         }
490
491         if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
492                 ia->ri_have_dma_lkey = 1;
493                 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
494         }
495
496         switch (memreg) {
497         case RPCRDMA_MEMWINDOWS:
498         case RPCRDMA_MEMWINDOWS_ASYNC:
499                 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
500                         dprintk("RPC:       %s: MEMWINDOWS registration "
501                                 "specified but not supported by adapter, "
502                                 "using slower RPCRDMA_REGISTER\n",
503                                 __func__);
504                         memreg = RPCRDMA_REGISTER;
505                 }
506                 break;
507         case RPCRDMA_MTHCAFMR:
508                 if (!ia->ri_id->device->alloc_fmr) {
509 #if RPCRDMA_PERSISTENT_REGISTRATION
510                         dprintk("RPC:       %s: MTHCAFMR registration "
511                                 "specified but not supported by adapter, "
512                                 "using riskier RPCRDMA_ALLPHYSICAL\n",
513                                 __func__);
514                         memreg = RPCRDMA_ALLPHYSICAL;
515 #else
516                         dprintk("RPC:       %s: MTHCAFMR registration "
517                                 "specified but not supported by adapter, "
518                                 "using slower RPCRDMA_REGISTER\n",
519                                 __func__);
520                         memreg = RPCRDMA_REGISTER;
521 #endif
522                 }
523                 break;
524         case RPCRDMA_FRMR:
525                 /* Requires both frmr reg and local dma lkey */
526                 if ((devattr.device_cap_flags &
527                      (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
528                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
529 #if RPCRDMA_PERSISTENT_REGISTRATION
530                         dprintk("RPC:       %s: FRMR registration "
531                                 "specified but not supported by adapter, "
532                                 "using riskier RPCRDMA_ALLPHYSICAL\n",
533                                 __func__);
534                         memreg = RPCRDMA_ALLPHYSICAL;
535 #else
536                         dprintk("RPC:       %s: FRMR registration "
537                                 "specified but not supported by adapter, "
538                                 "using slower RPCRDMA_REGISTER\n",
539                                 __func__);
540                         memreg = RPCRDMA_REGISTER;
541 #endif
542                 }
543                 break;
544         }
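        /* Note: unsupported FRMR/FMR modes fall back to RPCRDMA_ALLPHYSICAL
         * when persistent registration is compiled in (otherwise to the
         * slower RPCRDMA_REGISTER); unsupported memory-window modes always
         * fall back to RPCRDMA_REGISTER.
         */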
545
546         /*
547          * Optionally obtain an underlying physical identity mapping in
548          * order to do a memory window-based bind. This base registration
549          * is protected from remote access; remote access is enabled only by
550          * binding the specific bytes targeted during each RPC operation, and
551          * is revoked after the corresponding completion, much as a storage
552          * adapter would do.
553          */
554         switch (memreg) {
555         case RPCRDMA_BOUNCEBUFFERS:
556         case RPCRDMA_REGISTER:
557         case RPCRDMA_FRMR:
558                 break;
559 #if RPCRDMA_PERSISTENT_REGISTRATION
560         case RPCRDMA_ALLPHYSICAL:
561                 mem_priv = IB_ACCESS_LOCAL_WRITE |
562                                 IB_ACCESS_REMOTE_WRITE |
563                                 IB_ACCESS_REMOTE_READ;
564                 goto register_setup;
565 #endif
566         case RPCRDMA_MEMWINDOWS_ASYNC:
567         case RPCRDMA_MEMWINDOWS:
568                 mem_priv = IB_ACCESS_LOCAL_WRITE |
569                                 IB_ACCESS_MW_BIND;
570                 goto register_setup;
571         case RPCRDMA_MTHCAFMR:
572                 if (ia->ri_have_dma_lkey)
573                         break;
574                 mem_priv = IB_ACCESS_LOCAL_WRITE;
575         register_setup:
576                 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
577                 if (IS_ERR(ia->ri_bind_mem)) {
578                         printk(KERN_ALERT "%s: ib_get_dma_mr for "
579                                 "phys register failed with %lX\n\t"
580                                 "Will continue with degraded performance\n",
581                                 __func__, PTR_ERR(ia->ri_bind_mem));
582                         memreg = RPCRDMA_REGISTER;
583                         ia->ri_bind_mem = NULL;
584                 }
585                 break;
586         default:
587                 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
588                                 __func__, memreg);
589                 rc = -EINVAL;
590                 goto out3;
591         }
592         dprintk("RPC:       %s: memory registration strategy is %d\n",
593                 __func__, memreg);
594
595         /* Else will do memory reg/dereg for each chunk */
596         ia->ri_memreg_strategy = memreg;
597
598         return 0;
599
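/*
 * Error unwind: release resources in the reverse order of acquisition,
 * first the PD and then the CM ID.
 */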
600 out3:
601         ib_dealloc_pd(ia->ri_pd);
602         ia->ri_pd = NULL;
603 out2:
604         rdma_destroy_id(ia->ri_id);
605         ia->ri_id = NULL;
606 out1:
607         return rc;
608 }
609
610 /*
611  * Clean up/close an IA.
612  *   o if event handles and PD have been initialized, free them.
613  *   o close the IA
614  */
615 void
616 rpcrdma_ia_close(struct rpcrdma_ia *ia)
617 {
618         int rc;
619
620         dprintk("RPC:       %s: entering\n", __func__);
621         if (ia->ri_bind_mem != NULL) {
622                 rc = ib_dereg_mr(ia->ri_bind_mem);
623                 dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
624                         __func__, rc);
625         }
626         if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
627                 if (ia->ri_id->qp)
628                         rdma_destroy_qp(ia->ri_id);
629                 rdma_destroy_id(ia->ri_id);
630                 ia->ri_id = NULL;
631         }
632         if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
633                 rc = ib_dealloc_pd(ia->ri_pd);
634                 dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
635                         __func__, rc);
636         }
637 }
638
639 /*
640  * Create unconnected endpoint.
641  */
642 int
643 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
644                                 struct rpcrdma_create_data_internal *cdata)
645 {
646         struct ib_device_attr devattr;
647         int rc, err;
648
649         rc = ib_query_device(ia->ri_id->device, &devattr);
650         if (rc) {
651                 dprintk("RPC:       %s: ib_query_device failed %d\n",
652                         __func__, rc);
653                 return rc;
654         }
655
656         /* check provider's send/recv wr limits */
657         if (cdata->max_requests > devattr.max_qp_wr)
658                 cdata->max_requests = devattr.max_qp_wr;
659
660         ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
661         ep->rep_attr.qp_context = ep;
662         /* send_cq and recv_cq initialized below */
663         ep->rep_attr.srq = NULL;
664         ep->rep_attr.cap.max_send_wr = cdata->max_requests;
665         switch (ia->ri_memreg_strategy) {
666         case RPCRDMA_FRMR:
667                 /* Add room for frmr register and invalidate WRs.
668                  * 1. FRMR reg WR for head
669                  * 2. FRMR invalidate WR for head
670                  * 3. FRMR reg WR for pagelist
671                  * 4. FRMR invalidate WR for pagelist
672                  * 5. FRMR reg WR for tail
673                  * 6. FRMR invalidate WR for tail
674                  * 7. The RDMA_SEND WR
675                  */
676                 ep->rep_attr.cap.max_send_wr *= 7;
677                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
678                         cdata->max_requests = devattr.max_qp_wr / 7;
679                         if (!cdata->max_requests)
680                                 return -EINVAL;
681                         ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
682                 }
683                 break;
684         case RPCRDMA_MEMWINDOWS_ASYNC:
685         case RPCRDMA_MEMWINDOWS:
686                 /* Add room for mw_binds+unbinds - overkill! */
687                 ep->rep_attr.cap.max_send_wr++;
688                 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
689                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
690                         return -EINVAL;
691                 break;
692         default:
693                 break;
694         }
695         ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
696         ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
697         ep->rep_attr.cap.max_recv_sge = 1;
698         ep->rep_attr.cap.max_inline_data = 0;
699         ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
700         ep->rep_attr.qp_type = IB_QPT_RC;
701         ep->rep_attr.port_num = ~0;
702
703         dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
704                 "iovs: send %d recv %d\n",
705                 __func__,
706                 ep->rep_attr.cap.max_send_wr,
707                 ep->rep_attr.cap.max_recv_wr,
708                 ep->rep_attr.cap.max_send_sge,
709                 ep->rep_attr.cap.max_recv_sge);
710
711         /* set trigger for requesting send completion */
712         ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
713         switch (ia->ri_memreg_strategy) {
714         case RPCRDMA_MEMWINDOWS_ASYNC:
715         case RPCRDMA_MEMWINDOWS:
716                 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
717                 break;
718         default:
719                 break;
720         }
721         if (ep->rep_cqinit <= 2)
722                 ep->rep_cqinit = 0;
723         INIT_CQCOUNT(ep);
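        /* rep_cqinit batches send completions: with IB_SIGNAL_REQ_WR set
         * above, sends are unsignalled by default, and the DECR_CQCOUNT()
         * logic in the send path (see xprt_rdma.h) requests a signalled
         * send roughly every rep_cqinit posts. A value of 0 signals
         * every send.
         */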
724         ep->rep_ia = ia;
725         init_waitqueue_head(&ep->rep_connect_wait);
726
727         /*
728          * Create a single cq for receive dto and mw_bind (only ever
729          * care about unbind, really). Send completions are suppressed.
730          * Use single threaded tasklet upcalls to maintain ordering.
731          */
732         ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
733                                   rpcrdma_cq_async_error_upcall, NULL,
734                                   ep->rep_attr.cap.max_recv_wr +
735                                   ep->rep_attr.cap.max_send_wr + 1, 0);
736         if (IS_ERR(ep->rep_cq)) {
737                 rc = PTR_ERR(ep->rep_cq);
738                 dprintk("RPC:       %s: ib_create_cq failed: %i\n",
739                         __func__, rc);
740                 goto out1;
741         }
742
743         rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
744         if (rc) {
745                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
746                         __func__, rc);
747                 goto out2;
748         }
749
750         ep->rep_attr.send_cq = ep->rep_cq;
751         ep->rep_attr.recv_cq = ep->rep_cq;
752
753         /* Initialize cma parameters */
754
755         /* RPC/RDMA does not use private data */
756         ep->rep_remote_cma.private_data = NULL;
757         ep->rep_remote_cma.private_data_len = 0;
758
759         /* Client offers RDMA Read but does not initiate */
760         ep->rep_remote_cma.initiator_depth = 0;
761         if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
762                 ep->rep_remote_cma.responder_resources = 0;
763         else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
764                 ep->rep_remote_cma.responder_resources = 32;
765         else
766                 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
767
768         ep->rep_remote_cma.retry_count = 7;
769         ep->rep_remote_cma.flow_control = 0;
770         ep->rep_remote_cma.rnr_retry_count = 0;
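        /* retry_count is a 3-bit CM field, so 7 is the largest value the
         * wire format allows. RNR retries are left at 0 because receive
         * buffers are expected to be posted before replies can arrive.
         */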
771
772         return 0;
773
774 out2:
775         err = ib_destroy_cq(ep->rep_cq);
776         if (err)
777                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
778                         __func__, err);
779 out1:
780         return rc;
781 }
782
783 /*
784  * rpcrdma_ep_destroy
785  *
786  * Disconnect and destroy endpoint. After this, the only
787  * valid operations on the ep are to free it (if dynamically
788  * allocated) or re-create it.
789  *
790  * The caller's error handling must be sure to not leak the endpoint
791  * if this function fails.
792  */
793 int
794 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
795 {
796         int rc;
797
798         dprintk("RPC:       %s: entering, connected is %d\n",
799                 __func__, ep->rep_connected);
800
801         if (ia->ri_id->qp) {
802                 rc = rpcrdma_ep_disconnect(ep, ia);
803                 if (rc)
804                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
805                                 " returned %i\n", __func__, rc);
806                 rdma_destroy_qp(ia->ri_id);
807                 ia->ri_id->qp = NULL;
808         }
809
810         /* padding - could be done in rpcrdma_buffer_destroy... */
811         if (ep->rep_pad_mr) {
812                 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
813                 ep->rep_pad_mr = NULL;
814         }
815
816         rpcrdma_clean_cq(ep->rep_cq);
817         rc = ib_destroy_cq(ep->rep_cq);
818         if (rc)
819                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
820                         __func__, rc);
821
822         return rc;
823 }
824
825 /*
826  * Connect unconnected endpoint.
827  */
828 int
829 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
830 {
831         struct rdma_cm_id *id;
832         int rc = 0;
833         int retry_count = 0;
834
835         if (ep->rep_connected != 0) {
836                 struct rpcrdma_xprt *xprt;
837 retry:
838                 rc = rpcrdma_ep_disconnect(ep, ia);
839                 if (rc && rc != -ENOTCONN)
840                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
841                                 " status %i\n", __func__, rc);
842                 rpcrdma_clean_cq(ep->rep_cq);
843
844                 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
845                 id = rpcrdma_create_id(xprt, ia,
846                                 (struct sockaddr *)&xprt->rx_data.addr);
847                 if (IS_ERR(id)) {
848                         rc = PTR_ERR(id);
849                         goto out;
850                 }
851                 /* TEMP TEMP TEMP - fail if new device:
852                  * Deregister/remarshal *all* requests!
853                  * Close and recreate adapter, pd, etc!
854                  * Re-determine all attributes still sane!
855                  * More stuff I haven't thought of!
856                  * Rrrgh!
857                  */
858                 if (ia->ri_id->device != id->device) {
859                         printk(KERN_ERR "RPC:       %s: can't reconnect on "
860                                 "different device!\n", __func__);
861                         rdma_destroy_id(id);
862                         rc = -ENETDOWN;
863                         goto out;
864                 }
865                 /* END TEMP */
866                 rdma_destroy_qp(ia->ri_id);
867                 rdma_destroy_id(ia->ri_id);
868                 ia->ri_id = id;
869         }
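        /* At this point ia->ri_id refers to a fresh, unconnected cm_id on
         * the original device; an id that has been through a disconnect is
         * not reused for a new connection attempt.
         */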
870
871         rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
872         if (rc) {
873                 dprintk("RPC:       %s: rdma_create_qp failed %i\n",
874                         __func__, rc);
875                 goto out;
876         }
877
878 /* XXX Tavor device performs badly with 2K MTU! */
879 if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
880         struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
881         if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
882             (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
883              pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
884                 struct ib_qp_attr attr = {
885                         .path_mtu = IB_MTU_1024
886                 };
887                 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
888         }
889 }
890
891         ep->rep_connected = 0;
892
893         rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
894         if (rc) {
895                 dprintk("RPC:       %s: rdma_connect() failed with %i\n",
896                                 __func__, rc);
897                 goto out;
898         }
899
900         wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
901
902         /*
903          * Check state. A non-peer reject indicates no listener
904          * (ECONNREFUSED), which may be a transient state. All
905          * others indicate a transport condition that has already
906          * undergone best-effort recovery.
907          */
908         if (ep->rep_connected == -ECONNREFUSED &&
909             ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
910                 dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
911                 goto retry;
912         }
913         if (ep->rep_connected <= 0) {
914                 /* Sometimes, the only way to reliably connect to remote
915                  * CMs is to use the same nonzero values for ORD and IRD. */
916                 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
917                     (ep->rep_remote_cma.responder_resources == 0 ||
918                      ep->rep_remote_cma.initiator_depth !=
919                                 ep->rep_remote_cma.responder_resources)) {
920                         if (ep->rep_remote_cma.responder_resources == 0)
921                                 ep->rep_remote_cma.responder_resources = 1;
922                         ep->rep_remote_cma.initiator_depth =
923                                 ep->rep_remote_cma.responder_resources;
924                         goto retry;
925                 }
926                 rc = ep->rep_connected;
927         } else {
928                 dprintk("RPC:       %s: connected\n", __func__);
929         }
930
931 out:
932         if (rc)
933                 ep->rep_connected = rc;
934         return rc;
935 }
936
937 /*
938  * rpcrdma_ep_disconnect
939  *
940  * This is separate from destroy to facilitate the ability
941  * to reconnect without recreating the endpoint.
942  *
943  * This call is not reentrant, and must not be made in parallel
944  * on the same endpoint.
945  */
946 int
947 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
948 {
949         int rc;
950
951         rpcrdma_clean_cq(ep->rep_cq);
952         rc = rdma_disconnect(ia->ri_id);
953         if (!rc) {
954                 /* returns without wait if not connected */
955                 wait_event_interruptible(ep->rep_connect_wait,
956                                                         ep->rep_connected != 1);
957                 dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
958                         (ep->rep_connected == 1) ? "still " : "dis");
959         } else {
960                 dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
961                 ep->rep_connected = rc;
962         }
963         return rc;
964 }
965
966 /*
967  * Initialize buffer memory
968  */
969 int
970 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
971         struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
972 {
973         char *p;
974         size_t len;
975         int i, rc;
976         struct rpcrdma_mw *r;
977
978         buf->rb_max_requests = cdata->max_requests;
979         spin_lock_init(&buf->rb_lock);
980         atomic_set(&buf->rb_credits, 1);
981
982         /* Need to allocate:
983          *   1.  arrays for send and recv pointers
984          *   2.  arrays of struct rpcrdma_req to fill in pointers
985          *   3.  array of struct rpcrdma_rep for replies
986          *   4.  padding, if any
987          *   5.  mw's, fmr's or frmr's, if any
988          * Send/recv buffers in req/rep need to be registered
989          */
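        /* Items 1, 4 and 5 come from the single rb_pool allocation below;
         * the req and rep structures themselves (items 2 and 3) are
         * kmalloc'ed individually further down.
         */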
990
991         len = buf->rb_max_requests *
992                 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
993         len += cdata->padding;
994         switch (ia->ri_memreg_strategy) {
995         case RPCRDMA_FRMR:
996                 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
997                                 sizeof(struct rpcrdma_mw);
998                 break;
999         case RPCRDMA_MTHCAFMR:
1000                 /* TBD we are perhaps overallocating here */
1001                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1002                                 sizeof(struct rpcrdma_mw);
1003                 break;
1004         case RPCRDMA_MEMWINDOWS_ASYNC:
1005         case RPCRDMA_MEMWINDOWS:
1006                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1007                                 sizeof(struct rpcrdma_mw);
1008                 break;
1009         default:
1010                 break;
1011         }
1012
1013         /* allocate 1, 4 and 5 in one shot */
1014         p = kzalloc(len, GFP_KERNEL);
1015         if (p == NULL) {
1016                 dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1017                         __func__, len);
1018                 rc = -ENOMEM;
1019                 goto out;
1020         }
1021         buf->rb_pool = p;       /* for freeing it later */
1022
1023         buf->rb_send_bufs = (struct rpcrdma_req **) p;
1024         p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1025         buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1026         p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1027
1028         /*
1029          * Register the zeroed pad buffer, if any.
1030          */
1031         if (cdata->padding) {
1032                 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1033                                             &ep->rep_pad_mr, &ep->rep_pad);
1034                 if (rc)
1035                         goto out;
1036         }
1037         p += cdata->padding;
1038
1039         /*
1040          * Allocate the fmr's, or mw's for mw_bind chunk registration.
1041          * We "cycle" the mw's in order to minimize rkey reuse,
1042          * and also reduce unbind-to-bind collision.
1043          */
1044         INIT_LIST_HEAD(&buf->rb_mws);
1045         r = (struct rpcrdma_mw *)p;
1046         switch (ia->ri_memreg_strategy) {
1047         case RPCRDMA_FRMR:
1048                 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1049                         r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1050                                                          RPCRDMA_MAX_SEGS);
1051                         if (IS_ERR(r->r.frmr.fr_mr)) {
1052                                 rc = PTR_ERR(r->r.frmr.fr_mr);
1053                                 dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
1054                                         " failed %i\n", __func__, rc);
1055                                 goto out;
1056                         }
1057                         r->r.frmr.fr_pgl =
1058                                 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1059                                                             RPCRDMA_MAX_SEGS);
1060                         if (IS_ERR(r->r.frmr.fr_pgl)) {
1061                                 rc = PTR_ERR(r->r.frmr.fr_pgl);
1062                                 dprintk("RPC:       %s: "
1063                                         "ib_alloc_fast_reg_page_list "
1064                                         "failed %i\n", __func__, rc);
1065                                 goto out;
1066                         }
1067                         list_add(&r->mw_list, &buf->rb_mws);
1068                         ++r;
1069                 }
1070                 break;
1071         case RPCRDMA_MTHCAFMR:
1072                 /* TBD we are perhaps overallocating here */
1073                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1074                         static struct ib_fmr_attr fa =
1075                                 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1076                         r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1077                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1078                                 &fa);
1079                         if (IS_ERR(r->r.fmr)) {
1080                                 rc = PTR_ERR(r->r.fmr);
1081                                 dprintk("RPC:       %s: ib_alloc_fmr"
1082                                         " failed %i\n", __func__, rc);
1083                                 goto out;
1084                         }
1085                         list_add(&r->mw_list, &buf->rb_mws);
1086                         ++r;
1087                 }
1088                 break;
1089         case RPCRDMA_MEMWINDOWS_ASYNC:
1090         case RPCRDMA_MEMWINDOWS:
1091                 /* Allocate one extra request's worth, for full cycling */
1092                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1093                         r->r.mw = ib_alloc_mw(ia->ri_pd);
1094                         if (IS_ERR(r->r.mw)) {
1095                                 rc = PTR_ERR(r->r.mw);
1096                                 dprintk("RPC:       %s: ib_alloc_mw"
1097                                         " failed %i\n", __func__, rc);
1098                                 goto out;
1099                         }
1100                         list_add(&r->mw_list, &buf->rb_mws);
1101                         ++r;
1102                 }
1103                 break;
1104         default:
1105                 break;
1106         }
1107
1108         /*
1109          * Allocate/init the request/reply buffers. Doing this
1110          * using kmalloc for now -- one for each buf.
1111          */
1112         for (i = 0; i < buf->rb_max_requests; i++) {
1113                 struct rpcrdma_req *req;
1114                 struct rpcrdma_rep *rep;
1115
1116                 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1117                 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1118                 /* Typical ~2400b, so rounding up saves work later */
1119                 if (len < 4096)
1120                         len = 4096;
1121                 req = kmalloc(len, GFP_KERNEL);
1122                 if (req == NULL) {
1123                         dprintk("RPC:       %s: request buffer %d alloc"
1124                                 " failed\n", __func__, i);
1125                         rc = -ENOMEM;
1126                         goto out;
1127                 }
1128                 memset(req, 0, sizeof(struct rpcrdma_req));
1129                 buf->rb_send_bufs[i] = req;
1130                 buf->rb_send_bufs[i]->rl_buffer = buf;
1131
1132                 rc = rpcrdma_register_internal(ia, req->rl_base,
1133                                 len - offsetof(struct rpcrdma_req, rl_base),
1134                                 &buf->rb_send_bufs[i]->rl_handle,
1135                                 &buf->rb_send_bufs[i]->rl_iov);
1136                 if (rc)
1137                         goto out;
1138
1139                 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1140
1141                 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1142                 rep = kmalloc(len, GFP_KERNEL);
1143                 if (rep == NULL) {
1144                         dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1145                                 __func__, i);
1146                         rc = -ENOMEM;
1147                         goto out;
1148                 }
1149                 memset(rep, 0, sizeof(struct rpcrdma_rep));
1150                 buf->rb_recv_bufs[i] = rep;
1151                 buf->rb_recv_bufs[i]->rr_buffer = buf;
1152                 init_waitqueue_head(&rep->rr_unbind);
1153
1154                 rc = rpcrdma_register_internal(ia, rep->rr_base,
1155                                 len - offsetof(struct rpcrdma_rep, rr_base),
1156                                 &buf->rb_recv_bufs[i]->rr_handle,
1157                                 &buf->rb_recv_bufs[i]->rr_iov);
1158                 if (rc)
1159                         goto out;
1160
1161         }
1162         dprintk("RPC:       %s: max_requests %d\n",
1163                 __func__, buf->rb_max_requests);
1164         /* done */
1165         return 0;
1166 out:
1167         rpcrdma_buffer_destroy(buf);
1168         return rc;
1169 }
1170
1171 /*
1172  * Unregister and destroy buffer memory. Need to deal with
1173  * partial initialization, so it's callable from failed create.
1174  * Must be called before destroying endpoint, as registrations
1175  * reference it.
1176  */
1177 void
1178 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1179 {
1180         int rc, i;
1181         struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1182         struct rpcrdma_mw *r;
1183
1184         /* clean up in reverse order from create
1185          *   1.  recv mr memory (mr free, then kfree)
1186          *   1a. bind mw memory
1187          *   2.  send mr memory (mr free, then kfree)
1188          *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
1189          *   4.  arrays
1190          */
1191         dprintk("RPC:       %s: entering\n", __func__);
1192
1193         for (i = 0; i < buf->rb_max_requests; i++) {
1194                 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1195                         rpcrdma_deregister_internal(ia,
1196                                         buf->rb_recv_bufs[i]->rr_handle,
1197                                         &buf->rb_recv_bufs[i]->rr_iov);
1198                         kfree(buf->rb_recv_bufs[i]);
1199                 }
1200                 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1201                         while (!list_empty(&buf->rb_mws)) {
1202                                 r = list_entry(buf->rb_mws.next,
1203                                         struct rpcrdma_mw, mw_list);
1204                                 list_del(&r->mw_list);
1205                                 switch (ia->ri_memreg_strategy) {
1206                                 case RPCRDMA_FRMR:
1207                                         rc = ib_dereg_mr(r->r.frmr.fr_mr);
1208                                         if (rc)
1209                                                 dprintk("RPC:       %s:"
1210                                                         " ib_dereg_mr"
1211                                                         " failed %i\n",
1212                                                         __func__, rc);
1213                                         ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1214                                         break;
1215                                 case RPCRDMA_MTHCAFMR:
1216                                         rc = ib_dealloc_fmr(r->r.fmr);
1217                                         if (rc)
1218                                                 dprintk("RPC:       %s:"
1219                                                         " ib_dealloc_fmr"
1220                                                         " failed %i\n",
1221                                                         __func__, rc);
1222                                         break;
1223                                 case RPCRDMA_MEMWINDOWS_ASYNC:
1224                                 case RPCRDMA_MEMWINDOWS:
1225                                         rc = ib_dealloc_mw(r->r.mw);
1226                                         if (rc)
1227                                                 dprintk("RPC:       %s:"
1228                                                         " ib_dealloc_mw"
1229                                                         " failed %i\n",
1230                                                         __func__, rc);
1231                                         break;
1232                                 default:
1233                                         break;
1234                                 }
1235                         }
1236                         rpcrdma_deregister_internal(ia,
1237                                         buf->rb_send_bufs[i]->rl_handle,
1238                                         &buf->rb_send_bufs[i]->rl_iov);
1239                         kfree(buf->rb_send_bufs[i]);
1240                 }
1241         }
1242
1243         kfree(buf->rb_pool);
1244 }
1245
1246 /*
1247  * Get a set of request/reply buffers.
1248  *
1249  * Reply buffer (if needed) is attached to send buffer upon return.
1250  * Rule:
1251  *    rb_send_index and rb_recv_index MUST always be pointing to the
1252  *    *next* available buffer (non-NULL). They are incremented after
1253  *    removing buffers, and decremented *before* returning them.
1254  */
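/*
 * For example, with rb_max_requests == 2: two gets advance rb_send_index
 * to 2 (pool exhausted); the next put pre-decrements it back to 1, which
 * again names the slot just refilled.
 */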
1255 struct rpcrdma_req *
1256 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1257 {
1258         struct rpcrdma_req *req;
1259         unsigned long flags;
1260         int i;
1261         struct rpcrdma_mw *r;
1262
1263         spin_lock_irqsave(&buffers->rb_lock, flags);
1264         if (buffers->rb_send_index == buffers->rb_max_requests) {
1265                 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1266                 dprintk("RPC:       %s: out of request buffers\n", __func__);
1267                 return NULL;
1268         }
1269
1270         req = buffers->rb_send_bufs[buffers->rb_send_index];
1271         if (buffers->rb_send_index < buffers->rb_recv_index) {
1272                 dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1273                         __func__,
1274                         buffers->rb_recv_index - buffers->rb_send_index);
1275                 req->rl_reply = NULL;
1276         } else {
1277                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1278                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1279         }
1280         buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1281         if (!list_empty(&buffers->rb_mws)) {
1282                 i = RPCRDMA_MAX_SEGS - 1;
1283                 do {
1284                         r = list_entry(buffers->rb_mws.next,
1285                                         struct rpcrdma_mw, mw_list);
1286                         list_del(&r->mw_list);
1287                         req->rl_segments[i].mr_chunk.rl_mw = r;
1288                 } while (--i >= 0);
1289         }
1290         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1291         return req;
1292 }
1293
1294 /*
1295  * Put request/reply buffers back into pool.
1296  * Pre-decrement counter/array index.
1297  */
1298 void
1299 rpcrdma_buffer_put(struct rpcrdma_req *req)
1300 {
1301         struct rpcrdma_buffer *buffers = req->rl_buffer;
1302         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1303         int i;
1304         unsigned long flags;
1305
1306         BUG_ON(req->rl_nchunks != 0);
1307         spin_lock_irqsave(&buffers->rb_lock, flags);
1308         buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1309         req->rl_niovs = 0;
1310         if (req->rl_reply) {
1311                 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1312                 init_waitqueue_head(&req->rl_reply->rr_unbind);
1313                 req->rl_reply->rr_func = NULL;
1314                 req->rl_reply = NULL;
1315         }
1316         switch (ia->ri_memreg_strategy) {
1317         case RPCRDMA_FRMR:
1318         case RPCRDMA_MTHCAFMR:
1319         case RPCRDMA_MEMWINDOWS_ASYNC:
1320         case RPCRDMA_MEMWINDOWS:
1321                 /*
1322                  * Cycle mw's back in reverse order, and "spin" them.
1323                  * This delays and scrambles reuse as much as possible.
1324                  */
1325                 i = 1;
1326                 do {
1327                         struct rpcrdma_mw **mw;
1328                         mw = &req->rl_segments[i].mr_chunk.rl_mw;
1329                         list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1330                         *mw = NULL;
1331                 } while (++i < RPCRDMA_MAX_SEGS);
1332                 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1333                                         &buffers->rb_mws);
1334                 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1335                 break;
1336         default:
1337                 break;
1338         }
1339         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1340 }
1341
1342 /*
1343  * Recover reply buffers from pool.
1344  * This happens when recovering from error conditions.
1345  * Post-increment counter/array index.
1346  */
1347 void
1348 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1349 {
1350         struct rpcrdma_buffer *buffers = req->rl_buffer;
1351         unsigned long flags;
1352
1353         if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1354                 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1355         spin_lock_irqsave(&buffers->rb_lock, flags);
1356         if (buffers->rb_recv_index < buffers->rb_max_requests) {
1357                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1358                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1359         }
1360         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1361 }
1362
1363 /*
1364  * Put reply buffers back into pool when not attached to
1365  * request. This happens in error conditions, and when
1366  * aborting unbinds. Pre-decrement counter/array index.
1367  */
1368 void
1369 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1370 {
1371         struct rpcrdma_buffer *buffers = rep->rr_buffer;
1372         unsigned long flags;
1373
1374         rep->rr_func = NULL;
1375         spin_lock_irqsave(&buffers->rb_lock, flags);
1376         buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1377         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1378 }
1379
1380 /*
1381  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1382  */
1383
1384 int
1385 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1386                                 struct ib_mr **mrp, struct ib_sge *iov)
1387 {
1388         struct ib_phys_buf ipb;
1389         struct ib_mr *mr;
1390         int rc;
1391
1392         /*
1393          * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1394          */
1395         iov->addr = ib_dma_map_single(ia->ri_id->device,
1396                         va, len, DMA_BIDIRECTIONAL);
1397         iov->length = len;
1398
1399         if (ia->ri_have_dma_lkey) {
1400                 *mrp = NULL;
1401                 iov->lkey = ia->ri_dma_lkey;
1402                 return 0;
1403         } else if (ia->ri_bind_mem != NULL) {
1404                 *mrp = NULL;
1405                 iov->lkey = ia->ri_bind_mem->lkey;
1406                 return 0;
1407         }
1408
1409         ipb.addr = iov->addr;
1410         ipb.size = iov->length;
1411         mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1412                         IB_ACCESS_LOCAL_WRITE, &iov->addr);
1413
1414         dprintk("RPC:       %s: phys convert: 0x%llx "
1415                         "registered 0x%llx length %d\n",
1416                         __func__, (unsigned long long)ipb.addr,
1417                         (unsigned long long)iov->addr, len);
1418
1419         if (IS_ERR(mr)) {
1420                 *mrp = NULL;
1421                 rc = PTR_ERR(mr);
1422                 dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1423         } else {
1424                 *mrp = mr;
1425                 iov->lkey = mr->lkey;
1426                 rc = 0;
1427         }
1428
1429         return rc;
1430 }
1431
1432 int
1433 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1434                                 struct ib_mr *mr, struct ib_sge *iov)
1435 {
1436         int rc;
1437
1438         ib_dma_unmap_single(ia->ri_id->device,
1439                         iov->addr, iov->length, DMA_BIDIRECTIONAL);
1440
1441         if (mr == NULL)
1442                 return 0;
1443
1444         rc = ib_dereg_mr(mr);
1445         if (rc)
1446                 dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1447         return rc;
1448 }
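
/*
 * Editorial example, not part of the original verbs.c: a sketch of the
 * register/deregister pairing above for a kmalloc'ed buffer.  The helper
 * name and GFP flags are illustrative only; the sketch is kept under #if 0
 * so it is never built.
 */
#if 0
static int rpcrdma_internal_reg_sketch(struct rpcrdma_ia *ia, int len)
{
        struct ib_mr *mr;
        struct ib_sge iov;
        void *va;
        int rc;

        va = kmalloc(len, GFP_KERNEL);
        if (va == NULL)
                return -ENOMEM;

        rc = rpcrdma_register_internal(ia, va, len, &mr, &iov);
        if (rc == 0) {
                /* iov.addr, iov.length and iov.lkey are now usable in a WR */
                rpcrdma_deregister_internal(ia, mr, &iov);
        }
        kfree(va);
        return rc;
}
#endif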
1449
1450 /*
1451  * Wrappers for chunk registration, shared by read/write chunk code.
1452  */
1453
1454 static void
1455 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1456 {
1457         seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1458         seg->mr_dmalen = seg->mr_len;
1459         if (seg->mr_page)
1460                 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1461                                 seg->mr_page, offset_in_page(seg->mr_offset),
1462                                 seg->mr_dmalen, seg->mr_dir);
1463         else
1464                 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1465                                 seg->mr_offset,
1466                                 seg->mr_dmalen, seg->mr_dir);
1467         if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1468                 dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1469                         __func__,
1470                         (unsigned long long)seg->mr_dma,
1471                         seg->mr_offset, seg->mr_dmalen);
1472         }
1473 }
1474
1475 static void
1476 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1477 {
1478         if (seg->mr_page)
1479                 ib_dma_unmap_page(ia->ri_id->device,
1480                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1481         else
1482                 ib_dma_unmap_single(ia->ri_id->device,
1483                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1484 }
1485
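/*
 * FRMR registration: DMA-map up to *nsegs segments into the MW's fast_reg
 * page list, stopping early at the first page hole.  If the FRMR is still
 * marked valid from an earlier use, a LOCAL_INV work request is chained in
 * front of the FAST_REG_MR request, and the rkey is bumped before every
 * registration so stale handles cannot match.  On a posting failure all
 * mapped segments are unmapped again.
 */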
1486 static int
1487 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1488                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1489                         struct rpcrdma_xprt *r_xprt)
1490 {
1491         struct rpcrdma_mr_seg *seg1 = seg;
1492         struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1493
1494         u8 key;
1495         int len, pageoff;
1496         int i, rc;
1497
1498         pageoff = offset_in_page(seg1->mr_offset);
1499         seg1->mr_offset -= pageoff;     /* start of page */
1500         seg1->mr_len += pageoff;
1501         len = -pageoff;
1502         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1503                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1504         for (i = 0; i < *nsegs;) {
1505                 rpcrdma_map_one(ia, seg, writing);
1506                 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1507                 len += seg->mr_len;
1508                 BUG_ON(seg->mr_len > PAGE_SIZE);
1509                 ++seg;
1510                 ++i;
1511                 /* Check for holes */
1512                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1513                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1514                         break;
1515         }
1516         dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
1517                 __func__, seg1->mr_chunk.rl_mw, i);
1518
1519         if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1520                 dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
1521                         __func__,
1522                         seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1523                 /* Invalidate before using. */
1524                 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1525                 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1526                 invalidate_wr.next = &frmr_wr;
1527                 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1528                 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1529                 invalidate_wr.ex.invalidate_rkey =
1530                         seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1531                 DECR_CQCOUNT(&r_xprt->rx_ep);
1532                 post_wr = &invalidate_wr;
1533         } else
1534                 post_wr = &frmr_wr;
1535
1536         /* Bump the key */
1537         key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1538         ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1539
1540         /* Prepare FRMR WR */
1541         memset(&frmr_wr, 0, sizeof frmr_wr);
1542         frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1543         frmr_wr.opcode = IB_WR_FAST_REG_MR;
1544         frmr_wr.send_flags = IB_SEND_SIGNALED;
1545         frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1546         frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1547         frmr_wr.wr.fast_reg.page_list_len = i;
1548         frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1549         frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1550         BUG_ON(frmr_wr.wr.fast_reg.length < len);
1551         frmr_wr.wr.fast_reg.access_flags = (writing ?
1552                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1553                                 IB_ACCESS_REMOTE_READ);
1554         frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1555         DECR_CQCOUNT(&r_xprt->rx_ep);
1556
1557         rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1558
1559         if (rc) {
1560                 dprintk("RPC:       %s: failed ib_post_send for register,"
1561                         " status %i\n", __func__, rc);
1562                 while (i--)
1563                         rpcrdma_unmap_one(ia, --seg);
1564         } else {
1565                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1566                 seg1->mr_base = seg1->mr_dma + pageoff;
1567                 seg1->mr_nsegs = i;
1568                 seg1->mr_len = len;
1569         }
1570         *nsegs = i;
1571         return rc;
1572 }
1573
1574 static int
1575 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1576                         struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1577 {
1578         struct rpcrdma_mr_seg *seg1 = seg;
1579         struct ib_send_wr invalidate_wr, *bad_wr;
1580         int rc;
1581
1582         while (seg1->mr_nsegs--)
1583                 rpcrdma_unmap_one(ia, seg++);
1584
1585         memset(&invalidate_wr, 0, sizeof invalidate_wr);
1586         invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1587         invalidate_wr.opcode = IB_WR_LOCAL_INV;
1588         invalidate_wr.send_flags = IB_SEND_SIGNALED;
1589         invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1590         DECR_CQCOUNT(&r_xprt->rx_ep);
1591
1592         rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1593         if (rc)
1594                 dprintk("RPC:       %s: failed ib_post_send for invalidate,"
1595                         " status %i\n", __func__, rc);
1596         return rc;
1597 }
1598
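/*
 * FMR registration: collect the DMA address of each mapped segment (again
 * stopping at the first page hole) and hand the whole array to
 * ib_map_phys_fmr() in a single call.  Deregistration goes through
 * ib_unmap_fmr() on a one-entry list, after which the segments are
 * DMA-unmapped.
 */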
1599 static int
1600 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1601                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1602 {
1603         struct rpcrdma_mr_seg *seg1 = seg;
1604         u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1605         int len, pageoff, i, rc;
1606
1607         pageoff = offset_in_page(seg1->mr_offset);
1608         seg1->mr_offset -= pageoff;     /* start of page */
1609         seg1->mr_len += pageoff;
1610         len = -pageoff;
1611         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1612                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1613         for (i = 0; i < *nsegs;) {
1614                 rpcrdma_map_one(ia, seg, writing);
1615                 physaddrs[i] = seg->mr_dma;
1616                 len += seg->mr_len;
1617                 ++seg;
1618                 ++i;
1619                 /* Check for holes */
1620                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1621                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1622                         break;
1623         }
1624         rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1625                                 physaddrs, i, seg1->mr_dma);
1626         if (rc) {
1627                 dprintk("RPC:       %s: failed ib_map_phys_fmr "
1628                         "%u@0x%llx+%i (%d)... status %i\n", __func__,
1629                         len, (unsigned long long)seg1->mr_dma,
1630                         pageoff, i, rc);
1631                 while (i--)
1632                         rpcrdma_unmap_one(ia, --seg);
1633         } else {
1634                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1635                 seg1->mr_base = seg1->mr_dma + pageoff;
1636                 seg1->mr_nsegs = i;
1637                 seg1->mr_len = len;
1638         }
1639         *nsegs = i;
1640         return rc;
1641 }
1642
1643 static int
1644 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1645                         struct rpcrdma_ia *ia)
1646 {
1647         struct rpcrdma_mr_seg *seg1 = seg;
1648         LIST_HEAD(l);
1649         int rc;
1650
1651         list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1652         rc = ib_unmap_fmr(&l);
1653         while (seg1->mr_nsegs--)
1654                 rpcrdma_unmap_one(ia, seg++);
1655         if (rc)
1656                 dprintk("RPC:       %s: failed ib_unmap_fmr,"
1657                         " status %i\n", __func__, rc);
1658         return rc;
1659 }
1660
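/*
 * Memory-window registration: each segment is bound individually with
 * ib_bind_mw(), which posts a bind work request on the connection's send
 * queue.  Unbinding reuses the same verb with a zero-length bind; when a
 * reply is waiting on the unbind, the work request is posted signaled so
 * its completion can drive the reply upcall.
 */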
1661 static int
1662 rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1663                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1664                         struct rpcrdma_xprt *r_xprt)
1665 {
1666         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1667                                   IB_ACCESS_REMOTE_READ);
1668         struct ib_mw_bind param;
1669         int rc;
1670
1671         *nsegs = 1;
1672         rpcrdma_map_one(ia, seg, writing);
1673         param.mr = ia->ri_bind_mem;
1674         param.wr_id = 0ULL;     /* no send cookie */
1675         param.addr = seg->mr_dma;
1676         param.length = seg->mr_len;
1677         param.send_flags = 0;
1678         param.mw_access_flags = mem_priv;
1679
1680         DECR_CQCOUNT(&r_xprt->rx_ep);
1681         rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1682         if (rc) {
1683                 dprintk("RPC:       %s: failed ib_bind_mw "
1684                         "%u@0x%llx status %i\n",
1685                         __func__, seg->mr_len,
1686                         (unsigned long long)seg->mr_dma, rc);
1687                 rpcrdma_unmap_one(ia, seg);
1688         } else {
1689                 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1690                 seg->mr_base = param.addr;
1691                 seg->mr_nsegs = 1;
1692         }
1693         return rc;
1694 }
1695
1696 static int
1697 rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1698                         struct rpcrdma_ia *ia,
1699                         struct rpcrdma_xprt *r_xprt, void **r)
1700 {
1701         struct ib_mw_bind param;
1703         int rc;
1704
1705         BUG_ON(seg->mr_nsegs != 1);
1706         param.mr = ia->ri_bind_mem;
1707         param.addr = 0ULL;      /* unbind */
1708         param.length = 0;
1709         param.mw_access_flags = 0;
1710         if (*r) {
1711                 param.wr_id = (u64) (unsigned long) *r;
1712                 param.send_flags = IB_SEND_SIGNALED;
1713                 INIT_CQCOUNT(&r_xprt->rx_ep);
1714         } else {
1715                 param.wr_id = 0ULL;
1716                 param.send_flags = 0;
1717                 DECR_CQCOUNT(&r_xprt->rx_ep);
1718         }
1719         rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1720         rpcrdma_unmap_one(ia, seg);
1721         if (rc)
1722                 dprintk("RPC:       %s: failed ib_(un)bind_mw,"
1723                         " status %i\n", __func__, rc);
1724         else
1725                 *r = NULL;      /* will upcall on completion */
1726         return rc;
1727 }
1728
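/*
 * Default registration: no preallocated MR resources, so every chunk is
 * registered on the fly with ib_reg_phys_mr() and torn down again with
 * ib_dereg_mr().  This path is used when none of the faster strategies
 * was selected for the device.
 */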
1729 static int
1730 rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1731                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1732 {
1733         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1734                                   IB_ACCESS_REMOTE_READ);
1735         struct rpcrdma_mr_seg *seg1 = seg;
1736         struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1737         int len, i, rc = 0;
1738
1739         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1740                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1741         for (len = 0, i = 0; i < *nsegs;) {
1742                 rpcrdma_map_one(ia, seg, writing);
1743                 ipb[i].addr = seg->mr_dma;
1744                 ipb[i].size = seg->mr_len;
1745                 len += seg->mr_len;
1746                 ++seg;
1747                 ++i;
1748                 /* Check for holes */
1749                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1750                     offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1751                         break;
1752         }
1753         seg1->mr_base = seg1->mr_dma;
1754         seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1755                                 ipb, i, mem_priv, &seg1->mr_base);
1756         if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1757                 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1758                 dprintk("RPC:       %s: failed ib_reg_phys_mr "
1759                         "%u@0x%llx (%d)... status %i\n",
1760                         __func__, len,
1761                         (unsigned long long)seg1->mr_dma, i, rc);
1762                 while (i--)
1763                         rpcrdma_unmap_one(ia, --seg);
1764         } else {
1765                 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1766                 seg1->mr_nsegs = i;
1767                 seg1->mr_len = len;
1768         }
1769         *nsegs = i;
1770         return rc;
1771 }
1772
1773 static int
1774 rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1775                         struct rpcrdma_ia *ia)
1776 {
1777         struct rpcrdma_mr_seg *seg1 = seg;
1778         int rc;
1779
1780         rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1781         seg1->mr_chunk.rl_mr = NULL;
1782         while (seg1->mr_nsegs--)
1783                 rpcrdma_unmap_one(ia, seg++);
1784         if (rc)
1785                 dprintk("RPC:       %s: failed ib_dereg_mr,"
1786                         " status %i\n", __func__, rc);
1787         return rc;
1788 }
1789
1790 int
1791 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1792                         int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1793 {
1794         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1795         int rc = 0;
1796
1797         switch (ia->ri_memreg_strategy) {
1798
1799 #if RPCRDMA_PERSISTENT_REGISTRATION
1800         case RPCRDMA_ALLPHYSICAL:
1801                 rpcrdma_map_one(ia, seg, writing);
1802                 seg->mr_rkey = ia->ri_bind_mem->rkey;
1803                 seg->mr_base = seg->mr_dma;
1804                 seg->mr_nsegs = 1;
1805                 nsegs = 1;
1806                 break;
1807 #endif
1808
1809         /* Registration using frmr registration */
1810         case RPCRDMA_FRMR:
1811                 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1812                 break;
1813
1814         /* Registration using fmr memory registration */
1815         case RPCRDMA_MTHCAFMR:
1816                 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1817                 break;
1818
1819         /* Registration using memory windows */
1820         case RPCRDMA_MEMWINDOWS_ASYNC:
1821         case RPCRDMA_MEMWINDOWS:
1822                 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1823                 break;
1824
1825         /* Default registration each time */
1826         default:
1827                 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1828                 break;
1829         }
1830         if (rc)
1831                 return -1;
1832
1833         return nsegs;
1834 }
1835
1836 int
1837 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1838                 struct rpcrdma_xprt *r_xprt, void *r)
1839 {
1840         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1841         int nsegs = seg->mr_nsegs, rc;
1842
1843         switch (ia->ri_memreg_strategy) {
1844
1845 #if RPCRDMA_PERSISTENT_REGISTRATION
1846         case RPCRDMA_ALLPHYSICAL:
1847                 BUG_ON(nsegs != 1);
1848                 rpcrdma_unmap_one(ia, seg);
1849                 rc = 0;
1850                 break;
1851 #endif
1852
1853         case RPCRDMA_FRMR:
1854                 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1855                 break;
1856
1857         case RPCRDMA_MTHCAFMR:
1858                 rc = rpcrdma_deregister_fmr_external(seg, ia);
1859                 break;
1860
1861         case RPCRDMA_MEMWINDOWS_ASYNC:
1862         case RPCRDMA_MEMWINDOWS:
1863                 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1864                 break;
1865
1866         default:
1867                 rc = rpcrdma_deregister_default_external(seg, ia);
1868                 break;
1869         }
1870         if (r) {
1871                 struct rpcrdma_rep *rep = r;
1872                 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1873                 rep->rr_func = NULL;
1874                 func(rep);      /* dereg done, callback now */
1875         }
1876         return nsegs;
1877 }
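
/*
 * Editorial example, not part of the original verbs.c: a sketch of the
 * external registration entry points above.  rpcrdma_register_external()
 * returns the number of segments actually covered, or -1 on failure, and
 * rpcrdma_deregister_external() undoes the registration.  The helper name
 * and error code are illustrative; the sketch is kept under #if 0.
 */
#if 0
static int rpcrdma_extreg_sketch(struct rpcrdma_xprt *r_xprt,
                        struct rpcrdma_mr_seg *seg, int nsegs, int writing)
{
        int n;

        n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
        if (n < 0)
                return -EIO;

        /* seg->mr_rkey and seg->mr_base now describe the registered chunk */

        rpcrdma_deregister_external(seg, r_xprt, NULL);
        return n;
}
#endif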
1878
1879 /*
1880  * Prepost any receive buffer, then post send.
1881  *
1882  * Receive buffer is donated to hardware, reclaimed upon recv completion.
1883  */
1884 int
1885 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1886                 struct rpcrdma_ep *ep,
1887                 struct rpcrdma_req *req)
1888 {
1889         struct ib_send_wr send_wr, *send_wr_fail;
1890         struct rpcrdma_rep *rep = req->rl_reply;
1891         int rc;
1892
1893         if (rep) {
1894                 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1895                 if (rc)
1896                         goto out;
1897                 req->rl_reply = NULL;
1898         }
1899
1900         send_wr.next = NULL;
1901         send_wr.wr_id = 0ULL;   /* no send cookie */
1902         send_wr.sg_list = req->rl_send_iov;
1903         send_wr.num_sge = req->rl_niovs;
1904         send_wr.opcode = IB_WR_SEND;
1905         if (send_wr.num_sge == 4)       /* no need to sync the constant pad in sge[2] */
1906                 ib_dma_sync_single_for_device(ia->ri_id->device,
1907                         req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1908                         DMA_TO_DEVICE);
1909         ib_dma_sync_single_for_device(ia->ri_id->device,
1910                 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1911                 DMA_TO_DEVICE);
1912         ib_dma_sync_single_for_device(ia->ri_id->device,
1913                 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1914                 DMA_TO_DEVICE);
1915
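        /*
         * Sends are normally posted unsignaled to limit completion
         * interrupts; DECR_CQCOUNT() counts down a budget established by
         * INIT_CQCOUNT(), and once it is used up the next send is posted
         * signaled so the provider can retire its send queue entries.
         */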
1916         if (DECR_CQCOUNT(ep) > 0)
1917                 send_wr.send_flags = 0;
1918         else { /* Provider must take a send completion every now and then */
1919                 INIT_CQCOUNT(ep);
1920                 send_wr.send_flags = IB_SEND_SIGNALED;
1921         }
1922
1923         rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1924         if (rc)
1925                 dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
1926                         rc);
1927 out:
1928         return rc;
1929 }
1930
1931 /*
1932  * (Re)post a receive buffer.
1933  */
1934 int
1935 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1936                      struct rpcrdma_ep *ep,
1937                      struct rpcrdma_rep *rep)
1938 {
1939         struct ib_recv_wr recv_wr, *recv_wr_fail;
1940         int rc;
1941
1942         recv_wr.next = NULL;
1943         recv_wr.wr_id = (u64) (unsigned long) rep;
1944         recv_wr.sg_list = &rep->rr_iov;
1945         recv_wr.num_sge = 1;
1946
1947         ib_dma_sync_single_for_cpu(ia->ri_id->device,
1948                 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1949
1950         DECR_CQCOUNT(ep);
1951         rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1952
1953         if (rc)
1954                 dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
1955                         rc);
1956         return rc;
1957 }
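
/*
 * Editorial example, not part of the original verbs.c: a sketch showing how
 * a reply buffer could be (re)posted and, on failure, returned to the pool
 * with rpcrdma_recv_buffer_put().  The helper name is hypothetical and the
 * sketch is kept under #if 0 so it is never built.
 */
#if 0
static int rpcrdma_repost_sketch(struct rpcrdma_xprt *r_xprt,
                        struct rpcrdma_rep *rep)
{
        int rc;

        rc = rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep);
        if (rc)
                /* the HCA did not take the buffer; give it back to the pool */
                rpcrdma_recv_buffer_put(rep);
        return rc;
}
#endif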