pandora-kernel.git: net/sunrpc/xprtrdma/verbs.c
1 /*
2  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the BSD-type
8  * license below:
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  *      Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *
17  *      Redistributions in binary form must reproduce the above
18  *      copyright notice, this list of conditions and the following
19  *      disclaimer in the documentation and/or other materials provided
20  *      with the distribution.
21  *
22  *      Neither the name of the Network Appliance, Inc. nor the names of
23  *      its contributors may be used to endorse or promote products
24  *      derived from this software without specific prior written
25  *      permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  */
39
40 /*
41  * verbs.c
42  *
43  * Encapsulates the major functions managing:
44  *  o adapters
45  *  o endpoints
46  *  o connections
47  *  o buffer memory
48  */
49
50 #include <linux/pci.h>  /* for Tavor hack below */
51 #include <linux/slab.h>
52
53 #include "xprt_rdma.h"
54
55 /*
56  * Globals/Macros
57  */
58
59 #ifdef RPC_DEBUG
60 # define RPCDBG_FACILITY        RPCDBG_TRANS
61 #endif
62
63 /*
64  * internal functions
65  */
66
67 /*
68  * Handle replies in tasklet context, using a single, global list.
69  * The rdma tasklet function simply turns around and calls the reply
70  * function for every reply on the list.
71  */
72
73 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74 static LIST_HEAD(rpcrdma_tasklets_g);
75
76 static void
77 rpcrdma_run_tasklet(unsigned long data)
78 {
79         struct rpcrdma_rep *rep;
80         void (*func)(struct rpcrdma_rep *);
81         unsigned long flags;
82
83         data = data;    /* tasklet data argument is unused */
84         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85         while (!list_empty(&rpcrdma_tasklets_g)) {
86                 rep = list_entry(rpcrdma_tasklets_g.next,
87                                  struct rpcrdma_rep, rr_list);
88                 list_del(&rep->rr_list);
89                 func = rep->rr_func;
90                 rep->rr_func = NULL;
91                 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93                 if (func)
94                         func(rep);
95                 else
96                         rpcrdma_recv_buffer_put(rep);
97
98                 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99         }
100         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
101 }
102
103 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
105 static inline void
106 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107 {
108         unsigned long flags;
109
110         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
111         list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
112         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
113         tasklet_schedule(&rpcrdma_tasklet_g);
114 }
115
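/*
 * Asynchronous error upcalls. A QP or CQ error is fatal to the
 * connection: mark the endpoint failed (-EIO), notify the transport
 * via rep_func, and wake anyone waiting for the connection.
 */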
116 static void
117 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118 {
119         struct rpcrdma_ep *ep = context;
120
121         dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
122                 __func__, event->event, event->device->name, context);
123         if (ep->rep_connected == 1) {
124                 ep->rep_connected = -EIO;
125                 ep->rep_func(ep);
126                 wake_up_all(&ep->rep_connect_wait);
127         }
128 }
129
130 static void
131 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132 {
133         struct rpcrdma_ep *ep = context;
134
135         dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
136                 __func__, event->event, event->device->name, context);
137         if (ep->rep_connected == 1) {
138                 ep->rep_connected = -EIO;
139                 ep->rep_func(ep);
140                 wake_up_all(&ep->rep_connect_wait);
141         }
142 }
143
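/*
 * Process a single work completion. FRMR register/invalidate
 * completions just record the new MR state; receive (and bind)
 * completions update the credit count and are handed off to the
 * reply tasklet. A failed completion marks the reply invalid.
 */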
144 static inline
145 void rpcrdma_event_process(struct ib_wc *wc)
146 {
147         struct rpcrdma_mw *frmr;
148         struct rpcrdma_rep *rep =
149                         (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
150
151         dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
152                 __func__, rep, wc->status, wc->opcode, wc->byte_len);
153
154         if (!rep) /* send or bind completion that we don't care about */
155                 return;
156
157         if (IB_WC_SUCCESS != wc->status) {
158                 dprintk("RPC:       %s: WC opcode %d status %X, connection lost\n",
159                         __func__, wc->opcode, wc->status);
160                 rep->rr_len = ~0U;
161                 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
162                         rpcrdma_schedule_tasklet(rep);
163                 return;
164         }
165
166         switch (wc->opcode) {
167         case IB_WC_FAST_REG_MR:
168                 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
169                 frmr->r.frmr.state = FRMR_IS_VALID;
170                 break;
171         case IB_WC_LOCAL_INV:
172                 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
173                 frmr->r.frmr.state = FRMR_IS_INVALID;
174                 break;
175         case IB_WC_RECV:
176                 rep->rr_len = wc->byte_len;
177                 ib_dma_sync_single_for_cpu(
178                         rdmab_to_ia(rep->rr_buffer)->ri_id->device,
179                         rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
180                 /* Keep (only) the most recent credits, after checking validity */
181                 if (rep->rr_len >= 16) {
182                         struct rpcrdma_msg *p =
183                                         (struct rpcrdma_msg *) rep->rr_base;
184                         unsigned int credits = ntohl(p->rm_credit);
185                         if (credits == 0) {
186                                 dprintk("RPC:       %s: server"
187                                         " dropped credits to 0!\n", __func__);
188                                 /* don't deadlock */
189                                 credits = 1;
190                         } else if (credits > rep->rr_buffer->rb_max_requests) {
191                                 dprintk("RPC:       %s: server"
192                                         " over-crediting: %d (%d)\n",
193                                         __func__, credits,
194                                         rep->rr_buffer->rb_max_requests);
195                                 credits = rep->rr_buffer->rb_max_requests;
196                         }
197                         atomic_set(&rep->rr_buffer->rb_credits, credits);
198                 }
199                 /* fall through */
200         case IB_WC_BIND_MW:
201                 rpcrdma_schedule_tasklet(rep);
202                 break;
203         default:
204                 dprintk("RPC:       %s: unexpected WC event %X\n",
205                         __func__, wc->opcode);
206                 break;
207         }
208 }
209
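/*
 * Poll and dispatch completions until the CQ is empty.
 */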
210 static inline int
211 rpcrdma_cq_poll(struct ib_cq *cq)
212 {
213         struct ib_wc wc;
214         int rc;
215
216         for (;;) {
217                 rc = ib_poll_cq(cq, 1, &wc);
218                 if (rc < 0) {
219                         dprintk("RPC:       %s: ib_poll_cq failed %i\n",
220                                 __func__, rc);
221                         return rc;
222                 }
223                 if (rc == 0)
224                         break;
225
226                 rpcrdma_event_process(&wc);
227         }
228
229         return 0;
230 }
231
232 /*
233  * rpcrdma_cq_event_upcall
234  *
235  * This upcall handles recv, send, bind and unbind events.
236  * It is reentrant, but processes events one at a time in order to
237  * preserve the ordering of receives on which server credit accounting depends.
238  *
239  * It is the responsibility of the scheduled tasklet to return
240  * recv buffers to the pool. NOTE: this affects synchronization of
241  * connection shutdown. That is, the structures required for
242  * the completion of the reply handler must remain intact until
243  * all memory has been reclaimed.
244  *
245  * Note that send events are suppressed and do not result in an upcall.
246  */
247 static void
248 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
249 {
250         int rc;
251
252         rc = rpcrdma_cq_poll(cq);
253         if (rc)
254                 return;
255
256         rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
257         if (rc) {
258                 dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
259                         __func__, rc);
260                 return;
261         }
262
263         rpcrdma_cq_poll(cq);
264 }
265
266 #ifdef RPC_DEBUG
267 static const char * const conn[] = {
268         "address resolved",
269         "address error",
270         "route resolved",
271         "route error",
272         "connect request",
273         "connect response",
274         "connect error",
275         "unreachable",
276         "rejected",
277         "established",
278         "disconnected",
279         "device removal"
280 };
281 #endif
282
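/*
 * Connection manager event handler. Address and route resolution
 * results are reported back to rpcrdma_create_id() through
 * ri_async_rc and the ri_done completion. Connection events set
 * ep->rep_connected: 1 once established, a negative errno on
 * failure or disconnect.
 */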
283 static int
284 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
285 {
286         struct rpcrdma_xprt *xprt = id->context;
287         struct rpcrdma_ia *ia = &xprt->rx_ia;
288         struct rpcrdma_ep *ep = &xprt->rx_ep;
289 #ifdef RPC_DEBUG
290         struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
291 #endif
292         struct ib_qp_attr attr;
293         struct ib_qp_init_attr iattr;
294         int connstate = 0;
295
296         switch (event->event) {
297         case RDMA_CM_EVENT_ADDR_RESOLVED:
298         case RDMA_CM_EVENT_ROUTE_RESOLVED:
299                 ia->ri_async_rc = 0;
300                 complete(&ia->ri_done);
301                 break;
302         case RDMA_CM_EVENT_ADDR_ERROR:
303                 ia->ri_async_rc = -EHOSTUNREACH;
304                 dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
305                         __func__, ep);
306                 complete(&ia->ri_done);
307                 break;
308         case RDMA_CM_EVENT_ROUTE_ERROR:
309                 ia->ri_async_rc = -ENETUNREACH;
310                 dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
311                         __func__, ep);
312                 complete(&ia->ri_done);
313                 break;
314         case RDMA_CM_EVENT_ESTABLISHED:
315                 connstate = 1;
316                 ib_query_qp(ia->ri_id->qp, &attr,
317                         IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
318                         &iattr);
319                 dprintk("RPC:       %s: %d responder resources"
320                         " (%d initiator)\n",
321                         __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
322                 goto connected;
323         case RDMA_CM_EVENT_CONNECT_ERROR:
324                 connstate = -ENOTCONN;
325                 goto connected;
326         case RDMA_CM_EVENT_UNREACHABLE:
327                 connstate = -ENETDOWN;
328                 goto connected;
329         case RDMA_CM_EVENT_REJECTED:
330                 connstate = -ECONNREFUSED;
331                 goto connected;
332         case RDMA_CM_EVENT_DISCONNECTED:
333                 connstate = -ECONNABORTED;
334                 goto connected;
335         case RDMA_CM_EVENT_DEVICE_REMOVAL:
336                 connstate = -ENODEV;
337 connected:
338                 dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
339                         __func__,
340                         (event->event <= 11) ? conn[event->event] :
341                                                 "unknown connection error",
342                         &addr->sin_addr.s_addr,
343                         ntohs(addr->sin_port),
344                         ep, event->event);
345                 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
346                 dprintk("RPC:       %s: %sconnected\n",
347                                         __func__, connstate > 0 ? "" : "dis");
348                 ep->rep_connected = connstate;
349                 ep->rep_func(ep);
350                 wake_up_all(&ep->rep_connect_wait);
351                 break;
352         default:
353                 dprintk("RPC:       %s: unexpected CM event %d\n",
354                         __func__, event->event);
355                 break;
356         }
357
358 #ifdef RPC_DEBUG
359         if (connstate == 1) {
360                 int ird = attr.max_dest_rd_atomic;
361                 int tird = ep->rep_remote_cma.responder_resources;
362                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
363                         "on %s, memreg %d slots %d ird %d%s\n",
364                         &addr->sin_addr.s_addr,
365                         ntohs(addr->sin_port),
366                         ia->ri_id->device->name,
367                         ia->ri_memreg_strategy,
368                         xprt->rx_buf.rb_max_requests,
369                         ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
370         } else if (connstate < 0) {
371                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
372                         &addr->sin_addr.s_addr,
373                         ntohs(addr->sin_port),
374                         connstate);
375         }
376 #endif
377
378         return 0;
379 }
380
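/*
 * Create an rdma_cm id and synchronously resolve the server's
 * address and route, waiting (with a timeout) for the CM upcall
 * above to report each outcome through ri_async_rc.
 */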
381 static struct rdma_cm_id *
382 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
383                         struct rpcrdma_ia *ia, struct sockaddr *addr)
384 {
385         struct rdma_cm_id *id;
386         int rc;
387
388         init_completion(&ia->ri_done);
389
390         id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
391         if (IS_ERR(id)) {
392                 rc = PTR_ERR(id);
393                 dprintk("RPC:       %s: rdma_create_id() failed %i\n",
394                         __func__, rc);
395                 return id;
396         }
397
398         ia->ri_async_rc = -ETIMEDOUT;
399         rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
400         if (rc) {
401                 dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
402                         __func__, rc);
403                 goto out;
404         }
405         wait_for_completion_interruptible_timeout(&ia->ri_done,
406                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
407         rc = ia->ri_async_rc;
408         if (rc)
409                 goto out;
410
411         ia->ri_async_rc = -ETIMEDOUT;
412         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
413         if (rc) {
414                 dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
415                         __func__, rc);
416                 goto out;
417         }
418         wait_for_completion_interruptible_timeout(&ia->ri_done,
419                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
420         rc = ia->ri_async_rc;
421         if (rc)
422                 goto out;
423
424         return id;
425
426 out:
427         rdma_destroy_id(id);
428         return ERR_PTR(rc);
429 }
430
431 /*
432  * Drain any cq, prior to teardown.
433  */
434 static void
435 rpcrdma_clean_cq(struct ib_cq *cq)
436 {
437         struct ib_wc wc;
438         int count = 0;
439
440         while (1 == ib_poll_cq(cq, 1, &wc))
441                 ++count;
442
443         if (count)
444                 dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
445                         __func__, count, wc.opcode);
446 }
447
448 /*
449  * Exported functions.
450  */
451
452 /*
453  * Open and initialize an Interface Adapter.
454  *  o initializes fields of struct rpcrdma_ia, including
455  *    interface and provider attributes and protection zone.
456  */
457 int
458 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
459 {
460         int rc, mem_priv;
461         struct ib_device_attr devattr;
462         struct rpcrdma_ia *ia = &xprt->rx_ia;
463
464         ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
465         if (IS_ERR(ia->ri_id)) {
466                 rc = PTR_ERR(ia->ri_id);
467                 goto out1;
468         }
469
470         ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
471         if (IS_ERR(ia->ri_pd)) {
472                 rc = PTR_ERR(ia->ri_pd);
473                 dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
474                         __func__, rc);
475                 goto out2;
476         }
477
478         /*
479          * Query the device to determine if the requested memory
480          * registration strategy is supported. If it isn't, set the
481          * strategy to a globally supported model.
482          */
483         rc = ib_query_device(ia->ri_id->device, &devattr);
484         if (rc) {
485                 dprintk("RPC:       %s: ib_query_device failed %d\n",
486                         __func__, rc);
487                 goto out2;
488         }
489
490         if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
491                 ia->ri_have_dma_lkey = 1;
492                 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
493         }
494
495         switch (memreg) {
496         case RPCRDMA_MEMWINDOWS:
497         case RPCRDMA_MEMWINDOWS_ASYNC:
498                 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
499                         dprintk("RPC:       %s: MEMWINDOWS registration "
500                                 "specified but not supported by adapter, "
501                                 "using slower RPCRDMA_REGISTER\n",
502                                 __func__);
503                         memreg = RPCRDMA_REGISTER;
504                 }
505                 break;
506         case RPCRDMA_MTHCAFMR:
507                 if (!ia->ri_id->device->alloc_fmr) {
508 #if RPCRDMA_PERSISTENT_REGISTRATION
509                         dprintk("RPC:       %s: MTHCAFMR registration "
510                                 "specified but not supported by adapter, "
511                                 "using riskier RPCRDMA_ALLPHYSICAL\n",
512                                 __func__);
513                         memreg = RPCRDMA_ALLPHYSICAL;
514 #else
515                         dprintk("RPC:       %s: MTHCAFMR registration "
516                                 "specified but not supported by adapter, "
517                                 "using slower RPCRDMA_REGISTER\n",
518                                 __func__);
519                         memreg = RPCRDMA_REGISTER;
520 #endif
521                 }
522                 break;
523         case RPCRDMA_FRMR:
524                 /* Requires both frmr reg and local dma lkey */
525                 if ((devattr.device_cap_flags &
526                      (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
527                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
528 #if RPCRDMA_PERSISTENT_REGISTRATION
529                         dprintk("RPC:       %s: FRMR registration "
530                                 "specified but not supported by adapter, "
531                                 "using riskier RPCRDMA_ALLPHYSICAL\n",
532                                 __func__);
533                         memreg = RPCRDMA_ALLPHYSICAL;
534 #else
535                         dprintk("RPC:       %s: FRMR registration "
536                                 "specified but not supported by adapter, "
537                                 "using slower RPCRDMA_REGISTER\n",
538                                 __func__);
539                         memreg = RPCRDMA_REGISTER;
540 #endif
541                 }
542                 break;
543         }
544
545         /*
546          * Optionally obtain an underlying physical identity mapping in
547          * order to do a memory window-based bind. This base registration
548          * is protected from remote access - that is enabled only by binding
549          * for the specific bytes targeted during each RPC operation, and
550          * revoked after the corresponding completion similar to a storage
551          * adapter.
552          */
553         switch (memreg) {
554         case RPCRDMA_BOUNCEBUFFERS:
555         case RPCRDMA_REGISTER:
556         case RPCRDMA_FRMR:
557                 break;
558 #if RPCRDMA_PERSISTENT_REGISTRATION
559         case RPCRDMA_ALLPHYSICAL:
560                 mem_priv = IB_ACCESS_LOCAL_WRITE |
561                                 IB_ACCESS_REMOTE_WRITE |
562                                 IB_ACCESS_REMOTE_READ;
563                 goto register_setup;
564 #endif
565         case RPCRDMA_MEMWINDOWS_ASYNC:
566         case RPCRDMA_MEMWINDOWS:
567                 mem_priv = IB_ACCESS_LOCAL_WRITE |
568                                 IB_ACCESS_MW_BIND;
569                 goto register_setup;
570         case RPCRDMA_MTHCAFMR:
571                 if (ia->ri_have_dma_lkey)
572                         break;
573                 mem_priv = IB_ACCESS_LOCAL_WRITE;
574         register_setup:
575                 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
576                 if (IS_ERR(ia->ri_bind_mem)) {
577                         printk(KERN_ALERT "%s: ib_get_dma_mr for "
578                                 "phys register failed with %lX\n\t"
579                                 "Will continue with degraded performance\n",
580                                 __func__, PTR_ERR(ia->ri_bind_mem));
581                         memreg = RPCRDMA_REGISTER;
582                         ia->ri_bind_mem = NULL;
583                 }
584                 break;
585         default:
586                 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
587                                 __func__, memreg);
588                 rc = -EINVAL;
589                 goto out2;
590         }
591         dprintk("RPC:       %s: memory registration strategy is %d\n",
592                 __func__, memreg);
593
594         /* Else will do memory reg/dereg for each chunk */
595         ia->ri_memreg_strategy = memreg;
596
597         return 0;
598 out2:
599         rdma_destroy_id(ia->ri_id);
600         ia->ri_id = NULL;
601 out1:
602         return rc;
603 }
604
605 /*
606  * Clean up/close an IA.
607  *   o if event handles and PD have been initialized, free them.
608  *   o close the IA
609  */
610 void
611 rpcrdma_ia_close(struct rpcrdma_ia *ia)
612 {
613         int rc;
614
615         dprintk("RPC:       %s: entering\n", __func__);
616         if (ia->ri_bind_mem != NULL) {
617                 rc = ib_dereg_mr(ia->ri_bind_mem);
618                 dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
619                         __func__, rc);
620         }
621         if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
622                 if (ia->ri_id->qp)
623                         rdma_destroy_qp(ia->ri_id);
624                 rdma_destroy_id(ia->ri_id);
625                 ia->ri_id = NULL;
626         }
627         if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
628                 rc = ib_dealloc_pd(ia->ri_pd);
629                 dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
630                         __func__, rc);
631         }
632 }
633
634 /*
635  * Create unconnected endpoint.
636  */
637 int
638 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
639                                 struct rpcrdma_create_data_internal *cdata)
640 {
641         struct ib_device_attr devattr;
642         int rc, err;
643
644         rc = ib_query_device(ia->ri_id->device, &devattr);
645         if (rc) {
646                 dprintk("RPC:       %s: ib_query_device failed %d\n",
647                         __func__, rc);
648                 return rc;
649         }
650
651         /* check provider's send/recv wr limits */
652         if (cdata->max_requests > devattr.max_qp_wr)
653                 cdata->max_requests = devattr.max_qp_wr;
654
655         ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
656         ep->rep_attr.qp_context = ep;
657         /* send_cq and recv_cq initialized below */
658         ep->rep_attr.srq = NULL;
659         ep->rep_attr.cap.max_send_wr = cdata->max_requests;
660         switch (ia->ri_memreg_strategy) {
661         case RPCRDMA_FRMR:
662                 /* Add room for frmr register and invalidate WRs.
663                  * 1. FRMR reg WR for head
664                  * 2. FRMR invalidate WR for head
665                  * 3. FRMR reg WR for pagelist
666                  * 4. FRMR invalidate WR for pagelist
667                  * 5. FRMR reg WR for tail
668                  * 6. FRMR invalidate WR for tail
669                  * 7. The RDMA_SEND WR
670                  */
671                 ep->rep_attr.cap.max_send_wr *= 7;
672                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
673                         cdata->max_requests = devattr.max_qp_wr / 7;
674                         if (!cdata->max_requests)
675                                 return -EINVAL;
676                         ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
677                 }
678                 break;
679         case RPCRDMA_MEMWINDOWS_ASYNC:
680         case RPCRDMA_MEMWINDOWS:
681                 /* Add room for mw_binds+unbinds - overkill! */
682                 ep->rep_attr.cap.max_send_wr++;
683                 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
684                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
685                         return -EINVAL;
686                 break;
687         default:
688                 break;
689         }
690         ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
691         ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
692         ep->rep_attr.cap.max_recv_sge = 1;
693         ep->rep_attr.cap.max_inline_data = 0;
694         ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
695         ep->rep_attr.qp_type = IB_QPT_RC;
696         ep->rep_attr.port_num = ~0;
697
698         dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
699                 "iovs: send %d recv %d\n",
700                 __func__,
701                 ep->rep_attr.cap.max_send_wr,
702                 ep->rep_attr.cap.max_recv_wr,
703                 ep->rep_attr.cap.max_send_sge,
704                 ep->rep_attr.cap.max_recv_sge);
705
706         /* set trigger for requesting send completion */
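        /*
         * Sends are otherwise unsignaled (IB_SIGNAL_REQ_WR above);
         * rep_cqinit is roughly half the send queue, less headroom
         * for memory-window binds where applicable.
         */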
707         ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
708         switch (ia->ri_memreg_strategy) {
709         case RPCRDMA_MEMWINDOWS_ASYNC:
710         case RPCRDMA_MEMWINDOWS:
711                 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
712                 break;
713         default:
714                 break;
715         }
716         if (ep->rep_cqinit <= 2)
717                 ep->rep_cqinit = 0;
718         INIT_CQCOUNT(ep);
719         ep->rep_ia = ia;
720         init_waitqueue_head(&ep->rep_connect_wait);
721
722         /*
723          * Create a single cq for receive dto and mw_bind (only ever
724          * care about unbind, really). Send completions are suppressed.
725          * Use single threaded tasklet upcalls to maintain ordering.
726          */
727         ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
728                                   rpcrdma_cq_async_error_upcall, NULL,
729                                   ep->rep_attr.cap.max_recv_wr +
730                                   ep->rep_attr.cap.max_send_wr + 1, 0);
731         if (IS_ERR(ep->rep_cq)) {
732                 rc = PTR_ERR(ep->rep_cq);
733                 dprintk("RPC:       %s: ib_create_cq failed: %i\n",
734                         __func__, rc);
735                 goto out1;
736         }
737
738         rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
739         if (rc) {
740                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
741                         __func__, rc);
742                 goto out2;
743         }
744
745         ep->rep_attr.send_cq = ep->rep_cq;
746         ep->rep_attr.recv_cq = ep->rep_cq;
747
748         /* Initialize cma parameters */
749
750         /* RPC/RDMA does not use private data */
751         ep->rep_remote_cma.private_data = NULL;
752         ep->rep_remote_cma.private_data_len = 0;
753
754         /* Client offers RDMA Read but does not initiate */
755         ep->rep_remote_cma.initiator_depth = 0;
756         if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
757                 ep->rep_remote_cma.responder_resources = 0;
758         else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
759                 ep->rep_remote_cma.responder_resources = 32;
760         else
761                 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
762
763         ep->rep_remote_cma.retry_count = 7;
764         ep->rep_remote_cma.flow_control = 0;
765         ep->rep_remote_cma.rnr_retry_count = 0;
766
767         return 0;
768
769 out2:
770         err = ib_destroy_cq(ep->rep_cq);
771         if (err)
772                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
773                         __func__, err);
774 out1:
775         return rc;
776 }
777
778 /*
779  * rpcrdma_ep_destroy
780  *
781  * Disconnect and destroy endpoint. After this, the only
782  * valid operations on the ep are to free it (if dynamically
783  * allocated) or re-create it.
784  *
785  * The caller's error handling must be sure to not leak the endpoint
786  * if this function fails.
787  */
788 int
789 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
790 {
791         int rc;
792
793         dprintk("RPC:       %s: entering, connected is %d\n",
794                 __func__, ep->rep_connected);
795
796         if (ia->ri_id->qp) {
797                 rc = rpcrdma_ep_disconnect(ep, ia);
798                 if (rc)
799                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
800                                 " returned %i\n", __func__, rc);
801                 rdma_destroy_qp(ia->ri_id);
802                 ia->ri_id->qp = NULL;
803         }
804
805         /* padding - could be done in rpcrdma_buffer_destroy... */
806         if (ep->rep_pad_mr) {
807                 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
808                 ep->rep_pad_mr = NULL;
809         }
810
811         rpcrdma_clean_cq(ep->rep_cq);
812         rc = ib_destroy_cq(ep->rep_cq);
813         if (rc)
814                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
815                         __func__, rc);
816
817         return rc;
818 }
819
820 /*
821  * Connect unconnected endpoint.
822  */
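/*
 * On reconnect (rep_connected != 0), the existing cm_id and QP are
 * torn down and a fresh cm_id is created before connecting again.
 * A plain connection refusal (-ECONNREFUSED, no listener yet) is
 * retried up to RDMA_CONNECT_RETRY_MAX times.
 */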
823 int
824 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
825 {
826         struct rdma_cm_id *id;
827         int rc = 0;
828         int retry_count = 0;
829
830         if (ep->rep_connected != 0) {
831                 struct rpcrdma_xprt *xprt;
832 retry:
833                 rc = rpcrdma_ep_disconnect(ep, ia);
834                 if (rc && rc != -ENOTCONN)
835                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
836                                 " status %i\n", __func__, rc);
837                 rpcrdma_clean_cq(ep->rep_cq);
838
839                 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
840                 id = rpcrdma_create_id(xprt, ia,
841                                 (struct sockaddr *)&xprt->rx_data.addr);
842                 if (IS_ERR(id)) {
843                         rc = PTR_ERR(id);
844                         goto out;
845                 }
846                 /* TEMP TEMP TEMP - fail if new device:
847                  * Deregister/remarshal *all* requests!
848                  * Close and recreate adapter, pd, etc!
849                  * Re-determine all attributes still sane!
850                  * More stuff I haven't thought of!
851                  * Rrrgh!
852                  */
853                 if (ia->ri_id->device != id->device) {
854                         printk("RPC:       %s: can't reconnect on "
855                                 "different device!\n", __func__);
856                         rdma_destroy_id(id);
857                         rc = -ENETDOWN;
858                         goto out;
859                 }
860                 /* END TEMP */
861                 rdma_destroy_qp(ia->ri_id);
862                 rdma_destroy_id(ia->ri_id);
863                 ia->ri_id = id;
864         }
865
866         rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
867         if (rc) {
868                 dprintk("RPC:       %s: rdma_create_qp failed %i\n",
869                         __func__, rc);
870                 goto out;
871         }
872
873 /* XXX Tavor device performs badly with 2K MTU! */
874 if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
875         struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
876         if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
877             (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
878              pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
879                 struct ib_qp_attr attr = {
880                         .path_mtu = IB_MTU_1024
881                 };
882                 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
883         }
884 }
885
886         ep->rep_connected = 0;
887
888         rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
889         if (rc) {
890                 dprintk("RPC:       %s: rdma_connect() failed with %i\n",
891                                 __func__, rc);
892                 goto out;
893         }
894
895         wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
896
897         /*
898          * Check state. A non-peer reject indicates no listener
899          * (ECONNREFUSED), which may be a transient state. All
900          * others indicate a transport condition for which a best-effort
901          * connection attempt has already been made.
902          */
903         if (ep->rep_connected == -ECONNREFUSED &&
904             ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
905                 dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
906                 goto retry;
907         }
908         if (ep->rep_connected <= 0) {
909                 /* Sometimes, the only way to reliably connect to remote
910                  * CMs is to use the same nonzero values for ORD and IRD. */
911                 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
912                     (ep->rep_remote_cma.responder_resources == 0 ||
913                      ep->rep_remote_cma.initiator_depth !=
914                                 ep->rep_remote_cma.responder_resources)) {
915                         if (ep->rep_remote_cma.responder_resources == 0)
916                                 ep->rep_remote_cma.responder_resources = 1;
917                         ep->rep_remote_cma.initiator_depth =
918                                 ep->rep_remote_cma.responder_resources;
919                         goto retry;
920                 }
921                 rc = ep->rep_connected;
922         } else {
923                 dprintk("RPC:       %s: connected\n", __func__);
924         }
925
926 out:
927         if (rc)
928                 ep->rep_connected = rc;
929         return rc;
930 }
931
932 /*
933  * rpcrdma_ep_disconnect
934  *
935  * This is separate from destroy to facilitate the ability
936  * to reconnect without recreating the endpoint.
937  *
938  * This call is not reentrant, and must not be made in parallel
939  * on the same endpoint.
940  */
941 int
942 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
943 {
944         int rc;
945
946         rpcrdma_clean_cq(ep->rep_cq);
947         rc = rdma_disconnect(ia->ri_id);
948         if (!rc) {
949                 /* returns without wait if not connected */
950                 wait_event_interruptible(ep->rep_connect_wait,
951                                                         ep->rep_connected != 1);
952                 dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
953                         (ep->rep_connected == 1) ? "still " : "dis");
954         } else {
955                 dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
956                 ep->rep_connected = rc;
957         }
958         return rc;
959 }
960
961 /*
962  * Initialize buffer memory
963  */
964 int
965 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
966         struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
967 {
968         char *p;
969         size_t len;
970         int i, rc;
971         struct rpcrdma_mw *r;
972
973         buf->rb_max_requests = cdata->max_requests;
974         spin_lock_init(&buf->rb_lock);
975         atomic_set(&buf->rb_credits, 1);
976
977         /* Need to allocate:
978          *   1.  arrays for send and recv pointers
979          *   2.  arrays of struct rpcrdma_req to fill in pointers
980          *   3.  array of struct rpcrdma_rep for replies
981          *   4.  padding, if any
982          *   5.  mw's, fmr's or frmr's, if any
983          * Send/recv buffers in req/rep need to be registered
984          */
985
986         len = buf->rb_max_requests *
987                 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
988         len += cdata->padding;
989         switch (ia->ri_memreg_strategy) {
990         case RPCRDMA_FRMR:
991                 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
992                                 sizeof(struct rpcrdma_mw);
993                 break;
994         case RPCRDMA_MTHCAFMR:
995                 /* TBD we are perhaps overallocating here */
996                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
997                                 sizeof(struct rpcrdma_mw);
998                 break;
999         case RPCRDMA_MEMWINDOWS_ASYNC:
1000         case RPCRDMA_MEMWINDOWS:
1001                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1002                                 sizeof(struct rpcrdma_mw);
1003                 break;
1004         default:
1005                 break;
1006         }
1007
1008         /* allocate 1, 4 and 5 in one shot */
1009         p = kzalloc(len, GFP_KERNEL);
1010         if (p == NULL) {
1011                 dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1012                         __func__, len);
1013                 rc = -ENOMEM;
1014                 goto out;
1015         }
1016         buf->rb_pool = p;       /* for freeing it later */
1017
1018         buf->rb_send_bufs = (struct rpcrdma_req **) p;
1019         p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1020         buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1021         p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1022
1023         /*
1024          * Register the zeroed pad buffer, if any.
1025          */
1026         if (cdata->padding) {
1027                 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1028                                             &ep->rep_pad_mr, &ep->rep_pad);
1029                 if (rc)
1030                         goto out;
1031         }
1032         p += cdata->padding;
1033
1034         /*
1035          * Allocate the fmr's, or mw's for mw_bind chunk registration.
1036          * We "cycle" the mw's in order to minimize rkey reuse,
1037          * and also reduce unbind-to-bind collision.
1038          */
1039         INIT_LIST_HEAD(&buf->rb_mws);
1040         r = (struct rpcrdma_mw *)p;
1041         switch (ia->ri_memreg_strategy) {
1042         case RPCRDMA_FRMR:
1043                 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1044                         r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1045                                                          RPCRDMA_MAX_SEGS);
1046                         if (IS_ERR(r->r.frmr.fr_mr)) {
1047                                 rc = PTR_ERR(r->r.frmr.fr_mr);
1048                                 dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
1049                                         " failed %i\n", __func__, rc);
1050                                 goto out;
1051                         }
1052                         r->r.frmr.fr_pgl =
1053                                 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1054                                                             RPCRDMA_MAX_SEGS);
1055                         if (IS_ERR(r->r.frmr.fr_pgl)) {
1056                                 rc = PTR_ERR(r->r.frmr.fr_pgl);
1057                                 dprintk("RPC:       %s: "
1058                                         "ib_alloc_fast_reg_page_list "
1059                                         "failed %i\n", __func__, rc);
1060                                 goto out;
1061                         }
1062                         list_add(&r->mw_list, &buf->rb_mws);
1063                         ++r;
1064                 }
1065                 break;
1066         case RPCRDMA_MTHCAFMR:
1067                 /* TBD we are perhaps overallocating here */
1068                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1069                         static struct ib_fmr_attr fa =
1070                                 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1071                         r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1072                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1073                                 &fa);
1074                         if (IS_ERR(r->r.fmr)) {
1075                                 rc = PTR_ERR(r->r.fmr);
1076                                 dprintk("RPC:       %s: ib_alloc_fmr"
1077                                         " failed %i\n", __func__, rc);
1078                                 goto out;
1079                         }
1080                         list_add(&r->mw_list, &buf->rb_mws);
1081                         ++r;
1082                 }
1083                 break;
1084         case RPCRDMA_MEMWINDOWS_ASYNC:
1085         case RPCRDMA_MEMWINDOWS:
1086                 /* Allocate one extra request's worth, for full cycling */
1087                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1088                         r->r.mw = ib_alloc_mw(ia->ri_pd);
1089                         if (IS_ERR(r->r.mw)) {
1090                                 rc = PTR_ERR(r->r.mw);
1091                                 dprintk("RPC:       %s: ib_alloc_mw"
1092                                         " failed %i\n", __func__, rc);
1093                                 goto out;
1094                         }
1095                         list_add(&r->mw_list, &buf->rb_mws);
1096                         ++r;
1097                 }
1098                 break;
1099         default:
1100                 break;
1101         }
1102
1103         /*
1104          * Allocate/init the request/reply buffers. Doing this
1105          * using kmalloc for now -- one for each buf.
1106          */
1107         for (i = 0; i < buf->rb_max_requests; i++) {
1108                 struct rpcrdma_req *req;
1109                 struct rpcrdma_rep *rep;
1110
1111                 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1112                 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1113                 /* Typical ~2400b, so rounding up saves work later */
1114                 if (len < 4096)
1115                         len = 4096;
1116                 req = kmalloc(len, GFP_KERNEL);
1117                 if (req == NULL) {
1118                         dprintk("RPC:       %s: request buffer %d alloc"
1119                                 " failed\n", __func__, i);
1120                         rc = -ENOMEM;
1121                         goto out;
1122                 }
1123                 memset(req, 0, sizeof(struct rpcrdma_req));
1124                 buf->rb_send_bufs[i] = req;
1125                 buf->rb_send_bufs[i]->rl_buffer = buf;
1126
1127                 rc = rpcrdma_register_internal(ia, req->rl_base,
1128                                 len - offsetof(struct rpcrdma_req, rl_base),
1129                                 &buf->rb_send_bufs[i]->rl_handle,
1130                                 &buf->rb_send_bufs[i]->rl_iov);
1131                 if (rc)
1132                         goto out;
1133
1134                 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1135
1136                 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1137                 rep = kmalloc(len, GFP_KERNEL);
1138                 if (rep == NULL) {
1139                         dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1140                                 __func__, i);
1141                         rc = -ENOMEM;
1142                         goto out;
1143                 }
1144                 memset(rep, 0, sizeof(struct rpcrdma_rep));
1145                 buf->rb_recv_bufs[i] = rep;
1146                 buf->rb_recv_bufs[i]->rr_buffer = buf;
1147                 init_waitqueue_head(&rep->rr_unbind);
1148
1149                 rc = rpcrdma_register_internal(ia, rep->rr_base,
1150                                 len - offsetof(struct rpcrdma_rep, rr_base),
1151                                 &buf->rb_recv_bufs[i]->rr_handle,
1152                                 &buf->rb_recv_bufs[i]->rr_iov);
1153                 if (rc)
1154                         goto out;
1155
1156         }
1157         dprintk("RPC:       %s: max_requests %d\n",
1158                 __func__, buf->rb_max_requests);
1159         /* done */
1160         return 0;
1161 out:
1162         rpcrdma_buffer_destroy(buf);
1163         return rc;
1164 }
1165
1166 /*
1167  * Unregister and destroy buffer memory. Need to deal with
1168  * partial initialization, so it's callable from failed create.
1169  * Must be called before destroying endpoint, as registrations
1170  * reference it.
1171  */
1172 void
1173 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1174 {
1175         int rc, i;
1176         struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1177         struct rpcrdma_mw *r;
1178
1179         /* clean up in reverse order from create
1180          *   1.  recv mr memory (mr free, then kfree)
1181          *   1a. bind mw memory
1182          *   2.  send mr memory (mr free, then kfree)
1183          *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
1184          *   4.  arrays
1185          */
1186         dprintk("RPC:       %s: entering\n", __func__);
1187
1188         for (i = 0; i < buf->rb_max_requests; i++) {
1189                 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1190                         rpcrdma_deregister_internal(ia,
1191                                         buf->rb_recv_bufs[i]->rr_handle,
1192                                         &buf->rb_recv_bufs[i]->rr_iov);
1193                         kfree(buf->rb_recv_bufs[i]);
1194                 }
1195                 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1196                         while (!list_empty(&buf->rb_mws)) {
1197                                 r = list_entry(buf->rb_mws.next,
1198                                         struct rpcrdma_mw, mw_list);
1199                                 list_del(&r->mw_list);
1200                                 switch (ia->ri_memreg_strategy) {
1201                                 case RPCRDMA_FRMR:
1202                                         rc = ib_dereg_mr(r->r.frmr.fr_mr);
1203                                         if (rc)
1204                                                 dprintk("RPC:       %s:"
1205                                                         " ib_dereg_mr"
1206                                                         " failed %i\n",
1207                                                         __func__, rc);
1208                                         ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1209                                         break;
1210                                 case RPCRDMA_MTHCAFMR:
1211                                         rc = ib_dealloc_fmr(r->r.fmr);
1212                                         if (rc)
1213                                                 dprintk("RPC:       %s:"
1214                                                         " ib_dealloc_fmr"
1215                                                         " failed %i\n",
1216                                                         __func__, rc);
1217                                         break;
1218                                 case RPCRDMA_MEMWINDOWS_ASYNC:
1219                                 case RPCRDMA_MEMWINDOWS:
1220                                         rc = ib_dealloc_mw(r->r.mw);
1221                                         if (rc)
1222                                                 dprintk("RPC:       %s:"
1223                                                         " ib_dealloc_mw"
1224                                                         " failed %i\n",
1225                                                         __func__, rc);
1226                                         break;
1227                                 default:
1228                                         break;
1229                                 }
1230                         }
1231                         rpcrdma_deregister_internal(ia,
1232                                         buf->rb_send_bufs[i]->rl_handle,
1233                                         &buf->rb_send_bufs[i]->rl_iov);
1234                         kfree(buf->rb_send_bufs[i]);
1235                 }
1236         }
1237
1238         kfree(buf->rb_pool);
1239 }
1240
1241 /*
1242  * Get a set of request/reply buffers.
1243  *
1244  * Reply buffer (if needed) is attached to send buffer upon return.
1245  * Rule:
1246  *    rb_send_index and rb_recv_index MUST always be pointing to the
1247  *    *next* available buffer (non-NULL). They are incremented after
1248  *    removing buffers, and decremented *before* returning them.
1249  */
1250 struct rpcrdma_req *
1251 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1252 {
1253         struct rpcrdma_req *req;
1254         unsigned long flags;
1255         int i;
1256         struct rpcrdma_mw *r;
1257
1258         spin_lock_irqsave(&buffers->rb_lock, flags);
1259         if (buffers->rb_send_index == buffers->rb_max_requests) {
1260                 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1261                 dprintk("RPC:       %s: out of request buffers\n", __func__);
1262                 return ((struct rpcrdma_req *)NULL);
1263         }
1264
1265         req = buffers->rb_send_bufs[buffers->rb_send_index];
1266         if (buffers->rb_send_index < buffers->rb_recv_index) {
1267                 dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1268                         __func__,
1269                         buffers->rb_recv_index - buffers->rb_send_index);
1270                 req->rl_reply = NULL;
1271         } else {
1272                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1273                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1274         }
1275         buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
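        /*
         * Attach one mw per segment to the request, taken from the
         * head of the rb_mws free list.
         */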
1276         if (!list_empty(&buffers->rb_mws)) {
1277                 i = RPCRDMA_MAX_SEGS - 1;
1278                 do {
1279                         r = list_entry(buffers->rb_mws.next,
1280                                         struct rpcrdma_mw, mw_list);
1281                         list_del(&r->mw_list);
1282                         req->rl_segments[i].mr_chunk.rl_mw = r;
1283                 } while (--i >= 0);
1284         }
1285         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1286         return req;
1287 }
1288
1289 /*
1290  * Put request/reply buffers back into pool.
1291  * Pre-decrement counter/array index.
1292  */
1293 void
1294 rpcrdma_buffer_put(struct rpcrdma_req *req)
1295 {
1296         struct rpcrdma_buffer *buffers = req->rl_buffer;
1297         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1298         int i;
1299         unsigned long flags;
1300
1301         BUG_ON(req->rl_nchunks != 0);
1302         spin_lock_irqsave(&buffers->rb_lock, flags);
1303         buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1304         req->rl_niovs = 0;
1305         if (req->rl_reply) {
1306                 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1307                 init_waitqueue_head(&req->rl_reply->rr_unbind);
1308                 req->rl_reply->rr_func = NULL;
1309                 req->rl_reply = NULL;
1310         }
1311         switch (ia->ri_memreg_strategy) {
1312         case RPCRDMA_FRMR:
1313         case RPCRDMA_MTHCAFMR:
1314         case RPCRDMA_MEMWINDOWS_ASYNC:
1315         case RPCRDMA_MEMWINDOWS:
1316                 /*
1317                  * Cycle mw's back in reverse order, and "spin" them.
1318                  * This delays and scrambles reuse as much as possible.
1319                  */
1320                 i = 1;
1321                 do {
1322                         struct rpcrdma_mw **mw;
1323                         mw = &req->rl_segments[i].mr_chunk.rl_mw;
1324                         list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1325                         *mw = NULL;
1326                 } while (++i < RPCRDMA_MAX_SEGS);
1327                 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1328                                         &buffers->rb_mws);
1329                 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1330                 break;
1331         default:
1332                 break;
1333         }
1334         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1335 }
1336
1337 /*
1338  * Recover reply buffers from pool.
1339  * This happens when recovering from error conditions.
1340  * Post-increment counter/array index.
1341  */
1342 void
1343 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1344 {
1345         struct rpcrdma_buffer *buffers = req->rl_buffer;
1346         unsigned long flags;
1347
1348         if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1349                 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1350         spin_lock_irqsave(&buffers->rb_lock, flags);
1351         if (buffers->rb_recv_index < buffers->rb_max_requests) {
1352                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1353                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1354         }
1355         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1356 }
1357
1358 /*
1359  * Put reply buffers back into pool when not attached to
1360  * request. This happens in error conditions, and when
1361  * aborting unbinds. Pre-decrement counter/array index.
1362  */
1363 void
1364 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1365 {
1366         struct rpcrdma_buffer *buffers = rep->rr_buffer;
1367         unsigned long flags;
1368
1369         rep->rr_func = NULL;
1370         spin_lock_irqsave(&buffers->rb_lock, flags);
1371         buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1372         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1373 }
1374
1375 /*
1376  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1377  */
1378
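/*
 * rpcrdma_register_internal: DMA-map a kmalloc'ed buffer and choose an
 * lkey for it. If the device supplies a global DMA lkey, or the transport
 * holds a "bind" MR covering all of memory, no per-buffer MR is needed and
 * *mrp is returned NULL; otherwise the single physical segment is registered
 * with ib_reg_phys_mr() and the resulting MR is returned through *mrp.
 */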
1379 int
1380 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1381                                 struct ib_mr **mrp, struct ib_sge *iov)
1382 {
1383         struct ib_phys_buf ipb;
1384         struct ib_mr *mr;
1385         int rc;
1386
1387         /*
1388          * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1389          */
1390         iov->addr = ib_dma_map_single(ia->ri_id->device,
1391                         va, len, DMA_BIDIRECTIONAL);
1392         iov->length = len;
1393
1394         if (ia->ri_have_dma_lkey) {
1395                 *mrp = NULL;
1396                 iov->lkey = ia->ri_dma_lkey;
1397                 return 0;
1398         } else if (ia->ri_bind_mem != NULL) {
1399                 *mrp = NULL;
1400                 iov->lkey = ia->ri_bind_mem->lkey;
1401                 return 0;
1402         }
1403
1404         ipb.addr = iov->addr;
1405         ipb.size = iov->length;
1406         mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1407                         IB_ACCESS_LOCAL_WRITE, &iov->addr);
1408
1409         dprintk("RPC:       %s: phys convert: 0x%llx "
1410                         "registered 0x%llx length %d\n",
1411                         __func__, (unsigned long long)ipb.addr,
1412                         (unsigned long long)iov->addr, len);
1413
1414         if (IS_ERR(mr)) {
1415                 *mrp = NULL;
1416                 rc = PTR_ERR(mr);
1417                 dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1418         } else {
1419                 *mrp = mr;
1420                 iov->lkey = mr->lkey;
1421                 rc = 0;
1422         }
1423
1424         return rc;
1425 }
1426
1427 int
1428 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1429                                 struct ib_mr *mr, struct ib_sge *iov)
1430 {
1431         int rc;
1432
1433         ib_dma_unmap_single(ia->ri_id->device,
1434                         iov->addr, iov->length, DMA_BIDIRECTIONAL);
1435
1436         if (mr == NULL)
1437                 return 0;
1438
1439         rc = ib_dereg_mr(mr);
1440         if (rc)
1441                 dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1442         return rc;
1443 }
1444
1445 /*
1446  * Wrappers for chunk registration, shared by read/write chunk code.
1447  */
1448
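/*
 * rpcrdma_map_one/rpcrdma_unmap_one: DMA-map and unmap a single mr_seg.
 * Page-based segments go through ib_dma_map_page(), kernel-virtual ones
 * through ib_dma_map_single(). "writing" means the remote peer will write
 * into this memory, so the mapping direction is DMA_FROM_DEVICE; otherwise
 * it is DMA_TO_DEVICE.
 */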
1449 static void
1450 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1451 {
1452         seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1453         seg->mr_dmalen = seg->mr_len;
1454         if (seg->mr_page)
1455                 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1456                                 seg->mr_page, offset_in_page(seg->mr_offset),
1457                                 seg->mr_dmalen, seg->mr_dir);
1458         else
1459                 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1460                                 seg->mr_offset,
1461                                 seg->mr_dmalen, seg->mr_dir);
1462         if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1463                 dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1464                         __func__,
1465                         (unsigned long long)seg->mr_dma,
1466                         seg->mr_offset, seg->mr_dmalen);
1467         }
1468 }
1469
1470 static void
1471 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1472 {
1473         if (seg->mr_page)
1474                 ib_dma_unmap_page(ia->ri_id->device,
1475                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1476         else
1477                 ib_dma_unmap_single(ia->ri_id->device,
1478                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1479 }
1480
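/*
 * FRMR registration: DMA-map up to RPCRDMA_MAX_DATA_SEGS segments into the
 * MW's fast_reg page list, stopping early at a "hole" (a segment boundary
 * that is not page aligned). If the FRMR was left VALID by a previous use,
 * chain an IB_WR_LOCAL_INV ahead of the IB_WR_FAST_REG_MR; the 8-bit key
 * portion of the rkey is bumped before posting so each registration
 * advertises a fresh rkey. On success, seg[0] carries the rkey, base and
 * length describing the whole chunk.
 */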
1481 static int
1482 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1483                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1484                         struct rpcrdma_xprt *r_xprt)
1485 {
1486         struct rpcrdma_mr_seg *seg1 = seg;
1487         struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1488
1489         u8 key;
1490         int len, pageoff;
1491         int i, rc;
1492
1493         pageoff = offset_in_page(seg1->mr_offset);
1494         seg1->mr_offset -= pageoff;     /* start of page */
1495         seg1->mr_len += pageoff;
1496         len = -pageoff;
1497         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1498                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1499         for (i = 0; i < *nsegs;) {
1500                 rpcrdma_map_one(ia, seg, writing);
1501                 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1502                 len += seg->mr_len;
1503                 BUG_ON(seg->mr_len > PAGE_SIZE);
1504                 ++seg;
1505                 ++i;
1506                 /* Check for holes */
1507                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1508                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1509                         break;
1510         }
1511         dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
1512                 __func__, seg1->mr_chunk.rl_mw, i);
1513
1514         if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1515                 dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
1516                         __func__,
1517                         seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1518                 /* Invalidate before using. */
1519                 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1520                 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1521                 invalidate_wr.next = &frmr_wr;
1522                 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1523                 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1524                 invalidate_wr.ex.invalidate_rkey =
1525                         seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1526                 DECR_CQCOUNT(&r_xprt->rx_ep);
1527                 post_wr = &invalidate_wr;
1528         } else
1529                 post_wr = &frmr_wr;
1530
1531         /* Bump the key */
1532         key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1533         ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1534
1535         /* Prepare FRMR WR */
1536         memset(&frmr_wr, 0, sizeof frmr_wr);
1537         frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1538         frmr_wr.opcode = IB_WR_FAST_REG_MR;
1539         frmr_wr.send_flags = IB_SEND_SIGNALED;
1540         frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1541         frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1542         frmr_wr.wr.fast_reg.page_list_len = i;
1543         frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1544         frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1545         BUG_ON(frmr_wr.wr.fast_reg.length < len);
1546         frmr_wr.wr.fast_reg.access_flags = (writing ?
1547                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1548                                 IB_ACCESS_REMOTE_READ);
1549         frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1550         DECR_CQCOUNT(&r_xprt->rx_ep);
1551
1552         rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1553
1554         if (rc) {
1555                 dprintk("RPC:       %s: failed ib_post_send for register,"
1556                         " status %i\n", __func__, rc);
1557                 while (i--)
1558                         rpcrdma_unmap_one(ia, --seg);
1559         } else {
1560                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1561                 seg1->mr_base = seg1->mr_dma + pageoff;
1562                 seg1->mr_nsegs = i;
1563                 seg1->mr_len = len;
1564         }
1565         *nsegs = i;
1566         return rc;
1567 }
1568
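/*
 * FRMR deregistration: unmap the segments, then post an IB_WR_LOCAL_INV
 * to invalidate the FRMR's current rkey.
 */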
1569 static int
1570 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1571                         struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1572 {
1573         struct rpcrdma_mr_seg *seg1 = seg;
1574         struct ib_send_wr invalidate_wr, *bad_wr;
1575         int rc;
1576
1577         while (seg1->mr_nsegs--)
1578                 rpcrdma_unmap_one(ia, seg++);
1579
1580         memset(&invalidate_wr, 0, sizeof invalidate_wr);
1581         invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1582         invalidate_wr.opcode = IB_WR_LOCAL_INV;
1583         invalidate_wr.send_flags = IB_SEND_SIGNALED;
1584         invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1585         DECR_CQCOUNT(&r_xprt->rx_ep);
1586
1587         rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1588         if (rc)
1589                 dprintk("RPC:       %s: failed ib_post_send for invalidate,"
1590                         " status %i\n", __func__, rc);
1591         return rc;
1592 }
1593
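/*
 * FMR registration: DMA-map the segments (stopping at the first
 * page-alignment hole), collect their bus addresses, and map them under
 * the pre-allocated FMR with ib_map_phys_fmr(). The FMR's rkey describes
 * the chunk.
 */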
1594 static int
1595 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1596                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1597 {
1598         struct rpcrdma_mr_seg *seg1 = seg;
1599         u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1600         int len, pageoff, i, rc;
1601
1602         pageoff = offset_in_page(seg1->mr_offset);
1603         seg1->mr_offset -= pageoff;     /* start of page */
1604         seg1->mr_len += pageoff;
1605         len = -pageoff;
1606         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1607                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1608         for (i = 0; i < *nsegs;) {
1609                 rpcrdma_map_one(ia, seg, writing);
1610                 physaddrs[i] = seg->mr_dma;
1611                 len += seg->mr_len;
1612                 ++seg;
1613                 ++i;
1614                 /* Check for holes */
1615                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1616                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1617                         break;
1618         }
1619         rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1620                                 physaddrs, i, seg1->mr_dma);
1621         if (rc) {
1622                 dprintk("RPC:       %s: failed ib_map_phys_fmr "
1623                         "%u@0x%llx+%i (%d)... status %i\n", __func__,
1624                         len, (unsigned long long)seg1->mr_dma,
1625                         pageoff, i, rc);
1626                 while (i--)
1627                         rpcrdma_unmap_one(ia, --seg);
1628         } else {
1629                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1630                 seg1->mr_base = seg1->mr_dma + pageoff;
1631                 seg1->mr_nsegs = i;
1632                 seg1->mr_len = len;
1633         }
1634         *nsegs = i;
1635         return rc;
1636 }
1637
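/*
 * FMR deregistration: ib_unmap_fmr() takes a list, so put the single FMR
 * on a local list head, unmap it, then undo the DMA mappings.
 */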
1638 static int
1639 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1640                         struct rpcrdma_ia *ia)
1641 {
1642         struct rpcrdma_mr_seg *seg1 = seg;
1643         LIST_HEAD(l);
1644         int rc;
1645
1646         list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1647         rc = ib_unmap_fmr(&l);
1648         while (seg1->mr_nsegs--)
1649                 rpcrdma_unmap_one(ia, seg++);
1650         if (rc)
1651                 dprintk("RPC:       %s: failed ib_unmap_fmr,"
1652                         " status %i\n", __func__, rc);
1653         return rc;
1654 }
1655
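/*
 * Memory-window registration: map exactly one segment and bind a window
 * from the pre-registered "bind" MR over it with ib_bind_mw(). The rkey of
 * the bound window describes the chunk.
 */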
1656 static int
1657 rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1658                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1659                         struct rpcrdma_xprt *r_xprt)
1660 {
1661         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1662                                   IB_ACCESS_REMOTE_READ);
1663         struct ib_mw_bind param;
1664         int rc;
1665
1666         *nsegs = 1;
1667         rpcrdma_map_one(ia, seg, writing);
1668         param.mr = ia->ri_bind_mem;
1669         param.wr_id = 0ULL;     /* no send cookie */
1670         param.addr = seg->mr_dma;
1671         param.length = seg->mr_len;
1672         param.send_flags = 0;
1673         param.mw_access_flags = mem_priv;
1674
1675         DECR_CQCOUNT(&r_xprt->rx_ep);
1676         rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1677         if (rc) {
1678                 dprintk("RPC:       %s: failed ib_bind_mw "
1679                         "%u@0x%llx status %i\n",
1680                         __func__, seg->mr_len,
1681                         (unsigned long long)seg->mr_dma, rc);
1682                 rpcrdma_unmap_one(ia, seg);
1683         } else {
1684                 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1685                 seg->mr_base = param.addr;
1686                 seg->mr_nsegs = 1;
1687         }
1688         return rc;
1689 }
1690
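/*
 * Memory-window deregistration: rebind the window with zero length, which
 * unbinds it. If the caller passed a reply (*r), the unbind is posted
 * signaled and the reply's callback runs from the unbind completion rather
 * than synchronously; *r is cleared to tell the caller not to run it.
 */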
1691 static int
1692 rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1693                         struct rpcrdma_ia *ia,
1694                         struct rpcrdma_xprt *r_xprt, void **r)
1695 {
1696         struct ib_mw_bind param;
1697         LIST_HEAD(l);
1698         int rc;
1699
1700         BUG_ON(seg->mr_nsegs != 1);
1701         param.mr = ia->ri_bind_mem;
1702         param.addr = 0ULL;      /* unbind */
1703         param.length = 0;
1704         param.mw_access_flags = 0;
1705         if (*r) {
1706                 param.wr_id = (u64) (unsigned long) *r;
1707                 param.send_flags = IB_SEND_SIGNALED;
1708                 INIT_CQCOUNT(&r_xprt->rx_ep);
1709         } else {
1710                 param.wr_id = 0ULL;
1711                 param.send_flags = 0;
1712                 DECR_CQCOUNT(&r_xprt->rx_ep);
1713         }
1714         rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1715         rpcrdma_unmap_one(ia, seg);
1716         if (rc)
1717                 dprintk("RPC:       %s: failed ib_(un)bind_mw,"
1718                         " status %i\n", __func__, rc);
1719         else
1720                 *r = NULL;      /* will upcall on completion */
1721         return rc;
1722 }
1723
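/*
 * Default registration: map the segments (stopping at the first
 * page-alignment hole) and register them with ib_reg_phys_mr() on every
 * call. This is the fallback when no faster per-request strategy is
 * available.
 */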
1724 static int
1725 rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1726                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1727 {
1728         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1729                                   IB_ACCESS_REMOTE_READ);
1730         struct rpcrdma_mr_seg *seg1 = seg;
1731         struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1732         int len, i, rc = 0;
1733
1734         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1735                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1736         for (len = 0, i = 0; i < *nsegs;) {
1737                 rpcrdma_map_one(ia, seg, writing);
1738                 ipb[i].addr = seg->mr_dma;
1739                 ipb[i].size = seg->mr_len;
1740                 len += seg->mr_len;
1741                 ++seg;
1742                 ++i;
1743                 /* Check for holes */
1744                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1745                     offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1746                         break;
1747         }
1748         seg1->mr_base = seg1->mr_dma;
1749         seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1750                                 ipb, i, mem_priv, &seg1->mr_base);
1751         if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1752                 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1753                 dprintk("RPC:       %s: failed ib_reg_phys_mr "
1754                         "%u@0x%llx (%d)... status %i\n",
1755                         __func__, len,
1756                         (unsigned long long)seg1->mr_dma, i, rc);
1757                 while (i--)
1758                         rpcrdma_unmap_one(ia, --seg);
1759         } else {
1760                 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1761                 seg1->mr_nsegs = i;
1762                 seg1->mr_len = len;
1763         }
1764         *nsegs = i;
1765         return rc;
1766 }
1767
1768 static int
1769 rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1770                         struct rpcrdma_ia *ia)
1771 {
1772         struct rpcrdma_mr_seg *seg1 = seg;
1773         int rc;
1774
1775         rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1776         seg1->mr_chunk.rl_mr = NULL;
1777         while (seg1->mr_nsegs--)
1778                 rpcrdma_unmap_one(ia, seg++);
1779         if (rc)
1780                 dprintk("RPC:       %s: failed ib_dereg_mr,"
1781                         " status %i\n", __func__, rc);
1782         return rc;
1783 }
1784
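/*
 * Register a chunk of memory for RDMA, using the strategy selected at
 * transport setup (ia->ri_memreg_strategy). Returns the number of segments
 * actually covered by this registration, or -1 on failure.
 *
 * Rough caller's-eye sketch (the chunk-marshaling code in rpc_rdma.c works
 * along these lines; names and details there differ):
 *
 *	while (nsegs) {
 *		n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *		if (n <= 0)
 *			break;
 *		(emit one chunk element from seg->mr_rkey,
 *		 seg->mr_base and seg->mr_len)
 *		seg += n;
 *		nsegs -= n;
 *	}
 */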
1785 int
1786 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1787                         int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1788 {
1789         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1790         int rc = 0;
1791
1792         switch (ia->ri_memreg_strategy) {
1793
1794 #if RPCRDMA_PERSISTENT_REGISTRATION
1795         case RPCRDMA_ALLPHYSICAL:
1796                 rpcrdma_map_one(ia, seg, writing);
1797                 seg->mr_rkey = ia->ri_bind_mem->rkey;
1798                 seg->mr_base = seg->mr_dma;
1799                 seg->mr_nsegs = 1;
1800                 nsegs = 1;
1801                 break;
1802 #endif
1803
1804         /* Registration using fast registration memory regions (FRMR) */
1805         case RPCRDMA_FRMR:
1806                 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1807                 break;
1808
1809         /* Registration using fast memory regions (FMR) */
1810         case RPCRDMA_MTHCAFMR:
1811                 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1812                 break;
1813
1814         /* Registration using memory windows */
1815         case RPCRDMA_MEMWINDOWS_ASYNC:
1816         case RPCRDMA_MEMWINDOWS:
1817                 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1818                 break;
1819
1820         /* Default: register a physical MR for each request */
1821         default:
1822                 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1823                 break;
1824         }
1825         if (rc)
1826                 return -1;
1827
1828         return nsegs;
1829 }
1830
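/*
 * Undo a registration made by rpcrdma_register_external(), dispatching on
 * the same memory registration strategy. If @r is non-NULL it points to an
 * rpcrdma_rep whose rr_func callback is run once deregistration is done;
 * the memory-window path may instead defer the callback to the unbind
 * completion, in which case it clears the caller's pointer.
 */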
1831 int
1832 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1833                 struct rpcrdma_xprt *r_xprt, void *r)
1834 {
1835         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1836         int nsegs = seg->mr_nsegs, rc;
1837
1838         switch (ia->ri_memreg_strategy) {
1839
1840 #if RPCRDMA_PERSISTENT_REGISTRATION
1841         case RPCRDMA_ALLPHYSICAL:
1842                 BUG_ON(nsegs != 1);
1843                 rpcrdma_unmap_one(ia, seg);
1844                 rc = 0;
1845                 break;
1846 #endif
1847
1848         case RPCRDMA_FRMR:
1849                 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1850                 break;
1851
1852         case RPCRDMA_MTHCAFMR:
1853                 rc = rpcrdma_deregister_fmr_external(seg, ia);
1854                 break;
1855
1856         case RPCRDMA_MEMWINDOWS_ASYNC:
1857         case RPCRDMA_MEMWINDOWS:
1858                 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1859                 break;
1860
1861         default:
1862                 rc = rpcrdma_deregister_default_external(seg, ia);
1863                 break;
1864         }
1865         if (r) {
1866                 struct rpcrdma_rep *rep = r;
1867                 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1868                 rep->rr_func = NULL;
1869                 func(rep);      /* dereg done, callback now */
1870         }
1871         return nsegs;
1872 }
1873
1874 /*
1875  * Prepost any receive buffer, then post send.
1876  *
1877  * Receive buffer is donated to hardware, reclaimed upon recv completion.
1878  */
1879 int
1880 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1881                 struct rpcrdma_ep *ep,
1882                 struct rpcrdma_req *req)
1883 {
1884         struct ib_send_wr send_wr, *send_wr_fail;
1885         struct rpcrdma_rep *rep = req->rl_reply;
1886         int rc;
1887
1888         if (rep) {
1889                 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1890                 if (rc)
1891                         goto out;
1892                 req->rl_reply = NULL;
1893         }
1894
1895         send_wr.next = NULL;
1896         send_wr.wr_id = 0ULL;   /* no send cookie */
1897         send_wr.sg_list = req->rl_send_iov;
1898         send_wr.num_sge = req->rl_niovs;
1899         send_wr.opcode = IB_WR_SEND;
1900         if (send_wr.num_sge == 4)       /* sge[2] is the constant zero pad: no sync needed */
1901                 ib_dma_sync_single_for_device(ia->ri_id->device,
1902                         req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1903                         DMA_TO_DEVICE);
1904         ib_dma_sync_single_for_device(ia->ri_id->device,
1905                 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1906                 DMA_TO_DEVICE);
1907         ib_dma_sync_single_for_device(ia->ri_id->device,
1908                 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1909                 DMA_TO_DEVICE);
1910
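        /*
         * Sends are normally posted unsignaled. DECR_CQCOUNT consumes a
         * budget established by INIT_CQCOUNT; once it is exhausted, reset
         * it and request a signaled completion so the provider can reap
         * the send queue.
         */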
1911         if (DECR_CQCOUNT(ep) > 0)
1912                 send_wr.send_flags = 0;
1913         else { /* Provider must take a send completion every now and then */
1914                 INIT_CQCOUNT(ep);
1915                 send_wr.send_flags = IB_SEND_SIGNALED;
1916         }
1917
1918         rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1919         if (rc)
1920                 dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
1921                         rc);
1922 out:
1923         return rc;
1924 }
1925
1926 /*
1927  * (Re)post a receive buffer.
1928  */
1929 int
1930 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1931                      struct rpcrdma_ep *ep,
1932                      struct rpcrdma_rep *rep)
1933 {
1934         struct ib_recv_wr recv_wr, *recv_wr_fail;
1935         int rc;
1936
1937         recv_wr.next = NULL;
1938         recv_wr.wr_id = (u64) (unsigned long) rep;
1939         recv_wr.sg_list = &rep->rr_iov;
1940         recv_wr.num_sge = 1;
1941
1942         ib_dma_sync_single_for_cpu(ia->ri_id->device,
1943                 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1944
1945         DECR_CQCOUNT(ep);
1946         rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1947
1948         if (rc)
1949                 dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
1950                         rc);
1951         return rc;
1952 }