Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph...
[pandora-kernel.git] / drivers / infiniband / hw / ipath / ipath_verbs.c
1 /*
2  * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
3  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <rdma/ib_mad.h>
35 #include <rdma/ib_user_verbs.h>
36 #include <linux/io.h>
37 #include <linux/slab.h>
38 #include <linux/module.h>
39 #include <linux/utsname.h>
40 #include <linux/rculist.h>
41
42 #include "ipath_kernel.h"
43 #include "ipath_verbs.h"
44 #include "ipath_common.h"
45
46 static unsigned int ib_ipath_qp_table_size = 251;
47 module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
48 MODULE_PARM_DESC(qp_table_size, "QP table size");
49
50 unsigned int ib_ipath_lkey_table_size = 12;
51 module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
52                    S_IRUGO);
53 MODULE_PARM_DESC(lkey_table_size,
54                  "LKEY table size in bits (2^n, 1 <= n <= 23)");
55
56 static unsigned int ib_ipath_max_pds = 0xFFFF;
57 module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
58 MODULE_PARM_DESC(max_pds,
59                  "Maximum number of protection domains to support");
60
61 static unsigned int ib_ipath_max_ahs = 0xFFFF;
62 module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
63 MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
64
65 unsigned int ib_ipath_max_cqes = 0x2FFFF;
66 module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
67 MODULE_PARM_DESC(max_cqes,
68                  "Maximum number of completion queue entries to support");
69
70 unsigned int ib_ipath_max_cqs = 0x1FFFF;
71 module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
72 MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
73
74 unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
75 module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
76                    S_IWUSR | S_IRUGO);
77 MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
78
79 unsigned int ib_ipath_max_qps = 16384;
80 module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
81 MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
82
83 unsigned int ib_ipath_max_sges = 0x60;
84 module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
85 MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
86
87 unsigned int ib_ipath_max_mcast_grps = 16384;
88 module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
89                    S_IWUSR | S_IRUGO);
90 MODULE_PARM_DESC(max_mcast_grps,
91                  "Maximum number of multicast groups to support");
92
93 unsigned int ib_ipath_max_mcast_qp_attached = 16;
94 module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
95                    uint, S_IWUSR | S_IRUGO);
96 MODULE_PARM_DESC(max_mcast_qp_attached,
97                  "Maximum number of attached QPs to support");
98
99 unsigned int ib_ipath_max_srqs = 1024;
100 module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
101 MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
102
103 unsigned int ib_ipath_max_srq_sges = 128;
104 module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
105                    uint, S_IWUSR | S_IRUGO);
106 MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
107
108 unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
109 module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
110                    uint, S_IWUSR | S_IRUGO);
111 MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
112
113 static unsigned int ib_ipath_disable_sma;
114 module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
115 MODULE_PARM_DESC(disable_sma, "Disable the SMA");
116
117 /*
118  * Note that it is OK to post send work requests in the SQE and ERR
119  * states; ipath_do_send() will process them and generate error
120  * completions as per IB 1.2 C10-96.
121  */
122 const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
123         [IB_QPS_RESET] = 0,
124         [IB_QPS_INIT] = IPATH_POST_RECV_OK,
125         [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
126         [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
127             IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
128             IPATH_PROCESS_NEXT_SEND_OK,
129         [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
130             IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
131         [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
132             IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
133         [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
134             IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
135 };
136
137 struct ipath_ucontext {
138         struct ib_ucontext ibucontext;
139 };
140
141 static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
142                                                   *ibucontext)
143 {
144         return container_of(ibucontext, struct ipath_ucontext, ibucontext);
145 }
146
147 /*
148  * Translate ib_wr_opcode into ib_wc_opcode.
149  */
150 const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
151         [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
152         [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
153         [IB_WR_SEND] = IB_WC_SEND,
154         [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
155         [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
156         [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
157         [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
158 };
159
160 /*
161  * System image GUID.
162  */
163 static __be64 sys_image_guid;
164
165 /**
166  * ipath_copy_sge - copy data to SGE memory
167  * @ss: the SGE state
168  * @data: the data to copy
169  * @length: the length of the data
170  */
171 void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
172 {
173         struct ipath_sge *sge = &ss->sge;
174
175         while (length) {
176                 u32 len = sge->length;
177
178                 if (len > length)
179                         len = length;
180                 if (len > sge->sge_length)
181                         len = sge->sge_length;
182                 BUG_ON(len == 0);
183                 memcpy(sge->vaddr, data, len);
184                 sge->vaddr += len;
185                 sge->length -= len;
186                 sge->sge_length -= len;
187                 if (sge->sge_length == 0) {
188                         if (--ss->num_sge)
189                                 *sge = *ss->sg_list++;
190                 } else if (sge->length == 0 && sge->mr != NULL) {
191                         if (++sge->n >= IPATH_SEGSZ) {
192                                 if (++sge->m >= sge->mr->mapsz)
193                                         break;
194                                 sge->n = 0;
195                         }
196                         sge->vaddr =
197                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
198                         sge->length =
199                                 sge->mr->map[sge->m]->segs[sge->n].length;
200                 }
201                 data += len;
202                 length -= len;
203         }
204 }
205
206 /**
207  * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
208  * @ss: the SGE state
209  * @length: the number of bytes to skip
210  */
211 void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
212 {
213         struct ipath_sge *sge = &ss->sge;
214
215         while (length) {
216                 u32 len = sge->length;
217
218                 if (len > length)
219                         len = length;
220                 if (len > sge->sge_length)
221                         len = sge->sge_length;
222                 BUG_ON(len == 0);
223                 sge->vaddr += len;
224                 sge->length -= len;
225                 sge->sge_length -= len;
226                 if (sge->sge_length == 0) {
227                         if (--ss->num_sge)
228                                 *sge = *ss->sg_list++;
229                 } else if (sge->length == 0 && sge->mr != NULL) {
230                         if (++sge->n >= IPATH_SEGSZ) {
231                                 if (++sge->m >= sge->mr->mapsz)
232                                         break;
233                                 sge->n = 0;
234                         }
235                         sge->vaddr =
236                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
237                         sge->length =
238                                 sge->mr->map[sge->m]->segs[sge->n].length;
239                 }
240                 length -= len;
241         }
242 }
243
244 /*
245  * Count the number of DMA descriptors needed to send length bytes of data.
246  * Don't modify the ipath_sge_state to get the count.
247  * Return zero if any of the segments is not aligned.
248  */
249 static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
250 {
251         struct ipath_sge *sg_list = ss->sg_list;
252         struct ipath_sge sge = ss->sge;
253         u8 num_sge = ss->num_sge;
254         u32 ndesc = 1;  /* count the header */
255
256         while (length) {
257                 u32 len = sge.length;
258
259                 if (len > length)
260                         len = length;
261                 if (len > sge.sge_length)
262                         len = sge.sge_length;
263                 BUG_ON(len == 0);
264                 if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
265                     (len != length && (len & (sizeof(u32) - 1)))) {
266                         ndesc = 0;
267                         break;
268                 }
269                 ndesc++;
270                 sge.vaddr += len;
271                 sge.length -= len;
272                 sge.sge_length -= len;
273                 if (sge.sge_length == 0) {
274                         if (--num_sge)
275                                 sge = *sg_list++;
276                 } else if (sge.length == 0 && sge.mr != NULL) {
277                         if (++sge.n >= IPATH_SEGSZ) {
278                                 if (++sge.m >= sge.mr->mapsz)
279                                         break;
280                                 sge.n = 0;
281                         }
282                         sge.vaddr =
283                                 sge.mr->map[sge.m]->segs[sge.n].vaddr;
284                         sge.length =
285                                 sge.mr->map[sge.m]->segs[sge.n].length;
286                 }
287                 length -= len;
288         }
289         return ndesc;
290 }
291
292 /*
293  * Copy from the SGEs to the data buffer.
294  */
295 static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
296                                 u32 length)
297 {
298         struct ipath_sge *sge = &ss->sge;
299
300         while (length) {
301                 u32 len = sge->length;
302
303                 if (len > length)
304                         len = length;
305                 if (len > sge->sge_length)
306                         len = sge->sge_length;
307                 BUG_ON(len == 0);
308                 memcpy(data, sge->vaddr, len);
309                 sge->vaddr += len;
310                 sge->length -= len;
311                 sge->sge_length -= len;
312                 if (sge->sge_length == 0) {
313                         if (--ss->num_sge)
314                                 *sge = *ss->sg_list++;
315                 } else if (sge->length == 0 && sge->mr != NULL) {
316                         if (++sge->n >= IPATH_SEGSZ) {
317                                 if (++sge->m >= sge->mr->mapsz)
318                                         break;
319                                 sge->n = 0;
320                         }
321                         sge->vaddr =
322                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
323                         sge->length =
324                                 sge->mr->map[sge->m]->segs[sge->n].length;
325                 }
326                 data += len;
327                 length -= len;
328         }
329 }
330
331 /**
332  * ipath_post_one_send - post one RC, UC, or UD send work request
333  * @qp: the QP to post on
334  * @wr: the work request to send
335  */
336 static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
337 {
338         struct ipath_swqe *wqe;
339         u32 next;
340         int i;
341         int j;
342         int acc;
343         int ret;
344         unsigned long flags;
345         struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
346
347         spin_lock_irqsave(&qp->s_lock, flags);
348
349         if (qp->ibqp.qp_type != IB_QPT_SMI &&
350             !(dd->ipath_flags & IPATH_LINKACTIVE)) {
351                 ret = -ENETDOWN;
352                 goto bail;
353         }
354
355         /* Check that state is OK to post send. */
356         if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
357                 goto bail_inval;
358
359         /* IB spec says that num_sge == 0 is OK. */
360         if (wr->num_sge > qp->s_max_sge)
361                 goto bail_inval;
362
363         /*
364          * Don't allow RDMA reads or atomic operations on UC or
365          * undefined operations.
366          * Make sure buffer is large enough to hold the result for atomics.
367          */
368         if (qp->ibqp.qp_type == IB_QPT_UC) {
369                 if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
370                         goto bail_inval;
371         } else if (qp->ibqp.qp_type == IB_QPT_UD) {
372                 /* Check UD opcode */
373                 if (wr->opcode != IB_WR_SEND &&
374                     wr->opcode != IB_WR_SEND_WITH_IMM)
375                         goto bail_inval;
376                 /* Check UD destination address PD */
377                 if (qp->ibqp.pd != wr->wr.ud.ah->pd)
378                         goto bail_inval;
379         } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
380                 goto bail_inval;
381         else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
382                    (wr->num_sge == 0 ||
383                     wr->sg_list[0].length < sizeof(u64) ||
384                     wr->sg_list[0].addr & (sizeof(u64) - 1)))
385                 goto bail_inval;
386         else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
387                 goto bail_inval;
388
389         next = qp->s_head + 1;
390         if (next >= qp->s_size)
391                 next = 0;
392         if (next == qp->s_last) {
393                 ret = -ENOMEM;
394                 goto bail;
395         }
396
397         wqe = get_swqe_ptr(qp, qp->s_head);
398         wqe->wr = *wr;
399         wqe->length = 0;
400         if (wr->num_sge) {
401                 acc = wr->opcode >= IB_WR_RDMA_READ ?
402                         IB_ACCESS_LOCAL_WRITE : 0;
403                 for (i = 0, j = 0; i < wr->num_sge; i++) {
404                         u32 length = wr->sg_list[i].length;
405                         int ok;
406
407                         if (length == 0)
408                                 continue;
409                         ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
410                                            &wr->sg_list[i], acc);
411                         if (!ok)
412                                 goto bail_inval;
413                         wqe->length += length;
414                         j++;
415                 }
416                 wqe->wr.num_sge = j;
417         }
418         if (qp->ibqp.qp_type == IB_QPT_UC ||
419             qp->ibqp.qp_type == IB_QPT_RC) {
420                 if (wqe->length > 0x80000000U)
421                         goto bail_inval;
422         } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
423                 goto bail_inval;
424         wqe->ssn = qp->s_ssn++;
425         qp->s_head = next;
426
427         ret = 0;
428         goto bail;
429
430 bail_inval:
431         ret = -EINVAL;
432 bail:
433         spin_unlock_irqrestore(&qp->s_lock, flags);
434         return ret;
435 }
436
437 /**
438  * ipath_post_send - post a send on a QP
439  * @ibqp: the QP to post the send on
440  * @wr: the list of work requests to post
441  * @bad_wr: the first bad WR is put here
442  *
443  * This may be called from interrupt context.
444  */
445 static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
446                            struct ib_send_wr **bad_wr)
447 {
448         struct ipath_qp *qp = to_iqp(ibqp);
449         int err = 0;
450
451         for (; wr; wr = wr->next) {
452                 err = ipath_post_one_send(qp, wr);
453                 if (err) {
454                         *bad_wr = wr;
455                         goto bail;
456                 }
457         }
458
459         /* Try to do the send work in the caller's context. */
460         ipath_do_send((unsigned long) qp);
461
462 bail:
463         return err;
464 }
465
466 /**
467  * ipath_post_receive - post a receive on a QP
468  * @ibqp: the QP to post the receive on
469  * @wr: the WR to post
470  * @bad_wr: the first bad WR is put here
471  *
472  * This may be called from interrupt context.
473  */
474 static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
475                               struct ib_recv_wr **bad_wr)
476 {
477         struct ipath_qp *qp = to_iqp(ibqp);
478         struct ipath_rwq *wq = qp->r_rq.wq;
479         unsigned long flags;
480         int ret;
481
482         /* Check that state is OK to post receive. */
483         if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
484                 *bad_wr = wr;
485                 ret = -EINVAL;
486                 goto bail;
487         }
488
489         for (; wr; wr = wr->next) {
490                 struct ipath_rwqe *wqe;
491                 u32 next;
492                 int i;
493
494                 if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
495                         *bad_wr = wr;
496                         ret = -EINVAL;
497                         goto bail;
498                 }
499
500                 spin_lock_irqsave(&qp->r_rq.lock, flags);
501                 next = wq->head + 1;
502                 if (next >= qp->r_rq.size)
503                         next = 0;
504                 if (next == wq->tail) {
505                         spin_unlock_irqrestore(&qp->r_rq.lock, flags);
506                         *bad_wr = wr;
507                         ret = -ENOMEM;
508                         goto bail;
509                 }
510
511                 wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
512                 wqe->wr_id = wr->wr_id;
513                 wqe->num_sge = wr->num_sge;
514                 for (i = 0; i < wr->num_sge; i++)
515                         wqe->sg_list[i] = wr->sg_list[i];
516                 /* Make sure queue entry is written before the head index. */
517                 smp_wmb();
518                 wq->head = next;
519                 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
520         }
521         ret = 0;
522
523 bail:
524         return ret;
525 }
526
527 /**
528  * ipath_qp_rcv - processing an incoming packet on a QP
529  * @dev: the device the packet came on
530  * @hdr: the packet header
531  * @has_grh: true if the packet has a GRH
532  * @data: the packet data
533  * @tlen: the packet length
534  * @qp: the QP the packet came on
535  *
536  * This is called from ipath_ib_rcv() to process an incoming packet
537  * for the given QP.
538  * Called at interrupt level.
539  */
540 static void ipath_qp_rcv(struct ipath_ibdev *dev,
541                          struct ipath_ib_header *hdr, int has_grh,
542                          void *data, u32 tlen, struct ipath_qp *qp)
543 {
544         /* Check for valid receive state. */
545         if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
546                 dev->n_pkt_drops++;
547                 return;
548         }
549
550         switch (qp->ibqp.qp_type) {
551         case IB_QPT_SMI:
552         case IB_QPT_GSI:
553                 if (ib_ipath_disable_sma)
554                         break;
555                 /* FALLTHROUGH */
556         case IB_QPT_UD:
557                 ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
558                 break;
559
560         case IB_QPT_RC:
561                 ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
562                 break;
563
564         case IB_QPT_UC:
565                 ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
566                 break;
567
568         default:
569                 break;
570         }
571 }
572
573 /**
574  * ipath_ib_rcv - process an incoming packet
575  * @arg: the device pointer
576  * @rhdr: the header of the packet
577  * @data: the packet data
578  * @tlen: the packet length
579  *
580  * This is called from ipath_kreceive() to process an incoming packet at
581  * interrupt level. Tlen is the length of the header + data + CRC in bytes.
582  */
583 void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
584                   u32 tlen)
585 {
586         struct ipath_ib_header *hdr = rhdr;
587         struct ipath_other_headers *ohdr;
588         struct ipath_qp *qp;
589         u32 qp_num;
590         int lnh;
591         u8 opcode;
592         u16 lid;
593
594         if (unlikely(dev == NULL))
595                 goto bail;
596
597         if (unlikely(tlen < 24)) {      /* LRH+BTH+CRC */
598                 dev->rcv_errors++;
599                 goto bail;
600         }
601
602         /* Check for a valid destination LID (see ch. 7.11.1). */
603         lid = be16_to_cpu(hdr->lrh[1]);
604         if (lid < IPATH_MULTICAST_LID_BASE) {
605                 lid &= ~((1 << dev->dd->ipath_lmc) - 1);
606                 if (unlikely(lid != dev->dd->ipath_lid)) {
607                         dev->rcv_errors++;
608                         goto bail;
609                 }
610         }
611
612         /* Check for GRH */
613         lnh = be16_to_cpu(hdr->lrh[0]) & 3;
614         if (lnh == IPATH_LRH_BTH)
615                 ohdr = &hdr->u.oth;
616         else if (lnh == IPATH_LRH_GRH)
617                 ohdr = &hdr->u.l.oth;
618         else {
619                 dev->rcv_errors++;
620                 goto bail;
621         }
622
623         opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
624         dev->opstats[opcode].n_bytes += tlen;
625         dev->opstats[opcode].n_packets++;
626
627         /* Get the destination QP number. */
628         qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
629         if (qp_num == IPATH_MULTICAST_QPN) {
630                 struct ipath_mcast *mcast;
631                 struct ipath_mcast_qp *p;
632
633                 if (lnh != IPATH_LRH_GRH) {
634                         dev->n_pkt_drops++;
635                         goto bail;
636                 }
637                 mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
638                 if (mcast == NULL) {
639                         dev->n_pkt_drops++;
640                         goto bail;
641                 }
642                 dev->n_multicast_rcv++;
643                 list_for_each_entry_rcu(p, &mcast->qp_list, list)
644                         ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
645                 /*
646                  * Notify ipath_multicast_detach() if it is waiting for us
647                  * to finish.
648                  */
649                 if (atomic_dec_return(&mcast->refcount) <= 1)
650                         wake_up(&mcast->wait);
651         } else {
652                 qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
653                 if (qp) {
654                         dev->n_unicast_rcv++;
655                         ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
656                                      tlen, qp);
657                         /*
658                          * Notify ipath_destroy_qp() if it is waiting
659                          * for us to finish.
660                          */
661                         if (atomic_dec_and_test(&qp->refcount))
662                                 wake_up(&qp->wait);
663                 } else
664                         dev->n_pkt_drops++;
665         }
666
667 bail:;
668 }
669
670 /**
671  * ipath_ib_timer - verbs timer
672  * @arg: the device pointer
673  *
674  * This is called from ipath_do_rcv_timer() at interrupt level to check for
675  * QPs which need retransmits and to collect performance numbers.
676  */
677 static void ipath_ib_timer(struct ipath_ibdev *dev)
678 {
679         struct ipath_qp *resend = NULL;
680         struct ipath_qp *rnr = NULL;
681         struct list_head *last;
682         struct ipath_qp *qp;
683         unsigned long flags;
684
685         if (dev == NULL)
686                 return;
687
688         spin_lock_irqsave(&dev->pending_lock, flags);
689         /* Start filling the next pending queue. */
690         if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
691                 dev->pending_index = 0;
692         /* Save any requests still in the new queue, they have timed out. */
693         last = &dev->pending[dev->pending_index];
694         while (!list_empty(last)) {
695                 qp = list_entry(last->next, struct ipath_qp, timerwait);
696                 list_del_init(&qp->timerwait);
697                 qp->timer_next = resend;
698                 resend = qp;
699                 atomic_inc(&qp->refcount);
700         }
701         last = &dev->rnrwait;
702         if (!list_empty(last)) {
703                 qp = list_entry(last->next, struct ipath_qp, timerwait);
704                 if (--qp->s_rnr_timeout == 0) {
705                         do {
706                                 list_del_init(&qp->timerwait);
707                                 qp->timer_next = rnr;
708                                 rnr = qp;
709                                 atomic_inc(&qp->refcount);
710                                 if (list_empty(last))
711                                         break;
712                                 qp = list_entry(last->next, struct ipath_qp,
713                                                 timerwait);
714                         } while (qp->s_rnr_timeout == 0);
715                 }
716         }
717         /*
718          * We should only be in the started state if pma_sample_start != 0
719          */
720         if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
721             --dev->pma_sample_start == 0) {
722                 dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
723                 ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
724                                         &dev->ipath_rword,
725                                         &dev->ipath_spkts,
726                                         &dev->ipath_rpkts,
727                                         &dev->ipath_xmit_wait);
728         }
729         if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
730                 if (dev->pma_sample_interval == 0) {
731                         u64 ta, tb, tc, td, te;
732
733                         dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
734                         ipath_snapshot_counters(dev->dd, &ta, &tb,
735                                                 &tc, &td, &te);
736
737                         dev->ipath_sword = ta - dev->ipath_sword;
738                         dev->ipath_rword = tb - dev->ipath_rword;
739                         dev->ipath_spkts = tc - dev->ipath_spkts;
740                         dev->ipath_rpkts = td - dev->ipath_rpkts;
741                         dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
742                 }
743                 else
744                         dev->pma_sample_interval--;
745         }
746         spin_unlock_irqrestore(&dev->pending_lock, flags);
747
748         /* XXX What if timer fires again while this is running? */
749         while (resend != NULL) {
750                 qp = resend;
751                 resend = qp->timer_next;
752
753                 spin_lock_irqsave(&qp->s_lock, flags);
754                 if (qp->s_last != qp->s_tail &&
755                     ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
756                         dev->n_timeouts++;
757                         ipath_restart_rc(qp, qp->s_last_psn + 1);
758                 }
759                 spin_unlock_irqrestore(&qp->s_lock, flags);
760
761                 /* Notify ipath_destroy_qp() if it is waiting. */
762                 if (atomic_dec_and_test(&qp->refcount))
763                         wake_up(&qp->wait);
764         }
765         while (rnr != NULL) {
766                 qp = rnr;
767                 rnr = qp->timer_next;
768
769                 spin_lock_irqsave(&qp->s_lock, flags);
770                 if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
771                         ipath_schedule_send(qp);
772                 spin_unlock_irqrestore(&qp->s_lock, flags);
773
774                 /* Notify ipath_destroy_qp() if it is waiting. */
775                 if (atomic_dec_and_test(&qp->refcount))
776                         wake_up(&qp->wait);
777         }
778 }
779
780 static void update_sge(struct ipath_sge_state *ss, u32 length)
781 {
782         struct ipath_sge *sge = &ss->sge;
783
784         sge->vaddr += length;
785         sge->length -= length;
786         sge->sge_length -= length;
787         if (sge->sge_length == 0) {
788                 if (--ss->num_sge)
789                         *sge = *ss->sg_list++;
790         } else if (sge->length == 0 && sge->mr != NULL) {
791                 if (++sge->n >= IPATH_SEGSZ) {
792                         if (++sge->m >= sge->mr->mapsz)
793                                 return;
794                         sge->n = 0;
795                 }
796                 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
797                 sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
798         }
799 }
800
801 #ifdef __LITTLE_ENDIAN
802 static inline u32 get_upper_bits(u32 data, u32 shift)
803 {
804         return data >> shift;
805 }
806
807 static inline u32 set_upper_bits(u32 data, u32 shift)
808 {
809         return data << shift;
810 }
811
812 static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
813 {
814         data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
815         data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
816         return data;
817 }
818 #else
819 static inline u32 get_upper_bits(u32 data, u32 shift)
820 {
821         return data << shift;
822 }
823
824 static inline u32 set_upper_bits(u32 data, u32 shift)
825 {
826         return data >> shift;
827 }
828
829 static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
830 {
831         data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
832         data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
833         return data;
834 }
835 #endif
836
837 static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
838                     u32 length, unsigned flush_wc)
839 {
840         u32 extra = 0;
841         u32 data = 0;
842         u32 last;
843
844         while (1) {
845                 u32 len = ss->sge.length;
846                 u32 off;
847
848                 if (len > length)
849                         len = length;
850                 if (len > ss->sge.sge_length)
851                         len = ss->sge.sge_length;
852                 BUG_ON(len == 0);
853                 /* If the source address is not aligned, try to align it. */
854                 off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
855                 if (off) {
856                         u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
857                                             ~(sizeof(u32) - 1));
858                         u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
859                         u32 y;
860
861                         y = sizeof(u32) - off;
862                         if (len > y)
863                                 len = y;
864                         if (len + extra >= sizeof(u32)) {
865                                 data |= set_upper_bits(v, extra *
866                                                        BITS_PER_BYTE);
867                                 len = sizeof(u32) - extra;
868                                 if (len == length) {
869                                         last = data;
870                                         break;
871                                 }
872                                 __raw_writel(data, piobuf);
873                                 piobuf++;
874                                 extra = 0;
875                                 data = 0;
876                         } else {
877                                 /* Clear unused upper bytes */
878                                 data |= clear_upper_bytes(v, len, extra);
879                                 if (len == length) {
880                                         last = data;
881                                         break;
882                                 }
883                                 extra += len;
884                         }
885                 } else if (extra) {
886                         /* Source address is aligned. */
887                         u32 *addr = (u32 *) ss->sge.vaddr;
888                         int shift = extra * BITS_PER_BYTE;
889                         int ushift = 32 - shift;
890                         u32 l = len;
891
892                         while (l >= sizeof(u32)) {
893                                 u32 v = *addr;
894
895                                 data |= set_upper_bits(v, shift);
896                                 __raw_writel(data, piobuf);
897                                 data = get_upper_bits(v, ushift);
898                                 piobuf++;
899                                 addr++;
900                                 l -= sizeof(u32);
901                         }
902                         /*
903                          * We still have 'extra' number of bytes leftover.
904                          */
905                         if (l) {
906                                 u32 v = *addr;
907
908                                 if (l + extra >= sizeof(u32)) {
909                                         data |= set_upper_bits(v, shift);
910                                         len -= l + extra - sizeof(u32);
911                                         if (len == length) {
912                                                 last = data;
913                                                 break;
914                                         }
915                                         __raw_writel(data, piobuf);
916                                         piobuf++;
917                                         extra = 0;
918                                         data = 0;
919                                 } else {
920                                         /* Clear unused upper bytes */
921                                         data |= clear_upper_bytes(v, l,
922                                                                   extra);
923                                         if (len == length) {
924                                                 last = data;
925                                                 break;
926                                         }
927                                         extra += l;
928                                 }
929                         } else if (len == length) {
930                                 last = data;
931                                 break;
932                         }
933                 } else if (len == length) {
934                         u32 w;
935
936                         /*
937                          * Need to round up for the last dword in the
938                          * packet.
939                          */
940                         w = (len + 3) >> 2;
941                         __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
942                         piobuf += w - 1;
943                         last = ((u32 *) ss->sge.vaddr)[w - 1];
944                         break;
945                 } else {
946                         u32 w = len >> 2;
947
948                         __iowrite32_copy(piobuf, ss->sge.vaddr, w);
949                         piobuf += w;
950
951                         extra = len & (sizeof(u32) - 1);
952                         if (extra) {
953                                 u32 v = ((u32 *) ss->sge.vaddr)[w];
954
955                                 /* Clear unused upper bytes */
956                                 data = clear_upper_bytes(v, extra, 0);
957                         }
958                 }
959                 update_sge(ss, len);
960                 length -= len;
961         }
962         /* Update address before sending packet. */
963         update_sge(ss, length);
964         if (flush_wc) {
965                 /* must flush early everything before trigger word */
966                 ipath_flush_wc();
967                 __raw_writel(last, piobuf);
968                 /* be sure trigger word is written */
969                 ipath_flush_wc();
970         } else
971                 __raw_writel(last, piobuf);
972 }
973
974 /*
975  * Convert IB rate to delay multiplier.
976  */
977 unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
978 {
979         switch (rate) {
980         case IB_RATE_2_5_GBPS: return 8;
981         case IB_RATE_5_GBPS:   return 4;
982         case IB_RATE_10_GBPS:  return 2;
983         case IB_RATE_20_GBPS:  return 1;
984         default:               return 0;
985         }
986 }
987
988 /*
989  * Convert delay multiplier to IB rate
990  */
991 static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
992 {
993         switch (mult) {
994         case 8:  return IB_RATE_2_5_GBPS;
995         case 4:  return IB_RATE_5_GBPS;
996         case 2:  return IB_RATE_10_GBPS;
997         case 1:  return IB_RATE_20_GBPS;
998         default: return IB_RATE_PORT_CURRENT;
999         }
1000 }
1001
1002 static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
1003 {
1004         struct ipath_verbs_txreq *tx = NULL;
1005         unsigned long flags;
1006
1007         spin_lock_irqsave(&dev->pending_lock, flags);
1008         if (!list_empty(&dev->txreq_free)) {
1009                 struct list_head *l = dev->txreq_free.next;
1010
1011                 list_del(l);
1012                 tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
1013         }
1014         spin_unlock_irqrestore(&dev->pending_lock, flags);
1015         return tx;
1016 }
1017
1018 static inline void put_txreq(struct ipath_ibdev *dev,
1019                              struct ipath_verbs_txreq *tx)
1020 {
1021         unsigned long flags;
1022
1023         spin_lock_irqsave(&dev->pending_lock, flags);
1024         list_add(&tx->txreq.list, &dev->txreq_free);
1025         spin_unlock_irqrestore(&dev->pending_lock, flags);
1026 }
1027
1028 static void sdma_complete(void *cookie, int status)
1029 {
1030         struct ipath_verbs_txreq *tx = cookie;
1031         struct ipath_qp *qp = tx->qp;
1032         struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
1033         unsigned long flags;
1034         enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
1035                 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
1036
1037         if (atomic_dec_and_test(&qp->s_dma_busy)) {
1038                 spin_lock_irqsave(&qp->s_lock, flags);
1039                 if (tx->wqe)
1040                         ipath_send_complete(qp, tx->wqe, ibs);
1041                 if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
1042                      qp->s_last != qp->s_head) ||
1043                     (qp->s_flags & IPATH_S_WAIT_DMA))
1044                         ipath_schedule_send(qp);
1045                 spin_unlock_irqrestore(&qp->s_lock, flags);
1046                 wake_up(&qp->wait_dma);
1047         } else if (tx->wqe) {
1048                 spin_lock_irqsave(&qp->s_lock, flags);
1049                 ipath_send_complete(qp, tx->wqe, ibs);
1050                 spin_unlock_irqrestore(&qp->s_lock, flags);
1051         }
1052
1053         if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
1054                 kfree(tx->txreq.map_addr);
1055         put_txreq(dev, tx);
1056
1057         if (atomic_dec_and_test(&qp->refcount))
1058                 wake_up(&qp->wait);
1059 }
1060
1061 static void decrement_dma_busy(struct ipath_qp *qp)
1062 {
1063         unsigned long flags;
1064
1065         if (atomic_dec_and_test(&qp->s_dma_busy)) {
1066                 spin_lock_irqsave(&qp->s_lock, flags);
1067                 if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
1068                      qp->s_last != qp->s_head) ||
1069                     (qp->s_flags & IPATH_S_WAIT_DMA))
1070                         ipath_schedule_send(qp);
1071                 spin_unlock_irqrestore(&qp->s_lock, flags);
1072                 wake_up(&qp->wait_dma);
1073         }
1074 }
1075
1076 /*
1077  * Compute the number of clock cycles of delay before sending the next packet.
1078  * The multipliers reflect the number of clocks for the fastest rate so
1079  * one tick at 4xDDR is 8 ticks at 1xSDR.
1080  * If the destination port will take longer to receive a packet than
1081  * the outgoing link can send it, we need to delay sending the next packet
1082  * by the difference in time it takes the receiver to receive and the sender
1083  * to send this packet.
1084  * Note that this delay is always correct for UC and RC but not always
1085  * optimal for UD. For UD, the destination HCA can be different for each
1086  * packet, in which case, we could send packets to a different destination
1087  * while "waiting" for the delay. The overhead for doing this without
1088  * HW support is more than just paying the cost of delaying some packets
1089  * unnecessarily.
1090  */
1091 static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
1092 {
1093         return (rcv_mult > snd_mult) ?
1094                 (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
1095 }
1096
1097 static int ipath_verbs_send_dma(struct ipath_qp *qp,
1098                                 struct ipath_ib_header *hdr, u32 hdrwords,
1099                                 struct ipath_sge_state *ss, u32 len,
1100                                 u32 plen, u32 dwords)
1101 {
1102         struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
1103         struct ipath_devdata *dd = dev->dd;
1104         struct ipath_verbs_txreq *tx;
1105         u32 *piobuf;
1106         u32 control;
1107         u32 ndesc;
1108         int ret;
1109
1110         tx = qp->s_tx;
1111         if (tx) {
1112                 qp->s_tx = NULL;
1113                 /* resend previously constructed packet */
1114                 atomic_inc(&qp->s_dma_busy);
1115                 ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
1116                 if (ret) {
1117                         qp->s_tx = tx;
1118                         decrement_dma_busy(qp);
1119                 }
1120                 goto bail;
1121         }
1122
1123         tx = get_txreq(dev);
1124         if (!tx) {
1125                 ret = -EBUSY;
1126                 goto bail;
1127         }
1128
1129         /*
1130          * Get the saved delay count we computed for the previous packet
1131          * and save the delay count for this packet to be used next time
1132          * we get here.
1133          */
1134         control = qp->s_pkt_delay;
1135         qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
1136
1137         tx->qp = qp;
1138         atomic_inc(&qp->refcount);
1139         tx->wqe = qp->s_wqe;
1140         tx->txreq.callback = sdma_complete;
1141         tx->txreq.callback_cookie = tx;
1142         tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
1143                 IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
1144         if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
1145                 tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
1146
1147         /* VL15 packets bypass credit check */
1148         if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
1149                 control |= 1ULL << 31;
1150                 tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
1151         }
1152
1153         if (len) {
1154                 /*
1155                  * Don't try to DMA if it takes more descriptors than
1156                  * the queue holds.
1157                  */
1158                 ndesc = ipath_count_sge(ss, len);
1159                 if (ndesc >= dd->ipath_sdma_descq_cnt)
1160                         ndesc = 0;
1161         } else
1162                 ndesc = 1;
1163         if (ndesc) {
1164                 tx->hdr.pbc[0] = cpu_to_le32(plen);
1165                 tx->hdr.pbc[1] = cpu_to_le32(control);
1166                 memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
1167                 tx->txreq.sg_count = ndesc;
1168                 tx->map_len = (hdrwords + 2) << 2;
1169                 tx->txreq.map_addr = &tx->hdr;
1170                 atomic_inc(&qp->s_dma_busy);
1171                 ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
1172                 if (ret) {
1173                         /* save ss and length in dwords */
1174                         tx->ss = ss;
1175                         tx->len = dwords;
1176                         qp->s_tx = tx;
1177                         decrement_dma_busy(qp);
1178                 }
1179                 goto bail;
1180         }
1181
1182         /* Allocate a buffer and copy the header and payload to it. */
1183         tx->map_len = (plen + 1) << 2;
1184         piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
1185         if (unlikely(piobuf == NULL)) {
1186                 ret = -EBUSY;
1187                 goto err_tx;
1188         }
1189         tx->txreq.map_addr = piobuf;
1190         tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
1191         tx->txreq.sg_count = 1;
1192
1193         *piobuf++ = (__force u32) cpu_to_le32(plen);
1194         *piobuf++ = (__force u32) cpu_to_le32(control);
1195         memcpy(piobuf, hdr, hdrwords << 2);
1196         ipath_copy_from_sge(piobuf + hdrwords, ss, len);
1197
1198         atomic_inc(&qp->s_dma_busy);
1199         ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
1200         /*
1201          * If we couldn't queue the DMA request, save the info
1202          * and try again later rather than destroying the
1203          * buffer and undoing the side effects of the copy.
1204          */
1205         if (ret) {
1206                 tx->ss = NULL;
1207                 tx->len = 0;
1208                 qp->s_tx = tx;
1209                 decrement_dma_busy(qp);
1210         }
1211         dev->n_unaligned++;
1212         goto bail;
1213
1214 err_tx:
1215         if (atomic_dec_and_test(&qp->refcount))
1216                 wake_up(&qp->wait);
1217         put_txreq(dev, tx);
1218 bail:
1219         return ret;
1220 }
1221
1222 static int ipath_verbs_send_pio(struct ipath_qp *qp,
1223                                 struct ipath_ib_header *ibhdr, u32 hdrwords,
1224                                 struct ipath_sge_state *ss, u32 len,
1225                                 u32 plen, u32 dwords)
1226 {
1227         struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
1228         u32 *hdr = (u32 *) ibhdr;
1229         u32 __iomem *piobuf;
1230         unsigned flush_wc;
1231         u32 control;
1232         int ret;
1233         unsigned long flags;
1234
1235         piobuf = ipath_getpiobuf(dd, plen, NULL);
1236         if (unlikely(piobuf == NULL)) {
1237                 ret = -EBUSY;
1238                 goto bail;
1239         }
1240
1241         /*
1242          * Get the saved delay count we computed for the previous packet
1243          * and save the delay count for this packet to be used next time
1244          * we get here.
1245          */
1246         control = qp->s_pkt_delay;
1247         qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
1248
1249         /* VL15 packets bypass credit check */
1250         if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
1251                 control |= 1ULL << 31;
1252
1253         /*
1254          * Write the length to the control qword plus any needed flags.
1255          * We have to flush after the PBC for correctness on some cpus
1256          * or WC buffer can be written out of order.
1257          */
1258         writeq(((u64) control << 32) | plen, piobuf);
1259         piobuf += 2;
1260
1261         flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
1262         if (len == 0) {
1263                 /*
1264                  * If there is just the header portion, must flush before
1265                  * writing last word of header for correctness, and after
1266                  * the last header word (trigger word).
1267                  */
1268                 if (flush_wc) {
1269                         ipath_flush_wc();
1270                         __iowrite32_copy(piobuf, hdr, hdrwords - 1);
1271                         ipath_flush_wc();
1272                         __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
1273                         ipath_flush_wc();
1274                 } else
1275                         __iowrite32_copy(piobuf, hdr, hdrwords);
1276                 goto done;
1277         }
1278
1279         if (flush_wc)
1280                 ipath_flush_wc();
1281         __iowrite32_copy(piobuf, hdr, hdrwords);
1282         piobuf += hdrwords;
1283
1284         /* The common case is aligned and contained in one segment. */
1285         if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
1286                    !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
1287                 u32 *addr = (u32 *) ss->sge.vaddr;
1288
1289                 /* Update address before sending packet. */
1290                 update_sge(ss, len);
1291                 if (flush_wc) {
1292                         __iowrite32_copy(piobuf, addr, dwords - 1);
1293                         /* must flush early everything before trigger word */
1294                         ipath_flush_wc();
1295                         __raw_writel(addr[dwords - 1], piobuf + dwords - 1);
1296                         /* be sure trigger word is written */
1297                         ipath_flush_wc();
1298                 } else
1299                         __iowrite32_copy(piobuf, addr, dwords);
1300                 goto done;
1301         }
1302         copy_io(piobuf, ss, len, flush_wc);
1303 done:
1304         if (qp->s_wqe) {
1305                 spin_lock_irqsave(&qp->s_lock, flags);
1306                 ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
1307                 spin_unlock_irqrestore(&qp->s_lock, flags);
1308         }
1309         ret = 0;
1310 bail:
1311         return ret;
1312 }
1313
1314 /**
1315  * ipath_verbs_send - send a packet
1316  * @qp: the QP to send on
1317  * @hdr: the packet header
1318  * @hdrwords: the number of 32-bit words in the header
1319  * @ss: the SGE to send
1320  * @len: the length of the packet in bytes
1321  */
1322 int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
1323                      u32 hdrwords, struct ipath_sge_state *ss, u32 len)
1324 {
1325         struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
1326         u32 plen;
1327         int ret;
1328         u32 dwords = (len + 3) >> 2;
1329
1330         /*
1331          * Calculate the send buffer trigger address.
1332          * The +1 counts for the pbc control dword following the pbc length.
1333          */
1334         plen = hdrwords + dwords + 1;
1335
1336         /*
1337          * VL15 packets (IB_QPT_SMI) will always use PIO, so we
1338          * can defer SDMA restart until link goes ACTIVE without
1339          * worrying about just how we got there.
1340          */
1341         if (qp->ibqp.qp_type == IB_QPT_SMI ||
1342             !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
1343                 ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
1344                                            plen, dwords);
1345         else
1346                 ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
1347                                            plen, dwords);
1348
1349         return ret;
1350 }
1351
1352 int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
1353                             u64 *rwords, u64 *spkts, u64 *rpkts,
1354                             u64 *xmit_wait)
1355 {
1356         int ret;
1357
1358         if (!(dd->ipath_flags & IPATH_INITTED)) {
1359                 /* no hardware, freeze, etc. */
1360                 ret = -EINVAL;
1361                 goto bail;
1362         }
1363         *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
1364         *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
1365         *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
1366         *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
1367         *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
1368
1369         ret = 0;
1370
1371 bail:
1372         return ret;
1373 }
1374
1375 /**
1376  * ipath_get_counters - get various chip counters
1377  * @dd: the infinipath device
1378  * @cntrs: counters are placed here
1379  *
1380  * Return the counters needed by recv_pma_get_portcounters().
1381  */
1382 int ipath_get_counters(struct ipath_devdata *dd,
1383                        struct ipath_verbs_counters *cntrs)
1384 {
1385         struct ipath_cregs const *crp = dd->ipath_cregs;
1386         int ret;
1387
1388         if (!(dd->ipath_flags & IPATH_INITTED)) {
1389                 /* no hardware, freeze, etc. */
1390                 ret = -EINVAL;
1391                 goto bail;
1392         }
1393         cntrs->symbol_error_counter =
1394                 ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
1395         cntrs->link_error_recovery_counter =
1396                 ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
1397         /*
1398          * The link downed counter counts when the other side downs the
1399          * connection.  We add in the number of times we downed the link
1400          * due to local link integrity errors to compensate.
1401          */
1402         cntrs->link_downed_counter =
1403                 ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
1404         cntrs->port_rcv_errors =
1405                 ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
1406                 ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
1407                 ipath_snap_cntr(dd, crp->cr_portovflcnt) +
1408                 ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
1409                 ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
1410                 ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
1411                 ipath_snap_cntr(dd, crp->cr_erricrccnt) +
1412                 ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
1413                 ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
1414                 ipath_snap_cntr(dd, crp->cr_badformatcnt) +
1415                 dd->ipath_rxfc_unsupvl_errs;
1416         if (crp->cr_rxotherlocalphyerrcnt)
1417                 cntrs->port_rcv_errors +=
1418                         ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
1419         if (crp->cr_rxvlerrcnt)
1420                 cntrs->port_rcv_errors +=
1421                         ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
1422         cntrs->port_rcv_remphys_errors =
1423                 ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
1424         cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
1425         cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
1426         cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
1427         cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
1428         cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
1429         cntrs->local_link_integrity_errors =
1430                 crp->cr_locallinkintegrityerrcnt ?
1431                 ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
1432                 ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
1433                  dd->ipath_lli_errs : dd->ipath_lli_errors);
1434         cntrs->excessive_buffer_overrun_errors =
1435                 crp->cr_excessbufferovflcnt ?
1436                 ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
1437                 dd->ipath_overrun_thresh_errs;
1438         cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
1439                 ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
1440
1441         ret = 0;
1442
1443 bail:
1444         return ret;
1445 }
1446
1447 /**
1448  * ipath_ib_piobufavail - callback when a PIO buffer is available
1449  * @arg: the device pointer
1450  *
1451  * This is called from ipath_intr() at interrupt level when a PIO buffer is
1452  * available after ipath_verbs_send() returned an error that no buffers were
1453  * available.  Return 1 if we consumed all the PIO buffers and we still have
1454  * QPs waiting for buffers (for now, just restart the send tasklet and
1455  * return zero).
1456  */
1457 int ipath_ib_piobufavail(struct ipath_ibdev *dev)
1458 {
1459         struct list_head *list;
1460         struct ipath_qp *qplist;
1461         struct ipath_qp *qp;
1462         unsigned long flags;
1463
1464         if (dev == NULL)
1465                 goto bail;
1466
1467         list = &dev->piowait;
1468         qplist = NULL;
1469
1470         spin_lock_irqsave(&dev->pending_lock, flags);
1471         while (!list_empty(list)) {
1472                 qp = list_entry(list->next, struct ipath_qp, piowait);
1473                 list_del_init(&qp->piowait);
1474                 qp->pio_next = qplist;
1475                 qplist = qp;
1476                 atomic_inc(&qp->refcount);
1477         }
1478         spin_unlock_irqrestore(&dev->pending_lock, flags);
1479
1480         while (qplist != NULL) {
1481                 qp = qplist;
1482                 qplist = qp->pio_next;
1483
1484                 spin_lock_irqsave(&qp->s_lock, flags);
1485                 if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
1486                         ipath_schedule_send(qp);
1487                 spin_unlock_irqrestore(&qp->s_lock, flags);
1488
1489                 /* Notify ipath_destroy_qp() if it is waiting. */
1490                 if (atomic_dec_and_test(&qp->refcount))
1491                         wake_up(&qp->wait);
1492         }
1493
1494 bail:
1495         return 0;
1496 }
1497
1498 static int ipath_query_device(struct ib_device *ibdev,
1499                               struct ib_device_attr *props)
1500 {
1501         struct ipath_ibdev *dev = to_idev(ibdev);
1502
1503         memset(props, 0, sizeof(*props));
1504
1505         props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
1506                 IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
1507                 IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
1508                 IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
1509         props->page_size_cap = PAGE_SIZE;
1510         props->vendor_id =
1511                 IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
1512         props->vendor_part_id = dev->dd->ipath_deviceid;
1513         props->hw_ver = dev->dd->ipath_pcirev;
1514
1515         props->sys_image_guid = dev->sys_image_guid;
1516
1517         props->max_mr_size = ~0ull;
1518         props->max_qp = ib_ipath_max_qps;
1519         props->max_qp_wr = ib_ipath_max_qp_wrs;
1520         props->max_sge = ib_ipath_max_sges;
1521         props->max_cq = ib_ipath_max_cqs;
1522         props->max_ah = ib_ipath_max_ahs;
1523         props->max_cqe = ib_ipath_max_cqes;
1524         props->max_mr = dev->lk_table.max;
1525         props->max_fmr = dev->lk_table.max;
1526         props->max_map_per_fmr = 32767;
1527         props->max_pd = ib_ipath_max_pds;
1528         props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
1529         props->max_qp_init_rd_atom = 255;
1530         /* props->max_res_rd_atom */
1531         props->max_srq = ib_ipath_max_srqs;
1532         props->max_srq_wr = ib_ipath_max_srq_wrs;
1533         props->max_srq_sge = ib_ipath_max_srq_sges;
1534         /* props->local_ca_ack_delay */
1535         props->atomic_cap = IB_ATOMIC_GLOB;
1536         props->max_pkeys = ipath_get_npkeys(dev->dd);
1537         props->max_mcast_grp = ib_ipath_max_mcast_grps;
1538         props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
1539         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
1540                 props->max_mcast_grp;
1541
1542         return 0;
1543 }
1544
1545 const u8 ipath_cvt_physportstate[32] = {
1546         [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
1547         [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
1548         [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
1549         [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
1550         [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
1551         [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
1552         [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
1553                 IB_PHYSPORTSTATE_CFG_TRAIN,
1554         [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
1555                 IB_PHYSPORTSTATE_CFG_TRAIN,
1556         [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
1557                 IB_PHYSPORTSTATE_CFG_TRAIN,
1558         [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
1559         [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
1560                 IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
1561         [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
1562                 IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
1563         [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
1564                 IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
1565         [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
1566         [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
1567         [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
1568         [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
1569         [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
1570         [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
1571         [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
1572         [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
1573 };
1574
1575 u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
1576 {
1577         return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
1578 }
1579
1580 static int ipath_query_port(struct ib_device *ibdev,
1581                             u8 port, struct ib_port_attr *props)
1582 {
1583         struct ipath_ibdev *dev = to_idev(ibdev);
1584         struct ipath_devdata *dd = dev->dd;
1585         enum ib_mtu mtu;
1586         u16 lid = dd->ipath_lid;
1587         u64 ibcstat;
1588
1589         memset(props, 0, sizeof(*props));
1590         props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
1591         props->lmc = dd->ipath_lmc;
1592         props->sm_lid = dev->sm_lid;
1593         props->sm_sl = dev->sm_sl;
1594         ibcstat = dd->ipath_lastibcstat;
1595         /* map LinkState to IB portinfo values.  */
1596         props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
1597
1598         /* See phys_state_show() */
1599         props->phys_state = /* MEA: assumes shift == 0 */
1600                 ipath_cvt_physportstate[dd->ipath_lastibcstat &
1601                 dd->ibcs_lts_mask];
1602         props->port_cap_flags = dev->port_cap_flags;
1603         props->gid_tbl_len = 1;
1604         props->max_msg_sz = 0x80000000;
1605         props->pkey_tbl_len = ipath_get_npkeys(dd);
1606         props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
1607                 dev->z_pkey_violations;
1608         props->qkey_viol_cntr = dev->qkey_violations;
1609         props->active_width = dd->ipath_link_width_active;
1610         /* See rate_show() */
1611         props->active_speed = dd->ipath_link_speed_active;
1612         props->max_vl_num = 1;          /* VLCap = VL0 */
1613         props->init_type_reply = 0;
1614
1615         props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
1616         switch (dd->ipath_ibmtu) {
1617         case 4096:
1618                 mtu = IB_MTU_4096;
1619                 break;
1620         case 2048:
1621                 mtu = IB_MTU_2048;
1622                 break;
1623         case 1024:
1624                 mtu = IB_MTU_1024;
1625                 break;
1626         case 512:
1627                 mtu = IB_MTU_512;
1628                 break;
1629         case 256:
1630                 mtu = IB_MTU_256;
1631                 break;
1632         default:
1633                 mtu = IB_MTU_2048;
1634         }
1635         props->active_mtu = mtu;
1636         props->subnet_timeout = dev->subnet_timeout;
1637
1638         return 0;
1639 }
1640
1641 static int ipath_modify_device(struct ib_device *device,
1642                                int device_modify_mask,
1643                                struct ib_device_modify *device_modify)
1644 {
1645         int ret;
1646
1647         if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1648                                    IB_DEVICE_MODIFY_NODE_DESC)) {
1649                 ret = -EOPNOTSUPP;
1650                 goto bail;
1651         }
1652
1653         if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
1654                 memcpy(device->node_desc, device_modify->node_desc, 64);
1655
1656         if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
1657                 to_idev(device)->sys_image_guid =
1658                         cpu_to_be64(device_modify->sys_image_guid);
1659
1660         ret = 0;
1661
1662 bail:
1663         return ret;
1664 }
1665
1666 static int ipath_modify_port(struct ib_device *ibdev,
1667                              u8 port, int port_modify_mask,
1668                              struct ib_port_modify *props)
1669 {
1670         struct ipath_ibdev *dev = to_idev(ibdev);
1671
1672         dev->port_cap_flags |= props->set_port_cap_mask;
1673         dev->port_cap_flags &= ~props->clr_port_cap_mask;
1674         if (port_modify_mask & IB_PORT_SHUTDOWN)
1675                 ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
1676         if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
1677                 dev->qkey_violations = 0;
1678         return 0;
1679 }
1680
1681 static int ipath_query_gid(struct ib_device *ibdev, u8 port,
1682                            int index, union ib_gid *gid)
1683 {
1684         struct ipath_ibdev *dev = to_idev(ibdev);
1685         int ret;
1686
1687         if (index >= 1) {
1688                 ret = -EINVAL;
1689                 goto bail;
1690         }
1691         gid->global.subnet_prefix = dev->gid_prefix;
1692         gid->global.interface_id = dev->dd->ipath_guid;
1693
1694         ret = 0;
1695
1696 bail:
1697         return ret;
1698 }
1699
1700 static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
1701                                     struct ib_ucontext *context,
1702                                     struct ib_udata *udata)
1703 {
1704         struct ipath_ibdev *dev = to_idev(ibdev);
1705         struct ipath_pd *pd;
1706         struct ib_pd *ret;
1707
1708         /*
1709          * This is actually totally arbitrary.  Some correctness tests
1710          * assume there's a maximum number of PDs that can be allocated.
1711          * We don't actually have this limit, but we fail the test if
1712          * we allow allocations of more than we report for this value.
1713          */
1714
1715         pd = kmalloc(sizeof *pd, GFP_KERNEL);
1716         if (!pd) {
1717                 ret = ERR_PTR(-ENOMEM);
1718                 goto bail;
1719         }
1720
1721         spin_lock(&dev->n_pds_lock);
1722         if (dev->n_pds_allocated == ib_ipath_max_pds) {
1723                 spin_unlock(&dev->n_pds_lock);
1724                 kfree(pd);
1725                 ret = ERR_PTR(-ENOMEM);
1726                 goto bail;
1727         }
1728
1729         dev->n_pds_allocated++;
1730         spin_unlock(&dev->n_pds_lock);
1731
1732         /* ib_alloc_pd() will initialize pd->ibpd. */
1733         pd->user = udata != NULL;
1734
1735         ret = &pd->ibpd;
1736
1737 bail:
1738         return ret;
1739 }
1740
1741 static int ipath_dealloc_pd(struct ib_pd *ibpd)
1742 {
1743         struct ipath_pd *pd = to_ipd(ibpd);
1744         struct ipath_ibdev *dev = to_idev(ibpd->device);
1745
1746         spin_lock(&dev->n_pds_lock);
1747         dev->n_pds_allocated--;
1748         spin_unlock(&dev->n_pds_lock);
1749
1750         kfree(pd);
1751
1752         return 0;
1753 }
1754
1755 /**
1756  * ipath_create_ah - create an address handle
1757  * @pd: the protection domain
1758  * @ah_attr: the attributes of the AH
1759  *
1760  * This may be called from interrupt context.
1761  */
1762 static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
1763                                      struct ib_ah_attr *ah_attr)
1764 {
1765         struct ipath_ah *ah;
1766         struct ib_ah *ret;
1767         struct ipath_ibdev *dev = to_idev(pd->device);
1768         unsigned long flags;
1769
1770         /* A multicast address requires a GRH (see ch. 8.4.1). */
1771         if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
1772             ah_attr->dlid != IPATH_PERMISSIVE_LID &&
1773             !(ah_attr->ah_flags & IB_AH_GRH)) {
1774                 ret = ERR_PTR(-EINVAL);
1775                 goto bail;
1776         }
1777
1778         if (ah_attr->dlid == 0) {
1779                 ret = ERR_PTR(-EINVAL);
1780                 goto bail;
1781         }
1782
1783         if (ah_attr->port_num < 1 ||
1784             ah_attr->port_num > pd->device->phys_port_cnt) {
1785                 ret = ERR_PTR(-EINVAL);
1786                 goto bail;
1787         }
1788
1789         ah = kmalloc(sizeof *ah, GFP_ATOMIC);
1790         if (!ah) {
1791                 ret = ERR_PTR(-ENOMEM);
1792                 goto bail;
1793         }
1794
1795         spin_lock_irqsave(&dev->n_ahs_lock, flags);
1796         if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
1797                 spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1798                 kfree(ah);
1799                 ret = ERR_PTR(-ENOMEM);
1800                 goto bail;
1801         }
1802
1803         dev->n_ahs_allocated++;
1804         spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1805
1806         /* ib_create_ah() will initialize ah->ibah. */
1807         ah->attr = *ah_attr;
1808         ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
1809
1810         ret = &ah->ibah;
1811
1812 bail:
1813         return ret;
1814 }
1815
1816 /**
1817  * ipath_destroy_ah - destroy an address handle
1818  * @ibah: the AH to destroy
1819  *
1820  * This may be called from interrupt context.
1821  */
1822 static int ipath_destroy_ah(struct ib_ah *ibah)
1823 {
1824         struct ipath_ibdev *dev = to_idev(ibah->device);
1825         struct ipath_ah *ah = to_iah(ibah);
1826         unsigned long flags;
1827
1828         spin_lock_irqsave(&dev->n_ahs_lock, flags);
1829         dev->n_ahs_allocated--;
1830         spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1831
1832         kfree(ah);
1833
1834         return 0;
1835 }
1836
1837 static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
1838 {
1839         struct ipath_ah *ah = to_iah(ibah);
1840
1841         *ah_attr = ah->attr;
1842         ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
1843
1844         return 0;
1845 }
1846
1847 /**
1848  * ipath_get_npkeys - return the size of the PKEY table for port 0
1849  * @dd: the infinipath device
1850  */
1851 unsigned ipath_get_npkeys(struct ipath_devdata *dd)
1852 {
1853         return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
1854 }
1855
1856 /**
1857  * ipath_get_pkey - return the indexed PKEY from the port PKEY table
1858  * @dd: the infinipath device
1859  * @index: the PKEY index
1860  */
1861 unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
1862 {
1863         unsigned ret;
1864
1865         /* always a kernel port, no locking needed */
1866         if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
1867                 ret = 0;
1868         else
1869                 ret = dd->ipath_pd[0]->port_pkeys[index];
1870
1871         return ret;
1872 }
1873
1874 static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1875                             u16 *pkey)
1876 {
1877         struct ipath_ibdev *dev = to_idev(ibdev);
1878         int ret;
1879
1880         if (index >= ipath_get_npkeys(dev->dd)) {
1881                 ret = -EINVAL;
1882                 goto bail;
1883         }
1884
1885         *pkey = ipath_get_pkey(dev->dd, index);
1886         ret = 0;
1887
1888 bail:
1889         return ret;
1890 }
1891
1892 /**
1893  * ipath_alloc_ucontext - allocate a ucontest
1894  * @ibdev: the infiniband device
1895  * @udata: not used by the InfiniPath driver
1896  */
1897
1898 static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
1899                                                 struct ib_udata *udata)
1900 {
1901         struct ipath_ucontext *context;
1902         struct ib_ucontext *ret;
1903
1904         context = kmalloc(sizeof *context, GFP_KERNEL);
1905         if (!context) {
1906                 ret = ERR_PTR(-ENOMEM);
1907                 goto bail;
1908         }
1909
1910         ret = &context->ibucontext;
1911
1912 bail:
1913         return ret;
1914 }
1915
/**
 * ipath_dealloc_ucontext - free a ucontext
 * @context: the user context to free
 *
 * Always returns 0.
 */
static int ipath_dealloc_ucontext(struct ib_ucontext *context)
{
        struct ipath_ucontext *ucontext = to_iucontext(context);

        kfree(ucontext);

        return 0;
}
1921
1922 static int ipath_verbs_register_sysfs(struct ib_device *dev);
1923
1924 static void __verbs_timer(unsigned long arg)
1925 {
1926         struct ipath_devdata *dd = (struct ipath_devdata *) arg;
1927
1928         /* Handle verbs layer timeouts. */
1929         ipath_ib_timer(dd->verbs_dev);
1930
1931         mod_timer(&dd->verbs_timer, jiffies + 1);
1932 }
1933
1934 static int enable_timer(struct ipath_devdata *dd)
1935 {
1936         /*
1937          * Early chips had a design flaw where the chip and kernel idea
1938          * of the tail register don't always agree, and therefore we won't
1939          * get an interrupt on the next packet received.
1940          * If the board supports per packet receive interrupts, use it.
1941          * Otherwise, the timer function periodically checks for packets
1942          * to cover this case.
1943          * Either way, the timer is needed for verbs layer related
1944          * processing.
1945          */
1946         if (dd->ipath_flags & IPATH_GPIO_INTR) {
1947                 ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
1948                                  0x2074076542310ULL);
1949                 /* Enable GPIO bit 2 interrupt */
1950                 dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
1951                 ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
1952                                  dd->ipath_gpio_mask);
1953         }
1954
1955         init_timer(&dd->verbs_timer);
1956         dd->verbs_timer.function = __verbs_timer;
1957         dd->verbs_timer.data = (unsigned long)dd;
1958         dd->verbs_timer.expires = jiffies + 1;
1959         add_timer(&dd->verbs_timer);
1960
1961         return 0;
1962 }
1963
1964 static int disable_timer(struct ipath_devdata *dd)
1965 {
1966         /* Disable GPIO bit 2 interrupt */
1967         if (dd->ipath_flags & IPATH_GPIO_INTR) {
1968                 /* Disable GPIO bit 2 interrupt */
1969                 dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
1970                 ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
1971                                  dd->ipath_gpio_mask);
1972                 /*
1973                  * We might want to undo changes to debugportselect,
1974                  * but how?
1975                  */
1976         }
1977
1978         del_timer_sync(&dd->verbs_timer);
1979
1980         return 0;
1981 }
1982
1983 /**
1984  * ipath_register_ib_device - register our device with the infiniband core
1985  * @dd: the device data structure
1986  * Return the allocated ipath_ibdev pointer or NULL on error.
1987  */
1988 int ipath_register_ib_device(struct ipath_devdata *dd)
1989 {
1990         struct ipath_verbs_counters cntrs;
1991         struct ipath_ibdev *idev;
1992         struct ib_device *dev;
1993         struct ipath_verbs_txreq *tx;
1994         unsigned i;
1995         int ret;
1996
1997         idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
1998         if (idev == NULL) {
1999                 ret = -ENOMEM;
2000                 goto bail;
2001         }
2002
2003         dev = &idev->ibdev;
2004
2005         if (dd->ipath_sdma_descq_cnt) {
2006                 tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
2007                              GFP_KERNEL);
2008                 if (tx == NULL) {
2009                         ret = -ENOMEM;
2010                         goto err_tx;
2011                 }
2012         } else
2013                 tx = NULL;
2014         idev->txreq_bufs = tx;
2015
2016         /* Only need to initialize non-zero fields. */
2017         spin_lock_init(&idev->n_pds_lock);
2018         spin_lock_init(&idev->n_ahs_lock);
2019         spin_lock_init(&idev->n_cqs_lock);
2020         spin_lock_init(&idev->n_qps_lock);
2021         spin_lock_init(&idev->n_srqs_lock);
2022         spin_lock_init(&idev->n_mcast_grps_lock);
2023
2024         spin_lock_init(&idev->qp_table.lock);
2025         spin_lock_init(&idev->lk_table.lock);
2026         idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE);
2027         /* Set the prefix to the default value (see ch. 4.1.1) */
2028         idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL);
2029
2030         ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
2031         if (ret)
2032                 goto err_qp;
2033
2034         /*
2035          * The top ib_ipath_lkey_table_size bits are used to index the
2036          * table.  The lower 8 bits can be owned by the user (copied from
2037          * the LKEY).  The remaining bits act as a generation number or tag.
2038          */
2039         idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
2040         idev->lk_table.table = kzalloc(idev->lk_table.max *
2041                                        sizeof(*idev->lk_table.table),
2042                                        GFP_KERNEL);
2043         if (idev->lk_table.table == NULL) {
2044                 ret = -ENOMEM;
2045                 goto err_lk;
2046         }
2047         INIT_LIST_HEAD(&idev->pending_mmaps);
2048         spin_lock_init(&idev->pending_lock);
2049         idev->mmap_offset = PAGE_SIZE;
2050         spin_lock_init(&idev->mmap_offset_lock);
2051         INIT_LIST_HEAD(&idev->pending[0]);
2052         INIT_LIST_HEAD(&idev->pending[1]);
2053         INIT_LIST_HEAD(&idev->pending[2]);
2054         INIT_LIST_HEAD(&idev->piowait);
2055         INIT_LIST_HEAD(&idev->rnrwait);
2056         INIT_LIST_HEAD(&idev->txreq_free);
2057         idev->pending_index = 0;
2058         idev->port_cap_flags =
2059                 IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
2060         if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
2061                 idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
2062         idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
2063         idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
2064         idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
2065         idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
2066         idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
2067
2068         /* Snapshot current HW counters to "clear" them. */
2069         ipath_get_counters(dd, &cntrs);
2070         idev->z_symbol_error_counter = cntrs.symbol_error_counter;
2071         idev->z_link_error_recovery_counter =
2072                 cntrs.link_error_recovery_counter;
2073         idev->z_link_downed_counter = cntrs.link_downed_counter;
2074         idev->z_port_rcv_errors = cntrs.port_rcv_errors;
2075         idev->z_port_rcv_remphys_errors =
2076                 cntrs.port_rcv_remphys_errors;
2077         idev->z_port_xmit_discards = cntrs.port_xmit_discards;
2078         idev->z_port_xmit_data = cntrs.port_xmit_data;
2079         idev->z_port_rcv_data = cntrs.port_rcv_data;
2080         idev->z_port_xmit_packets = cntrs.port_xmit_packets;
2081         idev->z_port_rcv_packets = cntrs.port_rcv_packets;
2082         idev->z_local_link_integrity_errors =
2083                 cntrs.local_link_integrity_errors;
2084         idev->z_excessive_buffer_overrun_errors =
2085                 cntrs.excessive_buffer_overrun_errors;
2086         idev->z_vl15_dropped = cntrs.vl15_dropped;
2087
2088         for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
2089                 list_add(&tx->txreq.list, &idev->txreq_free);
2090
2091         /*
2092          * The system image GUID is supposed to be the same for all
2093          * IB HCAs in a single system but since there can be other
2094          * device types in the system, we can't be sure this is unique.
2095          */
2096         if (!sys_image_guid)
2097                 sys_image_guid = dd->ipath_guid;
2098         idev->sys_image_guid = sys_image_guid;
2099         idev->ib_unit = dd->ipath_unit;
2100         idev->dd = dd;
2101
2102         strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
2103         dev->owner = THIS_MODULE;
2104         dev->node_guid = dd->ipath_guid;
2105         dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
2106         dev->uverbs_cmd_mask =
2107                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
2108                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
2109                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
2110                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
2111                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
2112                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
2113                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
2114                 (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
2115                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
2116                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
2117                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
2118                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
2119                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
2120                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
2121                 (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
2122                 (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
2123                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
2124                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
2125                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
2126                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
2127                 (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
2128                 (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
2129                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
2130                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
2131                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
2132                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
2133                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
2134                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
2135                 (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
2136         dev->node_type = RDMA_NODE_IB_CA;
2137         dev->phys_port_cnt = 1;
2138         dev->num_comp_vectors = 1;
2139         dev->dma_device = &dd->pcidev->dev;
2140         dev->query_device = ipath_query_device;
2141         dev->modify_device = ipath_modify_device;
2142         dev->query_port = ipath_query_port;
2143         dev->modify_port = ipath_modify_port;
2144         dev->query_pkey = ipath_query_pkey;
2145         dev->query_gid = ipath_query_gid;
2146         dev->alloc_ucontext = ipath_alloc_ucontext;
2147         dev->dealloc_ucontext = ipath_dealloc_ucontext;
2148         dev->alloc_pd = ipath_alloc_pd;
2149         dev->dealloc_pd = ipath_dealloc_pd;
2150         dev->create_ah = ipath_create_ah;
2151         dev->destroy_ah = ipath_destroy_ah;
2152         dev->query_ah = ipath_query_ah;
2153         dev->create_srq = ipath_create_srq;
2154         dev->modify_srq = ipath_modify_srq;
2155         dev->query_srq = ipath_query_srq;
2156         dev->destroy_srq = ipath_destroy_srq;
2157         dev->create_qp = ipath_create_qp;
2158         dev->modify_qp = ipath_modify_qp;
2159         dev->query_qp = ipath_query_qp;
2160         dev->destroy_qp = ipath_destroy_qp;
2161         dev->post_send = ipath_post_send;
2162         dev->post_recv = ipath_post_receive;
2163         dev->post_srq_recv = ipath_post_srq_receive;
2164         dev->create_cq = ipath_create_cq;
2165         dev->destroy_cq = ipath_destroy_cq;
2166         dev->resize_cq = ipath_resize_cq;
2167         dev->poll_cq = ipath_poll_cq;
2168         dev->req_notify_cq = ipath_req_notify_cq;
2169         dev->get_dma_mr = ipath_get_dma_mr;
2170         dev->reg_phys_mr = ipath_reg_phys_mr;
2171         dev->reg_user_mr = ipath_reg_user_mr;
2172         dev->dereg_mr = ipath_dereg_mr;
2173         dev->alloc_fmr = ipath_alloc_fmr;
2174         dev->map_phys_fmr = ipath_map_phys_fmr;
2175         dev->unmap_fmr = ipath_unmap_fmr;
2176         dev->dealloc_fmr = ipath_dealloc_fmr;
2177         dev->attach_mcast = ipath_multicast_attach;
2178         dev->detach_mcast = ipath_multicast_detach;
2179         dev->process_mad = ipath_process_mad;
2180         dev->mmap = ipath_mmap;
2181         dev->dma_ops = &ipath_dma_mapping_ops;
2182
2183         snprintf(dev->node_desc, sizeof(dev->node_desc),
2184                  IPATH_IDSTR " %s", init_utsname()->nodename);
2185
2186         ret = ib_register_device(dev, NULL);
2187         if (ret)
2188                 goto err_reg;
2189
2190         if (ipath_verbs_register_sysfs(dev))
2191                 goto err_class;
2192
2193         enable_timer(dd);
2194
2195         goto bail;
2196
2197 err_class:
2198         ib_unregister_device(dev);
2199 err_reg:
2200         kfree(idev->lk_table.table);
2201 err_lk:
2202         kfree(idev->qp_table.table);
2203 err_qp:
2204         kfree(idev->txreq_bufs);
2205 err_tx:
2206         ib_dealloc_device(dev);
2207         ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
2208         idev = NULL;
2209
2210 bail:
2211         dd->verbs_dev = idev;
2212         return ret;
2213 }
2214
2215 void ipath_unregister_ib_device(struct ipath_ibdev *dev)
2216 {
2217         struct ib_device *ibdev = &dev->ibdev;
2218         u32 qps_inuse;
2219
2220         ib_unregister_device(ibdev);
2221
2222         disable_timer(dev->dd);
2223
2224         if (!list_empty(&dev->pending[0]) ||
2225             !list_empty(&dev->pending[1]) ||
2226             !list_empty(&dev->pending[2]))
2227                 ipath_dev_err(dev->dd, "pending list not empty!\n");
2228         if (!list_empty(&dev->piowait))
2229                 ipath_dev_err(dev->dd, "piowait list not empty!\n");
2230         if (!list_empty(&dev->rnrwait))
2231                 ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
2232         if (!ipath_mcast_tree_empty())
2233                 ipath_dev_err(dev->dd, "multicast table memory leak!\n");
2234         /*
2235          * Note that ipath_unregister_ib_device() can be called before all
2236          * the QPs are destroyed!
2237          */
2238         qps_inuse = ipath_free_all_qps(&dev->qp_table);
2239         if (qps_inuse)
2240                 ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
2241                         qps_inuse);
2242         kfree(dev->qp_table.table);
2243         kfree(dev->lk_table.table);
2244         kfree(dev->txreq_bufs);
2245         ib_dealloc_device(ibdev);
2246 }
2247
2248 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
2249                         char *buf)
2250 {
2251         struct ipath_ibdev *dev =
2252                 container_of(device, struct ipath_ibdev, ibdev.dev);
2253
2254         return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
2255 }
2256
2257 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
2258                         char *buf)
2259 {
2260         struct ipath_ibdev *dev =
2261                 container_of(device, struct ipath_ibdev, ibdev.dev);
2262         int ret;
2263
2264         ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
2265         if (ret < 0)
2266                 goto bail;
2267         strcat(buf, "\n");
2268         ret = strlen(buf);
2269
2270 bail:
2271         return ret;
2272 }
2273
2274 static ssize_t show_stats(struct device *device, struct device_attribute *attr,
2275                           char *buf)
2276 {
2277         struct ipath_ibdev *dev =
2278                 container_of(device, struct ipath_ibdev, ibdev.dev);
2279         int i;
2280         int len;
2281
2282         len = sprintf(buf,
2283                       "RC resends  %d\n"
2284                       "RC no QACK  %d\n"
2285                       "RC ACKs     %d\n"
2286                       "RC SEQ NAKs %d\n"
2287                       "RC RDMA seq %d\n"
2288                       "RC RNR NAKs %d\n"
2289                       "RC OTH NAKs %d\n"
2290                       "RC timeouts %d\n"
2291                       "RC RDMA dup %d\n"
2292                       "piobuf wait %d\n"
2293                       "unaligned   %d\n"
2294                       "PKT drops   %d\n"
2295                       "WQE errs    %d\n",
2296                       dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
2297                       dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
2298                       dev->n_other_naks, dev->n_timeouts,
2299                       dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
2300                       dev->n_pkt_drops, dev->n_wqe_errs);
2301         for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
2302                 const struct ipath_opcode_stats *si = &dev->opstats[i];
2303
2304                 if (!si->n_packets && !si->n_bytes)
2305                         continue;
2306                 len += sprintf(buf + len, "%02x %llu/%llu\n", i,
2307                                (unsigned long long) si->n_packets,
2308                                (unsigned long long) si->n_bytes);
2309         }
2310         return len;
2311 }
2312
2313 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
2314 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
2315 static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
2316 static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
2317
2318 static struct device_attribute *ipath_class_attributes[] = {
2319         &dev_attr_hw_rev,
2320         &dev_attr_hca_type,
2321         &dev_attr_board_id,
2322         &dev_attr_stats
2323 };
2324
2325 static int ipath_verbs_register_sysfs(struct ib_device *dev)
2326 {
2327         int i;
2328         int ret;
2329
2330         for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
2331                 if (device_create_file(&dev->dev,
2332                                        ipath_class_attributes[i])) {
2333                         ret = 1;
2334                         goto bail;
2335                 }
2336
2337         ret = 0;
2338
2339 bail:
2340         return ret;
2341 }