Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[pandora-kernel.git] / drivers / infiniband / hw / ipath / ipath_verbs.c
1 /*
2  * Copyright (c) 2006 QLogic, Inc. All rights reserved.
3  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <rdma/ib_mad.h>
35 #include <rdma/ib_user_verbs.h>
36 #include <linux/io.h>
37 #include <linux/utsname.h>
38
39 #include "ipath_kernel.h"
40 #include "ipath_verbs.h"
41 #include "ipath_common.h"
42
/*
 * Module parameters: sizing limits for the verbs resources this driver
 * advertises/supports.  All but qp_table_size are runtime-writable
 * (S_IWUSR) and readable via sysfs (S_IRUGO).
 */
static unsigned int ib_ipath_qp_table_size = 251;
module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
MODULE_PARM_DESC(qp_table_size, "QP table size");

unsigned int ib_ipath_lkey_table_size = 12;
module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
		   S_IRUGO);
MODULE_PARM_DESC(lkey_table_size,
		 "LKEY table size in bits (2^n, 1 <= n <= 23)");

static unsigned int ib_ipath_max_pds = 0xFFFF;
module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_pds,
		 "Maximum number of protection domains to support");

static unsigned int ib_ipath_max_ahs = 0xFFFF;
module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");

unsigned int ib_ipath_max_cqes = 0x2FFFF;
module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_cqes,
		 "Maximum number of completion queue entries to support");

unsigned int ib_ipath_max_cqs = 0x1FFFF;
module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");

unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
		   S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");

unsigned int ib_ipath_max_qps = 16384;
module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");

unsigned int ib_ipath_max_sges = 0x60;
module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");

unsigned int ib_ipath_max_mcast_grps = 16384;
module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
		   S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_mcast_grps,
		 "Maximum number of multicast groups to support");

unsigned int ib_ipath_max_mcast_qp_attached = 16;
module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
		   uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_mcast_qp_attached,
		 "Maximum number of attached QPs to support");

unsigned int ib_ipath_max_srqs = 1024;
module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");

unsigned int ib_ipath_max_srq_sges = 128;
module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
		   uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
104
105 unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
106 module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
107                    uint, S_IWUSR | S_IRUGO);
108 MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
109
110 static unsigned int ib_ipath_disable_sma;
111 module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
112 MODULE_PARM_DESC(ib_ipath_disable_sma, "Disable the SMA");
113
/*
 * Per-QP-state capability table, indexed by the IB QP state.  Each entry
 * is a bitmask of IPATH_POST_*_OK / IPATH_PROCESS_*_OK flags telling the
 * post-send/post-receive and receive paths what is legal in that state
 * (e.g. RESET and ERR allow nothing, RTS allows everything).
 */
const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = 0,
	[IB_QPS_INIT] = IPATH_POST_RECV_OK,
	[IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
	[IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
	[IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
	    IPATH_POST_SEND_OK,
	[IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
	[IB_QPS_ERR] = 0,
};
125
/*
 * Per-user-context state; currently just wraps the core ib_ucontext
 * (no driver-private fields are needed yet).
 */
struct ipath_ucontext {
	struct ib_ucontext ibucontext;
};
129
/* Convert a core ib_ucontext pointer to the embedding ipath_ucontext. */
static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
						  *ibucontext)
{
	return container_of(ibucontext, struct ipath_ucontext, ibucontext);
}
135
/*
 * Translate ib_wr_opcode into ib_wc_opcode, i.e. map the opcode of a
 * posted work request to the opcode reported in its completion entry.
 * Indexed by IB_WR_*; entries not listed are zero (IB_WC_SEND).
 */
const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
	[IB_WR_SEND] = IB_WC_SEND,
	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
};
148
/*
 * System image GUID, in network byte order.
 */
static __be64 sys_image_guid;
153
/**
 * ipath_copy_sge - copy data to SGE memory
 * @ss: the SGE state
 * @data: the data to copy
 * @length: the length of the data
 *
 * Copies @length bytes from @data into the memory described by the
 * scatter/gather list in @ss, advancing the SGE state as it goes.
 */
void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
{
	struct ipath_sge *sge = &ss->sge;

	while (length) {
		u32 len = sge->length;

		/* A zero-length SGE here would loop forever; the caller
		 * must guarantee the list covers @length bytes. */
		BUG_ON(len == 0);
		if (len > length)
			len = length;
		memcpy(sge->vaddr, data, len);
		sge->vaddr += len;
		sge->length -= len;
		sge->sge_length -= len;
		if (sge->sge_length == 0) {
			/* This SGE is fully consumed; move to the next. */
			if (--ss->num_sge)
				*sge = *ss->sg_list++;
		} else if (sge->length == 0 && sge->mr != NULL) {
			/* End of the current MR segment: step to the next
			 * mapped segment of the same memory region. */
			if (++sge->n >= IPATH_SEGSZ) {
				if (++sge->m >= sge->mr->mapsz)
					break;
				sge->n = 0;
			}
			sge->vaddr =
				sge->mr->map[sge->m]->segs[sge->n].vaddr;
			sge->length =
				sge->mr->map[sge->m]->segs[sge->n].length;
		}
		data += len;
		length -= len;
	}
}
192
/**
 * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
 * @ss: the SGE state
 * @length: the number of bytes to skip
 *
 * Advances the SGE state by @length bytes without copying any data;
 * otherwise identical to the advance logic in ipath_copy_sge().
 */
void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
{
	struct ipath_sge *sge = &ss->sge;

	while (length) {
		u32 len = sge->length;

		/* A zero-length SGE here would loop forever. */
		BUG_ON(len == 0);
		if (len > length)
			len = length;
		sge->vaddr += len;
		sge->length -= len;
		sge->sge_length -= len;
		if (sge->sge_length == 0) {
			/* This SGE is fully consumed; move to the next. */
			if (--ss->num_sge)
				*sge = *ss->sg_list++;
		} else if (sge->length == 0 && sge->mr != NULL) {
			/* Step to the next mapped segment of the MR. */
			if (++sge->n >= IPATH_SEGSZ) {
				if (++sge->m >= sge->mr->mapsz)
					break;
				sge->n = 0;
			}
			sge->vaddr =
				sge->mr->map[sge->m]->segs[sge->n].vaddr;
			sge->length =
				sge->mr->map[sge->m]->segs[sge->n].length;
		}
		length -= len;
	}
}
228
229 /**
230  * ipath_post_send - post a send on a QP
231  * @ibqp: the QP to post the send on
232  * @wr: the list of work requests to post
233  * @bad_wr: the first bad WR is put here
234  *
235  * This may be called from interrupt context.
236  */
237 static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
238                            struct ib_send_wr **bad_wr)
239 {
240         struct ipath_qp *qp = to_iqp(ibqp);
241         int err = 0;
242
243         /* Check that state is OK to post send. */
244         if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) {
245                 *bad_wr = wr;
246                 err = -EINVAL;
247                 goto bail;
248         }
249
250         for (; wr; wr = wr->next) {
251                 switch (qp->ibqp.qp_type) {
252                 case IB_QPT_UC:
253                 case IB_QPT_RC:
254                         err = ipath_post_ruc_send(qp, wr);
255                         break;
256
257                 case IB_QPT_SMI:
258                 case IB_QPT_GSI:
259                 case IB_QPT_UD:
260                         err = ipath_post_ud_send(qp, wr);
261                         break;
262
263                 default:
264                         err = -EINVAL;
265                 }
266                 if (err) {
267                         *bad_wr = wr;
268                         break;
269                 }
270         }
271
272 bail:
273         return err;
274 }
275
/**
 * ipath_post_receive - post a receive on a QP
 * @ibqp: the QP to post the receive on
 * @wr: the WR to post
 * @bad_wr: the first bad WR is put here
 *
 * Appends each WR to the QP's receive work queue ring.  Returns 0 on
 * success; on failure *@bad_wr points at the offending WR and -EINVAL
 * (bad state / no queue) or -ENOMEM (too many SGEs, or ring full) is
 * returned.
 *
 * This may be called from interrupt context.
 */
static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
			      struct ib_recv_wr **bad_wr)
{
	struct ipath_qp *qp = to_iqp(ibqp);
	struct ipath_rwq *wq = qp->r_rq.wq;
	unsigned long flags;
	int ret;

	/* Check that state is OK to post receive.  wq can be NULL
	 * (e.g. when the QP uses an SRQ), in which case posting
	 * directly to the QP is invalid. */
	if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
		*bad_wr = wr;
		ret = -EINVAL;
		goto bail;
	}

	for (; wr; wr = wr->next) {
		struct ipath_rwqe *wqe;
		u32 next;
		int i;

		if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
			*bad_wr = wr;
			ret = -ENOMEM;
			goto bail;
		}

		/* Ring insert under the RQ lock: compute the head's
		 * successor and treat head+1 == tail as "full" (one
		 * slot is always left empty to disambiguate). */
		spin_lock_irqsave(&qp->r_rq.lock, flags);
		next = wq->head + 1;
		if (next >= qp->r_rq.size)
			next = 0;
		if (next == wq->tail) {
			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
			*bad_wr = wr;
			ret = -ENOMEM;
			goto bail;
		}

		wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
		wqe->wr_id = wr->wr_id;
		wqe->num_sge = wr->num_sge;
		for (i = 0; i < wr->num_sge; i++)
			wqe->sg_list[i] = wr->sg_list[i];
		/* Publish the entry by advancing head only after the WQE
		 * is fully written. */
		wq->head = next;
		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
	}
	ret = 0;

bail:
	return ret;
}
334
335 /**
336  * ipath_qp_rcv - processing an incoming packet on a QP
337  * @dev: the device the packet came on
338  * @hdr: the packet header
339  * @has_grh: true if the packet has a GRH
340  * @data: the packet data
341  * @tlen: the packet length
342  * @qp: the QP the packet came on
343  *
344  * This is called from ipath_ib_rcv() to process an incoming packet
345  * for the given QP.
346  * Called at interrupt level.
347  */
348 static void ipath_qp_rcv(struct ipath_ibdev *dev,
349                          struct ipath_ib_header *hdr, int has_grh,
350                          void *data, u32 tlen, struct ipath_qp *qp)
351 {
352         /* Check for valid receive state. */
353         if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
354                 dev->n_pkt_drops++;
355                 return;
356         }
357
358         switch (qp->ibqp.qp_type) {
359         case IB_QPT_SMI:
360         case IB_QPT_GSI:
361                 if (ib_ipath_disable_sma)
362                         break;
363                 /* FALLTHROUGH */
364         case IB_QPT_UD:
365                 ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
366                 break;
367
368         case IB_QPT_RC:
369                 ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
370                 break;
371
372         case IB_QPT_UC:
373                 ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
374                 break;
375
376         default:
377                 break;
378         }
379 }
380
/**
 * ipath_ib_rcv - process an incoming packet
 * @dev: the device pointer
 * @rhdr: the header of the packet
 * @data: the packet data
 * @tlen: the packet length
 *
 * This is called from ipath_kreceive() to process an incoming packet at
 * interrupt level. Tlen is the length of the header + data + CRC in bytes.
 */
void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
		  u32 tlen)
{
	struct ipath_ib_header *hdr = rhdr;
	struct ipath_other_headers *ohdr;
	struct ipath_qp *qp;
	u32 qp_num;
	int lnh;
	u8 opcode;
	u16 lid;

	if (unlikely(dev == NULL))
		goto bail;

	if (unlikely(tlen < 24)) {	/* LRH+BTH+CRC */
		dev->rcv_errors++;
		goto bail;
	}

	/* Check for a valid destination LID (see ch. 7.11.1). */
	lid = be16_to_cpu(hdr->lrh[1]);
	if (lid < IPATH_MULTICAST_LID_BASE) {
		/* Mask off the LMC bits before comparing to our base LID. */
		lid &= ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
		if (unlikely(lid != dev->dd->ipath_lid)) {
			dev->rcv_errors++;
			goto bail;
		}
	}

	/* Check for GRH: the low two bits of LRH[0] give the next header. */
	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
	if (lnh == IPATH_LRH_BTH)
		ohdr = &hdr->u.oth;
	else if (lnh == IPATH_LRH_GRH)
		ohdr = &hdr->u.l.oth;
	else {
		dev->rcv_errors++;
		goto bail;
	}

	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	dev->opstats[opcode].n_bytes += tlen;
	dev->opstats[opcode].n_packets++;

	/* Get the destination QP number. */
	qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
	if (qp_num == IPATH_MULTICAST_QPN) {
		struct ipath_mcast *mcast;
		struct ipath_mcast_qp *p;

		/* NOTE(review): this dereferences the GRH DGID without
		 * first checking lnh == IPATH_LRH_GRH; a multicast QPN
		 * in a GRH-less packet would read stale header memory —
		 * confirm whether the hardware guarantees a GRH here. */
		mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
		if (mcast == NULL) {
			dev->n_pkt_drops++;
			goto bail;
		}
		dev->n_multicast_rcv++;
		list_for_each_entry_rcu(p, &mcast->qp_list, list)
			ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
				     tlen, p->qp);
		/*
		 * Notify ipath_multicast_detach() if it is waiting for us
		 * to finish.
		 */
		if (atomic_dec_return(&mcast->refcount) <= 1)
			wake_up(&mcast->wait);
	} else {
		/* Unicast: look up the QP (takes a reference). */
		qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
		if (qp) {
			dev->n_unicast_rcv++;
			ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
				     tlen, qp);
			/*
			 * Notify ipath_destroy_qp() if it is waiting
			 * for us to finish.
			 */
			if (atomic_dec_and_test(&qp->refcount))
				wake_up(&qp->wait);
		} else
			dev->n_pkt_drops++;
	}

bail:;
}
474
/**
 * ipath_ib_timer - verbs timer
 * @arg: the device pointer
 *
 * This is called from ipath_do_rcv_timer() at interrupt level to check for
 * QPs which need retransmits and to collect performance numbers.
 */
void ipath_ib_timer(struct ipath_ibdev *dev)
{
	struct ipath_qp *resend = NULL;
	struct list_head *last;
	struct ipath_qp *qp;
	unsigned long flags;

	if (dev == NULL)
		return;

	spin_lock_irqsave(&dev->pending_lock, flags);
	/* Start filling the next pending queue. */
	if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
		dev->pending_index = 0;
	/* Save any requests still in the new queue, they have timed out. */
	last = &dev->pending[dev->pending_index];
	while (!list_empty(last)) {
		/* Move each timed-out QP onto a private singly-linked
		 * resend list (via timer_next), holding a reference so
		 * it cannot be destroyed before we process it below. */
		qp = list_entry(last->next, struct ipath_qp, timerwait);
		list_del_init(&qp->timerwait);
		qp->timer_next = resend;
		resend = qp;
		atomic_inc(&qp->refcount);
	}
	/* Tick down the RNR timeout of the QP at the head of the RNR
	 * wait list; when it (and any immediately-following QPs whose
	 * timeout is also zero) expires, reschedule its send tasklet. */
	last = &dev->rnrwait;
	if (!list_empty(last)) {
		qp = list_entry(last->next, struct ipath_qp, timerwait);
		if (--qp->s_rnr_timeout == 0) {
			do {
				list_del_init(&qp->timerwait);
				tasklet_hi_schedule(&qp->s_task);
				if (list_empty(last))
					break;
				qp = list_entry(last->next, struct ipath_qp,
						timerwait);
			} while (qp->s_rnr_timeout == 0);
		}
	}
	/*
	 * We should only be in the started state if pma_sample_start != 0
	 */
	if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
	    --dev->pma_sample_start == 0) {
		/* Sample window opens: snapshot the counters so the
		 * deltas can be computed when the interval ends. */
		dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
		ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
					&dev->ipath_rword,
					&dev->ipath_spkts,
					&dev->ipath_rpkts,
					&dev->ipath_xmit_wait);
	}
	if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
		if (dev->pma_sample_interval == 0) {
			u64 ta, tb, tc, td, te;

			/* Sample window closes: store counter deltas. */
			dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
			ipath_snapshot_counters(dev->dd, &ta, &tb,
						&tc, &td, &te);

			dev->ipath_sword = ta - dev->ipath_sword;
			dev->ipath_rword = tb - dev->ipath_rword;
			dev->ipath_spkts = tc - dev->ipath_spkts;
			dev->ipath_rpkts = td - dev->ipath_rpkts;
			dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
		}
		else
			dev->pma_sample_interval--;
	}
	spin_unlock_irqrestore(&dev->pending_lock, flags);

	/* XXX What if timer fires again while this is running? */
	for (qp = resend; qp != NULL; qp = qp->timer_next) {
		struct ib_wc wc;

		spin_lock_irqsave(&qp->s_lock, flags);
		if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) {
			dev->n_timeouts++;
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
		}
		spin_unlock_irqrestore(&qp->s_lock, flags);

		/* Notify ipath_destroy_qp() if it is waiting. */
		if (atomic_dec_and_test(&qp->refcount))
			wake_up(&qp->wait);
	}
}
566
/*
 * Advance the SGE state by @length bytes (same advance logic as
 * ipath_copy_sge()/ipath_skip_sge(), without the BUG_ON or looping:
 * the caller guarantees @length fits within the current SGE).
 */
static void update_sge(struct ipath_sge_state *ss, u32 length)
{
	struct ipath_sge *sge = &ss->sge;

	sge->vaddr += length;
	sge->length -= length;
	sge->sge_length -= length;
	if (sge->sge_length == 0) {
		/* SGE consumed; move on to the next one, if any. */
		if (--ss->num_sge)
			*sge = *ss->sg_list++;
	} else if (sge->length == 0 && sge->mr != NULL) {
		/* Step to the next mapped segment of the MR. */
		if (++sge->n >= IPATH_SEGSZ) {
			if (++sge->m >= sge->mr->mapsz)
				return;
			sge->n = 0;
		}
		sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
		sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
	}
}
587
/*
 * Endian-dependent helpers for assembling 32-bit words to write to the
 * PIO buffer from byte-unaligned source data (see copy_io()).  "Upper"
 * here means the bytes beyond the current fill point of the word being
 * assembled, so the two variants mirror each other's shift directions.
 */
#ifdef __LITTLE_ENDIAN
/* Extract the bytes above the given bit offset. */
static inline u32 get_upper_bits(u32 data, u32 shift)
{
	return data >> shift;
}

/* Position bytes at the given bit offset within a word. */
static inline u32 set_upper_bits(u32 data, u32 shift)
{
	return data << shift;
}

/*
 * Keep only @n bytes of @data and position them @off bytes into the
 * word, clearing everything above.
 */
static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
{
	data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
	data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
	return data;
}
#else
/* Big-endian counterparts: shift directions are reversed. */
static inline u32 get_upper_bits(u32 data, u32 shift)
{
	return data << shift;
}

static inline u32 set_upper_bits(u32 data, u32 shift)
{
	return data >> shift;
}

static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
{
	data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
	data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
	return data;
}
#endif
623
/*
 * Copy @length bytes described by the SGE state @ss into the PIO buffer
 * @piobuf using 32-bit writes only (PIO buffers require word accesses).
 * Handles arbitrarily aligned source data by accumulating partial words
 * in @data/@extra.  The final word is held back in @last and written
 * after a write-combining flush, because it is the "trigger" word that
 * starts transmission.
 */
static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
		    u32 length)
{
	u32 extra = 0;		/* bytes of partial word accumulated so far */
	u32 data = 0;		/* the partial word being assembled */
	u32 last;		/* the final (trigger) word, written last */

	while (1) {
		u32 len = ss->sge.length;
		u32 off;

		BUG_ON(len == 0);
		if (len > length)
			len = length;
		if (len > ss->sge.sge_length)
			len = ss->sge.sge_length;
		/* If the source address is not aligned, try to align it. */
		off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
		if (off) {
			/* Read the containing aligned word and take only
			 * the bytes at/after the unaligned offset. */
			u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
					    ~(sizeof(u32) - 1));
			u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
			u32 y;

			y = sizeof(u32) - off;
			if (len > y)
				len = y;
			if (len + extra >= sizeof(u32)) {
				/* Enough to complete a full word: merge,
				 * then either finish or write it out. */
				data |= set_upper_bits(v, extra *
						       BITS_PER_BYTE);
				len = sizeof(u32) - extra;
				if (len == length) {
					last = data;
					break;
				}
				__raw_writel(data, piobuf);
				piobuf++;
				extra = 0;
				data = 0;
			} else {
				/* Clear unused upper bytes */
				data |= clear_upper_bytes(v, len, extra);
				if (len == length) {
					last = data;
					break;
				}
				extra += len;
			}
		} else if (extra) {
			/* Source address is aligned. */
			u32 *addr = (u32 *) ss->sge.vaddr;
			int shift = extra * BITS_PER_BYTE;
			int ushift = 32 - shift;
			u32 l = len;

			/* Stream full words, carrying @extra bytes of the
			 * previous read into each output word. */
			while (l >= sizeof(u32)) {
				u32 v = *addr;

				data |= set_upper_bits(v, shift);
				__raw_writel(data, piobuf);
				data = get_upper_bits(v, ushift);
				piobuf++;
				addr++;
				l -= sizeof(u32);
			}
			/*
			 * We still have 'extra' number of bytes leftover.
			 */
			if (l) {
				u32 v = *addr;

				if (l + extra >= sizeof(u32)) {
					data |= set_upper_bits(v, shift);
					len -= l + extra - sizeof(u32);
					if (len == length) {
						last = data;
						break;
					}
					__raw_writel(data, piobuf);
					piobuf++;
					extra = 0;
					data = 0;
				} else {
					/* Clear unused upper bytes */
					data |= clear_upper_bytes(v, l,
								  extra);
					if (len == length) {
						last = data;
						break;
					}
					extra += l;
				}
			} else if (len == length) {
				last = data;
				break;
			}
		} else if (len == length) {
			u32 w;

			/*
			 * Need to round up for the last dword in the
			 * packet.
			 */
			w = (len + 3) >> 2;
			__iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
			piobuf += w - 1;
			last = ((u32 *) ss->sge.vaddr)[w - 1];
			break;
		} else {
			/* Aligned source, nothing carried over: bulk-copy
			 * whole words and stash any trailing bytes. */
			u32 w = len >> 2;

			__iowrite32_copy(piobuf, ss->sge.vaddr, w);
			piobuf += w;

			extra = len & (sizeof(u32) - 1);
			if (extra) {
				u32 v = ((u32 *) ss->sge.vaddr)[w];

				/* Clear unused upper bytes */
				data = clear_upper_bytes(v, extra, 0);
			}
		}
		update_sge(ss, len);
		length -= len;
	}
	/* Update address before sending packet. */
	update_sge(ss, length);
	/* must flush early everything before trigger word */
	ipath_flush_wc();
	__raw_writel(last, piobuf);
	/* be sure trigger word is written */
	ipath_flush_wc();
}
757
/**
 * ipath_verbs_send - send a packet
 * @dd: the infinipath device
 * @hdrwords: the number of words in the header
 * @hdr: the packet header
 * @len: the length of the packet in bytes
 * @ss: the SGE to send
 *
 * Writes the packet into a PIO buffer: PBC control word first, then the
 * header, then the payload.  Returns 0 on success, -EINVAL if the packet
 * exceeds the IB MTU limit, or -EBUSY if no PIO buffer is available.
 */
int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords,
		     u32 *hdr, u32 len, struct ipath_sge_state *ss)
{
	u32 __iomem *piobuf;
	u32 plen;
	int ret;

	/* +1 is for the qword padding of pbc */
	plen = hdrwords + ((len + 3) >> 2) + 1;
	if (unlikely((plen << 2) > dd->ipath_ibmaxlen)) {
		ipath_dbg("packet len 0x%x too long, failing\n", plen);
		ret = -EINVAL;
		goto bail;
	}

	/* Get a PIO buffer to use. */
	piobuf = ipath_getpiobuf(dd, NULL);
	if (unlikely(piobuf == NULL)) {
		ret = -EBUSY;
		goto bail;
	}

	/*
	 * Write len to control qword, no flags.
	 * We have to flush after the PBC for correctness on some cpus
	 * or WC buffer can be written out of order.
	 */
	writeq(plen, piobuf);
	ipath_flush_wc();
	piobuf += 2;		/* skip the qword-sized PBC */
	if (len == 0) {
		/*
		 * If there is just the header portion, must flush before
		 * writing last word of header for correctness, and after
		 * the last header word (trigger word).
		 */
		__iowrite32_copy(piobuf, hdr, hdrwords - 1);
		ipath_flush_wc();
		__raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
		ipath_flush_wc();
		ret = 0;
		goto bail;
	}

	__iowrite32_copy(piobuf, hdr, hdrwords);
	piobuf += hdrwords;

	/* The common case is aligned and contained in one segment. */
	if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
		   !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
		u32 w;
		u32 *addr = (u32 *) ss->sge.vaddr;

		/* Update address before sending packet. */
		update_sge(ss, len);
		/* Need to round up for the last dword in the packet. */
		w = (len + 3) >> 2;
		__iowrite32_copy(piobuf, addr, w - 1);
		/* must flush early everything before trigger word */
		ipath_flush_wc();
		__raw_writel(addr[w - 1], piobuf + w - 1);
		/* be sure trigger word is written */
		ipath_flush_wc();
		ret = 0;
		goto bail;
	}
	/* Unaligned or multi-segment payload: take the slow path. */
	copy_io(piobuf, ss, len);
	ret = 0;

bail:
	return ret;
}
838
839 int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
840                             u64 *rwords, u64 *spkts, u64 *rpkts,
841                             u64 *xmit_wait)
842 {
843         int ret;
844
845         if (!(dd->ipath_flags & IPATH_INITTED)) {
846                 /* no hardware, freeze, etc. */
847                 ipath_dbg("unit %u not usable\n", dd->ipath_unit);
848                 ret = -EINVAL;
849                 goto bail;
850         }
851         *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
852         *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
853         *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
854         *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
855         *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
856
857         ret = 0;
858
859 bail:
860         return ret;
861 }
862
863 /**
864  * ipath_get_counters - get various chip counters
865  * @dd: the infinipath device
866  * @cntrs: counters are placed here
867  *
868  * Return the counters needed by recv_pma_get_portcounters().
869  */
870 int ipath_get_counters(struct ipath_devdata *dd,
871                        struct ipath_verbs_counters *cntrs)
872 {
873         int ret;
874
875         if (!(dd->ipath_flags & IPATH_INITTED)) {
876                 /* no hardware, freeze, etc. */
877                 ipath_dbg("unit %u not usable\n", dd->ipath_unit);
878                 ret = -EINVAL;
879                 goto bail;
880         }
881         cntrs->symbol_error_counter =
882                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_ibsymbolerrcnt);
883         cntrs->link_error_recovery_counter =
884                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt);
885         /*
886          * The link downed counter counts when the other side downs the
887          * connection.  We add in the number of times we downed the link
888          * due to local link integrity errors to compensate.
889          */
890         cntrs->link_downed_counter =
891                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkdowncnt);
892         cntrs->port_rcv_errors =
893                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_rxdroppktcnt) +
894                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvovflcnt) +
895                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_portovflcnt) +
896                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_err_rlencnt) +
897                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_invalidrlencnt) +
898                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) +
899                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) +
900                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) +
901                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt);
902         cntrs->port_rcv_remphys_errors =
903                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt);
904         cntrs->port_xmit_discards =
905                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_unsupvlcnt);
906         cntrs->port_xmit_data =
907                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
908         cntrs->port_rcv_data =
909                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
910         cntrs->port_xmit_packets =
911                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
912         cntrs->port_rcv_packets =
913                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
914         cntrs->local_link_integrity_errors = dd->ipath_lli_errors;
915         cntrs->excessive_buffer_overrun_errors = 0; /* XXX */
916
917         ret = 0;
918
919 bail:
920         return ret;
921 }
922
923 /**
924  * ipath_ib_piobufavail - callback when a PIO buffer is available
925  * @arg: the device pointer
926  *
927  * This is called from ipath_intr() at interrupt level when a PIO buffer is
928  * available after ipath_verbs_send() returned an error that no buffers were
929  * available.  Return 1 if we consumed all the PIO buffers and we still have
930  * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and
931  * return zero).
932  */
933 int ipath_ib_piobufavail(struct ipath_ibdev *dev)
934 {
935         struct ipath_qp *qp;
936         unsigned long flags;
937
938         if (dev == NULL)
939                 goto bail;
940
941         spin_lock_irqsave(&dev->pending_lock, flags);
942         while (!list_empty(&dev->piowait)) {
943                 qp = list_entry(dev->piowait.next, struct ipath_qp,
944                                 piowait);
945                 list_del_init(&qp->piowait);
946                 tasklet_hi_schedule(&qp->s_task);
947         }
948         spin_unlock_irqrestore(&dev->pending_lock, flags);
949
950 bail:
951         return 0;
952 }
953
954 static int ipath_query_device(struct ib_device *ibdev,
955                               struct ib_device_attr *props)
956 {
957         struct ipath_ibdev *dev = to_idev(ibdev);
958
959         memset(props, 0, sizeof(*props));
960
961         props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
962                 IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
963                 IB_DEVICE_SYS_IMAGE_GUID;
964         props->page_size_cap = PAGE_SIZE;
965         props->vendor_id = dev->dd->ipath_vendorid;
966         props->vendor_part_id = dev->dd->ipath_deviceid;
967         props->hw_ver = dev->dd->ipath_pcirev;
968
969         props->sys_image_guid = dev->sys_image_guid;
970
971         props->max_mr_size = ~0ull;
972         props->max_qp = ib_ipath_max_qps;
973         props->max_qp_wr = ib_ipath_max_qp_wrs;
974         props->max_sge = ib_ipath_max_sges;
975         props->max_cq = ib_ipath_max_cqs;
976         props->max_ah = ib_ipath_max_ahs;
977         props->max_cqe = ib_ipath_max_cqes;
978         props->max_mr = dev->lk_table.max;
979         props->max_pd = ib_ipath_max_pds;
980         props->max_qp_rd_atom = 1;
981         props->max_qp_init_rd_atom = 1;
982         /* props->max_res_rd_atom */
983         props->max_srq = ib_ipath_max_srqs;
984         props->max_srq_wr = ib_ipath_max_srq_wrs;
985         props->max_srq_sge = ib_ipath_max_srq_sges;
986         /* props->local_ca_ack_delay */
987         props->atomic_cap = IB_ATOMIC_HCA;
988         props->max_pkeys = ipath_get_npkeys(dev->dd);
989         props->max_mcast_grp = ib_ipath_max_mcast_grps;
990         props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
991         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
992                 props->max_mcast_grp;
993
994         return 0;
995 }
996
/*
 * Map the low 4 bits of the chip's IBC status (link training state)
 * to the value reported as the IB port physical state (see
 * ipath_query_port() and phys_state_show()).
 *
 * NOTE(review): the values appear to follow the IBTA PortPhysicalState
 * encoding (1=Sleep, 2=Polling, 3=Disabled, 4=PortConfigurationTraining,
 * 5=LinkUp, 6=LinkErrorRecovery) -- confirm against the IB spec.
 * Indices not listed here implicitly read as 0.
 */
const u8 ipath_cvt_physportstate[16] = {
	[INFINIPATH_IBCS_LT_STATE_DISABLED] = 3,
	[INFINIPATH_IBCS_LT_STATE_LINKUP] = 5,
	[INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = 2,
	[INFINIPATH_IBCS_LT_STATE_POLLQUIET] = 2,
	[INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = 1,
	[INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = 1,
	[INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = 4,
	[INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = 4,
	[INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = 4,
	[INFINIPATH_IBCS_LT_STATE_CFGIDLE] = 4,
	[INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = 6,
	[INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = 6,
	[INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = 6,
};
1012
1013 u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
1014 {
1015         return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
1016 }
1017
/* Fill in the port attributes reported to the IB core for @port. */
static int ipath_query_port(struct ib_device *ibdev,
			    u8 port, struct ib_port_attr *props)
{
	struct ipath_ibdev *dev = to_idev(ibdev);
	enum ib_mtu mtu;
	u16 lid = dev->dd->ipath_lid;
	u64 ibcstat;

	memset(props, 0, sizeof(*props));
	/* Report the permissive LID until a real LID has been assigned. */
	props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE);
	/* LMC is the low 3 bits of the packed mkeyprot/resv/lmc field. */
	props->lmc = dev->mkeyprot_resv_lmc & 7;
	props->sm_lid = dev->sm_lid;
	props->sm_sl = dev->sm_sl;
	ibcstat = dev->dd->ipath_lastibcstat;
	/*
	 * NOTE(review): bits 5:4 of ibcstat hold the 2-bit link state;
	 * the +1 bias presumably maps it onto the ib_port_state enum
	 * (IB_PORT_DOWN..IB_PORT_ACTIVE) -- confirm against chip docs.
	 */
	props->state = ((ibcstat >> 4) & 0x3) + 1;
	/* See phys_state_show() */
	props->phys_state = ipath_cvt_physportstate[
		dev->dd->ipath_lastibcstat & 0xf];
	props->port_cap_flags = dev->port_cap_flags;
	props->gid_tbl_len = 1;
	props->max_msg_sz = 0x80000000;
	props->pkey_tbl_len = ipath_get_npkeys(dev->dd);
	/* Violations are reported relative to the last "clear" snapshot. */
	props->bad_pkey_cntr = ipath_get_cr_errpkey(dev->dd) -
		dev->z_pkey_violations;
	props->qkey_viol_cntr = dev->qkey_violations;
	props->active_width = IB_WIDTH_4X;
	/* See rate_show() */
	/*
	 * NOTE(review): "10Mbs" below looks wrong; active_speed == 1 is
	 * the base (SDR) IB rate -- confirm and fix the comment upstream.
	 */
	props->active_speed = 1;	/* Regular 10Mbs speed. */
	props->max_vl_num = 1;		/* VLCap = VL0 */
	props->init_type_reply = 0;

	props->max_mtu = IB_MTU_4096;
	/* Translate the byte MTU into the ib_mtu enum; default to 2048. */
	switch (dev->dd->ipath_ibmtu) {
	case 4096:
		mtu = IB_MTU_4096;
		break;
	case 2048:
		mtu = IB_MTU_2048;
		break;
	case 1024:
		mtu = IB_MTU_1024;
		break;
	case 512:
		mtu = IB_MTU_512;
		break;
	case 256:
		mtu = IB_MTU_256;
		break;
	default:
		mtu = IB_MTU_2048;
	}
	props->active_mtu = mtu;
	props->subnet_timeout = dev->subnet_timeout;

	return 0;
}
1074
1075 static int ipath_modify_device(struct ib_device *device,
1076                                int device_modify_mask,
1077                                struct ib_device_modify *device_modify)
1078 {
1079         int ret;
1080
1081         if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1082                                    IB_DEVICE_MODIFY_NODE_DESC)) {
1083                 ret = -EOPNOTSUPP;
1084                 goto bail;
1085         }
1086
1087         if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
1088                 memcpy(device->node_desc, device_modify->node_desc, 64);
1089
1090         if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
1091                 to_idev(device)->sys_image_guid =
1092                         cpu_to_be64(device_modify->sys_image_guid);
1093
1094         ret = 0;
1095
1096 bail:
1097         return ret;
1098 }
1099
1100 static int ipath_modify_port(struct ib_device *ibdev,
1101                              u8 port, int port_modify_mask,
1102                              struct ib_port_modify *props)
1103 {
1104         struct ipath_ibdev *dev = to_idev(ibdev);
1105
1106         dev->port_cap_flags |= props->set_port_cap_mask;
1107         dev->port_cap_flags &= ~props->clr_port_cap_mask;
1108         if (port_modify_mask & IB_PORT_SHUTDOWN)
1109                 ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
1110         if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
1111                 dev->qkey_violations = 0;
1112         return 0;
1113 }
1114
1115 static int ipath_query_gid(struct ib_device *ibdev, u8 port,
1116                            int index, union ib_gid *gid)
1117 {
1118         struct ipath_ibdev *dev = to_idev(ibdev);
1119         int ret;
1120
1121         if (index >= 1) {
1122                 ret = -EINVAL;
1123                 goto bail;
1124         }
1125         gid->global.subnet_prefix = dev->gid_prefix;
1126         gid->global.interface_id = dev->dd->ipath_guid;
1127
1128         ret = 0;
1129
1130 bail:
1131         return ret;
1132 }
1133
1134 static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
1135                                     struct ib_ucontext *context,
1136                                     struct ib_udata *udata)
1137 {
1138         struct ipath_ibdev *dev = to_idev(ibdev);
1139         struct ipath_pd *pd;
1140         struct ib_pd *ret;
1141
1142         /*
1143          * This is actually totally arbitrary.  Some correctness tests
1144          * assume there's a maximum number of PDs that can be allocated.
1145          * We don't actually have this limit, but we fail the test if
1146          * we allow allocations of more than we report for this value.
1147          */
1148
1149         pd = kmalloc(sizeof *pd, GFP_KERNEL);
1150         if (!pd) {
1151                 ret = ERR_PTR(-ENOMEM);
1152                 goto bail;
1153         }
1154
1155         spin_lock(&dev->n_pds_lock);
1156         if (dev->n_pds_allocated == ib_ipath_max_pds) {
1157                 spin_unlock(&dev->n_pds_lock);
1158                 kfree(pd);
1159                 ret = ERR_PTR(-ENOMEM);
1160                 goto bail;
1161         }
1162
1163         dev->n_pds_allocated++;
1164         spin_unlock(&dev->n_pds_lock);
1165
1166         /* ib_alloc_pd() will initialize pd->ibpd. */
1167         pd->user = udata != NULL;
1168
1169         ret = &pd->ibpd;
1170
1171 bail:
1172         return ret;
1173 }
1174
1175 static int ipath_dealloc_pd(struct ib_pd *ibpd)
1176 {
1177         struct ipath_pd *pd = to_ipd(ibpd);
1178         struct ipath_ibdev *dev = to_idev(ibpd->device);
1179
1180         spin_lock(&dev->n_pds_lock);
1181         dev->n_pds_allocated--;
1182         spin_unlock(&dev->n_pds_lock);
1183
1184         kfree(pd);
1185
1186         return 0;
1187 }
1188
1189 /**
1190  * ipath_create_ah - create an address handle
1191  * @pd: the protection domain
1192  * @ah_attr: the attributes of the AH
1193  *
1194  * This may be called from interrupt context.
1195  */
1196 static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
1197                                      struct ib_ah_attr *ah_attr)
1198 {
1199         struct ipath_ah *ah;
1200         struct ib_ah *ret;
1201         struct ipath_ibdev *dev = to_idev(pd->device);
1202
1203         /* A multicast address requires a GRH (see ch. 8.4.1). */
1204         if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
1205             ah_attr->dlid != IPATH_PERMISSIVE_LID &&
1206             !(ah_attr->ah_flags & IB_AH_GRH)) {
1207                 ret = ERR_PTR(-EINVAL);
1208                 goto bail;
1209         }
1210
1211         if (ah_attr->dlid == 0) {
1212                 ret = ERR_PTR(-EINVAL);
1213                 goto bail;
1214         }
1215
1216         if (ah_attr->port_num < 1 ||
1217             ah_attr->port_num > pd->device->phys_port_cnt) {
1218                 ret = ERR_PTR(-EINVAL);
1219                 goto bail;
1220         }
1221
1222         ah = kmalloc(sizeof *ah, GFP_ATOMIC);
1223         if (!ah) {
1224                 ret = ERR_PTR(-ENOMEM);
1225                 goto bail;
1226         }
1227
1228         spin_lock(&dev->n_ahs_lock);
1229         if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
1230                 spin_unlock(&dev->n_ahs_lock);
1231                 kfree(ah);
1232                 ret = ERR_PTR(-ENOMEM);
1233                 goto bail;
1234         }
1235
1236         dev->n_ahs_allocated++;
1237         spin_unlock(&dev->n_ahs_lock);
1238
1239         /* ib_create_ah() will initialize ah->ibah. */
1240         ah->attr = *ah_attr;
1241
1242         ret = &ah->ibah;
1243
1244 bail:
1245         return ret;
1246 }
1247
1248 /**
1249  * ipath_destroy_ah - destroy an address handle
1250  * @ibah: the AH to destroy
1251  *
1252  * This may be called from interrupt context.
1253  */
1254 static int ipath_destroy_ah(struct ib_ah *ibah)
1255 {
1256         struct ipath_ibdev *dev = to_idev(ibah->device);
1257         struct ipath_ah *ah = to_iah(ibah);
1258
1259         spin_lock(&dev->n_ahs_lock);
1260         dev->n_ahs_allocated--;
1261         spin_unlock(&dev->n_ahs_lock);
1262
1263         kfree(ah);
1264
1265         return 0;
1266 }
1267
1268 static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
1269 {
1270         struct ipath_ah *ah = to_iah(ibah);
1271
1272         *ah_attr = ah->attr;
1273
1274         return 0;
1275 }
1276
1277 /**
1278  * ipath_get_npkeys - return the size of the PKEY table for port 0
1279  * @dd: the infinipath device
1280  */
1281 unsigned ipath_get_npkeys(struct ipath_devdata *dd)
1282 {
1283         return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
1284 }
1285
1286 /**
1287  * ipath_get_pkey - return the indexed PKEY from the port 0 PKEY table
1288  * @dd: the infinipath device
1289  * @index: the PKEY index
1290  */
1291 unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
1292 {
1293         unsigned ret;
1294
1295         if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
1296                 ret = 0;
1297         else
1298                 ret = dd->ipath_pd[0]->port_pkeys[index];
1299
1300         return ret;
1301 }
1302
1303 static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1304                             u16 *pkey)
1305 {
1306         struct ipath_ibdev *dev = to_idev(ibdev);
1307         int ret;
1308
1309         if (index >= ipath_get_npkeys(dev->dd)) {
1310                 ret = -EINVAL;
1311                 goto bail;
1312         }
1313
1314         *pkey = ipath_get_pkey(dev->dd, index);
1315         ret = 0;
1316
1317 bail:
1318         return ret;
1319 }
1320
1321 /**
1322  * ipath_alloc_ucontext - allocate a ucontest
1323  * @ibdev: the infiniband device
1324  * @udata: not used by the InfiniPath driver
1325  */
1326
1327 static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
1328                                                 struct ib_udata *udata)
1329 {
1330         struct ipath_ucontext *context;
1331         struct ib_ucontext *ret;
1332
1333         context = kmalloc(sizeof *context, GFP_KERNEL);
1334         if (!context) {
1335                 ret = ERR_PTR(-ENOMEM);
1336                 goto bail;
1337         }
1338
1339         ret = &context->ibucontext;
1340
1341 bail:
1342         return ret;
1343 }
1344
/* Free a ucontext previously allocated by ipath_alloc_ucontext(). */
static int ipath_dealloc_ucontext(struct ib_ucontext *context)
{
	struct ipath_ucontext *uc = to_iucontext(context);

	kfree(uc);
	return 0;
}
1350
1351 static int ipath_verbs_register_sysfs(struct ib_device *dev);
1352
/*
 * Per-jiffy verbs housekeeping timer callback; re-arms itself each run
 * (started by enable_timer(), stopped by disable_timer()).
 */
static void __verbs_timer(unsigned long arg)
{
	struct ipath_devdata *dd = (struct ipath_devdata *) arg;

	/*
	 * If port 0 receive packet interrupts are not available, or
	 * can be missed, poll the receive queue
	 */
	if (dd->ipath_flags & IPATH_POLL_RX_INTR)
		ipath_kreceive(dd);

	/* Handle verbs layer timeouts. */
	ipath_ib_timer(dd->verbs_dev);

	/* Re-arm to fire again on the next jiffy. */
	mod_timer(&dd->verbs_timer, jiffies + 1);
}
1369
/*
 * Set up the per-jiffy verbs timer and, where supported, the per-packet
 * GPIO receive interrupt.  Always returns 0.
 */
static int enable_timer(struct ipath_devdata *dd)
{
	/*
	 * Early chips had a design flaw where the chip and kernel idea
	 * of the tail register don't always agree, and therefore we won't
	 * get an interrupt on the next packet received.
	 * If the board supports per packet receive interrupts, use it.
	 * Otherwise, the timer function periodically checks for packets
	 * to cover this case.
	 * Either way, the timer is needed for verbs layer related
	 * processing.
	 */
	if (dd->ipath_flags & IPATH_GPIO_INTR) {
		/*
		 * NOTE(review): magic value for the debug port select
		 * register; presumably routes the receive-available signal
		 * onto GPIO bit 2 -- confirm against chip documentation.
		 */
		ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
				 0x2074076542310ULL);
		/* Enable GPIO bit 2 interrupt */
		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
				 (u64) (1 << 2));
	}

	/* Arm the housekeeping timer to fire on the next jiffy. */
	init_timer(&dd->verbs_timer);
	dd->verbs_timer.function = __verbs_timer;
	dd->verbs_timer.data = (unsigned long)dd;
	dd->verbs_timer.expires = jiffies + 1;
	add_timer(&dd->verbs_timer);

	return 0;
}
1398
/*
 * Undo enable_timer(): mask the GPIO interrupt (if in use) and stop the
 * housekeeping timer, waiting for any in-flight run to finish.
 * Always returns 0.
 */
static int disable_timer(struct ipath_devdata *dd)
{
	/* Disable GPIO bit 2 interrupt */
	if (dd->ipath_flags & IPATH_GPIO_INTR)
		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, 0);

	del_timer_sync(&dd->verbs_timer);

	return 0;
}
1409
1410 /**
1411  * ipath_register_ib_device - register our device with the infiniband core
1412  * @dd: the device data structure
1413  * Return the allocated ipath_ibdev pointer or NULL on error.
1414  */
1415 int ipath_register_ib_device(struct ipath_devdata *dd)
1416 {
1417         struct ipath_verbs_counters cntrs;
1418         struct ipath_ibdev *idev;
1419         struct ib_device *dev;
1420         int ret;
1421
1422         idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
1423         if (idev == NULL) {
1424                 ret = -ENOMEM;
1425                 goto bail;
1426         }
1427
1428         dev = &idev->ibdev;
1429
1430         /* Only need to initialize non-zero fields. */
1431         spin_lock_init(&idev->n_pds_lock);
1432         spin_lock_init(&idev->n_ahs_lock);
1433         spin_lock_init(&idev->n_cqs_lock);
1434         spin_lock_init(&idev->n_qps_lock);
1435         spin_lock_init(&idev->n_srqs_lock);
1436         spin_lock_init(&idev->n_mcast_grps_lock);
1437
1438         spin_lock_init(&idev->qp_table.lock);
1439         spin_lock_init(&idev->lk_table.lock);
1440         idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE);
1441         /* Set the prefix to the default value (see ch. 4.1.1) */
1442         idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL);
1443
1444         ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
1445         if (ret)
1446                 goto err_qp;
1447
1448         /*
1449          * The top ib_ipath_lkey_table_size bits are used to index the
1450          * table.  The lower 8 bits can be owned by the user (copied from
1451          * the LKEY).  The remaining bits act as a generation number or tag.
1452          */
1453         idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
1454         idev->lk_table.table = kzalloc(idev->lk_table.max *
1455                                        sizeof(*idev->lk_table.table),
1456                                        GFP_KERNEL);
1457         if (idev->lk_table.table == NULL) {
1458                 ret = -ENOMEM;
1459                 goto err_lk;
1460         }
1461         spin_lock_init(&idev->pending_lock);
1462         INIT_LIST_HEAD(&idev->pending[0]);
1463         INIT_LIST_HEAD(&idev->pending[1]);
1464         INIT_LIST_HEAD(&idev->pending[2]);
1465         INIT_LIST_HEAD(&idev->piowait);
1466         INIT_LIST_HEAD(&idev->rnrwait);
1467         idev->pending_index = 0;
1468         idev->port_cap_flags =
1469                 IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
1470         idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
1471         idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
1472         idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
1473         idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
1474         idev->pma_counter_select[5] = IB_PMA_PORT_XMIT_WAIT;
1475         idev->link_width_enabled = 3;   /* 1x or 4x */
1476
1477         /* Snapshot current HW counters to "clear" them. */
1478         ipath_get_counters(dd, &cntrs);
1479         idev->z_symbol_error_counter = cntrs.symbol_error_counter;
1480         idev->z_link_error_recovery_counter =
1481                 cntrs.link_error_recovery_counter;
1482         idev->z_link_downed_counter = cntrs.link_downed_counter;
1483         idev->z_port_rcv_errors = cntrs.port_rcv_errors;
1484         idev->z_port_rcv_remphys_errors =
1485                 cntrs.port_rcv_remphys_errors;
1486         idev->z_port_xmit_discards = cntrs.port_xmit_discards;
1487         idev->z_port_xmit_data = cntrs.port_xmit_data;
1488         idev->z_port_rcv_data = cntrs.port_rcv_data;
1489         idev->z_port_xmit_packets = cntrs.port_xmit_packets;
1490         idev->z_port_rcv_packets = cntrs.port_rcv_packets;
1491         idev->z_local_link_integrity_errors =
1492                 cntrs.local_link_integrity_errors;
1493         idev->z_excessive_buffer_overrun_errors =
1494                 cntrs.excessive_buffer_overrun_errors;
1495
1496         /*
1497          * The system image GUID is supposed to be the same for all
1498          * IB HCAs in a single system but since there can be other
1499          * device types in the system, we can't be sure this is unique.
1500          */
1501         if (!sys_image_guid)
1502                 sys_image_guid = dd->ipath_guid;
1503         idev->sys_image_guid = sys_image_guid;
1504         idev->ib_unit = dd->ipath_unit;
1505         idev->dd = dd;
1506
1507         strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
1508         dev->owner = THIS_MODULE;
1509         dev->node_guid = dd->ipath_guid;
1510         dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
1511         dev->uverbs_cmd_mask =
1512                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
1513                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
1514                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
1515                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
1516                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
1517                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
1518                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
1519                 (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
1520                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
1521                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
1522                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
1523                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
1524                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
1525                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
1526                 (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
1527                 (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
1528                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
1529                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
1530                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
1531                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
1532                 (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
1533                 (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
1534                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
1535                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
1536                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
1537                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
1538                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
1539                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
1540                 (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
1541         dev->node_type = RDMA_NODE_IB_CA;
1542         dev->phys_port_cnt = 1;
1543         dev->dma_device = &dd->pcidev->dev;
1544         dev->class_dev.dev = dev->dma_device;
1545         dev->query_device = ipath_query_device;
1546         dev->modify_device = ipath_modify_device;
1547         dev->query_port = ipath_query_port;
1548         dev->modify_port = ipath_modify_port;
1549         dev->query_pkey = ipath_query_pkey;
1550         dev->query_gid = ipath_query_gid;
1551         dev->alloc_ucontext = ipath_alloc_ucontext;
1552         dev->dealloc_ucontext = ipath_dealloc_ucontext;
1553         dev->alloc_pd = ipath_alloc_pd;
1554         dev->dealloc_pd = ipath_dealloc_pd;
1555         dev->create_ah = ipath_create_ah;
1556         dev->destroy_ah = ipath_destroy_ah;
1557         dev->query_ah = ipath_query_ah;
1558         dev->create_srq = ipath_create_srq;
1559         dev->modify_srq = ipath_modify_srq;
1560         dev->query_srq = ipath_query_srq;
1561         dev->destroy_srq = ipath_destroy_srq;
1562         dev->create_qp = ipath_create_qp;
1563         dev->modify_qp = ipath_modify_qp;
1564         dev->query_qp = ipath_query_qp;
1565         dev->destroy_qp = ipath_destroy_qp;
1566         dev->post_send = ipath_post_send;
1567         dev->post_recv = ipath_post_receive;
1568         dev->post_srq_recv = ipath_post_srq_receive;
1569         dev->create_cq = ipath_create_cq;
1570         dev->destroy_cq = ipath_destroy_cq;
1571         dev->resize_cq = ipath_resize_cq;
1572         dev->poll_cq = ipath_poll_cq;
1573         dev->req_notify_cq = ipath_req_notify_cq;
1574         dev->get_dma_mr = ipath_get_dma_mr;
1575         dev->reg_phys_mr = ipath_reg_phys_mr;
1576         dev->reg_user_mr = ipath_reg_user_mr;
1577         dev->dereg_mr = ipath_dereg_mr;
1578         dev->alloc_fmr = ipath_alloc_fmr;
1579         dev->map_phys_fmr = ipath_map_phys_fmr;
1580         dev->unmap_fmr = ipath_unmap_fmr;
1581         dev->dealloc_fmr = ipath_dealloc_fmr;
1582         dev->attach_mcast = ipath_multicast_attach;
1583         dev->detach_mcast = ipath_multicast_detach;
1584         dev->process_mad = ipath_process_mad;
1585         dev->mmap = ipath_mmap;
1586
1587         snprintf(dev->node_desc, sizeof(dev->node_desc),
1588                  IPATH_IDSTR " %s", system_utsname.nodename);
1589
1590         ret = ib_register_device(dev);
1591         if (ret)
1592                 goto err_reg;
1593
1594         if (ipath_verbs_register_sysfs(dev))
1595                 goto err_class;
1596
1597         enable_timer(dd);
1598
1599         goto bail;
1600
1601 err_class:
1602         ib_unregister_device(dev);
1603 err_reg:
1604         kfree(idev->lk_table.table);
1605 err_lk:
1606         kfree(idev->qp_table.table);
1607 err_qp:
1608         ib_dealloc_device(dev);
1609         ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
1610         idev = NULL;
1611
1612 bail:
1613         dd->verbs_dev = idev;
1614         return ret;
1615 }
1616
/*
 * Undo ipath_register_ib_device(): stop the housekeeping timer,
 * unregister from the IB core, warn about any leaked wait-list or
 * multicast state, then free the QP/LKEY tables and the device.
 */
void ipath_unregister_ib_device(struct ipath_ibdev *dev)
{
	struct ib_device *ibdev = &dev->ibdev;

	/* Stop the timer before tearing down what it touches. */
	disable_timer(dev->dd);

	ib_unregister_device(ibdev);

	/* Anything still queued at this point is a driver bug; report it. */
	if (!list_empty(&dev->pending[0]) ||
	    !list_empty(&dev->pending[1]) ||
	    !list_empty(&dev->pending[2]))
		ipath_dev_err(dev->dd, "pending list not empty!\n");
	if (!list_empty(&dev->piowait))
		ipath_dev_err(dev->dd, "piowait list not empty!\n");
	if (!list_empty(&dev->rnrwait))
		ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
	if (!ipath_mcast_tree_empty())
		ipath_dev_err(dev->dd, "multicast table memory leak!\n");
	/*
	 * Note that ipath_unregister_ib_device() can be called before all
	 * the QPs are destroyed!
	 */
	ipath_free_all_qps(&dev->qp_table);
	kfree(dev->qp_table.table);
	kfree(dev->lk_table.table);
	ib_dealloc_device(ibdev);
}
1644
1645 static ssize_t show_rev(struct class_device *cdev, char *buf)
1646 {
1647         struct ipath_ibdev *dev =
1648                 container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
1649
1650         return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
1651 }
1652
1653 static ssize_t show_hca(struct class_device *cdev, char *buf)
1654 {
1655         struct ipath_ibdev *dev =
1656                 container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
1657         int ret;
1658
1659         ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
1660         if (ret < 0)
1661                 goto bail;
1662         strcat(buf, "\n");
1663         ret = strlen(buf);
1664
1665 bail:
1666         return ret;
1667 }
1668
1669 static ssize_t show_stats(struct class_device *cdev, char *buf)
1670 {
1671         struct ipath_ibdev *dev =
1672                 container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
1673         int i;
1674         int len;
1675
1676         len = sprintf(buf,
1677                       "RC resends  %d\n"
1678                       "RC no QACK  %d\n"
1679                       "RC ACKs     %d\n"
1680                       "RC SEQ NAKs %d\n"
1681                       "RC RDMA seq %d\n"
1682                       "RC RNR NAKs %d\n"
1683                       "RC OTH NAKs %d\n"
1684                       "RC timeouts %d\n"
1685                       "RC RDMA dup %d\n"
1686                       "piobuf wait %d\n"
1687                       "no piobuf   %d\n"
1688                       "PKT drops   %d\n"
1689                       "WQE errs    %d\n",
1690                       dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
1691                       dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
1692                       dev->n_other_naks, dev->n_timeouts,
1693                       dev->n_rdma_dup_busy, dev->n_piowait,
1694                       dev->n_no_piobuf, dev->n_pkt_drops, dev->n_wqe_errs);
1695         for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
1696                 const struct ipath_opcode_stats *si = &dev->opstats[i];
1697
1698                 if (!si->n_packets && !si->n_bytes)
1699                         continue;
1700                 len += sprintf(buf + len, "%02x %llu/%llu\n", i,
1701                                (unsigned long long) si->n_packets,
1702                                (unsigned long long) si->n_bytes);
1703         }
1704         return len;
1705 }
1706
/*
 * Read-only per-device sysfs attributes.  Note that both "hca_type"
 * and "board_id" are backed by show_hca, so they report the same
 * board name string.
 */
static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
static CLASS_DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);

/* Table walked by ipath_verbs_register_sysfs() to create the files. */
static struct class_device_attribute *ipath_class_attributes[] = {
	&class_device_attr_hw_rev,
	&class_device_attr_hca_type,
	&class_device_attr_board_id,
	&class_device_attr_stats
};
1718
1719 static int ipath_verbs_register_sysfs(struct ib_device *dev)
1720 {
1721         int i;
1722         int ret;
1723
1724         for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
1725                 if (class_device_create_file(&dev->class_dev,
1726                                              ipath_class_attributes[i])) {
1727                         ret = 1;
1728                         goto bail;
1729                 }
1730
1731         ret = 0;
1732
1733 bail:
1734         return ret;
1735 }