GitHub Repository: awilliam/linux-vfio
Path: blob/master/drivers/infiniband/hw/qib/qib_rc.c
1
/*
2
* Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
3
* Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4
*
5
* This software is available to you under a choice of one of two
6
* licenses. You may choose to be licensed under the terms of the GNU
7
* General Public License (GPL) Version 2, available from the file
8
* COPYING in the main directory of this source tree, or the
9
* OpenIB.org BSD license below:
10
*
11
* Redistribution and use in source and binary forms, with or
12
* without modification, are permitted provided that the following
13
* conditions are met:
14
*
15
* - Redistributions of source code must retain the above
16
* copyright notice, this list of conditions and the following
17
* disclaimer.
18
*
19
* - Redistributions in binary form must reproduce the above
20
* copyright notice, this list of conditions and the following
21
* disclaimer in the documentation and/or other materials
22
* provided with the distribution.
23
*
24
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
* SOFTWARE.
32
*/
33
34
#include <linux/io.h>
35
36
#include "qib.h"
37
38
/* cut down ridiculously long IB macro names */
39
#define OP(x) IB_OPCODE_RC_##x
40
41
static void rc_timeout(unsigned long arg);
42
43
static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe,
44
u32 psn, u32 pmtu)
45
{
46
u32 len;
47
48
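/*
* The 24-bit PSN delta is the number of payload packets already sent
* for this WQE; multiplied by the path MTU it gives the byte offset
* at which the transfer resumes.
*/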
len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
49
ss->sge = wqe->sg_list[0];
50
ss->sg_list = wqe->sg_list + 1;
51
ss->num_sge = wqe->wr.num_sge;
52
ss->total_len = wqe->length;
53
qib_skip_sge(ss, len, 0);
54
return wqe->length - len;
55
}
56
57
static void start_timer(struct qib_qp *qp)
58
{
59
qp->s_flags |= QIB_S_TIMER;
60
qp->s_timer.function = rc_timeout;
61
/* 4.096 usec. * (1 << qp->timeout) */
62
qp->s_timer.expires = jiffies +
63
usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / 1000UL);
64
add_timer(&qp->s_timer);
65
}
66
67
/**
68
* qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
69
* @dev: the device for this QP
70
* @qp: a pointer to the QP
71
* @ohdr: a pointer to the IB header being constructed
72
* @pmtu: the path MTU
73
*
74
* Return 1 if constructed; otherwise, return 0.
75
* Note that we are on the responder's side of the QP context.
76
* Note the QP s_lock must be held.
77
*/
78
static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp,
79
struct qib_other_headers *ohdr, u32 pmtu)
80
{
81
struct qib_ack_entry *e;
82
u32 hwords;
83
u32 len;
84
u32 bth0;
85
u32 bth2;
86
87
/* Don't send an ACK if we aren't supposed to. */
88
if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
89
goto bail;
90
91
/* header size in 32-bit words LRH+BTH = (8+12)/4. */
92
hwords = 5;
93
94
switch (qp->s_ack_state) {
95
case OP(RDMA_READ_RESPONSE_LAST):
96
case OP(RDMA_READ_RESPONSE_ONLY):
97
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
98
if (e->rdma_sge.mr) {
99
atomic_dec(&e->rdma_sge.mr->refcount);
100
e->rdma_sge.mr = NULL;
101
}
102
/* FALLTHROUGH */
103
case OP(ATOMIC_ACKNOWLEDGE):
104
/*
105
* We can increment the tail pointer now that the last
106
* response has been sent instead of only being
107
* constructed.
108
*/
109
if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
110
qp->s_tail_ack_queue = 0;
111
/* FALLTHROUGH */
112
case OP(SEND_ONLY):
113
case OP(ACKNOWLEDGE):
114
/* Check for no next entry in the queue. */
115
if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
116
if (qp->s_flags & QIB_S_ACK_PENDING)
117
goto normal;
118
goto bail;
119
}
120
121
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
122
if (e->opcode == OP(RDMA_READ_REQUEST)) {
123
/*
124
* If a RDMA read response is being resent and
125
* we haven't seen the duplicate request yet,
126
* then stop sending the remaining responses the
127
* responder has seen until the requester resends it.
128
*/
129
len = e->rdma_sge.sge_length;
130
if (len && !e->rdma_sge.mr) {
131
qp->s_tail_ack_queue = qp->r_head_ack_queue;
132
goto bail;
133
}
134
/* Copy SGE state in case we need to resend */
135
qp->s_rdma_mr = e->rdma_sge.mr;
136
if (qp->s_rdma_mr)
137
atomic_inc(&qp->s_rdma_mr->refcount);
138
qp->s_ack_rdma_sge.sge = e->rdma_sge;
139
qp->s_ack_rdma_sge.num_sge = 1;
140
qp->s_cur_sge = &qp->s_ack_rdma_sge;
141
if (len > pmtu) {
142
len = pmtu;
143
qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
144
} else {
145
qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
146
e->sent = 1;
147
}
148
ohdr->u.aeth = qib_compute_aeth(qp);
149
hwords++;
150
qp->s_ack_rdma_psn = e->psn;
151
bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
152
} else {
153
/* COMPARE_SWAP or FETCH_ADD */
154
qp->s_cur_sge = NULL;
155
len = 0;
156
qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
157
ohdr->u.at.aeth = qib_compute_aeth(qp);
158
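/*
* Return the 64-bit original value as two big-endian 32-bit
* words, high word first.
*/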
ohdr->u.at.atomic_ack_eth[0] =
159
cpu_to_be32(e->atomic_data >> 32);
160
ohdr->u.at.atomic_ack_eth[1] =
161
cpu_to_be32(e->atomic_data);
162
hwords += sizeof(ohdr->u.at) / sizeof(u32);
163
bth2 = e->psn & QIB_PSN_MASK;
164
e->sent = 1;
165
}
166
bth0 = qp->s_ack_state << 24;
167
break;
168
169
case OP(RDMA_READ_RESPONSE_FIRST):
170
qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
171
/* FALLTHROUGH */
172
case OP(RDMA_READ_RESPONSE_MIDDLE):
173
qp->s_cur_sge = &qp->s_ack_rdma_sge;
174
qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
175
if (qp->s_rdma_mr)
176
atomic_inc(&qp->s_rdma_mr->refcount);
177
len = qp->s_ack_rdma_sge.sge.sge_length;
178
if (len > pmtu)
179
len = pmtu;
180
else {
181
ohdr->u.aeth = qib_compute_aeth(qp);
182
hwords++;
183
qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
184
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
185
e->sent = 1;
186
}
187
bth0 = qp->s_ack_state << 24;
188
bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
189
break;
190
191
default:
192
normal:
193
/*
194
* Send a regular ACK.
195
* Set the s_ack_state so we wait until after sending
196
* the ACK before setting s_ack_state to ACKNOWLEDGE
197
* (see above).
198
*/
199
qp->s_ack_state = OP(SEND_ONLY);
200
qp->s_flags &= ~QIB_S_ACK_PENDING;
201
qp->s_cur_sge = NULL;
202
if (qp->s_nak_state)
203
ohdr->u.aeth =
204
cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
205
(qp->s_nak_state <<
206
QIB_AETH_CREDIT_SHIFT));
207
else
208
ohdr->u.aeth = qib_compute_aeth(qp);
209
hwords++;
210
len = 0;
211
bth0 = OP(ACKNOWLEDGE) << 24;
212
bth2 = qp->s_ack_psn & QIB_PSN_MASK;
213
}
214
qp->s_rdma_ack_cnt++;
215
qp->s_hdrwords = hwords;
216
qp->s_cur_size = len;
217
qib_make_ruc_header(qp, ohdr, bth0, bth2);
218
return 1;
219
220
bail:
221
qp->s_ack_state = OP(ACKNOWLEDGE);
222
qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING);
223
return 0;
224
}
225
226
/**
227
* qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
228
* @qp: a pointer to the QP
229
*
230
* Return 1 if constructed; otherwise, return 0.
231
*/
232
int qib_make_rc_req(struct qib_qp *qp)
233
{
234
struct qib_ibdev *dev = to_idev(qp->ibqp.device);
235
struct qib_other_headers *ohdr;
236
struct qib_sge_state *ss;
237
struct qib_swqe *wqe;
238
u32 hwords;
239
u32 len;
240
u32 bth0;
241
u32 bth2;
242
u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
243
char newreq;
244
unsigned long flags;
245
int ret = 0;
246
int delta;
247
248
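/*
* Point at the BTH portion of the header; with a GRH the BTH
* follows the 40-byte GRH.
*/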
ohdr = &qp->s_hdr.u.oth;
249
if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
250
ohdr = &qp->s_hdr.u.l.oth;
251
252
/*
253
* The lock is needed to synchronize between the sending tasklet,
254
* the receive interrupt handler, and timeout resends.
255
*/
256
spin_lock_irqsave(&qp->s_lock, flags);
257
258
/* Sending responses has higher priority than sending requests. */
259
if ((qp->s_flags & QIB_S_RESP_PENDING) &&
260
qib_make_rc_ack(dev, qp, ohdr, pmtu))
261
goto done;
262
263
if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {
264
if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
265
goto bail;
266
/* We are in the error state, flush the work request. */
267
if (qp->s_last == qp->s_head)
268
goto bail;
269
/* If DMAs are in progress, we can't flush immediately. */
270
if (atomic_read(&qp->s_dma_busy)) {
271
qp->s_flags |= QIB_S_WAIT_DMA;
272
goto bail;
273
}
274
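/*
* Retire everything already ACKed as successful, then complete
* the first un-ACKed WQE with a flush error.
*/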
wqe = get_swqe_ptr(qp, qp->s_last);
275
while (qp->s_last != qp->s_acked) {
276
qib_send_complete(qp, wqe, IB_WC_SUCCESS);
277
if (++qp->s_last >= qp->s_size)
278
qp->s_last = 0;
279
wqe = get_swqe_ptr(qp, qp->s_last);
280
}
281
qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
282
goto done;
283
}
284
285
if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK))
286
goto bail;
287
288
if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
289
if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
290
qp->s_flags |= QIB_S_WAIT_PSN;
291
goto bail;
292
}
293
qp->s_sending_psn = qp->s_psn;
294
qp->s_sending_hpsn = qp->s_psn - 1;
295
}
296
297
/* header size in 32-bit words LRH+BTH = (8+12)/4. */
298
hwords = 5;
299
bth0 = 0;
300
301
/* Send a request. */
302
wqe = get_swqe_ptr(qp, qp->s_cur);
303
switch (qp->s_state) {
304
default:
305
if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK))
306
goto bail;
307
/*
308
* Resend an old request or start a new one.
309
*
310
* We keep track of the current SWQE so that
311
* we don't reset the "furthest progress" state
312
* if we need to back up.
313
*/
314
newreq = 0;
315
if (qp->s_cur == qp->s_tail) {
316
/* Check if send work queue is empty. */
317
if (qp->s_tail == qp->s_head)
318
goto bail;
319
/*
320
* If a fence is requested, wait for previous
321
* RDMA read and atomic operations to finish.
322
*/
323
if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
324
qp->s_num_rd_atomic) {
325
qp->s_flags |= QIB_S_WAIT_FENCE;
326
goto bail;
327
}
328
wqe->psn = qp->s_next_psn;
329
newreq = 1;
330
}
331
/*
332
* Note that we have to be careful not to modify the
333
* original work request since we may need to resend
334
* it.
335
*/
336
len = wqe->length;
337
ss = &qp->s_sge;
338
bth2 = qp->s_psn & QIB_PSN_MASK;
339
switch (wqe->wr.opcode) {
340
case IB_WR_SEND:
341
case IB_WR_SEND_WITH_IMM:
342
/* If no credit, return. */
343
if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
344
qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
345
qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
346
goto bail;
347
}
348
wqe->lpsn = wqe->psn;
349
if (len > pmtu) {
350
wqe->lpsn += (len - 1) / pmtu;
351
qp->s_state = OP(SEND_FIRST);
352
len = pmtu;
353
break;
354
}
355
if (wqe->wr.opcode == IB_WR_SEND)
356
qp->s_state = OP(SEND_ONLY);
357
else {
358
qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
359
/* Immediate data comes after the BTH */
360
ohdr->u.imm_data = wqe->wr.ex.imm_data;
361
hwords += 1;
362
}
363
if (wqe->wr.send_flags & IB_SEND_SOLICITED)
364
bth0 |= IB_BTH_SOLICITED;
365
bth2 |= IB_BTH_REQ_ACK;
366
if (++qp->s_cur == qp->s_size)
367
qp->s_cur = 0;
368
break;
369
370
case IB_WR_RDMA_WRITE:
371
if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
372
qp->s_lsn++;
373
/* FALLTHROUGH */
374
case IB_WR_RDMA_WRITE_WITH_IMM:
375
/* If no credit, return. */
376
if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
377
qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
378
qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
379
goto bail;
380
}
381
ohdr->u.rc.reth.vaddr =
382
cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
383
ohdr->u.rc.reth.rkey =
384
cpu_to_be32(wqe->wr.wr.rdma.rkey);
385
ohdr->u.rc.reth.length = cpu_to_be32(len);
386
hwords += sizeof(struct ib_reth) / sizeof(u32);
387
wqe->lpsn = wqe->psn;
388
if (len > pmtu) {
389
wqe->lpsn += (len - 1) / pmtu;
390
qp->s_state = OP(RDMA_WRITE_FIRST);
391
len = pmtu;
392
break;
393
}
394
if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
395
qp->s_state = OP(RDMA_WRITE_ONLY);
396
else {
397
qp->s_state =
398
OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
399
/* Immediate data comes after RETH */
400
ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
401
hwords += 1;
402
if (wqe->wr.send_flags & IB_SEND_SOLICITED)
403
bth0 |= IB_BTH_SOLICITED;
404
}
405
bth2 |= IB_BTH_REQ_ACK;
406
if (++qp->s_cur == qp->s_size)
407
qp->s_cur = 0;
408
break;
409
410
case IB_WR_RDMA_READ:
411
/*
412
* Don't allow more operations to be started
413
* than the QP limits allow.
414
*/
415
if (newreq) {
416
if (qp->s_num_rd_atomic >=
417
qp->s_max_rd_atomic) {
418
qp->s_flags |= QIB_S_WAIT_RDMAR;
419
goto bail;
420
}
421
qp->s_num_rd_atomic++;
422
if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
423
qp->s_lsn++;
424
/*
425
* Adjust s_next_psn to count the
426
* expected number of responses.
427
*/
428
if (len > pmtu)
429
qp->s_next_psn += (len - 1) / pmtu;
430
wqe->lpsn = qp->s_next_psn++;
431
}
432
ohdr->u.rc.reth.vaddr =
433
cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
434
ohdr->u.rc.reth.rkey =
435
cpu_to_be32(wqe->wr.wr.rdma.rkey);
436
ohdr->u.rc.reth.length = cpu_to_be32(len);
437
qp->s_state = OP(RDMA_READ_REQUEST);
438
hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
439
ss = NULL;
440
len = 0;
441
bth2 |= IB_BTH_REQ_ACK;
442
if (++qp->s_cur == qp->s_size)
443
qp->s_cur = 0;
444
break;
445
446
case IB_WR_ATOMIC_CMP_AND_SWP:
447
case IB_WR_ATOMIC_FETCH_AND_ADD:
448
/*
449
* Don't allow more operations to be started
450
* than the QP limits allow.
451
*/
452
if (newreq) {
453
if (qp->s_num_rd_atomic >=
454
qp->s_max_rd_atomic) {
455
qp->s_flags |= QIB_S_WAIT_RDMAR;
456
goto bail;
457
}
458
qp->s_num_rd_atomic++;
459
if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
460
qp->s_lsn++;
461
wqe->lpsn = wqe->psn;
462
}
463
if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
464
qp->s_state = OP(COMPARE_SWAP);
465
ohdr->u.atomic_eth.swap_data = cpu_to_be64(
466
wqe->wr.wr.atomic.swap);
467
ohdr->u.atomic_eth.compare_data = cpu_to_be64(
468
wqe->wr.wr.atomic.compare_add);
469
} else {
470
qp->s_state = OP(FETCH_ADD);
471
ohdr->u.atomic_eth.swap_data = cpu_to_be64(
472
wqe->wr.wr.atomic.compare_add);
473
ohdr->u.atomic_eth.compare_data = 0;
474
}
475
ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
476
wqe->wr.wr.atomic.remote_addr >> 32);
477
ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
478
wqe->wr.wr.atomic.remote_addr);
479
ohdr->u.atomic_eth.rkey = cpu_to_be32(
480
wqe->wr.wr.atomic.rkey);
481
hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
482
ss = NULL;
483
len = 0;
484
bth2 |= IB_BTH_REQ_ACK;
485
if (++qp->s_cur == qp->s_size)
486
qp->s_cur = 0;
487
break;
488
489
default:
490
goto bail;
491
}
492
qp->s_sge.sge = wqe->sg_list[0];
493
qp->s_sge.sg_list = wqe->sg_list + 1;
494
qp->s_sge.num_sge = wqe->wr.num_sge;
495
qp->s_sge.total_len = wqe->length;
496
qp->s_len = wqe->length;
497
if (newreq) {
498
qp->s_tail++;
499
if (qp->s_tail >= qp->s_size)
500
qp->s_tail = 0;
501
}
502
if (wqe->wr.opcode == IB_WR_RDMA_READ)
503
qp->s_psn = wqe->lpsn + 1;
504
else {
505
qp->s_psn++;
506
if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
507
qp->s_next_psn = qp->s_psn;
508
}
509
break;
510
511
case OP(RDMA_READ_RESPONSE_FIRST):
512
/*
513
* qp->s_state is normally set to the opcode of the
514
* last packet constructed for new requests and therefore
515
* is never set to RDMA read response.
516
* RDMA_READ_RESPONSE_FIRST is used by the ACK processing
517
* thread to indicate a SEND needs to be restarted from an
518
* earlier PSN without interfering with the sending thread.
519
* See qib_restart_rc().
520
*/
521
qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
522
/* FALLTHROUGH */
523
case OP(SEND_FIRST):
524
qp->s_state = OP(SEND_MIDDLE);
525
/* FALLTHROUGH */
526
case OP(SEND_MIDDLE):
527
bth2 = qp->s_psn++ & QIB_PSN_MASK;
528
if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
529
qp->s_next_psn = qp->s_psn;
530
ss = &qp->s_sge;
531
len = qp->s_len;
532
if (len > pmtu) {
533
len = pmtu;
534
break;
535
}
536
if (wqe->wr.opcode == IB_WR_SEND)
537
qp->s_state = OP(SEND_LAST);
538
else {
539
qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
540
/* Immediate data comes after the BTH */
541
ohdr->u.imm_data = wqe->wr.ex.imm_data;
542
hwords += 1;
543
}
544
if (wqe->wr.send_flags & IB_SEND_SOLICITED)
545
bth0 |= IB_BTH_SOLICITED;
546
bth2 |= IB_BTH_REQ_ACK;
547
qp->s_cur++;
548
if (qp->s_cur >= qp->s_size)
549
qp->s_cur = 0;
550
break;
551
552
case OP(RDMA_READ_RESPONSE_LAST):
553
/*
554
* qp->s_state is normally set to the opcode of the
555
* last packet constructed for new requests and therefore
556
* is never set to RDMA read response.
557
* RDMA_READ_RESPONSE_LAST is used by the ACK processing
558
* thread to indicate a RDMA write needs to be restarted from
559
* an earlier PSN without interfering with the sending thread.
560
* See qib_restart_rc().
561
*/
562
qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
563
/* FALLTHROUGH */
564
case OP(RDMA_WRITE_FIRST):
565
qp->s_state = OP(RDMA_WRITE_MIDDLE);
566
/* FALLTHROUGH */
567
case OP(RDMA_WRITE_MIDDLE):
568
bth2 = qp->s_psn++ & QIB_PSN_MASK;
569
if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
570
qp->s_next_psn = qp->s_psn;
571
ss = &qp->s_sge;
572
len = qp->s_len;
573
if (len > pmtu) {
574
len = pmtu;
575
break;
576
}
577
if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
578
qp->s_state = OP(RDMA_WRITE_LAST);
579
else {
580
qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
581
/* Immediate data comes after the BTH */
582
ohdr->u.imm_data = wqe->wr.ex.imm_data;
583
hwords += 1;
584
if (wqe->wr.send_flags & IB_SEND_SOLICITED)
585
bth0 |= IB_BTH_SOLICITED;
586
}
587
bth2 |= IB_BTH_REQ_ACK;
588
qp->s_cur++;
589
if (qp->s_cur >= qp->s_size)
590
qp->s_cur = 0;
591
break;
592
593
case OP(RDMA_READ_RESPONSE_MIDDLE):
594
/*
595
* qp->s_state is normally set to the opcode of the
596
* last packet constructed for new requests and therefore
597
* is never set to RDMA read response.
598
* RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
599
* thread to indicate a RDMA read needs to be restarted from
600
* an earlier PSN without interfering with the sending thread.
601
* See qib_restart_rc().
602
*/
603
len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
604
ohdr->u.rc.reth.vaddr =
605
cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
606
ohdr->u.rc.reth.rkey =
607
cpu_to_be32(wqe->wr.wr.rdma.rkey);
608
ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
609
qp->s_state = OP(RDMA_READ_REQUEST);
610
hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
611
bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
612
qp->s_psn = wqe->lpsn + 1;
613
ss = NULL;
614
len = 0;
615
qp->s_cur++;
616
if (qp->s_cur == qp->s_size)
617
qp->s_cur = 0;
618
break;
619
}
620
qp->s_sending_hpsn = bth2;
621
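/*
* Sign-extend the 24-bit PSN difference and request an ACK every
* QIB_PSN_CREDIT packets within a large request.
*/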
delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
622
if (delta && delta % QIB_PSN_CREDIT == 0)
623
bth2 |= IB_BTH_REQ_ACK;
624
if (qp->s_flags & QIB_S_SEND_ONE) {
625
qp->s_flags &= ~QIB_S_SEND_ONE;
626
qp->s_flags |= QIB_S_WAIT_ACK;
627
bth2 |= IB_BTH_REQ_ACK;
628
}
629
qp->s_len -= len;
630
qp->s_hdrwords = hwords;
631
qp->s_cur_sge = ss;
632
qp->s_cur_size = len;
633
qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
634
done:
635
ret = 1;
636
goto unlock;
637
638
bail:
639
qp->s_flags &= ~QIB_S_BUSY;
640
unlock:
641
spin_unlock_irqrestore(&qp->s_lock, flags);
642
return ret;
643
}
644
645
/**
646
* qib_send_rc_ack - Construct an ACK packet and send it
647
* @qp: a pointer to the QP
648
*
649
* This is called from qib_rc_rcv() and qib_kreceive().
650
* Note that RDMA reads and atomics are handled in the
651
* send side QP state and tasklet.
652
*/
653
void qib_send_rc_ack(struct qib_qp *qp)
654
{
655
struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
656
struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
657
struct qib_pportdata *ppd = ppd_from_ibp(ibp);
658
u64 pbc;
659
u16 lrh0;
660
u32 bth0;
661
u32 hwords;
662
u32 pbufn;
663
u32 __iomem *piobuf;
664
struct qib_ib_header hdr;
665
struct qib_other_headers *ohdr;
666
u32 control;
667
unsigned long flags;
668
669
spin_lock_irqsave(&qp->s_lock, flags);
670
671
if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
672
goto unlock;
673
674
/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
675
if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
676
goto queue_ack;
677
678
/* Construct the header with s_lock held so APM doesn't change it. */
679
ohdr = &hdr.u.oth;
680
lrh0 = QIB_LRH_BTH;
681
/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
682
hwords = 6;
683
if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
684
hwords += qib_make_grh(ibp, &hdr.u.l.grh,
685
&qp->remote_ah_attr.grh, hwords, 0);
686
ohdr = &hdr.u.l.oth;
687
lrh0 = QIB_LRH_GRH;
688
}
689
/* read pkey_index w/o lock (it's atomic) */
690
bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
691
if (qp->s_mig_state == IB_MIG_MIGRATED)
692
bth0 |= IB_BTH_MIG_REQ;
693
if (qp->r_nak_state)
694
ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
695
(qp->r_nak_state <<
696
QIB_AETH_CREDIT_SHIFT));
697
else
698
ohdr->u.aeth = qib_compute_aeth(qp);
699
lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
700
qp->remote_ah_attr.sl << 4;
701
hdr.lrh[0] = cpu_to_be16(lrh0);
702
hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
703
hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
704
hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
705
ohdr->bth[0] = cpu_to_be32(bth0);
706
ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
707
ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);
708
709
spin_unlock_irqrestore(&qp->s_lock, flags);
710
711
/* Don't try to send ACKs if the link isn't ACTIVE */
712
if (!(ppd->lflags & QIBL_LINKACTIVE))
713
goto done;
714
715
control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
716
qp->s_srate, lrh0 >> 12);
717
/* length is + 1 for the control dword */
718
pbc = ((u64) control << 32) | (hwords + 1);
719
720
piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
721
if (!piobuf) {
722
/*
723
* We are out of PIO buffers at the moment.
724
* Pass responsibility for sending the ACK to the
725
* send tasklet so that when a PIO buffer becomes
726
* available, the ACK is sent ahead of other outgoing
727
* packets.
728
*/
729
spin_lock_irqsave(&qp->s_lock, flags);
730
goto queue_ack;
731
}
732
733
/*
734
* Write the pbc.
735
* We have to flush after the PBC for correctness
736
* on some cpus or WC buffer can be written out of order.
737
*/
738
writeq(pbc, piobuf);
739
740
if (dd->flags & QIB_PIO_FLUSH_WC) {
741
u32 *hdrp = (u32 *) &hdr;
742
743
qib_flush_wc();
744
qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
745
qib_flush_wc();
746
__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
747
} else
748
qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);
749
750
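/*
* Some chips need a special trigger word written past the send
* buffer to launch the packet.
*/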
if (dd->flags & QIB_USE_SPCL_TRIG) {
751
u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
752
753
qib_flush_wc();
754
__raw_writel(0xaebecede, piobuf + spcl_off);
755
}
756
757
qib_flush_wc();
758
qib_sendbuf_done(dd, pbufn);
759
760
ibp->n_unicast_xmit++;
761
goto done;
762
763
queue_ack:
764
if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
765
ibp->n_rc_qacks++;
766
qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING;
767
qp->s_nak_state = qp->r_nak_state;
768
qp->s_ack_psn = qp->r_ack_psn;
769
770
/* Schedule the send tasklet. */
771
qib_schedule_send(qp);
772
}
773
unlock:
774
spin_unlock_irqrestore(&qp->s_lock, flags);
775
done:
776
return;
777
}
778
779
/**
780
* reset_psn - reset the QP state to send starting from PSN
781
* @qp: the QP
782
* @psn: the packet sequence number to restart at
783
*
784
* This is called from qib_rc_rcv() to process an incoming RC ACK
785
* for the given QP.
786
* Called at interrupt level with the QP s_lock held.
787
*/
788
static void reset_psn(struct qib_qp *qp, u32 psn)
789
{
790
u32 n = qp->s_acked;
791
struct qib_swqe *wqe = get_swqe_ptr(qp, n);
792
u32 opcode;
793
794
qp->s_cur = n;
795
796
/*
797
* If we are starting the request from the beginning,
798
* let the normal send code handle initialization.
799
*/
800
if (qib_cmp24(psn, wqe->psn) <= 0) {
801
qp->s_state = OP(SEND_LAST);
802
goto done;
803
}
804
805
/* Find the work request opcode corresponding to the given PSN. */
806
opcode = wqe->wr.opcode;
807
for (;;) {
808
int diff;
809
810
if (++n == qp->s_size)
811
n = 0;
812
if (n == qp->s_tail)
813
break;
814
wqe = get_swqe_ptr(qp, n);
815
diff = qib_cmp24(psn, wqe->psn);
816
if (diff < 0)
817
break;
818
qp->s_cur = n;
819
/*
820
* If we are starting the request from the beginning,
821
* let the normal send code handle initialization.
822
*/
823
if (diff == 0) {
824
qp->s_state = OP(SEND_LAST);
825
goto done;
826
}
827
opcode = wqe->wr.opcode;
828
}
829
830
/*
831
* Set the state to restart in the middle of a request.
832
* Don't change the s_sge, s_cur_sge, or s_cur_size.
833
* See qib_make_rc_req().
834
*/
835
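/*
* The RDMA_READ_RESPONSE_* opcodes are never generated on the
* requester side, so they double as restart markers here;
* qib_make_rc_req() decodes them (see the comments there).
*/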
switch (opcode) {
836
case IB_WR_SEND:
837
case IB_WR_SEND_WITH_IMM:
838
qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
839
break;
840
841
case IB_WR_RDMA_WRITE:
842
case IB_WR_RDMA_WRITE_WITH_IMM:
843
qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
844
break;
845
846
case IB_WR_RDMA_READ:
847
qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
848
break;
849
850
default:
851
/*
852
* This case shouldn't happen since it's only
853
* one PSN per req.
854
*/
855
qp->s_state = OP(SEND_LAST);
856
}
857
done:
858
qp->s_psn = psn;
859
/*
860
* Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer
861
* asynchronously before the send tasklet can get scheduled.
862
* Doing it in qib_make_rc_req() is too late.
863
*/
864
if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
865
(qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
866
qp->s_flags |= QIB_S_WAIT_PSN;
867
}
868
869
/*
870
* Back up requester to resend the last un-ACKed request.
871
* The QP r_lock and s_lock should be held and interrupts disabled.
872
*/
873
static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait)
874
{
875
struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
876
struct qib_ibport *ibp;
877
878
if (qp->s_retry == 0) {
879
if (qp->s_mig_state == IB_MIG_ARMED) {
880
qib_migrate_qp(qp);
881
qp->s_retry = qp->s_retry_cnt;
882
} else if (qp->s_last == qp->s_acked) {
883
qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
884
qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
885
return;
886
} else /* XXX need to handle delayed completion */
887
return;
888
} else
889
qp->s_retry--;
890
891
ibp = to_iport(qp->ibqp.device, qp->port_num);
892
if (wqe->wr.opcode == IB_WR_RDMA_READ)
893
ibp->n_rc_resends++;
894
else
895
ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
896
897
qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR |
898
QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN |
899
QIB_S_WAIT_ACK);
900
if (wait)
901
qp->s_flags |= QIB_S_SEND_ONE;
902
reset_psn(qp, psn);
903
}
904
905
/*
906
* This is called from s_timer for missing responses.
907
*/
908
static void rc_timeout(unsigned long arg)
909
{
910
struct qib_qp *qp = (struct qib_qp *)arg;
911
struct qib_ibport *ibp;
912
unsigned long flags;
913
914
spin_lock_irqsave(&qp->r_lock, flags);
915
spin_lock(&qp->s_lock);
916
if (qp->s_flags & QIB_S_TIMER) {
917
ibp = to_iport(qp->ibqp.device, qp->port_num);
918
ibp->n_rc_timeouts++;
919
qp->s_flags &= ~QIB_S_TIMER;
920
del_timer(&qp->s_timer);
921
qib_restart_rc(qp, qp->s_last_psn + 1, 1);
922
qib_schedule_send(qp);
923
}
924
spin_unlock(&qp->s_lock);
925
spin_unlock_irqrestore(&qp->r_lock, flags);
926
}
927
928
/*
929
* This is called from s_timer for RNR timeouts.
930
*/
931
void qib_rc_rnr_retry(unsigned long arg)
932
{
933
struct qib_qp *qp = (struct qib_qp *)arg;
934
unsigned long flags;
935
936
spin_lock_irqsave(&qp->s_lock, flags);
937
if (qp->s_flags & QIB_S_WAIT_RNR) {
938
qp->s_flags &= ~QIB_S_WAIT_RNR;
939
del_timer(&qp->s_timer);
940
qib_schedule_send(qp);
941
}
942
spin_unlock_irqrestore(&qp->s_lock, flags);
943
}
944
945
/*
946
* Set qp->s_sending_psn to the next PSN after the given one.
947
* This would be psn+1 except when RDMA reads are present.
948
*/
949
static void reset_sending_psn(struct qib_qp *qp, u32 psn)
950
{
951
struct qib_swqe *wqe;
952
u32 n = qp->s_last;
953
954
/* Find the work request corresponding to the given PSN. */
955
for (;;) {
956
wqe = get_swqe_ptr(qp, n);
957
if (qib_cmp24(psn, wqe->lpsn) <= 0) {
958
if (wqe->wr.opcode == IB_WR_RDMA_READ)
959
qp->s_sending_psn = wqe->lpsn + 1;
960
else
961
qp->s_sending_psn = psn + 1;
962
break;
963
}
964
if (++n == qp->s_size)
965
n = 0;
966
if (n == qp->s_tail)
967
break;
968
}
969
}
970
971
/*
972
* This should be called with the QP s_lock held and interrupts disabled.
973
*/
974
void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr)
975
{
976
struct qib_other_headers *ohdr;
977
struct qib_swqe *wqe;
978
struct ib_wc wc;
979
unsigned i;
980
u32 opcode;
981
u32 psn;
982
983
if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
984
return;
985
986
/* Find out where the BTH is */
987
if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
988
ohdr = &hdr->u.oth;
989
else
990
ohdr = &hdr->u.l.oth;
991
992
opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
993
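/*
* Completed response packets (RDMA read data, atomic and plain
* ACKs) only need to balance the s_rdma_ack_cnt taken in
* qib_make_rc_ack().
*/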
if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
994
opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
995
WARN_ON(!qp->s_rdma_ack_cnt);
996
qp->s_rdma_ack_cnt--;
997
return;
998
}
999
1000
psn = be32_to_cpu(ohdr->bth[2]);
1001
reset_sending_psn(qp, psn);
1002
1003
/*
1004
* Start timer after a packet requesting an ACK has been sent and
1005
* there are still requests that haven't been acked.
1006
*/
1007
if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1008
!(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) &&
1009
(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
1010
start_timer(qp);
1011
1012
while (qp->s_last != qp->s_acked) {
1013
wqe = get_swqe_ptr(qp, qp->s_last);
1014
if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1015
qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1016
break;
1017
for (i = 0; i < wqe->wr.num_sge; i++) {
1018
struct qib_sge *sge = &wqe->sg_list[i];
1019
1020
atomic_dec(&sge->mr->refcount);
1021
}
1022
/* Post a send completion queue entry if requested. */
1023
if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
1024
(wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1025
memset(&wc, 0, sizeof wc);
1026
wc.wr_id = wqe->wr.wr_id;
1027
wc.status = IB_WC_SUCCESS;
1028
wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
1029
wc.byte_len = wqe->length;
1030
wc.qp = &qp->ibqp;
1031
qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
1032
}
1033
if (++qp->s_last >= qp->s_size)
1034
qp->s_last = 0;
1035
}
1036
/*
1037
* If we were waiting for sends to complete before resending,
1038
* and they are now complete, restart sending.
1039
*/
1040
if (qp->s_flags & QIB_S_WAIT_PSN &&
1041
qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1042
qp->s_flags &= ~QIB_S_WAIT_PSN;
1043
qp->s_sending_psn = qp->s_psn;
1044
qp->s_sending_hpsn = qp->s_psn - 1;
1045
qib_schedule_send(qp);
1046
}
1047
}
1048
1049
static inline void update_last_psn(struct qib_qp *qp, u32 psn)
1050
{
1051
qp->s_last_psn = psn;
1052
}
1053
1054
/*
1055
* Generate a SWQE completion.
1056
* This is similar to qib_send_complete but has to check to be sure
1057
* that the SGEs are not being referenced if the SWQE is being resent.
1058
*/
1059
static struct qib_swqe *do_rc_completion(struct qib_qp *qp,
1060
struct qib_swqe *wqe,
1061
struct qib_ibport *ibp)
1062
{
1063
struct ib_wc wc;
1064
unsigned i;
1065
1066
/*
1067
* Don't decrement refcount and don't generate a
1068
* completion if the SWQE is being resent until the send
1069
* is finished.
1070
*/
1071
if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
1072
qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1073
for (i = 0; i < wqe->wr.num_sge; i++) {
1074
struct qib_sge *sge = &wqe->sg_list[i];
1075
1076
atomic_dec(&sge->mr->refcount);
1077
}
1078
/* Post a send completion queue entry if requested. */
1079
if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
1080
(wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1081
memset(&wc, 0, sizeof wc);
1082
wc.wr_id = wqe->wr.wr_id;
1083
wc.status = IB_WC_SUCCESS;
1084
wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
1085
wc.byte_len = wqe->length;
1086
wc.qp = &qp->ibqp;
1087
qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
1088
}
1089
if (++qp->s_last >= qp->s_size)
1090
qp->s_last = 0;
1091
} else
1092
ibp->n_rc_delayed_comp++;
1093
1094
qp->s_retry = qp->s_retry_cnt;
1095
update_last_psn(qp, wqe->lpsn);
1096
1097
/*
1098
* If we are completing a request which is in the process of
1099
* being resent, we can stop resending it since we know the
1100
* responder has already seen it.
1101
*/
1102
if (qp->s_acked == qp->s_cur) {
1103
if (++qp->s_cur >= qp->s_size)
1104
qp->s_cur = 0;
1105
qp->s_acked = qp->s_cur;
1106
wqe = get_swqe_ptr(qp, qp->s_cur);
1107
if (qp->s_acked != qp->s_tail) {
1108
qp->s_state = OP(SEND_LAST);
1109
qp->s_psn = wqe->psn;
1110
}
1111
} else {
1112
if (++qp->s_acked >= qp->s_size)
1113
qp->s_acked = 0;
1114
if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1115
qp->s_draining = 0;
1116
wqe = get_swqe_ptr(qp, qp->s_acked);
1117
}
1118
return wqe;
1119
}
1120
1121
/**
1122
* do_rc_ack - process an incoming RC ACK
1123
* @qp: the QP the ACK came in on
1124
* @psn: the packet sequence number of the ACK
1125
* @opcode: the opcode of the request that resulted in the ACK
1126
*
1127
* This is called from qib_rc_rcv_resp() to process an incoming RC ACK
1128
* for the given QP.
1129
* Called at interrupt level with the QP s_lock held.
1130
* Returns 1 if OK, 0 if current operation should be aborted (NAK).
1131
*/
1132
static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode,
1133
u64 val, struct qib_ctxtdata *rcd)
1134
{
1135
struct qib_ibport *ibp;
1136
enum ib_wc_status status;
1137
struct qib_swqe *wqe;
1138
int ret = 0;
1139
u32 ack_psn;
1140
int diff;
1141
1142
/* Remove QP from retry timer */
1143
if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
1144
qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
1145
del_timer(&qp->s_timer);
1146
}
1147
1148
/*
1149
* Note that NAKs implicitly ACK outstanding SEND and RDMA write
1150
* requests and implicitly NAK RDMA read and atomic requests issued
1151
* before the NAK'ed request. The MSN won't include the NAK'ed
1152
* request but will include any ACK'ed request(s).
1153
*/
1154
ack_psn = psn;
1155
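/*
* The top three bits of the AETH distinguish ACK (0), RNR NAK (1)
* and NAK (3); for NAKs the reported PSN has not itself completed,
* so back up by one before retiring WQEs.
*/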
if (aeth >> 29)
1156
ack_psn--;
1157
wqe = get_swqe_ptr(qp, qp->s_acked);
1158
ibp = to_iport(qp->ibqp.device, qp->port_num);
1159
1160
/*
1161
* The MSN might be for a later WQE than the PSN indicates so
1162
* only complete WQEs that the PSN finishes.
1163
*/
1164
while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
1165
/*
1166
* RDMA_READ_RESPONSE_ONLY is a special case since
1167
* we want to generate completion events for everything
1168
* before the RDMA read, copy the data, then generate
1169
* the completion for the read.
1170
*/
1171
if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1172
opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1173
diff == 0) {
1174
ret = 1;
1175
goto bail;
1176
}
1177
/*
1178
* If this request is a RDMA read or atomic, and the ACK is
1179
* for a later operation, this ACK NAKs the RDMA read or
1180
* atomic. In other words, only a RDMA_READ_LAST or ONLY
1181
* can ACK a RDMA read and likewise for atomic ops. Note
1182
* that the NAK case can only happen if relaxed ordering is
1183
* used and requests are sent after an RDMA read or atomic
1184
* is sent but before the response is received.
1185
*/
1186
if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1187
(opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1188
((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1189
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1190
(opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1191
/* Retry this request. */
1192
if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) {
1193
qp->r_flags |= QIB_R_RDMAR_SEQ;
1194
qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1195
if (list_empty(&qp->rspwait)) {
1196
qp->r_flags |= QIB_R_RSP_SEND;
1197
atomic_inc(&qp->refcount);
1198
list_add_tail(&qp->rspwait,
1199
&rcd->qp_wait_list);
1200
}
1201
}
1202
/*
1203
* No need to process the ACK/NAK since we are
1204
* restarting an earlier request.
1205
*/
1206
goto bail;
1207
}
1208
if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1209
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1210
u64 *vaddr = wqe->sg_list[0].vaddr;
1211
*vaddr = val;
1212
}
1213
if (qp->s_num_rd_atomic &&
1214
(wqe->wr.opcode == IB_WR_RDMA_READ ||
1215
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1216
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1217
qp->s_num_rd_atomic--;
1218
/* Restart sending task if fence is complete */
1219
if ((qp->s_flags & QIB_S_WAIT_FENCE) &&
1220
!qp->s_num_rd_atomic) {
1221
qp->s_flags &= ~(QIB_S_WAIT_FENCE |
1222
QIB_S_WAIT_ACK);
1223
qib_schedule_send(qp);
1224
} else if (qp->s_flags & QIB_S_WAIT_RDMAR) {
1225
qp->s_flags &= ~(QIB_S_WAIT_RDMAR |
1226
QIB_S_WAIT_ACK);
1227
qib_schedule_send(qp);
1228
}
1229
}
1230
wqe = do_rc_completion(qp, wqe, ibp);
1231
if (qp->s_acked == qp->s_tail)
1232
break;
1233
}
1234
1235
switch (aeth >> 29) {
1236
case 0: /* ACK */
1237
ibp->n_rc_acks++;
1238
if (qp->s_acked != qp->s_tail) {
1239
/*
1240
* We are expecting more ACKs so
1241
* reset the retransmit timer.
1242
*/
1243
start_timer(qp);
1244
/*
1245
* We can stop resending the earlier packets and
1246
* continue with the next packet the receiver wants.
1247
*/
1248
if (qib_cmp24(qp->s_psn, psn) <= 0)
1249
reset_psn(qp, psn + 1);
1250
} else if (qib_cmp24(qp->s_psn, psn) <= 0) {
1251
qp->s_state = OP(SEND_LAST);
1252
qp->s_psn = psn + 1;
1253
}
1254
if (qp->s_flags & QIB_S_WAIT_ACK) {
1255
qp->s_flags &= ~QIB_S_WAIT_ACK;
1256
qib_schedule_send(qp);
1257
}
1258
qib_get_credit(qp, aeth);
1259
qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1260
qp->s_retry = qp->s_retry_cnt;
1261
update_last_psn(qp, psn);
1262
ret = 1;
1263
goto bail;
1264
1265
case 1: /* RNR NAK */
1266
ibp->n_rnr_naks++;
1267
if (qp->s_acked == qp->s_tail)
1268
goto bail;
1269
if (qp->s_flags & QIB_S_WAIT_RNR)
1270
goto bail;
1271
if (qp->s_rnr_retry == 0) {
1272
status = IB_WC_RNR_RETRY_EXC_ERR;
1273
goto class_b;
1274
}
1275
if (qp->s_rnr_retry_cnt < 7)
1276
qp->s_rnr_retry--;
1277
1278
/* The last valid PSN is the previous PSN. */
1279
update_last_psn(qp, psn - 1);
1280
1281
ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
1282
1283
reset_psn(qp, psn);
1284
1285
qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK);
1286
qp->s_flags |= QIB_S_WAIT_RNR;
1287
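/*
* The AETH credit/syndrome field selects the RNR delay; look it
* up and arm the RNR retry timer.
*/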
qp->s_timer.function = qib_rc_rnr_retry;
1288
qp->s_timer.expires = jiffies + usecs_to_jiffies(
1289
ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) &
1290
QIB_AETH_CREDIT_MASK]);
1291
add_timer(&qp->s_timer);
1292
goto bail;
1293
1294
case 3: /* NAK */
1295
if (qp->s_acked == qp->s_tail)
1296
goto bail;
1297
/* The last valid PSN is the previous PSN. */
1298
update_last_psn(qp, psn - 1);
1299
switch ((aeth >> QIB_AETH_CREDIT_SHIFT) &
1300
QIB_AETH_CREDIT_MASK) {
1301
case 0: /* PSN sequence error */
1302
ibp->n_seq_naks++;
1303
/*
1304
* Back up to the responder's expected PSN.
1305
* Note that we might get a NAK in the middle of an
1306
* RDMA READ response which terminates the RDMA
1307
* READ.
1308
*/
1309
qib_restart_rc(qp, psn, 0);
1310
qib_schedule_send(qp);
1311
break;
1312
1313
case 1: /* Invalid Request */
1314
status = IB_WC_REM_INV_REQ_ERR;
1315
ibp->n_other_naks++;
1316
goto class_b;
1317
1318
case 2: /* Remote Access Error */
1319
status = IB_WC_REM_ACCESS_ERR;
1320
ibp->n_other_naks++;
1321
goto class_b;
1322
1323
case 3: /* Remote Operation Error */
1324
status = IB_WC_REM_OP_ERR;
1325
ibp->n_other_naks++;
1326
class_b:
1327
if (qp->s_last == qp->s_acked) {
1328
qib_send_complete(qp, wqe, status);
1329
qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1330
}
1331
break;
1332
1333
default:
1334
/* Ignore other reserved NAK error codes */
1335
goto reserved;
1336
}
1337
qp->s_retry = qp->s_retry_cnt;
1338
qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1339
goto bail;
1340
1341
default: /* 2: reserved */
1342
reserved:
1343
/* Ignore reserved NAK codes. */
1344
goto bail;
1345
}
1346
1347
bail:
1348
return ret;
1349
}
1350
1351
/*
1352
* We have seen an out of sequence RDMA read middle or last packet.
1353
* This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1354
*/
1355
static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn,
1356
struct qib_ctxtdata *rcd)
1357
{
1358
struct qib_swqe *wqe;
1359
1360
/* Remove QP from retry timer */
1361
if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
1362
qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
1363
del_timer(&qp->s_timer);
1364
}
1365
1366
wqe = get_swqe_ptr(qp, qp->s_acked);
1367
1368
while (qib_cmp24(psn, wqe->lpsn) > 0) {
1369
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1370
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1371
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1372
break;
1373
wqe = do_rc_completion(qp, wqe, ibp);
1374
}
1375
1376
ibp->n_rdma_seq++;
1377
qp->r_flags |= QIB_R_RDMAR_SEQ;
1378
qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1379
if (list_empty(&qp->rspwait)) {
1380
qp->r_flags |= QIB_R_RSP_SEND;
1381
atomic_inc(&qp->refcount);
1382
list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1383
}
1384
}
1385
1386
/**
1387
* qib_rc_rcv_resp - process an incoming RC response packet
1388
* @ibp: the port this packet came in on
1389
* @ohdr: the other headers for this packet
1390
* @data: the packet data
1391
* @tlen: the packet length
1392
* @qp: the QP for this packet
1393
* @opcode: the opcode for this packet
1394
* @psn: the packet sequence number for this packet
1395
* @hdrsize: the header length
1396
* @pmtu: the path MTU
1397
*
1398
* This is called from qib_rc_rcv() to process an incoming RC response
1399
* packet for the given QP.
1400
* Called at interrupt level.
1401
*/
1402
static void qib_rc_rcv_resp(struct qib_ibport *ibp,
1403
struct qib_other_headers *ohdr,
1404
void *data, u32 tlen,
1405
struct qib_qp *qp,
1406
u32 opcode,
1407
u32 psn, u32 hdrsize, u32 pmtu,
1408
struct qib_ctxtdata *rcd)
1409
{
1410
struct qib_swqe *wqe;
1411
struct qib_pportdata *ppd = ppd_from_ibp(ibp);
1412
enum ib_wc_status status;
1413
unsigned long flags;
1414
int diff;
1415
u32 pad;
1416
u32 aeth;
1417
u64 val;
1418
1419
if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
1420
/*
1421
* If ACK'd PSN on SDMA busy list try to make progress to
1422
* reclaim SDMA credits.
1423
*/
1424
if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
1425
(qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {
1426
1427
/*
1428
* If send tasklet not running attempt to progress
1429
* SDMA queue.
1430
*/
1431
if (!(qp->s_flags & QIB_S_BUSY)) {
1432
/* Acquire SDMA Lock */
1433
spin_lock_irqsave(&ppd->sdma_lock, flags);
1434
/* Invoke sdma make progress */
1435
qib_sdma_make_progress(ppd);
1436
/* Release SDMA Lock */
1437
spin_unlock_irqrestore(&ppd->sdma_lock, flags);
1438
}
1439
}
1440
}
1441
1442
spin_lock_irqsave(&qp->s_lock, flags);
1443
if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
1444
goto ack_done;
1445
1446
/* Ignore invalid responses. */
1447
if (qib_cmp24(psn, qp->s_next_psn) >= 0)
1448
goto ack_done;
1449
1450
/* Ignore duplicate responses. */
1451
diff = qib_cmp24(psn, qp->s_last_psn);
1452
if (unlikely(diff <= 0)) {
1453
/* Update credits for "ghost" ACKs */
1454
if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1455
aeth = be32_to_cpu(ohdr->u.aeth);
1456
if ((aeth >> 29) == 0)
1457
qib_get_credit(qp, aeth);
1458
}
1459
goto ack_done;
1460
}
1461
1462
/*
1463
* Skip everything other than the PSN we expect, if we are waiting
1464
* for a reply to a restarted RDMA read or atomic op.
1465
*/
1466
if (qp->r_flags & QIB_R_RDMAR_SEQ) {
1467
if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
1468
goto ack_done;
1469
qp->r_flags &= ~QIB_R_RDMAR_SEQ;
1470
}
1471
1472
if (unlikely(qp->s_acked == qp->s_tail))
1473
goto ack_done;
1474
wqe = get_swqe_ptr(qp, qp->s_acked);
1475
status = IB_WC_SUCCESS;
1476
1477
switch (opcode) {
1478
case OP(ACKNOWLEDGE):
1479
case OP(ATOMIC_ACKNOWLEDGE):
1480
case OP(RDMA_READ_RESPONSE_FIRST):
1481
aeth = be32_to_cpu(ohdr->u.aeth);
1482
if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
1483
__be32 *p = ohdr->u.at.atomic_ack_eth;
1484
1485
val = ((u64) be32_to_cpu(p[0]) << 32) |
1486
be32_to_cpu(p[1]);
1487
} else
1488
val = 0;
1489
if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1490
opcode != OP(RDMA_READ_RESPONSE_FIRST))
1491
goto ack_done;
1492
hdrsize += 4;
1493
wqe = get_swqe_ptr(qp, qp->s_acked);
1494
if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1495
goto ack_op_err;
1496
/*
1497
* If this is a response to a resent RDMA read, we
1498
* have to be careful to copy the data to the right
1499
* location.
1500
*/
1501
qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1502
wqe, psn, pmtu);
1503
goto read_middle;
1504
1505
case OP(RDMA_READ_RESPONSE_MIDDLE):
1506
/* no AETH, no ACK */
1507
if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1508
goto ack_seq_err;
1509
if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1510
goto ack_op_err;
1511
read_middle:
1512
if (unlikely(tlen != (hdrsize + pmtu + 4)))
1513
goto ack_len_err;
1514
if (unlikely(pmtu >= qp->s_rdma_read_len))
1515
goto ack_len_err;
1516
1517
/*
1518
* We got a response so update the timeout.
1519
* 4.096 usec. * (1 << qp->timeout)
1520
*/
1521
qp->s_flags |= QIB_S_TIMER;
1522
mod_timer(&qp->s_timer, jiffies +
1523
usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
1524
1000UL));
1525
if (qp->s_flags & QIB_S_WAIT_ACK) {
1526
qp->s_flags &= ~QIB_S_WAIT_ACK;
1527
qib_schedule_send(qp);
1528
}
1529
1530
if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1531
qp->s_retry = qp->s_retry_cnt;
1532
1533
/*
1534
* Update the RDMA receive state but do the copy w/o
1535
* holding the locks and blocking interrupts.
1536
*/
1537
qp->s_rdma_read_len -= pmtu;
1538
update_last_psn(qp, psn);
1539
spin_unlock_irqrestore(&qp->s_lock, flags);
1540
qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
1541
goto bail;
1542
1543
case OP(RDMA_READ_RESPONSE_ONLY):
1544
aeth = be32_to_cpu(ohdr->u.aeth);
1545
if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1546
goto ack_done;
1547
/* Get the number of bytes the message was padded by. */
1548
pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1549
/*
1550
* Check that the data size is >= 0 && <= pmtu.
1551
* Remember to account for the AETH header (4) and
1552
* ICRC (4).
1553
*/
1554
if (unlikely(tlen < (hdrsize + pad + 8)))
1555
goto ack_len_err;
1556
/*
1557
* If this is a response to a resent RDMA read, we
1558
* have to be careful to copy the data to the right
1559
* location.
1560
*/
1561
wqe = get_swqe_ptr(qp, qp->s_acked);
1562
qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1563
wqe, psn, pmtu);
1564
goto read_last;
1565
1566
case OP(RDMA_READ_RESPONSE_LAST):
1567
/* ACKs READ req. */
1568
if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1569
goto ack_seq_err;
1570
if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1571
goto ack_op_err;
1572
/* Get the number of bytes the message was padded by. */
1573
pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1574
/*
1575
* Check that the data size is >= 1 && <= pmtu.
1576
* Remember to account for the AETH header (4) and
1577
* ICRC (4).
1578
*/
1579
if (unlikely(tlen <= (hdrsize + pad + 8)))
1580
goto ack_len_err;
1581
read_last:
1582
tlen -= hdrsize + pad + 8;
1583
if (unlikely(tlen != qp->s_rdma_read_len))
1584
goto ack_len_err;
1585
aeth = be32_to_cpu(ohdr->u.aeth);
1586
qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
1587
WARN_ON(qp->s_rdma_read_sge.num_sge);
1588
(void) do_rc_ack(qp, aeth, psn,
1589
OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1590
goto ack_done;
1591
}
1592
1593
ack_op_err:
1594
status = IB_WC_LOC_QP_OP_ERR;
1595
goto ack_err;
1596
1597
ack_seq_err:
1598
rdma_seq_err(qp, ibp, psn, rcd);
1599
goto ack_done;
1600
1601
ack_len_err:
1602
status = IB_WC_LOC_LEN_ERR;
1603
ack_err:
1604
if (qp->s_last == qp->s_acked) {
1605
qib_send_complete(qp, wqe, status);
1606
qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1607
}
1608
ack_done:
1609
spin_unlock_irqrestore(&qp->s_lock, flags);
1610
bail:
1611
return;
1612
}
1613
1614
/**
1615
* qib_rc_rcv_error - process an incoming duplicate or error RC packet
1616
* @ohdr: the other headers for this packet
1617
* @data: the packet data
1618
* @qp: the QP for this packet
1619
* @opcode: the opcode for this packet
1620
* @psn: the packet sequence number for this packet
1621
* @diff: the difference between the PSN and the expected PSN
1622
*
1623
* This is called from qib_rc_rcv() to process an unexpected
1624
* incoming RC packet for the given QP.
1625
* Called at interrupt level.
1626
* Return 1 if no more processing is needed; otherwise return 0 to
1627
* schedule a response to be sent.
1628
*/
1629
static int qib_rc_rcv_error(struct qib_other_headers *ohdr,
1630
void *data,
1631
struct qib_qp *qp,
1632
u32 opcode,
1633
u32 psn,
1634
int diff,
1635
struct qib_ctxtdata *rcd)
1636
{
1637
struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1638
struct qib_ack_entry *e;
1639
unsigned long flags;
1640
u8 i, prev;
1641
int old_req;
1642
1643
if (diff > 0) {
1644
/*
1645
* Packet sequence error.
1646
* A NAK will ACK earlier sends and RDMA writes.
1647
* Don't queue the NAK if we already sent one.
1648
*/
1649
if (!qp->r_nak_state) {
1650
ibp->n_rc_seqnak++;
1651
qp->r_nak_state = IB_NAK_PSN_ERROR;
1652
/* Use the expected PSN. */
1653
qp->r_ack_psn = qp->r_psn;
1654
/*
1655
* Wait to send the sequence NAK until all packets
1656
* in the receive queue have been processed.
1657
* Otherwise, we end up propagating congestion.
1658
*/
1659
if (list_empty(&qp->rspwait)) {
1660
qp->r_flags |= QIB_R_RSP_NAK;
1661
atomic_inc(&qp->refcount);
1662
list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1663
}
1664
}
1665
goto done;
1666
}
1667
1668
/*
1669
* Handle a duplicate request. Don't re-execute SEND, RDMA
1670
* write or atomic op. Don't NAK errors, just silently drop
1671
* the duplicate request. Note that r_sge, r_len, and
1672
* r_rcv_len may be in use so don't modify them.
1673
*
1674
* We are supposed to ACK the earliest duplicate PSN but we
1675
* can coalesce an outstanding duplicate ACK. We have to
1676
* send the earliest so that RDMA reads can be restarted at
1677
* the requester's expected PSN.
1678
*
1679
* First, find where this duplicate PSN falls within the
1680
* ACKs previously sent.
1681
* old_req is true if there is an older response that is scheduled
1682
* to be sent before sending this one.
1683
*/
1684
e = NULL;
1685
old_req = 1;
1686
ibp->n_rc_dupreq++;
1687
1688
spin_lock_irqsave(&qp->s_lock, flags);
1689
1690
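/*
* Walk the ACK queue backwards from the newest entry to find the
* one whose PSN range covers this duplicate request.
*/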
for (i = qp->r_head_ack_queue; ; i = prev) {
1691
if (i == qp->s_tail_ack_queue)
1692
old_req = 0;
1693
if (i)
1694
prev = i - 1;
1695
else
1696
prev = QIB_MAX_RDMA_ATOMIC;
1697
if (prev == qp->r_head_ack_queue) {
1698
e = NULL;
1699
break;
1700
}
1701
e = &qp->s_ack_queue[prev];
1702
if (!e->opcode) {
1703
e = NULL;
1704
break;
1705
}
1706
if (qib_cmp24(psn, e->psn) >= 0) {
1707
if (prev == qp->s_tail_ack_queue &&
1708
qib_cmp24(psn, e->lpsn) <= 0)
1709
old_req = 0;
1710
break;
1711
}
1712
}
1713
switch (opcode) {
1714
case OP(RDMA_READ_REQUEST): {
1715
struct ib_reth *reth;
1716
u32 offset;
1717
u32 len;
1718
1719
/*
1720
* If we didn't find the RDMA read request in the ack queue,
1721
* we can ignore this request.
1722
*/
1723
if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1724
goto unlock_done;
1725
/* RETH comes after BTH */
1726
reth = &ohdr->u.rc.reth;
1727
/*
1728
* Address range must be a subset of the original
1729
* request and start on pmtu boundaries.
1730
* We reuse the old ack_queue slot since the requester
1731
* should not back up and request an earlier PSN for the
1732
* same request.
1733
*/
1734
offset = ((psn - e->psn) & QIB_PSN_MASK) *
1735
ib_mtu_enum_to_int(qp->path_mtu);
1736
len = be32_to_cpu(reth->length);
1737
if (unlikely(offset + len != e->rdma_sge.sge_length))
1738
goto unlock_done;
1739
if (e->rdma_sge.mr) {
1740
atomic_dec(&e->rdma_sge.mr->refcount);
1741
e->rdma_sge.mr = NULL;
1742
}
1743
if (len != 0) {
1744
u32 rkey = be32_to_cpu(reth->rkey);
1745
u64 vaddr = be64_to_cpu(reth->vaddr);
1746
int ok;
1747
1748
ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1749
IB_ACCESS_REMOTE_READ);
1750
if (unlikely(!ok))
1751
goto unlock_done;
1752
} else {
1753
e->rdma_sge.vaddr = NULL;
1754
e->rdma_sge.length = 0;
1755
e->rdma_sge.sge_length = 0;
1756
}
1757
e->psn = psn;
1758
if (old_req)
1759
goto unlock_done;
1760
qp->s_tail_ack_queue = prev;
1761
break;
1762
}
1763
1764
case OP(COMPARE_SWAP):
1765
case OP(FETCH_ADD): {
1766
/*
1767
* If we didn't find the atomic request in the ack queue
1768
* or the send tasklet is already backed up to send an
1769
* earlier entry, we can ignore this request.
1770
*/
1771
if (!e || e->opcode != (u8) opcode || old_req)
1772
goto unlock_done;
1773
qp->s_tail_ack_queue = prev;
1774
break;
1775
}
1776
1777
default:
1778
/*
1779
* Ignore this operation if it doesn't request an ACK
1780
* or an earlier RDMA read or atomic is going to be resent.
1781
*/
1782
if (!(psn & IB_BTH_REQ_ACK) || old_req)
1783
goto unlock_done;
1784
/*
1785
* Resend the most recent ACK if this request is
1786
* after all the previous RDMA reads and atomics.
1787
*/
1788
if (i == qp->r_head_ack_queue) {
1789
spin_unlock_irqrestore(&qp->s_lock, flags);
1790
qp->r_nak_state = 0;
1791
qp->r_ack_psn = qp->r_psn - 1;
1792
goto send_ack;
1793
}
1794
/*
1795
* Try to send a simple ACK to work around a Mellanox bug
1796
* which doesn't accept a RDMA read response or atomic
1797
* response as an ACK for earlier SENDs or RDMA writes.
1798
*/
1799
if (!(qp->s_flags & QIB_S_RESP_PENDING)) {
1800
spin_unlock_irqrestore(&qp->s_lock, flags);
1801
qp->r_nak_state = 0;
1802
qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
1803
goto send_ack;
1804
}
1805
/*
1806
* Resend the RDMA read or atomic op which
1807
* ACKs this duplicate request.
1808
*/
1809
qp->s_tail_ack_queue = i;
1810
break;
1811
}
1812
qp->s_ack_state = OP(ACKNOWLEDGE);
1813
qp->s_flags |= QIB_S_RESP_PENDING;
1814
qp->r_nak_state = 0;
1815
qib_schedule_send(qp);
1816
1817
unlock_done:
1818
spin_unlock_irqrestore(&qp->s_lock, flags);
1819
done:
1820
return 1;
1821
1822
send_ack:
1823
return 0;
1824
}
1825
1826
void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err)
1827
{
1828
unsigned long flags;
1829
int lastwqe;
1830
1831
spin_lock_irqsave(&qp->s_lock, flags);
1832
lastwqe = qib_error_qp(qp, err);
1833
spin_unlock_irqrestore(&qp->s_lock, flags);
1834
1835
if (lastwqe) {
1836
struct ib_event ev;
1837
1838
ev.device = qp->ibqp.device;
1839
ev.element.qp = &qp->ibqp;
1840
ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1841
qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1842
}
1843
}
1844
1845
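/*
* Advance the ACK queue tail past entry n, wrapping as needed,
* and reset the ACK state machine.
*/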
static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n)
1846
{
1847
unsigned next;
1848
1849
next = n + 1;
1850
if (next > QIB_MAX_RDMA_ATOMIC)
1851
next = 0;
1852
qp->s_tail_ack_queue = next;
1853
qp->s_ack_state = OP(ACKNOWLEDGE);
1854
}
1855
1856
/**
1857
* qib_rc_rcv - process an incoming RC packet
1858
* @rcd: the context pointer
1859
* @hdr: the header of this packet
1860
* @has_grh: true if the header has a GRH
1861
* @data: the packet data
1862
* @tlen: the packet length
1863
* @qp: the QP for this packet
1864
*
1865
* This is called from qib_qp_rcv() to process an incoming RC packet
1866
* for the given QP.
1867
* Called at interrupt level.
1868
*/
1869
void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
1870
int has_grh, void *data, u32 tlen, struct qib_qp *qp)
1871
{
1872
struct qib_ibport *ibp = &rcd->ppd->ibport_data;
1873
struct qib_other_headers *ohdr;
1874
u32 opcode;
1875
u32 hdrsize;
1876
u32 psn;
1877
u32 pad;
1878
struct ib_wc wc;
1879
u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
1880
int diff;
1881
struct ib_reth *reth;
1882
unsigned long flags;
1883
int ret;
1884
1885
	/* Check for GRH */
	if (!has_grh) {
		ohdr = &hdr->u.oth;
		hdrsize = 8 + 12;	/* LRH + BTH */
	} else {
		ohdr = &hdr->u.l.oth;
		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
	}

	opcode = be32_to_cpu(ohdr->bth[0]);
	spin_lock_irqsave(&qp->s_lock, flags);
	if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
		goto sunlock;
	spin_unlock_irqrestore(&qp->s_lock, flags);

	psn = be32_to_cpu(ohdr->bth[2]);
	opcode >>= 24;

	/*
	 * Process responses (ACKs) before anything else. Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
				hdrsize, pmtu, rcd);
		return;
	}

	/* Compute 24 bits worth of difference. */
	diff = qib_cmp24(psn, qp->r_psn);
	if (unlikely(diff)) {
		if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
			return;
		goto send_ack;
	}

	/* Check for opcode sequence errors. */
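	/*
	 * If a SEND or RDMA WRITE is already in progress (r_state is a
	 * FIRST or MIDDLE opcode), only the matching MIDDLE/LAST opcode
	 * may follow; otherwise any opcode that starts a new request is
	 * acceptable.
	 */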
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		/*
		 * Note that it is up to the requester to not send a new
		 * RDMA read or atomic operation before receiving an ACK
		 * for the previous operation.
		 */
		break;
	}

	memset(&wc, 0, sizeof wc);

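	/*
	 * The first packet accepted while the QP is in RTR establishes
	 * the connection from the responder's point of view, so report
	 * IB_EVENT_COMM_EST to the consumer once.
	 */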
	if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {
		qp->r_flags |= QIB_R_COMM_EST;
		if (qp->ibqp.event_handler) {
			struct ib_event ev;

			ev.device = qp->ibqp.device;
			ev.element.qp = &qp->ibqp;
			ev.event = IB_EVENT_COMM_EST;
			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
		}
	}

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		ret = qib_get_rwqe(qp, 0);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		qp->r_rcv_len = 0;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
send_middle:
		/*
		 * Check for an invalid length: a MIDDLE packet must carry
		 * exactly one PMTU of payload (tlen also counts the headers
		 * and the 4-byte ICRC), and the accumulated length must not
		 * overrun the posted rwqe length.
		 */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		ret = qib_get_rwqe(qp, 1);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
		ret = qib_get_rwqe(qp, 0);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto send_last;
		/* FALLTHROUGH */
	case OP(SEND_LAST_WITH_IMMEDIATE):
send_last_imm:
		wc.ex.imm_data = ohdr->u.imm_data;
		hdrsize += 4;
		wc.wc_flags = IB_WC_WITH_IMM;
		/* FALLTHROUGH */
	case OP(SEND_LAST):
	case OP(RDMA_WRITE_LAST):
send_last:
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/* Check for invalid length. */
		/* XXX LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
		tlen -= (hdrsize + pad + 4);
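		/*
		 * The total message length is this packet's payload plus
		 * whatever the earlier FIRST/MIDDLE packets delivered.
		 */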
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		qib_copy_sge(&qp->r_sge, data, tlen, 1);
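		/* Release the MR references held by the receive SGE list. */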
		while (qp->r_sge.num_sge) {
			atomic_dec(&qp->r_sge.sge.mr->refcount);
			if (--qp->r_sge.num_sge)
				qp->r_sge.sge = *qp->r_sge.sg_list++;
		}
		qp->r_msn++;
		if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
			break;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
		else
			wc.opcode = IB_WC_RECV;
		wc.qp = &qp->ibqp;
		wc.src_qp = qp->remote_qpn;
		wc.slid = qp->remote_ah_attr.dlid;
		wc.sl = qp->remote_ah_attr.sl;
		/* Signal completion event if the solicited bit is set. */
		qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
			     (ohdr->bth[0] &
			      cpu_to_be32(IB_BTH_SOLICITED)) != 0);
		break;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
			goto nack_inv;
		/* consume RWQE */
		reth = &ohdr->u.rc.reth;
		hdrsize += sizeof(*reth);
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		qp->r_sge.sg_list = NULL;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
					 rkey, IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto nack_acc;
			qp->r_sge.num_sge = 1;
		} else {
			qp->r_sge.num_sge = 0;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto send_last;
		ret = qib_get_rwqe(qp, 1);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		wc.ex.imm_data = ohdr->u.rc.imm_data;
		hdrsize += 4;
		wc.wc_flags = IB_WC_WITH_IMM;
		goto send_last;

	case OP(RDMA_READ_REQUEST): {
		struct qib_ack_entry *e;
		u32 len;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
			goto nack_inv;
		next = qp->r_head_ack_queue + 1;
		/* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
		if (next > QIB_MAX_RDMA_ATOMIC)
			next = 0;
		spin_lock_irqsave(&qp->s_lock, flags);
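		/*
		 * If the new entry would catch up with the tail, the ack
		 * queue is full: NAK the request unless the oldest entry
		 * has already been sent, in which case its slot can be
		 * reclaimed.
		 */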
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv_unlck;
			qib_update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
			atomic_dec(&e->rdma_sge.mr->refcount);
			e->rdma_sge.mr = NULL;
		}
		reth = &ohdr->u.rc.reth;
		len = be32_to_cpu(reth->length);
		if (len) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr,
					 rkey, IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto nack_acc_unlck;
			/*
			 * Update the next expected PSN. We add 1 later
			 * below, so only add the remainder here.
			 */
			if (len > pmtu)
				qp->r_psn += (len - 1) / pmtu;
		} else {
			e->rdma_sge.mr = NULL;
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
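		/*
		 * Queue the read request; lpsn is the PSN of the last
		 * response packet, which r_psn now points at.
		 */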
		e->opcode = opcode;
		e->sent = 0;
		e->psn = psn;
		e->lpsn = qp->r_psn;
		/*
		 * We need to increment the MSN here instead of when we
		 * finish sending the result since a duplicate request would
		 * increment it more than once.
		 */
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Schedule the send tasklet. */
		qp->s_flags |= QIB_S_RESP_PENDING;
		qib_schedule_send(qp);

		goto sunlock;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth;
		struct qib_ack_entry *e;
		u64 vaddr;
		atomic64_t *maddr;
		u64 sdata;
		u32 rkey;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_inv;
		next = qp->r_head_ack_queue + 1;
		if (next > QIB_MAX_RDMA_ATOMIC)
			next = 0;
		spin_lock_irqsave(&qp->s_lock, flags);
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv_unlck;
			qib_update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
			atomic_dec(&e->rdma_sge.mr->refcount);
			e->rdma_sge.mr = NULL;
		}
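		/*
		 * The 64-bit target address arrives as two big-endian
		 * 32-bit words and must be naturally aligned for the
		 * 64-bit atomic below.
		 */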
		ateth = &ohdr->u.atomic_eth;
		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
			be32_to_cpu(ateth->vaddr[1]);
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv_unlck;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
					  vaddr, rkey,
					  IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc_unlck;
		/* Perform atomic OP and save result. */
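		/*
		 * Both paths yield the target's value before the update:
		 * FETCH_ADD subtracts the addend from atomic64_add_return(),
		 * and cmpxchg() returns the prior contents whether or not
		 * the swap took place.  That value is stashed in the ack
		 * entry for the ATOMIC ACKNOWLEDGE response.
		 */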
		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
		sdata = be64_to_cpu(ateth->swap_data);
		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
			(u64) atomic64_add_return(sdata, maddr) - sdata :
			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
				      be64_to_cpu(ateth->compare_data),
				      sdata);
		atomic_dec(&qp->r_sge.sge.mr->refcount);
		qp->r_sge.num_sge = 0;
		e->opcode = opcode;
		e->sent = 0;
		e->psn = psn;
		e->lpsn = psn;
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Schedule the send tasklet. */
		qp->s_flags |= QIB_S_RESP_PENDING;
		qib_schedule_send(qp);

		goto sunlock;
	}

	default:
		/* NAK unknown opcodes. */
		goto nack_inv;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	qp->r_ack_psn = psn;
	qp->r_nak_state = 0;
	/* Send an ACK if requested or required. */
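	/* Bit 31 of the host-order bth[2] word is the BTH AckReq bit. */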
	if (psn & (1 << 31))
		goto send_ack;
	return;

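/*
 * rnr_nak, nack_op_err and nack_inv defer the response: the QP is
 * flagged and queued on the context's qp_wait_list with a reference
 * held so it cannot be freed before the NAK goes out.
 */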
rnr_nak:
	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
	qp->r_ack_psn = qp->r_psn;
	/* Queue RNR NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= QIB_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_op_err:
	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= QIB_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_inv_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_inv:
	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= QIB_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_acc_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_acc:
	qib_rc_error(qp, IB_WC_LOC_PROT_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;
send_ack:
	qib_send_rc_ack(qp);
	return;

sunlock:
	spin_unlock_irqrestore(&qp->s_lock, flags);
}