Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/sunrpc/xprtrdma/verbs.c
15111 views
1
/*
2
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3
*
4
* This software is available to you under a choice of one of two
5
* licenses. You may choose to be licensed under the terms of the GNU
6
* General Public License (GPL) Version 2, available from the file
7
* COPYING in the main directory of this source tree, or the BSD-type
8
* license below:
9
*
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
12
* are met:
13
*
14
* Redistributions of source code must retain the above copyright
15
* notice, this list of conditions and the following disclaimer.
16
*
17
* Redistributions in binary form must reproduce the above
18
* copyright notice, this list of conditions and the following
19
* disclaimer in the documentation and/or other materials provided
20
* with the distribution.
21
*
22
* Neither the name of the Network Appliance, Inc. nor the names of
23
* its contributors may be used to endorse or promote products
24
* derived from this software without specific prior written
25
* permission.
26
*
27
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
*/
39
40
/*
41
* verbs.c
42
*
43
* Encapsulates the major functions managing:
44
* o adapters
45
* o endpoints
46
* o connections
47
* o buffer memory
48
*/
49
50
#include <linux/pci.h> /* for Tavor hack below */
51
#include <linux/slab.h>
52
53
#include "xprt_rdma.h"
54
55
/*
56
* Globals/Macros
57
*/
58
59
#ifdef RPC_DEBUG
60
# define RPCDBG_FACILITY RPCDBG_TRANS
61
#endif
62
63
/*
64
* internal functions
65
*/
66
67
/*
68
* handle replies in tasklet context, using a single, global list
69
* rdma tasklet function -- just turn around and call the func
70
* for all replies on the list
71
*/
72
73
static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74
static LIST_HEAD(rpcrdma_tasklets_g);
75
76
static void
77
rpcrdma_run_tasklet(unsigned long data)
78
{
79
struct rpcrdma_rep *rep;
80
void (*func)(struct rpcrdma_rep *);
81
unsigned long flags;
82
83
data = data;
84
spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85
while (!list_empty(&rpcrdma_tasklets_g)) {
86
rep = list_entry(rpcrdma_tasklets_g.next,
87
struct rpcrdma_rep, rr_list);
88
list_del(&rep->rr_list);
89
func = rep->rr_func;
90
rep->rr_func = NULL;
91
spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93
if (func)
94
func(rep);
95
else
96
rpcrdma_recv_buffer_put(rep);
97
98
spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99
}
100
spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
101
}
102
103
static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
105
static inline void
106
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107
{
108
unsigned long flags;
109
110
spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
111
list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
112
spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
113
tasklet_schedule(&rpcrdma_tasklet_g);
114
}
115
116
static void
117
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118
{
119
struct rpcrdma_ep *ep = context;
120
121
dprintk("RPC: %s: QP error %X on device %s ep %p\n",
122
__func__, event->event, event->device->name, context);
123
if (ep->rep_connected == 1) {
124
ep->rep_connected = -EIO;
125
ep->rep_func(ep);
126
wake_up_all(&ep->rep_connect_wait);
127
}
128
}
129
130
static void
131
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132
{
133
struct rpcrdma_ep *ep = context;
134
135
dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
136
__func__, event->event, event->device->name, context);
137
if (ep->rep_connected == 1) {
138
ep->rep_connected = -EIO;
139
ep->rep_func(ep);
140
wake_up_all(&ep->rep_connect_wait);
141
}
142
}
143
144
static inline
145
void rpcrdma_event_process(struct ib_wc *wc)
146
{
147
struct rpcrdma_mw *frmr;
148
struct rpcrdma_rep *rep =
149
(struct rpcrdma_rep *)(unsigned long) wc->wr_id;
150
151
dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
152
__func__, rep, wc->status, wc->opcode, wc->byte_len);
153
154
if (!rep) /* send or bind completion that we don't care about */
155
return;
156
157
if (IB_WC_SUCCESS != wc->status) {
158
dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
159
__func__, wc->opcode, wc->status);
160
rep->rr_len = ~0U;
161
if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
162
rpcrdma_schedule_tasklet(rep);
163
return;
164
}
165
166
switch (wc->opcode) {
167
case IB_WC_FAST_REG_MR:
168
frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
169
frmr->r.frmr.state = FRMR_IS_VALID;
170
break;
171
case IB_WC_LOCAL_INV:
172
frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
173
frmr->r.frmr.state = FRMR_IS_INVALID;
174
break;
175
case IB_WC_RECV:
176
rep->rr_len = wc->byte_len;
177
ib_dma_sync_single_for_cpu(
178
rdmab_to_ia(rep->rr_buffer)->ri_id->device,
179
rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
180
/* Keep (only) the most recent credits, after check validity */
181
if (rep->rr_len >= 16) {
182
struct rpcrdma_msg *p =
183
(struct rpcrdma_msg *) rep->rr_base;
184
unsigned int credits = ntohl(p->rm_credit);
185
if (credits == 0) {
186
dprintk("RPC: %s: server"
187
" dropped credits to 0!\n", __func__);
188
/* don't deadlock */
189
credits = 1;
190
} else if (credits > rep->rr_buffer->rb_max_requests) {
191
dprintk("RPC: %s: server"
192
" over-crediting: %d (%d)\n",
193
__func__, credits,
194
rep->rr_buffer->rb_max_requests);
195
credits = rep->rr_buffer->rb_max_requests;
196
}
197
atomic_set(&rep->rr_buffer->rb_credits, credits);
198
}
199
/* fall through */
200
case IB_WC_BIND_MW:
201
rpcrdma_schedule_tasklet(rep);
202
break;
203
default:
204
dprintk("RPC: %s: unexpected WC event %X\n",
205
__func__, wc->opcode);
206
break;
207
}
208
}
209
210
static inline int
211
rpcrdma_cq_poll(struct ib_cq *cq)
212
{
213
struct ib_wc wc;
214
int rc;
215
216
for (;;) {
217
rc = ib_poll_cq(cq, 1, &wc);
218
if (rc < 0) {
219
dprintk("RPC: %s: ib_poll_cq failed %i\n",
220
__func__, rc);
221
return rc;
222
}
223
if (rc == 0)
224
break;
225
226
rpcrdma_event_process(&wc);
227
}
228
229
return 0;
230
}
231
232
/*
233
* rpcrdma_cq_event_upcall
234
*
235
* This upcall handles recv, send, bind and unbind events.
236
* It is reentrant but processes single events in order to maintain
237
* ordering of receives to keep server credits.
238
*
239
* It is the responsibility of the scheduled tasklet to return
240
* recv buffers to the pool. NOTE: this affects synchronization of
241
* connection shutdown. That is, the structures required for
242
* the completion of the reply handler must remain intact until
243
* all memory has been reclaimed.
244
*
245
* Note that send events are suppressed and do not result in an upcall.
246
*/
247
static void
248
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
249
{
250
int rc;
251
252
rc = rpcrdma_cq_poll(cq);
253
if (rc)
254
return;
255
256
rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
257
if (rc) {
258
dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
259
__func__, rc);
260
return;
261
}
262
263
rpcrdma_cq_poll(cq);
264
}
265
266
#ifdef RPC_DEBUG
/* Printable names for RDMA CM events; indexed by event->event. */
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

/*
 * rpcrdma_conn_upcall - RDMA connection manager event handler
 * @id: CM id; its context is the owning struct rpcrdma_xprt
 * @event: the CM event being delivered
 *
 * Address/route resolution events store a result in ia->ri_async_rc
 * and complete ia->ri_done. Connection state events reset the credit
 * count to 1, record the new state in ep->rep_connected (1 when
 * established, a negative errno otherwise), call ep->rep_func and wake
 * waiters on ep->rep_connect_wait.
 *
 * Always returns 0.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;
	int rc;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		rc = ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		if (rc) {
			dprintk("RPC: %s: ib_query_qp failed %i\n",
				__func__, rc);
			/*
			 * attr was not filled in; give the fields that
			 * are reported below a defined value.
			 */
			attr.max_dest_rd_atomic = 0;
			attr.max_rd_atomic = 0;
		}
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		/* any change of connection state restarts at one credit */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
380
381
static struct rdma_cm_id *
382
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
383
struct rpcrdma_ia *ia, struct sockaddr *addr)
384
{
385
struct rdma_cm_id *id;
386
int rc;
387
388
init_completion(&ia->ri_done);
389
390
id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
391
if (IS_ERR(id)) {
392
rc = PTR_ERR(id);
393
dprintk("RPC: %s: rdma_create_id() failed %i\n",
394
__func__, rc);
395
return id;
396
}
397
398
ia->ri_async_rc = -ETIMEDOUT;
399
rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
400
if (rc) {
401
dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
402
__func__, rc);
403
goto out;
404
}
405
wait_for_completion_interruptible_timeout(&ia->ri_done,
406
msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
407
rc = ia->ri_async_rc;
408
if (rc)
409
goto out;
410
411
ia->ri_async_rc = -ETIMEDOUT;
412
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
413
if (rc) {
414
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
415
__func__, rc);
416
goto out;
417
}
418
wait_for_completion_interruptible_timeout(&ia->ri_done,
419
msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
420
rc = ia->ri_async_rc;
421
if (rc)
422
goto out;
423
424
return id;
425
426
out:
427
rdma_destroy_id(id);
428
return ERR_PTR(rc);
429
}
430
431
/*
432
* Drain any cq, prior to teardown.
433
*/
434
static void
435
rpcrdma_clean_cq(struct ib_cq *cq)
436
{
437
struct ib_wc wc;
438
int count = 0;
439
440
while (1 == ib_poll_cq(cq, 1, &wc))
441
++count;
442
443
if (count)
444
dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
445
__func__, count, wc.opcode);
446
}
447
448
/*
449
* Exported functions.
450
*/
451
452
/*
453
* Open and initialize an Interface Adapter.
454
* o initializes fields of struct rpcrdma_ia, including
455
* interface and provider attributes and protection zone.
456
*/
457
int
458
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
459
{
460
int rc, mem_priv;
461
struct ib_device_attr devattr;
462
struct rpcrdma_ia *ia = &xprt->rx_ia;
463
464
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
465
if (IS_ERR(ia->ri_id)) {
466
rc = PTR_ERR(ia->ri_id);
467
goto out1;
468
}
469
470
ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
471
if (IS_ERR(ia->ri_pd)) {
472
rc = PTR_ERR(ia->ri_pd);
473
dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
474
__func__, rc);
475
goto out2;
476
}
477
478
/*
479
* Query the device to determine if the requested memory
480
* registration strategy is supported. If it isn't, set the
481
* strategy to a globally supported model.
482
*/
483
rc = ib_query_device(ia->ri_id->device, &devattr);
484
if (rc) {
485
dprintk("RPC: %s: ib_query_device failed %d\n",
486
__func__, rc);
487
goto out2;
488
}
489
490
if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
491
ia->ri_have_dma_lkey = 1;
492
ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
493
}
494
495
switch (memreg) {
496
case RPCRDMA_MEMWINDOWS:
497
case RPCRDMA_MEMWINDOWS_ASYNC:
498
if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
499
dprintk("RPC: %s: MEMWINDOWS registration "
500
"specified but not supported by adapter, "
501
"using slower RPCRDMA_REGISTER\n",
502
__func__);
503
memreg = RPCRDMA_REGISTER;
504
}
505
break;
506
case RPCRDMA_MTHCAFMR:
507
if (!ia->ri_id->device->alloc_fmr) {
508
#if RPCRDMA_PERSISTENT_REGISTRATION
509
dprintk("RPC: %s: MTHCAFMR registration "
510
"specified but not supported by adapter, "
511
"using riskier RPCRDMA_ALLPHYSICAL\n",
512
__func__);
513
memreg = RPCRDMA_ALLPHYSICAL;
514
#else
515
dprintk("RPC: %s: MTHCAFMR registration "
516
"specified but not supported by adapter, "
517
"using slower RPCRDMA_REGISTER\n",
518
__func__);
519
memreg = RPCRDMA_REGISTER;
520
#endif
521
}
522
break;
523
case RPCRDMA_FRMR:
524
/* Requires both frmr reg and local dma lkey */
525
if ((devattr.device_cap_flags &
526
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
527
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
528
#if RPCRDMA_PERSISTENT_REGISTRATION
529
dprintk("RPC: %s: FRMR registration "
530
"specified but not supported by adapter, "
531
"using riskier RPCRDMA_ALLPHYSICAL\n",
532
__func__);
533
memreg = RPCRDMA_ALLPHYSICAL;
534
#else
535
dprintk("RPC: %s: FRMR registration "
536
"specified but not supported by adapter, "
537
"using slower RPCRDMA_REGISTER\n",
538
__func__);
539
memreg = RPCRDMA_REGISTER;
540
#endif
541
}
542
break;
543
}
544
545
/*
546
* Optionally obtain an underlying physical identity mapping in
547
* order to do a memory window-based bind. This base registration
548
* is protected from remote access - that is enabled only by binding
549
* for the specific bytes targeted during each RPC operation, and
550
* revoked after the corresponding completion similar to a storage
551
* adapter.
552
*/
553
switch (memreg) {
554
case RPCRDMA_BOUNCEBUFFERS:
555
case RPCRDMA_REGISTER:
556
case RPCRDMA_FRMR:
557
break;
558
#if RPCRDMA_PERSISTENT_REGISTRATION
559
case RPCRDMA_ALLPHYSICAL:
560
mem_priv = IB_ACCESS_LOCAL_WRITE |
561
IB_ACCESS_REMOTE_WRITE |
562
IB_ACCESS_REMOTE_READ;
563
goto register_setup;
564
#endif
565
case RPCRDMA_MEMWINDOWS_ASYNC:
566
case RPCRDMA_MEMWINDOWS:
567
mem_priv = IB_ACCESS_LOCAL_WRITE |
568
IB_ACCESS_MW_BIND;
569
goto register_setup;
570
case RPCRDMA_MTHCAFMR:
571
if (ia->ri_have_dma_lkey)
572
break;
573
mem_priv = IB_ACCESS_LOCAL_WRITE;
574
register_setup:
575
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
576
if (IS_ERR(ia->ri_bind_mem)) {
577
printk(KERN_ALERT "%s: ib_get_dma_mr for "
578
"phys register failed with %lX\n\t"
579
"Will continue with degraded performance\n",
580
__func__, PTR_ERR(ia->ri_bind_mem));
581
memreg = RPCRDMA_REGISTER;
582
ia->ri_bind_mem = NULL;
583
}
584
break;
585
default:
586
printk(KERN_ERR "%s: invalid memory registration mode %d\n",
587
__func__, memreg);
588
rc = -EINVAL;
589
goto out2;
590
}
591
dprintk("RPC: %s: memory registration strategy is %d\n",
592
__func__, memreg);
593
594
/* Else will do memory reg/dereg for each chunk */
595
ia->ri_memreg_strategy = memreg;
596
597
return 0;
598
out2:
599
rdma_destroy_id(ia->ri_id);
600
ia->ri_id = NULL;
601
out1:
602
return rc;
603
}
604
605
/*
606
* Clean up/close an IA.
607
* o if event handles and PD have been initialized, free them.
608
* o close the IA
609
*/
610
void
611
rpcrdma_ia_close(struct rpcrdma_ia *ia)
612
{
613
int rc;
614
615
dprintk("RPC: %s: entering\n", __func__);
616
if (ia->ri_bind_mem != NULL) {
617
rc = ib_dereg_mr(ia->ri_bind_mem);
618
dprintk("RPC: %s: ib_dereg_mr returned %i\n",
619
__func__, rc);
620
}
621
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
622
if (ia->ri_id->qp)
623
rdma_destroy_qp(ia->ri_id);
624
rdma_destroy_id(ia->ri_id);
625
ia->ri_id = NULL;
626
}
627
if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
628
rc = ib_dealloc_pd(ia->ri_pd);
629
dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
630
__func__, rc);
631
}
632
}
633
634
/*
635
* Create unconnected endpoint.
636
*/
637
int
638
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
639
struct rpcrdma_create_data_internal *cdata)
640
{
641
struct ib_device_attr devattr;
642
int rc, err;
643
644
rc = ib_query_device(ia->ri_id->device, &devattr);
645
if (rc) {
646
dprintk("RPC: %s: ib_query_device failed %d\n",
647
__func__, rc);
648
return rc;
649
}
650
651
/* check provider's send/recv wr limits */
652
if (cdata->max_requests > devattr.max_qp_wr)
653
cdata->max_requests = devattr.max_qp_wr;
654
655
ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
656
ep->rep_attr.qp_context = ep;
657
/* send_cq and recv_cq initialized below */
658
ep->rep_attr.srq = NULL;
659
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
660
switch (ia->ri_memreg_strategy) {
661
case RPCRDMA_FRMR:
662
/* Add room for frmr register and invalidate WRs.
663
* 1. FRMR reg WR for head
664
* 2. FRMR invalidate WR for head
665
* 3. FRMR reg WR for pagelist
666
* 4. FRMR invalidate WR for pagelist
667
* 5. FRMR reg WR for tail
668
* 6. FRMR invalidate WR for tail
669
* 7. The RDMA_SEND WR
670
*/
671
ep->rep_attr.cap.max_send_wr *= 7;
672
if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
673
cdata->max_requests = devattr.max_qp_wr / 7;
674
if (!cdata->max_requests)
675
return -EINVAL;
676
ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
677
}
678
break;
679
case RPCRDMA_MEMWINDOWS_ASYNC:
680
case RPCRDMA_MEMWINDOWS:
681
/* Add room for mw_binds+unbinds - overkill! */
682
ep->rep_attr.cap.max_send_wr++;
683
ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
684
if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
685
return -EINVAL;
686
break;
687
default:
688
break;
689
}
690
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
691
ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
692
ep->rep_attr.cap.max_recv_sge = 1;
693
ep->rep_attr.cap.max_inline_data = 0;
694
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
695
ep->rep_attr.qp_type = IB_QPT_RC;
696
ep->rep_attr.port_num = ~0;
697
698
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
699
"iovs: send %d recv %d\n",
700
__func__,
701
ep->rep_attr.cap.max_send_wr,
702
ep->rep_attr.cap.max_recv_wr,
703
ep->rep_attr.cap.max_send_sge,
704
ep->rep_attr.cap.max_recv_sge);
705
706
/* set trigger for requesting send completion */
707
ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
708
switch (ia->ri_memreg_strategy) {
709
case RPCRDMA_MEMWINDOWS_ASYNC:
710
case RPCRDMA_MEMWINDOWS:
711
ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
712
break;
713
default:
714
break;
715
}
716
if (ep->rep_cqinit <= 2)
717
ep->rep_cqinit = 0;
718
INIT_CQCOUNT(ep);
719
ep->rep_ia = ia;
720
init_waitqueue_head(&ep->rep_connect_wait);
721
722
/*
723
* Create a single cq for receive dto and mw_bind (only ever
724
* care about unbind, really). Send completions are suppressed.
725
* Use single threaded tasklet upcalls to maintain ordering.
726
*/
727
ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
728
rpcrdma_cq_async_error_upcall, NULL,
729
ep->rep_attr.cap.max_recv_wr +
730
ep->rep_attr.cap.max_send_wr + 1, 0);
731
if (IS_ERR(ep->rep_cq)) {
732
rc = PTR_ERR(ep->rep_cq);
733
dprintk("RPC: %s: ib_create_cq failed: %i\n",
734
__func__, rc);
735
goto out1;
736
}
737
738
rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
739
if (rc) {
740
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
741
__func__, rc);
742
goto out2;
743
}
744
745
ep->rep_attr.send_cq = ep->rep_cq;
746
ep->rep_attr.recv_cq = ep->rep_cq;
747
748
/* Initialize cma parameters */
749
750
/* RPC/RDMA does not use private data */
751
ep->rep_remote_cma.private_data = NULL;
752
ep->rep_remote_cma.private_data_len = 0;
753
754
/* Client offers RDMA Read but does not initiate */
755
ep->rep_remote_cma.initiator_depth = 0;
756
if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
757
ep->rep_remote_cma.responder_resources = 0;
758
else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
759
ep->rep_remote_cma.responder_resources = 32;
760
else
761
ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
762
763
ep->rep_remote_cma.retry_count = 7;
764
ep->rep_remote_cma.flow_control = 0;
765
ep->rep_remote_cma.rnr_retry_count = 0;
766
767
return 0;
768
769
out2:
770
err = ib_destroy_cq(ep->rep_cq);
771
if (err)
772
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
773
__func__, err);
774
out1:
775
return rc;
776
}
777
778
/*
779
* rpcrdma_ep_destroy
780
*
781
* Disconnect and destroy endpoint. After this, the only
782
* valid operations on the ep are to free it (if dynamically
783
* allocated) or re-create it.
784
*
785
* The caller's error handling must be sure to not leak the endpoint
786
* if this function fails.
787
*/
788
int
789
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
790
{
791
int rc;
792
793
dprintk("RPC: %s: entering, connected is %d\n",
794
__func__, ep->rep_connected);
795
796
if (ia->ri_id->qp) {
797
rc = rpcrdma_ep_disconnect(ep, ia);
798
if (rc)
799
dprintk("RPC: %s: rpcrdma_ep_disconnect"
800
" returned %i\n", __func__, rc);
801
rdma_destroy_qp(ia->ri_id);
802
ia->ri_id->qp = NULL;
803
}
804
805
/* padding - could be done in rpcrdma_buffer_destroy... */
806
if (ep->rep_pad_mr) {
807
rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
808
ep->rep_pad_mr = NULL;
809
}
810
811
rpcrdma_clean_cq(ep->rep_cq);
812
rc = ib_destroy_cq(ep->rep_cq);
813
if (rc)
814
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
815
__func__, rc);
816
817
return rc;
818
}
819
820
/*
821
* Connect unconnected endpoint.
822
*/
823
int
824
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
825
{
826
struct rdma_cm_id *id;
827
int rc = 0;
828
int retry_count = 0;
829
830
if (ep->rep_connected != 0) {
831
struct rpcrdma_xprt *xprt;
832
retry:
833
rc = rpcrdma_ep_disconnect(ep, ia);
834
if (rc && rc != -ENOTCONN)
835
dprintk("RPC: %s: rpcrdma_ep_disconnect"
836
" status %i\n", __func__, rc);
837
rpcrdma_clean_cq(ep->rep_cq);
838
839
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
840
id = rpcrdma_create_id(xprt, ia,
841
(struct sockaddr *)&xprt->rx_data.addr);
842
if (IS_ERR(id)) {
843
rc = PTR_ERR(id);
844
goto out;
845
}
846
/* TEMP TEMP TEMP - fail if new device:
847
* Deregister/remarshal *all* requests!
848
* Close and recreate adapter, pd, etc!
849
* Re-determine all attributes still sane!
850
* More stuff I haven't thought of!
851
* Rrrgh!
852
*/
853
if (ia->ri_id->device != id->device) {
854
printk("RPC: %s: can't reconnect on "
855
"different device!\n", __func__);
856
rdma_destroy_id(id);
857
rc = -ENETDOWN;
858
goto out;
859
}
860
/* END TEMP */
861
rdma_destroy_qp(ia->ri_id);
862
rdma_destroy_id(ia->ri_id);
863
ia->ri_id = id;
864
}
865
866
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
867
if (rc) {
868
dprintk("RPC: %s: rdma_create_qp failed %i\n",
869
__func__, rc);
870
goto out;
871
}
872
873
/* XXX Tavor device performs badly with 2K MTU! */
874
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
875
struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
876
if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
877
(pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
878
pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
879
struct ib_qp_attr attr = {
880
.path_mtu = IB_MTU_1024
881
};
882
rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
883
}
884
}
885
886
ep->rep_connected = 0;
887
888
rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
889
if (rc) {
890
dprintk("RPC: %s: rdma_connect() failed with %i\n",
891
__func__, rc);
892
goto out;
893
}
894
895
wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
896
897
/*
898
* Check state. A non-peer reject indicates no listener
899
* (ECONNREFUSED), which may be a transient state. All
900
* others indicate a transport condition which has already
901
* undergone a best-effort.
902
*/
903
if (ep->rep_connected == -ECONNREFUSED &&
904
++retry_count <= RDMA_CONNECT_RETRY_MAX) {
905
dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
906
goto retry;
907
}
908
if (ep->rep_connected <= 0) {
909
/* Sometimes, the only way to reliably connect to remote
910
* CMs is to use same nonzero values for ORD and IRD. */
911
if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
912
(ep->rep_remote_cma.responder_resources == 0 ||
913
ep->rep_remote_cma.initiator_depth !=
914
ep->rep_remote_cma.responder_resources)) {
915
if (ep->rep_remote_cma.responder_resources == 0)
916
ep->rep_remote_cma.responder_resources = 1;
917
ep->rep_remote_cma.initiator_depth =
918
ep->rep_remote_cma.responder_resources;
919
goto retry;
920
}
921
rc = ep->rep_connected;
922
} else {
923
dprintk("RPC: %s: connected\n", __func__);
924
}
925
926
out:
927
if (rc)
928
ep->rep_connected = rc;
929
return rc;
930
}
931
932
/*
933
* rpcrdma_ep_disconnect
934
*
935
* This is separate from destroy to facilitate the ability
936
* to reconnect without recreating the endpoint.
937
*
938
* This call is not reentrant, and must not be made in parallel
939
* on the same endpoint.
940
*/
941
int
942
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
943
{
944
int rc;
945
946
rpcrdma_clean_cq(ep->rep_cq);
947
rc = rdma_disconnect(ia->ri_id);
948
if (!rc) {
949
/* returns without wait if not connected */
950
wait_event_interruptible(ep->rep_connect_wait,
951
ep->rep_connected != 1);
952
dprintk("RPC: %s: after wait, %sconnected\n", __func__,
953
(ep->rep_connected == 1) ? "still " : "dis");
954
} else {
955
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
956
ep->rep_connected = rc;
957
}
958
return rc;
959
}
960
961
/*
962
* Initialize buffer memory
963
*/
964
int
965
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
966
struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
967
{
968
char *p;
969
size_t len;
970
int i, rc;
971
struct rpcrdma_mw *r;
972
973
buf->rb_max_requests = cdata->max_requests;
974
spin_lock_init(&buf->rb_lock);
975
atomic_set(&buf->rb_credits, 1);
976
977
/* Need to allocate:
978
* 1. arrays for send and recv pointers
979
* 2. arrays of struct rpcrdma_req to fill in pointers
980
* 3. array of struct rpcrdma_rep for replies
981
* 4. padding, if any
982
* 5. mw's, fmr's or frmr's, if any
983
* Send/recv buffers in req/rep need to be registered
984
*/
985
986
len = buf->rb_max_requests *
987
(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
988
len += cdata->padding;
989
switch (ia->ri_memreg_strategy) {
990
case RPCRDMA_FRMR:
991
len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
992
sizeof(struct rpcrdma_mw);
993
break;
994
case RPCRDMA_MTHCAFMR:
995
/* TBD we are perhaps overallocating here */
996
len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
997
sizeof(struct rpcrdma_mw);
998
break;
999
case RPCRDMA_MEMWINDOWS_ASYNC:
1000
case RPCRDMA_MEMWINDOWS:
1001
len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1002
sizeof(struct rpcrdma_mw);
1003
break;
1004
default:
1005
break;
1006
}
1007
1008
/* allocate 1, 4 and 5 in one shot */
1009
p = kzalloc(len, GFP_KERNEL);
1010
if (p == NULL) {
1011
dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1012
__func__, len);
1013
rc = -ENOMEM;
1014
goto out;
1015
}
1016
buf->rb_pool = p; /* for freeing it later */
1017
1018
buf->rb_send_bufs = (struct rpcrdma_req **) p;
1019
p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1020
buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1021
p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1022
1023
/*
1024
* Register the zeroed pad buffer, if any.
1025
*/
1026
if (cdata->padding) {
1027
rc = rpcrdma_register_internal(ia, p, cdata->padding,
1028
&ep->rep_pad_mr, &ep->rep_pad);
1029
if (rc)
1030
goto out;
1031
}
1032
p += cdata->padding;
1033
1034
/*
1035
* Allocate the fmr's, or mw's for mw_bind chunk registration.
1036
* We "cycle" the mw's in order to minimize rkey reuse,
1037
* and also reduce unbind-to-bind collision.
1038
*/
1039
INIT_LIST_HEAD(&buf->rb_mws);
1040
r = (struct rpcrdma_mw *)p;
1041
switch (ia->ri_memreg_strategy) {
1042
case RPCRDMA_FRMR:
1043
for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1044
r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1045
RPCRDMA_MAX_SEGS);
1046
if (IS_ERR(r->r.frmr.fr_mr)) {
1047
rc = PTR_ERR(r->r.frmr.fr_mr);
1048
dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1049
" failed %i\n", __func__, rc);
1050
goto out;
1051
}
1052
r->r.frmr.fr_pgl =
1053
ib_alloc_fast_reg_page_list(ia->ri_id->device,
1054
RPCRDMA_MAX_SEGS);
1055
if (IS_ERR(r->r.frmr.fr_pgl)) {
1056
rc = PTR_ERR(r->r.frmr.fr_pgl);
1057
dprintk("RPC: %s: "
1058
"ib_alloc_fast_reg_page_list "
1059
"failed %i\n", __func__, rc);
1060
goto out;
1061
}
1062
list_add(&r->mw_list, &buf->rb_mws);
1063
++r;
1064
}
1065
break;
1066
case RPCRDMA_MTHCAFMR:
1067
/* TBD we are perhaps overallocating here */
1068
for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1069
static struct ib_fmr_attr fa =
1070
{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1071
r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1072
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1073
&fa);
1074
if (IS_ERR(r->r.fmr)) {
1075
rc = PTR_ERR(r->r.fmr);
1076
dprintk("RPC: %s: ib_alloc_fmr"
1077
" failed %i\n", __func__, rc);
1078
goto out;
1079
}
1080
list_add(&r->mw_list, &buf->rb_mws);
1081
++r;
1082
}
1083
break;
1084
case RPCRDMA_MEMWINDOWS_ASYNC:
1085
case RPCRDMA_MEMWINDOWS:
1086
/* Allocate one extra request's worth, for full cycling */
1087
for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1088
r->r.mw = ib_alloc_mw(ia->ri_pd);
1089
if (IS_ERR(r->r.mw)) {
1090
rc = PTR_ERR(r->r.mw);
1091
dprintk("RPC: %s: ib_alloc_mw"
1092
" failed %i\n", __func__, rc);
1093
goto out;
1094
}
1095
list_add(&r->mw_list, &buf->rb_mws);
1096
++r;
1097
}
1098
break;
1099
default:
1100
break;
1101
}
1102
1103
/*
1104
* Allocate/init the request/reply buffers. Doing this
1105
* using kmalloc for now -- one for each buf.
1106
*/
1107
for (i = 0; i < buf->rb_max_requests; i++) {
1108
struct rpcrdma_req *req;
1109
struct rpcrdma_rep *rep;
1110
1111
len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1112
/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1113
/* Typical ~2400b, so rounding up saves work later */
1114
if (len < 4096)
1115
len = 4096;
1116
req = kmalloc(len, GFP_KERNEL);
1117
if (req == NULL) {
1118
dprintk("RPC: %s: request buffer %d alloc"
1119
" failed\n", __func__, i);
1120
rc = -ENOMEM;
1121
goto out;
1122
}
1123
memset(req, 0, sizeof(struct rpcrdma_req));
1124
buf->rb_send_bufs[i] = req;
1125
buf->rb_send_bufs[i]->rl_buffer = buf;
1126
1127
rc = rpcrdma_register_internal(ia, req->rl_base,
1128
len - offsetof(struct rpcrdma_req, rl_base),
1129
&buf->rb_send_bufs[i]->rl_handle,
1130
&buf->rb_send_bufs[i]->rl_iov);
1131
if (rc)
1132
goto out;
1133
1134
buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1135
1136
len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1137
rep = kmalloc(len, GFP_KERNEL);
1138
if (rep == NULL) {
1139
dprintk("RPC: %s: reply buffer %d alloc failed\n",
1140
__func__, i);
1141
rc = -ENOMEM;
1142
goto out;
1143
}
1144
memset(rep, 0, sizeof(struct rpcrdma_rep));
1145
buf->rb_recv_bufs[i] = rep;
1146
buf->rb_recv_bufs[i]->rr_buffer = buf;
1147
init_waitqueue_head(&rep->rr_unbind);
1148
1149
rc = rpcrdma_register_internal(ia, rep->rr_base,
1150
len - offsetof(struct rpcrdma_rep, rr_base),
1151
&buf->rb_recv_bufs[i]->rr_handle,
1152
&buf->rb_recv_bufs[i]->rr_iov);
1153
if (rc)
1154
goto out;
1155
1156
}
1157
dprintk("RPC: %s: max_requests %d\n",
1158
__func__, buf->rb_max_requests);
1159
/* done */
1160
return 0;
1161
out:
1162
rpcrdma_buffer_destroy(buf);
1163
return rc;
1164
}
1165
1166
/*
1167
* Unregister and destroy buffer memory. Need to deal with
1168
* partial initialization, so it's callable from failed create.
1169
* Must be called before destroying endpoint, as registrations
1170
* reference it.
1171
*/
1172
void
1173
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1174
{
1175
int rc, i;
1176
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1177
struct rpcrdma_mw *r;
1178
1179
/* clean up in reverse order from create
1180
* 1. recv mr memory (mr free, then kfree)
1181
* 1a. bind mw memory
1182
* 2. send mr memory (mr free, then kfree)
1183
* 3. padding (if any) [moved to rpcrdma_ep_destroy]
1184
* 4. arrays
1185
*/
1186
dprintk("RPC: %s: entering\n", __func__);
1187
1188
for (i = 0; i < buf->rb_max_requests; i++) {
1189
if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1190
rpcrdma_deregister_internal(ia,
1191
buf->rb_recv_bufs[i]->rr_handle,
1192
&buf->rb_recv_bufs[i]->rr_iov);
1193
kfree(buf->rb_recv_bufs[i]);
1194
}
1195
if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1196
while (!list_empty(&buf->rb_mws)) {
1197
r = list_entry(buf->rb_mws.next,
1198
struct rpcrdma_mw, mw_list);
1199
list_del(&r->mw_list);
1200
switch (ia->ri_memreg_strategy) {
1201
case RPCRDMA_FRMR:
1202
rc = ib_dereg_mr(r->r.frmr.fr_mr);
1203
if (rc)
1204
dprintk("RPC: %s:"
1205
" ib_dereg_mr"
1206
" failed %i\n",
1207
__func__, rc);
1208
ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1209
break;
1210
case RPCRDMA_MTHCAFMR:
1211
rc = ib_dealloc_fmr(r->r.fmr);
1212
if (rc)
1213
dprintk("RPC: %s:"
1214
" ib_dealloc_fmr"
1215
" failed %i\n",
1216
__func__, rc);
1217
break;
1218
case RPCRDMA_MEMWINDOWS_ASYNC:
1219
case RPCRDMA_MEMWINDOWS:
1220
rc = ib_dealloc_mw(r->r.mw);
1221
if (rc)
1222
dprintk("RPC: %s:"
1223
" ib_dealloc_mw"
1224
" failed %i\n",
1225
__func__, rc);
1226
break;
1227
default:
1228
break;
1229
}
1230
}
1231
rpcrdma_deregister_internal(ia,
1232
buf->rb_send_bufs[i]->rl_handle,
1233
&buf->rb_send_bufs[i]->rl_iov);
1234
kfree(buf->rb_send_bufs[i]);
1235
}
1236
}
1237
1238
kfree(buf->rb_pool);
1239
}
1240
1241
/*
1242
* Get a set of request/reply buffers.
1243
*
1244
* Reply buffer (if needed) is attached to send buffer upon return.
1245
* Rule:
1246
* rb_send_index and rb_recv_index MUST always be pointing to the
1247
* *next* available buffer (non-NULL). They are incremented after
1248
* removing buffers, and decremented *before* returning them.
1249
*/
1250
struct rpcrdma_req *
1251
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1252
{
1253
struct rpcrdma_req *req;
1254
unsigned long flags;
1255
int i;
1256
struct rpcrdma_mw *r;
1257
1258
spin_lock_irqsave(&buffers->rb_lock, flags);
1259
if (buffers->rb_send_index == buffers->rb_max_requests) {
1260
spin_unlock_irqrestore(&buffers->rb_lock, flags);
1261
dprintk("RPC: %s: out of request buffers\n", __func__);
1262
return ((struct rpcrdma_req *)NULL);
1263
}
1264
1265
req = buffers->rb_send_bufs[buffers->rb_send_index];
1266
if (buffers->rb_send_index < buffers->rb_recv_index) {
1267
dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1268
__func__,
1269
buffers->rb_recv_index - buffers->rb_send_index);
1270
req->rl_reply = NULL;
1271
} else {
1272
req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1273
buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1274
}
1275
buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1276
if (!list_empty(&buffers->rb_mws)) {
1277
i = RPCRDMA_MAX_SEGS - 1;
1278
do {
1279
r = list_entry(buffers->rb_mws.next,
1280
struct rpcrdma_mw, mw_list);
1281
list_del(&r->mw_list);
1282
req->rl_segments[i].mr_chunk.rl_mw = r;
1283
} while (--i >= 0);
1284
}
1285
spin_unlock_irqrestore(&buffers->rb_lock, flags);
1286
return req;
1287
}
1288
1289
/*
1290
* Put request/reply buffers back into pool.
1291
* Pre-decrement counter/array index.
1292
*/
1293
void
1294
rpcrdma_buffer_put(struct rpcrdma_req *req)
1295
{
1296
struct rpcrdma_buffer *buffers = req->rl_buffer;
1297
struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1298
int i;
1299
unsigned long flags;
1300
1301
BUG_ON(req->rl_nchunks != 0);
1302
spin_lock_irqsave(&buffers->rb_lock, flags);
1303
buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1304
req->rl_niovs = 0;
1305
if (req->rl_reply) {
1306
buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1307
init_waitqueue_head(&req->rl_reply->rr_unbind);
1308
req->rl_reply->rr_func = NULL;
1309
req->rl_reply = NULL;
1310
}
1311
switch (ia->ri_memreg_strategy) {
1312
case RPCRDMA_FRMR:
1313
case RPCRDMA_MTHCAFMR:
1314
case RPCRDMA_MEMWINDOWS_ASYNC:
1315
case RPCRDMA_MEMWINDOWS:
1316
/*
1317
* Cycle mw's back in reverse order, and "spin" them.
1318
* This delays and scrambles reuse as much as possible.
1319
*/
1320
i = 1;
1321
do {
1322
struct rpcrdma_mw **mw;
1323
mw = &req->rl_segments[i].mr_chunk.rl_mw;
1324
list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1325
*mw = NULL;
1326
} while (++i < RPCRDMA_MAX_SEGS);
1327
list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1328
&buffers->rb_mws);
1329
req->rl_segments[0].mr_chunk.rl_mw = NULL;
1330
break;
1331
default:
1332
break;
1333
}
1334
spin_unlock_irqrestore(&buffers->rb_lock, flags);
1335
}
1336
1337
/*
1338
* Recover reply buffers from pool.
1339
* This happens when recovering from error conditions.
1340
* Post-increment counter/array index.
1341
*/
1342
void
1343
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1344
{
1345
struct rpcrdma_buffer *buffers = req->rl_buffer;
1346
unsigned long flags;
1347
1348
if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1349
buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1350
spin_lock_irqsave(&buffers->rb_lock, flags);
1351
if (buffers->rb_recv_index < buffers->rb_max_requests) {
1352
req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1353
buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1354
}
1355
spin_unlock_irqrestore(&buffers->rb_lock, flags);
1356
}
1357
1358
/*
1359
* Put reply buffers back into pool when not attached to
1360
* request. This happens in error conditions, and when
1361
* aborting unbinds. Pre-decrement counter/array index.
1362
*/
1363
void
1364
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1365
{
1366
struct rpcrdma_buffer *buffers = rep->rr_buffer;
1367
unsigned long flags;
1368
1369
rep->rr_func = NULL;
1370
spin_lock_irqsave(&buffers->rb_lock, flags);
1371
buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1372
spin_unlock_irqrestore(&buffers->rb_lock, flags);
1373
}
1374
1375
/*
1376
* Wrappers for internal-use kmalloc memory registration, used by buffer code.
1377
*/
1378
1379
int
1380
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1381
struct ib_mr **mrp, struct ib_sge *iov)
1382
{
1383
struct ib_phys_buf ipb;
1384
struct ib_mr *mr;
1385
int rc;
1386
1387
/*
1388
* All memory passed here was kmalloc'ed, therefore phys-contiguous.
1389
*/
1390
iov->addr = ib_dma_map_single(ia->ri_id->device,
1391
va, len, DMA_BIDIRECTIONAL);
1392
iov->length = len;
1393
1394
if (ia->ri_have_dma_lkey) {
1395
*mrp = NULL;
1396
iov->lkey = ia->ri_dma_lkey;
1397
return 0;
1398
} else if (ia->ri_bind_mem != NULL) {
1399
*mrp = NULL;
1400
iov->lkey = ia->ri_bind_mem->lkey;
1401
return 0;
1402
}
1403
1404
ipb.addr = iov->addr;
1405
ipb.size = iov->length;
1406
mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1407
IB_ACCESS_LOCAL_WRITE, &iov->addr);
1408
1409
dprintk("RPC: %s: phys convert: 0x%llx "
1410
"registered 0x%llx length %d\n",
1411
__func__, (unsigned long long)ipb.addr,
1412
(unsigned long long)iov->addr, len);
1413
1414
if (IS_ERR(mr)) {
1415
*mrp = NULL;
1416
rc = PTR_ERR(mr);
1417
dprintk("RPC: %s: failed with %i\n", __func__, rc);
1418
} else {
1419
*mrp = mr;
1420
iov->lkey = mr->lkey;
1421
rc = 0;
1422
}
1423
1424
return rc;
1425
}
1426
1427
int
1428
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1429
struct ib_mr *mr, struct ib_sge *iov)
1430
{
1431
int rc;
1432
1433
ib_dma_unmap_single(ia->ri_id->device,
1434
iov->addr, iov->length, DMA_BIDIRECTIONAL);
1435
1436
if (NULL == mr)
1437
return 0;
1438
1439
rc = ib_dereg_mr(mr);
1440
if (rc)
1441
dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1442
return rc;
1443
}
1444
1445
/*
1446
* Wrappers for chunk registration, shared by read/write chunk code.
1447
*/
1448
1449
static void
1450
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1451
{
1452
seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1453
seg->mr_dmalen = seg->mr_len;
1454
if (seg->mr_page)
1455
seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1456
seg->mr_page, offset_in_page(seg->mr_offset),
1457
seg->mr_dmalen, seg->mr_dir);
1458
else
1459
seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1460
seg->mr_offset,
1461
seg->mr_dmalen, seg->mr_dir);
1462
if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1463
dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1464
__func__,
1465
(unsigned long long)seg->mr_dma,
1466
seg->mr_offset, seg->mr_dmalen);
1467
}
1468
}
1469
1470
static void
1471
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1472
{
1473
if (seg->mr_page)
1474
ib_dma_unmap_page(ia->ri_id->device,
1475
seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1476
else
1477
ib_dma_unmap_single(ia->ri_id->device,
1478
seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1479
}
1480
1481
static int
1482
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1483
int *nsegs, int writing, struct rpcrdma_ia *ia,
1484
struct rpcrdma_xprt *r_xprt)
1485
{
1486
struct rpcrdma_mr_seg *seg1 = seg;
1487
struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1488
1489
u8 key;
1490
int len, pageoff;
1491
int i, rc;
1492
1493
pageoff = offset_in_page(seg1->mr_offset);
1494
seg1->mr_offset -= pageoff; /* start of page */
1495
seg1->mr_len += pageoff;
1496
len = -pageoff;
1497
if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1498
*nsegs = RPCRDMA_MAX_DATA_SEGS;
1499
for (i = 0; i < *nsegs;) {
1500
rpcrdma_map_one(ia, seg, writing);
1501
seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1502
len += seg->mr_len;
1503
BUG_ON(seg->mr_len > PAGE_SIZE);
1504
++seg;
1505
++i;
1506
/* Check for holes */
1507
if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1508
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1509
break;
1510
}
1511
dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1512
__func__, seg1->mr_chunk.rl_mw, i);
1513
1514
if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1515
dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1516
__func__,
1517
seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1518
/* Invalidate before using. */
1519
memset(&invalidate_wr, 0, sizeof invalidate_wr);
1520
invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1521
invalidate_wr.next = &frmr_wr;
1522
invalidate_wr.opcode = IB_WR_LOCAL_INV;
1523
invalidate_wr.send_flags = IB_SEND_SIGNALED;
1524
invalidate_wr.ex.invalidate_rkey =
1525
seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1526
DECR_CQCOUNT(&r_xprt->rx_ep);
1527
post_wr = &invalidate_wr;
1528
} else
1529
post_wr = &frmr_wr;
1530
1531
/* Bump the key */
1532
key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1533
ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1534
1535
/* Prepare FRMR WR */
1536
memset(&frmr_wr, 0, sizeof frmr_wr);
1537
frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1538
frmr_wr.opcode = IB_WR_FAST_REG_MR;
1539
frmr_wr.send_flags = IB_SEND_SIGNALED;
1540
frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1541
frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1542
frmr_wr.wr.fast_reg.page_list_len = i;
1543
frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1544
frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1545
BUG_ON(frmr_wr.wr.fast_reg.length < len);
1546
frmr_wr.wr.fast_reg.access_flags = (writing ?
1547
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1548
IB_ACCESS_REMOTE_READ);
1549
frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1550
DECR_CQCOUNT(&r_xprt->rx_ep);
1551
1552
rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1553
1554
if (rc) {
1555
dprintk("RPC: %s: failed ib_post_send for register,"
1556
" status %i\n", __func__, rc);
1557
while (i--)
1558
rpcrdma_unmap_one(ia, --seg);
1559
} else {
1560
seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1561
seg1->mr_base = seg1->mr_dma + pageoff;
1562
seg1->mr_nsegs = i;
1563
seg1->mr_len = len;
1564
}
1565
*nsegs = i;
1566
return rc;
1567
}
1568
1569
static int
1570
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1571
struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1572
{
1573
struct rpcrdma_mr_seg *seg1 = seg;
1574
struct ib_send_wr invalidate_wr, *bad_wr;
1575
int rc;
1576
1577
while (seg1->mr_nsegs--)
1578
rpcrdma_unmap_one(ia, seg++);
1579
1580
memset(&invalidate_wr, 0, sizeof invalidate_wr);
1581
invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1582
invalidate_wr.opcode = IB_WR_LOCAL_INV;
1583
invalidate_wr.send_flags = IB_SEND_SIGNALED;
1584
invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1585
DECR_CQCOUNT(&r_xprt->rx_ep);
1586
1587
rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1588
if (rc)
1589
dprintk("RPC: %s: failed ib_post_send for invalidate,"
1590
" status %i\n", __func__, rc);
1591
return rc;
1592
}
1593
1594
static int
1595
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1596
int *nsegs, int writing, struct rpcrdma_ia *ia)
1597
{
1598
struct rpcrdma_mr_seg *seg1 = seg;
1599
u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1600
int len, pageoff, i, rc;
1601
1602
pageoff = offset_in_page(seg1->mr_offset);
1603
seg1->mr_offset -= pageoff; /* start of page */
1604
seg1->mr_len += pageoff;
1605
len = -pageoff;
1606
if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1607
*nsegs = RPCRDMA_MAX_DATA_SEGS;
1608
for (i = 0; i < *nsegs;) {
1609
rpcrdma_map_one(ia, seg, writing);
1610
physaddrs[i] = seg->mr_dma;
1611
len += seg->mr_len;
1612
++seg;
1613
++i;
1614
/* Check for holes */
1615
if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1616
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1617
break;
1618
}
1619
rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1620
physaddrs, i, seg1->mr_dma);
1621
if (rc) {
1622
dprintk("RPC: %s: failed ib_map_phys_fmr "
1623
"%u@0x%llx+%i (%d)... status %i\n", __func__,
1624
len, (unsigned long long)seg1->mr_dma,
1625
pageoff, i, rc);
1626
while (i--)
1627
rpcrdma_unmap_one(ia, --seg);
1628
} else {
1629
seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1630
seg1->mr_base = seg1->mr_dma + pageoff;
1631
seg1->mr_nsegs = i;
1632
seg1->mr_len = len;
1633
}
1634
*nsegs = i;
1635
return rc;
1636
}
1637
1638
static int
1639
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1640
struct rpcrdma_ia *ia)
1641
{
1642
struct rpcrdma_mr_seg *seg1 = seg;
1643
LIST_HEAD(l);
1644
int rc;
1645
1646
list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1647
rc = ib_unmap_fmr(&l);
1648
while (seg1->mr_nsegs--)
1649
rpcrdma_unmap_one(ia, seg++);
1650
if (rc)
1651
dprintk("RPC: %s: failed ib_unmap_fmr,"
1652
" status %i\n", __func__, rc);
1653
return rc;
1654
}
1655
1656
static int
1657
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1658
int *nsegs, int writing, struct rpcrdma_ia *ia,
1659
struct rpcrdma_xprt *r_xprt)
1660
{
1661
int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1662
IB_ACCESS_REMOTE_READ);
1663
struct ib_mw_bind param;
1664
int rc;
1665
1666
*nsegs = 1;
1667
rpcrdma_map_one(ia, seg, writing);
1668
param.mr = ia->ri_bind_mem;
1669
param.wr_id = 0ULL; /* no send cookie */
1670
param.addr = seg->mr_dma;
1671
param.length = seg->mr_len;
1672
param.send_flags = 0;
1673
param.mw_access_flags = mem_priv;
1674
1675
DECR_CQCOUNT(&r_xprt->rx_ep);
1676
rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1677
if (rc) {
1678
dprintk("RPC: %s: failed ib_bind_mw "
1679
"%u@0x%llx status %i\n",
1680
__func__, seg->mr_len,
1681
(unsigned long long)seg->mr_dma, rc);
1682
rpcrdma_unmap_one(ia, seg);
1683
} else {
1684
seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1685
seg->mr_base = param.addr;
1686
seg->mr_nsegs = 1;
1687
}
1688
return rc;
1689
}
1690
1691
static int
1692
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1693
struct rpcrdma_ia *ia,
1694
struct rpcrdma_xprt *r_xprt, void **r)
1695
{
1696
struct ib_mw_bind param;
1697
LIST_HEAD(l);
1698
int rc;
1699
1700
BUG_ON(seg->mr_nsegs != 1);
1701
param.mr = ia->ri_bind_mem;
1702
param.addr = 0ULL; /* unbind */
1703
param.length = 0;
1704
param.mw_access_flags = 0;
1705
if (*r) {
1706
param.wr_id = (u64) (unsigned long) *r;
1707
param.send_flags = IB_SEND_SIGNALED;
1708
INIT_CQCOUNT(&r_xprt->rx_ep);
1709
} else {
1710
param.wr_id = 0ULL;
1711
param.send_flags = 0;
1712
DECR_CQCOUNT(&r_xprt->rx_ep);
1713
}
1714
rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1715
rpcrdma_unmap_one(ia, seg);
1716
if (rc)
1717
dprintk("RPC: %s: failed ib_(un)bind_mw,"
1718
" status %i\n", __func__, rc);
1719
else
1720
*r = NULL; /* will upcall on completion */
1721
return rc;
1722
}
1723
1724
static int
1725
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1726
int *nsegs, int writing, struct rpcrdma_ia *ia)
1727
{
1728
int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1729
IB_ACCESS_REMOTE_READ);
1730
struct rpcrdma_mr_seg *seg1 = seg;
1731
struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1732
int len, i, rc = 0;
1733
1734
if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1735
*nsegs = RPCRDMA_MAX_DATA_SEGS;
1736
for (len = 0, i = 0; i < *nsegs;) {
1737
rpcrdma_map_one(ia, seg, writing);
1738
ipb[i].addr = seg->mr_dma;
1739
ipb[i].size = seg->mr_len;
1740
len += seg->mr_len;
1741
++seg;
1742
++i;
1743
/* Check for holes */
1744
if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1745
offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1746
break;
1747
}
1748
seg1->mr_base = seg1->mr_dma;
1749
seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1750
ipb, i, mem_priv, &seg1->mr_base);
1751
if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1752
rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1753
dprintk("RPC: %s: failed ib_reg_phys_mr "
1754
"%u@0x%llx (%d)... status %i\n",
1755
__func__, len,
1756
(unsigned long long)seg1->mr_dma, i, rc);
1757
while (i--)
1758
rpcrdma_unmap_one(ia, --seg);
1759
} else {
1760
seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1761
seg1->mr_nsegs = i;
1762
seg1->mr_len = len;
1763
}
1764
*nsegs = i;
1765
return rc;
1766
}
1767
1768
static int
1769
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1770
struct rpcrdma_ia *ia)
1771
{
1772
struct rpcrdma_mr_seg *seg1 = seg;
1773
int rc;
1774
1775
rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1776
seg1->mr_chunk.rl_mr = NULL;
1777
while (seg1->mr_nsegs--)
1778
rpcrdma_unmap_one(ia, seg++);
1779
if (rc)
1780
dprintk("RPC: %s: failed ib_dereg_mr,"
1781
" status %i\n", __func__, rc);
1782
return rc;
1783
}
1784
1785
int
1786
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1787
int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1788
{
1789
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1790
int rc = 0;
1791
1792
switch (ia->ri_memreg_strategy) {
1793
1794
#if RPCRDMA_PERSISTENT_REGISTRATION
1795
case RPCRDMA_ALLPHYSICAL:
1796
rpcrdma_map_one(ia, seg, writing);
1797
seg->mr_rkey = ia->ri_bind_mem->rkey;
1798
seg->mr_base = seg->mr_dma;
1799
seg->mr_nsegs = 1;
1800
nsegs = 1;
1801
break;
1802
#endif
1803
1804
/* Registration using frmr registration */
1805
case RPCRDMA_FRMR:
1806
rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1807
break;
1808
1809
/* Registration using fmr memory registration */
1810
case RPCRDMA_MTHCAFMR:
1811
rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1812
break;
1813
1814
/* Registration using memory windows */
1815
case RPCRDMA_MEMWINDOWS_ASYNC:
1816
case RPCRDMA_MEMWINDOWS:
1817
rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1818
break;
1819
1820
/* Default registration each time */
1821
default:
1822
rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1823
break;
1824
}
1825
if (rc)
1826
return -1;
1827
1828
return nsegs;
1829
}
1830
1831
int
1832
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1833
struct rpcrdma_xprt *r_xprt, void *r)
1834
{
1835
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1836
int nsegs = seg->mr_nsegs, rc;
1837
1838
switch (ia->ri_memreg_strategy) {
1839
1840
#if RPCRDMA_PERSISTENT_REGISTRATION
1841
case RPCRDMA_ALLPHYSICAL:
1842
BUG_ON(nsegs != 1);
1843
rpcrdma_unmap_one(ia, seg);
1844
rc = 0;
1845
break;
1846
#endif
1847
1848
case RPCRDMA_FRMR:
1849
rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1850
break;
1851
1852
case RPCRDMA_MTHCAFMR:
1853
rc = rpcrdma_deregister_fmr_external(seg, ia);
1854
break;
1855
1856
case RPCRDMA_MEMWINDOWS_ASYNC:
1857
case RPCRDMA_MEMWINDOWS:
1858
rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1859
break;
1860
1861
default:
1862
rc = rpcrdma_deregister_default_external(seg, ia);
1863
break;
1864
}
1865
if (r) {
1866
struct rpcrdma_rep *rep = r;
1867
void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1868
rep->rr_func = NULL;
1869
func(rep); /* dereg done, callback now */
1870
}
1871
return nsegs;
1872
}
1873
1874
/*
1875
* Prepost any receive buffer, then post send.
1876
*
1877
* Receive buffer is donated to hardware, reclaimed upon recv completion.
1878
*/
1879
int
1880
rpcrdma_ep_post(struct rpcrdma_ia *ia,
1881
struct rpcrdma_ep *ep,
1882
struct rpcrdma_req *req)
1883
{
1884
struct ib_send_wr send_wr, *send_wr_fail;
1885
struct rpcrdma_rep *rep = req->rl_reply;
1886
int rc;
1887
1888
if (rep) {
1889
rc = rpcrdma_ep_post_recv(ia, ep, rep);
1890
if (rc)
1891
goto out;
1892
req->rl_reply = NULL;
1893
}
1894
1895
send_wr.next = NULL;
1896
send_wr.wr_id = 0ULL; /* no send cookie */
1897
send_wr.sg_list = req->rl_send_iov;
1898
send_wr.num_sge = req->rl_niovs;
1899
send_wr.opcode = IB_WR_SEND;
1900
if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1901
ib_dma_sync_single_for_device(ia->ri_id->device,
1902
req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1903
DMA_TO_DEVICE);
1904
ib_dma_sync_single_for_device(ia->ri_id->device,
1905
req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1906
DMA_TO_DEVICE);
1907
ib_dma_sync_single_for_device(ia->ri_id->device,
1908
req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1909
DMA_TO_DEVICE);
1910
1911
if (DECR_CQCOUNT(ep) > 0)
1912
send_wr.send_flags = 0;
1913
else { /* Provider must take a send completion every now and then */
1914
INIT_CQCOUNT(ep);
1915
send_wr.send_flags = IB_SEND_SIGNALED;
1916
}
1917
1918
rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1919
if (rc)
1920
dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1921
rc);
1922
out:
1923
return rc;
1924
}
1925
1926
/*
1927
* (Re)post a receive buffer.
1928
*/
1929
int
1930
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1931
struct rpcrdma_ep *ep,
1932
struct rpcrdma_rep *rep)
1933
{
1934
struct ib_recv_wr recv_wr, *recv_wr_fail;
1935
int rc;
1936
1937
recv_wr.next = NULL;
1938
recv_wr.wr_id = (u64) (unsigned long) rep;
1939
recv_wr.sg_list = &rep->rr_iov;
1940
recv_wr.num_sge = 1;
1941
1942
ib_dma_sync_single_for_cpu(ia->ri_id->device,
1943
rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1944
1945
DECR_CQCOUNT(ep);
1946
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1947
1948
if (rc)
1949
dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1950
rc);
1951
return rc;
1952
}
1953
1954