GitHub Repository: torvalds/linux
Path: blob/master/net/9p/trans_rdma.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * RDMA transport layer based on the trans_fd.c implementation.
 *
 * Copyright (C) 2008 by Tom Tucker <[email protected]>
 * Copyright (C) 2006 by Russ Cox <[email protected]>
 * Copyright (C) 2004-2005 by Latchesar Ionkov <[email protected]>
 * Copyright (C) 2004-2008 by Eric Van Hensbergen <[email protected]>
 * Copyright (C) 1997-2002 by Ron Minnich <[email protected]>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/in.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/ipv6.h>
#include <linux/kthread.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/un.h>
#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/file.h>
#include <linux/fs_context.h>
#include <linux/semaphore.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#define P9_RDMA_SEND_SGE 4
#define P9_RDMA_RECV_SGE 4
#define P9_RDMA_IRD 0
#define P9_RDMA_ORD 0
#define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */

/**
 * struct p9_trans_rdma - RDMA transport instance
 *
 * @state: tracks the transport state machine for connection setup and tear down
 * @cm_id: The RDMA CM ID
 * @pd: Protection Domain pointer
 * @qp: Queue Pair pointer
 * @cq: Completion Queue pointer
 * @timeout: Number of uSecs to wait for connection management events
 * @privport: Whether a privileged port may be used
 * @port: The port to use
 * @sq_depth: The depth of the Send Queue
 * @sq_sem: Semaphore for the SQ
 * @rq_depth: The depth of the Receive Queue.
 * @rq_sem: Semaphore for the RQ
 * @excess_rc: Amount of posted Receive Contexts without a pending request.
 *	       See rdma_request()
 * @addr: The remote peer's address
 * @req_lock: Protects the active request list
 * @cm_done: Completion event for connection management tracking
 */
struct p9_trans_rdma {
	enum {
		P9_RDMA_INIT,
		P9_RDMA_ADDR_RESOLVED,
		P9_RDMA_ROUTE_RESOLVED,
		P9_RDMA_CONNECTED,
		P9_RDMA_FLUSHING,
		P9_RDMA_CLOSING,
		P9_RDMA_CLOSED,
	} state;
	struct rdma_cm_id *cm_id;
	struct ib_pd *pd;
	struct ib_qp *qp;
	struct ib_cq *cq;
	long timeout;
	bool privport;
	u16 port;
	int sq_depth;
	struct semaphore sq_sem;
	int rq_depth;
	struct semaphore rq_sem;
	atomic_t excess_rc;
	struct sockaddr_in addr;
	spinlock_t req_lock;

	struct completion cm_done;
};

struct p9_rdma_req;

/**
 * struct p9_rdma_context - Keeps track of in-process WR
 *
 * @cqe: completion queue entry
 * @busa: Bus address to unmap when the WR completes
 * @req: Keeps track of requests (send)
 * @rc: Keeps track of replies (receive)
 */
struct p9_rdma_context {
	struct ib_cqe cqe;
	dma_addr_t busa;
	union {
		struct p9_req_t *req;
		struct p9_fcall rc;
	};
};

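/* Show transport-specific mount options that differ from the defaults
 * (e.g. in /proc/mounts).
 */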
static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
{
	struct p9_trans_rdma *rdma = clnt->trans;

	if (rdma->port != P9_RDMA_PORT)
		seq_printf(m, ",port=%u", rdma->port);
	if (rdma->sq_depth != P9_RDMA_SQ_DEPTH)
		seq_printf(m, ",sq=%u", rdma->sq_depth);
	if (rdma->rq_depth != P9_RDMA_RQ_DEPTH)
		seq_printf(m, ",rq=%u", rdma->rq_depth);
	if (rdma->timeout != P9_RDMA_TIMEOUT)
		seq_printf(m, ",timeout=%lu", rdma->timeout);
	if (rdma->privport)
		seq_puts(m, ",privport");
	return 0;
}

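/* RDMA connection manager event callback: advance the transport state
 * machine as address/route resolution and connection establishment
 * complete, mark the client Disconnected on errors, and wake up any
 * waiter via cm_done.
 */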
static int
p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct p9_client *c = id->context;
	struct p9_trans_rdma *rdma = c->trans;
	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		BUG_ON(rdma->state != P9_RDMA_INIT);
		rdma->state = P9_RDMA_ADDR_RESOLVED;
		break;

	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
		rdma->state = P9_RDMA_ROUTE_RESOLVED;
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
		rdma->state = P9_RDMA_CONNECTED;
		break;

	case RDMA_CM_EVENT_DISCONNECTED:
		if (rdma)
			rdma->state = P9_RDMA_CLOSED;
		c->status = Disconnected;
		break;

	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		break;

	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_MULTICAST_JOIN:
	case RDMA_CM_EVENT_MULTICAST_ERROR:
	case RDMA_CM_EVENT_REJECTED:
	case RDMA_CM_EVENT_CONNECT_REQUEST:
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
		c->status = Disconnected;
		rdma_disconnect(rdma->cm_id);
		break;
	default:
		BUG();
	}
	complete(&rdma->cm_done);
	return 0;
}

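/* Receive completion handler: unmap the reply buffer, look up the
 * matching request by tag and hand the reply to the 9P client layer.
 */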
static void
recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct p9_client *client = cq->cq_context;
	struct p9_trans_rdma *rdma = client->trans;
	struct p9_rdma_context *c =
		container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
	struct p9_req_t *req;
	int err = 0;
	int16_t tag;

	req = NULL;
	ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
			    DMA_FROM_DEVICE);

	if (wc->status != IB_WC_SUCCESS)
		goto err_out;

	c->rc.size = wc->byte_len;
	err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1);
	if (err)
		goto err_out;

	req = p9_tag_lookup(client, tag);
	if (!req)
		goto err_out;

	/* Check that we have not yet received a reply for this request.
	 */
	if (unlikely(req->rc.sdata)) {
		pr_err("Duplicate reply for request %d", tag);
		goto err_out;
	}

	req->rc.size = c->rc.size;
	req->rc.sdata = c->rc.sdata;
	p9_client_cb(client, req, REQ_STATUS_RCVD);

 out:
	up(&rdma->rq_sem);
	kfree(c);
	return;

 err_out:
	p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n",
		 req, err, wc->status);
	rdma->state = P9_RDMA_FLUSHING;
	client->status = Disconnected;
	goto out;
}

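/* Send completion handler: unmap the request buffer, release a
 * send-queue slot and drop the reference on the request.
 */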
static void
send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct p9_client *client = cq->cq_context;
	struct p9_trans_rdma *rdma = client->trans;
	struct p9_rdma_context *c =
		container_of(wc->wr_cqe, struct p9_rdma_context, cqe);

	ib_dma_unmap_single(rdma->cm_id->device,
			    c->busa, c->req->tc.size,
			    DMA_TO_DEVICE);
	up(&rdma->sq_sem);
	p9_req_put(client, c->req);
	kfree(c);
}

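/* QP event callback; unexpected events are only logged. */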
static void qp_event_handler(struct ib_event *event, void *context)
{
	p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n",
		 event->event, context);
}

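/* Tear down the QP, PD, CQ and CM ID (in that order) and free the
 * transport structure.
 */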
static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
{
	if (!rdma)
		return;

	if (rdma->qp && !IS_ERR(rdma->qp))
		ib_destroy_qp(rdma->qp);

	if (rdma->pd && !IS_ERR(rdma->pd))
		ib_dealloc_pd(rdma->pd);

	if (rdma->cq && !IS_ERR(rdma->cq))
		ib_free_cq(rdma->cq);

	if (rdma->cm_id && !IS_ERR(rdma->cm_id))
		rdma_destroy_id(rdma->cm_id);

	kfree(rdma);
}

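/* DMA-map a reply buffer and post it to the receive queue. */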
static int
post_recv(struct p9_client *client, struct p9_rdma_context *c)
{
	struct p9_trans_rdma *rdma = client->trans;
	struct ib_recv_wr wr;
	struct ib_sge sge;
	int ret;

	c->busa = ib_dma_map_single(rdma->cm_id->device,
				    c->rc.sdata, client->msize,
				    DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
		goto error;

	c->cqe.done = recv_done;

	sge.addr = c->busa;
	sge.length = client->msize;
	sge.lkey = rdma->pd->local_dma_lkey;

	wr.next = NULL;
	wr.wr_cqe = &c->cqe;
	wr.sg_list = &sge;
	wr.num_sge = 1;

	ret = ib_post_recv(rdma->qp, &wr, NULL);
	if (ret)
		ib_dma_unmap_single(rdma->cm_id->device, c->busa,
				    client->msize, DMA_FROM_DEVICE);
	return ret;

 error:
	p9_debug(P9_DEBUG_ERROR, "EIO\n");
	return -EIO;
}

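/* Transport "request" method: post a receive buffer for the reply
 * (unless an excess one is already outstanding), then DMA-map and post
 * the 9P request itself as an RDMA send.
 */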
static int rdma_request(struct p9_client *client, struct p9_req_t *req)
{
	struct p9_trans_rdma *rdma = client->trans;
	struct ib_send_wr wr;
	struct ib_sge sge;
	int err = 0;
	unsigned long flags;
	struct p9_rdma_context *c = NULL;
	struct p9_rdma_context *rpl_context = NULL;

	/* When an error occurs between posting the recv and the send,
	 * there will be a receive context posted without a pending request.
	 * Since there is no way to "un-post" it, we remember it and skip
	 * post_recv() for the next request.
	 * So here,
	 * see if we are this `next request' and need to absorb an excess rc.
	 * If yes, then drop and free our own, and do not post_recv().
	 **/
	if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
		if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
			/* Got one! */
			p9_fcall_fini(&req->rc);
			req->rc.sdata = NULL;
			goto dont_need_post_recv;
		} else {
			/* We raced and lost. */
			atomic_inc(&rdma->excess_rc);
		}
	}

	/* Allocate an fcall for the reply */
	rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
	if (!rpl_context) {
		err = -ENOMEM;
		goto recv_error;
	}
	rpl_context->rc.sdata = req->rc.sdata;

	/*
	 * Post a receive buffer for this request. We need to ensure
	 * there is a reply buffer available for every outstanding
	 * request. A flushed request can result in no reply for an
	 * outstanding request, so we must keep a count to avoid
	 * overflowing the RQ.
	 */
	if (down_interruptible(&rdma->rq_sem)) {
		err = -EINTR;
		goto recv_error;
	}

	err = post_recv(client, rpl_context);
	if (err) {
		p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err);
		goto recv_error;
	}
	/* remove posted receive buffer from request structure */
	req->rc.sdata = NULL;

dont_need_post_recv:
	/* Post the request */
	c = kmalloc(sizeof *c, GFP_NOFS);
	if (!c) {
		err = -ENOMEM;
		goto send_error;
	}
	c->req = req;

	c->busa = ib_dma_map_single(rdma->cm_id->device,
				    c->req->tc.sdata, c->req->tc.size,
				    DMA_TO_DEVICE);
	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
		err = -EIO;
		goto send_error;
	}

	c->cqe.done = send_done;

	sge.addr = c->busa;
	sge.length = c->req->tc.size;
	sge.lkey = rdma->pd->local_dma_lkey;

	wr.next = NULL;
	wr.wr_cqe = &c->cqe;
	wr.opcode = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED;
	wr.sg_list = &sge;
	wr.num_sge = 1;

	if (down_interruptible(&rdma->sq_sem)) {
		err = -EINTR;
		goto dma_unmap;
	}

	/* Mark request as `sent' *before* we actually send it,
	 * because doing it after could erase the REQ_STATUS_RCVD
	 * status in case of a very fast reply.
	 */
	WRITE_ONCE(req->status, REQ_STATUS_SENT);
	err = ib_post_send(rdma->qp, &wr, NULL);
	if (err)
		goto dma_unmap;

	/* Success */
	return 0;

 dma_unmap:
	ib_dma_unmap_single(rdma->cm_id->device, c->busa,
			    c->req->tc.size, DMA_TO_DEVICE);
	/* Handle errors that happened during or while preparing the send: */
 send_error:
	WRITE_ONCE(req->status, REQ_STATUS_ERROR);
	kfree(c);
	p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);

	/* Ach.
	 * We did post_recv(), but not the send. We have one receive buffer
	 * posted in excess.
	 */
	atomic_inc(&rdma->excess_rc);
	return err;

	/* Handle errors that happened during or while preparing post_recv(): */
 recv_error:
	kfree(rpl_context);
	spin_lock_irqsave(&rdma->req_lock, flags);
	if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) {
		rdma->state = P9_RDMA_CLOSING;
		spin_unlock_irqrestore(&rdma->req_lock, flags);
		rdma_disconnect(rdma->cm_id);
	} else
		spin_unlock_irqrestore(&rdma->req_lock, flags);
	return err;
}

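/* Transport "close" method: mark the client disconnected, drop the
 * connection and destroy all RDMA resources.
 */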
static void rdma_close(struct p9_client *client)
{
	struct p9_trans_rdma *rdma;

	if (!client)
		return;

	rdma = client->trans;
	if (!rdma)
		return;

	client->status = Disconnected;
	rdma_disconnect(rdma->cm_id);
	rdma_destroy_trans(rdma);
}

/**
 * alloc_rdma - Allocate and initialize the rdma transport structure
 * @opts: Mount options structure
 */
static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
{
	struct p9_trans_rdma *rdma;

	rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL);
	if (!rdma)
		return NULL;

	rdma->port = opts->port;
	rdma->privport = opts->privport;
	rdma->sq_depth = opts->sq_depth;
	rdma->rq_depth = opts->rq_depth;
	rdma->timeout = opts->timeout;
	spin_lock_init(&rdma->req_lock);
	init_completion(&rdma->cm_done);
	sema_init(&rdma->sq_sem, rdma->sq_depth);
	sema_init(&rdma->rq_sem, rdma->rq_depth);
	atomic_set(&rdma->excess_rc, 0);

	return rdma;
}

static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
{
	/* Nothing to do here.
	 * We will take care of it (if we have to) in rdma_cancelled()
	 */
	return 1;
}

/* A request has been fully flushed without a reply.
 * That means we have posted one buffer in excess.
 */
static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
{
	struct p9_trans_rdma *rdma = client->trans;
	atomic_inc(&rdma->excess_rc);
	return 0;
}

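/* Bind the CM ID to a reserved (privileged) port, walking down from the
 * highest reserved port until one is not already in use.
 */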
static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
{
	struct sockaddr_in cl = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int port, err = -EINVAL;

	for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
		cl.sin_port = htons((ushort)port);
		err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
		if (err != -EADDRINUSE)
			break;
	}
	return err;
}

/**
 * rdma_create_trans - Transport method for creating a transport instance
 * @client: client instance
 * @fc: The filesystem context
 */
static int
rdma_create_trans(struct p9_client *client, struct fs_context *fc)
{
	const char *addr = fc->source;
	struct v9fs_context *ctx = fc->fs_private;
	struct p9_rdma_opts opts = ctx->rdma_opts;
	int err;
	struct p9_trans_rdma *rdma;
	struct rdma_conn_param conn_param;
	struct ib_qp_init_attr qp_attr;

	if (addr == NULL)
		return -EINVAL;

	/* options are already parsed, in the fs context */
	opts = ctx->rdma_opts;

	/* Create and initialize the RDMA transport structure */
	rdma = alloc_rdma(&opts);
	if (!rdma)
		return -ENOMEM;

	/* Create the RDMA CM ID */
	rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
				     RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(rdma->cm_id))
		goto error;

	/* Associate the client with the transport */
	client->trans = rdma;

	/* Bind to a privileged port if we need to */
	if (opts.privport) {
		err = p9_rdma_bind_privport(rdma);
		if (err < 0) {
			pr_err("%s (%d): problem binding to privport: %d\n",
			       __func__, task_pid_nr(current), -err);
			goto error;
		}
	}

	/* Resolve the server's address */
	rdma->addr.sin_family = AF_INET;
	rdma->addr.sin_addr.s_addr = in_aton(addr);
	rdma->addr.sin_port = htons(opts.port);
	err = rdma_resolve_addr(rdma->cm_id, NULL,
				(struct sockaddr *)&rdma->addr,
				rdma->timeout);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
		goto error;

	/* Resolve the route to the server */
	err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
		goto error;

	/* Create the Completion Queue */
	rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
				   opts.sq_depth + opts.rq_depth + 1,
				   IB_POLL_SOFTIRQ);
	if (IS_ERR(rdma->cq))
		goto error;

	/* Create the Protection Domain */
	rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0);
	if (IS_ERR(rdma->pd))
		goto error;

	/* Create the Queue Pair */
	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.event_handler = qp_event_handler;
	qp_attr.qp_context = client;
	qp_attr.cap.max_send_wr = opts.sq_depth;
	qp_attr.cap.max_recv_wr = opts.rq_depth;
	qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
	qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.send_cq = rdma->cq;
	qp_attr.recv_cq = rdma->cq;
	err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
	if (err)
		goto error;
	rdma->qp = rdma->cm_id->qp;

	/* Request a connection */
	memset(&conn_param, 0, sizeof(conn_param));
	conn_param.private_data = NULL;
	conn_param.private_data_len = 0;
	conn_param.responder_resources = P9_RDMA_IRD;
	conn_param.initiator_depth = P9_RDMA_ORD;
	err = rdma_connect(rdma->cm_id, &conn_param);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_CONNECTED))
		goto error;

	client->status = Connected;

	return 0;

 error:
	rdma_destroy_trans(rdma);
	return -ENOTCONN;
}

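/* Transport operations for the 9P "rdma" transport. */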
static struct p9_trans_module p9_rdma_trans = {
	.name = "rdma",
	.maxsize = P9_RDMA_MAXSIZE,
	.pooled_rbuffers = true,
	.def = false,
	.supports_vmalloc = false,
	.owner = THIS_MODULE,
	.create = rdma_create_trans,
	.close = rdma_close,
	.request = rdma_request,
	.cancel = rdma_cancel,
	.cancelled = rdma_cancelled,
	.show_options = p9_rdma_show_options,
};

/**
 * p9_trans_rdma_init - Register the 9P RDMA transport driver
 */
static int __init p9_trans_rdma_init(void)
{
	v9fs_register_trans(&p9_rdma_trans);
	return 0;
}

static void __exit p9_trans_rdma_exit(void)
{
	v9fs_unregister_trans(&p9_rdma_trans);
}

module_init(p9_trans_rdma_init);
module_exit(p9_trans_rdma_exit);
MODULE_ALIAS_9P("rdma");

MODULE_AUTHOR("Tom Tucker <[email protected]>");
MODULE_DESCRIPTION("RDMA Transport for 9P");
MODULE_LICENSE("Dual BSD/GPL");