GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/rds/ib.c

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 *  - Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/if.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/delay.h>
#include <linux/slab.h>

#include "rds.h"
#include "ib.h"

static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;

module_param(fmr_pool_size, int, 0444);
MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
module_param(fmr_message_size, int, 0444);
MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
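
/*
 * All three parameters are 0444, i.e. read-only once the module is
 * loaded, so they can only be set at load time. For example (assuming
 * the transport is built as the usual rds_rdma module; adjust for your
 * kernel configuration):
 *
 *	modprobe rds_rdma fmr_pool_size=8192 rds_ib_retry_count=7
 *
 * Current values can be read back from /sys/module/<module>/parameters/.
 */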

/*
 * We have a clumsy combination of RCU and a rwsem protecting this list
 * because it is used both in the get_mr fast path and while blocking in
 * the FMR flushing path.
 */
DECLARE_RWSEM(rds_ib_devices_lock);
struct list_head rds_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
DEFINE_SPINLOCK(ib_nodev_conns_lock);
LIST_HEAD(ib_nodev_conns);
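
/*
 * Ask every connection that is not yet bound to a device to connect if
 * it is currently down. Called from rds_ib_add_one() once a new device
 * is ready, so that waiting connections can come up on it.
 */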
static void rds_ib_nodev_connect(void)
{
	struct rds_ib_connection *ic;

	spin_lock(&ib_nodev_conns_lock);
	list_for_each_entry(ic, &ib_nodev_conns, ib_node)
		rds_conn_connect_if_down(ic->conn);
	spin_unlock(&ib_nodev_conns_lock);
}
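
/* Force down every connection currently associated with this device. */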
static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
{
	struct rds_ib_connection *ic;
	unsigned long flags;

	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
		rds_conn_drop(ic->conn);
	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
}

/*
 * rds_ib_destroy_mr_pool() blocks on a few things, and MRs drop their
 * references from interrupt context, so we push the freeing off into a
 * work struct that runs in krdsd.
 */
static void rds_ib_dev_free(struct work_struct *work)
{
	struct rds_ib_ipaddr *i_ipaddr, *i_next;
	struct rds_ib_device *rds_ibdev = container_of(work,
					struct rds_ib_device, free_work);

	if (rds_ibdev->mr_pool)
		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
	if (rds_ibdev->mr)
		ib_dereg_mr(rds_ibdev->mr);
	if (rds_ibdev->pd)
		ib_dealloc_pd(rds_ibdev->pd);

	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
		list_del(&i_ipaddr->list);
		kfree(i_ipaddr);
	}

	kfree(rds_ibdev);
}
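
/*
 * Drop a reference on the device. The final put frees via a work item
 * on rds_wq rather than inline, because the last reference can be
 * dropped from interrupt context while rds_ib_dev_free() needs to block.
 */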
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
{
	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
	if (atomic_dec_and_test(&rds_ibdev->refcount))
		queue_work(rds_wq, &rds_ibdev->free_work);
}
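
/*
 * ib_client "add" callback: set up per-device RDS state when an IB HCA
 * appears. The refcount starts at 1 for this function; one reference is
 * added for membership on rds_ib_devices and one for the client_data
 * pointer, and the put at put_dev drops the initial one, leaving a
 * fully set-up device with a refcount of 2.
 */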
static void rds_ib_add_one(struct ib_device *device)
{
	struct rds_ib_device *rds_ibdev;
	struct ib_device_attr *dev_attr;

	/* Only handle IB (no iWARP) devices */
	if (device->node_type != RDMA_NODE_IB_CA)
		return;

	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
	if (!dev_attr)
		return;

	if (ib_query_device(device, dev_attr)) {
		rdsdebug("Query device failed for %s\n", device->name);
		goto free_attr;
	}

	rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
				 ibdev_to_node(device));
	if (!rds_ibdev)
		goto free_attr;

	spin_lock_init(&rds_ibdev->spinlock);
	atomic_set(&rds_ibdev->refcount, 1);
	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);

	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);

	rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr ?: 32;
	rds_ibdev->max_fmrs = dev_attr->max_fmr ?
			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
			fmr_pool_size;

	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;

	rds_ibdev->dev = device;
	rds_ibdev->pd = ib_alloc_pd(device);
	if (IS_ERR(rds_ibdev->pd)) {
		rds_ibdev->pd = NULL;
		goto put_dev;
	}

	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(rds_ibdev->mr)) {
		rds_ibdev->mr = NULL;
		goto put_dev;
	}

	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
	if (IS_ERR(rds_ibdev->mr_pool)) {
		rds_ibdev->mr_pool = NULL;
		goto put_dev;
	}

	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
	INIT_LIST_HEAD(&rds_ibdev->conn_list);

	down_write(&rds_ib_devices_lock);
	list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
	up_write(&rds_ib_devices_lock);
	atomic_inc(&rds_ibdev->refcount);

	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
	atomic_inc(&rds_ibdev->refcount);

	rds_ib_nodev_connect();

put_dev:
	rds_ib_dev_put(rds_ibdev);
free_attr:
	kfree(dev_attr);
}

/*
 * New connections use this to find the device to associate with the
 * connection. It's not in the fast path so we're not concerned about the
 * performance of the IB call. (As of this writing, it uses an interrupt
 * blocking spinlock to serialize walking a per-device list of all registered
 * clients.)
 *
 * RCU is used to handle incoming connections racing with device teardown.
 * Rather than use a lock to serialize removal from the client_data and
 * getting a new reference, we use an RCU grace period. The destruction
 * path removes the device from client_data and then waits for all RCU
 * readers to finish.
 *
 * A new connection can get NULL from this if it's arriving on a
 * device that is in the process of being removed.
 */
struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
{
	struct rds_ib_device *rds_ibdev;

	rcu_read_lock();
	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
	if (rds_ibdev)
		atomic_inc(&rds_ibdev->refcount);
	rcu_read_unlock();
	return rds_ibdev;
}
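
/*
 * A sketch of the expected caller pattern (actual callers live elsewhere
 * in net/rds; the error handling here is illustrative):
 *
 *	rds_ibdev = rds_ib_get_client_data(device);
 *	if (!rds_ibdev)
 *		return -ENODEV;		(device is being torn down)
 *	... use rds_ibdev ...
 *	rds_ib_dev_put(rds_ibdev);
 */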

/*
 * The IB stack is letting us know that a device is going away. This can
 * happen if the underlying HCA driver is removed or if PCI hotplug is
 * removing the PCI function, for example.
 *
 * This can be called at any time and can race with any other RDS path.
 */
static void rds_ib_remove_one(struct ib_device *device)
{
	struct rds_ib_device *rds_ibdev;

	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
	if (!rds_ibdev)
		return;

	rds_ib_dev_shutdown(rds_ibdev);

	/* Stop connection attempts from getting a reference to this device. */
	ib_set_client_data(device, &rds_ib_client, NULL);

	down_write(&rds_ib_devices_lock);
	list_del_rcu(&rds_ibdev->list);
	up_write(&rds_ib_devices_lock);

	/*
	 * This synchronize_rcu() waits for readers of both the ib
	 * client data and the devices list to finish before we drop
	 * both of those references.
	 */
	synchronize_rcu();
	rds_ib_dev_put(rds_ibdev);
	rds_ib_dev_put(rds_ibdev);
}

struct ib_client rds_ib_client = {
	.name   = "rds_ib",
	.add    = rds_ib_add_one,
	.remove = rds_ib_remove_one
};
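
/*
 * Fill in one rds_info_rdma_connection record for a connection. Returns
 * 0 to skip connections that are not RDS/IB, 1 once the record has been
 * written; GIDs, ring sizes, and MR info are only available while the
 * connection is up.
 */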
static int rds_ib_conn_info_visitor(struct rds_connection *conn,
				    void *buffer)
{
	struct rds_info_rdma_connection *iinfo = buffer;
	struct rds_ib_connection *ic;

	/* We will only ever look at IB transports */
	if (conn->c_trans != &rds_ib_transport)
		return 0;

	iinfo->src_addr = conn->c_laddr;
	iinfo->dst_addr = conn->c_faddr;

	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
	if (rds_conn_state(conn) == RDS_CONN_UP) {
		struct rds_ib_device *rds_ibdev;
		struct rdma_dev_addr *dev_addr;

		ic = conn->c_transport_data;
		dev_addr = &ic->i_cm_id->route.addr.dev_addr;

		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);

		rds_ibdev = ic->rds_ibdev;
		iinfo->max_send_wr = ic->i_send_ring.w_nr;
		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
		iinfo->max_send_sge = rds_ibdev->max_sge;
		rds_ib_get_mr_info(rds_ibdev, iinfo);
	}
	return 1;
}
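
/*
 * Handler for RDS_INFO_IB_CONNECTIONS info requests: emits one
 * rds_info_rdma_connection per connection via the visitor above.
 */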
static void rds_ib_ic_info(struct socket *sock, unsigned int len,
			   struct rds_info_iterator *iter,
			   struct rds_info_lengths *lens)
{
	rds_for_each_conn_info(sock, len, iter, lens,
				rds_ib_conn_info_visitor,
				sizeof(struct rds_info_rdma_connection));
}

/*
 * Early RDS/IB was built to only bind to an address if there is an IPoIB
 * device with that address set.
 *
 * If it were me, I'd advocate for something more flexible. Sending and
 * receiving should be device-agnostic. Transports would try and maintain
 * connections between peers who have messages queued. Userspace would be
 * allowed to influence which paths have priority. We could call userspace
 * asserting this policy "routing".
 */
static int rds_ib_laddr_check(__be32 addr)
{
	int ret;
	struct rdma_cm_id *cm_id;
	struct sockaddr_in sin;

	/* Create a CMA ID and try to bind it. This catches both
	 * IB and iWARP capable NICs.
	 */
	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id))
		return PTR_ERR(cm_id);

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = addr;

	/* rdma_bind_addr succeeds for both IB and iWARP devices, so we
	 * would claim to support iWARP here unless we also check
	 * node_type below.
	 */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
		ret = -EADDRNOTAVAIL;

	rdsdebug("addr %pI4 ret %d node type %d\n",
		&addr, ret,
		cm_id->device ? cm_id->device->node_type : -1);

	rdma_destroy_id(cm_id);

	return ret;
}

static void rds_ib_unregister_client(void)
{
	ib_unregister_client(&rds_ib_client);
	/* wait for rds_ib_dev_free() to complete */
	flush_workqueue(rds_wq);
}
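
/*
 * Module teardown: deregister the info handler first so no new queries
 * arrive, then unwind the steps of rds_ib_init() in reverse.
 */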
void rds_ib_exit(void)
{
	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
	rds_ib_unregister_client();
	rds_ib_destroy_nodev_conns();
	rds_ib_sysctl_exit();
	rds_ib_recv_exit();
	rds_trans_unregister(&rds_ib_transport);
}
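
/*
 * The transport ops table for InfiniBand; rds_trans_register() hands it
 * to the RDS core, which dispatches through these hooks for everything
 * from address checks and connection setup to RDMA MR management.
 */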
struct rds_transport rds_ib_transport = {
	.laddr_check		= rds_ib_laddr_check,
	.xmit_complete		= rds_ib_xmit_complete,
	.xmit			= rds_ib_xmit,
	.xmit_rdma		= rds_ib_xmit_rdma,
	.xmit_atomic		= rds_ib_xmit_atomic,
	.recv			= rds_ib_recv,
	.conn_alloc		= rds_ib_conn_alloc,
	.conn_free		= rds_ib_conn_free,
	.conn_connect		= rds_ib_conn_connect,
	.conn_shutdown		= rds_ib_conn_shutdown,
	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
	.inc_free		= rds_ib_inc_free,
	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
	.cm_handle_connect	= rds_ib_cm_handle_connect,
	.cm_connect_complete	= rds_ib_cm_connect_complete,
	.stats_info_copy	= rds_ib_stats_info_copy,
	.exit			= rds_ib_exit,
	.get_mr			= rds_ib_get_mr,
	.sync_mr		= rds_ib_sync_mr,
	.free_mr		= rds_ib_free_mr,
	.flush_mrs		= rds_ib_flush_mrs,
	.t_owner		= THIS_MODULE,
	.t_name			= "infiniband",
	.t_type			= RDS_TRANS_IB
};
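
/*
 * Module init: register with the IB core first so "add" callbacks can
 * fire for existing devices, then bring up sysctls, receive resources,
 * the transport, and finally the info handler; errors unwind through
 * the labels below.
 */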
int rds_ib_init(void)
{
	int ret;

	INIT_LIST_HEAD(&rds_ib_devices);

	ret = ib_register_client(&rds_ib_client);
	if (ret)
		goto out;

	ret = rds_ib_sysctl_init();
	if (ret)
		goto out_ibreg;

	ret = rds_ib_recv_init();
	if (ret)
		goto out_sysctl;

	ret = rds_trans_register(&rds_ib_transport);
	if (ret)
		goto out_recv;

	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);

	goto out;

out_recv:
	rds_ib_recv_exit();
out_sysctl:
	rds_ib_sysctl_exit();
out_ibreg:
	rds_ib_unregister_client();
out:
	return ret;
}

MODULE_LICENSE("GPL");