GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/rds/ib_rdma.c
/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials
 *   provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rculist.h>

#include "rds.h"
#include "ib.h"
#include "xlist.h"

static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0

/*
 * This is stored as mr->r_trans_private.
 */
struct rds_ib_mr {
	struct rds_ib_device	*device;
	struct rds_ib_mr_pool	*pool;
	struct ib_fmr		*fmr;

	struct xlist_head	xlist;

	/* unmap_list is for freeing */
	struct list_head	unmap_list;
	unsigned int		remap_count;

	struct scatterlist	*sg;
	unsigned int		sg_len;
	u64			*dma;
	int			sg_dma_len;
};

/*
 * Our own little FMR pool
 */
struct rds_ib_mr_pool {
	struct mutex		flush_lock;	/* serialize fmr invalidate */
	struct delayed_work	flush_worker;	/* flush worker */

	atomic_t		item_count;	/* total # of MRs */
	atomic_t		dirty_count;	/* # of dirty MRs */

	struct xlist_head	drop_list;	/* MRs that have reached their max_maps limit */
	struct xlist_head	free_list;	/* unused MRs */
	struct xlist_head	clean_list;	/* global unused & unmapped MRs */
	wait_queue_head_t	flush_wait;

	atomic_t		free_pinned;	/* memory pinned by free MRs */
	unsigned long		max_items;
	unsigned long		max_items_soft;
	unsigned long		max_free_pinned;
	struct ib_fmr_attr	fmr_attr;
};
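
/*
 * An MR moves through the pool lists roughly as follows: rds_ib_alloc_fmr()
 * pulls from clean_list (or allocates a fresh FMR), rds_ib_free_mr() pushes
 * it onto free_list, or onto drop_list once remap_count has reached
 * fmr_attr.max_maps, and rds_ib_flush_mr_pool() unmaps everything on
 * drop_list/free_list, destroys whatever is over the pool limit and returns
 * the rest to clean_list for reuse.
 */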

static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work);

static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
	struct rds_ib_device *rds_ibdev;
	struct rds_ib_ipaddr *i_ipaddr;

	rcu_read_lock();
	list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
		list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
			if (i_ipaddr->ipaddr == ipaddr) {
				atomic_inc(&rds_ibdev->refcount);
				rcu_read_unlock();
				return rds_ibdev;
			}
		}
	}
	rcu_read_unlock();

	return NULL;
}

static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
	struct rds_ib_ipaddr *i_ipaddr;

	i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
	if (!i_ipaddr)
		return -ENOMEM;

	i_ipaddr->ipaddr = ipaddr;

	spin_lock_irq(&rds_ibdev->spinlock);
	list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
	spin_unlock_irq(&rds_ibdev->spinlock);

	return 0;
}

static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
	struct rds_ib_ipaddr *i_ipaddr;
	struct rds_ib_ipaddr *to_free = NULL;

	spin_lock_irq(&rds_ibdev->spinlock);
	list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
		if (i_ipaddr->ipaddr == ipaddr) {
			list_del_rcu(&i_ipaddr->list);
			to_free = i_ipaddr;
			break;
		}
	}
	spin_unlock_irq(&rds_ibdev->spinlock);

	if (to_free) {
		synchronize_rcu();
		kfree(to_free);
	}
}

int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
	struct rds_ib_device *rds_ibdev_old;

	rds_ibdev_old = rds_ib_get_device(ipaddr);
	if (rds_ibdev_old) {
		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
		rds_ib_dev_put(rds_ibdev_old);
	}

	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}
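
/*
 * Note on ipaddr_list locking: rds_ib_get_device() walks the list under
 * rcu_read_lock() only, while rds_ib_add_ipaddr() and rds_ib_remove_ipaddr()
 * serialize writers with rds_ibdev->spinlock and defer the kfree() until
 * after synchronize_rcu(), so a concurrent reader can never follow a freed
 * entry.
 */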

void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	/* conn was previously on the nodev_conns_list */
	spin_lock_irq(&ib_nodev_conns_lock);
	BUG_ON(list_empty(&ib_nodev_conns));
	BUG_ON(list_empty(&ic->ib_node));
	list_del(&ic->ib_node);

	spin_lock(&rds_ibdev->spinlock);
	list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
	spin_unlock(&rds_ibdev->spinlock);
	spin_unlock_irq(&ib_nodev_conns_lock);

	ic->rds_ibdev = rds_ibdev;
	atomic_inc(&rds_ibdev->refcount);
}

void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	/* place conn on nodev_conns_list */
	spin_lock(&ib_nodev_conns_lock);

	spin_lock_irq(&rds_ibdev->spinlock);
	BUG_ON(list_empty(&ic->ib_node));
	list_del(&ic->ib_node);
	spin_unlock_irq(&rds_ibdev->spinlock);

	list_add_tail(&ic->ib_node, &ib_nodev_conns);

	spin_unlock(&ib_nodev_conns_lock);

	ic->rds_ibdev = NULL;
	rds_ib_dev_put(rds_ibdev);
}

void rds_ib_destroy_nodev_conns(void)
{
	struct rds_ib_connection *ic, *_ic;
	LIST_HEAD(tmp_list);

	/* avoid calling conn_destroy with irqs off */
	spin_lock_irq(&ib_nodev_conns_lock);
	list_splice(&ib_nodev_conns, &tmp_list);
	spin_unlock_irq(&ib_nodev_conns_lock);

	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
		rds_conn_destroy(ic->conn);
}

struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
{
	struct rds_ib_mr_pool *pool;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	INIT_XLIST_HEAD(&pool->free_list);
	INIT_XLIST_HEAD(&pool->drop_list);
	INIT_XLIST_HEAD(&pool->clean_list);
	mutex_init(&pool->flush_lock);
	init_waitqueue_head(&pool->flush_wait);
	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);

	pool->fmr_attr.max_pages = fmr_message_size;
	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
	pool->fmr_attr.page_shift = PAGE_SHIFT;
	pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;

	/* We never allow more than max_items MRs to be allocated.
	 * When we exceed max_items_soft, we start freeing
	 * items more aggressively.
	 * Make sure that max_items > max_items_soft > max_items / 2
	 */
	pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
	pool->max_items = rds_ibdev->max_fmrs;

	return pool;
}
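
/*
 * Pool sizing example (illustrative numbers only; the real values come from
 * the HCA via rds_ibdev->max_fmrs and rds_ibdev->fmr_max_remaps, and from
 * fmr_message_size): with max_fmrs == 8192 and fmr_message_size == 256,
 * the pool above gets max_items == 8192, max_items_soft == 6144 (which
 * satisfies max_items > max_items_soft > max_items / 2 == 4096) and
 * max_free_pinned == 8192 * 256 / 4 == 524288 pages.
 */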

void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

	iinfo->rdma_mr_max = pool->max_items;
	iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
}

void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
	cancel_delayed_work_sync(&pool->flush_worker);
	rds_ib_flush_mr_pool(pool, 1, NULL);
	WARN_ON(atomic_read(&pool->item_count));
	WARN_ON(atomic_read(&pool->free_pinned));
	kfree(pool);
}

static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
			 struct rds_ib_mr **ibmr_ret)
{
	struct xlist_head *ibmr_xl;
	ibmr_xl = xlist_del_head_fast(xl);
	*ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
}

static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{
	struct rds_ib_mr *ibmr = NULL;
	struct xlist_head *ret;
	unsigned long *flag;

	preempt_disable();
	flag = &__get_cpu_var(clean_list_grace);
	set_bit(CLEAN_LIST_BUSY_BIT, flag);
	ret = xlist_del_head(&pool->clean_list);
	if (ret)
		ibmr = list_entry(ret, struct rds_ib_mr, xlist);

	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
	preempt_enable();
	return ibmr;
}

static inline void wait_clean_list_grace(void)
{
	int cpu;
	unsigned long *flag;

	for_each_online_cpu(cpu) {
		flag = &per_cpu(clean_list_grace, cpu);
		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
			cpu_relax();
	}
}
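
/*
 * clean_list_grace is a small hand-rolled grace period: rds_ib_reuse_fmr()
 * sets its CPU's CLEAN_LIST_BUSY_BIT (with preemption disabled) while it
 * pops from clean_list, and wait_clean_list_grace() spins until no CPU has
 * the bit set. rds_ib_flush_mr_pool() waits for this grace period before
 * splicing MRs back onto clean_list, so an entry is never re-added while
 * another CPU is still spinning on it inside xlist_del_head().
 */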

static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
{
	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
	struct rds_ib_mr *ibmr = NULL;
	int err = 0, iter = 0;

	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
		schedule_delayed_work(&pool->flush_worker, 10);

	while (1) {
		ibmr = rds_ib_reuse_fmr(pool);
		if (ibmr)
			return ibmr;

		/* No clean MRs - now we have the choice of either
		 * allocating a fresh MR up to the limit imposed by the
		 * driver, or flushing any dirty unused MRs.
		 * We try to avoid stalling in the send path if possible,
		 * so we allocate as long as we're allowed to.
		 *
		 * We're fussy with enforcing the FMR limit, though. If the driver
		 * tells us we can't use more than N fmrs, we shouldn't start
		 * arguing with it */
		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
			break;

		atomic_dec(&pool->item_count);

		if (++iter > 2) {
			rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
			return ERR_PTR(-EAGAIN);
		}

		/* We do have some empty MRs. Flush them out. */
		rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
		rds_ib_flush_mr_pool(pool, 0, &ibmr);
		if (ibmr)
			return ibmr;
	}

	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
	if (!ibmr) {
		err = -ENOMEM;
		goto out_no_cigar;
	}

	memset(ibmr, 0, sizeof(*ibmr));

	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
			(IB_ACCESS_LOCAL_WRITE |
			 IB_ACCESS_REMOTE_READ |
			 IB_ACCESS_REMOTE_WRITE |
			 IB_ACCESS_REMOTE_ATOMIC),
			&pool->fmr_attr);
	if (IS_ERR(ibmr->fmr)) {
		err = PTR_ERR(ibmr->fmr);
		ibmr->fmr = NULL;
		printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
		goto out_no_cigar;
	}

	rds_ib_stats_inc(s_ib_rdma_mr_alloc);
	return ibmr;

out_no_cigar:
	if (ibmr) {
		if (ibmr->fmr)
			ib_dealloc_fmr(ibmr->fmr);
		kfree(ibmr);
	}
	atomic_dec(&pool->item_count);
	return ERR_PTR(err);
}
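
/*
 * To summarize the loop above: rds_ib_alloc_fmr() first tries to reuse a
 * clean MR, then allocates a new FMR as long as item_count stays within
 * max_items, and otherwise flushes the pool and retries, giving up with
 * -EAGAIN after two flush attempts so the send path does not spin forever
 * against a depleted pool.
 */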

static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
	       struct scatterlist *sg, unsigned int nents)
{
	struct ib_device *dev = rds_ibdev->dev;
	struct scatterlist *scat = sg;
	u64 io_addr = 0;
	u64 *dma_pages;
	u32 len;
	int page_cnt, sg_dma_len;
	int i, j;
	int ret;

	sg_dma_len = ib_dma_map_sg(dev, sg, nents,
				   DMA_BIDIRECTIONAL);
	if (unlikely(!sg_dma_len)) {
		printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
		return -EBUSY;
	}

	len = 0;
	page_cnt = 0;

	for (i = 0; i < sg_dma_len; ++i) {
		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);

		if (dma_addr & ~PAGE_MASK) {
			if (i > 0)
				return -EINVAL;
			else
				++page_cnt;
		}
		if ((dma_addr + dma_len) & ~PAGE_MASK) {
			if (i < sg_dma_len - 1)
				return -EINVAL;
			else
				++page_cnt;
		}

		len += dma_len;
	}

	page_cnt += len >> PAGE_SHIFT;
	if (page_cnt > fmr_message_size)
		return -EINVAL;

	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
				 rdsibdev_to_node(rds_ibdev));
	if (!dma_pages)
		return -ENOMEM;

	page_cnt = 0;
	for (i = 0; i < sg_dma_len; ++i) {
		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);

		for (j = 0; j < dma_len; j += PAGE_SIZE)
			dma_pages[page_cnt++] =
				(dma_addr & PAGE_MASK) + j;
	}

	ret = ib_map_phys_fmr(ibmr->fmr,
			dma_pages, page_cnt, io_addr);
	if (ret)
		goto out;

	/* Success: the MR was remapped, so we can safely tear down
	 * the old mapping. */
	rds_ib_teardown_mr(ibmr);

	ibmr->sg = scat;
	ibmr->sg_len = nents;
	ibmr->sg_dma_len = sg_dma_len;
	ibmr->remap_count++;

	rds_ib_stats_inc(s_ib_rdma_mr_used);
	ret = 0;

out:
	kfree(dma_pages);

	return ret;
}
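
/*
 * The alignment checks above encode the FMR constraint that only whole
 * pages can be mapped: only the first scatterlist entry may start mid-page
 * and only the last may end mid-page. For example (assuming 4K pages,
 * illustrative addresses), DMA ranges [0x1400..0x2000), [0x2000..0x4000)
 * and [0x4000..0x4800) are acceptable, while an entry that starts or ends
 * mid-page anywhere in the middle makes rds_ib_map_fmr() return -EINVAL.
 */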

void rds_ib_sync_mr(void *trans_private, int direction)
{
	struct rds_ib_mr *ibmr = trans_private;
	struct rds_ib_device *rds_ibdev = ibmr->device;

	switch (direction) {
	case DMA_FROM_DEVICE:
		ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
			ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
		break;
	case DMA_TO_DEVICE:
		ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
			ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
		break;
	}
}

static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
	struct rds_ib_device *rds_ibdev = ibmr->device;

	if (ibmr->sg_dma_len) {
		ib_dma_unmap_sg(rds_ibdev->dev,
				ibmr->sg, ibmr->sg_len,
				DMA_BIDIRECTIONAL);
		ibmr->sg_dma_len = 0;
	}

	/* Release the s/g list */
	if (ibmr->sg_len) {
		unsigned int i;

		for (i = 0; i < ibmr->sg_len; ++i) {
			struct page *page = sg_page(&ibmr->sg[i]);

			/* FIXME we need a way to tell a r/w MR
			 * from a r/o MR */
			BUG_ON(irqs_disabled());
			set_page_dirty(page);
			put_page(page);
		}
		kfree(ibmr->sg);

		ibmr->sg = NULL;
		ibmr->sg_len = 0;
	}
}

static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
	unsigned int pinned = ibmr->sg_len;

	__rds_ib_teardown_mr(ibmr);
	if (pinned) {
		struct rds_ib_device *rds_ibdev = ibmr->device;
		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

		atomic_sub(pinned, &pool->free_pinned);
	}
}

static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
{
	unsigned int item_count;

	item_count = atomic_read(&pool->item_count);
	if (free_all)
		return item_count;

	return 0;
}

/*
 * given an xlist of mrs, put them all into the list_head for more processing
 */
static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
{
	struct rds_ib_mr *ibmr;
	struct xlist_head splice;
	struct xlist_head *cur;
	struct xlist_head *next;

	splice.next = NULL;
	xlist_splice(xlist, &splice);
	cur = splice.next;
	while (cur) {
		next = cur->next;
		ibmr = list_entry(cur, struct rds_ib_mr, xlist);
		list_add_tail(&ibmr->unmap_list, list);
		cur = next;
	}
}

/*
 * this takes a list_head of mrs and chains them into a single xlist
 * that is ready for reuse.
 */
static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
				 struct list_head *list, struct xlist_head *xlist,
				 struct xlist_head **tail_ret)
{
	struct rds_ib_mr *ibmr;
	struct xlist_head *cur_mr = xlist;
	struct xlist_head *tail_mr = NULL;

	list_for_each_entry(ibmr, list, unmap_list) {
		tail_mr = &ibmr->xlist;
		tail_mr->next = NULL;
		cur_mr->next = tail_mr;
		cur_mr = tail_mr;
	}
	*tail_ret = tail_mr;
}

/*
 * Flush our pool of MRs.
 * At a minimum, all currently unused MRs are unmapped.
 * If the number of MRs allocated exceeds the limit, we also try
 * to free as many MRs as needed to get back to this limit.
 */
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
				int free_all, struct rds_ib_mr **ibmr_ret)
{
	struct rds_ib_mr *ibmr, *next;
	struct xlist_head clean_xlist;
	struct xlist_head *clean_tail;
	LIST_HEAD(unmap_list);
	LIST_HEAD(fmr_list);
	unsigned long unpinned = 0;
	unsigned int nfreed = 0, ncleaned = 0, free_goal;
	int ret = 0;

	rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);

	if (ibmr_ret) {
		DEFINE_WAIT(wait);
		while (!mutex_trylock(&pool->flush_lock)) {
			ibmr = rds_ib_reuse_fmr(pool);
			if (ibmr) {
				*ibmr_ret = ibmr;
				finish_wait(&pool->flush_wait, &wait);
				goto out_nolock;
			}

			prepare_to_wait(&pool->flush_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (xlist_empty(&pool->clean_list))
				schedule();

			ibmr = rds_ib_reuse_fmr(pool);
			if (ibmr) {
				*ibmr_ret = ibmr;
				finish_wait(&pool->flush_wait, &wait);
				goto out_nolock;
			}
		}
		finish_wait(&pool->flush_wait, &wait);
	} else
		mutex_lock(&pool->flush_lock);

	if (ibmr_ret) {
		ibmr = rds_ib_reuse_fmr(pool);
		if (ibmr) {
			*ibmr_ret = ibmr;
			goto out;
		}
	}

	/* Get the list of all MRs to be dropped. Ordering matters -
	 * we want to put drop_list ahead of free_list.
	 */
	xlist_append_to_list(&pool->drop_list, &unmap_list);
	xlist_append_to_list(&pool->free_list, &unmap_list);
	if (free_all)
		xlist_append_to_list(&pool->clean_list, &unmap_list);

	free_goal = rds_ib_flush_goal(pool, free_all);

	if (list_empty(&unmap_list))
		goto out;

	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
	list_for_each_entry(ibmr, &unmap_list, unmap_list)
		list_add(&ibmr->fmr->list, &fmr_list);

	ret = ib_unmap_fmr(&fmr_list);
	if (ret)
		printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);

	/* Now we can destroy the DMA mapping and unpin any pages */
	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
		unpinned += ibmr->sg_len;
		__rds_ib_teardown_mr(ibmr);
		if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
			rds_ib_stats_inc(s_ib_rdma_mr_free);
			list_del(&ibmr->unmap_list);
			ib_dealloc_fmr(ibmr->fmr);
			kfree(ibmr);
			nfreed++;
		}
		ncleaned++;
	}

	if (!list_empty(&unmap_list)) {
		/* we have to make sure that none of the things we're about
		 * to put on the clean list would race with other cpus trying
		 * to pull items off. The xlist would explode if we managed to
		 * remove something from the clean list and then add it back again
		 * while another CPU was spinning on that same item in xlist_del_head.
		 *
		 * This is pretty unlikely, but just in case wait for an xlist grace period
		 * here before adding anything back into the clean list.
		 */
		wait_clean_list_grace();

		list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
		if (ibmr_ret)
			refill_local(pool, &clean_xlist, ibmr_ret);

		/* refill_local may have emptied our list */
		if (!xlist_empty(&clean_xlist))
			xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
	}

	atomic_sub(unpinned, &pool->free_pinned);
	atomic_sub(ncleaned, &pool->dirty_count);
	atomic_sub(nfreed, &pool->item_count);

out:
	mutex_unlock(&pool->flush_lock);
	if (waitqueue_active(&pool->flush_wait))
		wake_up(&pool->flush_wait);
out_nolock:
	return ret;
}
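
/*
 * rds_ib_flush_mr_pool() is called in three ways in this file:
 *
 *   rds_ib_flush_mr_pool(pool, 0, &ibmr);  - allocation path, wants an MR back
 *   rds_ib_flush_mr_pool(pool, 0, NULL);   - flush worker, rds_ib_free_mr()
 *                                            and rds_ib_flush_mrs()
 *   rds_ib_flush_mr_pool(pool, 1, NULL);   - pool teardown, free everything
 *
 * Only the first form is willing to sleep on flush_wait while another
 * flush holds flush_lock.
 */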

static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);

	rds_ib_flush_mr_pool(pool, 0, NULL);
}

void rds_ib_free_mr(void *trans_private, int invalidate)
{
	struct rds_ib_mr *ibmr = trans_private;
	struct rds_ib_device *rds_ibdev = ibmr->device;
	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);

	/* Return it to the pool's free list */
	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
	else
		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);

	atomic_add(ibmr->sg_len, &pool->free_pinned);
	atomic_inc(&pool->dirty_count);

	/* If we've pinned too many pages, request a flush */
	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
		schedule_delayed_work(&pool->flush_worker, 10);

	if (invalidate) {
		if (likely(!in_interrupt())) {
			rds_ib_flush_mr_pool(pool, 0, NULL);
		} else {
			/* We get here if the user created an MR marked
			 * as use_once and invalidate at the same time. */
			schedule_delayed_work(&pool->flush_worker, 10);
		}
	}

	rds_ib_dev_put(rds_ibdev);
}
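
/*
 * Flush triggering in rds_ib_free_mr(), by way of example (using the
 * illustrative max_fmrs == 8192 and fmr_message_size == 256 from the
 * pool-sizing note above): a deferred flush is scheduled once free_pinned
 * reaches max_free_pinned (524288 pages in that example) or dirty_count
 * reaches max_items / 10 (819 MRs); the 10-jiffy delay lets nearby frees
 * batch into a single flush.
 */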

void rds_ib_flush_mrs(void)
{
	struct rds_ib_device *rds_ibdev;

	down_read(&rds_ib_devices_lock);
	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

		if (pool)
			rds_ib_flush_mr_pool(pool, 0, NULL);
	}
	up_read(&rds_ib_devices_lock);
}

void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
		    struct rds_sock *rs, u32 *key_ret)
{
	struct rds_ib_device *rds_ibdev;
	struct rds_ib_mr *ibmr = NULL;
	int ret;

	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
	if (!rds_ibdev) {
		ret = -ENODEV;
		goto out;
	}

	if (!rds_ibdev->mr_pool) {
		ret = -ENODEV;
		goto out;
	}

	ibmr = rds_ib_alloc_fmr(rds_ibdev);
	if (IS_ERR(ibmr))
		return ibmr;

	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
	if (ret == 0)
		*key_ret = ibmr->fmr->rkey;
	else
		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);

	ibmr->device = rds_ibdev;
	rds_ibdev = NULL;

out:
	if (ret) {
		if (ibmr)
			rds_ib_free_mr(ibmr, 0);
		ibmr = ERR_PTR(ret);
	}
	if (rds_ibdev)
		rds_ib_dev_put(rds_ibdev);
	return ibmr;
}