Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/drivers/block/brd.c
15109 views
1
/*
2
* Ram backed block device driver.
3
*
4
* Copyright (C) 2007 Nick Piggin
5
* Copyright (C) 2007 Novell Inc.
6
*
7
* Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
8
* of their respective owners.
9
*/
10
11
#include <linux/init.h>
12
#include <linux/module.h>
13
#include <linux/moduleparam.h>
14
#include <linux/major.h>
15
#include <linux/blkdev.h>
16
#include <linux/bio.h>
17
#include <linux/highmem.h>
18
#include <linux/mutex.h>
19
#include <linux/radix-tree.h>
20
#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
21
#include <linux/slab.h>
22
23
#include <asm/uaccess.h>
24
25
#define SECTOR_SHIFT 9
26
#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
27
#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
28
29
/*
30
* Each block ramdisk device has a radix_tree brd_pages of pages that stores
31
* the pages containing the block device's contents. A brd page's ->index is
32
* its offset in PAGE_SIZE units. This is similar to, but in no way connected
33
* with, the kernel's pagecache or buffer cache (which sit above our block
34
* device).
35
*/
36
struct brd_device {
37
int brd_number;
38
39
struct request_queue *brd_queue;
40
struct gendisk *brd_disk;
41
struct list_head brd_list;
42
43
/*
44
* Backing store of pages and lock to protect it. This is the contents
45
* of the block device.
46
*/
47
spinlock_t brd_lock;
48
struct radix_tree_root brd_pages;
49
};
50
51
/*
52
* Look up and return a brd's page for a given sector.
53
*/
54
static DEFINE_MUTEX(brd_mutex);
55
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
56
{
57
pgoff_t idx;
58
struct page *page;
59
60
/*
61
* The page lifetime is protected by the fact that we have opened the
62
* device node -- brd pages will never be deleted under us, so we
63
* don't need any further locking or refcounting.
64
*
65
* This is strictly true for the radix-tree nodes as well (ie. we
66
* don't actually need the rcu_read_lock()), however that is not a
67
* documented feature of the radix-tree API so it is better to be
68
* safe here (we don't have total exclusion from radix tree updates
69
* here, only deletes).
70
*/
71
rcu_read_lock();
72
idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
73
page = radix_tree_lookup(&brd->brd_pages, idx);
74
rcu_read_unlock();
75
76
BUG_ON(page && page->index != idx);
77
78
return page;
79
}
80
81
/*
82
* Look up and return a brd's page for a given sector.
83
* If one does not exist, allocate an empty page, and insert that. Then
84
* return it.
85
*/
86
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
87
{
88
pgoff_t idx;
89
struct page *page;
90
gfp_t gfp_flags;
91
92
page = brd_lookup_page(brd, sector);
93
if (page)
94
return page;
95
96
/*
97
* Must use NOIO because we don't want to recurse back into the
98
* block or filesystem layers from page reclaim.
99
*
100
* Cannot support XIP and highmem, because our ->direct_access
101
* routine for XIP must return memory that is always addressable.
102
* If XIP was reworked to use pfns and kmap throughout, this
103
* restriction might be able to be lifted.
104
*/
105
gfp_flags = GFP_NOIO | __GFP_ZERO;
106
#ifndef CONFIG_BLK_DEV_XIP
107
gfp_flags |= __GFP_HIGHMEM;
108
#endif
109
page = alloc_page(gfp_flags);
110
if (!page)
111
return NULL;
112
113
if (radix_tree_preload(GFP_NOIO)) {
114
__free_page(page);
115
return NULL;
116
}
117
118
spin_lock(&brd->brd_lock);
119
idx = sector >> PAGE_SECTORS_SHIFT;
120
if (radix_tree_insert(&brd->brd_pages, idx, page)) {
121
__free_page(page);
122
page = radix_tree_lookup(&brd->brd_pages, idx);
123
BUG_ON(!page);
124
BUG_ON(page->index != idx);
125
} else
126
page->index = idx;
127
spin_unlock(&brd->brd_lock);
128
129
radix_tree_preload_end();
130
131
return page;
132
}
133
134
/*
 * Drop the backing page that holds @sector: remove it from the radix tree
 * under brd_lock and free it. A missing page is silently ignored.
 */
static void brd_free_page(struct brd_device *brd, sector_t sector)
{
	pgoff_t idx = sector >> PAGE_SECTORS_SHIFT;
	struct page *victim;

	spin_lock(&brd->brd_lock);
	victim = radix_tree_delete(&brd->brd_pages, idx);
	spin_unlock(&brd->brd_lock);

	if (!victim)
		return;

	__free_page(victim);
}
146
147
/*
 * Zero the whole backing page that holds @sector, if such a page exists.
 * The page stays in the radix tree.
 */
static void brd_zero_page(struct brd_device *brd, sector_t sector)
{
	struct page *page = brd_lookup_page(brd, sector);

	if (!page)
		return;

	clear_highpage(page);
}
155
156
/*
157
* Free all backing store pages and radix tree. This must only be called when
158
* there are no other users of the device.
159
*/
160
#define FREE_BATCH 16
161
static void brd_free_pages(struct brd_device *brd)
162
{
163
unsigned long pos = 0;
164
struct page *pages[FREE_BATCH];
165
int nr_pages;
166
167
do {
168
int i;
169
170
nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
171
(void **)pages, pos, FREE_BATCH);
172
173
for (i = 0; i < nr_pages; i++) {
174
void *ret;
175
176
BUG_ON(pages[i]->index < pos);
177
pos = pages[i]->index;
178
ret = radix_tree_delete(&brd->brd_pages, pos);
179
BUG_ON(!ret || ret != pages[i]);
180
__free_page(pages[i]);
181
}
182
183
pos++;
184
185
/*
186
* This assumes radix_tree_gang_lookup always returns as
187
* many pages as possible. If the radix-tree code changes,
188
* so will this have to.
189
*/
190
} while (nr_pages == FREE_BATCH);
191
}
192
193
/*
194
* copy_to_brd_setup must be called before copy_to_brd. It may sleep.
195
*/
196
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
197
{
198
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
199
size_t copy;
200
201
copy = min_t(size_t, n, PAGE_SIZE - offset);
202
if (!brd_insert_page(brd, sector))
203
return -ENOMEM;
204
if (copy < n) {
205
sector += copy >> SECTOR_SHIFT;
206
if (!brd_insert_page(brd, sector))
207
return -ENOMEM;
208
}
209
return 0;
210
}
211
212
static void discard_from_brd(struct brd_device *brd,
213
sector_t sector, size_t n)
214
{
215
while (n >= PAGE_SIZE) {
216
/*
217
* Don't want to actually discard pages here because
218
* re-allocating the pages can result in writeback
219
* deadlocks under heavy load.
220
*/
221
if (0)
222
brd_free_page(brd, sector);
223
else
224
brd_zero_page(brd, sector);
225
sector += PAGE_SIZE >> SECTOR_SHIFT;
226
n -= PAGE_SIZE;
227
}
228
}
229
230
/*
231
* Copy n bytes from src to the brd starting at sector. Does not sleep.
232
*/
233
static void copy_to_brd(struct brd_device *brd, const void *src,
234
sector_t sector, size_t n)
235
{
236
struct page *page;
237
void *dst;
238
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
239
size_t copy;
240
241
copy = min_t(size_t, n, PAGE_SIZE - offset);
242
page = brd_lookup_page(brd, sector);
243
BUG_ON(!page);
244
245
dst = kmap_atomic(page, KM_USER1);
246
memcpy(dst + offset, src, copy);
247
kunmap_atomic(dst, KM_USER1);
248
249
if (copy < n) {
250
src += copy;
251
sector += copy >> SECTOR_SHIFT;
252
copy = n - copy;
253
page = brd_lookup_page(brd, sector);
254
BUG_ON(!page);
255
256
dst = kmap_atomic(page, KM_USER1);
257
memcpy(dst, src, copy);
258
kunmap_atomic(dst, KM_USER1);
259
}
260
}
261
262
/*
263
* Copy n bytes to dst from the brd starting at sector. Does not sleep.
264
*/
265
static void copy_from_brd(void *dst, struct brd_device *brd,
266
sector_t sector, size_t n)
267
{
268
struct page *page;
269
void *src;
270
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
271
size_t copy;
272
273
copy = min_t(size_t, n, PAGE_SIZE - offset);
274
page = brd_lookup_page(brd, sector);
275
if (page) {
276
src = kmap_atomic(page, KM_USER1);
277
memcpy(dst, src + offset, copy);
278
kunmap_atomic(src, KM_USER1);
279
} else
280
memset(dst, 0, copy);
281
282
if (copy < n) {
283
dst += copy;
284
sector += copy >> SECTOR_SHIFT;
285
copy = n - copy;
286
page = brd_lookup_page(brd, sector);
287
if (page) {
288
src = kmap_atomic(page, KM_USER1);
289
memcpy(dst, src, copy);
290
kunmap_atomic(src, KM_USER1);
291
} else
292
memset(dst, 0, copy);
293
}
294
}
295
296
/*
297
* Process a single bvec of a bio.
298
*/
299
static int brd_do_bvec(struct brd_device *brd, struct page *page,
300
unsigned int len, unsigned int off, int rw,
301
sector_t sector)
302
{
303
void *mem;
304
int err = 0;
305
306
if (rw != READ) {
307
err = copy_to_brd_setup(brd, sector, len);
308
if (err)
309
goto out;
310
}
311
312
mem = kmap_atomic(page, KM_USER0);
313
if (rw == READ) {
314
copy_from_brd(mem + off, brd, sector, len);
315
flush_dcache_page(page);
316
} else {
317
flush_dcache_page(page);
318
copy_to_brd(brd, mem + off, sector, len);
319
}
320
kunmap_atomic(mem, KM_USER0);
321
322
out:
323
return err;
324
}
325
326
static int brd_make_request(struct request_queue *q, struct bio *bio)
327
{
328
struct block_device *bdev = bio->bi_bdev;
329
struct brd_device *brd = bdev->bd_disk->private_data;
330
int rw;
331
struct bio_vec *bvec;
332
sector_t sector;
333
int i;
334
int err = -EIO;
335
336
sector = bio->bi_sector;
337
if (sector + (bio->bi_size >> SECTOR_SHIFT) >
338
get_capacity(bdev->bd_disk))
339
goto out;
340
341
if (unlikely(bio->bi_rw & REQ_DISCARD)) {
342
err = 0;
343
discard_from_brd(brd, sector, bio->bi_size);
344
goto out;
345
}
346
347
rw = bio_rw(bio);
348
if (rw == READA)
349
rw = READ;
350
351
bio_for_each_segment(bvec, bio, i) {
352
unsigned int len = bvec->bv_len;
353
err = brd_do_bvec(brd, bvec->bv_page, len,
354
bvec->bv_offset, rw, sector);
355
if (err)
356
break;
357
sector += len >> SECTOR_SHIFT;
358
}
359
360
out:
361
bio_endio(bio, err);
362
363
return 0;
364
}
365
366
#ifdef CONFIG_BLK_DEV_XIP
/*
 * ->direct_access method: hand out the kernel address and pfn of the backing
 * page for @sector, allocating the page if it does not exist yet.
 *
 * Returns 0 on success, -ENODEV if the disk has no brd attached, -EINVAL if
 * @sector is not page aligned, -ERANGE if the page would extend past the
 * device capacity, and -ENOMEM if the page cannot be allocated.
 */
static int brd_direct_access(struct block_device *bdev, sector_t sector,
			void **kaddr, unsigned long *pfn)
{
	struct brd_device *brd = bdev->bd_disk->private_data;
	struct page *page;

	if (!brd)
		return -ENODEV;

	if (sector & (PAGE_SECTORS-1))
		return -EINVAL;

	if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk))
		return -ERANGE;

	page = brd_insert_page(brd, sector);
	if (!page)
		return -ENOMEM;

	*kaddr = page_address(page);
	*pfn = page_to_pfn(page);

	return 0;
}
#endif
388
389
/*
 * ioctl handler. Only BLKFLSBUF is handled; every other command gets -ENOTTY.
 *
 * Returns 0 after the ramdisk data has been released, or -EBUSY when the
 * block device has more than one opener.
 */
static int brd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct brd_device *brd = bdev->bd_disk->private_data;
	int error;

	if (cmd != BLKFLSBUF)
		return -ENOTTY;

	/*
	 * For a ram device BLKFLSBUF has special semantics: the ramdisk
	 * data really is released and destroyed.
	 */
	mutex_lock(&brd_mutex);
	mutex_lock(&bdev->bd_mutex);
	if (bdev->bd_openers > 1) {
		error = -EBUSY;
	} else {
		/*
		 * Invalidate the cache first so that it isn't written back
		 * to the device.
		 *
		 * Another thread might instantiate more buffercache at this
		 * point, but there is not much we can do to close that race.
		 */
		invalidate_bh_lrus();
		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
		brd_free_pages(brd);
		error = 0;
	}
	mutex_unlock(&bdev->bd_mutex);
	mutex_unlock(&brd_mutex);

	return error;
}
423
424
static const struct block_device_operations brd_fops = {
425
.owner = THIS_MODULE,
426
.ioctl = brd_ioctl,
427
#ifdef CONFIG_BLK_DEV_XIP
428
.direct_access = brd_direct_access,
429
#endif
430
};
431
432
/*
433
* And now the modules code and kernel interface.
434
*/
435
static int rd_nr;
436
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
437
static int max_part;
438
static int part_shift;
439
module_param(rd_nr, int, S_IRUGO);
440
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
441
module_param(rd_size, int, S_IRUGO);
442
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
443
module_param(max_part, int, S_IRUGO);
444
MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
445
MODULE_LICENSE("GPL");
446
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
447
MODULE_ALIAS("rd");
448
449
#ifndef MODULE
450
/* Legacy boot options - nonmodular */
451
static int __init ramdisk_size(char *str)
452
{
453
rd_size = simple_strtol(str, NULL, 0);
454
return 1;
455
}
456
__setup("ramdisk_size=", ramdisk_size);
457
#endif
458
459
/*
 * The device scheme is derived from loop.c. Keep the two in sync where
 * possible (they should share code eventually).
 *
 * brd_devices lists every brd_device through its brd_list member;
 * brd_devices_mutex is held by brd_probe() around on-demand instantiation.
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);
465
466
static struct brd_device *brd_alloc(int i)
467
{
468
struct brd_device *brd;
469
struct gendisk *disk;
470
471
brd = kzalloc(sizeof(*brd), GFP_KERNEL);
472
if (!brd)
473
goto out;
474
brd->brd_number = i;
475
spin_lock_init(&brd->brd_lock);
476
INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
477
478
brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
479
if (!brd->brd_queue)
480
goto out_free_dev;
481
blk_queue_make_request(brd->brd_queue, brd_make_request);
482
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
483
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
484
485
brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
486
brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
487
brd->brd_queue->limits.discard_zeroes_data = 1;
488
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
489
490
disk = brd->brd_disk = alloc_disk(1 << part_shift);
491
if (!disk)
492
goto out_free_queue;
493
disk->major = RAMDISK_MAJOR;
494
disk->first_minor = i << part_shift;
495
disk->fops = &brd_fops;
496
disk->private_data = brd;
497
disk->queue = brd->brd_queue;
498
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
499
sprintf(disk->disk_name, "ram%d", i);
500
set_capacity(disk, rd_size * 2);
501
502
return brd;
503
504
out_free_queue:
505
blk_cleanup_queue(brd->brd_queue);
506
out_free_dev:
507
kfree(brd);
508
out:
509
return NULL;
510
}
511
512
static void brd_free(struct brd_device *brd)
513
{
514
put_disk(brd->brd_disk);
515
blk_cleanup_queue(brd->brd_queue);
516
brd_free_pages(brd);
517
kfree(brd);
518
}
519
520
static struct brd_device *brd_init_one(int i)
521
{
522
struct brd_device *brd;
523
524
list_for_each_entry(brd, &brd_devices, brd_list) {
525
if (brd->brd_number == i)
526
goto out;
527
}
528
529
brd = brd_alloc(i);
530
if (brd) {
531
add_disk(brd->brd_disk);
532
list_add_tail(&brd->brd_list, &brd_devices);
533
}
534
out:
535
return brd;
536
}
537
538
static void brd_del_one(struct brd_device *brd)
539
{
540
list_del(&brd->brd_list);
541
del_gendisk(brd->brd_disk);
542
brd_free(brd);
543
}
544
545
/*
 * Region probe callback registered by brd_init(): look up, or create on
 * demand, the device that owns minor MINOR(@dev), under brd_devices_mutex.
 *
 * Always stores 0 in *@part. Returns the result of get_disk() on the
 * device's disk, or ERR_PTR(-ENOMEM) if the device cannot be created.
 */
static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
	struct brd_device *brd;
	struct kobject *kobj;

	mutex_lock(&brd_devices_mutex);
	brd = brd_init_one(MINOR(dev) >> part_shift);
	if (brd)
		kobj = get_disk(brd->brd_disk);
	else
		kobj = ERR_PTR(-ENOMEM);
	mutex_unlock(&brd_devices_mutex);

	*part = 0;
	return kobj;
}
558
559
/*
 * Module init: validate the parameters, register the ramdisk major, create
 * the initial devices and register the minor region for on-demand probing.
 *
 * Returns 0 on success, -EINVAL for out-of-range max_part/rd_nr, -EIO if the
 * major cannot be registered and -ENOMEM if a device cannot be allocated.
 */
static int __init brd_init(void)
{
	struct brd_device *brd, *next;
	unsigned long range;
	int i, nr;

	/*
	 * The brd module can instantiate the underlying device structure
	 * on demand, as long as a device node exists to access it. User
	 * space tools that do not know about this "feature" would not cope
	 * well with it, so to avoid breaking existing tools:
	 *
	 * (1) if rd_nr is specified, create that many devices up front;
	 *     this also becomes a hard limit.
	 * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
	 *     (default 16) rd devices on module load; the user can extend
	 *     this by creating further device nodes, for which the kernel
	 *     instantiates the actual device on demand.
	 */

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * max_part is exported to user space, so adjust it to what
		 * part_shift really provides; that lets the user work out
		 * the correct minor number when creating more devices.
		 *
		 * The -1 is needed because partition 0 is reserved for the
		 * whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS ||
	    rd_nr > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	if (!rd_nr) {
		nr = CONFIG_BLK_DEV_RAM_COUNT;
		range = 1UL << MINORBITS;
	} else {
		nr = rd_nr;
		range = rd_nr << part_shift;
	}

	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
		return -EIO;

	for (i = 0; i < nr; i++) {
		brd = brd_alloc(i);
		if (!brd)
			goto err_free_devices;
		list_add_tail(&brd->brd_list, &brd_devices);
	}

	/* point of no return */

	list_for_each_entry(brd, &brd_devices, brd_list)
		add_disk(brd->brd_disk);

	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
				  THIS_MODULE, brd_probe, NULL, NULL);

	printk(KERN_INFO "brd: module loaded\n");
	return 0;

err_free_devices:
	/* No disk has been added yet, so brd_free() is sufficient. */
	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
		list_del(&brd->brd_list);
		brd_free(brd);
	}
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

	return -ENOMEM;
}
639
640
/*
 * Module exit: delete every device on the brd_devices list, then unregister
 * the minor region and the ramdisk major. The region size is computed the
 * same way as in brd_init().
 */
static void __exit brd_exit(void)
{
	struct brd_device *brd, *next;
	unsigned long range;

	if (rd_nr)
		range = rd_nr << part_shift;
	else
		range = 1UL << MINORBITS;

	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
		brd_del_one(brd);

	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
}

module_init(brd_init);
module_exit(brd_exit);
656
657
658