GitHub Repository: awilliam/linux-vfio
Path: blob/master/fs/block_dev.c
1
/*
2
* linux/fs/block_dev.c
3
*
4
* Copyright (C) 1991, 1992 Linus Torvalds
5
* Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
6
*/
7
8
#include <linux/init.h>
9
#include <linux/mm.h>
10
#include <linux/fcntl.h>
11
#include <linux/slab.h>
12
#include <linux/kmod.h>
13
#include <linux/major.h>
14
#include <linux/device_cgroup.h>
15
#include <linux/highmem.h>
16
#include <linux/blkdev.h>
17
#include <linux/module.h>
18
#include <linux/blkpg.h>
19
#include <linux/buffer_head.h>
20
#include <linux/pagevec.h>
21
#include <linux/writeback.h>
22
#include <linux/mpage.h>
23
#include <linux/mount.h>
24
#include <linux/uio.h>
25
#include <linux/namei.h>
26
#include <linux/log2.h>
27
#include <linux/kmemleak.h>
28
#include <asm/uaccess.h>
29
#include "internal.h"
30
31
struct bdev_inode {
32
struct block_device bdev;
33
struct inode vfs_inode;
34
};
35
36
static const struct address_space_operations def_blk_aops;
37
38
static inline struct bdev_inode *BDEV_I(struct inode *inode)
39
{
40
return container_of(inode, struct bdev_inode, vfs_inode);
41
}
42
43
inline struct block_device *I_BDEV(struct inode *inode)
44
{
45
return &BDEV_I(inode)->bdev;
46
}
47
48
EXPORT_SYMBOL(I_BDEV);
49
50
/*
51
* move the inode from its current bdi to a new bdi. if the inode is dirty
52
* we need to move it onto the dirty list of @dst so that the inode is always
53
* on the right list.
54
*/
55
static void bdev_inode_switch_bdi(struct inode *inode,
56
struct backing_dev_info *dst)
57
{
58
spin_lock(&inode_wb_list_lock);
59
spin_lock(&inode->i_lock);
60
inode->i_data.backing_dev_info = dst;
61
if (inode->i_state & I_DIRTY)
62
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
63
spin_unlock(&inode->i_lock);
64
spin_unlock(&inode_wb_list_lock);
65
}
66
67
static sector_t max_block(struct block_device *bdev)
68
{
69
sector_t retval = ~((sector_t)0);
70
loff_t sz = i_size_read(bdev->bd_inode);
71
72
if (sz) {
73
unsigned int size = block_size(bdev);
74
unsigned int sizebits = blksize_bits(size);
75
retval = (sz >> sizebits);
76
}
77
return retval;
78
}
79
80
/* Kill _all_ buffers and pagecache, dirty or not.. */
81
static void kill_bdev(struct block_device *bdev)
82
{
83
if (bdev->bd_inode->i_mapping->nrpages == 0)
84
return;
85
invalidate_bh_lrus();
86
truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
87
}
88
89
int set_blocksize(struct block_device *bdev, int size)
90
{
91
/* Size must be a power of two, and between 512 and PAGE_SIZE */
92
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
93
return -EINVAL;
94
95
/* Size cannot be smaller than the size supported by the device */
96
if (size < bdev_logical_block_size(bdev))
97
return -EINVAL;
98
99
/* Don't change the size if it is same as current */
100
if (bdev->bd_block_size != size) {
101
sync_blockdev(bdev);
102
bdev->bd_block_size = size;
103
bdev->bd_inode->i_blkbits = blksize_bits(size);
104
kill_bdev(bdev);
105
}
106
return 0;
107
}
108
109
EXPORT_SYMBOL(set_blocksize);
110
111
int sb_set_blocksize(struct super_block *sb, int size)
112
{
113
if (set_blocksize(sb->s_bdev, size))
114
return 0;
115
/* If we get here, we know size is power of two
116
* and its value is between 512 and PAGE_SIZE */
117
sb->s_blocksize = size;
118
sb->s_blocksize_bits = blksize_bits(size);
119
return sb->s_blocksize;
120
}
121
122
EXPORT_SYMBOL(sb_set_blocksize);
123
124
int sb_min_blocksize(struct super_block *sb, int size)
125
{
126
int minsize = bdev_logical_block_size(sb->s_bdev);
127
if (size < minsize)
128
size = minsize;
129
return sb_set_blocksize(sb, size);
130
}
131
132
EXPORT_SYMBOL(sb_min_blocksize);
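/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * fill_super() typically picks its block size with sb_min_blocksize() or
 * sb_set_blocksize() before reading on-disk metadata.  The "myfs" names
 * below are hypothetical.
 *
 *	static int myfs_fill_super(struct super_block *sb, void *data, int silent)
 *	{
 *		if (!sb_min_blocksize(sb, 1024))
 *			return -EINVAL;	/* device cannot do 1K blocks */
 *		...
 *	}
 */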
133
134
static int
135
blkdev_get_block(struct inode *inode, sector_t iblock,
136
struct buffer_head *bh, int create)
137
{
138
if (iblock >= max_block(I_BDEV(inode))) {
139
if (create)
140
return -EIO;
141
142
/*
143
* for reads, we're just trying to fill a partial page.
144
* return a hole, they will have to call get_block again
145
* before they can fill it, and they will get -EIO at that
146
* time
147
*/
148
return 0;
149
}
150
bh->b_bdev = I_BDEV(inode);
151
bh->b_blocknr = iblock;
152
set_buffer_mapped(bh);
153
return 0;
154
}
155
156
static int
157
blkdev_get_blocks(struct inode *inode, sector_t iblock,
158
struct buffer_head *bh, int create)
159
{
160
sector_t end_block = max_block(I_BDEV(inode));
161
unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
162
163
if ((iblock + max_blocks) > end_block) {
164
max_blocks = end_block - iblock;
165
if ((long)max_blocks <= 0) {
166
if (create)
167
return -EIO; /* write fully beyond EOF */
168
/*
169
* It is a read which is fully beyond EOF. We return
170
* a !buffer_mapped buffer
171
*/
172
max_blocks = 0;
173
}
174
}
175
176
bh->b_bdev = I_BDEV(inode);
177
bh->b_blocknr = iblock;
178
bh->b_size = max_blocks << inode->i_blkbits;
179
if (max_blocks)
180
set_buffer_mapped(bh);
181
return 0;
182
}
183
184
static ssize_t
185
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
186
loff_t offset, unsigned long nr_segs)
187
{
188
struct file *file = iocb->ki_filp;
189
struct inode *inode = file->f_mapping->host;
190
191
return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
192
nr_segs, blkdev_get_blocks, NULL, NULL, 0);
193
}
194
195
int __sync_blockdev(struct block_device *bdev, int wait)
196
{
197
if (!bdev)
198
return 0;
199
if (!wait)
200
return filemap_flush(bdev->bd_inode->i_mapping);
201
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
202
}
203
204
/*
205
* Write out and wait upon all the dirty data associated with a block
206
* device via its mapping. Does not take the superblock lock.
207
*/
208
int sync_blockdev(struct block_device *bdev)
209
{
210
return __sync_blockdev(bdev, 1);
211
}
212
EXPORT_SYMBOL(sync_blockdev);
213
214
/*
215
* Write out and wait upon all dirty data associated with this
216
* device. Filesystem data as well as the underlying block
217
* device. Takes the superblock lock.
218
*/
219
int fsync_bdev(struct block_device *bdev)
220
{
221
struct super_block *sb = get_super(bdev);
222
if (sb) {
223
int res = sync_filesystem(sb);
224
drop_super(sb);
225
return res;
226
}
227
return sync_blockdev(bdev);
228
}
229
EXPORT_SYMBOL(fsync_bdev);
230
231
/**
232
* freeze_bdev -- lock a filesystem and force it into a consistent state
233
* @bdev: blockdevice to lock
234
*
235
* If a superblock is found on this device, we take the s_umount semaphore
236
* on it to make sure nobody unmounts until the snapshot creation is done.
237
* The reference counter (bd_fsfreeze_count) guarantees that only the last
238
* unfreeze process can actually unfreeze the frozen filesystem when multiple
239
* freeze requests arrive simultaneously. It counts up in freeze_bdev() and
240
* counts down in thaw_bdev(). When it reaches 0, thaw_bdev() actually
241
* unfreezes the filesystem.
242
*/
243
struct super_block *freeze_bdev(struct block_device *bdev)
244
{
245
struct super_block *sb;
246
int error = 0;
247
248
mutex_lock(&bdev->bd_fsfreeze_mutex);
249
if (++bdev->bd_fsfreeze_count > 1) {
250
/*
251
* We don't even need to grab a reference - the first call
252
* to freeze_bdev grabs an active reference and only the last
253
* thaw_bdev drops it.
254
*/
255
sb = get_super(bdev);
256
drop_super(sb);
257
mutex_unlock(&bdev->bd_fsfreeze_mutex);
258
return sb;
259
}
260
261
sb = get_active_super(bdev);
262
if (!sb)
263
goto out;
264
error = freeze_super(sb);
265
if (error) {
266
deactivate_super(sb);
267
bdev->bd_fsfreeze_count--;
268
mutex_unlock(&bdev->bd_fsfreeze_mutex);
269
return ERR_PTR(error);
270
}
271
deactivate_super(sb);
272
out:
273
sync_blockdev(bdev);
274
mutex_unlock(&bdev->bd_fsfreeze_mutex);
275
return sb; /* thaw_bdev releases s->s_umount */
276
}
277
EXPORT_SYMBOL(freeze_bdev);
278
279
/**
280
* thaw_bdev -- unlock filesystem
281
* @bdev: blockdevice to unlock
282
* @sb: associated superblock
283
*
284
* Unlocks the filesystem and marks it writeable again after freeze_bdev().
285
*/
286
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
287
{
288
int error = -EINVAL;
289
290
mutex_lock(&bdev->bd_fsfreeze_mutex);
291
if (!bdev->bd_fsfreeze_count)
292
goto out;
293
294
error = 0;
295
if (--bdev->bd_fsfreeze_count > 0)
296
goto out;
297
298
if (!sb)
299
goto out;
300
301
error = thaw_super(sb);
302
if (error) {
303
bdev->bd_fsfreeze_count++;
304
mutex_unlock(&bdev->bd_fsfreeze_mutex);
305
return error;
306
}
307
out:
308
mutex_unlock(&bdev->bd_fsfreeze_mutex);
309
return 0;
310
}
311
EXPORT_SYMBOL(thaw_bdev);
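/*
 * Illustrative sketch (not part of the original file): a snapshot or
 * backup driver would typically bracket its work with freeze_bdev() and
 * thaw_bdev() so the filesystem on the device stays consistent.
 *
 *	sb = freeze_bdev(bdev);
 *	if (IS_ERR(sb))
 *		return PTR_ERR(sb);
 *	// ... take the snapshot while writes are blocked ...
 *	thaw_bdev(bdev, sb);	// sb may be NULL if nothing was mounted
 */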
312
313
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
314
{
315
return block_write_full_page(page, blkdev_get_block, wbc);
316
}
317
318
static int blkdev_readpage(struct file * file, struct page * page)
319
{
320
return block_read_full_page(page, blkdev_get_block);
321
}
322
323
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
324
loff_t pos, unsigned len, unsigned flags,
325
struct page **pagep, void **fsdata)
326
{
327
return block_write_begin(mapping, pos, len, flags, pagep,
328
blkdev_get_block);
329
}
330
331
static int blkdev_write_end(struct file *file, struct address_space *mapping,
332
loff_t pos, unsigned len, unsigned copied,
333
struct page *page, void *fsdata)
334
{
335
int ret;
336
ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
337
338
unlock_page(page);
339
page_cache_release(page);
340
341
return ret;
342
}
343
344
/*
345
* private llseek:
346
* for a block special file file->f_path.dentry->d_inode->i_size is zero
347
* so we compute the size by hand (just as in block_read/write above)
348
*/
349
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
350
{
351
struct inode *bd_inode = file->f_mapping->host;
352
loff_t size;
353
loff_t retval;
354
355
mutex_lock(&bd_inode->i_mutex);
356
size = i_size_read(bd_inode);
357
358
switch (origin) {
359
case 2:
360
offset += size;
361
break;
362
case 1:
363
offset += file->f_pos;
364
}
365
retval = -EINVAL;
366
if (offset >= 0 && offset <= size) {
367
if (offset != file->f_pos) {
368
file->f_pos = offset;
369
}
370
retval = offset;
371
}
372
mutex_unlock(&bd_inode->i_mutex);
373
return retval;
374
}
375
376
int blkdev_fsync(struct file *filp, int datasync)
377
{
378
struct inode *bd_inode = filp->f_mapping->host;
379
struct block_device *bdev = I_BDEV(bd_inode);
380
int error;
381
382
/*
383
* There is no need to serialise calls to blkdev_issue_flush with
384
* i_mutex and doing so causes performance issues with concurrent
385
* O_SYNC writers to a block device.
386
*/
387
mutex_unlock(&bd_inode->i_mutex);
388
389
error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
390
if (error == -EOPNOTSUPP)
391
error = 0;
392
393
mutex_lock(&bd_inode->i_mutex);
394
395
return error;
396
}
397
EXPORT_SYMBOL(blkdev_fsync);
398
399
/*
400
* pseudo-fs
401
*/
402
403
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
404
static struct kmem_cache * bdev_cachep __read_mostly;
405
406
static struct inode *bdev_alloc_inode(struct super_block *sb)
407
{
408
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
409
if (!ei)
410
return NULL;
411
return &ei->vfs_inode;
412
}
413
414
static void bdev_i_callback(struct rcu_head *head)
415
{
416
struct inode *inode = container_of(head, struct inode, i_rcu);
417
struct bdev_inode *bdi = BDEV_I(inode);
418
419
INIT_LIST_HEAD(&inode->i_dentry);
420
kmem_cache_free(bdev_cachep, bdi);
421
}
422
423
static void bdev_destroy_inode(struct inode *inode)
424
{
425
call_rcu(&inode->i_rcu, bdev_i_callback);
426
}
427
428
static void init_once(void *foo)
429
{
430
struct bdev_inode *ei = (struct bdev_inode *) foo;
431
struct block_device *bdev = &ei->bdev;
432
433
memset(bdev, 0, sizeof(*bdev));
434
mutex_init(&bdev->bd_mutex);
435
INIT_LIST_HEAD(&bdev->bd_inodes);
436
INIT_LIST_HEAD(&bdev->bd_list);
437
#ifdef CONFIG_SYSFS
438
INIT_LIST_HEAD(&bdev->bd_holder_disks);
439
#endif
440
inode_init_once(&ei->vfs_inode);
441
/* Initialize mutex for freeze. */
442
mutex_init(&bdev->bd_fsfreeze_mutex);
443
}
444
445
static inline void __bd_forget(struct inode *inode)
446
{
447
list_del_init(&inode->i_devices);
448
inode->i_bdev = NULL;
449
inode->i_mapping = &inode->i_data;
450
}
451
452
static void bdev_evict_inode(struct inode *inode)
453
{
454
struct block_device *bdev = &BDEV_I(inode)->bdev;
455
struct list_head *p;
456
truncate_inode_pages(&inode->i_data, 0);
457
invalidate_inode_buffers(inode); /* is it needed here? */
458
end_writeback(inode);
459
spin_lock(&bdev_lock);
460
while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
461
__bd_forget(list_entry(p, struct inode, i_devices));
462
}
463
list_del_init(&bdev->bd_list);
464
spin_unlock(&bdev_lock);
465
}
466
467
static const struct super_operations bdev_sops = {
468
.statfs = simple_statfs,
469
.alloc_inode = bdev_alloc_inode,
470
.destroy_inode = bdev_destroy_inode,
471
.drop_inode = generic_delete_inode,
472
.evict_inode = bdev_evict_inode,
473
};
474
475
static struct dentry *bd_mount(struct file_system_type *fs_type,
476
int flags, const char *dev_name, void *data)
477
{
478
return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
479
}
480
481
static struct file_system_type bd_type = {
482
.name = "bdev",
483
.mount = bd_mount,
484
.kill_sb = kill_anon_super,
485
};
486
487
struct super_block *blockdev_superblock __read_mostly;
488
489
void __init bdev_cache_init(void)
490
{
491
int err;
492
struct vfsmount *bd_mnt;
493
494
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
495
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
496
SLAB_MEM_SPREAD|SLAB_PANIC),
497
init_once);
498
err = register_filesystem(&bd_type);
499
if (err)
500
panic("Cannot register bdev pseudo-fs");
501
bd_mnt = kern_mount(&bd_type);
502
if (IS_ERR(bd_mnt))
503
panic("Cannot create bdev pseudo-fs");
504
/*
505
* This vfsmount structure is only used to obtain the
506
* blockdev_superblock, so tell kmemleak not to report it.
507
*/
508
kmemleak_not_leak(bd_mnt);
509
blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
510
}
511
512
/*
513
* Most likely _very_ bad one - but then it's hardly critical for small
514
* /dev and can be fixed when somebody needs a really large one.
515
* Keep in mind that it will be fed through icache hash function too.
516
*/
517
static inline unsigned long hash(dev_t dev)
518
{
519
return MAJOR(dev)+MINOR(dev);
520
}
521
522
static int bdev_test(struct inode *inode, void *data)
523
{
524
return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
525
}
526
527
static int bdev_set(struct inode *inode, void *data)
528
{
529
BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
530
return 0;
531
}
532
533
static LIST_HEAD(all_bdevs);
534
535
struct block_device *bdget(dev_t dev)
536
{
537
struct block_device *bdev;
538
struct inode *inode;
539
540
inode = iget5_locked(blockdev_superblock, hash(dev),
541
bdev_test, bdev_set, &dev);
542
543
if (!inode)
544
return NULL;
545
546
bdev = &BDEV_I(inode)->bdev;
547
548
if (inode->i_state & I_NEW) {
549
bdev->bd_contains = NULL;
550
bdev->bd_inode = inode;
551
bdev->bd_block_size = (1 << inode->i_blkbits);
552
bdev->bd_part_count = 0;
553
bdev->bd_invalidated = 0;
554
inode->i_mode = S_IFBLK;
555
inode->i_rdev = dev;
556
inode->i_bdev = bdev;
557
inode->i_data.a_ops = &def_blk_aops;
558
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
559
inode->i_data.backing_dev_info = &default_backing_dev_info;
560
spin_lock(&bdev_lock);
561
list_add(&bdev->bd_list, &all_bdevs);
562
spin_unlock(&bdev_lock);
563
unlock_new_inode(inode);
564
}
565
return bdev;
566
}
567
568
EXPORT_SYMBOL(bdget);
569
570
/**
571
* bdgrab -- Grab a reference to an already referenced block device
572
* @bdev: Block device to grab a reference to.
573
*/
574
struct block_device *bdgrab(struct block_device *bdev)
575
{
576
ihold(bdev->bd_inode);
577
return bdev;
578
}
579
580
long nr_blockdev_pages(void)
581
{
582
struct block_device *bdev;
583
long ret = 0;
584
spin_lock(&bdev_lock);
585
list_for_each_entry(bdev, &all_bdevs, bd_list) {
586
ret += bdev->bd_inode->i_mapping->nrpages;
587
}
588
spin_unlock(&bdev_lock);
589
return ret;
590
}
591
592
void bdput(struct block_device *bdev)
593
{
594
iput(bdev->bd_inode);
595
}
596
597
EXPORT_SYMBOL(bdput);
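/*
 * Illustrative sketch (not part of the original file): bdget()/bdput()
 * only manage a reference to the bdev inode; they do not open the device.
 *
 *	struct block_device *bdev = bdget(MKDEV(major, minor));
 *	if (!bdev)
 *		return -ENOMEM;
 *	// ... inspect bdev, or pass it to blkdev_get() to actually open ...
 *	bdput(bdev);
 */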
598
599
static struct block_device *bd_acquire(struct inode *inode)
600
{
601
struct block_device *bdev;
602
603
spin_lock(&bdev_lock);
604
bdev = inode->i_bdev;
605
if (bdev) {
606
ihold(bdev->bd_inode);
607
spin_unlock(&bdev_lock);
608
return bdev;
609
}
610
spin_unlock(&bdev_lock);
611
612
bdev = bdget(inode->i_rdev);
613
if (bdev) {
614
spin_lock(&bdev_lock);
615
if (!inode->i_bdev) {
616
/*
617
* We take an additional reference to bd_inode,
618
* and it's released in clear_inode() of inode.
619
* So, we can access it via ->i_mapping always
620
* without igrab().
621
*/
622
ihold(bdev->bd_inode);
623
inode->i_bdev = bdev;
624
inode->i_mapping = bdev->bd_inode->i_mapping;
625
list_add(&inode->i_devices, &bdev->bd_inodes);
626
}
627
spin_unlock(&bdev_lock);
628
}
629
return bdev;
630
}
631
632
/* Call when you free inode */
633
634
void bd_forget(struct inode *inode)
635
{
636
struct block_device *bdev = NULL;
637
638
spin_lock(&bdev_lock);
639
if (inode->i_bdev) {
640
if (!sb_is_blkdev_sb(inode->i_sb))
641
bdev = inode->i_bdev;
642
__bd_forget(inode);
643
}
644
spin_unlock(&bdev_lock);
645
646
if (bdev)
647
iput(bdev->bd_inode);
648
}
649
650
/**
651
* bd_may_claim - test whether a block device can be claimed
652
* @bdev: block device of interest
653
* @whole: whole block device containing @bdev, may equal @bdev
654
* @holder: holder trying to claim @bdev
655
*
656
* Test whether @bdev can be claimed by @holder.
657
*
658
* CONTEXT:
659
* spin_lock(&bdev_lock).
660
*
661
* RETURNS:
662
* %true if @bdev can be claimed, %false otherwise.
663
*/
664
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
665
void *holder)
666
{
667
if (bdev->bd_holder == holder)
668
return true; /* already a holder */
669
else if (bdev->bd_holder != NULL)
670
return false; /* held by someone else */
671
else if (bdev->bd_contains == bdev)
672
return true; /* is a whole device which isn't held */
673
674
else if (whole->bd_holder == bd_may_claim)
675
return true; /* is a partition of a device that is being partitioned */
676
else if (whole->bd_holder != NULL)
677
return false; /* is a partition of a held device */
678
else
679
return true; /* is a partition of an un-held device */
680
}
681
682
/**
683
* bd_prepare_to_claim - prepare to claim a block device
684
* @bdev: block device of interest
685
* @whole: the whole device containing @bdev, may equal @bdev
686
* @holder: holder trying to claim @bdev
687
*
688
* Prepare to claim @bdev. This function fails if @bdev is already
689
* claimed by another holder and waits if another claiming is in
690
* progress. This function doesn't actually claim. On successful
691
* return, the caller has ownership of bd_claiming and bd_holder[s].
692
*
693
* CONTEXT:
694
* spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
695
* it multiple times.
696
*
697
* RETURNS:
698
* 0 if @bdev can be claimed, -EBUSY otherwise.
699
*/
700
static int bd_prepare_to_claim(struct block_device *bdev,
701
struct block_device *whole, void *holder)
702
{
703
retry:
704
/* if someone else claimed, fail */
705
if (!bd_may_claim(bdev, whole, holder))
706
return -EBUSY;
707
708
/* if claiming is already in progress, wait for it to finish */
709
if (whole->bd_claiming) {
710
wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
711
DEFINE_WAIT(wait);
712
713
prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
714
spin_unlock(&bdev_lock);
715
schedule();
716
finish_wait(wq, &wait);
717
spin_lock(&bdev_lock);
718
goto retry;
719
}
720
721
/* yay, all mine */
722
return 0;
723
}
724
725
/**
726
* bd_start_claiming - start claiming a block device
727
* @bdev: block device of interest
728
* @holder: holder trying to claim @bdev
729
*
730
* @bdev is about to be opened exclusively. Check @bdev can be opened
731
* exclusively and mark that an exclusive open is in progress. Each
732
* successful call to this function must be matched with a call to
733
* either bd_finish_claiming() or bd_abort_claiming() (which do not
734
* fail).
735
*
736
* This function is used to gain exclusive access to the block device
737
* without actually causing other exclusive open attempts to fail. It
738
* should be used when the open sequence itself requires exclusive
739
* access but may subsequently fail.
740
*
741
* CONTEXT:
742
* Might sleep.
743
*
744
* RETURNS:
745
* Pointer to the block device containing @bdev on success, ERR_PTR()
746
* value on failure.
747
*/
748
static struct block_device *bd_start_claiming(struct block_device *bdev,
749
void *holder)
750
{
751
struct gendisk *disk;
752
struct block_device *whole;
753
int partno, err;
754
755
might_sleep();
756
757
/*
758
* @bdev might not have been initialized properly yet, look up
759
* and grab the outer block device the hard way.
760
*/
761
disk = get_gendisk(bdev->bd_dev, &partno);
762
if (!disk)
763
return ERR_PTR(-ENXIO);
764
765
/*
766
* Normally, @bdev should equal what's returned from bdget_disk()
767
* if partno is 0; however, some drivers (floppy) use multiple
768
* bdev's for the same physical device and @bdev may be one of the
769
* aliases. Keep @bdev if partno is 0. This means claimer
770
* tracking is broken for those devices but it has always been that
771
* way.
772
*/
773
if (partno)
774
whole = bdget_disk(disk, 0);
775
else
776
whole = bdgrab(bdev);
777
778
module_put(disk->fops->owner);
779
put_disk(disk);
780
if (!whole)
781
return ERR_PTR(-ENOMEM);
782
783
/* prepare to claim, if successful, mark claiming in progress */
784
spin_lock(&bdev_lock);
785
786
err = bd_prepare_to_claim(bdev, whole, holder);
787
if (err == 0) {
788
whole->bd_claiming = holder;
789
spin_unlock(&bdev_lock);
790
return whole;
791
} else {
792
spin_unlock(&bdev_lock);
793
bdput(whole);
794
return ERR_PTR(err);
795
}
796
}
797
798
#ifdef CONFIG_SYSFS
799
struct bd_holder_disk {
800
struct list_head list;
801
struct gendisk *disk;
802
int refcnt;
803
};
804
805
static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
806
struct gendisk *disk)
807
{
808
struct bd_holder_disk *holder;
809
810
list_for_each_entry(holder, &bdev->bd_holder_disks, list)
811
if (holder->disk == disk)
812
return holder;
813
return NULL;
814
}
815
816
static int add_symlink(struct kobject *from, struct kobject *to)
817
{
818
return sysfs_create_link(from, to, kobject_name(to));
819
}
820
821
static void del_symlink(struct kobject *from, struct kobject *to)
822
{
823
sysfs_remove_link(from, kobject_name(to));
824
}
825
826
/**
827
* bd_link_disk_holder - create symlinks between holding disk and slave bdev
828
* @bdev: the claimed slave bdev
829
* @disk: the holding disk
830
*
831
* DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
832
*
833
* This functions creates the following sysfs symlinks.
834
*
835
* - from "slaves" directory of the holder @disk to the claimed @bdev
836
* - from "holders" directory of the @bdev to the holder @disk
837
*
838
* For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
839
* passed to bd_link_disk_holder(), then:
840
*
841
* /sys/block/dm-0/slaves/sda --> /sys/block/sda
842
* /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
843
*
844
* The caller must have claimed @bdev before calling this function and
845
* ensure that both @bdev and @disk are valid during the creation and
846
* lifetime of these symlinks.
847
*
848
* CONTEXT:
849
* Might sleep.
850
*
851
* RETURNS:
852
* 0 on success, -errno on failure.
853
*/
854
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
855
{
856
struct bd_holder_disk *holder;
857
int ret = 0;
858
859
mutex_lock(&bdev->bd_mutex);
860
861
WARN_ON_ONCE(!bdev->bd_holder);
862
863
/* FIXME: remove the following once add_disk() handles errors */
864
if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
865
goto out_unlock;
866
867
holder = bd_find_holder_disk(bdev, disk);
868
if (holder) {
869
holder->refcnt++;
870
goto out_unlock;
871
}
872
873
holder = kzalloc(sizeof(*holder), GFP_KERNEL);
874
if (!holder) {
875
ret = -ENOMEM;
876
goto out_unlock;
877
}
878
879
INIT_LIST_HEAD(&holder->list);
880
holder->disk = disk;
881
holder->refcnt = 1;
882
883
ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
884
if (ret)
885
goto out_free;
886
887
ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
888
if (ret)
889
goto out_del;
890
/*
891
* bdev could be deleted beneath us which would implicitly destroy
892
* the holder directory. Hold on to it.
893
*/
894
kobject_get(bdev->bd_part->holder_dir);
895
896
list_add(&holder->list, &bdev->bd_holder_disks);
897
goto out_unlock;
898
899
out_del:
900
del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
901
out_free:
902
kfree(holder);
903
out_unlock:
904
mutex_unlock(&bdev->bd_mutex);
905
return ret;
906
}
907
EXPORT_SYMBOL_GPL(bd_link_disk_holder);
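/*
 * Illustrative sketch (not part of the original file): a stacking driver
 * (dm/md style) that has already claimed @bdev pairs these calls so the
 * slaves/holders symlinks exist for the lifetime of the stacking.
 *
 *	err = bd_link_disk_holder(bdev, holder_disk);
 *	if (err)
 *		goto fail;
 *	// ... use the component device ...
 *	bd_unlink_disk_holder(bdev, holder_disk);
 */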
908
909
/**
910
* bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
911
* @bdev: the claimed slave bdev
912
* @disk: the holding disk
913
*
914
* DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
915
*
916
* CONTEXT:
917
* Might sleep.
918
*/
919
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
920
{
921
struct bd_holder_disk *holder;
922
923
mutex_lock(&bdev->bd_mutex);
924
925
holder = bd_find_holder_disk(bdev, disk);
926
927
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
928
del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
929
del_symlink(bdev->bd_part->holder_dir,
930
&disk_to_dev(disk)->kobj);
931
kobject_put(bdev->bd_part->holder_dir);
932
list_del_init(&holder->list);
933
kfree(holder);
934
}
935
936
mutex_unlock(&bdev->bd_mutex);
937
}
938
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
939
#endif
940
941
/**
942
* flush_disk - invalidates all buffer-cache entries on a disk
943
*
944
* @bdev: struct block device to be flushed
945
* @kill_dirty: flag to guide handling of dirty inodes
946
*
947
* Invalidates all buffer-cache entries on a disk. It should be called
948
* when a disk has been changed -- either by a media change or online
949
* resize.
950
*/
951
static void flush_disk(struct block_device *bdev, bool kill_dirty)
952
{
953
if (__invalidate_device(bdev, kill_dirty)) {
954
char name[BDEVNAME_SIZE] = "";
955
956
if (bdev->bd_disk)
957
disk_name(bdev->bd_disk, 0, name);
958
printk(KERN_WARNING "VFS: busy inodes on changed media or "
959
"resized disk %s\n", name);
960
}
961
962
if (!bdev->bd_disk)
963
return;
964
if (disk_partitionable(bdev->bd_disk))
965
bdev->bd_invalidated = 1;
966
}
967
968
/**
969
* check_disk_size_change - checks for disk size change and adjusts bdev size.
970
* @disk: struct gendisk to check
971
* @bdev: struct bdev to adjust.
972
*
973
* This routine checks whether the bdev size matches the disk size
974
* and adjusts it if it differs.
975
*/
976
void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
977
{
978
loff_t disk_size, bdev_size;
979
980
disk_size = (loff_t)get_capacity(disk) << 9;
981
bdev_size = i_size_read(bdev->bd_inode);
982
if (disk_size != bdev_size) {
983
char name[BDEVNAME_SIZE];
984
985
disk_name(disk, 0, name);
986
printk(KERN_INFO
987
"%s: detected capacity change from %lld to %lld\n",
988
name, bdev_size, disk_size);
989
i_size_write(bdev->bd_inode, disk_size);
990
flush_disk(bdev, false);
991
}
992
}
993
EXPORT_SYMBOL(check_disk_size_change);
994
995
/**
996
* revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
997
* @disk: struct gendisk to be revalidated
998
*
999
* This routine is a wrapper for lower-level driver's revalidate_disk
1000
* call-backs. It is used to do common pre and post operations needed
1001
* for all revalidate_disk operations.
1002
*/
1003
int revalidate_disk(struct gendisk *disk)
1004
{
1005
struct block_device *bdev;
1006
int ret = 0;
1007
1008
if (disk->fops->revalidate_disk)
1009
ret = disk->fops->revalidate_disk(disk);
1010
1011
bdev = bdget_disk(disk, 0);
1012
if (!bdev)
1013
return ret;
1014
1015
mutex_lock(&bdev->bd_mutex);
1016
check_disk_size_change(disk, bdev);
1017
mutex_unlock(&bdev->bd_mutex);
1018
bdput(bdev);
1019
return ret;
1020
}
1021
EXPORT_SYMBOL(revalidate_disk);
1022
1023
/*
1024
* This routine checks whether a removable media has been changed,
1025
* and invalidates all buffer-cache-entries in that case. This
1026
* is a relatively slow routine, so we have to try to minimize using
1027
* it. Thus it is called only upon a 'mount' or 'open'. This
1028
* is the best way of combining speed and utility, I think.
1029
* People changing diskettes in the middle of an operation deserve
1030
* to lose :-)
1031
*/
1032
int check_disk_change(struct block_device *bdev)
1033
{
1034
struct gendisk *disk = bdev->bd_disk;
1035
const struct block_device_operations *bdops = disk->fops;
1036
unsigned int events;
1037
1038
events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1039
DISK_EVENT_EJECT_REQUEST);
1040
if (!(events & DISK_EVENT_MEDIA_CHANGE))
1041
return 0;
1042
1043
flush_disk(bdev, true);
1044
if (bdops->revalidate_disk)
1045
bdops->revalidate_disk(bdev->bd_disk);
1046
return 1;
1047
}
1048
1049
EXPORT_SYMBOL(check_disk_change);
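/*
 * Illustrative sketch (not part of the original file): removable-media
 * drivers typically call check_disk_change() from their open() method so
 * stale cached data is dropped when new media is detected.  "mydrv" is a
 * hypothetical driver name.
 *
 *	static int mydrv_open(struct block_device *bdev, fmode_t mode)
 *	{
 *		check_disk_change(bdev);	// revalidates on media change
 *		...
 *	}
 */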
1050
1051
void bd_set_size(struct block_device *bdev, loff_t size)
1052
{
1053
unsigned bsize = bdev_logical_block_size(bdev);
1054
1055
bdev->bd_inode->i_size = size;
1056
while (bsize < PAGE_CACHE_SIZE) {
1057
if (size & bsize)
1058
break;
1059
bsize <<= 1;
1060
}
1061
bdev->bd_block_size = bsize;
1062
bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1063
}
1064
EXPORT_SYMBOL(bd_set_size);
1065
1066
static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1067
1068
/*
1069
* bd_mutex locking:
1070
*
1071
* mutex_lock(part->bd_mutex)
1072
* mutex_lock_nested(whole->bd_mutex, 1)
1073
*/
1074
1075
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1076
{
1077
struct gendisk *disk;
1078
int ret;
1079
int partno;
1080
int perm = 0;
1081
1082
if (mode & FMODE_READ)
1083
perm |= MAY_READ;
1084
if (mode & FMODE_WRITE)
1085
perm |= MAY_WRITE;
1086
/*
1087
* hooks: /n/, see "layering violations".
1088
*/
1089
if (!for_part) {
1090
ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1091
if (ret != 0) {
1092
bdput(bdev);
1093
return ret;
1094
}
1095
}
1096
1097
restart:
1098
1099
ret = -ENXIO;
1100
disk = get_gendisk(bdev->bd_dev, &partno);
1101
if (!disk)
1102
goto out;
1103
1104
disk_block_events(disk);
1105
mutex_lock_nested(&bdev->bd_mutex, for_part);
1106
if (!bdev->bd_openers) {
1107
bdev->bd_disk = disk;
1108
bdev->bd_contains = bdev;
1109
if (!partno) {
1110
struct backing_dev_info *bdi;
1111
1112
ret = -ENXIO;
1113
bdev->bd_part = disk_get_part(disk, partno);
1114
if (!bdev->bd_part)
1115
goto out_clear;
1116
1117
ret = 0;
1118
if (disk->fops->open) {
1119
ret = disk->fops->open(bdev, mode);
1120
if (ret == -ERESTARTSYS) {
1121
/* Lost a race with 'disk' being
1122
* deleted, try again.
1123
* See md.c
1124
*/
1125
disk_put_part(bdev->bd_part);
1126
bdev->bd_part = NULL;
1127
bdev->bd_disk = NULL;
1128
mutex_unlock(&bdev->bd_mutex);
1129
disk_unblock_events(disk);
1130
module_put(disk->fops->owner);
1131
put_disk(disk);
1132
goto restart;
1133
}
1134
}
1135
1136
if (!ret && !bdev->bd_openers) {
1137
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1138
bdi = blk_get_backing_dev_info(bdev);
1139
if (bdi == NULL)
1140
bdi = &default_backing_dev_info;
1141
bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1142
}
1143
1144
/*
1145
* If the device is invalidated, rescan partition
1146
* if open succeeded or failed with -ENOMEDIUM.
1147
* The latter is necessary to prevent ghost
1148
* partitions on a removed medium.
1149
*/
1150
if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
1151
rescan_partitions(disk, bdev);
1152
if (ret)
1153
goto out_clear;
1154
} else {
1155
struct block_device *whole;
1156
whole = bdget_disk(disk, 0);
1157
ret = -ENOMEM;
1158
if (!whole)
1159
goto out_clear;
1160
BUG_ON(for_part);
1161
ret = __blkdev_get(whole, mode, 1);
1162
if (ret)
1163
goto out_clear;
1164
bdev->bd_contains = whole;
1165
bdev_inode_switch_bdi(bdev->bd_inode,
1166
whole->bd_inode->i_data.backing_dev_info);
1167
bdev->bd_part = disk_get_part(disk, partno);
1168
if (!(disk->flags & GENHD_FL_UP) ||
1169
!bdev->bd_part || !bdev->bd_part->nr_sects) {
1170
ret = -ENXIO;
1171
goto out_clear;
1172
}
1173
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1174
}
1175
} else {
1176
if (bdev->bd_contains == bdev) {
1177
ret = 0;
1178
if (bdev->bd_disk->fops->open)
1179
ret = bdev->bd_disk->fops->open(bdev, mode);
1180
/* the same as first opener case, read comment there */
1181
if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
1182
rescan_partitions(bdev->bd_disk, bdev);
1183
if (ret)
1184
goto out_unlock_bdev;
1185
}
1186
/* only one opener holds refs to the module and disk */
1187
module_put(disk->fops->owner);
1188
put_disk(disk);
1189
}
1190
bdev->bd_openers++;
1191
if (for_part)
1192
bdev->bd_part_count++;
1193
mutex_unlock(&bdev->bd_mutex);
1194
disk_unblock_events(disk);
1195
return 0;
1196
1197
out_clear:
1198
disk_put_part(bdev->bd_part);
1199
bdev->bd_disk = NULL;
1200
bdev->bd_part = NULL;
1201
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1202
if (bdev != bdev->bd_contains)
1203
__blkdev_put(bdev->bd_contains, mode, 1);
1204
bdev->bd_contains = NULL;
1205
out_unlock_bdev:
1206
mutex_unlock(&bdev->bd_mutex);
1207
disk_unblock_events(disk);
1208
module_put(disk->fops->owner);
1209
put_disk(disk);
1210
out:
1211
bdput(bdev);
1212
1213
return ret;
1214
}
1215
1216
/**
1217
* blkdev_get - open a block device
1218
* @bdev: block_device to open
1219
* @mode: FMODE_* mask
1220
* @holder: exclusive holder identifier
1221
*
1222
* Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1223
* opened with exclusive access. Specifying %FMODE_EXCL with %NULL
1224
* @holder is invalid. Exclusive opens may nest for the same @holder.
1225
*
1226
* On success, the reference count of @bdev is unchanged. On failure,
1227
* @bdev is put.
1228
*
1229
* CONTEXT:
1230
* Might sleep.
1231
*
1232
* RETURNS:
1233
* 0 on success, -errno on failure.
1234
*/
1235
int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1236
{
1237
struct block_device *whole = NULL;
1238
int res;
1239
1240
WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1241
1242
if ((mode & FMODE_EXCL) && holder) {
1243
whole = bd_start_claiming(bdev, holder);
1244
if (IS_ERR(whole)) {
1245
bdput(bdev);
1246
return PTR_ERR(whole);
1247
}
1248
}
1249
1250
res = __blkdev_get(bdev, mode, 0);
1251
1252
if (whole) {
1253
struct gendisk *disk = whole->bd_disk;
1254
1255
/* finish claiming */
1256
mutex_lock(&bdev->bd_mutex);
1257
spin_lock(&bdev_lock);
1258
1259
if (!res) {
1260
BUG_ON(!bd_may_claim(bdev, whole, holder));
1261
/*
1262
* Note that for a whole device bd_holders
1263
* will be incremented twice, and bd_holder
1264
* will be set to bd_may_claim before being
1265
* set to holder
1266
*/
1267
whole->bd_holders++;
1268
whole->bd_holder = bd_may_claim;
1269
bdev->bd_holders++;
1270
bdev->bd_holder = holder;
1271
}
1272
1273
/* tell others that we're done */
1274
BUG_ON(whole->bd_claiming != holder);
1275
whole->bd_claiming = NULL;
1276
wake_up_bit(&whole->bd_claiming, 0);
1277
1278
spin_unlock(&bdev_lock);
1279
1280
/*
1281
* Block event polling for write claims if requested. Any
1282
* write holder makes the write_holder state stick until
1283
* all are released. This is good enough and tracking
1284
* individual writeable reference is too fragile given the
1285
* way @mode is used in blkdev_get/put().
1286
*/
1287
if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1288
(disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
1289
bdev->bd_write_holder = true;
1290
disk_block_events(disk);
1291
}
1292
1293
mutex_unlock(&bdev->bd_mutex);
1294
bdput(whole);
1295
}
1296
1297
return res;
1298
}
1299
EXPORT_SYMBOL(blkdev_get);
1300
1301
/**
1302
* blkdev_get_by_path - open a block device by name
1303
* @path: path to the block device to open
1304
* @mode: FMODE_* mask
1305
* @holder: exclusive holder identifier
1306
*
1307
* Open the blockdevice described by the device file at @path. @mode
1308
* and @holder are identical to blkdev_get().
1309
*
1310
* On success, the returned block_device has reference count of one.
1311
*
1312
* CONTEXT:
1313
* Might sleep.
1314
*
1315
* RETURNS:
1316
* Pointer to block_device on success, ERR_PTR(-errno) on failure.
1317
*/
1318
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1319
void *holder)
1320
{
1321
struct block_device *bdev;
1322
int err;
1323
1324
bdev = lookup_bdev(path);
1325
if (IS_ERR(bdev))
1326
return bdev;
1327
1328
err = blkdev_get(bdev, mode, holder);
1329
if (err)
1330
return ERR_PTR(err);
1331
1332
if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1333
blkdev_put(bdev, mode);
1334
return ERR_PTR(-EACCES);
1335
}
1336
1337
return bdev;
1338
}
1339
EXPORT_SYMBOL(blkdev_get_by_path);
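/*
 * Illustrative sketch (not part of the original file): a typical exclusive
 * open by path, e.g. from a filesystem or stacking driver.  The device path
 * and holder cookie below are placeholders; release with blkdev_put() using
 * the same mode.
 *
 *	bdev = blkdev_get_by_path("/dev/sdb1",
 *				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, my_ctx);
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	// ... do I/O ...
 *	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 */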
1340
1341
/**
1342
* blkdev_get_by_dev - open a block device by device number
1343
* @dev: device number of block device to open
1344
* @mode: FMODE_* mask
1345
* @holder: exclusive holder identifier
1346
*
1347
* Open the blockdevice described by device number @dev. @mode and
1348
* @holder are identical to blkdev_get().
1349
*
1350
* Use it ONLY if you really do not have anything better - i.e. when
1351
* you are behind a truly sucky interface and all you are given is a
1352
* device number. _Never_ to be used for internal purposes. If you
1353
* ever need it - reconsider your API.
1354
*
1355
* On success, the returned block_device has reference count of one.
1356
*
1357
* CONTEXT:
1358
* Might sleep.
1359
*
1360
* RETURNS:
1361
* Pointer to block_device on success, ERR_PTR(-errno) on failure.
1362
*/
1363
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1364
{
1365
struct block_device *bdev;
1366
int err;
1367
1368
bdev = bdget(dev);
1369
if (!bdev)
1370
return ERR_PTR(-ENOMEM);
1371
1372
err = blkdev_get(bdev, mode, holder);
1373
if (err)
1374
return ERR_PTR(err);
1375
1376
return bdev;
1377
}
1378
EXPORT_SYMBOL(blkdev_get_by_dev);
1379
1380
static int blkdev_open(struct inode * inode, struct file * filp)
1381
{
1382
struct block_device *bdev;
1383
1384
/*
1385
* Preserve backwards compatibility and allow large file access
1386
* even if userspace doesn't ask for it explicitly. Some mkfs
1387
* binary needs it. We might want to drop this workaround
1388
* during an unstable branch.
1389
*/
1390
filp->f_flags |= O_LARGEFILE;
1391
1392
if (filp->f_flags & O_NDELAY)
1393
filp->f_mode |= FMODE_NDELAY;
1394
if (filp->f_flags & O_EXCL)
1395
filp->f_mode |= FMODE_EXCL;
1396
if ((filp->f_flags & O_ACCMODE) == 3)
1397
filp->f_mode |= FMODE_WRITE_IOCTL;
1398
1399
bdev = bd_acquire(inode);
1400
if (bdev == NULL)
1401
return -ENOMEM;
1402
1403
filp->f_mapping = bdev->bd_inode->i_mapping;
1404
1405
return blkdev_get(bdev, filp->f_mode, filp);
1406
}
1407
1408
static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1409
{
1410
int ret = 0;
1411
struct gendisk *disk = bdev->bd_disk;
1412
struct block_device *victim = NULL;
1413
1414
mutex_lock_nested(&bdev->bd_mutex, for_part);
1415
if (for_part)
1416
bdev->bd_part_count--;
1417
1418
if (!--bdev->bd_openers) {
1419
WARN_ON_ONCE(bdev->bd_holders);
1420
sync_blockdev(bdev);
1421
kill_bdev(bdev);
1422
}
1423
if (bdev->bd_contains == bdev) {
1424
if (disk->fops->release)
1425
ret = disk->fops->release(disk, mode);
1426
}
1427
if (!bdev->bd_openers) {
1428
struct module *owner = disk->fops->owner;
1429
1430
put_disk(disk);
1431
module_put(owner);
1432
disk_put_part(bdev->bd_part);
1433
bdev->bd_part = NULL;
1434
bdev->bd_disk = NULL;
1435
bdev_inode_switch_bdi(bdev->bd_inode,
1436
&default_backing_dev_info);
1437
if (bdev != bdev->bd_contains)
1438
victim = bdev->bd_contains;
1439
bdev->bd_contains = NULL;
1440
}
1441
mutex_unlock(&bdev->bd_mutex);
1442
bdput(bdev);
1443
if (victim)
1444
__blkdev_put(victim, mode, 1);
1445
return ret;
1446
}
1447
1448
int blkdev_put(struct block_device *bdev, fmode_t mode)
1449
{
1450
if (mode & FMODE_EXCL) {
1451
bool bdev_free;
1452
1453
/*
1454
* Release a claim on the device. The holder fields
1455
* are protected with bdev_lock. bd_mutex is to
1456
* synchronize disk_holder unlinking.
1457
*/
1458
mutex_lock(&bdev->bd_mutex);
1459
spin_lock(&bdev_lock);
1460
1461
WARN_ON_ONCE(--bdev->bd_holders < 0);
1462
WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1463
1464
/* bd_contains might point to self, check in a separate step */
1465
if ((bdev_free = !bdev->bd_holders))
1466
bdev->bd_holder = NULL;
1467
if (!bdev->bd_contains->bd_holders)
1468
bdev->bd_contains->bd_holder = NULL;
1469
1470
spin_unlock(&bdev_lock);
1471
1472
/*
1473
* If this was the last claim, remove holder link and
1474
* unblock event polling if it was a write holder.
1475
*/
1476
if (bdev_free) {
1477
if (bdev->bd_write_holder) {
1478
disk_unblock_events(bdev->bd_disk);
1479
disk_check_events(bdev->bd_disk);
1480
bdev->bd_write_holder = false;
1481
}
1482
}
1483
1484
mutex_unlock(&bdev->bd_mutex);
1485
}
1486
1487
return __blkdev_put(bdev, mode, 0);
1488
}
1489
EXPORT_SYMBOL(blkdev_put);
1490
1491
static int blkdev_close(struct inode * inode, struct file * filp)
1492
{
1493
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1494
1495
return blkdev_put(bdev, filp->f_mode);
1496
}
1497
1498
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1499
{
1500
struct block_device *bdev = I_BDEV(file->f_mapping->host);
1501
fmode_t mode = file->f_mode;
1502
1503
/*
1504
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1505
* to update it before every ioctl.
1506
*/
1507
if (file->f_flags & O_NDELAY)
1508
mode |= FMODE_NDELAY;
1509
else
1510
mode &= ~FMODE_NDELAY;
1511
1512
return blkdev_ioctl(bdev, mode, cmd, arg);
1513
}
1514
1515
/*
1516
* Write data to the block device. Only intended for the block device itself
1517
* and the raw driver which basically is a fake block device.
1518
*
1519
* Does not take i_mutex for the write and thus is not for general purpose
1520
* use.
1521
*/
1522
ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1523
unsigned long nr_segs, loff_t pos)
1524
{
1525
struct file *file = iocb->ki_filp;
1526
ssize_t ret;
1527
1528
BUG_ON(iocb->ki_pos != pos);
1529
1530
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1531
if (ret > 0 || ret == -EIOCBQUEUED) {
1532
ssize_t err;
1533
1534
err = generic_write_sync(file, pos, ret);
1535
if (err < 0 && ret > 0)
1536
ret = err;
1537
}
1538
return ret;
1539
}
1540
EXPORT_SYMBOL_GPL(blkdev_aio_write);
1541
1542
/*
1543
* Try to release a page associated with block device when the system
1544
* is under memory pressure.
1545
*/
1546
static int blkdev_releasepage(struct page *page, gfp_t wait)
1547
{
1548
struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1549
1550
if (super && super->s_op->bdev_try_to_free_page)
1551
return super->s_op->bdev_try_to_free_page(super, page, wait);
1552
1553
return try_to_free_buffers(page);
1554
}
1555
1556
static const struct address_space_operations def_blk_aops = {
1557
.readpage = blkdev_readpage,
1558
.writepage = blkdev_writepage,
1559
.write_begin = blkdev_write_begin,
1560
.write_end = blkdev_write_end,
1561
.writepages = generic_writepages,
1562
.releasepage = blkdev_releasepage,
1563
.direct_IO = blkdev_direct_IO,
1564
};
1565
1566
const struct file_operations def_blk_fops = {
1567
.open = blkdev_open,
1568
.release = blkdev_close,
1569
.llseek = block_llseek,
1570
.read = do_sync_read,
1571
.write = do_sync_write,
1572
.aio_read = generic_file_aio_read,
1573
.aio_write = blkdev_aio_write,
1574
.mmap = generic_file_mmap,
1575
.fsync = blkdev_fsync,
1576
.unlocked_ioctl = block_ioctl,
1577
#ifdef CONFIG_COMPAT
1578
.compat_ioctl = compat_blkdev_ioctl,
1579
#endif
1580
.splice_read = generic_file_splice_read,
1581
.splice_write = generic_file_splice_write,
1582
};
1583
1584
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1585
{
1586
int res;
1587
mm_segment_t old_fs = get_fs();
1588
set_fs(KERNEL_DS);
1589
res = blkdev_ioctl(bdev, 0, cmd, arg);
1590
set_fs(old_fs);
1591
return res;
1592
}
1593
1594
EXPORT_SYMBOL(ioctl_by_bdev);
1595
1596
/**
1597
* lookup_bdev - lookup a struct block_device by name
1598
* @pathname: special file representing the block device
1599
*
1600
* Get a reference to the blockdevice at @pathname in the current
1601
* namespace if possible and return it. Return ERR_PTR(error)
1602
* otherwise.
1603
*/
1604
struct block_device *lookup_bdev(const char *pathname)
1605
{
1606
struct block_device *bdev;
1607
struct inode *inode;
1608
struct path path;
1609
int error;
1610
1611
if (!pathname || !*pathname)
1612
return ERR_PTR(-EINVAL);
1613
1614
error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1615
if (error)
1616
return ERR_PTR(error);
1617
1618
inode = path.dentry->d_inode;
1619
error = -ENOTBLK;
1620
if (!S_ISBLK(inode->i_mode))
1621
goto fail;
1622
error = -EACCES;
1623
if (path.mnt->mnt_flags & MNT_NODEV)
1624
goto fail;
1625
error = -ENOMEM;
1626
bdev = bd_acquire(inode);
1627
if (!bdev)
1628
goto fail;
1629
out:
1630
path_put(&path);
1631
return bdev;
1632
fail:
1633
bdev = ERR_PTR(error);
1634
goto out;
1635
}
1636
EXPORT_SYMBOL(lookup_bdev);
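/*
 * Illustrative sketch (not part of the original file): lookup_bdev() only
 * resolves a path to a struct block_device reference; it does not open the
 * device.  Drop the reference with bdput().
 *
 *	bdev = lookup_bdev("/dev/loop0");
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	// ... compare bdev->bd_dev, etc. ...
 *	bdput(bdev);
 */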
1637
1638
int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1639
{
1640
struct super_block *sb = get_super(bdev);
1641
int res = 0;
1642
1643
if (sb) {
1644
/*
1645
* no need to lock the super, get_super holds the
1646
* read mutex so the filesystem cannot go away
1647
* under us (->put_super runs with the write lock
1648
* held).
1649
*/
1650
shrink_dcache_sb(sb);
1651
res = invalidate_inodes(sb, kill_dirty);
1652
drop_super(sb);
1653
}
1654
invalidate_bdev(bdev);
1655
return res;
1656
}
1657
EXPORT_SYMBOL(__invalidate_device);
1658
1659