GitHub Repository: awilliam/linux-vfio
Path: blob/master/fs/block_dev.c
1
/*
2
* linux/fs/block_dev.c
3
*
4
* Copyright (C) 1991, 1992 Linus Torvalds
5
* Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
6
*/
7
8
#include <linux/init.h>
9
#include <linux/mm.h>
10
#include <linux/fcntl.h>
11
#include <linux/slab.h>
12
#include <linux/kmod.h>
13
#include <linux/major.h>
14
#include <linux/device_cgroup.h>
15
#include <linux/highmem.h>
16
#include <linux/blkdev.h>
17
#include <linux/module.h>
18
#include <linux/blkpg.h>
19
#include <linux/buffer_head.h>
20
#include <linux/pagevec.h>
21
#include <linux/writeback.h>
22
#include <linux/mpage.h>
23
#include <linux/mount.h>
24
#include <linux/uio.h>
25
#include <linux/namei.h>
26
#include <linux/log2.h>
27
#include <linux/kmemleak.h>
28
#include <asm/uaccess.h>
29
#include "internal.h"
30
31
struct bdev_inode {
32
struct block_device bdev;
33
struct inode vfs_inode;
34
};
35
36
static const struct address_space_operations def_blk_aops;
37
38
static inline struct bdev_inode *BDEV_I(struct inode *inode)
39
{
40
return container_of(inode, struct bdev_inode, vfs_inode);
41
}
42
43
inline struct block_device *I_BDEV(struct inode *inode)
44
{
45
return &BDEV_I(inode)->bdev;
46
}
47
48
EXPORT_SYMBOL(I_BDEV);
49
50
/*
51
* move the inode from its current bdi to a new bdi. if the inode is dirty
52
* we need to move it onto the dirty list of @dst so that the inode is always
53
* on the right list.
54
*/
55
static void bdev_inode_switch_bdi(struct inode *inode,
56
struct backing_dev_info *dst)
57
{
58
spin_lock(&inode_wb_list_lock);
59
spin_lock(&inode->i_lock);
60
inode->i_data.backing_dev_info = dst;
61
if (inode->i_state & I_DIRTY)
62
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
63
spin_unlock(&inode->i_lock);
64
spin_unlock(&inode_wb_list_lock);
65
}
66
67
static sector_t max_block(struct block_device *bdev)
68
{
69
sector_t retval = ~((sector_t)0);
70
loff_t sz = i_size_read(bdev->bd_inode);
71
72
if (sz) {
73
unsigned int size = block_size(bdev);
74
unsigned int sizebits = blksize_bits(size);
75
retval = (sz >> sizebits);
76
}
77
return retval;
78
}
79
80
/* Kill _all_ buffers and pagecache, dirty or not.. */
81
static void kill_bdev(struct block_device *bdev)
82
{
83
if (bdev->bd_inode->i_mapping->nrpages == 0)
84
return;
85
invalidate_bh_lrus();
86
truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
87
}
88
89
int set_blocksize(struct block_device *bdev, int size)
90
{
91
/* Size must be a power of two, and between 512 and PAGE_SIZE */
92
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
93
return -EINVAL;
94
95
/* Size cannot be smaller than the size supported by the device */
96
if (size < bdev_logical_block_size(bdev))
97
return -EINVAL;
98
99
/* Don't change the size if it is same as current */
100
if (bdev->bd_block_size != size) {
101
sync_blockdev(bdev);
102
bdev->bd_block_size = size;
103
bdev->bd_inode->i_blkbits = blksize_bits(size);
104
kill_bdev(bdev);
105
}
106
return 0;
107
}
108
109
EXPORT_SYMBOL(set_blocksize);
110
111
int sb_set_blocksize(struct super_block *sb, int size)
112
{
113
if (set_blocksize(sb->s_bdev, size))
114
return 0;
115
/* If we get here, we know size is power of two
116
* and its value is between 512 and PAGE_SIZE */
117
sb->s_blocksize = size;
118
sb->s_blocksize_bits = blksize_bits(size);
119
return sb->s_blocksize;
120
}
121
122
EXPORT_SYMBOL(sb_set_blocksize);
123
124
int sb_min_blocksize(struct super_block *sb, int size)
125
{
126
int minsize = bdev_logical_block_size(sb->s_bdev);
127
if (size < minsize)
128
size = minsize;
129
return sb_set_blocksize(sb, size);
130
}
131
132
EXPORT_SYMBOL(sb_min_blocksize);
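/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * fill_super() typically picks its block size with sb_min_blocksize() or
 * sb_set_blocksize() before reading on-disk metadata.  The "myfs" names
 * below are hypothetical.
 *
 *	static int myfs_fill_super(struct super_block *sb, void *data, int silent)
 *	{
 *		if (!sb_min_blocksize(sb, 1024))
 *			return -EINVAL;	/* device cannot do 1K blocks */
 *		...
 *	}
 */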
133
134
static int
135
blkdev_get_block(struct inode *inode, sector_t iblock,
136
struct buffer_head *bh, int create)
137
{
138
if (iblock >= max_block(I_BDEV(inode))) {
139
if (create)
140
return -EIO;
141
142
/*
143
* for reads, we're just trying to fill a partial page.
144
* return a hole, they will have to call get_block again
145
* before they can fill it, and they will get -EIO at that
146
* time
147
*/
148
return 0;
149
}
150
bh->b_bdev = I_BDEV(inode);
151
bh->b_blocknr = iblock;
152
set_buffer_mapped(bh);
153
return 0;
154
}
155
156
static int
157
blkdev_get_blocks(struct inode *inode, sector_t iblock,
158
struct buffer_head *bh, int create)
159
{
160
sector_t end_block = max_block(I_BDEV(inode));
161
unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
162
163
if ((iblock + max_blocks) > end_block) {
164
max_blocks = end_block - iblock;
165
if ((long)max_blocks <= 0) {
166
if (create)
167
return -EIO; /* write fully beyond EOF */
168
/*
169
* It is a read which is fully beyond EOF. We return
170
* a !buffer_mapped buffer
171
*/
172
max_blocks = 0;
173
}
174
}
175
176
bh->b_bdev = I_BDEV(inode);
177
bh->b_blocknr = iblock;
178
bh->b_size = max_blocks << inode->i_blkbits;
179
if (max_blocks)
180
set_buffer_mapped(bh);
181
return 0;
182
}
183
184
static ssize_t
185
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
186
loff_t offset, unsigned long nr_segs)
187
{
188
struct file *file = iocb->ki_filp;
189
struct inode *inode = file->f_mapping->host;
190
191
return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
192
nr_segs, blkdev_get_blocks, NULL, NULL, 0);
193
}
194
195
int __sync_blockdev(struct block_device *bdev, int wait)
196
{
197
if (!bdev)
198
return 0;
199
if (!wait)
200
return filemap_flush(bdev->bd_inode->i_mapping);
201
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
202
}
203
204
/*
205
* Write out and wait upon all the dirty data associated with a block
206
* device via its mapping. Does not take the superblock lock.
207
*/
208
int sync_blockdev(struct block_device *bdev)
209
{
210
return __sync_blockdev(bdev, 1);
211
}
212
EXPORT_SYMBOL(sync_blockdev);
213
214
/*
215
* Write out and wait upon all dirty data associated with this
216
* device. Filesystem data as well as the underlying block
217
* device. Takes the superblock lock.
218
*/
219
int fsync_bdev(struct block_device *bdev)
220
{
221
struct super_block *sb = get_super(bdev);
222
if (sb) {
223
int res = sync_filesystem(sb);
224
drop_super(sb);
225
return res;
226
}
227
return sync_blockdev(bdev);
228
}
229
EXPORT_SYMBOL(fsync_bdev);
230
231
/**
232
* freeze_bdev -- lock a filesystem and force it into a consistent state
233
* @bdev: blockdevice to lock
234
*
235
* If a superblock is found on this device, we take the s_umount semaphore
236
* on it to make sure nobody unmounts until the snapshot creation is done.
237
* The reference counter (bd_fsfreeze_count) guarantees that only the last
238
* unfreeze process can actually unfreeze the frozen filesystem when multiple
239
* freeze requests arrive simultaneously. It counts up in freeze_bdev() and
240
* counts down in thaw_bdev(). When it reaches 0, thaw_bdev() actually
241
* unfreezes the filesystem.
242
*/
243
struct super_block *freeze_bdev(struct block_device *bdev)
244
{
245
struct super_block *sb;
246
int error = 0;
247
248
mutex_lock(&bdev->bd_fsfreeze_mutex);
249
if (++bdev->bd_fsfreeze_count > 1) {
250
/*
251
* We don't even need to grab a reference - the first call
252
* to freeze_bdev grabs an active reference and only the last
253
* thaw_bdev drops it.
254
*/
255
sb = get_super(bdev);
256
drop_super(sb);
257
mutex_unlock(&bdev->bd_fsfreeze_mutex);
258
return sb;
259
}
260
261
sb = get_active_super(bdev);
262
if (!sb)
263
goto out;
264
error = freeze_super(sb);
265
if (error) {
266
deactivate_super(sb);
267
bdev->bd_fsfreeze_count--;
268
mutex_unlock(&bdev->bd_fsfreeze_mutex);
269
return ERR_PTR(error);
270
}
271
deactivate_super(sb);
272
out:
273
sync_blockdev(bdev);
274
mutex_unlock(&bdev->bd_fsfreeze_mutex);
275
return sb; /* thaw_bdev releases s->s_umount */
276
}
277
EXPORT_SYMBOL(freeze_bdev);
278
279
/**
280
* thaw_bdev -- unlock filesystem
281
* @bdev: blockdevice to unlock
282
* @sb: associated superblock
283
*
284
* Unlocks the filesystem and marks it writeable again after freeze_bdev().
285
*/
286
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
287
{
288
int error = -EINVAL;
289
290
mutex_lock(&bdev->bd_fsfreeze_mutex);
291
if (!bdev->bd_fsfreeze_count)
292
goto out;
293
294
error = 0;
295
if (--bdev->bd_fsfreeze_count > 0)
296
goto out;
297
298
if (!sb)
299
goto out;
300
301
error = thaw_super(sb);
302
if (error) {
303
bdev->bd_fsfreeze_count++;
304
mutex_unlock(&bdev->bd_fsfreeze_mutex);
305
return error;
306
}
307
out:
308
mutex_unlock(&bdev->bd_fsfreeze_mutex);
309
return 0;
310
}
311
EXPORT_SYMBOL(thaw_bdev);
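/*
 * Illustrative sketch (not part of the original file): a snapshot or
 * backup driver would typically bracket its work with freeze_bdev() and
 * thaw_bdev() so the filesystem on the device stays consistent.
 *
 *	sb = freeze_bdev(bdev);
 *	if (IS_ERR(sb))
 *		return PTR_ERR(sb);
 *	// ... take the snapshot while writes are blocked ...
 *	thaw_bdev(bdev, sb);	// sb may be NULL if nothing was mounted
 */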
312
313
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
314
{
315
return block_write_full_page(page, blkdev_get_block, wbc);
316
}
317
318
static int blkdev_readpage(struct file * file, struct page * page)
319
{
320
return block_read_full_page(page, blkdev_get_block);
321
}
322
323
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
324
loff_t pos, unsigned len, unsigned flags,
325
struct page **pagep, void **fsdata)
326
{
327
return block_write_begin(mapping, pos, len, flags, pagep,
328
blkdev_get_block);
329
}
330
331
static int blkdev_write_end(struct file *file, struct address_space *mapping,
332
loff_t pos, unsigned len, unsigned copied,
333
struct page *page, void *fsdata)
334
{
335
int ret;
336
ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
337
338
unlock_page(page);
339
page_cache_release(page);
340
341
return ret;
342
}
343
344
/*
345
* private llseek:
346
* for a block special file file->f_path.dentry->d_inode->i_size is zero
347
* so we compute the size by hand (just as in block_read/write above)
348
*/
349
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
350
{
351
struct inode *bd_inode = file->f_mapping->host;
352
loff_t size;
353
loff_t retval;
354
355
mutex_lock(&bd_inode->i_mutex);
356
size = i_size_read(bd_inode);
357
358
switch (origin) {
359
case 2:
360
offset += size;
361
break;
362
case 1:
363
offset += file->f_pos;
364
}
365
retval = -EINVAL;
366
if (offset >= 0 && offset <= size) {
367
if (offset != file->f_pos) {
368
file->f_pos = offset;
369
}
370
retval = offset;
371
}
372
mutex_unlock(&bd_inode->i_mutex);
373
return retval;
374
}
375
376
int blkdev_fsync(struct file *filp, int datasync)
377
{
378
struct inode *bd_inode = filp->f_mapping->host;
379
struct block_device *bdev = I_BDEV(bd_inode);
380
int error;
381
382
/*
383
* There is no need to serialise calls to blkdev_issue_flush with
384
* i_mutex and doing so causes performance issues with concurrent
385
* O_SYNC writers to a block device.
386
*/
387
mutex_unlock(&bd_inode->i_mutex);
388
389
error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
390
if (error == -EOPNOTSUPP)
391
error = 0;
392
393
mutex_lock(&bd_inode->i_mutex);
394
395
return error;
396
}
397
EXPORT_SYMBOL(blkdev_fsync);
398
399
/*
400
* pseudo-fs
401
*/
402
403
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
404
static struct kmem_cache * bdev_cachep __read_mostly;
405
406
static struct inode *bdev_alloc_inode(struct super_block *sb)
407
{
408
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
409
if (!ei)
410
return NULL;
411
return &ei->vfs_inode;
412
}
413
414
static void bdev_i_callback(struct rcu_head *head)
415
{
416
struct inode *inode = container_of(head, struct inode, i_rcu);
417
struct bdev_inode *bdi = BDEV_I(inode);
418
419
INIT_LIST_HEAD(&inode->i_dentry);
420
kmem_cache_free(bdev_cachep, bdi);
421
}
422
423
static void bdev_destroy_inode(struct inode *inode)
424
{
425
call_rcu(&inode->i_rcu, bdev_i_callback);
426
}
427
428
static void init_once(void *foo)
429
{
430
struct bdev_inode *ei = (struct bdev_inode *) foo;
431
struct block_device *bdev = &ei->bdev;
432
433
memset(bdev, 0, sizeof(*bdev));
434
mutex_init(&bdev->bd_mutex);
435
INIT_LIST_HEAD(&bdev->bd_inodes);
436
INIT_LIST_HEAD(&bdev->bd_list);
437
#ifdef CONFIG_SYSFS
438
INIT_LIST_HEAD(&bdev->bd_holder_disks);
439
#endif
440
inode_init_once(&ei->vfs_inode);
441
/* Initialize mutex for freeze. */
442
mutex_init(&bdev->bd_fsfreeze_mutex);
443
}
444
445
static inline void __bd_forget(struct inode *inode)
446
{
447
list_del_init(&inode->i_devices);
448
inode->i_bdev = NULL;
449
inode->i_mapping = &inode->i_data;
450
}
451
452
static void bdev_evict_inode(struct inode *inode)
453
{
454
struct block_device *bdev = &BDEV_I(inode)->bdev;
455
struct list_head *p;
456
truncate_inode_pages(&inode->i_data, 0);
457
invalidate_inode_buffers(inode); /* is it needed here? */
458
end_writeback(inode);
459
spin_lock(&bdev_lock);
460
while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
461
__bd_forget(list_entry(p, struct inode, i_devices));
462
}
463
list_del_init(&bdev->bd_list);
464
spin_unlock(&bdev_lock);
465
}
466
467
static const struct super_operations bdev_sops = {
468
.statfs = simple_statfs,
469
.alloc_inode = bdev_alloc_inode,
470
.destroy_inode = bdev_destroy_inode,
471
.drop_inode = generic_delete_inode,
472
.evict_inode = bdev_evict_inode,
473
};
474
475
static struct dentry *bd_mount(struct file_system_type *fs_type,
476
int flags, const char *dev_name, void *data)
477
{
478
return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
479
}
480
481
static struct file_system_type bd_type = {
482
.name = "bdev",
483
.mount = bd_mount,
484
.kill_sb = kill_anon_super,
485
};
486
487
struct super_block *blockdev_superblock __read_mostly;
488
489
void __init bdev_cache_init(void)
490
{
491
int err;
492
struct vfsmount *bd_mnt;
493
494
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
495
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
496
SLAB_MEM_SPREAD|SLAB_PANIC),
497
init_once);
498
err = register_filesystem(&bd_type);
499
if (err)
500
panic("Cannot register bdev pseudo-fs");
501
bd_mnt = kern_mount(&bd_type);
502
if (IS_ERR(bd_mnt))
503
panic("Cannot create bdev pseudo-fs");
504
/*
505
* This vfsmount structure is only used to obtain the
506
* blockdev_superblock, so tell kmemleak not to report it.
507
*/
508
kmemleak_not_leak(bd_mnt);
509
blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
510
}
511
512
/*
513
* Most likely _very_ bad one - but then it's hardly critical for small
514
* /dev and can be fixed when somebody needs a really large one.
515
* Keep in mind that it will be fed through icache hash function too.
516
*/
517
static inline unsigned long hash(dev_t dev)
518
{
519
return MAJOR(dev)+MINOR(dev);
520
}
521
522
static int bdev_test(struct inode *inode, void *data)
523
{
524
return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
525
}
526
527
static int bdev_set(struct inode *inode, void *data)
528
{
529
BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
530
return 0;
531
}
532
533
static LIST_HEAD(all_bdevs);
534
535
struct block_device *bdget(dev_t dev)
536
{
537
struct block_device *bdev;
538
struct inode *inode;
539
540
inode = iget5_locked(blockdev_superblock, hash(dev),
541
bdev_test, bdev_set, &dev);
542
543
if (!inode)
544
return NULL;
545
546
bdev = &BDEV_I(inode)->bdev;
547
548
if (inode->i_state & I_NEW) {
549
bdev->bd_contains = NULL;
550
bdev->bd_inode = inode;
551
bdev->bd_block_size = (1 << inode->i_blkbits);
552
bdev->bd_part_count = 0;
553
bdev->bd_invalidated = 0;
554
inode->i_mode = S_IFBLK;
555
inode->i_rdev = dev;
556
inode->i_bdev = bdev;
557
inode->i_data.a_ops = &def_blk_aops;
558
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
559
inode->i_data.backing_dev_info = &default_backing_dev_info;
560
spin_lock(&bdev_lock);
561
list_add(&bdev->bd_list, &all_bdevs);
562
spin_unlock(&bdev_lock);
563
unlock_new_inode(inode);
564
}
565
return bdev;
566
}
567
568
EXPORT_SYMBOL(bdget);
569
570
/**
571
* bdgrab -- Grab a reference to an already referenced block device
572
* @bdev: Block device to grab a reference to.
573
*/
574
struct block_device *bdgrab(struct block_device *bdev)
575
{
576
ihold(bdev->bd_inode);
577
return bdev;
578
}
579
580
long nr_blockdev_pages(void)
581
{
582
struct block_device *bdev;
583
long ret = 0;
584
spin_lock(&bdev_lock);
585
list_for_each_entry(bdev, &all_bdevs, bd_list) {
586
ret += bdev->bd_inode->i_mapping->nrpages;
587
}
588
spin_unlock(&bdev_lock);
589
return ret;
590
}
591
592
void bdput(struct block_device *bdev)
593
{
594
iput(bdev->bd_inode);
595
}
596
597
EXPORT_SYMBOL(bdput);
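/*
 * Illustrative sketch (not part of the original file): bdget()/bdput()
 * only manage a reference to the bdev inode; they do not open the device.
 *
 *	struct block_device *bdev = bdget(MKDEV(major, minor));
 *	if (!bdev)
 *		return -ENOMEM;
 *	// ... inspect bdev, or pass it to blkdev_get() to actually open ...
 *	bdput(bdev);
 */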
598
599
static struct block_device *bd_acquire(struct inode *inode)
600
{
601
struct block_device *bdev;
602
603
spin_lock(&bdev_lock);
604
bdev = inode->i_bdev;
605
if (bdev) {
606
ihold(bdev->bd_inode);
607
spin_unlock(&bdev_lock);
608
return bdev;
609
}
610
spin_unlock(&bdev_lock);
611
612
bdev = bdget(inode->i_rdev);
613
if (bdev) {
614
spin_lock(&bdev_lock);
615
if (!inode->i_bdev) {
616
/*
617
* We take an additional reference to bd_inode,
618
* and it's released in clear_inode() of inode.
619
* So, we can access it via ->i_mapping always
620
* without igrab().
621
*/
622
ihold(bdev->bd_inode);
623
inode->i_bdev = bdev;
624
inode->i_mapping = bdev->bd_inode->i_mapping;
625
list_add(&inode->i_devices, &bdev->bd_inodes);
626
}
627
spin_unlock(&bdev_lock);
628
}
629
return bdev;
630
}
631
632
/* Call when you free inode */
633
634
void bd_forget(struct inode *inode)
635
{
636
struct block_device *bdev = NULL;
637
638
spin_lock(&bdev_lock);
639
if (inode->i_bdev) {
640
if (!sb_is_blkdev_sb(inode->i_sb))
641
bdev = inode->i_bdev;
642
__bd_forget(inode);
643
}
644
spin_unlock(&bdev_lock);
645
646
if (bdev)
647
iput(bdev->bd_inode);
648
}
649
650
/**
651
* bd_may_claim - test whether a block device can be claimed
652
* @bdev: block device of interest
653
* @whole: whole block device containing @bdev, may equal @bdev
654
* @holder: holder trying to claim @bdev
655
*
656
* Test whether @bdev can be claimed by @holder.
657
*
658
* CONTEXT:
659
* spin_lock(&bdev_lock).
660
*
661
* RETURNS:
662
* %true if @bdev can be claimed, %false otherwise.
663
*/
664
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
665
void *holder)
666
{
667
if (bdev->bd_holder == holder)
668
return true; /* already a holder */
669
else if (bdev->bd_holder != NULL)
670
return false; /* held by someone else */
671
else if (bdev->bd_contains == bdev)
672
return true; /* is a whole device which isn't held */
673
674
else if (whole->bd_holder == bd_may_claim)
675
return true; /* is a partition of a device that is being partitioned */
676
else if (whole->bd_holder != NULL)
677
return false; /* is a partition of a held device */
678
else
679
return true; /* is a partition of an un-held device */
680
}
681
682
/**
683
* bd_prepare_to_claim - prepare to claim a block device
684
* @bdev: block device of interest
685
* @whole: the whole device containing @bdev, may equal @bdev
686
* @holder: holder trying to claim @bdev
687
*
688
* Prepare to claim @bdev. This function fails if @bdev is already
689
* claimed by another holder and waits if another claiming is in
690
* progress. This function doesn't actually claim. On successful
691
* return, the caller has ownership of bd_claiming and bd_holder[s].
692
*
693
* CONTEXT:
694
* spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
695
* it multiple times.
696
*
697
* RETURNS:
698
* 0 if @bdev can be claimed, -EBUSY otherwise.
699
*/
700
static int bd_prepare_to_claim(struct block_device *bdev,
701
struct block_device *whole, void *holder)
702
{
703
retry:
704
/* if someone else claimed, fail */
705
if (!bd_may_claim(bdev, whole, holder))
706
return -EBUSY;
707
708
/* if claiming is already in progress, wait for it to finish */
709
if (whole->bd_claiming) {
710
wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
711
DEFINE_WAIT(wait);
712
713
prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
714
spin_unlock(&bdev_lock);
715
schedule();
716
finish_wait(wq, &wait);
717
spin_lock(&bdev_lock);
718
goto retry;
719
}
720
721
/* yay, all mine */
722
return 0;
723
}
724
725
/**
726
* bd_start_claiming - start claiming a block device
727
* @bdev: block device of interest
728
* @holder: holder trying to claim @bdev
729
*
730
* @bdev is about to be opened exclusively. Check @bdev can be opened
731
* exclusively and mark that an exclusive open is in progress. Each
732
* successful call to this function must be matched with a call to
733
* either bd_finish_claiming() or bd_abort_claiming() (which do not
734
* fail).
735
*
736
* This function is used to gain exclusive access to the block device
737
* without actually causing other exclusive open attempts to fail. It
738
* should be used when the open sequence itself requires exclusive
739
* access but may subsequently fail.
740
*
741
* CONTEXT:
742
* Might sleep.
743
*
744
* RETURNS:
745
* Pointer to the block device containing @bdev on success, ERR_PTR()
746
* value on failure.
747
*/
748
static struct block_device *bd_start_claiming(struct block_device *bdev,
749
void *holder)
750
{
751
struct gendisk *disk;
752
struct block_device *whole;
753
int partno, err;
754
755
might_sleep();
756
757
/*
758
* @bdev might not have been initialized properly yet, look up
759
* and grab the outer block device the hard way.
760
*/
761
disk = get_gendisk(bdev->bd_dev, &partno);
762
if (!disk)
763
return ERR_PTR(-ENXIO);
764
765
/*
766
* Normally, @bdev should equal what's returned from bdget_disk()
767
* if partno is 0; however, some drivers (floppy) use multiple
768
* bdev's for the same physical device and @bdev may be one of the
769
* aliases. Keep @bdev if partno is 0. This means claimer
770
* tracking is broken for those devices but it has always been that
771
* way.
772
*/
773
if (partno)
774
whole = bdget_disk(disk, 0);
775
else
776
whole = bdgrab(bdev);
777
778
module_put(disk->fops->owner);
779
put_disk(disk);
780
if (!whole)
781
return ERR_PTR(-ENOMEM);
782
783
/* prepare to claim, if successful, mark claiming in progress */
784
spin_lock(&bdev_lock);
785
786
err = bd_prepare_to_claim(bdev, whole, holder);
787
if (err == 0) {
788
whole->bd_claiming = holder;
789
spin_unlock(&bdev_lock);
790
return whole;
791
} else {
792
spin_unlock(&bdev_lock);
793
bdput(whole);
794
return ERR_PTR(err);
795
}
796
}
797
798
#ifdef CONFIG_SYSFS
799
struct bd_holder_disk {
800
struct list_head list;
801
struct gendisk *disk;
802
int refcnt;
803
};
804
805
static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
806
struct gendisk *disk)
807
{
808
struct bd_holder_disk *holder;
809
810
list_for_each_entry(holder, &bdev->bd_holder_disks, list)
811
if (holder->disk == disk)
812
return holder;
813
return NULL;
814
}
815
816
static int add_symlink(struct kobject *from, struct kobject *to)
817
{
818
return sysfs_create_link(from, to, kobject_name(to));
819
}
820
821
static void del_symlink(struct kobject *from, struct kobject *to)
822
{
823
sysfs_remove_link(from, kobject_name(to));
824
}
825
826
/**
827
* bd_link_disk_holder - create symlinks between holding disk and slave bdev
828
* @bdev: the claimed slave bdev
829
* @disk: the holding disk
830
*
831
* DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
832
*
833
* This functions creates the following sysfs symlinks.
834
*
835
* - from "slaves" directory of the holder @disk to the claimed @bdev
836
* - from "holders" directory of the @bdev to the holder @disk
837
*
838
* For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
839
* passed to bd_link_disk_holder(), then:
840
*
841
* /sys/block/dm-0/slaves/sda --> /sys/block/sda
842
* /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
843
*
844
* The caller must have claimed @bdev before calling this function and
845
* ensure that both @bdev and @disk are valid during the creation and
846
* lifetime of these symlinks.
847
*
848
* CONTEXT:
849
* Might sleep.
850
*
851
* RETURNS:
852
* 0 on success, -errno on failure.
853
*/
854
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
855
{
856
struct bd_holder_disk *holder;
857
int ret = 0;
858
859
mutex_lock(&bdev->bd_mutex);
860
861
WARN_ON_ONCE(!bdev->bd_holder);
862
863
/* FIXME: remove the following once add_disk() handles errors */
864
if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
865
goto out_unlock;
866
867
holder = bd_find_holder_disk(bdev, disk);
868
if (holder) {
869
holder->refcnt++;
870
goto out_unlock;
871
}
872
873
holder = kzalloc(sizeof(*holder), GFP_KERNEL);
874
if (!holder) {
875
ret = -ENOMEM;
876
goto out_unlock;
877
}
878
879
INIT_LIST_HEAD(&holder->list);
880
holder->disk = disk;
881
holder->refcnt = 1;
882
883
ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
884
if (ret)
885
goto out_free;
886
887
ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
888
if (ret)
889
goto out_del;
890
/*
891
* bdev could be deleted beneath us which would implicitly destroy
892
* the holder directory. Hold on to it.
893
*/
894
kobject_get(bdev->bd_part->holder_dir);
895
896
list_add(&holder->list, &bdev->bd_holder_disks);
897
goto out_unlock;
898
899
out_del:
900
del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
901
out_free:
902
kfree(holder);
903
out_unlock:
904
mutex_unlock(&bdev->bd_mutex);
905
return ret;
906
}
907
EXPORT_SYMBOL_GPL(bd_link_disk_holder);
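/*
 * Illustrative sketch (not part of the original file): a stacking driver
 * (dm/md style) that has already claimed @bdev pairs these calls so the
 * slaves/holders symlinks exist for the lifetime of the stacking.
 *
 *	err = bd_link_disk_holder(bdev, holder_disk);
 *	if (err)
 *		goto fail;
 *	// ... use the component device ...
 *	bd_unlink_disk_holder(bdev, holder_disk);
 */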
908
909
/**
910
* bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
911
* @bdev: the claimed slave bdev
912
* @disk: the holding disk
913
*
914
* DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
915
*
916
* CONTEXT:
917
* Might sleep.
918
*/
919
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
920
{
921
struct bd_holder_disk *holder;
922
923
mutex_lock(&bdev->bd_mutex);
924
925
holder = bd_find_holder_disk(bdev, disk);
926
927
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
928
del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
929
del_symlink(bdev->bd_part->holder_dir,
930
&disk_to_dev(disk)->kobj);
931
kobject_put(bdev->bd_part->holder_dir);
932
list_del_init(&holder->list);
933
kfree(holder);
934
}
935
936
mutex_unlock(&bdev->bd_mutex);
937
}
938
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
939
#endif
940
941
/**
942
* flush_disk - invalidates all buffer-cache entries on a disk
943
*
944
* @bdev: struct block device to be flushed
945
* @kill_dirty: flag to guide handling of dirty inodes
946
*
947
* Invalidates all buffer-cache entries on a disk. It should be called
948
* when a disk has been changed -- either by a media change or online
949
* resize.
950
*/
951
static void flush_disk(struct block_device *bdev, bool kill_dirty)
952
{
953
if (__invalidate_device(bdev, kill_dirty)) {
954
char name[BDEVNAME_SIZE] = "";
955
956
if (bdev->bd_disk)
957
disk_name(bdev->bd_disk, 0, name);
958
printk(KERN_WARNING "VFS: busy inodes on changed media or "
959
"resized disk %s\n", name);
960
}
961
962
if (!bdev->bd_disk)
963
return;
964
if (disk_partitionable(bdev->bd_disk))
965
bdev->bd_invalidated = 1;
966
}
967
968
/**
969
* check_disk_size_change - checks for disk size change and adjusts bdev size.
970
* @disk: struct gendisk to check
971
* @bdev: struct bdev to adjust.
972
*
973
* This routine checks whether the bdev size matches the disk size
974
* and adjusts it if it differs.
975
*/
976
void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
977
{
978
loff_t disk_size, bdev_size;
979
980
disk_size = (loff_t)get_capacity(disk) << 9;
981
bdev_size = i_size_read(bdev->bd_inode);
982
if (disk_size != bdev_size) {
983
char name[BDEVNAME_SIZE];
984
985
disk_name(disk, 0, name);
986
printk(KERN_INFO
987
"%s: detected capacity change from %lld to %lld\n",
988
name, bdev_size, disk_size);
989
i_size_write(bdev->bd_inode, disk_size);
990
flush_disk(bdev, false);
991
}
992
}
993
EXPORT_SYMBOL(check_disk_size_change);
994
995
/**
996
* revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
997
* @disk: struct gendisk to be revalidated
998
*
999
* This routine is a wrapper for lower-level driver's revalidate_disk
1000
* call-backs. It is used to do common pre and post operations needed
1001
* for all revalidate_disk operations.
1002
*/
1003
int revalidate_disk(struct gendisk *disk)
1004
{
1005
struct block_device *bdev;
1006
int ret = 0;
1007
1008
if (disk->fops->revalidate_disk)
1009
ret = disk->fops->revalidate_disk(disk);
1010
1011
bdev = bdget_disk(disk, 0);
1012
if (!bdev)
1013
return ret;
1014
1015
mutex_lock(&bdev->bd_mutex);
1016
check_disk_size_change(disk, bdev);
1017
mutex_unlock(&bdev->bd_mutex);
1018
bdput(bdev);
1019
return ret;
1020
}
1021
EXPORT_SYMBOL(revalidate_disk);
1022
1023
/*
1024
* This routine checks whether a removable media has been changed,
1025
* and invalidates all buffer-cache-entries in that case. This
1026
* is a relatively slow routine, so we have to try to minimize using
1027
* it. Thus it is called only upon a 'mount' or 'open'. This
1028
* is the best way of combining speed and utility, I think.
1029
* People changing diskettes in the middle of an operation deserve
1030
* to lose :-)
1031
*/
1032
int check_disk_change(struct block_device *bdev)
1033
{
1034
struct gendisk *disk = bdev->bd_disk;
1035
const struct block_device_operations *bdops = disk->fops;
1036
unsigned int events;
1037
1038
events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1039
DISK_EVENT_EJECT_REQUEST);
1040
if (!(events & DISK_EVENT_MEDIA_CHANGE))
1041
return 0;
1042
1043
flush_disk(bdev, true);
1044
if (bdops->revalidate_disk)
1045
bdops->revalidate_disk(bdev->bd_disk);
1046
return 1;
1047
}
1048
1049
EXPORT_SYMBOL(check_disk_change);
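/*
 * Illustrative sketch (not part of the original file): removable-media
 * drivers typically call check_disk_change() from their open() method so
 * stale cached data is dropped when new media is detected.  "mydrv" is a
 * hypothetical driver name.
 *
 *	static int mydrv_open(struct block_device *bdev, fmode_t mode)
 *	{
 *		check_disk_change(bdev);	// revalidates on media change
 *		...
 *	}
 */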
1050
1051
void bd_set_size(struct block_device *bdev, loff_t size)
1052
{
1053
unsigned bsize = bdev_logical_block_size(bdev);
1054
1055
bdev->bd_inode->i_size = size;
1056
while (bsize < PAGE_CACHE_SIZE) {
1057
if (size & bsize)
1058
break;
1059
bsize <<= 1;
1060
}
1061
bdev->bd_block_size = bsize;
1062
bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1063
}
1064
EXPORT_SYMBOL(bd_set_size);
1065
1066
static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1067
1068
/*
1069
* bd_mutex locking:
1070
*
1071
* mutex_lock(part->bd_mutex)
1072
* mutex_lock_nested(whole->bd_mutex, 1)
1073
*/
1074
1075
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1076
{
1077
struct gendisk *disk;
1078
int ret;
1079
int partno;
1080
int perm = 0;
1081
1082
if (mode & FMODE_READ)
1083
perm |= MAY_READ;
1084
if (mode & FMODE_WRITE)
1085
perm |= MAY_WRITE;
1086
/*
1087
* hooks: /n/, see "layering violations".
1088
*/
1089
if (!for_part) {
1090
ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1091
if (ret != 0) {
1092
bdput(bdev);
1093
return ret;
1094
}
1095
}
1096
1097
restart:
1098
1099
ret = -ENXIO;
1100
disk = get_gendisk(bdev->bd_dev, &partno);
1101
if (!disk)
1102
goto out;
1103
1104
disk_block_events(disk);
1105
mutex_lock_nested(&bdev->bd_mutex, for_part);
1106
if (!bdev->bd_openers) {
1107
bdev->bd_disk = disk;
1108
bdev->bd_contains = bdev;
1109
if (!partno) {
1110
struct backing_dev_info *bdi;
1111
1112
ret = -ENXIO;
1113
bdev->bd_part = disk_get_part(disk, partno);
1114
if (!bdev->bd_part)
1115
goto out_clear;
1116
1117
ret = 0;
1118
if (disk->fops->open) {
1119
ret = disk->fops->open(bdev, mode);
1120
if (ret == -ERESTARTSYS) {
1121
/* Lost a race with 'disk' being
1122
* deleted, try again.
1123
* See md.c
1124
*/
1125
disk_put_part(bdev->bd_part);
1126
bdev->bd_part = NULL;
1127
bdev->bd_disk = NULL;
1128
mutex_unlock(&bdev->bd_mutex);
1129
disk_unblock_events(disk);
1130
module_put(disk->fops->owner);
1131
put_disk(disk);
1132
goto restart;
1133
}
1134
}
1135
1136
if (!ret && !bdev->bd_openers) {
1137
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1138
bdi = blk_get_backing_dev_info(bdev);
1139
if (bdi == NULL)
1140
bdi = &default_backing_dev_info;
1141
bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1142
}
1143
1144
/*
1145
* If the device is invalidated, rescan partition
1146
* if open succeeded or failed with -ENOMEDIUM.
1147
* The latter is necessary to prevent ghost
1148
* partitions on a removed medium.
1149
*/
1150
if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
1151
rescan_partitions(disk, bdev);
1152
if (ret)
1153
goto out_clear;
1154
} else {
1155
struct block_device *whole;
1156
whole = bdget_disk(disk, 0);
1157
ret = -ENOMEM;
1158
if (!whole)
1159
goto out_clear;
1160
BUG_ON(for_part);
1161
ret = __blkdev_get(whole, mode, 1);
1162
if (ret)
1163
goto out_clear;
1164
bdev->bd_contains = whole;
1165
bdev_inode_switch_bdi(bdev->bd_inode,
1166
whole->bd_inode->i_data.backing_dev_info);
1167
bdev->bd_part = disk_get_part(disk, partno);
1168
if (!(disk->flags & GENHD_FL_UP) ||
1169
!bdev->bd_part || !bdev->bd_part->nr_sects) {
1170
ret = -ENXIO;
1171
goto out_clear;
1172
}
1173
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1174
}
1175
} else {
1176
if (bdev->bd_contains == bdev) {
1177
ret = 0;
1178
if (bdev->bd_disk->fops->open)
1179
ret = bdev->bd_disk->fops->open(bdev, mode);
1180
/* the same as first opener case, read comment there */
1181
if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
1182
rescan_partitions(bdev->bd_disk, bdev);
1183
if (ret)
1184
goto out_unlock_bdev;
1185
}
1186
/* only one opener holds refs to the module and disk */
1187
module_put(disk->fops->owner);
1188
put_disk(disk);
1189
}
1190
bdev->bd_openers++;
1191
if (for_part)
1192
bdev->bd_part_count++;
1193
mutex_unlock(&bdev->bd_mutex);
1194
disk_unblock_events(disk);
1195
return 0;
1196
1197
out_clear:
1198
disk_put_part(bdev->bd_part);
1199
bdev->bd_disk = NULL;
1200
bdev->bd_part = NULL;
1201
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1202
if (bdev != bdev->bd_contains)
1203
__blkdev_put(bdev->bd_contains, mode, 1);
1204
bdev->bd_contains = NULL;
1205
out_unlock_bdev:
1206
mutex_unlock(&bdev->bd_mutex);
1207
disk_unblock_events(disk);
1208
module_put(disk->fops->owner);
1209
put_disk(disk);
1210
out:
1211
bdput(bdev);
1212
1213
return ret;
1214
}
1215
1216
/**
1217
* blkdev_get - open a block device
1218
* @bdev: block_device to open
1219
* @mode: FMODE_* mask
1220
* @holder: exclusive holder identifier
1221
*
1222
* Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1223
* opened with exclusive access. Specifying %FMODE_EXCL with %NULL
1224
* @holder is invalid. Exclusive opens may nest for the same @holder.
1225
*
1226
* On success, the reference count of @bdev is unchanged. On failure,
1227
* @bdev is put.
1228
*
1229
* CONTEXT:
1230
* Might sleep.
1231
*
1232
* RETURNS:
1233
* 0 on success, -errno on failure.
1234
*/
1235
int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1236
{
1237
struct block_device *whole = NULL;
1238
int res;
1239
1240
WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1241
1242
if ((mode & FMODE_EXCL) && holder) {
1243
whole = bd_start_claiming(bdev, holder);
1244
if (IS_ERR(whole)) {
1245
bdput(bdev);
1246
return PTR_ERR(whole);
1247
}
1248
}
1249
1250
res = __blkdev_get(bdev, mode, 0);
1251
1252
if (whole) {
1253
struct gendisk *disk = whole->bd_disk;
1254
1255
/* finish claiming */
1256
mutex_lock(&bdev->bd_mutex);
1257
spin_lock(&bdev_lock);
1258
1259
if (!res) {
1260
BUG_ON(!bd_may_claim(bdev, whole, holder));
1261
/*
1262
* Note that for a whole device bd_holders
1263
* will be incremented twice, and bd_holder
1264
* will be set to bd_may_claim before being
1265
* set to holder
1266
*/
1267
whole->bd_holders++;
1268
whole->bd_holder = bd_may_claim;
1269
bdev->bd_holders++;
1270
bdev->bd_holder = holder;
1271
}
1272
1273
/* tell others that we're done */
1274
BUG_ON(whole->bd_claiming != holder);
1275
whole->bd_claiming = NULL;
1276
wake_up_bit(&whole->bd_claiming, 0);
1277
1278
spin_unlock(&bdev_lock);
1279
1280
/*
1281
* Block event polling for write claims if requested. Any
1282
* write holder makes the write_holder state stick until
1283
* all are released. This is good enough and tracking
1284
* individual writeable reference is too fragile given the
1285
* way @mode is used in blkdev_get/put().
1286
*/
1287
if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1288
(disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
1289
bdev->bd_write_holder = true;
1290
disk_block_events(disk);
1291
}
1292
1293
mutex_unlock(&bdev->bd_mutex);
1294
bdput(whole);
1295
}
1296
1297
return res;
1298
}
1299
EXPORT_SYMBOL(blkdev_get);
1300
1301
/**
1302
* blkdev_get_by_path - open a block device by name
1303
* @path: path to the block device to open
1304
* @mode: FMODE_* mask
1305
* @holder: exclusive holder identifier
1306
*
1307
* Open the blockdevice described by the device file at @path. @mode
1308
* and @holder are identical to blkdev_get().
1309
*
1310
* On success, the returned block_device has reference count of one.
1311
*
1312
* CONTEXT:
1313
* Might sleep.
1314
*
1315
* RETURNS:
1316
* Pointer to block_device on success, ERR_PTR(-errno) on failure.
1317
*/
1318
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1319
void *holder)
1320
{
1321
struct block_device *bdev;
1322
int err;
1323
1324
bdev = lookup_bdev(path);
1325
if (IS_ERR(bdev))
1326
return bdev;
1327
1328
err = blkdev_get(bdev, mode, holder);
1329
if (err)
1330
return ERR_PTR(err);
1331
1332
if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1333
blkdev_put(bdev, mode);
1334
return ERR_PTR(-EACCES);
1335
}
1336
1337
return bdev;
1338
}
1339
EXPORT_SYMBOL(blkdev_get_by_path);
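/*
 * Illustrative sketch (not part of the original file): a typical exclusive
 * open by path, e.g. from a filesystem or stacking driver.  The device path
 * and holder cookie below are placeholders; release with blkdev_put() using
 * the same mode.
 *
 *	bdev = blkdev_get_by_path("/dev/sdb1",
 *				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, my_ctx);
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	// ... do I/O ...
 *	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 */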
1340
1341
/**
1342
* blkdev_get_by_dev - open a block device by device number
1343
* @dev: device number of block device to open
1344
* @mode: FMODE_* mask
1345
* @holder: exclusive holder identifier
1346
*
1347
* Open the blockdevice described by device number @dev. @mode and
1348
* @holder are identical to blkdev_get().
1349
*
1350
* Use it ONLY if you really do not have anything better - i.e. when
1351
* you are behind a truly sucky interface and all you are given is a
1352
* device number. _Never_ to be used for internal purposes. If you
1353
* ever need it - reconsider your API.
1354
*
1355
* On success, the returned block_device has reference count of one.
1356
*
1357
* CONTEXT:
1358
* Might sleep.
1359
*
1360
* RETURNS:
1361
* Pointer to block_device on success, ERR_PTR(-errno) on failure.
1362
*/
1363
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1364
{
1365
struct block_device *bdev;
1366
int err;
1367
1368
bdev = bdget(dev);
1369
if (!bdev)
1370
return ERR_PTR(-ENOMEM);
1371
1372
err = blkdev_get(bdev, mode, holder);
1373
if (err)
1374
return ERR_PTR(err);
1375
1376
return bdev;
1377
}
1378
EXPORT_SYMBOL(blkdev_get_by_dev);
1379
1380
static int blkdev_open(struct inode * inode, struct file * filp)
1381
{
1382
struct block_device *bdev;
1383
1384
/*
1385
* Preserve backwards compatibility and allow large file access
1386
* even if userspace doesn't ask for it explicitly. Some mkfs
1387
* binary needs it. We might want to drop this workaround
1388
* during an unstable branch.
1389
*/
1390
filp->f_flags |= O_LARGEFILE;
1391
1392
if (filp->f_flags & O_NDELAY)
1393
filp->f_mode |= FMODE_NDELAY;
1394
if (filp->f_flags & O_EXCL)
1395
filp->f_mode |= FMODE_EXCL;
1396
if ((filp->f_flags & O_ACCMODE) == 3)
1397
filp->f_mode |= FMODE_WRITE_IOCTL;
1398
1399
bdev = bd_acquire(inode);
1400
if (bdev == NULL)
1401
return -ENOMEM;
1402
1403
filp->f_mapping = bdev->bd_inode->i_mapping;
1404
1405
return blkdev_get(bdev, filp->f_mode, filp);
1406
}
1407
1408
static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1409
{
1410
int ret = 0;
1411
struct gendisk *disk = bdev->bd_disk;
1412
struct block_device *victim = NULL;
1413
1414
mutex_lock_nested(&bdev->bd_mutex, for_part);
1415
if (for_part)
1416
bdev->bd_part_count--;
1417
1418
if (!--bdev->bd_openers) {
1419
WARN_ON_ONCE(bdev->bd_holders);
1420
sync_blockdev(bdev);
1421
kill_bdev(bdev);
1422
}
1423
if (bdev->bd_contains == bdev) {
1424
if (disk->fops->release)
1425
ret = disk->fops->release(disk, mode);
1426
}
1427
if (!bdev->bd_openers) {
1428
struct module *owner = disk->fops->owner;
1429
1430
put_disk(disk);
1431
module_put(owner);
1432
disk_put_part(bdev->bd_part);
1433
bdev->bd_part = NULL;
1434
bdev->bd_disk = NULL;
1435
bdev_inode_switch_bdi(bdev->bd_inode,
1436
&default_backing_dev_info);
1437
if (bdev != bdev->bd_contains)
1438
victim = bdev->bd_contains;
1439
bdev->bd_contains = NULL;
1440
}
1441
mutex_unlock(&bdev->bd_mutex);
1442
bdput(bdev);
1443
if (victim)
1444
__blkdev_put(victim, mode, 1);
1445
return ret;
1446
}
1447
1448
int blkdev_put(struct block_device *bdev, fmode_t mode)
1449
{
1450
if (mode & FMODE_EXCL) {
1451
bool bdev_free;
1452
1453
/*
1454
* Release a claim on the device. The holder fields
1455
* are protected with bdev_lock. bd_mutex is to
1456
* synchronize disk_holder unlinking.
1457
*/
1458
mutex_lock(&bdev->bd_mutex);
1459
spin_lock(&bdev_lock);
1460
1461
WARN_ON_ONCE(--bdev->bd_holders < 0);
1462
WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1463
1464
/* bd_contains might point to self, check in a separate step */
1465
if ((bdev_free = !bdev->bd_holders))
1466
bdev->bd_holder = NULL;
1467
if (!bdev->bd_contains->bd_holders)
1468
bdev->bd_contains->bd_holder = NULL;
1469
1470
spin_unlock(&bdev_lock);
1471
1472
/*
1473
* If this was the last claim, remove holder link and
1474
* unblock event polling if it was a write holder.
1475
*/
1476
if (bdev_free) {
1477
if (bdev->bd_write_holder) {
1478
disk_unblock_events(bdev->bd_disk);
1479
disk_check_events(bdev->bd_disk);
1480
bdev->bd_write_holder = false;
1481
}
1482
}
1483
1484
mutex_unlock(&bdev->bd_mutex);
1485
}
1486
1487
return __blkdev_put(bdev, mode, 0);
1488
}
1489
EXPORT_SYMBOL(blkdev_put);
1490
1491
static int blkdev_close(struct inode * inode, struct file * filp)
1492
{
1493
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1494
1495
return blkdev_put(bdev, filp->f_mode);
1496
}
1497
1498
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1499
{
1500
struct block_device *bdev = I_BDEV(file->f_mapping->host);
1501
fmode_t mode = file->f_mode;
1502
1503
/*
1504
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1505
* to update it before every ioctl.
1506
*/
1507
if (file->f_flags & O_NDELAY)
1508
mode |= FMODE_NDELAY;
1509
else
1510
mode &= ~FMODE_NDELAY;
1511
1512
return blkdev_ioctl(bdev, mode, cmd, arg);
1513
}
1514
1515
/*
1516
* Write data to the block device. Only intended for the block device itself
1517
* and the raw driver which basically is a fake block device.
1518
*
1519
* Does not take i_mutex for the write and thus is not for general purpose
1520
* use.
1521
*/
1522
ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1523
unsigned long nr_segs, loff_t pos)
1524
{
1525
struct file *file = iocb->ki_filp;
1526
ssize_t ret;
1527
1528
BUG_ON(iocb->ki_pos != pos);
1529
1530
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1531
if (ret > 0 || ret == -EIOCBQUEUED) {
1532
ssize_t err;
1533
1534
err = generic_write_sync(file, pos, ret);
1535
if (err < 0 && ret > 0)
1536
ret = err;
1537
}
1538
return ret;
1539
}
1540
EXPORT_SYMBOL_GPL(blkdev_aio_write);
1541
1542
/*
1543
* Try to release a page associated with block device when the system
1544
* is under memory pressure.
1545
*/
1546
static int blkdev_releasepage(struct page *page, gfp_t wait)
1547
{
1548
struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1549
1550
if (super && super->s_op->bdev_try_to_free_page)
1551
return super->s_op->bdev_try_to_free_page(super, page, wait);
1552
1553
return try_to_free_buffers(page);
1554
}
1555
1556
static const struct address_space_operations def_blk_aops = {
1557
.readpage = blkdev_readpage,
1558
.writepage = blkdev_writepage,
1559
.write_begin = blkdev_write_begin,
1560
.write_end = blkdev_write_end,
1561
.writepages = generic_writepages,
1562
.releasepage = blkdev_releasepage,
1563
.direct_IO = blkdev_direct_IO,
1564
};
1565
1566
const struct file_operations def_blk_fops = {
1567
.open = blkdev_open,
1568
.release = blkdev_close,
1569
.llseek = block_llseek,
1570
.read = do_sync_read,
1571
.write = do_sync_write,
1572
.aio_read = generic_file_aio_read,
1573
.aio_write = blkdev_aio_write,
1574
.mmap = generic_file_mmap,
1575
.fsync = blkdev_fsync,
1576
.unlocked_ioctl = block_ioctl,
1577
#ifdef CONFIG_COMPAT
1578
.compat_ioctl = compat_blkdev_ioctl,
1579
#endif
1580
.splice_read = generic_file_splice_read,
1581
.splice_write = generic_file_splice_write,
1582
};
1583
1584
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1585
{
1586
int res;
1587
mm_segment_t old_fs = get_fs();
1588
set_fs(KERNEL_DS);
1589
res = blkdev_ioctl(bdev, 0, cmd, arg);
1590
set_fs(old_fs);
1591
return res;
1592
}
1593
1594
EXPORT_SYMBOL(ioctl_by_bdev);
1595
1596
/**
1597
* lookup_bdev - lookup a struct block_device by name
1598
* @pathname: special file representing the block device
1599
*
1600
* Get a reference to the blockdevice at @pathname in the current
1601
* namespace if possible and return it. Return ERR_PTR(error)
1602
* otherwise.
1603
*/
1604
struct block_device *lookup_bdev(const char *pathname)
1605
{
1606
struct block_device *bdev;
1607
struct inode *inode;
1608
struct path path;
1609
int error;
1610
1611
if (!pathname || !*pathname)
1612
return ERR_PTR(-EINVAL);
1613
1614
error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1615
if (error)
1616
return ERR_PTR(error);
1617
1618
inode = path.dentry->d_inode;
1619
error = -ENOTBLK;
1620
if (!S_ISBLK(inode->i_mode))
1621
goto fail;
1622
error = -EACCES;
1623
if (path.mnt->mnt_flags & MNT_NODEV)
1624
goto fail;
1625
error = -ENOMEM;
1626
bdev = bd_acquire(inode);
1627
if (!bdev)
1628
goto fail;
1629
out:
1630
path_put(&path);
1631
return bdev;
1632
fail:
1633
bdev = ERR_PTR(error);
1634
goto out;
1635
}
1636
EXPORT_SYMBOL(lookup_bdev);
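/*
 * Illustrative sketch (not part of the original file): lookup_bdev() only
 * resolves a path to a struct block_device reference; it does not open the
 * device.  Drop the reference with bdput().
 *
 *	bdev = lookup_bdev("/dev/loop0");
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	// ... compare bdev->bd_dev, etc. ...
 *	bdput(bdev);
 */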
1637
1638
int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1639
{
1640
struct super_block *sb = get_super(bdev);
1641
int res = 0;
1642
1643
if (sb) {
1644
/*
1645
* no need to lock the super, get_super holds the
1646
* read mutex so the filesystem cannot go away
1647
* under us (->put_super runs with the write lock
1648
* held).
1649
*/
1650
shrink_dcache_sb(sb);
1651
res = invalidate_inodes(sb, kill_dirty);
1652
drop_super(sb);
1653
}
1654
invalidate_bdev(bdev);
1655
return res;
1656
}
1657
EXPORT_SYMBOL(__invalidate_device);
1658
1659