GitHub Repository: awilliam/linux-vfio
Path: blob/master/fs/bio.c
/*
 * Copyright (C) 2001 Jens Axboe <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public Licens
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <scsi/sg.h>		/* for struct sg_iovec */

#include <trace/events/block.h>

/*
 * Test patch to inline a certain number of bi_io_vec's inside the bio
 * itself, to shrink a bio data allocation from two mempool calls to one
 */
#define BIO_INLINE_VECS		4

static mempool_t *bio_split_pool __read_mostly;

/*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
 * unsigned short
 */
#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
struct bio_set *fs_bio_set;

/*
 * Our slab pool management
 */
struct bio_slab {
	struct kmem_cache *slab;
	unsigned int slab_ref;
	unsigned int slab_size;
	char name[8];
};
static DEFINE_MUTEX(bio_slab_lock);
static struct bio_slab *bio_slabs;
static unsigned int bio_slab_nr, bio_slab_max;

static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
{
	unsigned int sz = sizeof(struct bio) + extra_size;
	struct kmem_cache *slab = NULL;
	struct bio_slab *bslab;
	unsigned int i, entry = -1;

	mutex_lock(&bio_slab_lock);

	i = 0;
	while (i < bio_slab_nr) {
		bslab = &bio_slabs[i];

		if (!bslab->slab && entry == -1)
			entry = i;
		else if (bslab->slab_size == sz) {
			slab = bslab->slab;
			bslab->slab_ref++;
			break;
		}
		i++;
	}

	if (slab)
		goto out_unlock;

	if (bio_slab_nr == bio_slab_max && entry == -1) {
		bio_slab_max <<= 1;
		bio_slabs = krealloc(bio_slabs,
				     bio_slab_max * sizeof(struct bio_slab),
				     GFP_KERNEL);
		if (!bio_slabs)
			goto out_unlock;
	}
	if (entry == -1)
		entry = bio_slab_nr++;

	bslab = &bio_slabs[entry];

	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
	slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!slab)
		goto out_unlock;

	printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
	bslab->slab = slab;
	bslab->slab_ref = 1;
	bslab->slab_size = sz;
out_unlock:
	mutex_unlock(&bio_slab_lock);
	return slab;
}

static void bio_put_slab(struct bio_set *bs)
{
	struct bio_slab *bslab = NULL;
	unsigned int i;

	mutex_lock(&bio_slab_lock);

	for (i = 0; i < bio_slab_nr; i++) {
		if (bs->bio_slab == bio_slabs[i].slab) {
			bslab = &bio_slabs[i];
			break;
		}
	}

	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
		goto out;

	WARN_ON(!bslab->slab_ref);

	if (--bslab->slab_ref)
		goto out;

	kmem_cache_destroy(bslab->slab);
	bslab->slab = NULL;

out:
	mutex_unlock(&bio_slab_lock);
}

unsigned int bvec_nr_vecs(unsigned short idx)
{
	return bvec_slabs[idx].nr_vecs;
}

void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
{
	BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);

	if (idx == BIOVEC_MAX_IDX)
		mempool_free(bv, bs->bvec_pool);
	else {
		struct biovec_slab *bvs = bvec_slabs + idx;

		kmem_cache_free(bvs->slab, bv);
	}
}

struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
			      struct bio_set *bs)
{
	struct bio_vec *bvl;

	/*
	 * see comment near bvec_array define!
	 */
	switch (nr) {
	case 1:
		*idx = 0;
		break;
	case 2 ... 4:
		*idx = 1;
		break;
	case 5 ... 16:
		*idx = 2;
		break;
	case 17 ... 64:
		*idx = 3;
		break;
	case 65 ... 128:
		*idx = 4;
		break;
	case 129 ... BIO_MAX_PAGES:
		*idx = 5;
		break;
	default:
		return NULL;
	}

	/*
	 * idx now points to the pool we want to allocate from. only the
	 * 1-vec entry pool is mempool backed.
	 */
	if (*idx == BIOVEC_MAX_IDX) {
fallback:
		bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
	} else {
		struct biovec_slab *bvs = bvec_slabs + *idx;
		gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);

		/*
		 * Make this allocation restricted and don't dump info on
		 * allocation failures, since we'll fallback to the mempool
		 * in case of failure.
		 */
		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;

		/*
		 * Try a slab allocation. If this fails and __GFP_WAIT
		 * is set, retry with the 1-entry mempool
		 */
		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
		if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
			*idx = BIOVEC_MAX_IDX;
			goto fallback;
		}
	}

	return bvl;
}

void bio_free(struct bio *bio, struct bio_set *bs)
{
	void *p;

	if (bio_has_allocated_vec(bio))
		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));

	if (bio_integrity(bio))
		bio_integrity_free(bio, bs);

	/*
	 * If we have front padding, adjust the bio pointer before freeing
	 */
	p = bio;
	if (bs->front_pad)
		p -= bs->front_pad;

	mempool_free(p, bs->bio_pool);
}
EXPORT_SYMBOL(bio_free);

void bio_init(struct bio *bio)
{
	memset(bio, 0, sizeof(*bio));
	bio->bi_flags = 1 << BIO_UPTODATE;
	bio->bi_comp_cpu = -1;
	atomic_set(&bio->bi_cnt, 1);
}
EXPORT_SYMBOL(bio_init);

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask: the GFP_ mask given to the slab allocator
 * @nr_iovecs: number of iovecs to pre-allocate
 * @bs: the bio_set to allocate from.
 *
 * Description:
 *   bio_alloc_bioset will try its own mempool to satisfy the allocation.
 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
 *   for a &struct bio to become free.
 *
 *   Note that the caller must set ->bi_destructor on successful return
 *   of a bio, to do the appropriate freeing of the bio once the reference
 *   count drops to zero.
 **/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
	unsigned long idx = BIO_POOL_NONE;
	struct bio_vec *bvl = NULL;
	struct bio *bio;
	void *p;

	p = mempool_alloc(bs->bio_pool, gfp_mask);
	if (unlikely(!p))
		return NULL;
	bio = p + bs->front_pad;

	bio_init(bio);

	if (unlikely(!nr_iovecs))
		goto out_set;

	if (nr_iovecs <= BIO_INLINE_VECS) {
		bvl = bio->bi_inline_vecs;
		nr_iovecs = BIO_INLINE_VECS;
	} else {
		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
		if (unlikely(!bvl))
			goto err_free;

		nr_iovecs = bvec_nr_vecs(idx);
	}
out_set:
	bio->bi_flags |= idx << BIO_POOL_OFFSET;
	bio->bi_max_vecs = nr_iovecs;
	bio->bi_io_vec = bvl;
	return bio;

err_free:
	mempool_free(p, bs->bio_pool);
	return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);

static void bio_fs_destructor(struct bio *bio)
{
	bio_free(bio, fs_bio_set);
}

/**
 * bio_alloc - allocate a new bio, memory pool backed
 * @gfp_mask: allocation mask to use
 * @nr_iovecs: number of iovecs
 *
 * bio_alloc will allocate a bio and associated bio_vec array that can hold
 * at least @nr_iovecs entries. Allocations will be done from the
 * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
 *
 * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
 * a bio. This is due to the mempool guarantees. To make this work, callers
 * must never allocate more than 1 bio at a time from this pool. Callers
 * that need to allocate more than 1 bio must always submit the previously
 * allocated bio for IO before attempting to allocate a new one. Failure to
 * do so can cause livelocks under memory pressure.
 *
 * RETURNS:
 * Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);

	if (bio)
		bio->bi_destructor = bio_fs_destructor;

	return bio;
}
EXPORT_SYMBOL(bio_alloc);
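
/*
 * Illustrative sketch (not part of the original bio.c): the
 * allocate/fill/submit pattern that the bio_alloc() comment above
 * describes.  Only one bio is allocated from the mempool-backed
 * fs_bio_set before it is submitted, which is what keeps the
 * __GFP_WAIT guarantee free of livelocks.  The helper name and its
 * arguments are hypothetical.
 */
#if 0
static void example_read_one_page(struct block_device *bdev, struct page *page,
				  sector_t sector, bio_end_io_t *done)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_bdev = bdev;	/* must be set before bio_add_page() */
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);	/* return value checked in real code */
	bio->bi_end_io = done;	/* called from bio_endio(); expected to bio_put() */
	submit_bio(READ, bio);
}
#endif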

static void bio_kmalloc_destructor(struct bio *bio)
{
	if (bio_integrity(bio))
		bio_integrity_free(bio, fs_bio_set);
	kfree(bio);
}

/**
 * bio_kmalloc - allocate a bio for I/O using kmalloc()
 * @gfp_mask: the GFP_ mask given to the slab allocator
 * @nr_iovecs: number of iovecs to pre-allocate
 *
 * Description:
 *   Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
 *   %__GFP_WAIT, the allocation is guaranteed to succeed.
 *
 **/
struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
{
	struct bio *bio;

	if (nr_iovecs > UIO_MAXIOV)
		return NULL;

	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
		      gfp_mask);
	if (unlikely(!bio))
		return NULL;

	bio_init(bio);
	bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
	bio->bi_max_vecs = nr_iovecs;
	bio->bi_io_vec = bio->bi_inline_vecs;
	bio->bi_destructor = bio_kmalloc_destructor;

	return bio;
}
EXPORT_SYMBOL(bio_kmalloc);

void zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i) {
		char *data = bvec_kmap_irq(bv, &flags);
		memset(data, 0, bv->bv_len);
		flush_dcache_page(bv->bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio);

/**
 * bio_put - release a reference to a bio
 * @bio: bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt)) {
		bio->bi_next = NULL;
		bio->bi_destructor(bio);
	}
}
EXPORT_SYMBOL(bio_put);

inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_phys_segments;
}
EXPORT_SYMBOL(bio_phys_segments);

/**
 * __bio_clone - clone a bio
 * @bio: destination bio
 * @bio_src: bio to clone
 *
 * Clone a &bio. Caller will own the returned bio, but not
 * the actual data it points to. Reference count of returned
 * bio will be one.
 */
void __bio_clone(struct bio *bio, struct bio *bio_src)
{
	memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
		bio_src->bi_max_vecs * sizeof(struct bio_vec));

	/*
	 * most users will be overriding ->bi_bdev with a new target,
	 * so we don't set nor calculate new physical/hw segment counts here
	 */
	bio->bi_sector = bio_src->bi_sector;
	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_flags |= 1 << BIO_CLONED;
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_vcnt = bio_src->bi_vcnt;
	bio->bi_size = bio_src->bi_size;
	bio->bi_idx = bio_src->bi_idx;
}
EXPORT_SYMBOL(__bio_clone);

/**
 * bio_clone - clone a bio
 * @bio: bio to clone
 * @gfp_mask: allocation priority
 *
 * Like __bio_clone, only also allocates the returned bio
 */
struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);

	if (!b)
		return NULL;

	b->bi_destructor = bio_fs_destructor;
	__bio_clone(b, bio);

	if (bio_integrity(bio)) {
		int ret;

		ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);

		if (ret < 0) {
			bio_put(b);
			return NULL;
		}
	}

	return b;
}
EXPORT_SYMBOL(bio_clone);

/**
 * bio_get_nr_vecs - return approx number of vecs
 * @bdev: I/O target
 *
 * Return the approximate number of pages we can send to this target.
 * There's no guarantee that you will be able to fit this number of pages
 * into a bio, it does not account for dynamic restrictions that vary
 * on offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);
	int nr_pages;

	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages > queue_max_segments(q))
		nr_pages = queue_max_segments(q);

	return nr_pages;
}
EXPORT_SYMBOL(bio_get_nr_vecs);

static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
			  *page, unsigned int len, unsigned int offset,
			  unsigned short max_sectors)
{
	int retried_segments = 0;
	struct bio_vec *bvec;

	/*
	 * cloned bio must not modify vec list
	 */
	if (unlikely(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (((bio->bi_size + len) >> 9) > max_sectors)
		return 0;

	/*
	 * For filesystems with a blocksize smaller than the pagesize
	 * we will often be called with the same page as last time and
	 * a consecutive offset. Optimize this special case.
	 */
	if (bio->bi_vcnt > 0) {
		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (page == prev->bv_page &&
		    offset == prev->bv_offset + prev->bv_len) {
			unsigned int prev_bv_len = prev->bv_len;
			prev->bv_len += len;

			if (q->merge_bvec_fn) {
				struct bvec_merge_data bvm = {
					/* prev_bvec is already charged in
					   bi_size, discharge it in order to
					   simulate merging updated prev_bvec
					   as new bvec. */
					.bi_bdev = bio->bi_bdev,
					.bi_sector = bio->bi_sector,
					.bi_size = bio->bi_size - prev_bv_len,
					.bi_rw = bio->bi_rw,
				};

				if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
					prev->bv_len -= len;
					return 0;
				}
			}

			goto done;
		}
	}

	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return 0;

	/*
	 * we might lose a segment or two here, but rather that than
	 * make this too complex.
	 */

	while (bio->bi_phys_segments >= queue_max_segments(q)) {

		if (retried_segments)
			return 0;

		retried_segments = 1;
		blk_recount_segments(q, bio);
	}

	/*
	 * setup the new entry, we might clear it again later if we
	 * cannot add the page
	 */
	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;

	/*
	 * if queue has other restrictions (eg varying max sector size
	 * depending on offset), it can specify a merge_bvec_fn in the
	 * queue to get further control
	 */
	if (q->merge_bvec_fn) {
		struct bvec_merge_data bvm = {
			.bi_bdev = bio->bi_bdev,
			.bi_sector = bio->bi_sector,
			.bi_size = bio->bi_size,
			.bi_rw = bio->bi_rw,
		};

		/*
		 * merge_bvec_fn() returns number of bytes it can accept
		 * at this offset
		 */
		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
			bvec->bv_page = NULL;
			bvec->bv_len = 0;
			bvec->bv_offset = 0;
			return 0;
		}
	}

	/* If we may be able to merge these biovecs, force a recount */
	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

	bio->bi_vcnt++;
	bio->bi_phys_segments++;
 done:
	bio->bi_size += len;
	return len;
}

/**
 * bio_add_pc_page - attempt to add page to bio
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block device
 * limitations. The target block device must allow bio's up to PAGE_SIZE,
 * so it is always possible to add a single page to an empty bio.
 *
 * This should only be used by REQ_PC bios.
 */
int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
		    unsigned int len, unsigned int offset)
{
	return __bio_add_page(q, bio, page, len, offset,
			      queue_max_hw_sectors(q));
}
EXPORT_SYMBOL(bio_add_pc_page);

/**
 * bio_add_page - attempt to add page to bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block device
 * limitations. The target block device must allow bio's up to PAGE_SIZE,
 * so it is always possible to add a single page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
		 unsigned int offset)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
}
EXPORT_SYMBOL(bio_add_page);

struct bio_map_data {
	struct bio_vec *iovecs;
	struct sg_iovec *sgvecs;
	int nr_sgvecs;
	int is_our_pages;
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
			     struct sg_iovec *iov, int iov_count,
			     int is_our_pages)
{
	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
	bmd->nr_sgvecs = iov_count;
	bmd->is_our_pages = is_our_pages;
	bio->bi_private = bmd;
}

static void bio_free_map_data(struct bio_map_data *bmd)
{
	kfree(bmd->iovecs);
	kfree(bmd->sgvecs);
	kfree(bmd);
}

static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
					       gfp_t gfp_mask)
{
	struct bio_map_data *bmd;

	if (iov_count > UIO_MAXIOV)
		return NULL;

	bmd = kmalloc(sizeof(*bmd), gfp_mask);
	if (!bmd)
		return NULL;

	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
	if (!bmd->iovecs) {
		kfree(bmd);
		return NULL;
	}

	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
	if (bmd->sgvecs)
		return bmd;

	kfree(bmd->iovecs);
	kfree(bmd);
	return NULL;
}

static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
			  struct sg_iovec *iov, int iov_count,
			  int to_user, int from_user, int do_free_page)
{
	int ret = 0, i;
	struct bio_vec *bvec;
	int iov_idx = 0;
	unsigned int iov_off = 0;

	__bio_for_each_segment(bvec, bio, i, 0) {
		char *bv_addr = page_address(bvec->bv_page);
		unsigned int bv_len = iovecs[i].bv_len;

		while (bv_len && iov_idx < iov_count) {
			unsigned int bytes;
			char __user *iov_addr;

			bytes = min_t(unsigned int,
				      iov[iov_idx].iov_len - iov_off, bv_len);
			iov_addr = iov[iov_idx].iov_base + iov_off;

			if (!ret) {
				if (to_user)
					ret = copy_to_user(iov_addr, bv_addr,
							   bytes);

				if (from_user)
					ret = copy_from_user(bv_addr, iov_addr,
							     bytes);

				if (ret)
					ret = -EFAULT;
			}

			bv_len -= bytes;
			bv_addr += bytes;
			iov_addr += bytes;
			iov_off += bytes;

			if (iov[iov_idx].iov_len == iov_off) {
				iov_idx++;
				iov_off = 0;
			}
		}

		if (do_free_page)
			__free_page(bvec->bv_page);
	}

	return ret;
}

/**
 * bio_uncopy_user - finish previously mapped bio
 * @bio: bio being terminated
 *
 * Free pages allocated from bio_copy_user() and write back data
 * to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
	struct bio_map_data *bmd = bio->bi_private;
	int ret = 0;

	if (!bio_flagged(bio, BIO_NULL_MAPPED))
		ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
				     bmd->nr_sgvecs, bio_data_dir(bio) == READ,
				     0, bmd->is_our_pages);
	bio_free_map_data(bmd);
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL(bio_uncopy_user);

/**
 * bio_copy_user_iov - copy user data to bio
 * @q: destination block queue
 * @map_data: pointer to the rq_map_data holding pages (if necessary)
 * @iov: the iovec.
 * @iov_count: number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with
 * call bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user_iov(struct request_queue *q,
			      struct rq_map_data *map_data,
			      struct sg_iovec *iov, int iov_count,
			      int write_to_vm, gfp_t gfp_mask)
{
	struct bio_map_data *bmd;
	struct bio_vec *bvec;
	struct page *page;
	struct bio *bio;
	int i, ret;
	int nr_pages = 0;
	unsigned int len = 0;
	unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr;
		unsigned long end;
		unsigned long start;

		uaddr = (unsigned long)iov[i].iov_base;
		end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		start = uaddr >> PAGE_SHIFT;

		/*
		 * Overflow, abort
		 */
		if (end < start)
			return ERR_PTR(-EINVAL);

		nr_pages += end - start;
		len += iov[i].iov_len;
	}

	if (offset)
		nr_pages++;

	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
	if (!bmd)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		goto out_bmd;

	if (!write_to_vm)
		bio->bi_rw |= REQ_WRITE;

	ret = 0;

	if (map_data) {
		nr_pages = 1 << map_data->page_order;
		i = map_data->offset / PAGE_SIZE;
	}
	while (len) {
		unsigned int bytes = PAGE_SIZE;

		bytes -= offset;

		if (bytes > len)
			bytes = len;

		if (map_data) {
			if (i == map_data->nr_entries * nr_pages) {
				ret = -ENOMEM;
				break;
			}

			page = map_data->pages[i / nr_pages];
			page += (i % nr_pages);

			i++;
		} else {
			page = alloc_page(q->bounce_gfp | gfp_mask);
			if (!page) {
				ret = -ENOMEM;
				break;
			}
		}

		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
			break;

		len -= bytes;
		offset = 0;
	}

	if (ret)
		goto cleanup;

	/*
	 * success
	 */
	if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
	    (map_data && map_data->from_user)) {
		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0);
		if (ret)
			goto cleanup;
	}

	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
	return bio;
cleanup:
	if (!map_data)
		bio_for_each_segment(bvec, bio, i)
			__free_page(bvec->bv_page);

	bio_put(bio);
out_bmd:
	bio_free_map_data(bmd);
	return ERR_PTR(ret);
}

/**
 * bio_copy_user - copy user data to bio
 * @q: destination block queue
 * @map_data: pointer to the rq_map_data holding pages (if necessary)
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with
 * call bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
			  unsigned long uaddr, unsigned int len,
			  int write_to_vm, gfp_t gfp_mask)
{
	struct sg_iovec iov;

	iov.iov_base = (void __user *)uaddr;
	iov.iov_len = len;

	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
}
EXPORT_SYMBOL(bio_copy_user);

static struct bio *__bio_map_user_iov(struct request_queue *q,
				      struct block_device *bdev,
				      struct sg_iovec *iov, int iov_count,
				      int write_to_vm, gfp_t gfp_mask)
{
	int i, j;
	int nr_pages = 0;
	struct page **pages;
	struct bio *bio;
	int cur_page = 0;
	int ret, offset;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;

		/*
		 * Overflow, abort
		 */
		if (end < start)
			return ERR_PTR(-EINVAL);

		nr_pages += end - start;
		/*
		 * buffer must be aligned to at least hardsector size for now
		 */
		if (uaddr & queue_dma_alignment(q))
			return ERR_PTR(-EINVAL);
	}

	if (!nr_pages)
		return ERR_PTR(-EINVAL);

	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
	if (!pages)
		goto out;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;
		const int local_nr_pages = end - start;
		const int page_limit = cur_page + local_nr_pages;

		ret = get_user_pages_fast(uaddr, local_nr_pages,
				write_to_vm, &pages[cur_page]);
		if (ret < local_nr_pages) {
			ret = -EFAULT;
			goto out_unmap;
		}

		offset = uaddr & ~PAGE_MASK;
		for (j = cur_page; j < page_limit; j++) {
			unsigned int bytes = PAGE_SIZE - offset;

			if (len <= 0)
				break;

			if (bytes > len)
				bytes = len;

			/*
			 * sorry...
			 */
			if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
					    bytes)
				break;

			len -= bytes;
			offset = 0;
		}

		cur_page = j;
		/*
		 * release the pages we didn't map into the bio, if any
		 */
		while (j < page_limit)
			page_cache_release(pages[j++]);
	}

	kfree(pages);

	/*
	 * set data direction, and check if mapped pages need bouncing
	 */
	if (!write_to_vm)
		bio->bi_rw |= REQ_WRITE;

	bio->bi_bdev = bdev;
	bio->bi_flags |= (1 << BIO_USER_MAPPED);
	return bio;

 out_unmap:
	for (i = 0; i < nr_pages; i++) {
		if(!pages[i])
			break;
		page_cache_release(pages[i]);
	}
 out:
	kfree(pages);
	bio_put(bio);
	return ERR_PTR(ret);
}

/**
 * bio_map_user - map user address into bio
 * @q: the struct request_queue for the bio
 * @bdev: destination block device
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
			 unsigned long uaddr, unsigned int len, int write_to_vm,
			 gfp_t gfp_mask)
{
	struct sg_iovec iov;

	iov.iov_base = (void __user *)uaddr;
	iov.iov_len = len;

	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
}
EXPORT_SYMBOL(bio_map_user);

/**
 * bio_map_user_iov - map user sg_iovec table into bio
 * @q: the struct request_queue for the bio
 * @bdev: destination block device
 * @iov: the iovec.
 * @iov_count: number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
			     struct sg_iovec *iov, int iov_count,
			     int write_to_vm, gfp_t gfp_mask)
{
	struct bio *bio;

	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
				 gfp_mask);
	if (IS_ERR(bio))
		return bio;

	/*
	 * subtle -- if __bio_map_user() ended up bouncing a bio,
	 * it would normally disappear when its bi_end_io is run.
	 * however, we need it for the unmap, so grab an extra
	 * reference to it
	 */
	bio_get(bio);

	return bio;
}

static void __bio_unmap_user(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/*
	 * make sure we dirty pages we wrote to
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		if (bio_data_dir(bio) == READ)
			set_page_dirty_lock(bvec->bv_page);

		page_cache_release(bvec->bv_page);
	}

	bio_put(bio);
}

/**
 * bio_unmap_user - unmap a bio
 * @bio: the bio being unmapped
 *
 * Unmap a bio previously mapped by bio_map_user(). Must be called with
 * a process context.
 *
 * bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
	__bio_unmap_user(bio);
	bio_put(bio);
}
EXPORT_SYMBOL(bio_unmap_user);

static void bio_map_kern_endio(struct bio *bio, int err)
{
	bio_put(bio);
}

static struct bio *__bio_map_kern(struct request_queue *q, void *data,
				  unsigned int len, gfp_t gfp_mask)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int offset, i;
	struct bio *bio;

	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
				    offset) < bytes)
			break;

		data += bytes;
		len -= bytes;
		offset = 0;
	}

	bio->bi_end_io = bio_map_kern_endio;
	return bio;
}

/**
 * bio_map_kern - map kernel address into bio
 * @q: the struct request_queue for the bio
 * @data: pointer to buffer to map
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio allocation
 *
 * Map the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
			 gfp_t gfp_mask)
{
	struct bio *bio;

	bio = __bio_map_kern(q, data, len, gfp_mask);
	if (IS_ERR(bio))
		return bio;

	if (bio->bi_size == len)
		return bio;

	/*
	 * Don't support partial mappings.
	 */
	bio_put(bio);
	return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL(bio_map_kern);
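
/*
 * Illustrative sketch (not part of the original bio.c): wrapping a
 * kmalloc()'ed buffer with bio_map_kern() and submitting it.  The buffer
 * must sit in the linear mapping (virt_to_page() is used above) and must
 * fit in a single bio, otherwise an ERR_PTR comes back.  The bio frees
 * itself through bio_map_kern_endio() on completion, so the buffer has to
 * stay alive until then; most callers reach this path via blk_rq_map_kern()
 * instead.  The helper below is hypothetical.
 */
#if 0
static int example_write_kernel_buf(struct block_device *bdev, void *buf,
				    unsigned int len, sector_t sector)
{
	struct request_queue *q = bdev_get_queue(bdev);
	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);

	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	submit_bio(WRITE, bio);
	return 0;
}
#endif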

static void bio_copy_kern_endio(struct bio *bio, int err)
{
	struct bio_vec *bvec;
	const int read = bio_data_dir(bio) == READ;
	struct bio_map_data *bmd = bio->bi_private;
	int i;
	char *p = bmd->sgvecs[0].iov_base;

	__bio_for_each_segment(bvec, bio, i, 0) {
		char *addr = page_address(bvec->bv_page);
		int len = bmd->iovecs[i].bv_len;

		if (read)
			memcpy(p, addr, len);

		__free_page(bvec->bv_page);
		p += len;
	}

	bio_free_map_data(bmd);
	bio_put(bio);
}

/**
 * bio_copy_kern - copy kernel address into bio
 * @q: the struct request_queue for the bio
 * @data: pointer to buffer to copy
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio and page allocation
 * @reading: data direction is READ
 *
 * copy the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
			  gfp_t gfp_mask, int reading)
{
	struct bio *bio;
	struct bio_vec *bvec;
	int i;

	bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
	if (IS_ERR(bio))
		return bio;

	if (!reading) {
		void *p = data;

		bio_for_each_segment(bvec, bio, i) {
			char *addr = page_address(bvec->bv_page);

			memcpy(addr, p, bvec->bv_len);
			p += bvec->bv_len;
		}
	}

	bio->bi_end_io = bio_copy_kern_endio;

	return bio;
}
EXPORT_SYMBOL(bio_copy_kern);

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe. So what we can do is to
 * mark the pages dirty _before_ performing IO. And in interrupt context,
 * check that the pages are still dirty. If so, fine. If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages. The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all. So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages(). This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, pdflush) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page && !PageCompound(page))
			set_page_dirty_lock(page);
	}
}

static void bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page)
			put_page(page);
	}
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine. If, however, some pages are clean then they must
 * have been written out during the direct-IO read. So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on. It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(struct work_struct *work)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);

	while (bio) {
		struct bio *next = bio->bi_private;

		bio_set_pages_dirty(bio);
		bio_release_pages(bio);
		bio_put(bio);
		bio = next;
	}
}

void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int nr_clean_pages = 0;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (PageDirty(page) || PageCompound(page)) {
			page_cache_release(page);
			bvec[i].bv_page = NULL;
		} else {
			nr_clean_pages++;
		}
	}

	if (nr_clean_pages) {
		unsigned long flags;

		spin_lock_irqsave(&bio_dirty_lock, flags);
		bio->bi_private = bio_dirty_list;
		bio_dirty_list = bio;
		spin_unlock_irqrestore(&bio_dirty_lock, flags);
		schedule_work(&bio_dirty_work);
	} else {
		bio_put(bio);
	}
}

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void bio_flush_dcache_pages(struct bio *bi)
{
	int i;
	struct bio_vec *bvec;

	bio_for_each_segment(bvec, bi, i)
		flush_dcache_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif

/**
 * bio_endio - end I/O on a bio
 * @bio: bio
 * @error: error, if any
 *
 * Description:
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the
 *   preferred way to end I/O on a bio, it takes care of clearing
 *   BIO_UPTODATE on error. @error is 0 on success, and and one of the
 *   established -Exxxx (-EIO, for instance) error values in case
 *   something went wrong. No one should call bi_end_io() directly on a
 *   bio unless they own it and thus know that it has an end_io
 *   function.
 **/
void bio_endio(struct bio *bio, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (bio->bi_end_io)
		bio->bi_end_io(bio, error);
}
EXPORT_SYMBOL(bio_endio);

void bio_pair_release(struct bio_pair *bp)
{
	if (atomic_dec_and_test(&bp->cnt)) {
		struct bio *master = bp->bio1.bi_private;

		bio_endio(master, bp->error);
		mempool_free(bp, bp->bio2.bi_private);
	}
}
EXPORT_SYMBOL(bio_pair_release);

static void bio_pair_end_1(struct bio *bi, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

	if (err)
		bp->error = err;

	bio_pair_release(bp);
}

static void bio_pair_end_2(struct bio *bi, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

	if (err)
		bp->error = err;

	bio_pair_release(bp);
}

/*
 * split a bio - only worry about a bio with a single page in its iovec
 */
struct bio_pair *bio_split(struct bio *bi, int first_sectors)
{
	struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);

	if (!bp)
		return bp;

	trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
				bi->bi_sector + first_sectors);

	BUG_ON(bi->bi_vcnt != 1);
	BUG_ON(bi->bi_idx != 0);
	atomic_set(&bp->cnt, 3);
	bp->error = 0;
	bp->bio1 = *bi;
	bp->bio2 = *bi;
	bp->bio2.bi_sector += first_sectors;
	bp->bio2.bi_size -= first_sectors << 9;
	bp->bio1.bi_size = first_sectors << 9;

	bp->bv1 = bi->bi_io_vec[0];
	bp->bv2 = bi->bi_io_vec[0];
	bp->bv2.bv_offset += first_sectors << 9;
	bp->bv2.bv_len -= first_sectors << 9;
	bp->bv1.bv_len = first_sectors << 9;

	bp->bio1.bi_io_vec = &bp->bv1;
	bp->bio2.bi_io_vec = &bp->bv2;

	bp->bio1.bi_max_vecs = 1;
	bp->bio2.bi_max_vecs = 1;

	bp->bio1.bi_end_io = bio_pair_end_1;
	bp->bio2.bi_end_io = bio_pair_end_2;

	bp->bio1.bi_private = bi;
	bp->bio2.bi_private = bio_split_pool;

	if (bio_integrity(bi))
		bio_integrity_split(bi, bp, first_sectors);

	return bp;
}
EXPORT_SYMBOL(bio_split);
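
/*
 * Illustrative sketch (not part of the original bio.c): how a stacking
 * driver such as md/raid0 uses bio_split() when a bio with a single
 * bio_vec crosses a chunk boundary.  Of the three references taken by
 * atomic_set(&bp->cnt, 3) above, one belongs to each half's completion
 * and one to the caller, dropped by the final bio_pair_release().  A real
 * driver remaps ->bi_bdev/->bi_sector of each half before forwarding it.
 * The helper is hypothetical.
 */
#if 0
static void example_split_and_forward(struct bio *bio, int first_sectors)
{
	struct bio_pair *bp = bio_split(bio, first_sectors);

	/* bio1 covers first_sectors, bio2 covers the remainder */
	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);	/* drop the caller's reference */
}
#endif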

/**
 * bio_sector_offset - Find hardware sector offset in bio
 * @bio: bio to inspect
 * @index: bio_vec index
 * @offset: offset in bv_page
 *
 * Return the number of hardware sectors between beginning of bio
 * and an end point indicated by a bio_vec index and an offset
 * within that vector's page.
 */
sector_t bio_sector_offset(struct bio *bio, unsigned short index,
			   unsigned int offset)
{
	unsigned int sector_sz;
	struct bio_vec *bv;
	sector_t sectors;
	int i;

	sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
	sectors = 0;

	if (index >= bio->bi_idx)
		index = bio->bi_vcnt - 1;

	__bio_for_each_segment(bv, bio, i, 0) {
		if (i == index) {
			if (offset > bv->bv_offset)
				sectors += (offset - bv->bv_offset) / sector_sz;
			break;
		}

		sectors += bv->bv_len / sector_sz;
	}

	return sectors;
}
EXPORT_SYMBOL(bio_sector_offset);

/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
static int biovec_create_pools(struct bio_set *bs, int pool_entries)
{
	struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;

	bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
	if (!bs->bvec_pool)
		return -ENOMEM;

	return 0;
}

static void biovec_free_pools(struct bio_set *bs)
{
	mempool_destroy(bs->bvec_pool);
}

void bioset_free(struct bio_set *bs)
{
	if (bs->bio_pool)
		mempool_destroy(bs->bio_pool);

	bioset_integrity_free(bs);
	biovec_free_pools(bs);
	bio_put_slab(bs);

	kfree(bs);
}
EXPORT_SYMBOL(bioset_free);

/**
 * bioset_create - Create a bio_set
 * @pool_size: Number of bio and bio_vecs to cache in the mempool
 * @front_pad: Number of bytes to allocate in front of the returned bio
 *
 * Description:
 *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
 *    to ask for a number of bytes to be allocated in front of the bio.
 *    Front pad allocation is useful for embedding the bio inside
 *    another structure, to avoid allocating extra data to go with the bio.
 *    Note that the bio must be embedded at the END of that structure always,
 *    or things will break badly.
 */
struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
{
	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
	struct bio_set *bs;

	bs = kzalloc(sizeof(*bs), GFP_KERNEL);
	if (!bs)
		return NULL;

	bs->front_pad = front_pad;

	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
	if (!bs->bio_slab) {
		kfree(bs);
		return NULL;
	}

	bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
	if (!bs->bio_pool)
		goto bad;

	if (!biovec_create_pools(bs, pool_size))
		return bs;

bad:
	bioset_free(bs);
	return NULL;
}
EXPORT_SYMBOL(bioset_create);
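
/*
 * Illustrative sketch (not part of the original bio.c): using front_pad
 * to embed a bio at the end of a per-I/O structure, the pattern the
 * bioset_create() comment above recommends.  The structure, bio_set and
 * helper names are hypothetical.
 */
#if 0
struct example_io {
	void *private_data;		/* driver-private bookkeeping */
	struct bio bio;			/* must stay the last member */
};

/* created once with bioset_create(BIO_POOL_SIZE, offsetof(struct example_io, bio)) */
static struct bio_set *example_bio_set;

static struct bio *example_alloc_io(gfp_t gfp_mask, int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, example_bio_set);
	struct example_io *io;

	if (!bio)
		return NULL;

	io = container_of(bio, struct example_io, bio);
	io->private_data = NULL;
	/* per bio_alloc_bioset(), the caller must still set ->bi_destructor */
	return bio;
}
#endif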

static void __init biovec_init_slabs(void)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		int size;
		struct biovec_slab *bvs = bvec_slabs + i;

		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
			bvs->slab = NULL;
			continue;
		}

		size = bvs->nr_vecs * sizeof(struct bio_vec);
		bvs->slab = kmem_cache_create(bvs->name, size, 0,
				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
	}
}

static int __init init_bio(void)
{
	bio_slab_max = 2;
	bio_slab_nr = 0;
	bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
	if (!bio_slabs)
		panic("bio: can't allocate bios\n");

	bio_integrity_init();
	biovec_init_slabs();

	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
	if (!fs_bio_set)
		panic("bio: can't allocate bios\n");

	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
		panic("bio: can't create integrity pool\n");

	bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
						     sizeof(struct bio_pair));
	if (!bio_split_pool)
		panic("bio: can't create split pool\n");

	return 0;
}
subsys_initcall(init_bio);