Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/drivers/md/dm.c
15109 views
1
/*
2
* Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3
* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4
*
5
* This file is released under the GPL.
6
*/
7
8
#include "dm.h"
9
#include "dm-uevent.h"
10
11
#include <linux/init.h>
12
#include <linux/module.h>
13
#include <linux/mutex.h>
14
#include <linux/moduleparam.h>
15
#include <linux/blkpg.h>
16
#include <linux/bio.h>
17
#include <linux/buffer_head.h>
18
#include <linux/mempool.h>
19
#include <linux/slab.h>
20
#include <linux/idr.h>
21
#include <linux/hdreg.h>
22
#include <linux/delay.h>
23
24
#include <trace/events/block.h>
25
26
#define DM_MSG_PREFIX "core"
27
28
/*
29
* Cookies are numeric values sent with CHANGE and REMOVE
30
* uevents while resuming, removing or renaming the device.
31
*/
32
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
33
#define DM_COOKIE_LENGTH 24
34
35
static const char *_name = DM_NAME;
36
37
static unsigned int major = 0;
38
static unsigned int _major = 0;
39
40
static DEFINE_SPINLOCK(_minor_lock);
41
/*
42
* For bio-based dm.
43
* One of these is allocated per bio.
44
*/
45
struct dm_io {
46
struct mapped_device *md;
47
int error;
48
atomic_t io_count;
49
struct bio *bio;
50
unsigned long start_time;
51
spinlock_t endio_lock;
52
};
53
54
/*
55
* For bio-based dm.
56
* One of these is allocated per target within a bio. Hopefully
57
* this will be simplified out one day.
58
*/
59
struct dm_target_io {
60
struct dm_io *io;
61
struct dm_target *ti;
62
union map_info info;
63
};
64
65
/*
66
* For request-based dm.
67
* One of these is allocated per request.
68
*/
69
struct dm_rq_target_io {
70
struct mapped_device *md;
71
struct dm_target *ti;
72
struct request *orig, clone;
73
int error;
74
union map_info info;
75
};
76
77
/*
78
* For request-based dm.
79
* One of these is allocated per bio.
80
*/
81
struct dm_rq_clone_bio_info {
82
struct bio *orig;
83
struct dm_rq_target_io *tio;
84
};
85
86
union map_info *dm_get_mapinfo(struct bio *bio)
87
{
88
if (bio && bio->bi_private)
89
return &((struct dm_target_io *)bio->bi_private)->info;
90
return NULL;
91
}
92
93
union map_info *dm_get_rq_mapinfo(struct request *rq)
94
{
95
if (rq && rq->end_io_data)
96
return &((struct dm_rq_target_io *)rq->end_io_data)->info;
97
return NULL;
98
}
99
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
100
101
#define MINOR_ALLOCED ((void *)-1)
102
103
/*
104
* Bits for the md->flags field.
105
*/
106
#define DMF_BLOCK_IO_FOR_SUSPEND 0
107
#define DMF_SUSPENDED 1
108
#define DMF_FROZEN 2
109
#define DMF_FREEING 3
110
#define DMF_DELETING 4
111
#define DMF_NOFLUSH_SUSPENDING 5
112
113
/*
114
* Work processed by per-device workqueue.
115
*/
116
struct mapped_device {
117
struct rw_semaphore io_lock;
118
struct mutex suspend_lock;
119
rwlock_t map_lock;
120
atomic_t holders;
121
atomic_t open_count;
122
123
unsigned long flags;
124
125
struct request_queue *queue;
126
unsigned type;
127
/* Protect queue and type against concurrent access. */
128
struct mutex type_lock;
129
130
struct gendisk *disk;
131
char name[16];
132
133
void *interface_ptr;
134
135
/*
136
* A list of ios that arrived while we were suspended.
137
*/
138
atomic_t pending[2];
139
wait_queue_head_t wait;
140
struct work_struct work;
141
struct bio_list deferred;
142
spinlock_t deferred_lock;
143
144
/*
145
* Processing queue (flush)
146
*/
147
struct workqueue_struct *wq;
148
149
/*
150
* The current mapping.
151
*/
152
struct dm_table *map;
153
154
/*
155
* io objects are allocated from here.
156
*/
157
mempool_t *io_pool;
158
mempool_t *tio_pool;
159
160
struct bio_set *bs;
161
162
/*
163
* Event handling.
164
*/
165
atomic_t event_nr;
166
wait_queue_head_t eventq;
167
atomic_t uevent_seq;
168
struct list_head uevent_list;
169
spinlock_t uevent_lock; /* Protect access to uevent_list */
170
171
/*
172
* freeze/thaw support require holding onto a super block
173
*/
174
struct super_block *frozen_sb;
175
struct block_device *bdev;
176
177
/* forced geometry settings */
178
struct hd_geometry geometry;
179
180
/* For saving the address of __make_request for request based dm */
181
make_request_fn *saved_make_request_fn;
182
183
/* sysfs handle */
184
struct kobject kobj;
185
186
/* zero-length flush that will be cloned and submitted to targets */
187
struct bio flush_bio;
188
};
189
190
/*
191
* For mempools pre-allocation at the table loading time.
192
*/
193
struct dm_md_mempools {
194
mempool_t *io_pool;
195
mempool_t *tio_pool;
196
struct bio_set *bs;
197
};
198
199
#define MIN_IOS 256
200
static struct kmem_cache *_io_cache;
201
static struct kmem_cache *_tio_cache;
202
static struct kmem_cache *_rq_tio_cache;
203
static struct kmem_cache *_rq_bio_info_cache;
204
205
static int __init local_init(void)
206
{
207
int r = -ENOMEM;
208
209
/* allocate a slab for the dm_ios */
210
_io_cache = KMEM_CACHE(dm_io, 0);
211
if (!_io_cache)
212
return r;
213
214
/* allocate a slab for the target ios */
215
_tio_cache = KMEM_CACHE(dm_target_io, 0);
216
if (!_tio_cache)
217
goto out_free_io_cache;
218
219
_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
220
if (!_rq_tio_cache)
221
goto out_free_tio_cache;
222
223
_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
224
if (!_rq_bio_info_cache)
225
goto out_free_rq_tio_cache;
226
227
r = dm_uevent_init();
228
if (r)
229
goto out_free_rq_bio_info_cache;
230
231
_major = major;
232
r = register_blkdev(_major, _name);
233
if (r < 0)
234
goto out_uevent_exit;
235
236
if (!_major)
237
_major = r;
238
239
return 0;
240
241
out_uevent_exit:
242
dm_uevent_exit();
243
out_free_rq_bio_info_cache:
244
kmem_cache_destroy(_rq_bio_info_cache);
245
out_free_rq_tio_cache:
246
kmem_cache_destroy(_rq_tio_cache);
247
out_free_tio_cache:
248
kmem_cache_destroy(_tio_cache);
249
out_free_io_cache:
250
kmem_cache_destroy(_io_cache);
251
252
return r;
253
}
254
255
static void local_exit(void)
256
{
257
kmem_cache_destroy(_rq_bio_info_cache);
258
kmem_cache_destroy(_rq_tio_cache);
259
kmem_cache_destroy(_tio_cache);
260
kmem_cache_destroy(_io_cache);
261
unregister_blkdev(_major, _name);
262
dm_uevent_exit();
263
264
_major = 0;
265
266
DMINFO("cleaned up");
267
}
268
269
static int (*_inits[])(void) __initdata = {
270
local_init,
271
dm_target_init,
272
dm_linear_init,
273
dm_stripe_init,
274
dm_io_init,
275
dm_kcopyd_init,
276
dm_interface_init,
277
};
278
279
static void (*_exits[])(void) = {
280
local_exit,
281
dm_target_exit,
282
dm_linear_exit,
283
dm_stripe_exit,
284
dm_io_exit,
285
dm_kcopyd_exit,
286
dm_interface_exit,
287
};
288
289
static int __init dm_init(void)
290
{
291
const int count = ARRAY_SIZE(_inits);
292
293
int r, i;
294
295
for (i = 0; i < count; i++) {
296
r = _inits[i]();
297
if (r)
298
goto bad;
299
}
300
301
return 0;
302
303
bad:
304
while (i--)
305
_exits[i]();
306
307
return r;
308
}
309
310
static void __exit dm_exit(void)
311
{
312
int i = ARRAY_SIZE(_exits);
313
314
while (i--)
315
_exits[i]();
316
}
317
318
/*
319
* Block device functions
320
*/
321
int dm_deleting_md(struct mapped_device *md)
322
{
323
return test_bit(DMF_DELETING, &md->flags);
324
}
325
326
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
327
{
328
struct mapped_device *md;
329
330
spin_lock(&_minor_lock);
331
332
md = bdev->bd_disk->private_data;
333
if (!md)
334
goto out;
335
336
if (test_bit(DMF_FREEING, &md->flags) ||
337
dm_deleting_md(md)) {
338
md = NULL;
339
goto out;
340
}
341
342
dm_get(md);
343
atomic_inc(&md->open_count);
344
345
out:
346
spin_unlock(&_minor_lock);
347
348
return md ? 0 : -ENXIO;
349
}
350
351
static int dm_blk_close(struct gendisk *disk, fmode_t mode)
352
{
353
struct mapped_device *md = disk->private_data;
354
355
spin_lock(&_minor_lock);
356
357
atomic_dec(&md->open_count);
358
dm_put(md);
359
360
spin_unlock(&_minor_lock);
361
362
return 0;
363
}
364
365
int dm_open_count(struct mapped_device *md)
366
{
367
return atomic_read(&md->open_count);
368
}
369
370
/*
371
* Guarantees nothing is using the device before it's deleted.
372
*/
373
int dm_lock_for_deletion(struct mapped_device *md)
374
{
375
int r = 0;
376
377
spin_lock(&_minor_lock);
378
379
if (dm_open_count(md))
380
r = -EBUSY;
381
else
382
set_bit(DMF_DELETING, &md->flags);
383
384
spin_unlock(&_minor_lock);
385
386
return r;
387
}
388
389
static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
390
{
391
struct mapped_device *md = bdev->bd_disk->private_data;
392
393
return dm_get_geometry(md, geo);
394
}
395
396
static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
397
unsigned int cmd, unsigned long arg)
398
{
399
struct mapped_device *md = bdev->bd_disk->private_data;
400
struct dm_table *map = dm_get_live_table(md);
401
struct dm_target *tgt;
402
int r = -ENOTTY;
403
404
if (!map || !dm_table_get_size(map))
405
goto out;
406
407
/* We only support devices that have a single target */
408
if (dm_table_get_num_targets(map) != 1)
409
goto out;
410
411
tgt = dm_table_get_target(map, 0);
412
413
if (dm_suspended_md(md)) {
414
r = -EAGAIN;
415
goto out;
416
}
417
418
if (tgt->type->ioctl)
419
r = tgt->type->ioctl(tgt, cmd, arg);
420
421
out:
422
dm_table_put(map);
423
424
return r;
425
}
426
427
static struct dm_io *alloc_io(struct mapped_device *md)
428
{
429
return mempool_alloc(md->io_pool, GFP_NOIO);
430
}
431
432
static void free_io(struct mapped_device *md, struct dm_io *io)
433
{
434
mempool_free(io, md->io_pool);
435
}
436
437
static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
438
{
439
mempool_free(tio, md->tio_pool);
440
}
441
442
static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
443
gfp_t gfp_mask)
444
{
445
return mempool_alloc(md->tio_pool, gfp_mask);
446
}
447
448
static void free_rq_tio(struct dm_rq_target_io *tio)
449
{
450
mempool_free(tio, tio->md->tio_pool);
451
}
452
453
static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
454
{
455
return mempool_alloc(md->io_pool, GFP_ATOMIC);
456
}
457
458
static void free_bio_info(struct dm_rq_clone_bio_info *info)
459
{
460
mempool_free(info, info->tio->md->io_pool);
461
}
462
463
static int md_in_flight(struct mapped_device *md)
464
{
465
return atomic_read(&md->pending[READ]) +
466
atomic_read(&md->pending[WRITE]);
467
}
468
469
static void start_io_acct(struct dm_io *io)
470
{
471
struct mapped_device *md = io->md;
472
int cpu;
473
int rw = bio_data_dir(io->bio);
474
475
io->start_time = jiffies;
476
477
cpu = part_stat_lock();
478
part_round_stats(cpu, &dm_disk(md)->part0);
479
part_stat_unlock();
480
atomic_set(&dm_disk(md)->part0.in_flight[rw],
481
atomic_inc_return(&md->pending[rw]));
482
}
483
484
static void end_io_acct(struct dm_io *io)
485
{
486
struct mapped_device *md = io->md;
487
struct bio *bio = io->bio;
488
unsigned long duration = jiffies - io->start_time;
489
int pending, cpu;
490
int rw = bio_data_dir(bio);
491
492
cpu = part_stat_lock();
493
part_round_stats(cpu, &dm_disk(md)->part0);
494
part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
495
part_stat_unlock();
496
497
/*
498
* After this is decremented the bio must not be touched if it is
499
* a flush.
500
*/
501
pending = atomic_dec_return(&md->pending[rw]);
502
atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
503
pending += atomic_read(&md->pending[rw^0x1]);
504
505
/* nudge anyone waiting on suspend queue */
506
if (!pending)
507
wake_up(&md->wait);
508
}
509
510
/*
511
* Add the bio to the list of deferred io.
512
*/
513
static void queue_io(struct mapped_device *md, struct bio *bio)
514
{
515
unsigned long flags;
516
517
spin_lock_irqsave(&md->deferred_lock, flags);
518
bio_list_add(&md->deferred, bio);
519
spin_unlock_irqrestore(&md->deferred_lock, flags);
520
queue_work(md->wq, &md->work);
521
}
522
523
/*
524
* Everyone (including functions in this file), should use this
525
* function to access the md->map field, and make sure they call
526
* dm_table_put() when finished.
527
*/
528
struct dm_table *dm_get_live_table(struct mapped_device *md)
529
{
530
struct dm_table *t;
531
unsigned long flags;
532
533
read_lock_irqsave(&md->map_lock, flags);
534
t = md->map;
535
if (t)
536
dm_table_get(t);
537
read_unlock_irqrestore(&md->map_lock, flags);
538
539
return t;
540
}
541
542
/*
543
* Get the geometry associated with a dm device
544
*/
545
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
546
{
547
*geo = md->geometry;
548
549
return 0;
550
}
551
552
/*
553
* Set the geometry of a device.
554
*/
555
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
556
{
557
sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
558
559
if (geo->start > sz) {
560
DMWARN("Start sector is beyond the geometry limits.");
561
return -EINVAL;
562
}
563
564
md->geometry = *geo;
565
566
return 0;
567
}
568
569
/*-----------------------------------------------------------------
570
* CRUD START:
571
* A more elegant soln is in the works that uses the queue
572
* merge fn, unfortunately there are a couple of changes to
573
* the block layer that I want to make for this. So in the
574
* interests of getting something for people to use I give
575
* you this clearly demarcated crap.
576
*---------------------------------------------------------------*/
577
578
static int __noflush_suspending(struct mapped_device *md)
579
{
580
return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
581
}
582
583
/*
584
* Decrements the number of outstanding ios that a bio has been
585
* cloned into, completing the original io if necc.
586
*/
587
static void dec_pending(struct dm_io *io, int error)
588
{
589
unsigned long flags;
590
int io_error;
591
struct bio *bio;
592
struct mapped_device *md = io->md;
593
594
/* Push-back supersedes any I/O errors */
595
if (unlikely(error)) {
596
spin_lock_irqsave(&io->endio_lock, flags);
597
if (!(io->error > 0 && __noflush_suspending(md)))
598
io->error = error;
599
spin_unlock_irqrestore(&io->endio_lock, flags);
600
}
601
602
if (atomic_dec_and_test(&io->io_count)) {
603
if (io->error == DM_ENDIO_REQUEUE) {
604
/*
605
* Target requested pushing back the I/O.
606
*/
607
spin_lock_irqsave(&md->deferred_lock, flags);
608
if (__noflush_suspending(md))
609
bio_list_add_head(&md->deferred, io->bio);
610
else
611
/* noflush suspend was interrupted. */
612
io->error = -EIO;
613
spin_unlock_irqrestore(&md->deferred_lock, flags);
614
}
615
616
io_error = io->error;
617
bio = io->bio;
618
end_io_acct(io);
619
free_io(md, io);
620
621
if (io_error == DM_ENDIO_REQUEUE)
622
return;
623
624
if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
625
/*
626
* Preflush done for flush with data, reissue
627
* without REQ_FLUSH.
628
*/
629
bio->bi_rw &= ~REQ_FLUSH;
630
queue_io(md, bio);
631
} else {
632
/* done with normal IO or empty flush */
633
trace_block_bio_complete(md->queue, bio, io_error);
634
bio_endio(bio, io_error);
635
}
636
}
637
}
638
639
static void clone_endio(struct bio *bio, int error)
640
{
641
int r = 0;
642
struct dm_target_io *tio = bio->bi_private;
643
struct dm_io *io = tio->io;
644
struct mapped_device *md = tio->io->md;
645
dm_endio_fn endio = tio->ti->type->end_io;
646
647
if (!bio_flagged(bio, BIO_UPTODATE) && !error)
648
error = -EIO;
649
650
if (endio) {
651
r = endio(tio->ti, bio, error, &tio->info);
652
if (r < 0 || r == DM_ENDIO_REQUEUE)
653
/*
654
* error and requeue request are handled
655
* in dec_pending().
656
*/
657
error = r;
658
else if (r == DM_ENDIO_INCOMPLETE)
659
/* The target will handle the io */
660
return;
661
else if (r) {
662
DMWARN("unimplemented target endio return value: %d", r);
663
BUG();
664
}
665
}
666
667
/*
668
* Store md for cleanup instead of tio which is about to get freed.
669
*/
670
bio->bi_private = md->bs;
671
672
free_tio(md, tio);
673
bio_put(bio);
674
dec_pending(io, error);
675
}
676
677
/*
678
* Partial completion handling for request-based dm
679
*/
680
static void end_clone_bio(struct bio *clone, int error)
681
{
682
struct dm_rq_clone_bio_info *info = clone->bi_private;
683
struct dm_rq_target_io *tio = info->tio;
684
struct bio *bio = info->orig;
685
unsigned int nr_bytes = info->orig->bi_size;
686
687
bio_put(clone);
688
689
if (tio->error)
690
/*
691
* An error has already been detected on the request.
692
* Once error occurred, just let clone->end_io() handle
693
* the remainder.
694
*/
695
return;
696
else if (error) {
697
/*
698
* Don't notice the error to the upper layer yet.
699
* The error handling decision is made by the target driver,
700
* when the request is completed.
701
*/
702
tio->error = error;
703
return;
704
}
705
706
/*
707
* I/O for the bio successfully completed.
708
* Notice the data completion to the upper layer.
709
*/
710
711
/*
712
* bios are processed from the head of the list.
713
* So the completing bio should always be rq->bio.
714
* If it's not, something wrong is happening.
715
*/
716
if (tio->orig->bio != bio)
717
DMERR("bio completion is going in the middle of the request");
718
719
/*
720
* Update the original request.
721
* Do not use blk_end_request() here, because it may complete
722
* the original request before the clone, and break the ordering.
723
*/
724
blk_update_request(tio->orig, 0, nr_bytes);
725
}
726
727
/*
728
* Don't touch any member of the md after calling this function because
729
* the md may be freed in dm_put() at the end of this function.
730
* Or do dm_get() before calling this function and dm_put() later.
731
*/
732
static void rq_completed(struct mapped_device *md, int rw, int run_queue)
733
{
734
atomic_dec(&md->pending[rw]);
735
736
/* nudge anyone waiting on suspend queue */
737
if (!md_in_flight(md))
738
wake_up(&md->wait);
739
740
if (run_queue)
741
blk_run_queue(md->queue);
742
743
/*
744
* dm_put() must be at the end of this function. See the comment above
745
*/
746
dm_put(md);
747
}
748
749
static void free_rq_clone(struct request *clone)
750
{
751
struct dm_rq_target_io *tio = clone->end_io_data;
752
753
blk_rq_unprep_clone(clone);
754
free_rq_tio(tio);
755
}
756
757
/*
758
* Complete the clone and the original request.
759
* Must be called without queue lock.
760
*/
761
static void dm_end_request(struct request *clone, int error)
762
{
763
int rw = rq_data_dir(clone);
764
struct dm_rq_target_io *tio = clone->end_io_data;
765
struct mapped_device *md = tio->md;
766
struct request *rq = tio->orig;
767
768
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
769
rq->errors = clone->errors;
770
rq->resid_len = clone->resid_len;
771
772
if (rq->sense)
773
/*
774
* We are using the sense buffer of the original
775
* request.
776
* So setting the length of the sense data is enough.
777
*/
778
rq->sense_len = clone->sense_len;
779
}
780
781
free_rq_clone(clone);
782
blk_end_request_all(rq, error);
783
rq_completed(md, rw, true);
784
}
785
786
static void dm_unprep_request(struct request *rq)
787
{
788
struct request *clone = rq->special;
789
790
rq->special = NULL;
791
rq->cmd_flags &= ~REQ_DONTPREP;
792
793
free_rq_clone(clone);
794
}
795
796
/*
797
* Requeue the original request of a clone.
798
*/
799
void dm_requeue_unmapped_request(struct request *clone)
800
{
801
int rw = rq_data_dir(clone);
802
struct dm_rq_target_io *tio = clone->end_io_data;
803
struct mapped_device *md = tio->md;
804
struct request *rq = tio->orig;
805
struct request_queue *q = rq->q;
806
unsigned long flags;
807
808
dm_unprep_request(rq);
809
810
spin_lock_irqsave(q->queue_lock, flags);
811
blk_requeue_request(q, rq);
812
spin_unlock_irqrestore(q->queue_lock, flags);
813
814
rq_completed(md, rw, 0);
815
}
816
EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
817
818
static void __stop_queue(struct request_queue *q)
819
{
820
blk_stop_queue(q);
821
}
822
823
static void stop_queue(struct request_queue *q)
824
{
825
unsigned long flags;
826
827
spin_lock_irqsave(q->queue_lock, flags);
828
__stop_queue(q);
829
spin_unlock_irqrestore(q->queue_lock, flags);
830
}
831
832
static void __start_queue(struct request_queue *q)
833
{
834
if (blk_queue_stopped(q))
835
blk_start_queue(q);
836
}
837
838
static void start_queue(struct request_queue *q)
839
{
840
unsigned long flags;
841
842
spin_lock_irqsave(q->queue_lock, flags);
843
__start_queue(q);
844
spin_unlock_irqrestore(q->queue_lock, flags);
845
}
846
847
static void dm_done(struct request *clone, int error, bool mapped)
848
{
849
int r = error;
850
struct dm_rq_target_io *tio = clone->end_io_data;
851
dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
852
853
if (mapped && rq_end_io)
854
r = rq_end_io(tio->ti, clone, error, &tio->info);
855
856
if (r <= 0)
857
/* The target wants to complete the I/O */
858
dm_end_request(clone, r);
859
else if (r == DM_ENDIO_INCOMPLETE)
860
/* The target will handle the I/O */
861
return;
862
else if (r == DM_ENDIO_REQUEUE)
863
/* The target wants to requeue the I/O */
864
dm_requeue_unmapped_request(clone);
865
else {
866
DMWARN("unimplemented target endio return value: %d", r);
867
BUG();
868
}
869
}
870
871
/*
872
* Request completion handler for request-based dm
873
*/
874
static void dm_softirq_done(struct request *rq)
875
{
876
bool mapped = true;
877
struct request *clone = rq->completion_data;
878
struct dm_rq_target_io *tio = clone->end_io_data;
879
880
if (rq->cmd_flags & REQ_FAILED)
881
mapped = false;
882
883
dm_done(clone, tio->error, mapped);
884
}
885
886
/*
887
* Complete the clone and the original request with the error status
888
* through softirq context.
889
*/
890
static void dm_complete_request(struct request *clone, int error)
891
{
892
struct dm_rq_target_io *tio = clone->end_io_data;
893
struct request *rq = tio->orig;
894
895
tio->error = error;
896
rq->completion_data = clone;
897
blk_complete_request(rq);
898
}
899
900
/*
901
* Complete the not-mapped clone and the original request with the error status
902
* through softirq context.
903
* Target's rq_end_io() function isn't called.
904
* This may be used when the target's map_rq() function fails.
905
*/
906
void dm_kill_unmapped_request(struct request *clone, int error)
907
{
908
struct dm_rq_target_io *tio = clone->end_io_data;
909
struct request *rq = tio->orig;
910
911
rq->cmd_flags |= REQ_FAILED;
912
dm_complete_request(clone, error);
913
}
914
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
915
916
/*
917
* Called with the queue lock held
918
*/
919
static void end_clone_request(struct request *clone, int error)
920
{
921
/*
922
* For just cleaning up the information of the queue in which
923
* the clone was dispatched.
924
* The clone is *NOT* freed actually here because it is alloced from
925
* dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
926
*/
927
__blk_put_request(clone->q, clone);
928
929
/*
930
* Actual request completion is done in a softirq context which doesn't
931
* hold the queue lock. Otherwise, deadlock could occur because:
932
* - another request may be submitted by the upper level driver
933
* of the stacking during the completion
934
* - the submission which requires queue lock may be done
935
* against this queue
936
*/
937
dm_complete_request(clone, error);
938
}
939
940
/*
941
* Return maximum size of I/O possible at the supplied sector up to the current
942
* target boundary.
943
*/
944
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
945
{
946
sector_t target_offset = dm_target_offset(ti, sector);
947
948
return ti->len - target_offset;
949
}
950
951
static sector_t max_io_len(sector_t sector, struct dm_target *ti)
952
{
953
sector_t len = max_io_len_target_boundary(sector, ti);
954
955
/*
956
* Does the target need to split even further ?
957
*/
958
if (ti->split_io) {
959
sector_t boundary;
960
sector_t offset = dm_target_offset(ti, sector);
961
boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
962
- offset;
963
if (len > boundary)
964
len = boundary;
965
}
966
967
return len;
968
}
969
970
static void __map_bio(struct dm_target *ti, struct bio *clone,
971
struct dm_target_io *tio)
972
{
973
int r;
974
sector_t sector;
975
struct mapped_device *md;
976
977
clone->bi_end_io = clone_endio;
978
clone->bi_private = tio;
979
980
/*
981
* Map the clone. If r == 0 we don't need to do
982
* anything, the target has assumed ownership of
983
* this io.
984
*/
985
atomic_inc(&tio->io->io_count);
986
sector = clone->bi_sector;
987
r = ti->type->map(ti, clone, &tio->info);
988
if (r == DM_MAPIO_REMAPPED) {
989
/* the bio has been remapped so dispatch it */
990
991
trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
992
tio->io->bio->bi_bdev->bd_dev, sector);
993
994
generic_make_request(clone);
995
} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
996
/* error the io and bail out, or requeue it if needed */
997
md = tio->io->md;
998
dec_pending(tio->io, r);
999
/*
1000
* Store bio_set for cleanup.
1001
*/
1002
clone->bi_private = md->bs;
1003
bio_put(clone);
1004
free_tio(md, tio);
1005
} else if (r) {
1006
DMWARN("unimplemented target map return value: %d", r);
1007
BUG();
1008
}
1009
}
1010
1011
struct clone_info {
1012
struct mapped_device *md;
1013
struct dm_table *map;
1014
struct bio *bio;
1015
struct dm_io *io;
1016
sector_t sector;
1017
sector_t sector_count;
1018
unsigned short idx;
1019
};
1020
1021
static void dm_bio_destructor(struct bio *bio)
1022
{
1023
struct bio_set *bs = bio->bi_private;
1024
1025
bio_free(bio, bs);
1026
}
1027
1028
/*
1029
* Creates a little bio that just does part of a bvec.
1030
*/
1031
static struct bio *split_bvec(struct bio *bio, sector_t sector,
1032
unsigned short idx, unsigned int offset,
1033
unsigned int len, struct bio_set *bs)
1034
{
1035
struct bio *clone;
1036
struct bio_vec *bv = bio->bi_io_vec + idx;
1037
1038
clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1039
clone->bi_destructor = dm_bio_destructor;
1040
*clone->bi_io_vec = *bv;
1041
1042
clone->bi_sector = sector;
1043
clone->bi_bdev = bio->bi_bdev;
1044
clone->bi_rw = bio->bi_rw;
1045
clone->bi_vcnt = 1;
1046
clone->bi_size = to_bytes(len);
1047
clone->bi_io_vec->bv_offset = offset;
1048
clone->bi_io_vec->bv_len = clone->bi_size;
1049
clone->bi_flags |= 1 << BIO_CLONED;
1050
1051
if (bio_integrity(bio)) {
1052
bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1053
bio_integrity_trim(clone,
1054
bio_sector_offset(bio, idx, offset), len);
1055
}
1056
1057
return clone;
1058
}
1059
1060
/*
1061
* Creates a bio that consists of range of complete bvecs.
1062
*/
1063
static struct bio *clone_bio(struct bio *bio, sector_t sector,
1064
unsigned short idx, unsigned short bv_count,
1065
unsigned int len, struct bio_set *bs)
1066
{
1067
struct bio *clone;
1068
1069
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1070
__bio_clone(clone, bio);
1071
clone->bi_destructor = dm_bio_destructor;
1072
clone->bi_sector = sector;
1073
clone->bi_idx = idx;
1074
clone->bi_vcnt = idx + bv_count;
1075
clone->bi_size = to_bytes(len);
1076
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1077
1078
if (bio_integrity(bio)) {
1079
bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1080
1081
if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1082
bio_integrity_trim(clone,
1083
bio_sector_offset(bio, idx, 0), len);
1084
}
1085
1086
return clone;
1087
}
1088
1089
static struct dm_target_io *alloc_tio(struct clone_info *ci,
1090
struct dm_target *ti)
1091
{
1092
struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1093
1094
tio->io = ci->io;
1095
tio->ti = ti;
1096
memset(&tio->info, 0, sizeof(tio->info));
1097
1098
return tio;
1099
}
1100
1101
static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
1102
unsigned request_nr, sector_t len)
1103
{
1104
struct dm_target_io *tio = alloc_tio(ci, ti);
1105
struct bio *clone;
1106
1107
tio->info.target_request_nr = request_nr;
1108
1109
/*
1110
* Discard requests require the bio's inline iovecs be initialized.
1111
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1112
* and discard, so no need for concern about wasted bvec allocations.
1113
*/
1114
clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
1115
__bio_clone(clone, ci->bio);
1116
clone->bi_destructor = dm_bio_destructor;
1117
if (len) {
1118
clone->bi_sector = ci->sector;
1119
clone->bi_size = to_bytes(len);
1120
}
1121
1122
__map_bio(ti, clone, tio);
1123
}
1124
1125
static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1126
unsigned num_requests, sector_t len)
1127
{
1128
unsigned request_nr;
1129
1130
for (request_nr = 0; request_nr < num_requests; request_nr++)
1131
__issue_target_request(ci, ti, request_nr, len);
1132
}
1133
1134
static int __clone_and_map_empty_flush(struct clone_info *ci)
1135
{
1136
unsigned target_nr = 0;
1137
struct dm_target *ti;
1138
1139
BUG_ON(bio_has_data(ci->bio));
1140
while ((ti = dm_table_get_target(ci->map, target_nr++)))
1141
__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1142
1143
return 0;
1144
}
1145
1146
/*
1147
* Perform all io with a single clone.
1148
*/
1149
static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1150
{
1151
struct bio *clone, *bio = ci->bio;
1152
struct dm_target_io *tio;
1153
1154
tio = alloc_tio(ci, ti);
1155
clone = clone_bio(bio, ci->sector, ci->idx,
1156
bio->bi_vcnt - ci->idx, ci->sector_count,
1157
ci->md->bs);
1158
__map_bio(ti, clone, tio);
1159
ci->sector_count = 0;
1160
}
1161
1162
static int __clone_and_map_discard(struct clone_info *ci)
1163
{
1164
struct dm_target *ti;
1165
sector_t len;
1166
1167
do {
1168
ti = dm_table_find_target(ci->map, ci->sector);
1169
if (!dm_target_is_valid(ti))
1170
return -EIO;
1171
1172
/*
1173
* Even though the device advertised discard support,
1174
* reconfiguration might have changed that since the
1175
* check was performed.
1176
*/
1177
if (!ti->num_discard_requests)
1178
return -EOPNOTSUPP;
1179
1180
len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1181
1182
__issue_target_requests(ci, ti, ti->num_discard_requests, len);
1183
1184
ci->sector += len;
1185
} while (ci->sector_count -= len);
1186
1187
return 0;
1188
}
1189
1190
static int __clone_and_map(struct clone_info *ci)
1191
{
1192
struct bio *clone, *bio = ci->bio;
1193
struct dm_target *ti;
1194
sector_t len = 0, max;
1195
struct dm_target_io *tio;
1196
1197
if (unlikely(bio->bi_rw & REQ_DISCARD))
1198
return __clone_and_map_discard(ci);
1199
1200
ti = dm_table_find_target(ci->map, ci->sector);
1201
if (!dm_target_is_valid(ti))
1202
return -EIO;
1203
1204
max = max_io_len(ci->sector, ti);
1205
1206
if (ci->sector_count <= max) {
1207
/*
1208
* Optimise for the simple case where we can do all of
1209
* the remaining io with a single clone.
1210
*/
1211
__clone_and_map_simple(ci, ti);
1212
1213
} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1214
/*
1215
* There are some bvecs that don't span targets.
1216
* Do as many of these as possible.
1217
*/
1218
int i;
1219
sector_t remaining = max;
1220
sector_t bv_len;
1221
1222
for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
1223
bv_len = to_sector(bio->bi_io_vec[i].bv_len);
1224
1225
if (bv_len > remaining)
1226
break;
1227
1228
remaining -= bv_len;
1229
len += bv_len;
1230
}
1231
1232
tio = alloc_tio(ci, ti);
1233
clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1234
ci->md->bs);
1235
__map_bio(ti, clone, tio);
1236
1237
ci->sector += len;
1238
ci->sector_count -= len;
1239
ci->idx = i;
1240
1241
} else {
1242
/*
1243
* Handle a bvec that must be split between two or more targets.
1244
*/
1245
struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1246
sector_t remaining = to_sector(bv->bv_len);
1247
unsigned int offset = 0;
1248
1249
do {
1250
if (offset) {
1251
ti = dm_table_find_target(ci->map, ci->sector);
1252
if (!dm_target_is_valid(ti))
1253
return -EIO;
1254
1255
max = max_io_len(ci->sector, ti);
1256
}
1257
1258
len = min(remaining, max);
1259
1260
tio = alloc_tio(ci, ti);
1261
clone = split_bvec(bio, ci->sector, ci->idx,
1262
bv->bv_offset + offset, len,
1263
ci->md->bs);
1264
1265
__map_bio(ti, clone, tio);
1266
1267
ci->sector += len;
1268
ci->sector_count -= len;
1269
offset += to_bytes(len);
1270
} while (remaining -= len);
1271
1272
ci->idx++;
1273
}
1274
1275
return 0;
1276
}
1277
1278
/*
1279
* Split the bio into several clones and submit it to targets.
1280
*/
1281
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1282
{
1283
struct clone_info ci;
1284
int error = 0;
1285
1286
ci.map = dm_get_live_table(md);
1287
if (unlikely(!ci.map)) {
1288
bio_io_error(bio);
1289
return;
1290
}
1291
1292
ci.md = md;
1293
ci.io = alloc_io(md);
1294
ci.io->error = 0;
1295
atomic_set(&ci.io->io_count, 1);
1296
ci.io->bio = bio;
1297
ci.io->md = md;
1298
spin_lock_init(&ci.io->endio_lock);
1299
ci.sector = bio->bi_sector;
1300
ci.idx = bio->bi_idx;
1301
1302
start_io_acct(ci.io);
1303
if (bio->bi_rw & REQ_FLUSH) {
1304
ci.bio = &ci.md->flush_bio;
1305
ci.sector_count = 0;
1306
error = __clone_and_map_empty_flush(&ci);
1307
/* dec_pending submits any data associated with flush */
1308
} else {
1309
ci.bio = bio;
1310
ci.sector_count = bio_sectors(bio);
1311
while (ci.sector_count && !error)
1312
error = __clone_and_map(&ci);
1313
}
1314
1315
/* drop the extra reference count */
1316
dec_pending(ci.io, error);
1317
dm_table_put(ci.map);
1318
}
1319
/*-----------------------------------------------------------------
1320
* CRUD END
1321
*---------------------------------------------------------------*/
1322
1323
static int dm_merge_bvec(struct request_queue *q,
1324
struct bvec_merge_data *bvm,
1325
struct bio_vec *biovec)
1326
{
1327
struct mapped_device *md = q->queuedata;
1328
struct dm_table *map = dm_get_live_table(md);
1329
struct dm_target *ti;
1330
sector_t max_sectors;
1331
int max_size = 0;
1332
1333
if (unlikely(!map))
1334
goto out;
1335
1336
ti = dm_table_find_target(map, bvm->bi_sector);
1337
if (!dm_target_is_valid(ti))
1338
goto out_table;
1339
1340
/*
1341
* Find maximum amount of I/O that won't need splitting
1342
*/
1343
max_sectors = min(max_io_len(bvm->bi_sector, ti),
1344
(sector_t) BIO_MAX_SECTORS);
1345
max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1346
if (max_size < 0)
1347
max_size = 0;
1348
1349
/*
1350
* merge_bvec_fn() returns number of bytes
1351
* it can accept at this offset
1352
* max is precomputed maximal io size
1353
*/
1354
if (max_size && ti->type->merge)
1355
max_size = ti->type->merge(ti, bvm, biovec, max_size);
1356
/*
1357
* If the target doesn't support merge method and some of the devices
1358
* provided their merge_bvec method (we know this by looking at
1359
* queue_max_hw_sectors), then we can't allow bios with multiple vector
1360
* entries. So always set max_size to 0, and the code below allows
1361
* just one page.
1362
*/
1363
else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1364
1365
max_size = 0;
1366
1367
out_table:
1368
dm_table_put(map);
1369
1370
out:
1371
/*
1372
* Always allow an entire first page
1373
*/
1374
if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1375
max_size = biovec->bv_len;
1376
1377
return max_size;
1378
}
1379
1380
/*
1381
* The request function that just remaps the bio built up by
1382
* dm_merge_bvec.
1383
*/
1384
static int _dm_request(struct request_queue *q, struct bio *bio)
1385
{
1386
int rw = bio_data_dir(bio);
1387
struct mapped_device *md = q->queuedata;
1388
int cpu;
1389
1390
down_read(&md->io_lock);
1391
1392
cpu = part_stat_lock();
1393
part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1394
part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1395
part_stat_unlock();
1396
1397
/* if we're suspended, we have to queue this io for later */
1398
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1399
up_read(&md->io_lock);
1400
1401
if (bio_rw(bio) != READA)
1402
queue_io(md, bio);
1403
else
1404
bio_io_error(bio);
1405
return 0;
1406
}
1407
1408
__split_and_process_bio(md, bio);
1409
up_read(&md->io_lock);
1410
return 0;
1411
}
1412
1413
static int dm_make_request(struct request_queue *q, struct bio *bio)
1414
{
1415
struct mapped_device *md = q->queuedata;
1416
1417
return md->saved_make_request_fn(q, bio); /* call __make_request() */
1418
}
1419
1420
static int dm_request_based(struct mapped_device *md)
1421
{
1422
return blk_queue_stackable(md->queue);
1423
}
1424
1425
static int dm_request(struct request_queue *q, struct bio *bio)
1426
{
1427
struct mapped_device *md = q->queuedata;
1428
1429
if (dm_request_based(md))
1430
return dm_make_request(q, bio);
1431
1432
return _dm_request(q, bio);
1433
}
1434
1435
void dm_dispatch_request(struct request *rq)
1436
{
1437
int r;
1438
1439
if (blk_queue_io_stat(rq->q))
1440
rq->cmd_flags |= REQ_IO_STAT;
1441
1442
rq->start_time = jiffies;
1443
r = blk_insert_cloned_request(rq->q, rq);
1444
if (r)
1445
dm_complete_request(rq, r);
1446
}
1447
EXPORT_SYMBOL_GPL(dm_dispatch_request);
1448
1449
static void dm_rq_bio_destructor(struct bio *bio)
1450
{
1451
struct dm_rq_clone_bio_info *info = bio->bi_private;
1452
struct mapped_device *md = info->tio->md;
1453
1454
free_bio_info(info);
1455
bio_free(bio, md->bs);
1456
}
1457
1458
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1459
void *data)
1460
{
1461
struct dm_rq_target_io *tio = data;
1462
struct mapped_device *md = tio->md;
1463
struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1464
1465
if (!info)
1466
return -ENOMEM;
1467
1468
info->orig = bio_orig;
1469
info->tio = tio;
1470
bio->bi_end_io = end_clone_bio;
1471
bio->bi_private = info;
1472
bio->bi_destructor = dm_rq_bio_destructor;
1473
1474
return 0;
1475
}
1476
1477
static int setup_clone(struct request *clone, struct request *rq,
1478
struct dm_rq_target_io *tio)
1479
{
1480
int r;
1481
1482
r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1483
dm_rq_bio_constructor, tio);
1484
if (r)
1485
return r;
1486
1487
clone->cmd = rq->cmd;
1488
clone->cmd_len = rq->cmd_len;
1489
clone->sense = rq->sense;
1490
clone->buffer = rq->buffer;
1491
clone->end_io = end_clone_request;
1492
clone->end_io_data = tio;
1493
1494
return 0;
1495
}
1496
1497
static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1498
gfp_t gfp_mask)
1499
{
1500
struct request *clone;
1501
struct dm_rq_target_io *tio;
1502
1503
tio = alloc_rq_tio(md, gfp_mask);
1504
if (!tio)
1505
return NULL;
1506
1507
tio->md = md;
1508
tio->ti = NULL;
1509
tio->orig = rq;
1510
tio->error = 0;
1511
memset(&tio->info, 0, sizeof(tio->info));
1512
1513
clone = &tio->clone;
1514
if (setup_clone(clone, rq, tio)) {
1515
/* -ENOMEM */
1516
free_rq_tio(tio);
1517
return NULL;
1518
}
1519
1520
return clone;
1521
}
1522
1523
/*
1524
* Called with the queue lock held.
1525
*/
1526
static int dm_prep_fn(struct request_queue *q, struct request *rq)
1527
{
1528
struct mapped_device *md = q->queuedata;
1529
struct request *clone;
1530
1531
if (unlikely(rq->special)) {
1532
DMWARN("Already has something in rq->special.");
1533
return BLKPREP_KILL;
1534
}
1535
1536
clone = clone_rq(rq, md, GFP_ATOMIC);
1537
if (!clone)
1538
return BLKPREP_DEFER;
1539
1540
rq->special = clone;
1541
rq->cmd_flags |= REQ_DONTPREP;
1542
1543
return BLKPREP_OK;
1544
}
1545
1546
/*
1547
* Returns:
1548
* 0 : the request has been processed (not requeued)
1549
* !0 : the request has been requeued
1550
*/
1551
static int map_request(struct dm_target *ti, struct request *clone,
1552
struct mapped_device *md)
1553
{
1554
int r, requeued = 0;
1555
struct dm_rq_target_io *tio = clone->end_io_data;
1556
1557
/*
1558
* Hold the md reference here for the in-flight I/O.
1559
* We can't rely on the reference count by device opener,
1560
* because the device may be closed during the request completion
1561
* when all bios are completed.
1562
* See the comment in rq_completed() too.
1563
*/
1564
dm_get(md);
1565
1566
tio->ti = ti;
1567
r = ti->type->map_rq(ti, clone, &tio->info);
1568
switch (r) {
1569
case DM_MAPIO_SUBMITTED:
1570
/* The target has taken the I/O to submit by itself later */
1571
break;
1572
case DM_MAPIO_REMAPPED:
1573
/* The target has remapped the I/O so dispatch it */
1574
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1575
blk_rq_pos(tio->orig));
1576
dm_dispatch_request(clone);
1577
break;
1578
case DM_MAPIO_REQUEUE:
1579
/* The target wants to requeue the I/O */
1580
dm_requeue_unmapped_request(clone);
1581
requeued = 1;
1582
break;
1583
default:
1584
if (r > 0) {
1585
DMWARN("unimplemented target map return value: %d", r);
1586
BUG();
1587
}
1588
1589
/* The target wants to complete the I/O */
1590
dm_kill_unmapped_request(clone, r);
1591
break;
1592
}
1593
1594
return requeued;
1595
}
1596
1597
/*
1598
* q->request_fn for request-based dm.
1599
* Called with the queue lock held.
1600
*/
1601
static void dm_request_fn(struct request_queue *q)
1602
{
1603
struct mapped_device *md = q->queuedata;
1604
struct dm_table *map = dm_get_live_table(md);
1605
struct dm_target *ti;
1606
struct request *rq, *clone;
1607
sector_t pos;
1608
1609
/*
1610
* For suspend, check blk_queue_stopped() and increment
1611
* ->pending within a single queue_lock not to increment the
1612
* number of in-flight I/Os after the queue is stopped in
1613
* dm_suspend().
1614
*/
1615
while (!blk_queue_stopped(q)) {
1616
rq = blk_peek_request(q);
1617
if (!rq)
1618
goto delay_and_out;
1619
1620
/* always use block 0 to find the target for flushes for now */
1621
pos = 0;
1622
if (!(rq->cmd_flags & REQ_FLUSH))
1623
pos = blk_rq_pos(rq);
1624
1625
ti = dm_table_find_target(map, pos);
1626
BUG_ON(!dm_target_is_valid(ti));
1627
1628
if (ti->type->busy && ti->type->busy(ti))
1629
goto delay_and_out;
1630
1631
blk_start_request(rq);
1632
clone = rq->special;
1633
atomic_inc(&md->pending[rq_data_dir(clone)]);
1634
1635
spin_unlock(q->queue_lock);
1636
if (map_request(ti, clone, md))
1637
goto requeued;
1638
1639
BUG_ON(!irqs_disabled());
1640
spin_lock(q->queue_lock);
1641
}
1642
1643
goto out;
1644
1645
requeued:
1646
BUG_ON(!irqs_disabled());
1647
spin_lock(q->queue_lock);
1648
1649
delay_and_out:
1650
blk_delay_queue(q, HZ / 10);
1651
out:
1652
dm_table_put(map);
1653
1654
return;
1655
}
1656
1657
int dm_underlying_device_busy(struct request_queue *q)
1658
{
1659
return blk_lld_busy(q);
1660
}
1661
EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1662
1663
static int dm_lld_busy(struct request_queue *q)
1664
{
1665
int r;
1666
struct mapped_device *md = q->queuedata;
1667
struct dm_table *map = dm_get_live_table(md);
1668
1669
if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1670
r = 1;
1671
else
1672
r = dm_table_any_busy_target(map);
1673
1674
dm_table_put(map);
1675
1676
return r;
1677
}
1678
1679
static int dm_any_congested(void *congested_data, int bdi_bits)
1680
{
1681
int r = bdi_bits;
1682
struct mapped_device *md = congested_data;
1683
struct dm_table *map;
1684
1685
if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1686
map = dm_get_live_table(md);
1687
if (map) {
1688
/*
1689
* Request-based dm cares about only own queue for
1690
* the query about congestion status of request_queue
1691
*/
1692
if (dm_request_based(md))
1693
r = md->queue->backing_dev_info.state &
1694
bdi_bits;
1695
else
1696
r = dm_table_any_congested(map, bdi_bits);
1697
1698
dm_table_put(map);
1699
}
1700
}
1701
1702
return r;
1703
}
1704
1705
/*-----------------------------------------------------------------
1706
* An IDR is used to keep track of allocated minor numbers.
1707
*---------------------------------------------------------------*/
1708
static DEFINE_IDR(_minor_idr);
1709
1710
static void free_minor(int minor)
1711
{
1712
spin_lock(&_minor_lock);
1713
idr_remove(&_minor_idr, minor);
1714
spin_unlock(&_minor_lock);
1715
}
1716
1717
/*
1718
* See if the device with a specific minor # is free.
1719
*/
1720
static int specific_minor(int minor)
1721
{
1722
int r, m;
1723
1724
if (minor >= (1 << MINORBITS))
1725
return -EINVAL;
1726
1727
r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1728
if (!r)
1729
return -ENOMEM;
1730
1731
spin_lock(&_minor_lock);
1732
1733
if (idr_find(&_minor_idr, minor)) {
1734
r = -EBUSY;
1735
goto out;
1736
}
1737
1738
r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1739
if (r)
1740
goto out;
1741
1742
if (m != minor) {
1743
idr_remove(&_minor_idr, m);
1744
r = -EBUSY;
1745
goto out;
1746
}
1747
1748
out:
1749
spin_unlock(&_minor_lock);
1750
return r;
1751
}
1752
1753
static int next_free_minor(int *minor)
1754
{
1755
int r, m;
1756
1757
r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1758
if (!r)
1759
return -ENOMEM;
1760
1761
spin_lock(&_minor_lock);
1762
1763
r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1764
if (r)
1765
goto out;
1766
1767
if (m >= (1 << MINORBITS)) {
1768
idr_remove(&_minor_idr, m);
1769
r = -ENOSPC;
1770
goto out;
1771
}
1772
1773
*minor = m;
1774
1775
out:
1776
spin_unlock(&_minor_lock);
1777
return r;
1778
}
1779
1780
static const struct block_device_operations dm_blk_dops;
1781
1782
static void dm_wq_work(struct work_struct *work);
1783
1784
static void dm_init_md_queue(struct mapped_device *md)
1785
{
1786
/*
1787
* Request-based dm devices cannot be stacked on top of bio-based dm
1788
* devices. The type of this dm device has not been decided yet.
1789
* The type is decided at the first table loading time.
1790
* To prevent problematic device stacking, clear the queue flag
1791
* for request stacking support until then.
1792
*
1793
* This queue is new, so no concurrency on the queue_flags.
1794
*/
1795
queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1796
1797
md->queue->queuedata = md;
1798
md->queue->backing_dev_info.congested_fn = dm_any_congested;
1799
md->queue->backing_dev_info.congested_data = md;
1800
blk_queue_make_request(md->queue, dm_request);
1801
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1802
blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1803
blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1804
}
1805
1806
/*
1807
* Allocate and initialise a blank device with a given minor.
1808
*/
1809
static struct mapped_device *alloc_dev(int minor)
1810
{
1811
int r;
1812
struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1813
void *old_md;
1814
1815
if (!md) {
1816
DMWARN("unable to allocate device, out of memory.");
1817
return NULL;
1818
}
1819
1820
if (!try_module_get(THIS_MODULE))
1821
goto bad_module_get;
1822
1823
/* get a minor number for the dev */
1824
if (minor == DM_ANY_MINOR)
1825
r = next_free_minor(&minor);
1826
else
1827
r = specific_minor(minor);
1828
if (r < 0)
1829
goto bad_minor;
1830
1831
md->type = DM_TYPE_NONE;
1832
init_rwsem(&md->io_lock);
1833
mutex_init(&md->suspend_lock);
1834
mutex_init(&md->type_lock);
1835
spin_lock_init(&md->deferred_lock);
1836
rwlock_init(&md->map_lock);
1837
atomic_set(&md->holders, 1);
1838
atomic_set(&md->open_count, 0);
1839
atomic_set(&md->event_nr, 0);
1840
atomic_set(&md->uevent_seq, 0);
1841
INIT_LIST_HEAD(&md->uevent_list);
1842
spin_lock_init(&md->uevent_lock);
1843
1844
md->queue = blk_alloc_queue(GFP_KERNEL);
1845
if (!md->queue)
1846
goto bad_queue;
1847
1848
dm_init_md_queue(md);
1849
1850
md->disk = alloc_disk(1);
1851
if (!md->disk)
1852
goto bad_disk;
1853
1854
atomic_set(&md->pending[0], 0);
1855
atomic_set(&md->pending[1], 0);
1856
init_waitqueue_head(&md->wait);
1857
INIT_WORK(&md->work, dm_wq_work);
1858
init_waitqueue_head(&md->eventq);
1859
1860
md->disk->major = _major;
1861
md->disk->first_minor = minor;
1862
md->disk->fops = &dm_blk_dops;
1863
md->disk->queue = md->queue;
1864
md->disk->private_data = md;
1865
sprintf(md->disk->disk_name, "dm-%d", minor);
1866
add_disk(md->disk);
1867
format_dev_t(md->name, MKDEV(_major, minor));
1868
1869
md->wq = alloc_workqueue("kdmflush",
1870
WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1871
if (!md->wq)
1872
goto bad_thread;
1873
1874
md->bdev = bdget_disk(md->disk, 0);
1875
if (!md->bdev)
1876
goto bad_bdev;
1877
1878
bio_init(&md->flush_bio);
1879
md->flush_bio.bi_bdev = md->bdev;
1880
md->flush_bio.bi_rw = WRITE_FLUSH;
1881
1882
/* Populate the mapping, nobody knows we exist yet */
1883
spin_lock(&_minor_lock);
1884
old_md = idr_replace(&_minor_idr, md, minor);
1885
spin_unlock(&_minor_lock);
1886
1887
BUG_ON(old_md != MINOR_ALLOCED);
1888
1889
return md;
1890
1891
bad_bdev:
1892
destroy_workqueue(md->wq);
1893
bad_thread:
1894
del_gendisk(md->disk);
1895
put_disk(md->disk);
1896
bad_disk:
1897
blk_cleanup_queue(md->queue);
1898
bad_queue:
1899
free_minor(minor);
1900
bad_minor:
1901
module_put(THIS_MODULE);
1902
bad_module_get:
1903
kfree(md);
1904
return NULL;
1905
}
1906
1907
static void unlock_fs(struct mapped_device *md);
1908
1909
static void free_dev(struct mapped_device *md)
1910
{
1911
int minor = MINOR(disk_devt(md->disk));
1912
1913
unlock_fs(md);
1914
bdput(md->bdev);
1915
destroy_workqueue(md->wq);
1916
if (md->tio_pool)
1917
mempool_destroy(md->tio_pool);
1918
if (md->io_pool)
1919
mempool_destroy(md->io_pool);
1920
if (md->bs)
1921
bioset_free(md->bs);
1922
blk_integrity_unregister(md->disk);
1923
del_gendisk(md->disk);
1924
free_minor(minor);
1925
1926
spin_lock(&_minor_lock);
1927
md->disk->private_data = NULL;
1928
spin_unlock(&_minor_lock);
1929
1930
put_disk(md->disk);
1931
blk_cleanup_queue(md->queue);
1932
module_put(THIS_MODULE);
1933
kfree(md);
1934
}
1935
1936
static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1937
{
1938
struct dm_md_mempools *p;
1939
1940
if (md->io_pool && md->tio_pool && md->bs)
1941
/* the md already has necessary mempools */
1942
goto out;
1943
1944
p = dm_table_get_md_mempools(t);
1945
BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1946
1947
md->io_pool = p->io_pool;
1948
p->io_pool = NULL;
1949
md->tio_pool = p->tio_pool;
1950
p->tio_pool = NULL;
1951
md->bs = p->bs;
1952
p->bs = NULL;
1953
1954
out:
1955
/* mempool bind completed, now no need any mempools in the table */
1956
dm_table_free_md_mempools(t);
1957
}
1958
1959
/*
1960
* Bind a table to the device.
1961
*/
1962
static void event_callback(void *context)
1963
{
1964
unsigned long flags;
1965
LIST_HEAD(uevents);
1966
struct mapped_device *md = (struct mapped_device *) context;
1967
1968
spin_lock_irqsave(&md->uevent_lock, flags);
1969
list_splice_init(&md->uevent_list, &uevents);
1970
spin_unlock_irqrestore(&md->uevent_lock, flags);
1971
1972
dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1973
1974
atomic_inc(&md->event_nr);
1975
wake_up(&md->eventq);
1976
}
1977
1978
/*
1979
* Protected by md->suspend_lock obtained by dm_swap_table().
1980
*/
1981
static void __set_size(struct mapped_device *md, sector_t size)
1982
{
1983
set_capacity(md->disk, size);
1984
1985
i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1986
}
1987
1988
/*
1989
* Returns old map, which caller must destroy.
1990
*/
1991
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1992
struct queue_limits *limits)
1993
{
1994
struct dm_table *old_map;
1995
struct request_queue *q = md->queue;
1996
sector_t size;
1997
unsigned long flags;
1998
1999
size = dm_table_get_size(t);
2000
2001
/*
2002
* Wipe any geometry if the size of the table changed.
2003
*/
2004
if (size != get_capacity(md->disk))
2005
memset(&md->geometry, 0, sizeof(md->geometry));
2006
2007
__set_size(md, size);
2008
2009
dm_table_event_callback(t, event_callback, md);
2010
2011
/*
2012
* The queue hasn't been stopped yet, if the old table type wasn't
2013
* for request-based during suspension. So stop it to prevent
2014
* I/O mapping before resume.
2015
* This must be done before setting the queue restrictions,
2016
* because request-based dm may be run just after the setting.
2017
*/
2018
if (dm_table_request_based(t) && !blk_queue_stopped(q))
2019
stop_queue(q);
2020
2021
__bind_mempools(md, t);
2022
2023
write_lock_irqsave(&md->map_lock, flags);
2024
old_map = md->map;
2025
md->map = t;
2026
dm_table_set_restrictions(t, q, limits);
2027
write_unlock_irqrestore(&md->map_lock, flags);
2028
2029
return old_map;
2030
}
2031
2032
/*
2033
* Returns unbound table for the caller to free.
2034
*/
2035
static struct dm_table *__unbind(struct mapped_device *md)
2036
{
2037
struct dm_table *map = md->map;
2038
unsigned long flags;
2039
2040
if (!map)
2041
return NULL;
2042
2043
dm_table_event_callback(map, NULL, NULL);
2044
write_lock_irqsave(&md->map_lock, flags);
2045
md->map = NULL;
2046
write_unlock_irqrestore(&md->map_lock, flags);
2047
2048
return map;
2049
}
2050
2051
/*
2052
* Constructor for a new device.
2053
*/
2054
int dm_create(int minor, struct mapped_device **result)
2055
{
2056
struct mapped_device *md;
2057
2058
md = alloc_dev(minor);
2059
if (!md)
2060
return -ENXIO;
2061
2062
dm_sysfs_init(md);
2063
2064
*result = md;
2065
return 0;
2066
}
2067
2068
/*
2069
* Functions to manage md->type.
2070
* All are required to hold md->type_lock.
2071
*/
2072
void dm_lock_md_type(struct mapped_device *md)
2073
{
2074
mutex_lock(&md->type_lock);
2075
}
2076
2077
void dm_unlock_md_type(struct mapped_device *md)
2078
{
2079
mutex_unlock(&md->type_lock);
2080
}
2081
2082
void dm_set_md_type(struct mapped_device *md, unsigned type)
2083
{
2084
md->type = type;
2085
}
2086
2087
unsigned dm_get_md_type(struct mapped_device *md)
2088
{
2089
return md->type;
2090
}
2091
2092
/*
2093
* Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2094
*/
2095
static int dm_init_request_based_queue(struct mapped_device *md)
2096
{
2097
struct request_queue *q = NULL;
2098
2099
if (md->queue->elevator)
2100
return 1;
2101
2102
/* Fully initialize the queue */
2103
q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2104
if (!q)
2105
return 0;
2106
2107
md->queue = q;
2108
md->saved_make_request_fn = md->queue->make_request_fn;
2109
dm_init_md_queue(md);
2110
blk_queue_softirq_done(md->queue, dm_softirq_done);
2111
blk_queue_prep_rq(md->queue, dm_prep_fn);
2112
blk_queue_lld_busy(md->queue, dm_lld_busy);
2113
2114
elv_register_queue(md->queue);
2115
2116
return 1;
2117
}
2118
2119
/*
2120
* Setup the DM device's queue based on md's type
2121
*/
2122
int dm_setup_md_queue(struct mapped_device *md)
2123
{
2124
if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2125
!dm_init_request_based_queue(md)) {
2126
DMWARN("Cannot initialize queue for request-based mapped device");
2127
return -EINVAL;
2128
}
2129
2130
return 0;
2131
}
2132
2133
static struct mapped_device *dm_find_md(dev_t dev)
2134
{
2135
struct mapped_device *md;
2136
unsigned minor = MINOR(dev);
2137
2138
if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2139
return NULL;
2140
2141
spin_lock(&_minor_lock);
2142
2143
md = idr_find(&_minor_idr, minor);
2144
if (md && (md == MINOR_ALLOCED ||
2145
(MINOR(disk_devt(dm_disk(md))) != minor) ||
2146
dm_deleting_md(md) ||
2147
test_bit(DMF_FREEING, &md->flags))) {
2148
md = NULL;
2149
goto out;
2150
}
2151
2152
out:
2153
spin_unlock(&_minor_lock);
2154
2155
return md;
2156
}
2157
2158
struct mapped_device *dm_get_md(dev_t dev)
2159
{
2160
struct mapped_device *md = dm_find_md(dev);
2161
2162
if (md)
2163
dm_get(md);
2164
2165
return md;
2166
}
2167
2168
void *dm_get_mdptr(struct mapped_device *md)
2169
{
2170
return md->interface_ptr;
2171
}
2172
2173
void dm_set_mdptr(struct mapped_device *md, void *ptr)
2174
{
2175
md->interface_ptr = ptr;
2176
}
2177
2178
void dm_get(struct mapped_device *md)
2179
{
2180
atomic_inc(&md->holders);
2181
BUG_ON(test_bit(DMF_FREEING, &md->flags));
2182
}
2183
2184
const char *dm_device_name(struct mapped_device *md)
2185
{
2186
return md->name;
2187
}
2188
EXPORT_SYMBOL_GPL(dm_device_name);
2189
2190

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;

	might_sleep();

	spin_lock(&_minor_lock);
	map = dm_get_live_table(md);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		dm_table_postsuspend_targets(map);
	}

	/*
	 * Rare, but there may still be I/O requests that have yet to
	 * complete.  Wait for all references to disappear.
	 * No one may take a new reference on the mapped_device once it
	 * has entered the DMF_FREEING state.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_put(map);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
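
/*
 * Wait for all in-flight I/O on the device to complete.  Returns 0 once
 * nothing is in flight, or -EINTR if called with TASK_INTERRUPTIBLE and a
 * signal arrives first.
 */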
static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		smp_mb();
		if (!md_in_flight(md))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_read(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c)
			break;

		up_read(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else
			__split_and_process_bio(md, c);

		down_read(&md->io_lock);
	}

	up_read(&md->io_lock);
}
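
/*
 * Allow I/O submission again and kick the per-device workqueue so that
 * dm_wq_work() reprocesses the bios on md->deferred.
 */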
static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r) {
		map = ERR_PTR(r);
		goto out;
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
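
/*
 * Resume a previously suspended device: resume the targets, replay the
 * deferred I/O (restarting the request_queue for request-based devices)
 * and unfreeze the filesystem if lock_fs() froze it during suspend.
 */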
int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
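/*
 * Send a uevent for the device.  If a cookie was supplied, it is passed
 * to userspace in the DM_COOKIE environment variable.
 */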
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}
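
/*
 * Add an event to the device's uevent list, under uevent_lock.
 */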
void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
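
/*
 * Allocate the per-device mempools and bioset.  Bio-based devices draw
 * from the dm_io/dm_target_io slab caches, request-based devices from
 * the clone-bio-info/request-tio caches; the bioset is sized per type.
 */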
struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = bioset_create(pool_size, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto free_bioset_and_out;

	return pools;

free_bioset_and_out:
	bioset_free(pools->bs);

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <[email protected]>");
MODULE_LICENSE("GPL");