GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
/*
23
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
24
* Copyright (c) 2024, 2025, Rob Norris <[email protected]>
25
* Copyright (c) 2024, 2025, Klara, Inc.
26
*/
27
28
#include <sys/dataset_kstats.h>
29
#include <sys/dbuf.h>
30
#include <sys/dmu_traverse.h>
31
#include <sys/dsl_dataset.h>
32
#include <sys/dsl_prop.h>
33
#include <sys/dsl_dir.h>
34
#include <sys/zap.h>
35
#include <sys/zfeature.h>
36
#include <sys/zil_impl.h>
37
#include <sys/dmu_tx.h>
38
#include <sys/zio.h>
39
#include <sys/zfs_rlock.h>
40
#include <sys/spa_impl.h>
41
#include <sys/zvol.h>
42
#include <sys/zvol_impl.h>
43
#include <cityhash.h>
44
45
#include <linux/blkdev_compat.h>
46
#include <linux/task_io_accounting_ops.h>
47
#include <linux/workqueue.h>
48
#include <linux/blk-mq.h>
49
50
static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
51
struct request *rq, boolean_t force_sync);
52
53
static unsigned int zvol_major = ZVOL_MAJOR;
54
static unsigned long zvol_max_discard_blocks = 16384;
55
56
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
57
static unsigned int zvol_open_timeout_ms = 1000;
58
#endif
59
60
static unsigned int zvol_blk_mq_threads = 0;
61
static unsigned int zvol_blk_mq_actual_threads;
62
static boolean_t zvol_use_blk_mq = B_FALSE;
63
64
/*
* The maximum number of volblocksize blocks to process per thread. Typically,
* write-heavy workloads perform better with higher values here, and read-heavy
* workloads perform better with lower values, but that's not a hard and fast
* rule. It's basically a knob to tune between "less overhead with less
* parallelism" and "more overhead, but more parallelism".
*
* '8' was chosen as a reasonable, balanced default based on sequential read
* and write tests to a zvol in an NVMe pool (with 16 CPUs).
*/
74
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
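/*
* For example, with the default of 8 blocks per thread and volblocksize=128k,
* a single blk-mq request carries at most 8 * 128k = 1M of data. The limit is
* applied indirectly, by capping max_segments in zvol_queue_limits_init()
* below (assuming PAGE_SIZE-sized segments). The knob is exposed as the
* 'zvol_blk_mq_blocks_per_thread' module parameter declared at the end of
* this file.
*/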
75
76
#ifndef BLKDEV_DEFAULT_RQ
77
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
78
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
79
#endif
80
81
/*
82
* Finalize our BIO or request.
83
*/
84
static inline void
85
zvol_end_io(struct bio *bio, struct request *rq, int error)
86
{
87
ASSERT3U(error, >=, 0);
88
if (bio) {
89
bio->bi_status = errno_to_bi_status(error);
90
bio_endio(bio);
91
} else {
92
blk_mq_end_request(rq, errno_to_bi_status(error));
93
}
94
}
95
96
static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
97
static unsigned int zvol_actual_blk_mq_queue_depth;
98
99
struct zvol_state_os {
100
struct gendisk *zvo_disk; /* generic disk */
101
struct request_queue *zvo_queue; /* request queue */
102
dev_t zvo_dev; /* device id */
103
104
struct blk_mq_tag_set tag_set;
105
106
/* Set from the global 'zvol_use_blk_mq' at zvol load */
107
boolean_t use_blk_mq;
108
};
109
110
static struct ida zvol_ida;
111
112
/*
113
* This is called when a new block multiqueue request comes in. A request
114
* contains one or more BIOs.
115
*/
116
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
117
const struct blk_mq_queue_data *bd)
118
{
119
struct request *rq = bd->rq;
120
zvol_state_t *zv = rq->q->queuedata;
121
122
/* Tell the kernel that we are starting to process this request */
123
blk_mq_start_request(rq);
124
125
if (blk_rq_is_passthrough(rq)) {
126
/* Skip non-filesystem requests */
127
blk_mq_end_request(rq, BLK_STS_IOERR);
128
return (BLK_STS_IOERR);
129
}
130
131
zvol_request_impl(zv, NULL, rq, 0);
132
133
/* Acknowledge to the kernel that we got this request */
134
return (BLK_STS_OK);
135
}
136
137
static struct blk_mq_ops zvol_blk_mq_queue_ops = {
138
.queue_rq = zvol_mq_queue_rq,
139
};
140
141
/* Initialize our blk-mq struct */
142
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
143
{
144
struct zvol_state_os *zso = zv->zv_zso;
145
146
memset(&zso->tag_set, 0, sizeof (zso->tag_set));
147
148
/* Initialize tag set. */
149
zso->tag_set.ops = &zvol_blk_mq_queue_ops;
150
zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
151
zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
152
zso->tag_set.numa_node = NUMA_NO_NODE;
153
zso->tag_set.cmd_size = 0;
154
155
/*
156
* We need BLK_MQ_F_BLOCKING here since we do blocking calls in
157
* zvol_request_impl()
158
*/
159
zso->tag_set.flags = BLK_MQ_F_BLOCKING;
160
161
#ifdef BLK_MQ_F_SHOULD_MERGE
162
/*
163
* Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit.
164
* For older kernels, we set it.
165
*/
166
zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE;
167
#endif
168
169
zso->tag_set.driver_data = zv;
170
171
return (blk_mq_alloc_tag_set(&zso->tag_set));
172
}
173
174
/*
175
* Given a path, return TRUE if path is a ZVOL.
176
*/
177
boolean_t
178
zvol_os_is_zvol(const char *path)
179
{
180
dev_t dev = 0;
181
182
if (vdev_lookup_bdev(path, &dev) != 0)
183
return (B_FALSE);
184
185
if (MAJOR(dev) == zvol_major)
186
return (B_TRUE);
187
188
return (B_FALSE);
189
}
190
191
static void
192
zvol_write(zv_request_t *zvr)
193
{
194
struct bio *bio = zvr->bio;
195
struct request *rq = zvr->rq;
196
int error = 0;
197
zfs_uio_t uio;
198
zvol_state_t *zv = zvr->zv;
199
struct request_queue *q;
200
struct gendisk *disk;
201
unsigned long start_time = 0;
202
boolean_t acct = B_FALSE;
203
204
ASSERT3P(zv, !=, NULL);
205
ASSERT3U(zv->zv_open_count, >, 0);
206
ASSERT3P(zv->zv_zilog, !=, NULL);
207
208
q = zv->zv_zso->zvo_queue;
209
disk = zv->zv_zso->zvo_disk;
210
211
/* A bio marked as FLUSH needs to flush before the write */
212
if (io_is_flush(bio, rq)) {
213
error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
214
if (error != 0) {
215
rw_exit(&zv->zv_suspend_lock);
216
zvol_end_io(bio, rq, -error);
217
return;
218
}
219
}
220
221
/* Some requests are just for flush and nothing else. */
222
if (io_size(bio, rq) == 0) {
223
rw_exit(&zv->zv_suspend_lock);
224
zvol_end_io(bio, rq, 0);
225
return;
226
}
227
228
zfs_uio_bvec_init(&uio, bio, rq);
229
230
ssize_t start_resid = uio.uio_resid;
231
232
/*
233
* With use_blk_mq, accounting is done by blk_mq_start_request()
234
* and blk_mq_end_request(), so we can skip it here.
235
*/
236
if (bio) {
237
acct = blk_queue_io_stat(q);
238
if (acct) {
239
start_time = blk_generic_start_io_acct(q, disk, WRITE,
240
bio);
241
}
242
}
243
244
boolean_t sync =
245
io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
246
247
zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
248
uio.uio_loffset, uio.uio_resid, RL_WRITER);
249
250
uint64_t volsize = zv->zv_volsize;
251
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
252
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
253
uint64_t off = uio.uio_loffset;
254
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
255
256
if (bytes > volsize - off) /* don't write past the end */
257
bytes = volsize - off;
258
259
dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
260
261
/* This will only fail for ENOSPC */
262
error = dmu_tx_assign(tx, DMU_TX_WAIT);
263
if (error) {
264
dmu_tx_abort(tx);
265
break;
266
}
267
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
268
DMU_READ_PREFETCH);
269
if (error == 0) {
270
zvol_log_write(zv, tx, off, bytes, sync);
271
}
272
dmu_tx_commit(tx);
273
274
if (error)
275
break;
276
}
277
zfs_rangelock_exit(lr);
278
279
int64_t nwritten = start_resid - uio.uio_resid;
280
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
281
task_io_account_write(nwritten);
282
283
if (error == 0 && sync)
284
error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
285
286
rw_exit(&zv->zv_suspend_lock);
287
288
if (bio && acct) {
289
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
290
}
291
292
zvol_end_io(bio, rq, error);
293
}
294
295
static void
296
zvol_write_task(void *arg)
297
{
298
zv_request_task_t *task = arg;
299
zvol_write(&task->zvr);
300
zv_request_task_free(task);
301
}
302
303
static void
304
zvol_discard(zv_request_t *zvr)
305
{
306
struct bio *bio = zvr->bio;
307
struct request *rq = zvr->rq;
308
zvol_state_t *zv = zvr->zv;
309
uint64_t start = io_offset(bio, rq);
310
uint64_t size = io_size(bio, rq);
311
uint64_t end = start + size;
312
boolean_t sync;
313
int error = 0;
314
dmu_tx_t *tx;
315
struct request_queue *q = zv->zv_zso->zvo_queue;
316
struct gendisk *disk = zv->zv_zso->zvo_disk;
317
unsigned long start_time = 0;
318
boolean_t acct = B_FALSE;
319
320
ASSERT3P(zv, !=, NULL);
321
ASSERT3U(zv->zv_open_count, >, 0);
322
ASSERT3P(zv->zv_zilog, !=, NULL);
323
324
if (bio) {
325
acct = blk_queue_io_stat(q);
326
if (acct) {
327
start_time = blk_generic_start_io_acct(q, disk, WRITE,
328
bio);
329
}
330
}
331
332
sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
333
334
if (end > zv->zv_volsize) {
335
error = SET_ERROR(EIO);
336
goto unlock;
337
}
338
339
/*
340
* Align the request to volume block boundaries. This will prevent
341
* dnode_free_range() from zeroing out the unaligned parts which is
342
* slow (read-modify-write) and useless since we are not freeing any
343
* space by doing so.
344
*/
345
start = P2ROUNDUP(start, zv->zv_volblocksize);
346
end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
347
size = end - start;
348
349
if (start >= end)
350
goto unlock;
351
352
zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
353
start, size, RL_WRITER);
354
355
tx = dmu_tx_create(zv->zv_objset);
356
dmu_tx_mark_netfree(tx);
357
error = dmu_tx_assign(tx, DMU_TX_WAIT);
358
if (error != 0) {
359
dmu_tx_abort(tx);
360
} else {
361
zvol_log_truncate(zv, tx, start, size);
362
dmu_tx_commit(tx);
363
error = dmu_free_long_range(zv->zv_objset,
364
ZVOL_OBJ, start, size);
365
}
366
zfs_rangelock_exit(lr);
367
368
if (error == 0 && sync)
369
error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
370
371
unlock:
372
rw_exit(&zv->zv_suspend_lock);
373
374
if (bio && acct) {
375
blk_generic_end_io_acct(q, disk, WRITE, bio,
376
start_time);
377
}
378
379
zvol_end_io(bio, rq, error);
380
}
381
382
static void
383
zvol_discard_task(void *arg)
384
{
385
zv_request_task_t *task = arg;
386
zvol_discard(&task->zvr);
387
zv_request_task_free(task);
388
}
389
390
static void
391
zvol_read(zv_request_t *zvr)
392
{
393
struct bio *bio = zvr->bio;
394
struct request *rq = zvr->rq;
395
int error = 0;
396
zfs_uio_t uio;
397
boolean_t acct = B_FALSE;
398
zvol_state_t *zv = zvr->zv;
399
struct request_queue *q;
400
struct gendisk *disk;
401
unsigned long start_time = 0;
402
403
ASSERT3P(zv, !=, NULL);
404
ASSERT3U(zv->zv_open_count, >, 0);
405
406
zfs_uio_bvec_init(&uio, bio, rq);
407
408
q = zv->zv_zso->zvo_queue;
409
disk = zv->zv_zso->zvo_disk;
410
411
ssize_t start_resid = uio.uio_resid;
412
413
/*
414
* When blk-mq is being used, accounting is done by
415
* blk_mq_start_request() and blk_mq_end_request().
416
*/
417
if (bio) {
418
acct = blk_queue_io_stat(q);
419
if (acct)
420
start_time = blk_generic_start_io_acct(q, disk, READ,
421
bio);
422
}
423
424
zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
425
uio.uio_loffset, uio.uio_resid, RL_READER);
426
427
uint64_t volsize = zv->zv_volsize;
428
429
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
430
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
431
432
/* don't read past the end */
433
if (bytes > volsize - uio.uio_loffset)
434
bytes = volsize - uio.uio_loffset;
435
436
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
437
DMU_READ_PREFETCH);
438
if (error) {
439
/* convert checksum errors into IO errors */
440
if (error == ECKSUM)
441
error = SET_ERROR(EIO);
442
break;
443
}
444
}
445
zfs_rangelock_exit(lr);
446
447
int64_t nread = start_resid - uio.uio_resid;
448
dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
449
task_io_account_read(nread);
450
451
rw_exit(&zv->zv_suspend_lock);
452
453
if (bio && acct) {
454
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
455
}
456
457
zvol_end_io(bio, rq, error);
458
}
459
460
static void
461
zvol_read_task(void *arg)
462
{
463
zv_request_task_t *task = arg;
464
zvol_read(&task->zvr);
465
zv_request_task_free(task);
466
}
467
468
/*
469
* Note:
470
*
471
* The kernel uses different enum names for the IO opcode, depending on the
472
* kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather
473
* than inline functions for these checks.
474
*/
475
/* Should this IO go down the zvol write path? */
476
#define ZVOL_OP_IS_WRITE(op) \
477
(op == REQ_OP_WRITE || \
478
op == REQ_OP_FLUSH || \
479
op == REQ_OP_DISCARD)
480
481
/* Is this IO type supported by zvols? */
482
#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))
483
484
/* Get the IO opcode */
485
#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))
486
487
/*
488
* Process a BIO or request
489
*
490
* Either 'bio' or 'rq' should be set depending on if we are processing a
491
* bio or a request (both should not be set).
492
*
493
* force_sync: Set to 0 to defer processing to a background taskq
494
* Set to 1 to process data synchronously
495
*/
496
static void
497
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
498
boolean_t force_sync)
499
{
500
fstrans_cookie_t cookie = spl_fstrans_mark();
501
uint64_t offset = io_offset(bio, rq);
502
uint64_t size = io_size(bio, rq);
503
int rw;
504
505
if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {
506
zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",
507
rq != NULL ? "request" : "BIO",
508
ZVOL_OP(bio, rq),
509
rq != NULL ? rq->cmd_flags : bio->bi_opf);
510
ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));
511
zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));
512
goto out;
513
}
514
515
if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {
516
rw = WRITE;
517
} else {
518
rw = READ;
519
}
520
521
/*
522
* Sanity check
523
*
524
* If we're a BIO, check our rw matches the kernel's
525
* bio_data_dir(bio) rw. We need to check because we support fewer
526
* IO operations, and want to verify that what we think are reads and
527
* writes from those operations match what the kernel thinks.
528
*/
529
ASSERT(rq != NULL || rw == bio_data_dir(bio));
530
531
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
532
zvol_end_io(bio, rq, SET_ERROR(ENXIO));
533
goto out;
534
}
535
536
if (zvol_request_sync || zv->zv_threading == B_FALSE)
537
force_sync = 1;
538
539
zv_request_t zvr = {
540
.zv = zv,
541
.bio = bio,
542
.rq = rq,
543
};
544
545
if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
546
printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
547
zv->zv_zso->zvo_disk->disk_name,
548
(long long unsigned)offset,
549
(long unsigned)size);
550
551
zvol_end_io(bio, rq, SET_ERROR(EIO));
552
goto out;
553
}
554
555
zv_request_task_t *task;
556
zv_taskq_t *ztqs = &zvol_taskqs;
557
uint_t blk_mq_hw_queue = 0;
558
uint_t tq_idx;
559
uint_t taskq_hash;
560
if (rq)
561
#ifdef HAVE_BLK_MQ_RQ_HCTX
562
blk_mq_hw_queue = rq->mq_hctx->queue_num;
563
#else
564
blk_mq_hw_queue = rq->q->queue_hw_ctx[
565
rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
566
#endif
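/*
* Pick a taskq by hashing (zvol, offset region, hw queue). A sketch of the
* intent: I/Os to the same zvol and nearby offsets deterministically map to
* the same taskq, while different zvols and offset regions spread across all
* of them.
*/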
567
taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
568
blk_mq_hw_queue);
569
tq_idx = taskq_hash % ztqs->tqs_cnt;
570
571
if (rw == WRITE) {
572
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
573
zvol_end_io(bio, rq, SET_ERROR(EROFS));
574
goto out;
575
}
576
577
/*
578
* Prevents the zvol from being suspended, or the ZIL being
579
* concurrently opened. Will be released after the i/o
580
* completes.
581
*/
582
rw_enter(&zv->zv_suspend_lock, RW_READER);
583
584
/*
585
* Open a ZIL if this is the first time we have written to this
586
* zvol. We protect zv->zv_zilog with zv_suspend_lock rather
587
* than zv_state_lock so that we don't need to acquire an
588
* additional lock in this path.
589
*/
590
if (zv->zv_zilog == NULL) {
591
rw_exit(&zv->zv_suspend_lock);
592
rw_enter(&zv->zv_suspend_lock, RW_WRITER);
593
if (zv->zv_zilog == NULL) {
594
zv->zv_zilog = zil_open(zv->zv_objset,
595
zvol_get_data, &zv->zv_kstat.dk_zil_sums);
596
zv->zv_flags |= ZVOL_WRITTEN_TO;
597
/* replay / destroy done in zvol_create_minor */
598
VERIFY0((zv->zv_zilog->zl_header->zh_flags &
599
ZIL_REPLAY_NEEDED));
600
}
601
rw_downgrade(&zv->zv_suspend_lock);
602
}
603
604
/*
605
* We don't want this thread to be blocked waiting for i/o to
606
* complete, so we instead wait from a taskq callback. The
607
* i/o may be a ZIL write (via zil_commit()), or a read of an
608
* indirect block, or a read of a data block (if this is a
609
* partial-block write). We will indicate that the i/o is
610
* complete by calling END_IO() from the taskq callback.
611
*
612
* This design allows the calling thread to continue and
613
* initiate more concurrent operations by calling
614
* zvol_request() again. There are typically only a small
615
* number of threads available to call zvol_request() (e.g.
616
* one per iSCSI target), so keeping the latency of
617
* zvol_request() low is important for performance.
618
*
619
* The zvol_request_sync module parameter allows this
620
* behavior to be altered, for performance evaluation
621
* purposes. If the callback blocks, setting
622
* zvol_request_sync=1 will result in much worse performance.
623
*
624
* We can have up to zvol_threads concurrent i/o's being
625
* processed for all zvols on the system. This is typically
626
* a vast improvement over the zvol_request_sync=1 behavior
627
* of one i/o at a time per zvol. However, an even better
628
* design would be for zvol_request() to initiate the zio
629
* directly, and then be notified by the zio_done callback,
630
* which would call END_IO(). Unfortunately, the DMU/ZIL
631
* interfaces lack this functionality (they block waiting for
632
* the i/o to complete).
633
*/
634
if (io_is_discard(bio, rq)) {
635
if (force_sync) {
636
zvol_discard(&zvr);
637
} else {
638
task = zv_request_task_create(zvr);
639
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
640
zvol_discard_task, task, 0, &task->ent);
641
}
642
} else {
643
if (force_sync) {
644
zvol_write(&zvr);
645
} else {
646
task = zv_request_task_create(zvr);
647
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
648
zvol_write_task, task, 0, &task->ent);
649
}
650
}
651
} else {
652
/*
653
* The SCST driver, and possibly others, may issue READ I/Os
654
* with a length of zero bytes. These empty I/Os contain no
655
* data and require no additional handling.
656
*/
657
if (size == 0) {
658
zvol_end_io(bio, rq, 0);
659
goto out;
660
}
661
662
rw_enter(&zv->zv_suspend_lock, RW_READER);
663
664
/* See comment in WRITE case above. */
665
if (force_sync) {
666
zvol_read(&zvr);
667
} else {
668
task = zv_request_task_create(zvr);
669
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
670
zvol_read_task, task, 0, &task->ent);
671
}
672
}
673
674
out:
675
spl_fstrans_unmark(cookie);
676
}
677
678
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
679
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
680
static void
681
zvol_submit_bio(struct bio *bio)
682
#else
683
static blk_qc_t
684
zvol_submit_bio(struct bio *bio)
685
#endif
686
#else
687
static MAKE_REQUEST_FN_RET
688
zvol_request(struct request_queue *q, struct bio *bio)
689
#endif
690
{
691
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
692
#if defined(HAVE_BIO_BDEV_DISK)
693
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
694
#else
695
struct request_queue *q = bio->bi_disk->queue;
696
#endif
697
#endif
698
zvol_state_t *zv = q->queuedata;
699
700
zvol_request_impl(zv, bio, NULL, 0);
701
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
702
defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
703
!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
704
return (BLK_QC_T_NONE);
705
#endif
706
}
707
708
static int
709
#ifdef HAVE_BLK_MODE_T
710
zvol_open(struct gendisk *disk, blk_mode_t flag)
711
#else
712
zvol_open(struct block_device *bdev, fmode_t flag)
713
#endif
714
{
715
zvol_state_t *zv;
716
int error = 0;
717
boolean_t drop_suspend = B_FALSE;
718
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
719
hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
720
hrtime_t start = gethrtime();
721
722
retry:
723
#endif
724
725
#ifdef HAVE_BLK_MODE_T
726
zv = atomic_load_ptr(&disk->private_data);
727
#else
728
zv = atomic_load_ptr(&bdev->bd_disk->private_data);
729
#endif
730
if (zv == NULL) {
731
return (-SET_ERROR(ENXIO));
732
}
733
734
mutex_enter(&zv->zv_state_lock);
735
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
736
mutex_exit(&zv->zv_state_lock);
737
return (-SET_ERROR(ENXIO));
738
}
739
740
/*
741
* Make sure zvol is not suspended during first open
742
* (hold zv_suspend_lock) and respect proper lock acquisition
743
* ordering - zv_suspend_lock before zv_state_lock
744
*/
745
if (zv->zv_open_count == 0) {
746
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
747
mutex_exit(&zv->zv_state_lock);
748
749
/*
750
* Removal may happen while the locks are down, so
751
* we can't trust zv any longer; we have to start over.
752
*/
753
#ifdef HAVE_BLK_MODE_T
754
zv = atomic_load_ptr(&disk->private_data);
755
#else
756
zv = atomic_load_ptr(&bdev->bd_disk->private_data);
757
#endif
758
if (zv == NULL)
759
return (-SET_ERROR(ENXIO));
760
761
rw_enter(&zv->zv_suspend_lock, RW_READER);
762
mutex_enter(&zv->zv_state_lock);
763
764
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
765
mutex_exit(&zv->zv_state_lock);
766
rw_exit(&zv->zv_suspend_lock);
767
return (-SET_ERROR(ENXIO));
768
}
769
770
/* check to see if zv_suspend_lock is needed */
771
if (zv->zv_open_count != 0) {
772
rw_exit(&zv->zv_suspend_lock);
773
} else {
774
drop_suspend = B_TRUE;
775
}
776
} else {
777
drop_suspend = B_TRUE;
778
}
779
}
780
781
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
782
783
if (zv->zv_open_count == 0) {
784
boolean_t drop_namespace = B_FALSE;
785
786
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
787
788
/*
789
* In all other call paths the spa_namespace_lock is taken
790
* before the bdev->bd_mutex lock. However, on open(2)
791
* the __blkdev_get() function calls fops->open() with the
792
* bdev->bd_mutex lock held. This can result in a deadlock
793
* when zvols from one pool are used as vdevs in another.
794
*
795
* To prevent a lock inversion deadlock we preemptively
796
* take the spa_namespace_lock. Normally the lock will not
797
* be contended and this is safe because spa_open_common()
798
* handles the case where the caller already holds the
799
* spa_namespace_lock.
800
*
801
* When the lock cannot be acquired after multiple retries
802
* this must be the vdev on zvol deadlock case and we have
803
* no choice but to return an error. For 5.12 and older
804
* kernels returning -ERESTARTSYS will result in the
805
* bdev->bd_mutex being dropped, then reacquired, and
806
* fops->open() being called again. This process can be
807
* repeated safely until both locks are acquired. For 5.13
808
* and newer the -ERESTARTSYS retry logic was removed from
809
* the kernel so the only option is to return the error for
810
* the caller to handle it.
811
*/
812
if (!mutex_owned(&spa_namespace_lock)) {
813
if (!mutex_tryenter(&spa_namespace_lock)) {
814
mutex_exit(&zv->zv_state_lock);
815
rw_exit(&zv->zv_suspend_lock);
816
drop_suspend = B_FALSE;
817
818
#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
819
schedule();
820
return (-SET_ERROR(ERESTARTSYS));
821
#else
822
if ((gethrtime() - start) > timeout)
823
return (-SET_ERROR(ERESTARTSYS));
824
825
schedule_timeout_interruptible(
826
MSEC_TO_TICK(10));
827
goto retry;
828
#endif
829
} else {
830
drop_namespace = B_TRUE;
831
}
832
}
833
834
error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
835
836
if (drop_namespace)
837
mutex_exit(&spa_namespace_lock);
838
}
839
840
if (error == 0) {
841
if ((blk_mode_is_open_write(flag)) &&
842
(zv->zv_flags & ZVOL_RDONLY)) {
843
if (zv->zv_open_count == 0)
844
zvol_last_close(zv);
845
846
error = -SET_ERROR(EROFS);
847
} else {
848
zv->zv_open_count++;
849
}
850
}
851
852
mutex_exit(&zv->zv_state_lock);
853
if (drop_suspend)
854
rw_exit(&zv->zv_suspend_lock);
855
856
if (error == 0)
857
#ifdef HAVE_BLK_MODE_T
858
disk_check_media_change(disk);
859
#else
860
zfs_check_media_change(bdev);
861
#endif
862
863
return (error);
864
}
865
866
static void
867
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
868
zvol_release(struct gendisk *disk)
869
#else
870
zvol_release(struct gendisk *disk, fmode_t unused)
871
#endif
872
{
873
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
874
(void) unused;
875
#endif
876
boolean_t drop_suspend = B_TRUE;
877
878
zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
879
if (zv == NULL)
880
return;
881
882
mutex_enter(&zv->zv_state_lock);
883
ASSERT3U(zv->zv_open_count, >, 0);
884
/*
885
* make sure zvol is not suspended during last close
886
* (hold zv_suspend_lock) and respect proper lock acquisition
887
* ordering - zv_suspend_lock before zv_state_lock
888
*/
889
if (zv->zv_open_count == 1) {
890
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
891
mutex_exit(&zv->zv_state_lock);
892
rw_enter(&zv->zv_suspend_lock, RW_READER);
893
mutex_enter(&zv->zv_state_lock);
894
895
/*
896
* Unlike in zvol_open(), we don't check if removal
897
* started here, because we might be one of the openers
898
* that needs to be thrown out! If we're the last, we
899
* need to call zvol_last_close() below to finish
900
* cleanup. So, no special treatment for us.
901
*/
902
903
/* check to see if zv_suspend_lock is needed */
904
if (zv->zv_open_count != 1) {
905
rw_exit(&zv->zv_suspend_lock);
906
drop_suspend = B_FALSE;
907
}
908
}
909
} else {
910
drop_suspend = B_FALSE;
911
}
912
913
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
914
915
zv->zv_open_count--;
916
if (zv->zv_open_count == 0) {
917
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
918
zvol_last_close(zv);
919
}
920
921
mutex_exit(&zv->zv_state_lock);
922
923
if (drop_suspend)
924
rw_exit(&zv->zv_suspend_lock);
925
}
926
927
static int
928
zvol_ioctl(struct block_device *bdev, fmode_t mode,
929
unsigned int cmd, unsigned long arg)
930
{
931
int error = 0;
932
933
zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
934
ASSERT3P(zv, !=, NULL);
935
ASSERT3U(zv->zv_open_count, >, 0);
936
937
switch (cmd) {
938
case BLKFLSBUF:
939
#ifdef HAVE_FSYNC_BDEV
940
fsync_bdev(bdev);
941
#elif defined(HAVE_SYNC_BLOCKDEV)
942
sync_blockdev(bdev);
943
#else
944
#error "Neither fsync_bdev() nor sync_blockdev() found"
945
#endif
946
invalidate_bdev(bdev);
947
rw_enter(&zv->zv_suspend_lock, RW_READER);
948
949
if (!(zv->zv_flags & ZVOL_RDONLY))
950
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
951
952
rw_exit(&zv->zv_suspend_lock);
953
break;
954
955
case BLKZNAME:
956
mutex_enter(&zv->zv_state_lock);
957
error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
958
mutex_exit(&zv->zv_state_lock);
959
if (error)
960
error = SET_ERROR(error);
961
break;
962
963
default:
964
error = SET_ERROR(ENOTTY);
965
break;
966
}
967
968
return (-error);
969
}
970
971
#ifdef CONFIG_COMPAT
972
static int
973
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
974
unsigned cmd, unsigned long arg)
975
{
976
return (zvol_ioctl(bdev, mode, cmd, arg));
977
}
978
#else
979
#define zvol_compat_ioctl NULL
980
#endif
981
982
static unsigned int
983
zvol_check_events(struct gendisk *disk, unsigned int clearing)
984
{
985
unsigned int mask = 0;
986
987
zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
988
989
if (zv != NULL) {
990
mutex_enter(&zv->zv_state_lock);
991
mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
992
zv->zv_changed = 0;
993
mutex_exit(&zv->zv_state_lock);
994
}
995
996
return (mask);
997
}
998
999
static int
1000
zvol_revalidate_disk(struct gendisk *disk)
1001
{
1002
zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
1003
1004
if (zv != NULL) {
1005
mutex_enter(&zv->zv_state_lock);
1006
set_capacity(zv->zv_zso->zvo_disk,
1007
zv->zv_volsize >> SECTOR_BITS);
1008
mutex_exit(&zv->zv_state_lock);
1009
}
1010
1011
return (0);
1012
}
1013
1014
int
1015
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1016
{
1017
struct gendisk *disk = zv->zv_zso->zvo_disk;
1018
1019
#if defined(HAVE_REVALIDATE_DISK_SIZE)
1020
revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
1021
#elif defined(HAVE_REVALIDATE_DISK)
1022
revalidate_disk(disk);
1023
#else
1024
zvol_revalidate_disk(disk);
1025
#endif
1026
return (0);
1027
}
1028
1029
/*
1030
* Provide a simple virtual geometry for legacy compatibility. For devices
1031
* smaller than 1 MiB a small head and sector count is used to allow very
1032
* tiny devices. For devices over 1 MiB a standard head and sector count
* is used to keep the cylinder count reasonable.
1034
*/
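/*
* For example (a sketch, assuming 512-byte sectors): a 1 GiB zvol reports
* 2097152 sectors, so it gets heads=16 and sectors=63, giving
* 2097152 / (16 * 63) = 2080 cylinders.
*/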
1035
static inline int
1036
zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
1037
{
1038
zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
1039
sector_t sectors;
1040
1041
ASSERT3P(zv, !=, NULL);
1042
ASSERT3U(zv->zv_open_count, >, 0);
1043
1044
sectors = get_capacity(zv->zv_zso->zvo_disk);
1045
1046
if (sectors > 2048) {
1047
geo->heads = 16;
1048
geo->sectors = 63;
1049
} else {
1050
geo->heads = 2;
1051
geo->sectors = 4;
1052
}
1053
1054
geo->start = 0;
1055
geo->cylinders = sectors / (geo->heads * geo->sectors);
1056
1057
return (0);
1058
}
1059
1060
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
1061
static int
1062
zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
1063
{
1064
return (zvol_getgeo_impl(disk, geo));
1065
}
1066
#else
1067
static int
1068
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1069
{
1070
return (zvol_getgeo_impl(bdev->bd_disk, geo));
1071
}
1072
#endif
1073
1074
/*
1075
* Why have two separate block_device_operations structs?
1076
*
1077
* Normally we'd just have one, and assign 'submit_bio' as needed. However,
1078
* it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
1079
* can't just change submit_bio dynamically at runtime. So just create two
1080
* separate structs to get around this.
1081
*/
1082
static const struct block_device_operations zvol_ops_blk_mq = {
1083
.open = zvol_open,
1084
.release = zvol_release,
1085
.ioctl = zvol_ioctl,
1086
.compat_ioctl = zvol_compat_ioctl,
1087
.check_events = zvol_check_events,
1088
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1089
.revalidate_disk = zvol_revalidate_disk,
1090
#endif
1091
.getgeo = zvol_getgeo,
1092
.owner = THIS_MODULE,
1093
};
1094
1095
static const struct block_device_operations zvol_ops = {
1096
.open = zvol_open,
1097
.release = zvol_release,
1098
.ioctl = zvol_ioctl,
1099
.compat_ioctl = zvol_compat_ioctl,
1100
.check_events = zvol_check_events,
1101
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1102
.revalidate_disk = zvol_revalidate_disk,
1103
#endif
1104
.getgeo = zvol_getgeo,
1105
.owner = THIS_MODULE,
1106
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
1107
.submit_bio = zvol_submit_bio,
1108
#endif
1109
};
1110
1111
/*
1112
* Since 6.9, Linux has been removing queue limit setters in favour of an
1113
* initial queue_limits struct applied when the device is open. Since 6.11,
1114
* queue_limits is being extended to allow more things to be applied when the
1115
* device is open. Setters are also being removed for this.
1116
*
1117
* For OpenZFS, this means that depending on kernel version, some options may
1118
* be set up before the device is open, and some applied to an open device
1119
* (queue) after the fact.
1120
*
1121
* We manage this complexity by having our own limits struct,
1122
* zvol_queue_limits_t, in which we carry any queue config that we're
1123
* interested in setting. This structure is the same on all kernels.
1124
*
1125
* These limits are then applied to the queue at device open time by the most
1126
* appropriate method for the kernel.
1127
*
1128
* zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
1129
* blk_alloc_disk() exists). This converts our limits struct to a proper Linux
1130
* struct queue_limits, and passes it in. Any fields added in later kernels are
1131
* (obviously) not set up here.
1132
*
1133
* zvol_queue_limits_apply() is called on all kernel versions after the queue
1134
* is created, and applies any remaining config. Before 6.9 that will be
1135
* everything, via setter methods. After 6.9 that will be whatever couldn't be
1136
* put into struct queue_limits. (This implies that zvol_queue_limits_apply()
1137
* will always be a no-op on the latest kernel we support).
1138
*/
1139
typedef struct zvol_queue_limits {
1140
unsigned int zql_max_hw_sectors;
1141
unsigned short zql_max_segments;
1142
unsigned int zql_max_segment_size;
1143
unsigned int zql_io_opt;
1144
unsigned int zql_physical_block_size;
1145
unsigned int zql_max_discard_sectors;
1146
unsigned int zql_discard_granularity;
1147
} zvol_queue_limits_t;
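/*
* The resulting call flow, in sketch form:
*
*   zvol_alloc()
*     zvol_queue_limits_init()          fill in zvol_queue_limits_t
*     zvol_alloc_blk_mq() / zvol_alloc_non_blk_mq()
*       zvol_queue_limits_convert()     6.9+: to struct queue_limits, passed
*                                       to blk_alloc_disk()/blk_mq_alloc_disk()
*       zvol_queue_limits_apply()       anything left over, via setters
*/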
1148
1149
static void
1150
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
1151
boolean_t use_blk_mq)
1152
{
1153
limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
1154
1155
if (use_blk_mq) {
1156
/*
1157
* IO requests can be really big (1MB). When an IO request
1158
* comes in, it is passed off to zvol_read() or zvol_write()
1159
* in a new thread, where it is chunked up into 'volblocksize'
1160
* sized pieces and processed. So for example, if the request
1161
* is a 1MB write and your volblocksize is 128k, one zvol_write
1162
* thread will take that request and sequentially do eight 128k
1163
* IOs. This is due to the fact that the thread needs to lock
1164
* each volblocksize sized block. So you might be wondering:
1165
* "instead of passing the whole 1MB request to one thread,
1166
* why not pass ten individual 128k chunks to ten threads and
1167
* process the whole write in parallel?" The short answer is
1168
* that there's a sweet spot number of chunks that balances
1169
* the greater parallelism with the added overhead of more
1170
* threads. The sweet spot can be different depending on if you
1171
* have a read or write heavy workload. Writes typically want
1172
* high chunk counts while reads typically want lower ones. On
1173
* a test pool with 6 NVMe drives in a 3x 2-disk mirror
1174
* configuration, with volblocksize=8k, the sweet spot for good
1175
* sequential reads and writes was at 8 chunks.
1176
*/
1177
1178
/*
1179
* Below we tell the kernel how big we want our requests
1180
* to be. You would think that blk_queue_io_opt() would be
1181
* used to do this since it is used to "set optimal request
1182
* size for the queue", but that doesn't seem to do
1183
* anything - the kernel still gives you huge requests
1184
* with tons of little PAGE_SIZE segments contained within it.
1185
*
1186
* Knowing that the kernel will just give you PAGE_SIZE segments
1187
* no matter what, you can say "ok, I want PAGE_SIZE byte
1188
* segments, and I want 'N' of them per request", where N is
1189
* the correct number of segments for the volblocksize and
1190
* number of chunks you want.
1191
*/
1192
if (zvol_blk_mq_blocks_per_thread != 0) {
1193
unsigned int chunks;
1194
chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
1195
1196
limits->zql_max_segment_size = PAGE_SIZE;
1197
limits->zql_max_segments =
1198
(zv->zv_volblocksize * chunks) / PAGE_SIZE;
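/*
* For example, with volblocksize=8k, the default of 8 chunks, and 4k
* pages, this allows 8k * 8 / 4k = 16 PAGE_SIZE segments, i.e. at most
* 64k (eight volblocksize blocks) per request.
*/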
1199
} else {
1200
/*
1201
* Special case: zvol_blk_mq_blocks_per_thread = 0
1202
* Max everything out.
1203
*/
1204
limits->zql_max_segments = UINT16_MAX;
1205
limits->zql_max_segment_size = UINT_MAX;
1206
}
1207
} else {
1208
limits->zql_max_segments = UINT16_MAX;
1209
limits->zql_max_segment_size = UINT_MAX;
1210
}
1211
1212
limits->zql_io_opt = DMU_MAX_ACCESS / 2;
1213
1214
limits->zql_physical_block_size = zv->zv_volblocksize;
1215
limits->zql_max_discard_sectors =
1216
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
1217
limits->zql_discard_granularity = zv->zv_volblocksize;
1218
}
1219
1220
#ifdef HAVE_BLK_ALLOC_DISK_2ARG
1221
static void
1222
zvol_queue_limits_convert(zvol_queue_limits_t *limits,
1223
struct queue_limits *qlimits)
1224
{
1225
memset(qlimits, 0, sizeof (struct queue_limits));
1226
qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
1227
qlimits->max_segments = limits->zql_max_segments;
1228
qlimits->max_segment_size = limits->zql_max_segment_size;
1229
qlimits->io_opt = limits->zql_io_opt;
1230
qlimits->physical_block_size = limits->zql_physical_block_size;
1231
qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
1232
qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
1233
qlimits->discard_granularity = limits->zql_discard_granularity;
1234
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
1235
qlimits->features =
1236
BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
1237
#endif
1238
}
1239
#endif
1240
1241
static void
1242
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
1243
struct request_queue *queue)
1244
{
1245
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
1246
blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
1247
blk_queue_max_segments(queue, limits->zql_max_segments);
1248
blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
1249
blk_queue_io_opt(queue, limits->zql_io_opt);
1250
blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
1251
blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
1252
blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
1253
#endif
1254
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
1255
blk_queue_set_write_cache(queue, B_TRUE);
1256
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
1257
#endif
1258
}
1259
1260
static int
1261
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
1262
{
1263
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
1264
#if defined(HAVE_BLK_ALLOC_DISK)
1265
zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
1266
if (zso->zvo_disk == NULL)
1267
return (1);
1268
1269
zso->zvo_disk->minors = ZVOL_MINORS;
1270
zso->zvo_queue = zso->zvo_disk->queue;
1271
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1272
struct queue_limits qlimits;
1273
zvol_queue_limits_convert(limits, &qlimits);
1274
struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
1275
if (IS_ERR(disk)) {
1276
zso->zvo_disk = NULL;
1277
return (1);
1278
}
1279
1280
zso->zvo_disk = disk;
1281
zso->zvo_disk->minors = ZVOL_MINORS;
1282
zso->zvo_queue = zso->zvo_disk->queue;
1283
1284
#else
1285
zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
1286
if (zso->zvo_queue == NULL)
1287
return (1);
1288
1289
zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1290
if (zso->zvo_disk == NULL) {
1291
blk_cleanup_queue(zso->zvo_queue);
1292
return (1);
1293
}
1294
1295
zso->zvo_disk->queue = zso->zvo_queue;
1296
#endif /* HAVE_BLK_ALLOC_DISK */
1297
#else
1298
zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
1299
if (zso->zvo_queue == NULL)
1300
return (1);
1301
1302
zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1303
if (zso->zvo_disk == NULL) {
1304
blk_cleanup_queue(zso->zvo_queue);
1305
return (1);
1306
}
1307
1308
zso->zvo_disk->queue = zso->zvo_queue;
1309
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
1310
1311
zvol_queue_limits_apply(limits, zso->zvo_queue);
1312
1313
return (0);
1314
1315
}
1316
1317
static int
1318
zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
1319
{
1320
struct zvol_state_os *zso = zv->zv_zso;
1321
1322
/* Allocate our blk-mq tag_set */
1323
if (zvol_blk_mq_alloc_tag_set(zv) != 0)
1324
return (1);
1325
1326
#if defined(HAVE_BLK_ALLOC_DISK)
1327
zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
1328
if (zso->zvo_disk == NULL) {
1329
blk_mq_free_tag_set(&zso->tag_set);
1330
return (1);
1331
}
1332
zso->zvo_queue = zso->zvo_disk->queue;
1333
zso->zvo_disk->minors = ZVOL_MINORS;
1334
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1335
struct queue_limits qlimits;
1336
zvol_queue_limits_convert(limits, &qlimits);
1337
struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
1338
if (IS_ERR(disk)) {
1339
zso->zvo_disk = NULL;
1340
blk_mq_free_tag_set(&zso->tag_set);
1341
return (1);
1342
}
1343
1344
zso->zvo_disk = disk;
1345
zso->zvo_queue = zso->zvo_disk->queue;
1346
zso->zvo_disk->minors = ZVOL_MINORS;
1347
#else
1348
zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1349
if (zso->zvo_disk == NULL) {
1350
blk_cleanup_queue(zso->zvo_queue);
1351
blk_mq_free_tag_set(&zso->tag_set);
1352
return (1);
1353
}
1354
/* Allocate queue */
1355
zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
1356
if (IS_ERR(zso->zvo_queue)) {
1357
blk_mq_free_tag_set(&zso->tag_set);
1358
return (1);
1359
}
1360
1361
/* Our queue is now created, assign it to our disk */
1362
zso->zvo_disk->queue = zso->zvo_queue;
1363
#endif
1364
1365
zvol_queue_limits_apply(limits, zso->zvo_queue);
1366
1367
return (0);
1368
}
1369
1370
/*
1371
* Allocate memory for a new zvol_state_t and setup the required
1372
* request queue and generic disk structures for the block device.
1373
*/
1374
static int
1375
zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
1376
zvol_state_t **zvp)
1377
{
1378
zvol_state_t *zv;
1379
struct zvol_state_os *zso;
1380
uint64_t volmode;
1381
int ret;
1382
1383
ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
1384
if (ret)
1385
return (ret);
1386
1387
if (volmode == ZFS_VOLMODE_DEFAULT)
1388
volmode = zvol_volmode;
1389
1390
if (volmode == ZFS_VOLMODE_NONE)
1391
return (0);
1392
1393
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
1394
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1395
zv->zv_zso = zso;
1396
zv->zv_volmode = volmode;
1397
zv->zv_volsize = volsize;
1398
zv->zv_volblocksize = volblocksize;
1399
1400
list_link_init(&zv->zv_next);
1401
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1402
cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1403
1404
zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
1405
1406
zvol_queue_limits_t limits;
1407
zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
1408
1409
/*
1410
* The block layer has 3 interfaces for getting BIOs:
1411
*
1412
* 1. blk-mq request queues (new)
1413
* 2. submit_bio() (oldest)
1414
* 3. regular request queues (old).
1415
*
1416
* Each of those interfaces has two permutations:
1417
*
1418
* a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
1419
* both the disk and its queue (5.14 kernel or newer)
1420
*
1421
* b) We don't have blk_*alloc_disk(), and have to allocate the
1422
* disk and the queue separately. (5.13 kernel or older)
1423
*/
1424
if (zv->zv_zso->use_blk_mq) {
1425
ret = zvol_alloc_blk_mq(zv, &limits);
1426
if (ret != 0)
1427
goto out_kmem;
1428
zso->zvo_disk->fops = &zvol_ops_blk_mq;
1429
} else {
1430
ret = zvol_alloc_non_blk_mq(zso, &limits);
1431
if (ret != 0)
1432
goto out_kmem;
1433
zso->zvo_disk->fops = &zvol_ops;
1434
}
1435
1436
/* Limit read-ahead to a single page to prevent over-prefetching. */
1437
blk_queue_set_read_ahead(zso->zvo_queue, 1);
1438
1439
if (!zv->zv_zso->use_blk_mq) {
1440
/* Disable write merging in favor of the ZIO pipeline. */
1441
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
1442
}
1443
1444
zso->zvo_queue->queuedata = zv;
1445
zso->zvo_dev = dev;
1446
zv->zv_open_count = 0;
1447
strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
1448
1449
zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1450
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1451
1452
zso->zvo_disk->major = zvol_major;
1453
zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
1454
1455
/*
1456
* Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
1457
* This is accomplished by limiting the number of minors for the
1458
* device to one and explicitly disabling partition scanning.
1459
*/
1460
if (volmode == ZFS_VOLMODE_DEV) {
1461
zso->zvo_disk->minors = 1;
1462
zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
1463
zso->zvo_disk->flags |= GENHD_FL_NO_PART;
1464
}
1465
1466
zso->zvo_disk->first_minor = (dev & MINORMASK);
1467
zso->zvo_disk->private_data = zv;
1468
snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
1469
ZVOL_DEV_NAME, (dev & MINORMASK));
1470
1471
*zvp = zv;
1472
return (ret);
1473
1474
out_kmem:
1475
kmem_free(zso, sizeof (struct zvol_state_os));
1476
kmem_free(zv, sizeof (zvol_state_t));
1477
return (ret);
1478
}
1479
1480
void
1481
zvol_os_remove_minor(zvol_state_t *zv)
1482
{
1483
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1484
ASSERT0(zv->zv_open_count);
1485
ASSERT0(atomic_read(&zv->zv_suspend_ref));
1486
ASSERT(zv->zv_flags & ZVOL_REMOVING);
1487
1488
struct zvol_state_os *zso = zv->zv_zso;
1489
zv->zv_zso = NULL;
1490
1491
/* Clearing private_data will make new callers return immediately. */
1492
atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
1493
1494
/*
1495
* Drop the state lock before calling del_gendisk(). There may be
1496
* callers waiting to acquire it, but del_gendisk() will block until
1497
* they exit, which would deadlock.
1498
*/
1499
mutex_exit(&zv->zv_state_lock);
1500
1501
del_gendisk(zso->zvo_disk);
1502
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
1503
(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
1504
#if defined(HAVE_BLK_CLEANUP_DISK)
1505
blk_cleanup_disk(zso->zvo_disk);
1506
#else
1507
put_disk(zso->zvo_disk);
1508
#endif
1509
#else
1510
blk_cleanup_queue(zso->zvo_queue);
1511
put_disk(zso->zvo_disk);
1512
#endif
1513
1514
if (zso->use_blk_mq)
1515
blk_mq_free_tag_set(&zso->tag_set);
1516
1517
ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
1518
1519
kmem_free(zso, sizeof (struct zvol_state_os));
1520
1521
mutex_enter(&zv->zv_state_lock);
1522
}
1523
1524
void
1525
zvol_os_free(zvol_state_t *zv)
1526
{
1527
1528
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1529
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1530
ASSERT0(zv->zv_open_count);
1531
ASSERT0P(zv->zv_zso);
1532
1533
ASSERT0P(zv->zv_objset);
1534
ASSERT0P(zv->zv_zilog);
1535
ASSERT0P(zv->zv_dn);
1536
1537
rw_destroy(&zv->zv_suspend_lock);
1538
zfs_rangelock_fini(&zv->zv_rangelock);
1539
1540
cv_destroy(&zv->zv_removing_cv);
1541
mutex_destroy(&zv->zv_state_lock);
1542
dataset_kstats_destroy(&zv->zv_kstat);
1543
1544
kmem_free(zv, sizeof (zvol_state_t));
1545
}
1546
1547
void
1548
zvol_wait_close(zvol_state_t *zv)
1549
{
1550
}
1551
1552
struct add_disk_work {
1553
struct delayed_work work;
1554
struct gendisk *disk;
1555
int error;
1556
};
1557
1558
static int
1559
__zvol_os_add_disk(struct gendisk *disk)
1560
{
1561
int error = 0;
1562
#ifdef HAVE_ADD_DISK_RET
1563
error = -add_disk(disk);
1564
if (error)
1565
error = SET_ERROR(error);
1566
#else
1567
add_disk(disk);
1568
#endif
1569
return (error);
1570
}
1571
1572
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
1573
static void
1574
zvol_os_add_disk_work(struct work_struct *work)
1575
{
1576
struct add_disk_work *add_disk_work;
1577
add_disk_work = container_of(work, struct add_disk_work, work.work);
1578
add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
1579
}
1580
#endif
1581
1582
/*
1583
* SPECIAL CASE:
1584
*
1585
* This function basically calls add_disk() from a workqueue. You may be
1586
* thinking: why not just call add_disk() directly?
1587
*
1588
* When you call add_disk(), the zvol appears to the world. When this happens,
1589
* the kernel calls disk_scan_partitions() on the zvol, which behaves
1590
* differently on the 6.9+ kernels:
1591
*
1592
* - 6.8 and older kernels -
1593
* disk_scan_partitions()
1594
* handle = bdev_open_by_dev(
1595
* zvol_open()
1596
* bdev_release(handle);
1597
* zvol_release()
1598
*
1599
*
1600
* - 6.9+ kernels -
1601
* disk_scan_partitions()
1602
* file = bdev_file_open_by_dev()
1603
* zvol_open()
1604
* fput(file)
1605
* < wait for return to userspace >
1606
* zvol_release()
1607
*
1608
* The difference is that the bdev_release() from the 6.8 kernel is synchronous
1609
* while the fput() from the 6.9 kernel is async. Or more specifically it's
1610
* async that has to wait until we return to userspace (since it adds the fput
1611
* into the caller's work queue with the TWA_RESUME flag set). This is not the
1612
* behavior we want, since we want to do things like create+destroy a zvol within
1613
* a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the
1614
* reference to the zvol while we're in the IOCTL, which can't wait until we
1615
* return to userspace.
1616
*
1617
* We can get around this since fput() has a special codepath for when it's
1618
* running in a kernel thread or interrupt. In those cases, it just puts the
1619
* fput into the system workqueue, which we can force to run with
1620
* __flush_workqueue(). That is why we call add_disk() from a workqueue - so it
1621
* runs from a kernel thread and "tricks" the fput() codepaths.
1622
*
1623
* Note that __flush_workqueue() is slowly getting deprecated. This may be ok
1624
* though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
1625
* fput) to happen, which it eventually, naturally, will from the system_wq
1626
* without us explicitly calling __flush_workqueue().
1627
*/
1628
static int
1629
zvol_os_add_disk(struct gendisk *disk)
1630
{
1631
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */
1632
struct add_disk_work add_disk_work;
1633
1634
INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
1635
add_disk_work.disk = disk;
1636
add_disk_work.error = 0;
1637
1638
/* Use *_delayed_work functions since they're not GPL'd */
1639
schedule_delayed_work(&add_disk_work.work, 0);
1640
flush_delayed_work(&add_disk_work.work);
1641
1642
__flush_workqueue(system_wq);
1643
return (add_disk_work.error);
1644
#else /* <= 6.8 kernel */
1645
return (__zvol_os_add_disk(disk));
1646
#endif
1647
}
1648
1649
/*
1650
* Create a block device minor node and setup the linkage between it
1651
* and the specified volume. Once this function returns the block
1652
* device is live and ready for use.
1653
*/
1654
int
1655
zvol_os_create_minor(const char *name)
1656
{
1657
zvol_state_t *zv = NULL;
1658
objset_t *os;
1659
dmu_object_info_t *doi;
1660
uint64_t volsize;
1661
uint64_t len;
1662
unsigned minor = 0;
1663
int error = 0;
1664
int idx;
1665
uint64_t hash = zvol_name_hash(name);
1666
uint64_t volthreading;
1667
bool replayed_zil = B_FALSE;
1668
1669
if (zvol_inhibit_dev)
1670
return (0);
1671
1672
idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));
1673
if (idx < 0)
1674
return (SET_ERROR(-idx));
1675
minor = idx << ZVOL_MINOR_BITS;
1676
if (MINOR(minor) != minor) {
1677
/* too many partitions can cause an overflow */
1678
zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
1679
name, minor, MINOR(minor));
1680
ida_free(&zvol_ida, idx);
1681
return (SET_ERROR(EINVAL));
1682
}
1683
1684
zv = zvol_find_by_name_hash(name, hash, RW_NONE);
1685
if (zv) {
1686
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1687
mutex_exit(&zv->zv_state_lock);
1688
ida_free(&zvol_ida, idx);
1689
return (SET_ERROR(EEXIST));
1690
}
1691
1692
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1693
1694
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1695
if (error)
1696
goto out_doi;
1697
1698
error = dmu_object_info(os, ZVOL_OBJ, doi);
1699
if (error)
1700
goto out_dmu_objset_disown;
1701
1702
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1703
if (error)
1704
goto out_dmu_objset_disown;
1705
1706
error = zvol_alloc(MKDEV(zvol_major, minor), name,
1707
volsize, doi->doi_data_block_size, &zv);
1708
if (error || zv == NULL)
1709
goto out_dmu_objset_disown;
1710
1711
zv->zv_hash = hash;
1712
1713
if (dmu_objset_is_snapshot(os))
1714
zv->zv_flags |= ZVOL_RDONLY;
1715
1716
zv->zv_objset = os;
1717
1718
/* Default */
1719
zv->zv_threading = B_TRUE;
1720
if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
1721
== 0)
1722
zv->zv_threading = volthreading;
1723
1724
set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
1725
1726
#ifdef QUEUE_FLAG_DISCARD
1727
blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
1728
#endif
1729
#ifdef QUEUE_FLAG_NONROT
1730
blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
1731
#endif
1732
#ifdef QUEUE_FLAG_ADD_RANDOM
1733
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
1734
#endif
1735
/* This flag was introduced in kernel version 4.12. */
1736
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
1737
blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
1738
#endif
1739
1740
ASSERT0P(zv->zv_kstat.dk_kstats);
1741
error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1742
if (error)
1743
goto out_dmu_objset_disown;
1744
ASSERT0P(zv->zv_zilog);
1745
zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1746
if (spa_writeable(dmu_objset_spa(os))) {
1747
if (zil_replay_disable)
1748
replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1749
else
1750
replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1751
}
1752
if (replayed_zil)
1753
zil_close(zv->zv_zilog);
1754
zv->zv_zilog = NULL;
1755
1756
/*
1757
* When udev detects the addition of the device it will immediately
1758
* invoke blkid(8) to determine the type of content on the device.
1759
* Prefetching the blocks commonly scanned by blkid(8) will speed
1760
* up this process.
1761
*/
1762
len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1763
if (len > 0) {
1764
dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
1765
dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1766
ZIO_PRIORITY_SYNC_READ);
1767
}
1768
1769
zv->zv_objset = NULL;
1770
out_dmu_objset_disown:
1771
dmu_objset_disown(os, B_TRUE, FTAG);
1772
out_doi:
1773
kmem_free(doi, sizeof (dmu_object_info_t));
1774
1775
/*
1776
* Keep in mind that once add_disk() is called, the zvol is
1777
* announced to the world, and zvol_open()/zvol_release() can
1778
* be called at any time. Incidentally, add_disk() itself calls
1779
* zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
1780
* directly as well.
1781
*/
1782
if (error == 0 && zv) {
1783
rw_enter(&zvol_state_lock, RW_WRITER);
1784
zvol_insert(zv);
1785
rw_exit(&zvol_state_lock);
1786
error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
1787
} else {
1788
ida_free(&zvol_ida, idx);
1789
}
1790
1791
return (error);
1792
}
1793
1794
int
1795
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1796
{
1797
int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
1798
1799
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1800
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1801
1802
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1803
1804
/* move to new hashtable entry */
1805
zv->zv_hash = zvol_name_hash(newname);
1806
hlist_del(&zv->zv_hlink);
1807
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1808
1809
/*
1810
* The block device's read-only state is briefly changed causing
1811
* a KOBJ_CHANGE uevent to be issued. This ensures udev detects
1812
* the name change and fixes the symlinks. This does not change
1813
* ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1814
* changes. This would normally be done using kobject_uevent() but
1815
* that is a GPL-only symbol which is why we need this workaround.
1816
*/
1817
set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
1818
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
1819
1820
dataset_kstats_rename(&zv->zv_kstat, newname);
1821
1822
return (0);
1823
}
1824
1825
void
1826
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1827
{
1828
1829
set_disk_ro(zv->zv_zso->zvo_disk, flags);
1830
}
1831
1832
void
1833
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1834
{
1835
1836
set_capacity(zv->zv_zso->zvo_disk, capacity);
1837
}
1838
1839
int
1840
zvol_init(void)
1841
{
1842
int error;
1843
1844
error = zvol_init_impl();
1845
if (error) {
1846
printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
1847
return (error);
1848
}
1849
1850
error = -register_blkdev(zvol_major, ZVOL_DRIVER);
1851
if (error) {
1852
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1853
return (SET_ERROR(error));
1854
}
1855
1856
if (zvol_blk_mq_queue_depth == 0) {
1857
zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
1858
} else {
1859
zvol_actual_blk_mq_queue_depth =
1860
MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
1861
}
1862
1863
if (zvol_blk_mq_threads == 0) {
1864
zvol_blk_mq_actual_threads = num_online_cpus();
1865
} else {
1866
zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
1867
1024);
1868
}
1869
1870
ida_init(&zvol_ida);
1871
return (0);
1872
}
1873
1874
void
1875
zvol_fini(void)
1876
{
1877
unregister_blkdev(zvol_major, ZVOL_DRIVER);
1878
1879
zvol_fini_impl();
1880
1881
ida_destroy(&zvol_ida);
1882
}
1883
1884
module_param(zvol_major, uint, 0444);
1885
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
1886
1887
module_param(zvol_max_discard_blocks, ulong, 0444);
1888
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
1889
1890
module_param(zvol_blk_mq_queue_depth, uint, 0644);
1891
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
1892
1893
module_param(zvol_use_blk_mq, uint, 0644);
1894
MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
1895
1896
module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
1897
MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
1898
"Process volblocksize blocks per thread");
1899
1900
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
1901
module_param(zvol_open_timeout_ms, uint, 0644);
1902
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
1903
#endif
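/*
* A usage sketch (assuming the parameters are exposed through the usual Linux
* module parameter interface, typically under /sys/module/zfs/parameters/):
*
*   # enable blk-mq for zvols created from now on
*   echo 1 > /sys/module/zfs/parameters/zvol_use_blk_mq
*
*   # process 16 volblocksize blocks per blk-mq thread
*   echo 16 > /sys/module/zfs/parameters/zvol_blk_mq_blocks_per_thread
*
* Note that use_blk_mq is captured per zvol at allocation time (see
* zvol_alloc()), so changing it only affects zvols created afterwards.
*/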
1904
1905