Path: sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, 2025, Rob Norris <[email protected]>
 * Copyright (c) 2024, 2025, Klara, Inc.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned long zvol_max_discard_blocks = 16384;

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based off of sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
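
/*
 * Illustrative example (hypothetical numbers, not part of the original
 * comment): with volblocksize=16K and zvol_blk_mq_blocks_per_thread=8,
 * blk-mq requests are capped at 8 * 16K = 128K of data (see
 * zvol_queue_limits_init() below), so one zvol taskq thread processes at
 * most 128K of a large I/O, and the remainder arrives as separate requests
 * that other threads can service.
 */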

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
 */
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
	ASSERT3U(error, >=, 0);
	if (bio) {
		bio->bi_status = errno_to_bi_status(error);
		bio_endio(bio);
	} else {
		blk_mq_end_request(rq, errno_to_bi_status(error));
	}
}

static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
static unsigned int zvol_actual_blk_mq_queue_depth;

struct zvol_state_os {
	struct gendisk *zvo_disk;	/* generic disk */
	struct request_queue *zvo_queue;	/* request queue */
	dev_t zvo_dev;			/* device id */

	struct blk_mq_tag_set tag_set;

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};

static struct ida zvol_ida;

/*
 * This is called when a new block multiqueue request comes in. A request
 * contains one or more BIOs.
 */
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	zvol_state_t *zv = rq->q->queuedata;

	/* Tell the kernel that we are starting to process this request */
	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		/* Skip non filesystem request */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return (BLK_STS_IOERR);
	}

	zvol_request_impl(zv, NULL, rq, 0);

	/* Acknowledge to the kernel that we got this request */
	return (BLK_STS_OK);
}

static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};

/* Initialize our blk-mq struct */
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
{
	struct zvol_state_os *zso = zv->zv_zso;

	memset(&zso->tag_set, 0, sizeof (zso->tag_set));

	/* Initialize tag set.
*/148zso->tag_set.ops = &zvol_blk_mq_queue_ops;149zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;150zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;151zso->tag_set.numa_node = NUMA_NO_NODE;152zso->tag_set.cmd_size = 0;153154/*155* We need BLK_MQ_F_BLOCKING here since we do blocking calls in156* zvol_request_impl()157*/158zso->tag_set.flags = BLK_MQ_F_BLOCKING;159160#ifdef BLK_MQ_F_SHOULD_MERGE161/*162* Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit.163* For older kernels, we set it.164*/165zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE;166#endif167168zso->tag_set.driver_data = zv;169170return (blk_mq_alloc_tag_set(&zso->tag_set));171}172173/*174* Given a path, return TRUE if path is a ZVOL.175*/176boolean_t177zvol_os_is_zvol(const char *path)178{179dev_t dev = 0;180181if (vdev_lookup_bdev(path, &dev) != 0)182return (B_FALSE);183184if (MAJOR(dev) == zvol_major)185return (B_TRUE);186187return (B_FALSE);188}189190static void191zvol_write(zv_request_t *zvr)192{193struct bio *bio = zvr->bio;194struct request *rq = zvr->rq;195int error = 0;196zfs_uio_t uio;197zvol_state_t *zv = zvr->zv;198struct request_queue *q;199struct gendisk *disk;200unsigned long start_time = 0;201boolean_t acct = B_FALSE;202203ASSERT3P(zv, !=, NULL);204ASSERT3U(zv->zv_open_count, >, 0);205ASSERT3P(zv->zv_zilog, !=, NULL);206207q = zv->zv_zso->zvo_queue;208disk = zv->zv_zso->zvo_disk;209210/* bio marked as FLUSH need to flush before write */211if (io_is_flush(bio, rq)) {212error = zil_commit(zv->zv_zilog, ZVOL_OBJ);213if (error != 0) {214rw_exit(&zv->zv_suspend_lock);215zvol_end_io(bio, rq, -error);216return;217}218}219220/* Some requests are just for flush and nothing else. */221if (io_size(bio, rq) == 0) {222rw_exit(&zv->zv_suspend_lock);223zvol_end_io(bio, rq, 0);224return;225}226227zfs_uio_bvec_init(&uio, bio, rq);228229ssize_t start_resid = uio.uio_resid;230231/*232* With use_blk_mq, accounting is done by blk_mq_start_request()233* and blk_mq_end_request(), so we can skip it here.234*/235if (bio) {236acct = blk_queue_io_stat(q);237if (acct) {238start_time = blk_generic_start_io_acct(q, disk, WRITE,239bio);240}241}242243boolean_t sync =244io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;245246zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,247uio.uio_loffset, uio.uio_resid, RL_WRITER);248249uint64_t volsize = zv->zv_volsize;250while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {251uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);252uint64_t off = uio.uio_loffset;253dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);254255if (bytes > volsize - off) /* don't write past the end */256bytes = volsize - off;257258dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);259260/* This will only fail for ENOSPC */261error = dmu_tx_assign(tx, DMU_TX_WAIT);262if (error) {263dmu_tx_abort(tx);264break;265}266error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,267DMU_READ_PREFETCH);268if (error == 0) {269zvol_log_write(zv, tx, off, bytes, sync);270}271dmu_tx_commit(tx);272273if (error)274break;275}276zfs_rangelock_exit(lr);277278int64_t nwritten = start_resid - uio.uio_resid;279dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);280task_io_account_write(nwritten);281282if (error == 0 && sync)283error = zil_commit(zv->zv_zilog, ZVOL_OBJ);284285rw_exit(&zv->zv_suspend_lock);286287if (bio && acct) {288blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);289}290291zvol_end_io(bio, rq, error);292}293294static void295zvol_write_task(void 
*arg)296{297zv_request_task_t *task = arg;298zvol_write(&task->zvr);299zv_request_task_free(task);300}301302static void303zvol_discard(zv_request_t *zvr)304{305struct bio *bio = zvr->bio;306struct request *rq = zvr->rq;307zvol_state_t *zv = zvr->zv;308uint64_t start = io_offset(bio, rq);309uint64_t size = io_size(bio, rq);310uint64_t end = start + size;311boolean_t sync;312int error = 0;313dmu_tx_t *tx;314struct request_queue *q = zv->zv_zso->zvo_queue;315struct gendisk *disk = zv->zv_zso->zvo_disk;316unsigned long start_time = 0;317boolean_t acct = B_FALSE;318319ASSERT3P(zv, !=, NULL);320ASSERT3U(zv->zv_open_count, >, 0);321ASSERT3P(zv->zv_zilog, !=, NULL);322323if (bio) {324acct = blk_queue_io_stat(q);325if (acct) {326start_time = blk_generic_start_io_acct(q, disk, WRITE,327bio);328}329}330331sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;332333if (end > zv->zv_volsize) {334error = SET_ERROR(EIO);335goto unlock;336}337338/*339* Align the request to volume block boundaries. This will prevent340* dnode_free_range() from zeroing out the unaligned parts which is341* slow (read-modify-write) and useless since we are not freeing any342* space by doing so.343*/344start = P2ROUNDUP(start, zv->zv_volblocksize);345end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);346size = end - start;347348if (start >= end)349goto unlock;350351zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,352start, size, RL_WRITER);353354tx = dmu_tx_create(zv->zv_objset);355dmu_tx_mark_netfree(tx);356error = dmu_tx_assign(tx, DMU_TX_WAIT);357if (error != 0) {358dmu_tx_abort(tx);359} else {360zvol_log_truncate(zv, tx, start, size);361dmu_tx_commit(tx);362error = dmu_free_long_range(zv->zv_objset,363ZVOL_OBJ, start, size);364}365zfs_rangelock_exit(lr);366367if (error == 0 && sync)368error = zil_commit(zv->zv_zilog, ZVOL_OBJ);369370unlock:371rw_exit(&zv->zv_suspend_lock);372373if (bio && acct) {374blk_generic_end_io_acct(q, disk, WRITE, bio,375start_time);376}377378zvol_end_io(bio, rq, error);379}380381static void382zvol_discard_task(void *arg)383{384zv_request_task_t *task = arg;385zvol_discard(&task->zvr);386zv_request_task_free(task);387}388389static void390zvol_read(zv_request_t *zvr)391{392struct bio *bio = zvr->bio;393struct request *rq = zvr->rq;394int error = 0;395zfs_uio_t uio;396boolean_t acct = B_FALSE;397zvol_state_t *zv = zvr->zv;398struct request_queue *q;399struct gendisk *disk;400unsigned long start_time = 0;401402ASSERT3P(zv, !=, NULL);403ASSERT3U(zv->zv_open_count, >, 0);404405zfs_uio_bvec_init(&uio, bio, rq);406407q = zv->zv_zso->zvo_queue;408disk = zv->zv_zso->zvo_disk;409410ssize_t start_resid = uio.uio_resid;411412/*413* When blk-mq is being used, accounting is done by414* blk_mq_start_request() and blk_mq_end_request().415*/416if (bio) {417acct = blk_queue_io_stat(q);418if (acct)419start_time = blk_generic_start_io_acct(q, disk, READ,420bio);421}422423zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,424uio.uio_loffset, uio.uio_resid, RL_READER);425426uint64_t volsize = zv->zv_volsize;427428while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {429uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);430431/* don't read past the end */432if (bytes > volsize - uio.uio_loffset)433bytes = volsize - uio.uio_loffset;434435error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,436DMU_READ_PREFETCH);437if (error) {438/* convert checksum errors into IO errors */439if (error == ECKSUM)440error = 
SET_ERROR(EIO);441break;442}443}444zfs_rangelock_exit(lr);445446int64_t nread = start_resid - uio.uio_resid;447dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);448task_io_account_read(nread);449450rw_exit(&zv->zv_suspend_lock);451452if (bio && acct) {453blk_generic_end_io_acct(q, disk, READ, bio, start_time);454}455456zvol_end_io(bio, rq, error);457}458459static void460zvol_read_task(void *arg)461{462zv_request_task_t *task = arg;463zvol_read(&task->zvr);464zv_request_task_free(task);465}466467/*468* Note:469*470* The kernel uses different enum names for the IO opcode, depending on the471* kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather472* than inline functions for these checks.473*/474/* Should this IO go down the zvol write path? */475#define ZVOL_OP_IS_WRITE(op) \476(op == REQ_OP_WRITE || \477op == REQ_OP_FLUSH || \478op == REQ_OP_DISCARD)479480/* Is this IO type supported by zvols? */481#define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))482483/* Get the IO opcode */484#define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))485486/*487* Process a BIO or request488*489* Either 'bio' or 'rq' should be set depending on if we are processing a490* bio or a request (both should not be set).491*492* force_sync: Set to 0 to defer processing to a background taskq493* Set to 1 to process data synchronously494*/495static void496zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,497boolean_t force_sync)498{499fstrans_cookie_t cookie = spl_fstrans_mark();500uint64_t offset = io_offset(bio, rq);501uint64_t size = io_size(bio, rq);502int rw;503504if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {505zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",506rq != NULL ? "request" : "BIO",507ZVOL_OP(bio, rq),508rq != NULL ? rq->cmd_flags : bio->bi_opf);509ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));510zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));511goto out;512}513514if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {515rw = WRITE;516} else {517rw = READ;518}519520/*521* Sanity check522*523* If we're a BIO, check our rw matches the kernel's524* bio_data_dir(bio) rw. 
We need to check because we support fewer525* IO operations, and want to verify that what we think are reads and526* writes from those operations match what the kernel thinks.527*/528ASSERT(rq != NULL || rw == bio_data_dir(bio));529530if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {531zvol_end_io(bio, rq, SET_ERROR(ENXIO));532goto out;533}534535if (zvol_request_sync || zv->zv_threading == B_FALSE)536force_sync = 1;537538zv_request_t zvr = {539.zv = zv,540.bio = bio,541.rq = rq,542};543544if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {545printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",546zv->zv_zso->zvo_disk->disk_name,547(long long unsigned)offset,548(long unsigned)size);549550zvol_end_io(bio, rq, SET_ERROR(EIO));551goto out;552}553554zv_request_task_t *task;555zv_taskq_t *ztqs = &zvol_taskqs;556uint_t blk_mq_hw_queue = 0;557uint_t tq_idx;558uint_t taskq_hash;559if (rq)560#ifdef HAVE_BLK_MQ_RQ_HCTX561blk_mq_hw_queue = rq->mq_hctx->queue_num;562#else563blk_mq_hw_queue = rq->q->queue_hw_ctx[564rq->q->mq_map[raw_smp_processor_id()]]->queue_num;565#endif566taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,567blk_mq_hw_queue);568tq_idx = taskq_hash % ztqs->tqs_cnt;569570if (rw == WRITE) {571if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {572zvol_end_io(bio, rq, SET_ERROR(EROFS));573goto out;574}575576/*577* Prevents the zvol from being suspended, or the ZIL being578* concurrently opened. Will be released after the i/o579* completes.580*/581rw_enter(&zv->zv_suspend_lock, RW_READER);582583/*584* Open a ZIL if this is the first time we have written to this585* zvol. We protect zv->zv_zilog with zv_suspend_lock rather586* than zv_state_lock so that we don't need to acquire an587* additional lock in this path.588*/589if (zv->zv_zilog == NULL) {590rw_exit(&zv->zv_suspend_lock);591rw_enter(&zv->zv_suspend_lock, RW_WRITER);592if (zv->zv_zilog == NULL) {593zv->zv_zilog = zil_open(zv->zv_objset,594zvol_get_data, &zv->zv_kstat.dk_zil_sums);595zv->zv_flags |= ZVOL_WRITTEN_TO;596/* replay / destroy done in zvol_create_minor */597VERIFY0((zv->zv_zilog->zl_header->zh_flags &598ZIL_REPLAY_NEEDED));599}600rw_downgrade(&zv->zv_suspend_lock);601}602603/*604* We don't want this thread to be blocked waiting for i/o to605* complete, so we instead wait from a taskq callback. The606* i/o may be a ZIL write (via zil_commit()), or a read of an607* indirect block, or a read of a data block (if this is a608* partial-block write). We will indicate that the i/o is609* complete by calling END_IO() from the taskq callback.610*611* This design allows the calling thread to continue and612* initiate more concurrent operations by calling613* zvol_request() again. There are typically only a small614* number of threads available to call zvol_request() (e.g.615* one per iSCSI target), so keeping the latency of616* zvol_request() low is important for performance.617*618* The zvol_request_sync module parameter allows this619* behavior to be altered, for performance evaluation620* purposes. If the callback blocks, setting621* zvol_request_sync=1 will result in much worse performance.622*623* We can have up to zvol_threads concurrent i/o's being624* processed for all zvols on the system. This is typically625* a vast improvement over the zvol_request_sync=1 behavior626* of one i/o at a time per zvol. However, an even better627* design would be for zvol_request() to initiate the zio628* directly, and then be notified by the zio_done callback,629* which would call END_IO(). 
Unfortunately, the DMU/ZIL630* interfaces lack this functionality (they block waiting for631* the i/o to complete).632*/633if (io_is_discard(bio, rq)) {634if (force_sync) {635zvol_discard(&zvr);636} else {637task = zv_request_task_create(zvr);638taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],639zvol_discard_task, task, 0, &task->ent);640}641} else {642if (force_sync) {643zvol_write(&zvr);644} else {645task = zv_request_task_create(zvr);646taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],647zvol_write_task, task, 0, &task->ent);648}649}650} else {651/*652* The SCST driver, and possibly others, may issue READ I/Os653* with a length of zero bytes. These empty I/Os contain no654* data and require no additional handling.655*/656if (size == 0) {657zvol_end_io(bio, rq, 0);658goto out;659}660661rw_enter(&zv->zv_suspend_lock, RW_READER);662663/* See comment in WRITE case above. */664if (force_sync) {665zvol_read(&zvr);666} else {667task = zv_request_task_create(zvr);668taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],669zvol_read_task, task, 0, &task->ent);670}671}672673out:674spl_fstrans_unmark(cookie);675}676677#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS678#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID679static void680zvol_submit_bio(struct bio *bio)681#else682static blk_qc_t683zvol_submit_bio(struct bio *bio)684#endif685#else686static MAKE_REQUEST_FN_RET687zvol_request(struct request_queue *q, struct bio *bio)688#endif689{690#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS691#if defined(HAVE_BIO_BDEV_DISK)692struct request_queue *q = bio->bi_bdev->bd_disk->queue;693#else694struct request_queue *q = bio->bi_disk->queue;695#endif696#endif697zvol_state_t *zv = q->queuedata;698699zvol_request_impl(zv, bio, NULL, 0);700#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \701defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \702!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)703return (BLK_QC_T_NONE);704#endif705}706707static int708#ifdef HAVE_BLK_MODE_T709zvol_open(struct gendisk *disk, blk_mode_t flag)710#else711zvol_open(struct block_device *bdev, fmode_t flag)712#endif713{714zvol_state_t *zv;715int error = 0;716boolean_t drop_suspend = B_FALSE;717#ifndef HAVE_BLKDEV_GET_ERESTARTSYS718hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);719hrtime_t start = gethrtime();720721retry:722#endif723724#ifdef HAVE_BLK_MODE_T725zv = atomic_load_ptr(&disk->private_data);726#else727zv = atomic_load_ptr(&bdev->bd_disk->private_data);728#endif729if (zv == NULL) {730return (-SET_ERROR(ENXIO));731}732733mutex_enter(&zv->zv_state_lock);734if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {735mutex_exit(&zv->zv_state_lock);736return (-SET_ERROR(ENXIO));737}738739/*740* Make sure zvol is not suspended during first open741* (hold zv_suspend_lock) and respect proper lock acquisition742* ordering - zv_suspend_lock before zv_state_lock743*/744if (zv->zv_open_count == 0) {745if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {746mutex_exit(&zv->zv_state_lock);747748/*749* Removal may happen while the locks are down, so750* we can't trust zv any longer; we have to start over.751*/752#ifdef HAVE_BLK_MODE_T753zv = atomic_load_ptr(&disk->private_data);754#else755zv = atomic_load_ptr(&bdev->bd_disk->private_data);756#endif757if (zv == NULL)758return (-SET_ERROR(ENXIO));759760rw_enter(&zv->zv_suspend_lock, RW_READER);761mutex_enter(&zv->zv_state_lock);762763if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {764mutex_exit(&zv->zv_state_lock);765rw_exit(&zv->zv_suspend_lock);766return (-SET_ERROR(ENXIO));767}768769/* check to see if zv_suspend_lock is needed 
*/
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock. However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held. This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock. Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error. For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again. This process can be
		 * repeated safely until both locks are acquired. For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (-SET_ERROR(ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (-SET_ERROR(ERESTARTSYS));

				schedule_timeout_interruptible(
				    MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = -SET_ERROR(EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}

static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	boolean_t drop_suspend = B_TRUE;

	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
	if (zv == NULL)
		return;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);

			/*
			 * Unlike in zvol_open(), we don't check if removal
			 * started here, because we might be one of the openers
			 * that needs to be thrown out!
If we're the last, we898* need to call zvol_last_close() below to finish899* cleanup. So, no special treatment for us.900*/901902/* check to see if zv_suspend_lock is needed */903if (zv->zv_open_count != 1) {904rw_exit(&zv->zv_suspend_lock);905drop_suspend = B_FALSE;906}907}908} else {909drop_suspend = B_FALSE;910}911912ASSERT(MUTEX_HELD(&zv->zv_state_lock));913914zv->zv_open_count--;915if (zv->zv_open_count == 0) {916ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));917zvol_last_close(zv);918}919920mutex_exit(&zv->zv_state_lock);921922if (drop_suspend)923rw_exit(&zv->zv_suspend_lock);924}925926static int927zvol_ioctl(struct block_device *bdev, fmode_t mode,928unsigned int cmd, unsigned long arg)929{930int error = 0;931932zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);933ASSERT3P(zv, !=, NULL);934ASSERT3U(zv->zv_open_count, >, 0);935936switch (cmd) {937case BLKFLSBUF:938#ifdef HAVE_FSYNC_BDEV939fsync_bdev(bdev);940#elif defined(HAVE_SYNC_BLOCKDEV)941sync_blockdev(bdev);942#else943#error "Neither fsync_bdev() nor sync_blockdev() found"944#endif945invalidate_bdev(bdev);946rw_enter(&zv->zv_suspend_lock, RW_READER);947948if (!(zv->zv_flags & ZVOL_RDONLY))949txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);950951rw_exit(&zv->zv_suspend_lock);952break;953954case BLKZNAME:955mutex_enter(&zv->zv_state_lock);956error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);957mutex_exit(&zv->zv_state_lock);958if (error)959error = SET_ERROR(error);960break;961962default:963error = SET_ERROR(ENOTTY);964break;965}966967return (-error);968}969970#ifdef CONFIG_COMPAT971static int972zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,973unsigned cmd, unsigned long arg)974{975return (zvol_ioctl(bdev, mode, cmd, arg));976}977#else978#define zvol_compat_ioctl NULL979#endif980981static unsigned int982zvol_check_events(struct gendisk *disk, unsigned int clearing)983{984unsigned int mask = 0;985986zvol_state_t *zv = atomic_load_ptr(&disk->private_data);987988if (zv != NULL) {989mutex_enter(&zv->zv_state_lock);990mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;991zv->zv_changed = 0;992mutex_exit(&zv->zv_state_lock);993}994995return (mask);996}997998static int999zvol_revalidate_disk(struct gendisk *disk)1000{1001zvol_state_t *zv = atomic_load_ptr(&disk->private_data);10021003if (zv != NULL) {1004mutex_enter(&zv->zv_state_lock);1005set_capacity(zv->zv_zso->zvo_disk,1006zv->zv_volsize >> SECTOR_BITS);1007mutex_exit(&zv->zv_state_lock);1008}10091010return (0);1011}10121013int1014zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)1015{1016struct gendisk *disk = zv->zv_zso->zvo_disk;10171018#if defined(HAVE_REVALIDATE_DISK_SIZE)1019revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);1020#elif defined(HAVE_REVALIDATE_DISK)1021revalidate_disk(disk);1022#else1023zvol_revalidate_disk(disk);1024#endif1025return (0);1026}10271028/*1029* Provide a simple virtual geometry for legacy compatibility. For devices1030* smaller than 1 MiB a small head and sector count is used to allow very1031* tiny devices. 
For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static inline int
zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
{
	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
	sector_t sectors;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
static int
zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
	return (zvol_getgeo_impl(disk, geo));
}
#else
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	return (zvol_getgeo_impl(bdev->bd_disk, geo));
}
#endif
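
/*
 * Worked example (hypothetical size, not from the original source): a 1 GiB
 * zvol is 2097152 512-byte sectors, so the geometry reported above is
 * heads=16, sectors=63, cylinders = 2097152 / (16 * 63) = 2080.
 */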

/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed. However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime. So just create two
 * separate structs to get around this.
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};

static const struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	unsigned int zql_max_hw_sectors;
	unsigned short zql_max_segments;
	unsigned int zql_max_segment_size;
	unsigned int zql_io_opt;
	unsigned int zql_physical_block_size;
	unsigned int zql_max_discard_sectors;
	unsigned int zql_discard_granularity;
} zvol_queue_limits_t;

static void
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
    boolean_t use_blk_mq)
{
	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;

	if (use_blk_mq) {
		/*
		 * IO requests can be really big (1MB). When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed. So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do eight 128k
		 * IOs. This is due to the fact that the thread needs to lock
		 * each volblocksize sized block. So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass eight individual 128k chunks to eight threads
		 * and process the whole write in parallel?" The short answer
		 * is that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on if you
		 * have a read or write heavy workload. Writes typically want
		 * high chunk counts while reads typically want lower ones. On
		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
		 * configuration, with volblocksize=8k, the sweet spot for good
		 * sequential reads and writes was at 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be.
You would think that blk_queue_io_opt() would be1180* used to do this since it is used to "set optimal request1181* size for the queue", but that doesn't seem to do1182* anything - the kernel still gives you huge requests1183* with tons of little PAGE_SIZE segments contained within it.1184*1185* Knowing that the kernel will just give you PAGE_SIZE segments1186* no matter what, you can say "ok, I want PAGE_SIZE byte1187* segments, and I want 'N' of them per request", where N is1188* the correct number of segments for the volblocksize and1189* number of chunks you want.1190*/1191if (zvol_blk_mq_blocks_per_thread != 0) {1192unsigned int chunks;1193chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);11941195limits->zql_max_segment_size = PAGE_SIZE;1196limits->zql_max_segments =1197(zv->zv_volblocksize * chunks) / PAGE_SIZE;1198} else {1199/*1200* Special case: zvol_blk_mq_blocks_per_thread = 01201* Max everything out.1202*/1203limits->zql_max_segments = UINT16_MAX;1204limits->zql_max_segment_size = UINT_MAX;1205}1206} else {1207limits->zql_max_segments = UINT16_MAX;1208limits->zql_max_segment_size = UINT_MAX;1209}12101211limits->zql_io_opt = DMU_MAX_ACCESS / 2;12121213limits->zql_physical_block_size = zv->zv_volblocksize;1214limits->zql_max_discard_sectors =1215(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;1216limits->zql_discard_granularity = zv->zv_volblocksize;1217}12181219#ifdef HAVE_BLK_ALLOC_DISK_2ARG1220static void1221zvol_queue_limits_convert(zvol_queue_limits_t *limits,1222struct queue_limits *qlimits)1223{1224memset(qlimits, 0, sizeof (struct queue_limits));1225qlimits->max_hw_sectors = limits->zql_max_hw_sectors;1226qlimits->max_segments = limits->zql_max_segments;1227qlimits->max_segment_size = limits->zql_max_segment_size;1228qlimits->io_opt = limits->zql_io_opt;1229qlimits->physical_block_size = limits->zql_physical_block_size;1230qlimits->max_discard_sectors = limits->zql_max_discard_sectors;1231qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;1232qlimits->discard_granularity = limits->zql_discard_granularity;1233#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES1234qlimits->features =1235BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;1236#endif1237}1238#endif12391240static void1241zvol_queue_limits_apply(zvol_queue_limits_t *limits,1242struct request_queue *queue)1243{1244#ifndef HAVE_BLK_ALLOC_DISK_2ARG1245blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);1246blk_queue_max_segments(queue, limits->zql_max_segments);1247blk_queue_max_segment_size(queue, limits->zql_max_segment_size);1248blk_queue_io_opt(queue, limits->zql_io_opt);1249blk_queue_physical_block_size(queue, limits->zql_physical_block_size);1250blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);1251blk_queue_discard_granularity(queue, limits->zql_discard_granularity);1252#endif1253#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES1254blk_queue_set_write_cache(queue, B_TRUE);1255blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);1256#endif1257}12581259static int1260zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)1261{1262#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)1263#if defined(HAVE_BLK_ALLOC_DISK)1264zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);1265if (zso->zvo_disk == NULL)1266return (1);12671268zso->zvo_disk->minors = ZVOL_MINORS;1269zso->zvo_queue = zso->zvo_disk->queue;1270#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)1271struct queue_limits qlimits;1272zvol_queue_limits_convert(limits, &qlimits);1273struct gendisk *disk = 
blk_alloc_disk(&qlimits, NUMA_NO_NODE);1274if (IS_ERR(disk)) {1275zso->zvo_disk = NULL;1276return (1);1277}12781279zso->zvo_disk = disk;1280zso->zvo_disk->minors = ZVOL_MINORS;1281zso->zvo_queue = zso->zvo_disk->queue;12821283#else1284zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);1285if (zso->zvo_queue == NULL)1286return (1);12871288zso->zvo_disk = alloc_disk(ZVOL_MINORS);1289if (zso->zvo_disk == NULL) {1290blk_cleanup_queue(zso->zvo_queue);1291return (1);1292}12931294zso->zvo_disk->queue = zso->zvo_queue;1295#endif /* HAVE_BLK_ALLOC_DISK */1296#else1297zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);1298if (zso->zvo_queue == NULL)1299return (1);13001301zso->zvo_disk = alloc_disk(ZVOL_MINORS);1302if (zso->zvo_disk == NULL) {1303blk_cleanup_queue(zso->zvo_queue);1304return (1);1305}13061307zso->zvo_disk->queue = zso->zvo_queue;1308#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */13091310zvol_queue_limits_apply(limits, zso->zvo_queue);13111312return (0);13131314}13151316static int1317zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)1318{1319struct zvol_state_os *zso = zv->zv_zso;13201321/* Allocate our blk-mq tag_set */1322if (zvol_blk_mq_alloc_tag_set(zv) != 0)1323return (1);13241325#if defined(HAVE_BLK_ALLOC_DISK)1326zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);1327if (zso->zvo_disk == NULL) {1328blk_mq_free_tag_set(&zso->tag_set);1329return (1);1330}1331zso->zvo_queue = zso->zvo_disk->queue;1332zso->zvo_disk->minors = ZVOL_MINORS;1333#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)1334struct queue_limits qlimits;1335zvol_queue_limits_convert(limits, &qlimits);1336struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);1337if (IS_ERR(disk)) {1338zso->zvo_disk = NULL;1339blk_mq_free_tag_set(&zso->tag_set);1340return (1);1341}13421343zso->zvo_disk = disk;1344zso->zvo_queue = zso->zvo_disk->queue;1345zso->zvo_disk->minors = ZVOL_MINORS;1346#else1347zso->zvo_disk = alloc_disk(ZVOL_MINORS);1348if (zso->zvo_disk == NULL) {1349blk_cleanup_queue(zso->zvo_queue);1350blk_mq_free_tag_set(&zso->tag_set);1351return (1);1352}1353/* Allocate queue */1354zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);1355if (IS_ERR(zso->zvo_queue)) {1356blk_mq_free_tag_set(&zso->tag_set);1357return (1);1358}13591360/* Our queue is now created, assign it to our disk */1361zso->zvo_disk->queue = zso->zvo_queue;1362#endif13631364zvol_queue_limits_apply(limits, zso->zvo_queue);13651366return (0);1367}13681369/*1370* Allocate memory for a new zvol_state_t and setup the required1371* request queue and generic disk structures for the block device.1372*/1373static int1374zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,1375zvol_state_t **zvp)1376{1377zvol_state_t *zv;1378struct zvol_state_os *zso;1379uint64_t volmode;1380int ret;13811382ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);1383if (ret)1384return (ret);13851386if (volmode == ZFS_VOLMODE_DEFAULT)1387volmode = zvol_volmode;13881389if (volmode == ZFS_VOLMODE_NONE)1390return (0);13911392zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);1393zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);1394zv->zv_zso = zso;1395zv->zv_volmode = volmode;1396zv->zv_volsize = volsize;1397zv->zv_volblocksize = volblocksize;13981399list_link_init(&zv->zv_next);1400mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);1401cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);14021403zv->zv_zso->use_blk_mq = zvol_use_blk_mq;14041405zvol_queue_limits_t 
limits;1406zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);14071408/*1409* The block layer has 3 interfaces for getting BIOs:1410*1411* 1. blk-mq request queues (new)1412* 2. submit_bio() (oldest)1413* 3. regular request queues (old).1414*1415* Each of those interfaces has two permutations:1416*1417* a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates1418* both the disk and its queue (5.14 kernel or newer)1419*1420* b) We don't have blk_*alloc_disk(), and have to allocate the1421* disk and the queue separately. (5.13 kernel or older)1422*/1423if (zv->zv_zso->use_blk_mq) {1424ret = zvol_alloc_blk_mq(zv, &limits);1425if (ret != 0)1426goto out_kmem;1427zso->zvo_disk->fops = &zvol_ops_blk_mq;1428} else {1429ret = zvol_alloc_non_blk_mq(zso, &limits);1430if (ret != 0)1431goto out_kmem;1432zso->zvo_disk->fops = &zvol_ops;1433}14341435/* Limit read-ahead to a single page to prevent over-prefetching. */1436blk_queue_set_read_ahead(zso->zvo_queue, 1);14371438if (!zv->zv_zso->use_blk_mq) {1439/* Disable write merging in favor of the ZIO pipeline. */1440blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);1441}14421443zso->zvo_queue->queuedata = zv;1444zso->zvo_dev = dev;1445zv->zv_open_count = 0;1446strlcpy(zv->zv_name, name, sizeof (zv->zv_name));14471448zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);1449rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);14501451zso->zvo_disk->major = zvol_major;1452zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;14531454/*1455* Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.1456* This is accomplished by limiting the number of minors for the1457* device to one and explicitly disabling partition scanning.1458*/1459if (volmode == ZFS_VOLMODE_DEV) {1460zso->zvo_disk->minors = 1;1461zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;1462zso->zvo_disk->flags |= GENHD_FL_NO_PART;1463}14641465zso->zvo_disk->first_minor = (dev & MINORMASK);1466zso->zvo_disk->private_data = zv;1467snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",1468ZVOL_DEV_NAME, (dev & MINORMASK));14691470*zvp = zv;1471return (ret);14721473out_kmem:1474kmem_free(zso, sizeof (struct zvol_state_os));1475kmem_free(zv, sizeof (zvol_state_t));1476return (ret);1477}14781479void1480zvol_os_remove_minor(zvol_state_t *zv)1481{1482ASSERT(MUTEX_HELD(&zv->zv_state_lock));1483ASSERT0(zv->zv_open_count);1484ASSERT0(atomic_read(&zv->zv_suspend_ref));1485ASSERT(zv->zv_flags & ZVOL_REMOVING);14861487struct zvol_state_os *zso = zv->zv_zso;1488zv->zv_zso = NULL;14891490/* Clearing private_data will make new callers return immediately. */1491atomic_store_ptr(&zso->zvo_disk->private_data, NULL);14921493/*1494* Drop the state lock before calling del_gendisk(). 
There may be
	 * callers waiting to acquire it, but del_gendisk() will block until
	 * they exit, which would deadlock.
	 */
	mutex_exit(&zv->zv_state_lock);

	del_gendisk(zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zso->zvo_disk);
#else
	put_disk(zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zso->zvo_queue);
	put_disk(zso->zvo_disk);
#endif

	if (zso->use_blk_mq)
		blk_mq_free_tag_set(&zso->tag_set);

	ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);

	kmem_free(zso, sizeof (struct zvol_state_os));

	mutex_enter(&zv->zv_state_lock);
}

void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT0P(zv->zv_zso);

	ASSERT0P(zv->zv_objset);
	ASSERT0P(zv->zv_zilog);
	ASSERT0P(zv->zv_dn);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	cv_destroy(&zv->zv_removing_cv);
	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv, sizeof (zvol_state_t));
}
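
/*
 * Note: currently a no-op in the Linux port; see zvol_os_remove_minor()
 * above, where del_gendisk() already blocks until any remaining openers
 * have exited.
 */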
In those cases, it just puts the1618* fput into the system workqueue, which we can force to run with1619* __flush_workqueue(). That is why we call add_disk() from a workqueue - so it1620* run from a kernel thread and "tricks" the fput() codepaths.1621*1622* Note that __flush_workqueue() is slowly getting deprecated. This may be ok1623* though, since our IOCTL will spin on EBUSY waiting for the zvol release (via1624* fput) to happen, which it eventually, naturally, will from the system_wq1625* without us explicitly calling __flush_workqueue().1626*/1627static int1628zvol_os_add_disk(struct gendisk *disk)1629{1630#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */1631struct add_disk_work add_disk_work;16321633INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);1634add_disk_work.disk = disk;1635add_disk_work.error = 0;16361637/* Use *_delayed_work functions since they're not GPL'd */1638schedule_delayed_work(&add_disk_work.work, 0);1639flush_delayed_work(&add_disk_work.work);16401641__flush_workqueue(system_wq);1642return (add_disk_work.error);1643#else /* <= 6.8 kernel */1644return (__zvol_os_add_disk(disk));1645#endif1646}16471648/*1649* Create a block device minor node and setup the linkage between it1650* and the specified volume. Once this function returns the block1651* device is live and ready for use.1652*/1653int1654zvol_os_create_minor(const char *name)1655{1656zvol_state_t *zv = NULL;1657objset_t *os;1658dmu_object_info_t *doi;1659uint64_t volsize;1660uint64_t len;1661unsigned minor = 0;1662int error = 0;1663int idx;1664uint64_t hash = zvol_name_hash(name);1665uint64_t volthreading;1666bool replayed_zil = B_FALSE;16671668if (zvol_inhibit_dev)1669return (0);16701671idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));1672if (idx < 0)1673return (SET_ERROR(-idx));1674minor = idx << ZVOL_MINOR_BITS;1675if (MINOR(minor) != minor) {1676/* too many partitions can cause an overflow */1677zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",1678name, minor, MINOR(minor));1679ida_free(&zvol_ida, idx);1680return (SET_ERROR(EINVAL));1681}16821683zv = zvol_find_by_name_hash(name, hash, RW_NONE);1684if (zv) {1685ASSERT(MUTEX_HELD(&zv->zv_state_lock));1686mutex_exit(&zv->zv_state_lock);1687ida_free(&zvol_ida, idx);1688return (SET_ERROR(EEXIST));1689}16901691doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);16921693error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);1694if (error)1695goto out_doi;16961697error = dmu_object_info(os, ZVOL_OBJ, doi);1698if (error)1699goto out_dmu_objset_disown;17001701error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);1702if (error)1703goto out_dmu_objset_disown;17041705error = zvol_alloc(MKDEV(zvol_major, minor), name,1706volsize, doi->doi_data_block_size, &zv);1707if (error || zv == NULL)1708goto out_dmu_objset_disown;17091710zv->zv_hash = hash;17111712if (dmu_objset_is_snapshot(os))1713zv->zv_flags |= ZVOL_RDONLY;17141715zv->zv_objset = os;17161717/* Default */1718zv->zv_threading = B_TRUE;1719if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)1720== 0)1721zv->zv_threading = volthreading;17221723set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);17241725#ifdef QUEUE_FLAG_DISCARD1726blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);1727#endif1728#ifdef QUEUE_FLAG_NONROT1729blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);1730#endif1731#ifdef QUEUE_FLAG_ADD_RANDOM1732blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);1733#endif1734/* This flag was 
introduced in kernel version 4.12. */1735#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH1736blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);1737#endif17381739ASSERT0P(zv->zv_kstat.dk_kstats);1740error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);1741if (error)1742goto out_dmu_objset_disown;1743ASSERT0P(zv->zv_zilog);1744zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);1745if (spa_writeable(dmu_objset_spa(os))) {1746if (zil_replay_disable)1747replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);1748else1749replayed_zil = zil_replay(os, zv, zvol_replay_vector);1750}1751if (replayed_zil)1752zil_close(zv->zv_zilog);1753zv->zv_zilog = NULL;17541755/*1756* When udev detects the addition of the device it will immediately1757* invoke blkid(8) to determine the type of content on the device.1758* Prefetching the blocks commonly scanned by blkid(8) will speed1759* up this process.1760*/1761len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);1762if (len > 0) {1763dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);1764dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,1765ZIO_PRIORITY_SYNC_READ);1766}17671768zv->zv_objset = NULL;1769out_dmu_objset_disown:1770dmu_objset_disown(os, B_TRUE, FTAG);1771out_doi:1772kmem_free(doi, sizeof (dmu_object_info_t));17731774/*1775* Keep in mind that once add_disk() is called, the zvol is1776* announced to the world, and zvol_open()/zvol_release() can1777* be called at any time. Incidentally, add_disk() itself calls1778* zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()1779* directly as well.1780*/1781if (error == 0 && zv) {1782rw_enter(&zvol_state_lock, RW_WRITER);1783zvol_insert(zv);1784rw_exit(&zvol_state_lock);1785error = zvol_os_add_disk(zv->zv_zso->zvo_disk);1786} else {1787ida_free(&zvol_ida, idx);1788}17891790return (error);1791}17921793int1794zvol_os_rename_minor(zvol_state_t *zv, const char *newname)1795{1796int readonly = get_disk_ro(zv->zv_zso->zvo_disk);17971798ASSERT(RW_LOCK_HELD(&zvol_state_lock));1799ASSERT(MUTEX_HELD(&zv->zv_state_lock));18001801strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));18021803/* move to new hashtable entry */1804zv->zv_hash = zvol_name_hash(newname);1805hlist_del(&zv->zv_hlink);1806hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));18071808/*1809* The block device's read-only state is briefly changed causing1810* a KOBJ_CHANGE uevent to be issued. This ensures udev detects1811* the name change and fixes the symlinks. This does not change1812* ZVOL_RDONLY in zv->zv_flags so the actual read-only state never1813* changes. 
This would normally be done using kobject_uevent() but1814* that is a GPL-only symbol which is why we need this workaround.1815*/1816set_disk_ro(zv->zv_zso->zvo_disk, !readonly);1817set_disk_ro(zv->zv_zso->zvo_disk, readonly);18181819dataset_kstats_rename(&zv->zv_kstat, newname);18201821return (0);1822}18231824void1825zvol_os_set_disk_ro(zvol_state_t *zv, int flags)1826{18271828set_disk_ro(zv->zv_zso->zvo_disk, flags);1829}18301831void1832zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)1833{18341835set_capacity(zv->zv_zso->zvo_disk, capacity);1836}18371838int1839zvol_init(void)1840{1841int error;18421843error = zvol_init_impl();1844if (error) {1845printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);1846return (error);1847}18481849error = -register_blkdev(zvol_major, ZVOL_DRIVER);1850if (error) {1851printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);1852return (SET_ERROR(error));1853}18541855if (zvol_blk_mq_queue_depth == 0) {1856zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;1857} else {1858zvol_actual_blk_mq_queue_depth =1859MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);1860}18611862if (zvol_blk_mq_threads == 0) {1863zvol_blk_mq_actual_threads = num_online_cpus();1864} else {1865zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),18661024);1867}18681869ida_init(&zvol_ida);1870return (0);1871}18721873void1874zvol_fini(void)1875{1876unregister_blkdev(zvol_major, ZVOL_DRIVER);18771878zvol_fini_impl();18791880ida_destroy(&zvol_ida);1881}18821883module_param(zvol_major, uint, 0444);1884MODULE_PARM_DESC(zvol_major, "Major number for zvol device");18851886module_param(zvol_max_discard_blocks, ulong, 0444);1887MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");18881889module_param(zvol_blk_mq_queue_depth, uint, 0644);1890MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");18911892module_param(zvol_use_blk_mq, uint, 0644);1893MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");18941895module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);1896MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,1897"Process volblocksize blocks per thread");18981899#ifndef HAVE_BLKDEV_GET_ERESTARTSYS1900module_param(zvol_open_timeout_ms, uint, 0644);1901MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");1902#endif190319041905
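/*
 * Note (not part of the upstream file): as with any Linux module parameter,
 * the 0644 entries above (zvol_blk_mq_queue_depth, zvol_use_blk_mq,
 * zvol_blk_mq_blocks_per_thread, zvol_open_timeout_ms) are also writable at
 * runtime through sysfs (typically /sys/module/zfs/parameters/), while the
 * 0444 entries (zvol_major, zvol_max_discard_blocks) are read-only and must
 * be set when the module is loaded.
 */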