CoCalc -- vdev

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
⁴⁸⁷⁷⁴ views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
 * CDDL HEADER START
4
 *
5
 * The contents of this file are subject to the terms of the
6
 * Common Development and Distribution License (the "License").
7
 * You may not use this file except in compliance with the License.
8
 *
9
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
 * or https://opensource.org/licenses/CDDL-1.0.
11
 * See the License for the specific language governing permissions
12
 * and limitations under the License.
13
 *
14
 * When distributing Covered Code, include this CDDL HEADER in each
15
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
 * If applicable, add the following below this CDDL HEADER, with the
17
 * fields enclosed by brackets "[]" replaced with your own identifying
18
 * information: Portions Copyright [yyyy] [name of copyright owner]
19
 *
20
 * CDDL HEADER END
21
 */
22
/*
23
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
24
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
25
 * Rewritten for Linux by Brian Behlendorf <[email protected]>.
26
 * LLNL-CODE-403049.
27
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
28
 * Copyright (c) 2023, 2024, 2025, Klara, Inc.
29
 */
30

31
#include <sys/zfs_context.h>
32
#include <sys/spa_impl.h>
33
#include <sys/vdev_disk.h>
34
#include <sys/vdev_impl.h>
35
#include <sys/vdev_trim.h>
36
#include <sys/abd.h>
37
#include <sys/fs/zfs.h>
38
#include <sys/zio.h>
39
#include <linux/blkpg.h>
40
#include <linux/msdos_fs.h>
41
#include <linux/vfs_compat.h>
42
#include <linux/blk-cgroup.h>
43

44
/*
45
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
46
 * block_device. Since it carries the block_device inside, it's convenient to
47
 * just use the handle as a proxy.
48
 *
49
 * Linux 6.9.x uses a file for the same purpose.
50
 *
51
 * For pre-6.8, we just emulate this with a cast, since we don't need any of
52
 * the other fields inside the handle.
53
 */
54
/*
 * Handle type and accessors for the supported block device open APIs.
 * One branch is selected by the HAVE_* feature macros; every branch
 * provides the same BDH_* accessors so callers never depend on the
 * underlying handle representation.
 */
#if defined(HAVE_BDEV_OPEN_BY_PATH)
/* The handle is a struct bdev_handle which carries the block_device. */
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
/* The handle is a struct file; file_bdev() yields the block_device. */
typedef struct file zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		(file_bdev(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#else
/*
 * No separate handle object: the "handle" is the block_device pointer
 * itself, recovered with a cast.  The macro argument is parenthesized so
 * that the cast applies to the whole expression passed in.
 */
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#endif

/* Building an error-valued handle is the same for every handle flavour. */
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
73

74
typedef struct vdev_disk {
75
	zfs_bdev_handle_t		*vd_bdh;
76
	krwlock_t			vd_lock;
77
} vdev_disk_t;
78

79
/*
80
 * Maximum number of segments to add to a bio (min 4). If this is higher than
81
 * the maximum allowed by the device queue or the kernel itself, it will be
82
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
83
 */
84
uint_t zfs_vdev_disk_max_segs = 0;
85

86
/*
87
 * Unique identifier for the exclusive vdev holder.
88
 */
89
static void *zfs_vdev_holder = VDEV_HOLDER;
90

91
/*
92
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
93
 * device is missing. The missing path may be transient since the links
94
 * can be briefly removed and recreated in response to udev events.
95
 */
96
static uint_t zfs_vdev_open_timeout_ms = 1000;
97

98
/*
99
 * Size of the "reserved" partition, in blocks.
100
 */
101
#define	EFI_MIN_RESV_SIZE	(16 * 1024)
102

103
/*
104
 * BIO request failfast mask.
105
 */
106

107
static unsigned int zfs_vdev_failfast_mask = 1;
108

109
/*
110
 * Convert SPA mode flags into bdev open mode flags.
111
 */
112
#ifdef HAVE_BLK_MODE_T
113
typedef blk_mode_t vdev_bdev_mode_t;
114
#define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
115
#define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
116
#define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
117
#define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
118
#else
119
typedef fmode_t vdev_bdev_mode_t;
120
#define	VDEV_BDEV_MODE_READ	FMODE_READ
121
#define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
122
#define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
123
#define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
124
#endif
125

126
static vdev_bdev_mode_t
127
vdev_bdev_mode(spa_mode_t smode)
128
{
129
	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
130
	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));
131

132
	vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;
133

134
	if (smode & SPA_MODE_READ)
135
		bmode |= VDEV_BDEV_MODE_READ;
136

137
	if (smode & SPA_MODE_WRITE)
138
		bmode |= VDEV_BDEV_MODE_WRITE;
139

140
	ASSERT(bmode & VDEV_BDEV_MODE_MASK);
141
	ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);
142

143
	return (bmode);
144
}
145

146
/*
147
 * Returns the usable capacity (in bytes) for the partition or disk.
148
 */
149
static uint64_t
150
bdev_capacity(struct block_device *bdev)
151
{
152
#ifdef HAVE_BDEV_NR_BYTES
153
	return (bdev_nr_bytes(bdev));
154
#else
155
	return (i_size_read(bdev->bd_inode));
156
#endif
157
}
158

159
#if !defined(HAVE_BDEV_WHOLE)
160
static inline struct block_device *
161
bdev_whole(struct block_device *bdev)
162
{
163
	return (bdev->bd_contains);
164
}
165
#endif
166

167
#if defined(HAVE_BDEVNAME)
168
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
169
#else
170
static inline void
171
vdev_bdevname(struct block_device *bdev, char *name)
172
{
173
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
174
}
175
#endif
176

177
/*
178
 * Returns the maximum expansion capacity of the block device (in bytes).
179
 *
180
 * It is possible to expand a vdev when it has been created as a wholedisk
181
 * and the containing block device has increased in capacity.  Or when the
182
 * partition containing the pool has been manually increased in size.
183
 *
184
 * This function is only responsible for calculating the potential expansion
185
 * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
186
 * responsible for verifying the expected partition layout in the wholedisk
187
 * case, and updating the partition table if appropriate.  Once the partition
188
 * size has been increased the additional capacity will be visible using
189
 * bdev_capacity().
190
 *
191
 * The returned maximum expansion capacity is always expected to be larger, or
192
 * at the very least equal, to its usable capacity to prevent overestimating
193
 * the pool expandsize.
194
 */
195
static uint64_t
196
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
197
{
198
	uint64_t psize;
199
	int64_t available;
200

201
	if (wholedisk && bdev != bdev_whole(bdev)) {
202
		/*
203
		 * When reporting maximum expansion capacity for a wholedisk
204
		 * deduct any capacity which is expected to be lost due to
205
		 * alignment restrictions.  Over reporting this value isn't
206
		 * harmful and would only result in slightly less capacity
207
		 * than expected post expansion.
208
		 * The estimated available space may be slightly smaller than
209
		 * bdev_capacity() for devices where the number of sectors is
210
		 * not a multiple of the alignment size and the partition layout
211
		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
212
		 * "reserved" EFI partition: in such cases return the device
213
		 * usable capacity.
214
		 */
215
		available = bdev_capacity(bdev_whole(bdev)) -
216
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
217
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
218
		psize = MAX(available, bdev_capacity(bdev));
219
	} else {
220
		psize = bdev_capacity(bdev);
221
	}
222

223
	return (psize);
224
}
225

226
static void
227
vdev_disk_error(zio_t *zio)
228
{
229
	/*
230
	 * This function can be called in interrupt context, for instance while
231
	 * handling IRQs coming from a misbehaving disk device; use printk()
232
	 * which is safe from any context.
233
	 */
234
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
235
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
236
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
237
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
238
	    zio->io_flags);
239
}
240

241
static void
242
vdev_disk_kobj_evt_post(vdev_t *v)
243
{
244
	vdev_disk_t *vd = v->vdev_tsd;
245
	if (vd && vd->vd_bdh) {
246
		spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
247
	} else {
248
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
249
		    v->vdev_path);
250
	}
251
}
252

253
static zfs_bdev_handle_t *
254
vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
255
{
256
	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);
257

258
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
259
	return (bdev_file_open_by_path(path, bmode, holder, NULL));
260
#elif defined(HAVE_BDEV_OPEN_BY_PATH)
261
	return (bdev_open_by_path(path, bmode, holder, NULL));
262
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
263
	return (blkdev_get_by_path(path, bmode, holder, NULL));
264
#else
265
	return (blkdev_get_by_path(path, bmode, holder));
266
#endif
267
}
268

269
static void
270
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
271
{
272
#if defined(HAVE_BDEV_RELEASE)
273
	return (bdev_release(bdh));
274
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
275
	return (blkdev_put(BDH_BDEV(bdh), holder));
276
#elif defined(HAVE_BLKDEV_PUT)
277
	return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
278
#else
279
	fput(bdh);
280
#endif
281
}
282

283
static int
284
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
285
    uint64_t *logical_ashift, uint64_t *physical_ashift)
286
{
287
	zfs_bdev_handle_t *bdh;
288
	spa_mode_t smode = spa_mode(v->vdev_spa);
289
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
290
	vdev_disk_t *vd;
291

292
	/* Must have a pathname and it must be absolute. */
293
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
294
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
295
		vdev_dbgmsg(v, "invalid vdev_path");
296
		return (SET_ERROR(EINVAL));
297
	}
298

299
	/*
300
	 * Reopen the device if it is currently open.  When expanding a
301
	 * partition force re-scanning the partition table if userland
302
	 * did not take care of this already. We need to do this while closed
303
	 * in order to get an accurate updated block device size.  Then
304
	 * since udev may need to recreate the device links increase the
305
	 * open retry timeout before reporting the device as unavailable.
306
	 */
307
	vd = v->vdev_tsd;
308
	if (vd) {
309
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
310
		boolean_t reread_part = B_FALSE;
311

312
		rw_enter(&vd->vd_lock, RW_WRITER);
313
		bdh = vd->vd_bdh;
314
		vd->vd_bdh = NULL;
315

316
		if (bdh) {
317
			struct block_device *bdev = BDH_BDEV(bdh);
318
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
319
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
320
				/*
321
				 * If userland has BLKPG_RESIZE_PARTITION,
322
				 * then it should have updated the partition
323
				 * table already. We can detect this by
324
				 * comparing our current physical size
325
				 * with that of the device. If they are
326
				 * the same, then we must not have
327
				 * BLKPG_RESIZE_PARTITION or it failed to
328
				 * update the partition table online. We
329
				 * fallback to rescanning the partition
330
				 * table from the kernel below. However,
331
				 * if the capacity already reflects the
332
				 * updated partition, then we skip
333
				 * rescanning the partition table here.
334
				 */
335
				if (v->vdev_psize == bdev_capacity(bdev))
336
					reread_part = B_TRUE;
337
			}
338

339
			vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
340
		}
341

342
		if (reread_part) {
343
			bdh = vdev_blkdev_get_by_path(disk_name, smode,
344
			    zfs_vdev_holder);
345
			if (!BDH_IS_ERR(bdh)) {
346
				int error =
347
				    vdev_bdev_reread_part(BDH_BDEV(bdh));
348
				vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
349
				if (error == 0) {
350
					timeout = MSEC2NSEC(
351
					    zfs_vdev_open_timeout_ms * 2);
352
				}
353
			}
354
		}
355
	} else {
356
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
357

358
		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
359
		rw_enter(&vd->vd_lock, RW_WRITER);
360
	}
361

362
	/*
363
	 * Devices are always opened by the path provided at configuration
364
	 * time.  This means that if the provided path is a udev by-id path
365
	 * then drives may be re-cabled without an issue.  If the provided
366
	 * path is a udev by-path path, then the physical location information
367
	 * will be preserved.  This can be critical for more complicated
368
	 * configurations where drives are located in specific physical
369
	 * locations to maximize the systems tolerance to component failure.
370
	 *
371
	 * Alternatively, you can provide your own udev rule to flexibly map
372
	 * the drives as you see fit.  It is not advised that you use the
373
	 * /dev/[hd]d devices which may be reordered due to probing order.
374
	 * Devices in the wrong locations will be detected by the higher
375
	 * level vdev validation.
376
	 *
377
	 * The specified paths may be briefly removed and recreated in
378
	 * response to udev events.  This should be exceptionally unlikely
379
	 * because the zpool command makes every effort to verify these paths
380
	 * have already settled prior to reaching this point.  Therefore,
381
	 * a ENOENT failure at this point is highly likely to be transient
382
	 * and it is reasonable to sleep and retry before giving up.  In
383
	 * practice delays have been observed to be on the order of 100ms.
384
	 *
385
	 * When ERESTARTSYS is returned it indicates the block device is
386
	 * a zvol which could not be opened due to the deadlock detection
387
	 * logic in zvol_open().  Extend the timeout and retry the open
388
	 * subsequent attempts are expected to eventually succeed.
389
	 */
390
	hrtime_t start = gethrtime();
391
	bdh = BDH_ERR_PTR(-ENXIO);
392
	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
393
		bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
394
		    zfs_vdev_holder);
395
		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
396
			/*
397
			 * There is no point of waiting since device is removed
398
			 * explicitly
399
			 */
400
			if (v->vdev_removed)
401
				break;
402

403
			schedule_timeout_interruptible(MSEC_TO_TICK(10));
404
		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
405
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
406
			continue;
407
		} else if (BDH_IS_ERR(bdh)) {
408
			break;
409
		}
410
	}
411

412
	if (BDH_IS_ERR(bdh)) {
413
		int error = -BDH_PTR_ERR(bdh);
414
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
415
		    (u_longlong_t)(gethrtime() - start),
416
		    (u_longlong_t)timeout);
417
		vd->vd_bdh = NULL;
418
		v->vdev_tsd = vd;
419
		rw_exit(&vd->vd_lock);
420
		return (SET_ERROR(error));
421
	} else {
422
		vd->vd_bdh = bdh;
423
		v->vdev_tsd = vd;
424
		rw_exit(&vd->vd_lock);
425
	}
426

427
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
428

429
	/*  Determine the physical block size */
430
	int physical_block_size = bdev_physical_block_size(bdev);
431

432
	/*  Determine the logical block size */
433
	int logical_block_size = bdev_logical_block_size(bdev);
434

435
	/*
436
	 * If the device has a write cache, clear the nowritecache flag,
437
	 * so that we start issuing flush requests again.
438
	 */
439
	v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);
440

441
	/* Set when device reports it supports TRIM. */
442
	v->vdev_has_trim = bdev_discard_supported(bdev);
443

444
	/* Set when device reports it supports secure TRIM. */
445
	v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);
446

447
	/* Inform the ZIO pipeline that we are non-rotational */
448
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
449

450
	/* Physical volume size in bytes for the partition */
451
	*psize = bdev_capacity(bdev);
452

453
	/* Physical volume size in bytes including possible expansion space */
454
	*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);
455

456
	/* Based on the minimum sector size set the block size */
457
	*physical_ashift = highbit64(MAX(physical_block_size,
458
	    SPA_MINBLOCKSIZE)) - 1;
459

460
	*logical_ashift = highbit64(MAX(logical_block_size,
461
	    SPA_MINBLOCKSIZE)) - 1;
462

463
	return (0);
464
}
465

466
static void
467
vdev_disk_close(vdev_t *v)
468
{
469
	vdev_disk_t *vd = v->vdev_tsd;
470

471
	if (v->vdev_reopening || vd == NULL)
472
		return;
473

474
	rw_enter(&vd->vd_lock, RW_WRITER);
475

476
	if (vd->vd_bdh != NULL)
477
		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
478
		    zfs_vdev_holder);
479

480
	v->vdev_tsd = NULL;
481

482
	rw_exit(&vd->vd_lock);
483
	rw_destroy(&vd->vd_lock);
484
	kmem_free(vd, sizeof (vdev_disk_t));
485
}
486

487
/*
 * preempt_schedule_notrace is GPL-only, which breaks the ZFS build.  On
 * the configuration below it is redirected to preempt_schedule instead.
 */
#if defined(CONFIG_BLK_CGROUP) && \
    defined(CONFIG_PREEMPTION) && \
    defined(CONFIG_ARM64)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif
496

497
/*
498
 * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct
499
 * as an argument removing the need to set it with bio_set_dev().  This
500
 * removes the need for all of the following compatibility code.
501
 */
502
#if !defined(HAVE_BIO_ALLOC_4ARG)
503

504
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
505
/*
506
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
507
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
508
 * As a side effect the function was converted to GPL-only.  Define our
509
 * own version when needed which uses rcu_read_lock_sched().
510
 *
511
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
512
 * part, moving blkg_tryget into the private one. Define our own version.
513
 */
514
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
515
static inline bool
516
vdev_blkg_tryget(struct blkcg_gq *blkg)
517
{
518
	struct percpu_ref *ref = &blkg->refcnt;
519
	unsigned long __percpu *count;
520
	bool rc;
521

522
	rcu_read_lock_sched();
523

524
	if (__ref_is_percpu(ref, &count)) {
525
		this_cpu_inc(*count);
526
		rc = true;
527
	} else {
528
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
529
		rc = atomic_long_inc_not_zero(&ref->data->count);
530
#else
531
		rc = atomic_long_inc_not_zero(&ref->count);
532
#endif
533
	}
534

535
	rcu_read_unlock_sched();
536

537
	return (rc);
538
}
539
#else
540
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
541
#endif
542
#ifdef HAVE_BIO_SET_DEV_MACRO
543
/*
544
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
545
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
546
 * the entire macro.  Provide a minimal version which always assigns the
547
 * request queue's root_blkg to the bio.
548
 */
549
static inline void
550
vdev_bio_associate_blkg(struct bio *bio)
551
{
552
#if defined(HAVE_BIO_BDEV_DISK)
553
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
554
#else
555
	struct request_queue *q = bio->bi_disk->queue;
556
#endif
557

558
	ASSERT3P(q, !=, NULL);
559
	ASSERT0P(bio->bi_blkg);
560

561
	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
562
		bio->bi_blkg = q->root_blkg;
563
}
564

565
#define	bio_associate_blkg vdev_bio_associate_blkg
566
#else
567
static inline void
568
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
569
{
570
#if defined(HAVE_BIO_BDEV_DISK)
571
	struct request_queue *q = bdev->bd_disk->queue;
572
#else
573
	struct request_queue *q = bio->bi_disk->queue;
574
#endif
575
	bio_clear_flag(bio, BIO_REMAPPED);
576
	if (bio->bi_bdev != bdev)
577
		bio_clear_flag(bio, BIO_THROTTLED);
578
	bio->bi_bdev = bdev;
579

580
	ASSERT3P(q, !=, NULL);
581
	ASSERT0P(bio->bi_blkg);
582

583
	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
584
		bio->bi_blkg = q->root_blkg;
585
}
586
#define	bio_set_dev		vdev_bio_set_dev
587
#endif
588
#endif
589
#endif /* !HAVE_BIO_ALLOC_4ARG */
590

591
static inline void
592
vdev_submit_bio(struct bio *bio)
593
{
594
	struct bio_list *bio_list = current->bio_list;
595
	current->bio_list = NULL;
596
	(void) submit_bio(bio);
597
	current->bio_list = bio_list;
598
}
599

600
static inline struct bio *
601
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
602
    unsigned short nr_vecs)
603
{
604
	struct bio *bio;
605

606
#ifdef HAVE_BIO_ALLOC_4ARG
607
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
608
#else
609
	bio = bio_alloc(gfp_mask, nr_vecs);
610
	if (likely(bio != NULL))
611
		bio_set_dev(bio, bdev);
612
#endif
613

614
	return (bio);
615
}
616

617
static inline uint_t
618
vdev_bio_max_segs(struct block_device *bdev)
619
{
620
	/*
621
	 * Smallest of the device max segs and the tunable max segs. Minimum
622
	 * 4, so there's room to finish split pages if they come up.
623
	 */
624
	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
625
	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
626
	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
627
	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
628

629
#ifdef HAVE_BIO_MAX_SEGS
630
	return (bio_max_segs(max_segs));
631
#else
632
	return (MIN(max_segs, BIO_MAX_PAGES));
633
#endif
634
}
635

636
static inline uint_t
637
vdev_bio_max_bytes(struct block_device *bdev)
638
{
639
	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
640
}
641

642

643
/*
644
 * Virtual block IO object (VBIO)
645
 *
646
 * Linux block IO (BIO) objects have a limit on how many data segments (pages)
647
 * they can hold. Depending on how they're allocated and structured, a large
648
 * ZIO can require more than one BIO to be submitted to the kernel, which then
649
 * all have to complete before we can return the completed ZIO back to ZFS.
650
 *
651
 * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
652
 * translate a ZIO down into the kernel block layer and back again.
653
 *
654
 * Note that these are only used for data ZIOs (read/write). Meta-operations
655
 * (flush/trim) don't need multiple BIOs and so can just make the call
656
 * directly.
657
 */
658
typedef struct {
659
	zio_t		*vbio_zio;	/* parent zio */
660

661
	struct block_device *vbio_bdev;	/* blockdev to submit bios to */
662

663
	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */
664

665
	uint_t		vbio_max_segs;	/* max segs per bio */
666

667
	uint_t		vbio_max_bytes;	/* max bytes per bio */
668
	uint_t		vbio_lbs_mask;	/* logical block size mask */
669

670
	uint64_t	vbio_offset;	/* start offset of next bio */
671

672
	struct bio	*vbio_bio;	/* pointer to the current bio */
673
	int		vbio_flags;	/* bio flags */
674
} vbio_t;
675

676
static vbio_t *
677
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
678
{
679
	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
680

681
	vbio->vbio_zio = zio;
682
	vbio->vbio_bdev = bdev;
683
	vbio->vbio_abd = NULL;
684
	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
685
	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
686
	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
687
	vbio->vbio_offset = zio->io_offset;
688
	vbio->vbio_bio = NULL;
689
	vbio->vbio_flags = flags;
690

691
	return (vbio);
692
}
693

694
static void vbio_completion(struct bio *bio);
695

696
static int
697
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
698
{
699
	struct bio *bio = vbio->vbio_bio;
700
	uint_t ssize;
701

702
	while (size > 0) {
703
		if (bio == NULL) {
704
			/* New BIO, allocate and set up */
705
			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
706
			    vbio->vbio_max_segs);
707
			VERIFY(bio);
708

709
			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
710
			bio_set_op_attrs(bio,
711
			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
712
			    WRITE : READ, vbio->vbio_flags);
713

714
			if (vbio->vbio_bio) {
715
				bio_chain(vbio->vbio_bio, bio);
716
				vdev_submit_bio(vbio->vbio_bio);
717
			}
718
			vbio->vbio_bio = bio;
719
		}
720

721
		/*
722
		 * Only load as much of the current page data as will fit in
723
		 * the space left in the BIO, respecting lbs alignment. Older
724
		 * kernels will error if we try to overfill the BIO, while
725
		 * newer ones will accept it and split the BIO. This ensures
726
		 * everything works on older kernels, and avoids an additional
727
		 * overhead on the new.
728
		 */
729
		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
730
		    vbio->vbio_lbs_mask);
731
		if (ssize > 0 &&
732
		    bio_add_page(bio, page, ssize, offset) == ssize) {
733
			/* Accepted, adjust and load any remaining. */
734
			size -= ssize;
735
			offset += ssize;
736
			continue;
737
		}
738

739
		/* No room, set up for a new BIO and loop */
740
		vbio->vbio_offset += BIO_BI_SIZE(bio);
741

742
		/* Signal new BIO allocation wanted */
743
		bio = NULL;
744
	}
745

746
	return (0);
747
}
748

749
/* Iterator callback to submit ABD pages to the vbio. */
750
static int
751
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
752
{
753
	vbio_t *vbio = priv;
754
	return (vbio_add_page(vbio, page, len, off));
755
}
756

757
/* Create some BIOs, fill them with data and submit them */
758
static void
759
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
760
{
761
	/*
762
	 * We plug so we can submit the BIOs as we go and only unplug them when
763
	 * they are fully created and submitted. This is important; if we don't
764
	 * plug, then the kernel may start executing earlier BIOs while we're
765
	 * still creating and executing later ones, and if the device goes
766
	 * away while that's happening, older kernels can get confused and
767
	 * trample memory.
768
	 */
769
	struct blk_plug plug;
770
	blk_start_plug(&plug);
771

772
	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
773
	ASSERT(vbio->vbio_bio);
774

775
	vbio->vbio_bio->bi_end_io = vbio_completion;
776
	vbio->vbio_bio->bi_private = vbio;
777

778
	/*
779
	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
780
	 * can't touch it again. The bio may complete and vbio_completion() be
781
	 * called and free the vbio before this task is run again, so we must
782
	 * consider it invalid from this point.
783
	 */
784
	vdev_submit_bio(vbio->vbio_bio);
785

786
	blk_finish_plug(&plug);
787
}
788

789
/* IO completion callback */
790
static void
791
vbio_completion(struct bio *bio)
792
{
793
	vbio_t *vbio = bio->bi_private;
794
	zio_t *zio = vbio->vbio_zio;
795

796
	ASSERT(zio);
797

798
	/* Capture and log any errors */
799
	zio->io_error = bi_status_to_errno(bio->bi_status);
800
	ASSERT3U(zio->io_error, >=, 0);
801

802
	if (zio->io_error)
803
		vdev_disk_error(zio);
804

805
	/* Return the BIO to the kernel */
806
	bio_put(bio);
807

808
	/*
809
	 * We're likely in an interrupt context so we can't do ABD/memory work
810
	 * here; instead we stash vbio on the zio and take care of it in the
811
	 * done callback.
812
	 */
813
	ASSERT0P(zio->io_bio);
814
	zio->io_bio = vbio;
815

816
	zio_delay_interrupt(zio);
817
}
818

819
/*
820
 * Iterator callback to count ABD pages and check their size & alignment.
821
 *
822
 * On Linux, each BIO segment can take a page pointer, and an offset+length of
823
 * the data within that page. A page can be arbitrarily large ("compound"
824
 * pages) but we still have to ensure the data portion is correctly sized and
825
 * aligned to the logical block size, to ensure that if the kernel wants to
826
 * split the BIO, the two halves will still be properly aligned.
827
 *
828
 * NOTE: if you change this function, change the copy in
829
 * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
830
 * data there to validate the change you're making.
831
 */
832
typedef struct {
	/* Logical block size every fragment is checked against. */
	size_t	blocksize;
	/* Non-zero once the first fragment has been accepted. */
	int	seen_first;
	/* Non-zero once a fragment ended short of a page boundary. */
	int	seen_last;
} vdev_disk_check_alignment_t;
837

838
static int
839
vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
840
    void *priv)
841
{
842
	(void) page;
843
	vdev_disk_check_alignment_t *s = priv;
844

845
	/*
846
	 * The cardinal rule: a single on-disk block must never cross an
847
	 * physical (order-0) page boundary, as the kernel expects to be able
848
	 * to split at both LBS and page boundaries.
849
	 *
850
	 * This implies various alignment rules for the blocks in this
851
	 * (possibly compound) page, which we can check for.
852
	 */
853

854
	/*
855
	 * If the previous page did not end on a page boundary, then we
856
	 * can't proceed without creating a hole.
857
	 */
858
	if (s->seen_last)
859
		return (1);
860

861
	/* This page must contain only whole LBS-sized blocks. */
862
	if (!IS_P2ALIGNED(len, s->blocksize))
863
		return (1);
864

865
	/*
866
	 * If this is not the first page in the ABD, then the data must start
867
	 * on a page-aligned boundary (so the kernel can split on page
868
	 * boundaries without having to deal with a hole). If it is, then
869
	 * it can start on LBS-alignment.
870
	 */
871
	if (s->seen_first) {
872
		if (!IS_P2ALIGNED(off, PAGESIZE))
873
			return (1);
874
	} else {
875
		if (!IS_P2ALIGNED(off, s->blocksize))
876
			return (1);
877
		s->seen_first = 1;
878
	}
879

880
	/*
881
	 * If this data does not end on a page-aligned boundary, then this
882
	 * must be the last page in the ABD, for the same reason.
883
	 */
884
	s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);
885

886
	return (0);
887
}
888

889
/*
890
 * Check if we can submit the pages in this ABD to the kernel as-is. Returns
891
 * the number of pages, or 0 if it can't be submitted like this.
892
 */
893
static boolean_t
894
vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev)
895
{
896
	vdev_disk_check_alignment_t s = {
897
	    .blocksize = bdev_logical_block_size(bdev),
898
	};
899

900
	if (abd_iterate_page_func(abd, 0, size,
901
	    vdev_disk_check_alignment_cb, &s))
902
		return (B_FALSE);
903

904
	return (B_TRUE);
905
}
906

907
static int
908
vdev_disk_io_rw(zio_t *zio)
909
{
910
	vdev_t *v = zio->io_vd;
911
	vdev_disk_t *vd = v->vdev_tsd;
912
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
913
	int flags = 0;
914

915
	/*
916
	 * Accessing outside the block device is never allowed.
917
	 */
918
	if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) {
919
		vdev_dbgmsg(zio->io_vd,
920
		    "Illegal access %llu size %llu, device size %llu",
921
		    (u_longlong_t)zio->io_offset,
922
		    (u_longlong_t)zio->io_size,
923
		    (u_longlong_t)bdev_capacity(bdev));
924
		return (SET_ERROR(EIO));
925
	}
926

927
	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
928
	    v->vdev_failfast == B_TRUE) {
929
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
930
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
931
	}
932

933
	/*
934
	 * Check alignment of the incoming ABD. If any part of it would require
935
	 * submitting a page that is not aligned to both the logical block size
936
	 * and the page size, then we take a copy into a new memory region with
937
	 * correct alignment.  This should be impossible on a 512b LBS. On
938
	 * larger blocks, this can happen at least when a small number of
939
	 * blocks (usually 1) are allocated from a shared slab, or when
940
	 * abnormally-small data regions (eg gang headers) are mixed into the
941
	 * same ABD as larger allocations (eg aggregations).
942
	 */
943
	abd_t *abd = zio->io_abd;
944
	if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) {
945
		/* Allocate a new memory region with guaranteed alignment */
946
		abd = abd_alloc_for_io(zio->io_size,
947
		    zio->io_abd->abd_flags & ABD_FLAG_META);
948

949
		/* If we're writing copy our data into it */
950
		if (zio->io_type == ZIO_TYPE_WRITE)
951
			abd_copy(abd, zio->io_abd, zio->io_size);
952

953
		/*
954
		 * False here would mean the new allocation has an invalid
955
		 * alignment too, which would mean that abd_alloc() is not
956
		 * guaranteeing this, or our logic in
957
		 * vdev_disk_check_alignment() is wrong. In either case,
958
		 * something in seriously wrong and its not safe to continue.
959
		 */
960
		VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev));
961
	}
962

963
	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
964
	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
965
	if (abd != zio->io_abd)
966
		vbio->vbio_abd = abd;
967

968
	/* Fill it with data pages and submit it to the kernel */
969
	vbio_submit(vbio, abd, zio->io_size);
970
	return (0);
971
}
972

973
static void
974
vdev_disk_io_flush_completion(struct bio *bio)
975
{
976
	zio_t *zio = bio->bi_private;
977
	zio->io_error = bi_status_to_errno(bio->bi_status);
978
	if (zio->io_error == EOPNOTSUPP || zio->io_error == ENOTTY)
979
		zio->io_error = SET_ERROR(ENOTSUP);
980

981
	bio_put(bio);
982
	ASSERT3S(zio->io_error, >=, 0);
983
	if (zio->io_error)
984
		vdev_disk_error(zio);
985
	zio_interrupt(zio);
986
}
987

988
static int
989
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
990
{
991
	struct request_queue *q;
992
	struct bio *bio;
993

994
	q = bdev_get_queue(bdev);
995
	if (!q)
996
		return (SET_ERROR(ENXIO));
997

998
	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
999
	if (unlikely(bio == NULL))
1000
		return (SET_ERROR(ENOMEM));
1001

1002
	bio->bi_end_io = vdev_disk_io_flush_completion;
1003
	bio->bi_private = zio;
1004
	bio_set_flush(bio);
1005
	vdev_submit_bio(bio);
1006
	invalidate_bdev(bdev);
1007

1008
	return (0);
1009
}
1010

1011
static void
1012
vdev_disk_discard_end_io(struct bio *bio)
1013
{
1014
	zio_t *zio = bio->bi_private;
1015
	zio->io_error = bi_status_to_errno(bio->bi_status);
1016

1017
	bio_put(bio);
1018
	if (zio->io_error)
1019
		vdev_disk_error(zio);
1020
	zio_interrupt(zio);
1021
}
1022

1023
/*
1024
 * Wrappers for the different secure erase and discard APIs. We use async
1025
 * when available; in this case, *biop is set to the last bio in the chain.
1026
 */
1027
static int
1028
vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
1029
    sector_t nsect, struct bio **biop)
1030
{
1031
	*biop = NULL;
1032
	int error;
1033

1034
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
1035
	error = blkdev_issue_secure_erase(BDH_BDEV(bdh),
1036
	    sector, nsect, GFP_NOFS);
1037
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
1038
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
1039
	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
1040
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
1041
	error = blkdev_issue_discard(BDH_BDEV(bdh),
1042
	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
1043
#else
1044
#error "unsupported kernel"
1045
#endif
1046

1047
	return (error);
1048
}
1049

1050
static int
1051
vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
1052
    sector_t nsect, struct bio **biop)
1053
{
1054
	*biop = NULL;
1055
	int error;
1056

1057
#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
1058
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
1059
	    sector, nsect, GFP_NOFS, 0, biop);
1060
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
1061
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
1062
	    sector, nsect, GFP_NOFS, biop);
1063
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
1064
	error = blkdev_issue_discard(BDH_BDEV(bdh),
1065
	    sector, nsect, GFP_NOFS, 0);
1066
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
1067
	error = blkdev_issue_discard(BDH_BDEV(bdh),
1068
	    sector, nsect, GFP_NOFS);
1069
#else
1070
#error "unsupported kernel"
1071
#endif
1072

1073
	return (error);
1074
}
1075

1076
/*
1077
 * Entry point for TRIM ops. This calls the right wrapper for secure erase or
1078
 * discard, and then does the appropriate finishing work for error vs success
1079
 * and async vs sync.
1080
 */
1081
static int
1082
vdev_disk_io_trim(zio_t *zio)
1083
{
1084
	int error;
1085
	struct bio *bio;
1086

1087
	zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh;
1088
	sector_t sector = zio->io_offset >> 9;
1089
	sector_t nsects = zio->io_size >> 9;
1090

1091
	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
1092
		error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio);
1093
	else
1094
		error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio);
1095

1096
	if (error != 0)
1097
		return (SET_ERROR(-error));
1098

1099
	if (bio == NULL) {
1100
		/*
1101
		 * This was a synchronous op that completed successfully, so
1102
		 * return it to ZFS immediately.
1103
		 */
1104
		zio_interrupt(zio);
1105
	} else {
1106
		/*
1107
		 * This was an asynchronous op; set up completion callback and
1108
		 * issue it.
1109
		 */
1110
		bio->bi_private = zio;
1111
		bio->bi_end_io = vdev_disk_discard_end_io;
1112
		vdev_submit_bio(bio);
1113
	}
1114

1115
	return (0);
1116
}
1117

1118
static void
1119
vdev_disk_io_start(zio_t *zio)
1120
{
1121
	vdev_t *v = zio->io_vd;
1122
	vdev_disk_t *vd = v->vdev_tsd;
1123
	int error;
1124

1125
	/*
1126
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
1127
	 * Nothing to be done here but return failure.
1128
	 */
1129
	if (vd == NULL) {
1130
		zio->io_error = ENXIO;
1131
		zio_interrupt(zio);
1132
		return;
1133
	}
1134

1135
	rw_enter(&vd->vd_lock, RW_READER);
1136

1137
	/*
1138
	 * If the vdev is closed, it's likely due to a failed reopen and is
1139
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
1140
	 */
1141
	if (vd->vd_bdh == NULL) {
1142
		rw_exit(&vd->vd_lock);
1143
		zio->io_error = ENXIO;
1144
		zio_interrupt(zio);
1145
		return;
1146
	}
1147

1148
	switch (zio->io_type) {
1149
	case ZIO_TYPE_FLUSH:
1150

1151
		if (!vdev_readable(v)) {
1152
			/* Drive not there, can't flush */
1153
			error = SET_ERROR(ENXIO);
1154
		} else if (zfs_nocacheflush) {
1155
			/* Flushing disabled by operator, declare success */
1156
			error = 0;
1157
		} else if (v->vdev_nowritecache) {
1158
			/* This vdev not capable of flushing */
1159
			error = SET_ERROR(ENOTSUP);
1160
		} else {
1161
			/*
1162
			 * Issue the flush. If successful, the response will
1163
			 * be handled in the completion callback, so we're done.
1164
			 */
1165
			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
1166
			if (error == 0) {
1167
				rw_exit(&vd->vd_lock);
1168
				return;
1169
			}
1170
		}
1171

1172
		/* Couldn't issue the flush, so set the error and return it */
1173
		rw_exit(&vd->vd_lock);
1174
		zio->io_error = error;
1175
		zio_execute(zio);
1176
		return;
1177

1178
	case ZIO_TYPE_TRIM:
1179
		error = vdev_disk_io_trim(zio);
1180
		rw_exit(&vd->vd_lock);
1181
		if (error) {
1182
			zio->io_error = error;
1183
			zio_execute(zio);
1184
		}
1185
		return;
1186

1187
	case ZIO_TYPE_READ:
1188
	case ZIO_TYPE_WRITE:
1189
		zio->io_target_timestamp = zio_handle_io_delay(zio);
1190
		error = vdev_disk_io_rw(zio);
1191
		rw_exit(&vd->vd_lock);
1192
		if (error) {
1193
			zio->io_error = error;
1194
			zio_interrupt(zio);
1195
		}
1196
		return;
1197

1198
	default:
1199
		/*
1200
		 * Getting here means our parent vdev has made a very strange
1201
		 * request of us, and shouldn't happen. Assert here to force a
1202
		 * crash in dev builds, but in production return the IO
1203
		 * unhandled. The pool will likely suspend anyway but that's
1204
		 * nicer than crashing the kernel.
1205
		 */
1206
		ASSERT3S(zio->io_type, ==, -1);
1207

1208
		rw_exit(&vd->vd_lock);
1209
		zio->io_error = SET_ERROR(ENOTSUP);
1210
		zio_interrupt(zio);
1211
		return;
1212
	}
1213

1214
	__builtin_unreachable();
1215
}
1216

1217
static void
1218
vdev_disk_io_done(zio_t *zio)
1219
{
1220
	/* If this was a read or write, we need to clean up the vbio */
1221
	if (zio->io_bio != NULL) {
1222
		vbio_t *vbio = zio->io_bio;
1223
		zio->io_bio = NULL;
1224

1225
		/*
1226
		 * If we copied the ABD before issuing it, clean up and return
1227
		 * the copy to the ADB, with changes if appropriate.
1228
		 */
1229
		if (vbio->vbio_abd != NULL) {
1230
			if (zio->io_type == ZIO_TYPE_READ)
1231
				abd_copy(zio->io_abd, vbio->vbio_abd,
1232
				    zio->io_size);
1233

1234
			abd_free(vbio->vbio_abd);
1235
			vbio->vbio_abd = NULL;
1236
		}
1237

1238
		/* Final cleanup */
1239
		kmem_free(vbio, sizeof (vbio_t));
1240
	}
1241

1242
	/*
1243
	 * If the device returned EIO, we revalidate the media.  If it is
1244
	 * determined the media has changed this triggers the asynchronous
1245
	 * removal of the device from the configuration.
1246
	 */
1247
	if (zio->io_error == EIO) {
1248
		vdev_t *v = zio->io_vd;
1249
		vdev_disk_t *vd = v->vdev_tsd;
1250

1251
		if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
1252
			invalidate_bdev(BDH_BDEV(vd->vd_bdh));
1253
			v->vdev_remove_wanted = B_TRUE;
1254
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
1255
		}
1256
	}
1257
}
1258

1259
static void
1260
vdev_disk_hold(vdev_t *vd)
1261
{
1262
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
1263

1264
	/* We must have a pathname, and it must be absolute. */
1265
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
1266
		return;
1267

1268
	/*
1269
	 * Only prefetch path and devid info if the device has
1270
	 * never been opened.
1271
	 */
1272
	if (vd->vdev_tsd != NULL)
1273
		return;
1274

1275
}
1276

1277
static void
1278
vdev_disk_rele(vdev_t *vd)
1279
{
1280
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
1281

1282
	/* XXX: Implement me as a vnode rele for the device */
1283
}
1284

1285
vdev_ops_t vdev_disk_ops = {
1286
	.vdev_op_init = NULL,
1287
	.vdev_op_fini = NULL,
1288
	.vdev_op_open = vdev_disk_open,
1289
	.vdev_op_close = vdev_disk_close,
1290
	.vdev_op_asize_to_psize = vdev_default_psize,
1291
	.vdev_op_psize_to_asize = vdev_default_asize,
1292
	.vdev_op_min_asize = vdev_default_min_asize,
1293
	.vdev_op_min_alloc = NULL,
1294
	.vdev_op_io_start = vdev_disk_io_start,
1295
	.vdev_op_io_done = vdev_disk_io_done,
1296
	.vdev_op_state_change = NULL,
1297
	.vdev_op_need_resilver = NULL,
1298
	.vdev_op_hold = vdev_disk_hold,
1299
	.vdev_op_rele = vdev_disk_rele,
1300
	.vdev_op_remap = NULL,
1301
	.vdev_op_xlate = vdev_default_xlate,
1302
	.vdev_op_rebuild_asize = NULL,
1303
	.vdev_op_metaslab_init = NULL,
1304
	.vdev_op_config_generate = NULL,
1305
	.vdev_op_nparity = NULL,
1306
	.vdev_op_ndisks = NULL,
1307
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
1308
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
1309
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
1310
};
1311

1312
int
1313
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
1314
{
1315
	uint_t val;
1316
	int error;
1317

1318
	error = kstrtouint(buf, 0, &val);
1319
	if (error < 0)
1320
		return (SET_ERROR(error));
1321

1322
	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
1323
		return (SET_ERROR(-EINVAL));
1324

1325
	error = param_set_uint(buf, kp);
1326
	if (error < 0)
1327
		return (SET_ERROR(error));
1328

1329
	return (0);
1330
}
1331

1332
int
1333
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
1334
{
1335
	uint_t val;
1336
	int error;
1337

1338
	error = kstrtouint(buf, 0, &val);
1339
	if (error < 0)
1340
		return (SET_ERROR(error));
1341

1342
	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
1343
		return (SET_ERROR(-EINVAL));
1344

1345
	error = param_set_uint(buf, kp);
1346
	if (error < 0)
1347
		return (SET_ERROR(error));
1348

1349
	return (0);
1350
}
1351

1352
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
1353
	"Timeout before determining that a device is missing");
1354

1355
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
1356
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
1357

1358
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
1359
	"Maximum number of data segments to add to an IO request (min 4)");
1360

1361
Product

Resources

Company