Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
48288 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
23
/*
24
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25
* Copyright (c) 2013, 2018 by Delphix. All rights reserved.
26
* Copyright (c) 2016, 2017 Intel Corporation.
27
* Copyright 2016 Igor Kozhukhov <[email protected]>.
28
*/
29
30
/*
31
* Functions to convert between a list of vdevs and an nvlist representing the
32
* configuration. Each entry in the list can be one of:
33
*
34
* Device vdevs
35
* disk=(path=..., devid=...)
36
* file=(path=...)
37
*
38
* Group vdevs
39
* raidz[1|2]=(...)
40
* mirror=(...)
41
*
42
* Hot spares
43
*
44
* While the underlying implementation supports it, group vdevs cannot contain
45
* other group vdevs. All userland verification of devices is contained within
46
* this file. If successful, the nvlist returned can be passed directly to the
47
* kernel; we've done as much verification as possible in userland.
48
*
49
* Hot spares are a special case, and passed down as an array of disk vdevs, at
50
* the same level as the root of the vdev tree.
51
*
52
* The only function exported by this file is 'make_root_vdev'. The
53
* function performs several passes:
54
*
55
* 1. Construct the vdev specification. Performs syntax validation and
56
* makes sure each device is valid.
57
* 2. Check for devices in use. Using libblkid to make sure that no
58
* devices are also in use. Some can be overridden using the 'force'
59
* flag, others cannot.
60
* 3. Check for replication errors if the 'force' flag is not specified.
61
* validates that the replication level is consistent across the
62
* entire pool.
63
* 4. Call libzfs to label any whole disks with an EFI label.
64
*/
65
66
#include <assert.h>
67
#include <ctype.h>
68
#include <errno.h>
69
#include <fcntl.h>
70
#include <libintl.h>
71
#include <libnvpair.h>
72
#include <libzutil.h>
73
#include <limits.h>
74
#include <sys/spa.h>
75
#include <stdio.h>
76
#include <string.h>
77
#include <unistd.h>
78
#include "zpool_util.h"
79
#include <sys/zfs_context.h>
80
#include <sys/stat.h>
81
82
/*
83
* For any given vdev specification, we can have multiple errors. The
84
* vdev_error() function keeps track of whether we have seen an error yet, and
85
* prints out a header if its the first error we've seen.
86
*/
87
boolean_t error_seen;
88
boolean_t is_force;
89
90
void
91
vdev_error(const char *fmt, ...)
92
{
93
va_list ap;
94
95
if (!error_seen) {
96
(void) fprintf(stderr, gettext("invalid vdev specification\n"));
97
if (!is_force)
98
(void) fprintf(stderr, gettext("use '-f' to override "
99
"the following errors:\n"));
100
else
101
(void) fprintf(stderr, gettext("the following errors "
102
"must be manually repaired:\n"));
103
error_seen = B_TRUE;
104
}
105
106
va_start(ap, fmt);
107
(void) vfprintf(stderr, fmt, ap);
108
va_end(ap);
109
}
110
111
/*
112
* Check that a file is valid. All we can do in this case is check that it's
113
* not in use by another pool, and not in use by swap.
114
*/
115
int
116
check_file_generic(const char *file, boolean_t force, boolean_t isspare)
117
{
118
char *name;
119
int fd;
120
int ret = 0;
121
pool_state_t state;
122
boolean_t inuse;
123
124
if ((fd = open(file, O_RDONLY)) < 0)
125
return (0);
126
127
if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
128
const char *desc;
129
130
switch (state) {
131
case POOL_STATE_ACTIVE:
132
desc = gettext("active");
133
break;
134
135
case POOL_STATE_EXPORTED:
136
desc = gettext("exported");
137
break;
138
139
case POOL_STATE_POTENTIALLY_ACTIVE:
140
desc = gettext("potentially active");
141
break;
142
143
default:
144
desc = gettext("unknown");
145
break;
146
}
147
148
/*
149
* Allow hot spares to be shared between pools.
150
*/
151
if (state == POOL_STATE_SPARE && isspare) {
152
free(name);
153
(void) close(fd);
154
return (0);
155
}
156
157
if (state == POOL_STATE_ACTIVE ||
158
state == POOL_STATE_SPARE || !force) {
159
switch (state) {
160
case POOL_STATE_SPARE:
161
vdev_error(gettext("%s is reserved as a hot "
162
"spare for pool %s\n"), file, name);
163
break;
164
default:
165
vdev_error(gettext("%s is part of %s pool "
166
"'%s'\n"), file, desc, name);
167
break;
168
}
169
ret = -1;
170
}
171
172
free(name);
173
}
174
175
(void) close(fd);
176
return (ret);
177
}
178
179
/*
180
* This may be a shorthand device path or it could be total gibberish.
181
* Check to see if it is a known device available in zfs_vdev_paths.
182
* As part of this check, see if we've been given an entire disk
183
* (minus the slice number).
184
*/
185
static int
186
is_shorthand_path(const char *arg, char *path, size_t path_size,
187
struct stat64 *statbuf, boolean_t *wholedisk)
188
{
189
int error;
190
191
error = zfs_resolve_shortname(arg, path, path_size);
192
if (error == 0) {
193
*wholedisk = zfs_dev_is_whole_disk(path);
194
if (*wholedisk || (stat64(path, statbuf) == 0))
195
return (0);
196
}
197
198
strlcpy(path, arg, path_size);
199
memset(statbuf, 0, sizeof (*statbuf));
200
*wholedisk = B_FALSE;
201
202
return (error);
203
}
204
205
/*
206
* Determine if the given path is a hot spare within the given configuration.
207
* If no configuration is given we rely solely on the label.
208
*/
209
static boolean_t
210
is_spare(nvlist_t *config, const char *path)
211
{
212
int fd;
213
pool_state_t state;
214
char *name = NULL;
215
nvlist_t *label;
216
uint64_t guid, spareguid;
217
nvlist_t *nvroot;
218
nvlist_t **spares;
219
uint_t i, nspares;
220
boolean_t inuse;
221
222
if (zpool_is_draid_spare(path))
223
return (B_TRUE);
224
225
if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
226
return (B_FALSE);
227
228
if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
229
!inuse ||
230
state != POOL_STATE_SPARE ||
231
zpool_read_label(fd, &label, NULL) != 0) {
232
free(name);
233
(void) close(fd);
234
return (B_FALSE);
235
}
236
free(name);
237
(void) close(fd);
238
239
if (config == NULL) {
240
nvlist_free(label);
241
return (B_TRUE);
242
}
243
244
verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
245
nvlist_free(label);
246
247
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
248
&nvroot) == 0);
249
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
250
&spares, &nspares) == 0) {
251
for (i = 0; i < nspares; i++) {
252
verify(nvlist_lookup_uint64(spares[i],
253
ZPOOL_CONFIG_GUID, &spareguid) == 0);
254
if (spareguid == guid)
255
return (B_TRUE);
256
}
257
}
258
259
return (B_FALSE);
260
}
261
262
/*
263
* Create a leaf vdev. Determine if this is a file or a device. If it's a
264
* device, fill in the device id to make a complete nvlist. Valid forms for a
265
* leaf vdev are:
266
*
267
* /dev/xxx Complete disk path
268
* /xxx Full path to file
269
* xxx Shorthand for <zfs_vdev_paths>/xxx
270
* draid* Virtual dRAID spare
271
*/
272
static nvlist_t *
273
make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift)
274
{
275
char path[MAXPATHLEN];
276
struct stat64 statbuf;
277
nvlist_t *vdev = NULL;
278
const char *type = NULL;
279
boolean_t wholedisk = B_FALSE;
280
int err;
281
282
/*
283
* Determine what type of vdev this is, and put the full path into
284
* 'path'. We detect whether this is a device of file afterwards by
285
* checking the st_mode of the file.
286
*/
287
if (arg[0] == '/') {
288
/*
289
* Complete device or file path. Exact type is determined by
290
* examining the file descriptor afterwards. Symbolic links
291
* are resolved to their real paths to determine whole disk
292
* and S_ISBLK/S_ISREG type checks. However, we are careful
293
* to store the given path as ZPOOL_CONFIG_PATH to ensure we
294
* can leverage udev's persistent device labels.
295
*/
296
if (realpath(arg, path) == NULL) {
297
(void) fprintf(stderr,
298
gettext("cannot resolve path '%s'\n"), arg);
299
return (NULL);
300
}
301
302
wholedisk = zfs_dev_is_whole_disk(path);
303
if (!wholedisk && (stat64(path, &statbuf) != 0)) {
304
(void) fprintf(stderr,
305
gettext("cannot open '%s': %s\n"),
306
path, strerror(errno));
307
return (NULL);
308
}
309
310
/* After whole disk check restore original passed path */
311
strlcpy(path, arg, sizeof (path));
312
} else if (zpool_is_draid_spare(arg)) {
313
if (!is_primary) {
314
(void) fprintf(stderr,
315
gettext("cannot open '%s': dRAID spares can only "
316
"be used to replace primary vdevs\n"), arg);
317
return (NULL);
318
}
319
320
wholedisk = B_TRUE;
321
strlcpy(path, arg, sizeof (path));
322
type = VDEV_TYPE_DRAID_SPARE;
323
} else {
324
err = is_shorthand_path(arg, path, sizeof (path),
325
&statbuf, &wholedisk);
326
if (err != 0) {
327
/*
328
* If we got ENOENT, then the user gave us
329
* gibberish, so try to direct them with a
330
* reasonable error message. Otherwise,
331
* regurgitate strerror() since it's the best we
332
* can do.
333
*/
334
if (err == ENOENT) {
335
(void) fprintf(stderr,
336
gettext("cannot open '%s': no such "
337
"device in %s\n"), arg, DISK_ROOT);
338
(void) fprintf(stderr,
339
gettext("must be a full path or "
340
"shorthand device name\n"));
341
return (NULL);
342
} else {
343
(void) fprintf(stderr,
344
gettext("cannot open '%s': %s\n"),
345
path, strerror(errno));
346
return (NULL);
347
}
348
}
349
}
350
351
if (type == NULL) {
352
/*
353
* Determine whether this is a device or a file.
354
*/
355
if (wholedisk || S_ISBLK(statbuf.st_mode)) {
356
type = VDEV_TYPE_DISK;
357
} else if (S_ISREG(statbuf.st_mode)) {
358
type = VDEV_TYPE_FILE;
359
} else {
360
fprintf(stderr, gettext("cannot use '%s': must "
361
"be a block device or regular file\n"), path);
362
return (NULL);
363
}
364
}
365
366
/*
367
* Finally, we have the complete device or file, and we know that it is
368
* acceptable to use. Construct the nvlist to describe this vdev. All
369
* vdevs have a 'path' element, and devices also have a 'devid' element.
370
*/
371
verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
372
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
373
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
374
375
/* Lookup and add the enclosure sysfs path (if exists) */
376
update_vdev_config_dev_sysfs_path(vdev, path,
377
ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
378
379
if (strcmp(type, VDEV_TYPE_DISK) == 0)
380
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
381
(uint64_t)wholedisk) == 0);
382
383
/*
384
* If the device is known to incorrectly report its physical sector
385
* size explicitly provide the known correct value.
386
*/
387
if (ashift == 0) {
388
int sector_size;
389
390
if (check_sector_size_database(path, &sector_size) == B_TRUE)
391
ashift = highbit64(sector_size) - 1;
392
}
393
394
if (ashift > 0)
395
(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
396
397
return (vdev);
398
}
399
400
/*
401
* Go through and verify the replication level of the pool is consistent.
402
* Performs the following checks:
403
*
404
* For the new spec, verifies that devices in mirrors and raidz are the
405
* same size.
406
*
407
* If the current configuration already has inconsistent replication
408
* levels, ignore any other potential problems in the new spec.
409
*
410
* Otherwise, make sure that the current spec (if there is one) and the new
411
* spec have consistent replication levels.
412
*
413
* If there is no current spec (create), make sure new spec has at least
414
* one general purpose vdev.
415
*/
416
typedef struct replication_level {
417
const char *zprl_type;
418
uint64_t zprl_children;
419
uint64_t zprl_parity;
420
} replication_level_t;
421
422
#define ZPOOL_FUZZ (16 * 1024 * 1024)
423
424
/*
425
* N.B. For the purposes of comparing replication levels dRAID can be
426
* considered functionally equivalent to raidz.
427
*/
428
static boolean_t
429
is_raidz_mirror(replication_level_t *a, replication_level_t *b,
430
replication_level_t **raidz, replication_level_t **mirror)
431
{
432
if ((strcmp(a->zprl_type, "raidz") == 0 ||
433
strcmp(a->zprl_type, "draid") == 0) &&
434
strcmp(b->zprl_type, "mirror") == 0) {
435
*raidz = a;
436
*mirror = b;
437
return (B_TRUE);
438
}
439
return (B_FALSE);
440
}
441
442
/*
443
* Comparison for determining if dRAID and raidz where passed in either order.
444
*/
445
static boolean_t
446
is_raidz_draid(replication_level_t *a, replication_level_t *b)
447
{
448
if ((strcmp(a->zprl_type, "raidz") == 0 ||
449
strcmp(a->zprl_type, "draid") == 0) &&
450
(strcmp(b->zprl_type, "raidz") == 0 ||
451
strcmp(b->zprl_type, "draid") == 0)) {
452
return (B_TRUE);
453
}
454
455
return (B_FALSE);
456
}
457
458
/*
459
* Given a list of toplevel vdevs, return the current replication level. If
460
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
461
* an error message will be displayed for each self-inconsistent vdev.
462
*/
463
static replication_level_t *
464
get_replication(nvlist_t *nvroot, boolean_t fatal)
465
{
466
nvlist_t **top;
467
uint_t t, toplevels;
468
nvlist_t **child;
469
uint_t c, children;
470
nvlist_t *nv;
471
const char *type;
472
replication_level_t lastrep = {0};
473
replication_level_t rep;
474
replication_level_t *ret;
475
replication_level_t *raidz, *mirror;
476
boolean_t dontreport;
477
478
ret = safe_malloc(sizeof (replication_level_t));
479
480
verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
481
&top, &toplevels) == 0);
482
483
for (t = 0; t < toplevels; t++) {
484
uint64_t is_log = B_FALSE;
485
486
nv = top[t];
487
488
/*
489
* For separate logs we ignore the top level vdev replication
490
* constraints.
491
*/
492
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
493
if (is_log)
494
continue;
495
496
/*
497
* Ignore holes introduced by removing aux devices, along
498
* with indirect vdevs introduced by previously removed
499
* vdevs.
500
*/
501
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
502
if (strcmp(type, VDEV_TYPE_HOLE) == 0 ||
503
strcmp(type, VDEV_TYPE_INDIRECT) == 0)
504
continue;
505
506
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
507
&child, &children) != 0) {
508
/*
509
* This is a 'file' or 'disk' vdev.
510
*/
511
rep.zprl_type = type;
512
rep.zprl_children = 1;
513
rep.zprl_parity = 0;
514
} else {
515
int64_t vdev_size;
516
517
/*
518
* This is a mirror or RAID-Z vdev. Go through and make
519
* sure the contents are all the same (files vs. disks),
520
* keeping track of the number of elements in the
521
* process.
522
*
523
* We also check that the size of each vdev (if it can
524
* be determined) is the same.
525
*/
526
rep.zprl_type = type;
527
rep.zprl_children = 0;
528
529
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
530
strcmp(type, VDEV_TYPE_DRAID) == 0) {
531
verify(nvlist_lookup_uint64(nv,
532
ZPOOL_CONFIG_NPARITY,
533
&rep.zprl_parity) == 0);
534
assert(rep.zprl_parity != 0);
535
} else {
536
rep.zprl_parity = 0;
537
}
538
539
/*
540
* The 'dontreport' variable indicates that we've
541
* already reported an error for this spec, so don't
542
* bother doing it again.
543
*/
544
type = NULL;
545
dontreport = 0;
546
vdev_size = -1LL;
547
for (c = 0; c < children; c++) {
548
nvlist_t *cnv = child[c];
549
const char *path;
550
struct stat64 statbuf;
551
const char *childtype;
552
int fd, err;
553
554
rep.zprl_children++;
555
556
verify(nvlist_lookup_string(cnv,
557
ZPOOL_CONFIG_TYPE, &childtype) == 0);
558
559
/*
560
* If this is a replacing or spare vdev, then
561
* get the real first child of the vdev: do this
562
* in a loop because replacing and spare vdevs
563
* can be nested.
564
*/
565
while (strcmp(childtype,
566
VDEV_TYPE_REPLACING) == 0 ||
567
strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
568
nvlist_t **rchild;
569
uint_t rchildren;
570
571
verify(nvlist_lookup_nvlist_array(cnv,
572
ZPOOL_CONFIG_CHILDREN, &rchild,
573
&rchildren) == 0);
574
assert(rchildren == 2);
575
cnv = rchild[0];
576
577
verify(nvlist_lookup_string(cnv,
578
ZPOOL_CONFIG_TYPE,
579
&childtype) == 0);
580
}
581
582
verify(nvlist_lookup_string(cnv,
583
ZPOOL_CONFIG_PATH, &path) == 0);
584
585
/*
586
* Skip active spares they should never cause
587
* the pool to be evaluated as inconsistent.
588
*/
589
if (is_spare(NULL, path))
590
continue;
591
592
/*
593
* If we have a raidz/mirror that combines disks
594
* with files, only report it as an error when
595
* fatal is set to ensure all the replication
596
* checks aren't skipped in check_replication().
597
*/
598
if (fatal && !dontreport && type != NULL &&
599
strcmp(type, childtype) != 0) {
600
if (ret != NULL)
601
free(ret);
602
ret = NULL;
603
vdev_error(gettext(
604
"mismatched replication "
605
"level: %s contains both "
606
"files and devices\n"),
607
rep.zprl_type);
608
dontreport = B_TRUE;
609
}
610
611
/*
612
* According to stat(2), the value of 'st_size'
613
* is undefined for block devices and character
614
* devices. But there is no effective way to
615
* determine the real size in userland.
616
*
617
* Instead, we'll take advantage of an
618
* implementation detail of spec_size(). If the
619
* device is currently open, then we (should)
620
* return a valid size.
621
*
622
* If we still don't get a valid size (indicated
623
* by a size of 0 or MAXOFFSET_T), then ignore
624
* this device altogether.
625
*/
626
if ((fd = open(path, O_RDONLY)) >= 0) {
627
err = fstat64_blk(fd, &statbuf);
628
(void) close(fd);
629
} else {
630
err = stat64(path, &statbuf);
631
}
632
633
if (err != 0 ||
634
statbuf.st_size == 0 ||
635
statbuf.st_size == MAXOFFSET_T)
636
continue;
637
638
int64_t size = statbuf.st_size;
639
640
/*
641
* Also make sure that devices and
642
* slices have a consistent size. If
643
* they differ by a significant amount
644
* (~16MB) then report an error.
645
*/
646
if (!dontreport &&
647
(vdev_size != -1LL &&
648
(llabs(size - vdev_size) >
649
ZPOOL_FUZZ))) {
650
if (ret != NULL)
651
free(ret);
652
ret = NULL;
653
if (fatal)
654
vdev_error(gettext(
655
"%s contains devices of "
656
"different sizes\n"),
657
rep.zprl_type);
658
else
659
return (NULL);
660
dontreport = B_TRUE;
661
}
662
663
type = childtype;
664
vdev_size = size;
665
}
666
}
667
668
/*
669
* At this point, we have the replication of the last toplevel
670
* vdev in 'rep'. Compare it to 'lastrep' to see if it is
671
* different.
672
*/
673
if (lastrep.zprl_type != NULL) {
674
if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
675
is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
676
/*
677
* Accepted raidz and mirror when they can
678
* handle the same number of disk failures.
679
*/
680
if (raidz->zprl_parity !=
681
mirror->zprl_children - 1) {
682
if (ret != NULL)
683
free(ret);
684
ret = NULL;
685
if (fatal)
686
vdev_error(gettext(
687
"mismatched replication "
688
"level: "
689
"%s and %s vdevs with "
690
"different redundancy, "
691
"%llu vs. %llu (%llu-way) "
692
"are present\n"),
693
raidz->zprl_type,
694
mirror->zprl_type,
695
(u_longlong_t)
696
raidz->zprl_parity,
697
(u_longlong_t)
698
mirror->zprl_children - 1,
699
(u_longlong_t)
700
mirror->zprl_children);
701
else
702
return (NULL);
703
}
704
} else if (is_raidz_draid(&lastrep, &rep)) {
705
/*
706
* Accepted raidz and draid when they can
707
* handle the same number of disk failures.
708
*/
709
if (lastrep.zprl_parity != rep.zprl_parity) {
710
if (ret != NULL)
711
free(ret);
712
ret = NULL;
713
if (fatal)
714
vdev_error(gettext(
715
"mismatched replication "
716
"level: %s and %s vdevs "
717
"with different "
718
"redundancy, %llu vs. "
719
"%llu are present\n"),
720
lastrep.zprl_type,
721
rep.zprl_type,
722
(u_longlong_t)
723
lastrep.zprl_parity,
724
(u_longlong_t)
725
rep.zprl_parity);
726
else
727
return (NULL);
728
}
729
} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
730
0) {
731
if (ret != NULL)
732
free(ret);
733
ret = NULL;
734
if (fatal)
735
vdev_error(gettext(
736
"mismatched replication level: "
737
"both %s and %s vdevs are "
738
"present\n"),
739
lastrep.zprl_type, rep.zprl_type);
740
else
741
return (NULL);
742
} else if (lastrep.zprl_parity != rep.zprl_parity) {
743
if (ret)
744
free(ret);
745
ret = NULL;
746
if (fatal)
747
vdev_error(gettext(
748
"mismatched replication level: "
749
"both %llu and %llu device parity "
750
"%s vdevs are present\n"),
751
(u_longlong_t)
752
lastrep.zprl_parity,
753
(u_longlong_t)rep.zprl_parity,
754
rep.zprl_type);
755
else
756
return (NULL);
757
} else if (lastrep.zprl_children != rep.zprl_children) {
758
if (ret)
759
free(ret);
760
ret = NULL;
761
if (fatal)
762
vdev_error(gettext(
763
"mismatched replication level: "
764
"both %llu-way and %llu-way %s "
765
"vdevs are present\n"),
766
(u_longlong_t)
767
lastrep.zprl_children,
768
(u_longlong_t)
769
rep.zprl_children,
770
rep.zprl_type);
771
else
772
return (NULL);
773
}
774
}
775
lastrep = rep;
776
}
777
778
if (ret != NULL)
779
*ret = rep;
780
781
return (ret);
782
}
783
784
/*
785
* Check the replication level of the vdev spec against the current pool. Calls
786
* get_replication() to make sure the new spec is self-consistent. If the pool
787
* has a consistent replication level, then we ignore any errors. Otherwise,
788
* report any difference between the two.
789
*/
790
static int
791
check_replication(nvlist_t *config, nvlist_t *newroot)
792
{
793
nvlist_t **child;
794
uint_t children;
795
replication_level_t *current = NULL, *new;
796
replication_level_t *raidz, *mirror;
797
int ret;
798
799
/*
800
* If we have a current pool configuration, check to see if it's
801
* self-consistent. If not, simply return success.
802
*/
803
if (config != NULL) {
804
nvlist_t *nvroot;
805
806
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
807
&nvroot) == 0);
808
if ((current = get_replication(nvroot, B_FALSE)) == NULL)
809
return (0);
810
}
811
/*
812
* for spares there may be no children, and therefore no
813
* replication level to check
814
*/
815
if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
816
&child, &children) != 0) || (children == 0)) {
817
free(current);
818
return (0);
819
}
820
821
/*
822
* If all we have is logs then there's no replication level to check.
823
*/
824
if (num_logs(newroot) == children) {
825
free(current);
826
return (0);
827
}
828
829
/*
830
* Get the replication level of the new vdev spec, reporting any
831
* inconsistencies found.
832
*/
833
if ((new = get_replication(newroot, B_TRUE)) == NULL) {
834
free(current);
835
return (-1);
836
}
837
838
/*
839
* Check to see if the new vdev spec matches the replication level of
840
* the current pool.
841
*/
842
ret = 0;
843
if (current != NULL) {
844
if (is_raidz_mirror(current, new, &raidz, &mirror) ||
845
is_raidz_mirror(new, current, &raidz, &mirror)) {
846
if (raidz->zprl_parity != mirror->zprl_children - 1) {
847
vdev_error(gettext(
848
"mismatched replication level: pool and "
849
"new vdev with different redundancy, %s "
850
"and %s vdevs, %llu vs. %llu (%llu-way)\n"),
851
raidz->zprl_type,
852
mirror->zprl_type,
853
(u_longlong_t)raidz->zprl_parity,
854
(u_longlong_t)mirror->zprl_children - 1,
855
(u_longlong_t)mirror->zprl_children);
856
ret = -1;
857
}
858
} else if (is_raidz_draid(current, new)) {
859
if (current->zprl_parity != new->zprl_parity) {
860
vdev_error(gettext(
861
"mismatched replication level: pool and "
862
"new vdev with different redundancy, %s "
863
"and %s vdevs, %llu vs. %llu\n"),
864
current->zprl_type,
865
new->zprl_type,
866
(u_longlong_t)current->zprl_parity,
867
(u_longlong_t)new->zprl_parity);
868
ret = -1;
869
}
870
} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
871
vdev_error(gettext(
872
"mismatched replication level: pool uses %s "
873
"and new vdev is %s\n"),
874
current->zprl_type, new->zprl_type);
875
ret = -1;
876
} else if (current->zprl_parity != new->zprl_parity) {
877
vdev_error(gettext(
878
"mismatched replication level: pool uses %llu "
879
"device parity and new vdev uses %llu\n"),
880
(u_longlong_t)current->zprl_parity,
881
(u_longlong_t)new->zprl_parity);
882
ret = -1;
883
} else if (current->zprl_children != new->zprl_children) {
884
vdev_error(gettext(
885
"mismatched replication level: pool uses %llu-way "
886
"%s and new vdev uses %llu-way %s\n"),
887
(u_longlong_t)current->zprl_children,
888
current->zprl_type,
889
(u_longlong_t)new->zprl_children,
890
new->zprl_type);
891
ret = -1;
892
}
893
}
894
895
free(new);
896
if (current != NULL)
897
free(current);
898
899
return (ret);
900
}
901
902
/*
 * Zero the first 4k of the device or partition at 'path' so libblkid will
 * not misidentify it based on a stale filesystem magic value.
 *
 * Returns 0 on success, -1 (with a message on stderr) on open/write failure
 * or a short write.
 */
static int
zero_label(const char *path)
{
	/* enum constant so 'buf' is a true fixed-size array, not a VLA */
	enum { ZERO_LABEL_SIZE = 4096 };
	const int size = ZERO_LABEL_SIZE;
	char buf[ZERO_LABEL_SIZE];
	int err, fd;

	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
		    path, strerror(errno));
		return (-1);
	}

	memset(buf, 0, size);
	err = write(fd, buf, size);
	(void) fdatasync(fd);
	(void) close(fd);

	if (err == -1) {
		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
		    "of '%s': %s\n"), size, path, strerror(errno));
		return (-1);
	}

	if (err != size) {
		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
		    "of '%s'\n"), err, size, path);
		return (-1);
	}

	return (0);
}
934
935
/*
 * Print each of the 'lines_cnt' strings in 'lines' to stderr, one per line.
 * Used to relay zfs_prepare_disk script output to the user.
 */
static void
lines_to_stderr(char *lines[], int lines_cnt)
{
	for (int i = 0; i < lines_cnt; i++)
		fprintf(stderr, "%s\n", lines[i]);
}
943
944
/*
945
* Go through and find any whole disks in the vdev specification, labelling them
946
* as appropriate. When constructing the vdev spec, we were unable to open this
947
* device in order to provide a devid. Now that we have labelled the disk and
948
* know that slice 0 is valid, we can construct the devid now.
949
*
950
* If the disk was already labeled with an EFI label, we will have gotten the
951
* devid already (because we were able to open the whole disk). Otherwise, we
952
* need to get the devid after we label the disk.
953
*/
954
static int
955
make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing)
956
{
957
nvlist_t **child;
958
uint_t c, children;
959
const char *type, *path;
960
char devpath[MAXPATHLEN];
961
char udevpath[MAXPATHLEN];
962
uint64_t wholedisk;
963
struct stat64 statbuf;
964
int is_exclusive = 0;
965
int fd;
966
int ret;
967
968
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
969
970
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
971
&child, &children) != 0) {
972
973
if (strcmp(type, VDEV_TYPE_DISK) != 0)
974
return (0);
975
976
/*
977
* We have a disk device. If this is a whole disk write
978
* out the efi partition table, otherwise write zero's to
979
* the first 4k of the partition. This is to ensure that
980
* libblkid will not misidentify the partition due to a
981
* magic value left by the previous filesystem.
982
*/
983
verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
984
verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
985
&wholedisk));
986
987
if (!wholedisk) {
988
/*
989
* Update device id string for mpath nodes (Linux only)
990
*/
991
if (is_mpath_whole_disk(path))
992
update_vdev_config_dev_strs(nv);
993
994
if (!is_spare(NULL, path))
995
(void) zero_label(path);
996
return (0);
997
}
998
999
if (realpath(path, devpath) == NULL) {
1000
ret = errno;
1001
(void) fprintf(stderr,
1002
gettext("cannot resolve path '%s'\n"), path);
1003
return (ret);
1004
}
1005
1006
/*
1007
* Remove any previously existing symlink from a udev path to
1008
* the device before labeling the disk. This ensures that
1009
* only newly created links are used. Otherwise there is a
1010
* window between when udev deletes and recreates the link
1011
* during which access attempts will fail with ENOENT.
1012
*/
1013
strlcpy(udevpath, path, MAXPATHLEN);
1014
(void) zfs_append_partition(udevpath, MAXPATHLEN);
1015
1016
fd = open(devpath, O_RDWR|O_EXCL);
1017
if (fd == -1) {
1018
if (errno == EBUSY)
1019
is_exclusive = 1;
1020
#ifdef __FreeBSD__
1021
if (errno == EPERM)
1022
is_exclusive = 1;
1023
#endif
1024
} else {
1025
(void) close(fd);
1026
}
1027
1028
/*
1029
* If the partition exists, contains a valid spare label,
1030
* and is opened exclusively there is no need to partition
1031
* it. Hot spares have already been partitioned and are
1032
* held open exclusively by the kernel as a safety measure.
1033
*
1034
* If the provided path is for a /dev/disk/ device its
1035
* symbolic link will be removed, partition table created,
1036
* and then block until udev creates the new link.
1037
*/
1038
if (!is_exclusive && !is_spare(NULL, udevpath)) {
1039
char *devnode = strrchr(devpath, '/') + 1;
1040
char **lines = NULL;
1041
int lines_cnt = 0;
1042
1043
ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
1044
if (ret == 0) {
1045
ret = lstat64(udevpath, &statbuf);
1046
if (ret == 0 && S_ISLNK(statbuf.st_mode))
1047
(void) unlink(udevpath);
1048
}
1049
1050
/*
1051
* When labeling a pool the raw device node name
1052
* is provided as it appears under /dev/.
1053
*
1054
* Note that 'zhp' will be NULL when we're creating a
1055
* pool.
1056
*/
1057
if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode,
1058
nv, zhp == NULL ? "create" :
1059
replacing ? "replace" : "add", &lines,
1060
&lines_cnt) != 0) {
1061
(void) fprintf(stderr,
1062
gettext(
1063
"Error preparing/labeling disk.\n"));
1064
if (lines_cnt > 0) {
1065
(void) fprintf(stderr,
1066
gettext("zfs_prepare_disk output:\n"));
1067
lines_to_stderr(lines, lines_cnt);
1068
}
1069
1070
libzfs_free_str_array(lines, lines_cnt);
1071
return (-1);
1072
}
1073
libzfs_free_str_array(lines, lines_cnt);
1074
1075
/*
1076
* Wait for udev to signal the device is available
1077
* by the provided path.
1078
*/
1079
ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
1080
if (ret) {
1081
(void) fprintf(stderr,
1082
gettext("missing link: %s was "
1083
"partitioned but %s is missing\n"),
1084
devnode, udevpath);
1085
return (ret);
1086
}
1087
1088
ret = zero_label(udevpath);
1089
if (ret)
1090
return (ret);
1091
}
1092
1093
/*
1094
* Update the path to refer to the partition. The presence of
1095
* the 'whole_disk' field indicates to the CLI that we should
1096
* chop off the partition number when displaying the device in
1097
* future output.
1098
*/
1099
verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
1100
1101
/*
1102
* Update device id strings for whole disks (Linux only)
1103
*/
1104
update_vdev_config_dev_strs(nv);
1105
1106
return (0);
1107
}
1108
1109
for (c = 0; c < children; c++)
1110
if ((ret = make_disks(zhp, child[c], replacing)) != 0)
1111
return (ret);
1112
1113
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1114
&child, &children) == 0)
1115
for (c = 0; c < children; c++)
1116
if ((ret = make_disks(zhp, child[c], replacing)) != 0)
1117
return (ret);
1118
1119
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1120
&child, &children) == 0)
1121
for (c = 0; c < children; c++)
1122
if ((ret = make_disks(zhp, child[c], replacing)) != 0)
1123
return (ret);
1124
1125
return (0);
1126
}
1127
1128
/*
 * Go through and find any devices that are in use.  We rely on libdiskmgt for
 * the majority of this task.
 *
 * 'config' is the existing pool configuration (may be NULL for a new pool),
 * 'nv' is the vdev (or vdev subtree) being checked, 'force' relaxes the
 * device-level checks, 'replacing' indicates a 'zpool replace' operation,
 * and 'isspare' is set when walking the spare list.  Returns B_TRUE if any
 * device in the subtree appears to be in use.
 */
static boolean_t
is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
    boolean_t replacing, boolean_t isspare)
{
	nvlist_t **child;
	uint_t c, children;
	const char *type, *path;
	int ret = 0;
	char buf[MAXPATHLEN];
	uint64_t wholedisk = B_FALSE;
	boolean_t anyinuse = B_FALSE;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/* No children means this is a leaf vdev: check the device itself. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
		if (strcmp(type, VDEV_TYPE_DISK) == 0)
			verify(!nvlist_lookup_uint64(nv,
			    ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));

		/*
		 * As a generic check, we look to see if this is a replace of a
		 * hot spare within the same pool.  If so, we allow it
		 * regardless of what libblkid or zpool_in_use() says.
		 */
		if (replacing) {
			(void) strlcpy(buf, path, sizeof (buf));
			if (wholedisk) {
				/*
				 * Whole disks are labeled and referenced by
				 * their first partition; compare that name.
				 */
				ret = zfs_append_partition(buf, sizeof (buf));
				if (ret == -1)
					return (-1);
			}

			if (is_spare(config, buf))
				return (B_FALSE);
		}

		if (strcmp(type, VDEV_TYPE_DISK) == 0)
			ret = check_device(path, force, isspare, wholedisk);

		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
			ret = check_file(path, force, isspare);

		return (ret != 0);
	}

	/* Interior vdev: recurse into the regular children... */
	for (c = 0; c < children; c++)
		if (is_device_in_use(config, child[c], force, replacing,
		    B_FALSE))
			anyinuse = B_TRUE;

	/* ...then the spares (with isspare set)... */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if (is_device_in_use(config, child[c], force, replacing,
			    B_TRUE))
				anyinuse = B_TRUE;

	/* ...and the L2 cache devices. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if (is_device_in_use(config, child[c], force, replacing,
			    B_FALSE))
				anyinuse = B_TRUE;

	return (anyinuse);
}
1201
1202
/*
1203
* Returns the parity level extracted from a raidz or draid type.
1204
* If the parity cannot be determined zero is returned.
1205
*/
1206
static int
1207
get_parity(const char *type)
1208
{
1209
long parity = 0;
1210
const char *p;
1211
1212
if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) {
1213
p = type + strlen(VDEV_TYPE_RAIDZ);
1214
1215
if (*p == '\0') {
1216
/* when unspecified default to single parity */
1217
return (1);
1218
} else if (*p == '0') {
1219
/* no zero prefixes allowed */
1220
return (0);
1221
} else {
1222
/* 0-3, no suffixes allowed */
1223
char *end;
1224
errno = 0;
1225
parity = strtol(p, &end, 10);
1226
if (errno != 0 || *end != '\0' ||
1227
parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) {
1228
return (0);
1229
}
1230
}
1231
} else if (strncmp(type, VDEV_TYPE_DRAID,
1232
strlen(VDEV_TYPE_DRAID)) == 0) {
1233
p = type + strlen(VDEV_TYPE_DRAID);
1234
1235
if (*p == '\0' || *p == ':') {
1236
/* when unspecified default to single parity */
1237
return (1);
1238
} else if (*p == '0') {
1239
/* no zero prefixes allowed */
1240
return (0);
1241
} else {
1242
/* 0-3, allowed suffixes: '\0' or ':' */
1243
char *end;
1244
errno = 0;
1245
parity = strtol(p, &end, 10);
1246
if (errno != 0 ||
1247
parity < 1 || parity > VDEV_DRAID_MAXPARITY ||
1248
(*end != '\0' && *end != ':')) {
1249
return (0);
1250
}
1251
}
1252
}
1253
1254
return ((int)parity);
1255
}
1256
1257
/*
1258
* Assign the minimum and maximum number of devices allowed for
1259
* the specified type. On error NULL is returned, otherwise the
1260
* type prefix is returned (raidz, mirror, etc).
1261
*/
1262
static const char *
1263
is_grouping(const char *type, int *mindev, int *maxdev)
1264
{
1265
int nparity;
1266
1267
if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
1268
strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) {
1269
nparity = get_parity(type);
1270
if (nparity == 0)
1271
return (NULL);
1272
if (mindev != NULL)
1273
*mindev = nparity + 1;
1274
if (maxdev != NULL)
1275
*maxdev = 255;
1276
1277
if (strncmp(type, VDEV_TYPE_RAIDZ,
1278
strlen(VDEV_TYPE_RAIDZ)) == 0) {
1279
return (VDEV_TYPE_RAIDZ);
1280
} else {
1281
return (VDEV_TYPE_DRAID);
1282
}
1283
}
1284
1285
if (maxdev != NULL)
1286
*maxdev = INT_MAX;
1287
1288
if (strcmp(type, "mirror") == 0) {
1289
if (mindev != NULL)
1290
*mindev = 2;
1291
return (VDEV_TYPE_MIRROR);
1292
}
1293
1294
if (strcmp(type, "spare") == 0) {
1295
if (mindev != NULL)
1296
*mindev = 1;
1297
return (VDEV_TYPE_SPARE);
1298
}
1299
1300
if (strcmp(type, "log") == 0) {
1301
if (mindev != NULL)
1302
*mindev = 1;
1303
return (VDEV_TYPE_LOG);
1304
}
1305
1306
if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
1307
strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1308
if (mindev != NULL)
1309
*mindev = 1;
1310
return (type);
1311
}
1312
1313
if (strcmp(type, "cache") == 0) {
1314
if (mindev != NULL)
1315
*mindev = 1;
1316
return (VDEV_TYPE_L2CACHE);
1317
}
1318
1319
return (NULL);
1320
}
1321
1322
/*
1323
* Extract the configuration parameters encoded in the dRAID type and
1324
* use them to generate a dRAID configuration. The expected format is:
1325
*
1326
* draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
1327
*
1328
* The intent is to be able to generate a good configuration when no
1329
* additional information is provided. The only mandatory component
1330
* of the 'type' is the 'draid' prefix. If a value is not provided
1331
* then reasonable defaults are used. The optional components may
1332
* appear in any order but the d/s/c suffix is required.
1333
*
1334
* Valid inputs:
1335
* - data: number of data devices per group (1-255)
1336
* - parity: number of parity blocks per group (1-3)
1337
* - spares: number of distributed spare (0-100)
1338
* - children: total number of devices (1-255)
1339
*
1340
* Examples:
1341
* - zpool create tank draid <devices...>
1342
* - zpool create tank draid2:8d:51c:2s <devices...>
1343
*/
1344
static int
1345
draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
1346
{
1347
uint64_t nparity;
1348
uint64_t nspares = 0;
1349
uint64_t ndata = UINT64_MAX;
1350
uint64_t ngroups = 1;
1351
long value;
1352
1353
if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
1354
return (EINVAL);
1355
1356
nparity = (uint64_t)get_parity(type);
1357
if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
1358
fprintf(stderr,
1359
gettext("invalid dRAID parity level %llu; must be "
1360
"between 1 and %d\n"), (u_longlong_t)nparity,
1361
VDEV_DRAID_MAXPARITY);
1362
return (EINVAL);
1363
}
1364
1365
char *p = (char *)type;
1366
while ((p = strchr(p, ':')) != NULL) {
1367
char *end;
1368
1369
p = p + 1;
1370
errno = 0;
1371
1372
if (!isdigit(p[0])) {
1373
(void) fprintf(stderr, gettext("invalid dRAID "
1374
"syntax; expected [:<number><c|d|s>] not '%s'\n"),
1375
type);
1376
return (EINVAL);
1377
}
1378
1379
/* Expected non-zero value with c/d/s suffix */
1380
value = strtol(p, &end, 10);
1381
char suffix = tolower(*end);
1382
if (errno != 0 ||
1383
(suffix != 'c' && suffix != 'd' && suffix != 's')) {
1384
(void) fprintf(stderr, gettext("invalid dRAID "
1385
"syntax; expected [:<number><c|d|s>] not '%s'\n"),
1386
type);
1387
return (EINVAL);
1388
}
1389
1390
if (suffix == 'c') {
1391
if ((uint64_t)value != children) {
1392
fprintf(stderr,
1393
gettext("invalid number of dRAID children; "
1394
"%llu required but %llu provided\n"),
1395
(u_longlong_t)value,
1396
(u_longlong_t)children);
1397
return (EINVAL);
1398
}
1399
} else if (suffix == 'd') {
1400
ndata = (uint64_t)value;
1401
} else if (suffix == 's') {
1402
nspares = (uint64_t)value;
1403
} else {
1404
verify(0); /* Unreachable */
1405
}
1406
}
1407
1408
/*
1409
* When a specific number of data disks is not provided limit a
1410
* redundancy group to 8 data disks. This value was selected to
1411
* provide a reasonable tradeoff between capacity and performance.
1412
*/
1413
if (ndata == UINT64_MAX) {
1414
if (children > nspares + nparity) {
1415
ndata = MIN(children - nspares - nparity, 8);
1416
} else {
1417
fprintf(stderr, gettext("request number of "
1418
"distributed spares %llu and parity level %llu\n"
1419
"leaves no disks available for data\n"),
1420
(u_longlong_t)nspares, (u_longlong_t)nparity);
1421
return (EINVAL);
1422
}
1423
}
1424
1425
/* Verify the maximum allowed group size is never exceeded. */
1426
if (ndata == 0 || (ndata + nparity > children - nspares)) {
1427
fprintf(stderr, gettext("requested number of dRAID data "
1428
"disks per group %llu is too high,\nat most %llu disks "
1429
"are available for data\n"), (u_longlong_t)ndata,
1430
(u_longlong_t)(children - nspares - nparity));
1431
return (EINVAL);
1432
}
1433
1434
/*
1435
* Verify the requested number of spares can be satisfied.
1436
* An arbitrary limit of 100 distributed spares is applied.
1437
*/
1438
if (nspares > 100 || nspares > (children - (ndata + nparity))) {
1439
fprintf(stderr,
1440
gettext("invalid number of dRAID spares %llu; additional "
1441
"disks would be required\n"), (u_longlong_t)nspares);
1442
return (EINVAL);
1443
}
1444
1445
/* Verify the requested number children is sufficient. */
1446
if (children < (ndata + nparity + nspares)) {
1447
fprintf(stderr, gettext("%llu disks were provided, but at "
1448
"least %llu disks are required for this config\n"),
1449
(u_longlong_t)children,
1450
(u_longlong_t)(ndata + nparity + nspares));
1451
}
1452
1453
if (children > VDEV_DRAID_MAX_CHILDREN) {
1454
fprintf(stderr, gettext("%llu disks were provided, but "
1455
"dRAID only supports up to %u disks"),
1456
(u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
1457
}
1458
1459
/*
1460
* Calculate the minimum number of groups required to fill a slice.
1461
* This is the LCM of the stripe width (ndata + nparity) and the
1462
* number of data drives (children - nspares).
1463
*/
1464
while (ngroups * (ndata + nparity) % (children - nspares) != 0)
1465
ngroups++;
1466
1467
/* Store the basic dRAID configuration. */
1468
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
1469
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
1470
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
1471
fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
1472
1473
return (0);
1474
}
1475
1476
/*
 * Construct a syntactically valid vdev specification,
 * and ensure that all devices and files exist and can be opened.
 * Note: we don't bother freeing anything in the error paths
 * because the program is just going to exit anyway.
 *
 * 'props' may carry an ashift override applied to every leaf; argc/argv
 * are the remaining command-line words ("mirror", "raidz2", device paths,
 * ...).  Returns a newly allocated root nvlist, or NULL on error.
 */
static nvlist_t *
construct_spec(nvlist_t *props, int argc, char **argv)
{
	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
	const char *type, *fulltype;
	boolean_t is_log, is_special, is_dedup, is_spare;
	boolean_t seen_logs;
	uint64_t ashift = 0;

	/* Validate any user-supplied ashift before building the tree. */
	if (props != NULL) {
		const char *value = NULL;

		if (nvlist_lookup_string(props,
		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
				(void) fprintf(stderr,
				    gettext("ashift must be a number.\n"));
				return (NULL);
			}
			if (ashift != 0 &&
			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
				(void) fprintf(stderr,
				    gettext("invalid 'ashift=%" PRIu64 "' "
				    "property: only values between %" PRId32 " "
				    "and %" PRId32 " are allowed.\n"),
				    ashift, ASHIFT_MIN, ASHIFT_MAX);
				return (NULL);
			}
		}
	}

	top = NULL;
	toplevels = 0;
	spares = NULL;
	l2cache = NULL;
	nspares = 0;
	nlogs = 0;
	nl2cache = 0;
	is_log = is_special = is_dedup = is_spare = B_FALSE;
	seen_logs = B_FALSE;
	nvroot = NULL;

	while (argc > 0) {
		fulltype = argv[0];
		nv = NULL;

		/*
		 * If it's a mirror, raidz, or draid the subsequent arguments
		 * are its leaves -- until we encounter the next mirror,
		 * raidz or draid.
		 */
		if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
			nvlist_t **child = NULL;
			int c, children = 0;

			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
				if (spares != NULL) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'spare' can be "
					    "specified only once\n"));
					goto spec_out;
				}
				is_spare = B_TRUE;
				is_log = is_special = is_dedup = B_FALSE;
			}

			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
				if (seen_logs) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'log' can be "
					    "specified only once\n"));
					goto spec_out;
				}
				seen_logs = B_TRUE;
				is_log = B_TRUE;
				is_special = is_dedup = is_spare = B_FALSE;
				argc--;
				argv++;
				/*
				 * A log is not a real grouping device.
				 * We just set is_log and continue.
				 */
				continue;
			}

			/* 'special' and 'dedup' are class markers, not vdevs. */
			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
				is_special = B_TRUE;
				is_log = is_dedup = is_spare = B_FALSE;
				argc--;
				argv++;
				continue;
			}

			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
				is_dedup = B_TRUE;
				is_log = is_special = is_spare = B_FALSE;
				argc--;
				argv++;
				continue;
			}

			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
				if (l2cache != NULL) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'cache' can be "
					    "specified only once\n"));
					goto spec_out;
				}
				is_log = is_special = B_FALSE;
				is_dedup = is_spare = B_FALSE;
			}

			/* Log devices may only be single disks or mirrors. */
			if (is_log) {
				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: unsupported 'log' "
					    "device: %s\n"), type);
					goto spec_out;
				}
				nlogs++;
			}

			/*
			 * Gather leaves for this group: everything up to the
			 * next grouping keyword (or end of argv).
			 */
			for (c = 1; c < argc; c++) {
				if (is_grouping(argv[c], NULL, NULL) != NULL)
					break;

				children++;
				child = realloc(child,
				    children * sizeof (nvlist_t *));
				if (child == NULL)
					zpool_no_memory();
				if ((nv = make_leaf_vdev(argv[c],
				    !(is_log || is_special || is_dedup ||
				    is_spare), ashift)) == NULL) {
					/* c is reused here as a free index */
					for (c = 0; c < children - 1; c++)
						nvlist_free(child[c]);
					free(child);
					goto spec_out;
				}

				child[children - 1] = nv;
			}

			if (children < mindev) {
				(void) fprintf(stderr, gettext("invalid vdev "
				    "specification: %s requires at least %d "
				    "devices\n"), argv[0], mindev);
				for (c = 0; c < children; c++)
					nvlist_free(child[c]);
				free(child);
				goto spec_out;
			}

			if (children > maxdev) {
				(void) fprintf(stderr, gettext("invalid vdev "
				    "specification: %s supports no more than "
				    "%d devices\n"), argv[0], maxdev);
				for (c = 0; c < children; c++)
					nvlist_free(child[c]);
				free(child);
				goto spec_out;
			}

			/* Consume the group keyword plus its leaves. */
			argc -= c;
			argv += c;

			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
				/* Ownership of 'child' moves to 'spares'. */
				spares = child;
				nspares = children;
				continue;
			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
				/* Ownership of 'child' moves to 'l2cache'. */
				l2cache = child;
				nl2cache = children;
				continue;
			} else {
				/* create a top-level vdev with children */
				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
				    0) == 0);
				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
				    type) == 0);
				verify(nvlist_add_uint64(nv,
				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
				if (is_log) {
					verify(nvlist_add_string(nv,
					    ZPOOL_CONFIG_ALLOCATION_BIAS,
					    VDEV_ALLOC_BIAS_LOG) == 0);
				}
				if (is_special) {
					verify(nvlist_add_string(nv,
					    ZPOOL_CONFIG_ALLOCATION_BIAS,
					    VDEV_ALLOC_BIAS_SPECIAL) == 0);
				}
				if (is_dedup) {
					verify(nvlist_add_string(nv,
					    ZPOOL_CONFIG_ALLOCATION_BIAS,
					    VDEV_ALLOC_BIAS_DEDUP) == 0);
				}
				if (ashift > 0) {
					fnvlist_add_uint64(nv,
					    ZPOOL_CONFIG_ASHIFT, ashift);
				}
				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
					/* mindev was parity + 1 */
					verify(nvlist_add_uint64(nv,
					    ZPOOL_CONFIG_NPARITY,
					    mindev - 1) == 0);
				}
				if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
					if (draid_config_by_type(nv,
					    fulltype, children) != 0) {
						for (c = 0; c < children; c++)
							nvlist_free(child[c]);
						free(child);
						goto spec_out;
					}
				}
				verify(nvlist_add_nvlist_array(nv,
				    ZPOOL_CONFIG_CHILDREN,
				    (const nvlist_t **)child, children) == 0);

				/* children were copied into nv; free ours */
				for (c = 0; c < children; c++)
					nvlist_free(child[c]);
				free(child);
			}
		} else {
			/*
			 * We have a device.  Pass off to make_leaf_vdev() to
			 * construct the appropriate nvlist describing the vdev.
			 */
			if ((nv = make_leaf_vdev(argv[0], !(is_log ||
			    is_special || is_dedup || is_spare),
			    ashift)) == NULL)
				goto spec_out;

			verify(nvlist_add_uint64(nv,
			    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
			if (is_log) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_ALLOCATION_BIAS,
				    VDEV_ALLOC_BIAS_LOG) == 0);
				nlogs++;
			}

			if (is_special) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_ALLOCATION_BIAS,
				    VDEV_ALLOC_BIAS_SPECIAL) == 0);
			}
			if (is_dedup) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_ALLOCATION_BIAS,
				    VDEV_ALLOC_BIAS_DEDUP) == 0);
			}
			argc--;
			argv++;
		}

		toplevels++;
		top = realloc(top, toplevels * sizeof (nvlist_t *));
		if (top == NULL)
			zpool_no_memory();
		top[toplevels - 1] = nv;
	}

	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
		(void) fprintf(stderr, gettext("invalid vdev "
		    "specification: at least one toplevel vdev must be "
		    "specified\n"));
		goto spec_out;
	}

	if (seen_logs && nlogs == 0) {
		(void) fprintf(stderr, gettext("invalid vdev specification: "
		    "log requires at least 1 device\n"));
		goto spec_out;
	}

	/*
	 * Finally, create nvroot and add all top-level vdevs to it.
	 */
	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)top, toplevels) == 0);
	if (nspares != 0)
		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    (const nvlist_t **)spares, nspares) == 0);
	if (nl2cache != 0)
		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
		    (const nvlist_t **)l2cache, nl2cache) == 0);

spec_out:
	/*
	 * Reached on both success and failure: the nvlist_add_nvlist_array()
	 * calls above copy their input, so our local arrays are always freed
	 * here.  On failure nvroot is still NULL.
	 */
	for (t = 0; t < toplevels; t++)
		nvlist_free(top[t]);
	for (t = 0; t < nspares; t++)
		nvlist_free(spares[t]);
	for (t = 0; t < nl2cache; t++)
		nvlist_free(l2cache[t]);

	free(spares);
	free(l2cache);
	free(top);

	return (nvroot);
}
1792
1793
nvlist_t *
1794
split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1795
splitflags_t flags, int argc, char **argv)
1796
{
1797
nvlist_t *newroot = NULL, **child;
1798
uint_t c, children;
1799
1800
if (argc > 0) {
1801
if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1802
(void) fprintf(stderr, gettext("Unable to build a "
1803
"pool from the specified devices\n"));
1804
return (NULL);
1805
}
1806
1807
if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) {
1808
nvlist_free(newroot);
1809
return (NULL);
1810
}
1811
1812
/* avoid any tricks in the spec */
1813
verify(nvlist_lookup_nvlist_array(newroot,
1814
ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1815
for (c = 0; c < children; c++) {
1816
const char *path;
1817
const char *type;
1818
int min, max;
1819
1820
verify(nvlist_lookup_string(child[c],
1821
ZPOOL_CONFIG_PATH, &path) == 0);
1822
if ((type = is_grouping(path, &min, &max)) != NULL) {
1823
(void) fprintf(stderr, gettext("Cannot use "
1824
"'%s' as a device for splitting\n"), type);
1825
nvlist_free(newroot);
1826
return (NULL);
1827
}
1828
}
1829
}
1830
1831
if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1832
nvlist_free(newroot);
1833
return (NULL);
1834
}
1835
1836
return (newroot);
1837
}
1838
1839
static int
1840
num_normal_vdevs(nvlist_t *nvroot)
1841
{
1842
nvlist_t **top;
1843
uint_t t, toplevels, normal = 0;
1844
1845
verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1846
&top, &toplevels) == 0);
1847
1848
for (t = 0; t < toplevels; t++) {
1849
uint64_t log = B_FALSE;
1850
1851
(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
1852
if (log)
1853
continue;
1854
if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
1855
continue;
1856
1857
normal++;
1858
}
1859
1860
return (normal);
1861
}
1862
1863
/*
 * Get and validate the contents of the given vdev specification. This ensures
 * that the nvlist returned is well-formed, that all the devices exist, and that
 * they are not currently in use by any other known consumer. The 'poolconfig'
 * parameter is the current configuration of the pool when adding devices to an
 * existing pool, and is used to perform additional checks, such as changing the
 * replication level of the pool. It can be 'NULL' to indicate that this is a
 * new pool. The 'force' flag controls whether devices should be forcefully
 * added, even if they appear in use.
 *
 * 'check_rep' enables replication-level validation, 'replacing' marks a
 * 'zpool replace' operation, and 'dryrun' suppresses on-disk labeling.
 * Returns the new root nvlist on success, NULL on any failure.
 */
nvlist_t *
make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
{
	nvlist_t *newroot;
	nvlist_t *poolconfig = NULL;
	/* Record force in file-scope state used by later device checks. */
	is_force = force;

	/*
	 * Construct the vdev specification.  If this is successful, we know
	 * that we have a valid specification, and that all devices can be
	 * opened.
	 */
	if ((newroot = construct_spec(props, argc, argv)) == NULL)
		return (NULL);

	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {
		nvlist_free(newroot);
		return (NULL);
	}

	/*
	 * Validate each device to make sure that it's not shared with another
	 * subsystem.  We do this even if 'force' is set, because there are some
	 * uses (such as a dedicated dump device) that even '-f' cannot
	 * override.
	 */
	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
		nvlist_free(newroot);
		return (NULL);
	}

	/*
	 * Check the replication level of the given vdevs and report any errors
	 * found.  We include the existing pool spec, if any, as we need to
	 * catch changes against the existing replication level.
	 */
	if (check_rep && check_replication(poolconfig, newroot) != 0) {
		nvlist_free(newroot);
		return (NULL);
	}

	/*
	 * On pool create the new vdev spec must have one normal vdev.
	 */
	if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
		vdev_error(gettext("at least one general top-level vdev must "
		    "be specified\n"));
		nvlist_free(newroot);
		return (NULL);
	}

	/*
	 * Run through the vdev specification and label any whole disks found.
	 */
	if (!dryrun && make_disks(zhp, newroot, replacing) != 0) {
		nvlist_free(newroot);
		return (NULL);
	}

	return (newroot);
}
1935
1936