Path: blob/main/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
48288 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/2122/*23* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.24* Copyright (c) 2013, 2018 by Delphix. All rights reserved.25* Copyright (c) 2016, 2017 Intel Corporation.26* Copyright 2016 Igor Kozhukhov <[email protected]>.27*/2829/*30* Functions to convert between a list of vdevs and an nvlist representing the31* configuration. Each entry in the list can be one of:32*33* Device vdevs34* disk=(path=..., devid=...)35* file=(path=...)36*37* Group vdevs38* raidz[1|2]=(...)39* mirror=(...)40*41* Hot spares42*43* While the underlying implementation supports it, group vdevs cannot contain44* other group vdevs. All userland verification of devices is contained within45* this file. If successful, the nvlist returned can be passed directly to the46* kernel; we've done as much verification as possible in userland.47*48* Hot spares are a special case, and passed down as an array of disk vdevs, at49* the same level as the root of the vdev tree.50*51* The only function exported by this file is 'make_root_vdev'. The52* function performs several passes:53*54* 1. Construct the vdev specification. Performs syntax validation and55* makes sure each device is valid.56* 2. Check for devices in use. Using libblkid to make sure that no57* devices are also in use. Some can be overridden using the 'force'58* flag, others cannot.59* 3. Check for replication errors if the 'force' flag is not specified.60* validates that the replication level is consistent across the61* entire pool.62* 4. Call libzfs to label any whole disks with an EFI label.63*/6465#include <assert.h>66#include <ctype.h>67#include <errno.h>68#include <fcntl.h>69#include <libintl.h>70#include <libnvpair.h>71#include <libzutil.h>72#include <limits.h>73#include <sys/spa.h>74#include <stdio.h>75#include <string.h>76#include <unistd.h>77#include "zpool_util.h"78#include <sys/zfs_context.h>79#include <sys/stat.h>8081/*82* For any given vdev specification, we can have multiple errors. The83* vdev_error() function keeps track of whether we have seen an error yet, and84* prints out a header if its the first error we've seen.85*/86boolean_t error_seen;87boolean_t is_force;8889void90vdev_error(const char *fmt, ...)91{92va_list ap;9394if (!error_seen) {95(void) fprintf(stderr, gettext("invalid vdev specification\n"));96if (!is_force)97(void) fprintf(stderr, gettext("use '-f' to override "98"the following errors:\n"));99else100(void) fprintf(stderr, gettext("the following errors "101"must be manually repaired:\n"));102error_seen = B_TRUE;103}104105va_start(ap, fmt);106(void) vfprintf(stderr, fmt, ap);107va_end(ap);108}109110/*111* Check that a file is valid. All we can do in this case is check that it's112* not in use by another pool, and not in use by swap.113*/114int115check_file_generic(const char *file, boolean_t force, boolean_t isspare)116{117char *name;118int fd;119int ret = 0;120pool_state_t state;121boolean_t inuse;122123if ((fd = open(file, O_RDONLY)) < 0)124return (0);125126if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {127const char *desc;128129switch (state) {130case POOL_STATE_ACTIVE:131desc = gettext("active");132break;133134case POOL_STATE_EXPORTED:135desc = gettext("exported");136break;137138case POOL_STATE_POTENTIALLY_ACTIVE:139desc = gettext("potentially active");140break;141142default:143desc = gettext("unknown");144break;145}146147/*148* Allow hot spares to be shared between pools.149*/150if (state == POOL_STATE_SPARE && isspare) {151free(name);152(void) close(fd);153return (0);154}155156if (state == POOL_STATE_ACTIVE ||157state == POOL_STATE_SPARE || !force) {158switch (state) {159case POOL_STATE_SPARE:160vdev_error(gettext("%s is reserved as a hot "161"spare for pool %s\n"), file, name);162break;163default:164vdev_error(gettext("%s is part of %s pool "165"'%s'\n"), file, desc, name);166break;167}168ret = -1;169}170171free(name);172}173174(void) close(fd);175return (ret);176}177178/*179* This may be a shorthand device path or it could be total gibberish.180* Check to see if it is a known device available in zfs_vdev_paths.181* As part of this check, see if we've been given an entire disk182* (minus the slice number).183*/184static int185is_shorthand_path(const char *arg, char *path, size_t path_size,186struct stat64 *statbuf, boolean_t *wholedisk)187{188int error;189190error = zfs_resolve_shortname(arg, path, path_size);191if (error == 0) {192*wholedisk = zfs_dev_is_whole_disk(path);193if (*wholedisk || (stat64(path, statbuf) == 0))194return (0);195}196197strlcpy(path, arg, path_size);198memset(statbuf, 0, sizeof (*statbuf));199*wholedisk = B_FALSE;200201return (error);202}203204/*205* Determine if the given path is a hot spare within the given configuration.206* If no configuration is given we rely solely on the label.207*/208static boolean_t209is_spare(nvlist_t *config, const char *path)210{211int fd;212pool_state_t state;213char *name = NULL;214nvlist_t *label;215uint64_t guid, spareguid;216nvlist_t *nvroot;217nvlist_t **spares;218uint_t i, nspares;219boolean_t inuse;220221if (zpool_is_draid_spare(path))222return (B_TRUE);223224if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)225return (B_FALSE);226227if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||228!inuse ||229state != POOL_STATE_SPARE ||230zpool_read_label(fd, &label, NULL) != 0) {231free(name);232(void) close(fd);233return (B_FALSE);234}235free(name);236(void) close(fd);237238if (config == NULL) {239nvlist_free(label);240return (B_TRUE);241}242243verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);244nvlist_free(label);245246verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,247&nvroot) == 0);248if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,249&spares, &nspares) == 0) {250for (i = 0; i < nspares; i++) {251verify(nvlist_lookup_uint64(spares[i],252ZPOOL_CONFIG_GUID, &spareguid) == 0);253if (spareguid == guid)254return (B_TRUE);255}256}257258return (B_FALSE);259}260261/*262* Create a leaf vdev. Determine if this is a file or a device. If it's a263* device, fill in the device id to make a complete nvlist. Valid forms for a264* leaf vdev are:265*266* /dev/xxx Complete disk path267* /xxx Full path to file268* xxx Shorthand for <zfs_vdev_paths>/xxx269* draid* Virtual dRAID spare270*/271static nvlist_t *272make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift)273{274char path[MAXPATHLEN];275struct stat64 statbuf;276nvlist_t *vdev = NULL;277const char *type = NULL;278boolean_t wholedisk = B_FALSE;279int err;280281/*282* Determine what type of vdev this is, and put the full path into283* 'path'. We detect whether this is a device of file afterwards by284* checking the st_mode of the file.285*/286if (arg[0] == '/') {287/*288* Complete device or file path. Exact type is determined by289* examining the file descriptor afterwards. Symbolic links290* are resolved to their real paths to determine whole disk291* and S_ISBLK/S_ISREG type checks. However, we are careful292* to store the given path as ZPOOL_CONFIG_PATH to ensure we293* can leverage udev's persistent device labels.294*/295if (realpath(arg, path) == NULL) {296(void) fprintf(stderr,297gettext("cannot resolve path '%s'\n"), arg);298return (NULL);299}300301wholedisk = zfs_dev_is_whole_disk(path);302if (!wholedisk && (stat64(path, &statbuf) != 0)) {303(void) fprintf(stderr,304gettext("cannot open '%s': %s\n"),305path, strerror(errno));306return (NULL);307}308309/* After whole disk check restore original passed path */310strlcpy(path, arg, sizeof (path));311} else if (zpool_is_draid_spare(arg)) {312if (!is_primary) {313(void) fprintf(stderr,314gettext("cannot open '%s': dRAID spares can only "315"be used to replace primary vdevs\n"), arg);316return (NULL);317}318319wholedisk = B_TRUE;320strlcpy(path, arg, sizeof (path));321type = VDEV_TYPE_DRAID_SPARE;322} else {323err = is_shorthand_path(arg, path, sizeof (path),324&statbuf, &wholedisk);325if (err != 0) {326/*327* If we got ENOENT, then the user gave us328* gibberish, so try to direct them with a329* reasonable error message. Otherwise,330* regurgitate strerror() since it's the best we331* can do.332*/333if (err == ENOENT) {334(void) fprintf(stderr,335gettext("cannot open '%s': no such "336"device in %s\n"), arg, DISK_ROOT);337(void) fprintf(stderr,338gettext("must be a full path or "339"shorthand device name\n"));340return (NULL);341} else {342(void) fprintf(stderr,343gettext("cannot open '%s': %s\n"),344path, strerror(errno));345return (NULL);346}347}348}349350if (type == NULL) {351/*352* Determine whether this is a device or a file.353*/354if (wholedisk || S_ISBLK(statbuf.st_mode)) {355type = VDEV_TYPE_DISK;356} else if (S_ISREG(statbuf.st_mode)) {357type = VDEV_TYPE_FILE;358} else {359fprintf(stderr, gettext("cannot use '%s': must "360"be a block device or regular file\n"), path);361return (NULL);362}363}364365/*366* Finally, we have the complete device or file, and we know that it is367* acceptable to use. Construct the nvlist to describe this vdev. All368* vdevs have a 'path' element, and devices also have a 'devid' element.369*/370verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);371verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);372verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);373374/* Lookup and add the enclosure sysfs path (if exists) */375update_vdev_config_dev_sysfs_path(vdev, path,376ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);377378if (strcmp(type, VDEV_TYPE_DISK) == 0)379verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,380(uint64_t)wholedisk) == 0);381382/*383* If the device is known to incorrectly report its physical sector384* size explicitly provide the known correct value.385*/386if (ashift == 0) {387int sector_size;388389if (check_sector_size_database(path, §or_size) == B_TRUE)390ashift = highbit64(sector_size) - 1;391}392393if (ashift > 0)394(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);395396return (vdev);397}398399/*400* Go through and verify the replication level of the pool is consistent.401* Performs the following checks:402*403* For the new spec, verifies that devices in mirrors and raidz are the404* same size.405*406* If the current configuration already has inconsistent replication407* levels, ignore any other potential problems in the new spec.408*409* Otherwise, make sure that the current spec (if there is one) and the new410* spec have consistent replication levels.411*412* If there is no current spec (create), make sure new spec has at least413* one general purpose vdev.414*/415typedef struct replication_level {416const char *zprl_type;417uint64_t zprl_children;418uint64_t zprl_parity;419} replication_level_t;420421#define ZPOOL_FUZZ (16 * 1024 * 1024)422423/*424* N.B. For the purposes of comparing replication levels dRAID can be425* considered functionally equivalent to raidz.426*/427static boolean_t428is_raidz_mirror(replication_level_t *a, replication_level_t *b,429replication_level_t **raidz, replication_level_t **mirror)430{431if ((strcmp(a->zprl_type, "raidz") == 0 ||432strcmp(a->zprl_type, "draid") == 0) &&433strcmp(b->zprl_type, "mirror") == 0) {434*raidz = a;435*mirror = b;436return (B_TRUE);437}438return (B_FALSE);439}440441/*442* Comparison for determining if dRAID and raidz where passed in either order.443*/444static boolean_t445is_raidz_draid(replication_level_t *a, replication_level_t *b)446{447if ((strcmp(a->zprl_type, "raidz") == 0 ||448strcmp(a->zprl_type, "draid") == 0) &&449(strcmp(b->zprl_type, "raidz") == 0 ||450strcmp(b->zprl_type, "draid") == 0)) {451return (B_TRUE);452}453454return (B_FALSE);455}456457/*458* Given a list of toplevel vdevs, return the current replication level. If459* the config is inconsistent, then NULL is returned. If 'fatal' is set, then460* an error message will be displayed for each self-inconsistent vdev.461*/462static replication_level_t *463get_replication(nvlist_t *nvroot, boolean_t fatal)464{465nvlist_t **top;466uint_t t, toplevels;467nvlist_t **child;468uint_t c, children;469nvlist_t *nv;470const char *type;471replication_level_t lastrep = {0};472replication_level_t rep;473replication_level_t *ret;474replication_level_t *raidz, *mirror;475boolean_t dontreport;476477ret = safe_malloc(sizeof (replication_level_t));478479verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,480&top, &toplevels) == 0);481482for (t = 0; t < toplevels; t++) {483uint64_t is_log = B_FALSE;484485nv = top[t];486487/*488* For separate logs we ignore the top level vdev replication489* constraints.490*/491(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);492if (is_log)493continue;494495/*496* Ignore holes introduced by removing aux devices, along497* with indirect vdevs introduced by previously removed498* vdevs.499*/500verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);501if (strcmp(type, VDEV_TYPE_HOLE) == 0 ||502strcmp(type, VDEV_TYPE_INDIRECT) == 0)503continue;504505if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,506&child, &children) != 0) {507/*508* This is a 'file' or 'disk' vdev.509*/510rep.zprl_type = type;511rep.zprl_children = 1;512rep.zprl_parity = 0;513} else {514int64_t vdev_size;515516/*517* This is a mirror or RAID-Z vdev. Go through and make518* sure the contents are all the same (files vs. disks),519* keeping track of the number of elements in the520* process.521*522* We also check that the size of each vdev (if it can523* be determined) is the same.524*/525rep.zprl_type = type;526rep.zprl_children = 0;527528if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||529strcmp(type, VDEV_TYPE_DRAID) == 0) {530verify(nvlist_lookup_uint64(nv,531ZPOOL_CONFIG_NPARITY,532&rep.zprl_parity) == 0);533assert(rep.zprl_parity != 0);534} else {535rep.zprl_parity = 0;536}537538/*539* The 'dontreport' variable indicates that we've540* already reported an error for this spec, so don't541* bother doing it again.542*/543type = NULL;544dontreport = 0;545vdev_size = -1LL;546for (c = 0; c < children; c++) {547nvlist_t *cnv = child[c];548const char *path;549struct stat64 statbuf;550const char *childtype;551int fd, err;552553rep.zprl_children++;554555verify(nvlist_lookup_string(cnv,556ZPOOL_CONFIG_TYPE, &childtype) == 0);557558/*559* If this is a replacing or spare vdev, then560* get the real first child of the vdev: do this561* in a loop because replacing and spare vdevs562* can be nested.563*/564while (strcmp(childtype,565VDEV_TYPE_REPLACING) == 0 ||566strcmp(childtype, VDEV_TYPE_SPARE) == 0) {567nvlist_t **rchild;568uint_t rchildren;569570verify(nvlist_lookup_nvlist_array(cnv,571ZPOOL_CONFIG_CHILDREN, &rchild,572&rchildren) == 0);573assert(rchildren == 2);574cnv = rchild[0];575576verify(nvlist_lookup_string(cnv,577ZPOOL_CONFIG_TYPE,578&childtype) == 0);579}580581verify(nvlist_lookup_string(cnv,582ZPOOL_CONFIG_PATH, &path) == 0);583584/*585* Skip active spares they should never cause586* the pool to be evaluated as inconsistent.587*/588if (is_spare(NULL, path))589continue;590591/*592* If we have a raidz/mirror that combines disks593* with files, only report it as an error when594* fatal is set to ensure all the replication595* checks aren't skipped in check_replication().596*/597if (fatal && !dontreport && type != NULL &&598strcmp(type, childtype) != 0) {599if (ret != NULL)600free(ret);601ret = NULL;602vdev_error(gettext(603"mismatched replication "604"level: %s contains both "605"files and devices\n"),606rep.zprl_type);607dontreport = B_TRUE;608}609610/*611* According to stat(2), the value of 'st_size'612* is undefined for block devices and character613* devices. But there is no effective way to614* determine the real size in userland.615*616* Instead, we'll take advantage of an617* implementation detail of spec_size(). If the618* device is currently open, then we (should)619* return a valid size.620*621* If we still don't get a valid size (indicated622* by a size of 0 or MAXOFFSET_T), then ignore623* this device altogether.624*/625if ((fd = open(path, O_RDONLY)) >= 0) {626err = fstat64_blk(fd, &statbuf);627(void) close(fd);628} else {629err = stat64(path, &statbuf);630}631632if (err != 0 ||633statbuf.st_size == 0 ||634statbuf.st_size == MAXOFFSET_T)635continue;636637int64_t size = statbuf.st_size;638639/*640* Also make sure that devices and641* slices have a consistent size. If642* they differ by a significant amount643* (~16MB) then report an error.644*/645if (!dontreport &&646(vdev_size != -1LL &&647(llabs(size - vdev_size) >648ZPOOL_FUZZ))) {649if (ret != NULL)650free(ret);651ret = NULL;652if (fatal)653vdev_error(gettext(654"%s contains devices of "655"different sizes\n"),656rep.zprl_type);657else658return (NULL);659dontreport = B_TRUE;660}661662type = childtype;663vdev_size = size;664}665}666667/*668* At this point, we have the replication of the last toplevel669* vdev in 'rep'. Compare it to 'lastrep' to see if it is670* different.671*/672if (lastrep.zprl_type != NULL) {673if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||674is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {675/*676* Accepted raidz and mirror when they can677* handle the same number of disk failures.678*/679if (raidz->zprl_parity !=680mirror->zprl_children - 1) {681if (ret != NULL)682free(ret);683ret = NULL;684if (fatal)685vdev_error(gettext(686"mismatched replication "687"level: "688"%s and %s vdevs with "689"different redundancy, "690"%llu vs. %llu (%llu-way) "691"are present\n"),692raidz->zprl_type,693mirror->zprl_type,694(u_longlong_t)695raidz->zprl_parity,696(u_longlong_t)697mirror->zprl_children - 1,698(u_longlong_t)699mirror->zprl_children);700else701return (NULL);702}703} else if (is_raidz_draid(&lastrep, &rep)) {704/*705* Accepted raidz and draid when they can706* handle the same number of disk failures.707*/708if (lastrep.zprl_parity != rep.zprl_parity) {709if (ret != NULL)710free(ret);711ret = NULL;712if (fatal)713vdev_error(gettext(714"mismatched replication "715"level: %s and %s vdevs "716"with different "717"redundancy, %llu vs. "718"%llu are present\n"),719lastrep.zprl_type,720rep.zprl_type,721(u_longlong_t)722lastrep.zprl_parity,723(u_longlong_t)724rep.zprl_parity);725else726return (NULL);727}728} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=7290) {730if (ret != NULL)731free(ret);732ret = NULL;733if (fatal)734vdev_error(gettext(735"mismatched replication level: "736"both %s and %s vdevs are "737"present\n"),738lastrep.zprl_type, rep.zprl_type);739else740return (NULL);741} else if (lastrep.zprl_parity != rep.zprl_parity) {742if (ret)743free(ret);744ret = NULL;745if (fatal)746vdev_error(gettext(747"mismatched replication level: "748"both %llu and %llu device parity "749"%s vdevs are present\n"),750(u_longlong_t)751lastrep.zprl_parity,752(u_longlong_t)rep.zprl_parity,753rep.zprl_type);754else755return (NULL);756} else if (lastrep.zprl_children != rep.zprl_children) {757if (ret)758free(ret);759ret = NULL;760if (fatal)761vdev_error(gettext(762"mismatched replication level: "763"both %llu-way and %llu-way %s "764"vdevs are present\n"),765(u_longlong_t)766lastrep.zprl_children,767(u_longlong_t)768rep.zprl_children,769rep.zprl_type);770else771return (NULL);772}773}774lastrep = rep;775}776777if (ret != NULL)778*ret = rep;779780return (ret);781}782783/*784* Check the replication level of the vdev spec against the current pool. Calls785* get_replication() to make sure the new spec is self-consistent. If the pool786* has a consistent replication level, then we ignore any errors. Otherwise,787* report any difference between the two.788*/789static int790check_replication(nvlist_t *config, nvlist_t *newroot)791{792nvlist_t **child;793uint_t children;794replication_level_t *current = NULL, *new;795replication_level_t *raidz, *mirror;796int ret;797798/*799* If we have a current pool configuration, check to see if it's800* self-consistent. If not, simply return success.801*/802if (config != NULL) {803nvlist_t *nvroot;804805verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,806&nvroot) == 0);807if ((current = get_replication(nvroot, B_FALSE)) == NULL)808return (0);809}810/*811* for spares there may be no children, and therefore no812* replication level to check813*/814if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,815&child, &children) != 0) || (children == 0)) {816free(current);817return (0);818}819820/*821* If all we have is logs then there's no replication level to check.822*/823if (num_logs(newroot) == children) {824free(current);825return (0);826}827828/*829* Get the replication level of the new vdev spec, reporting any830* inconsistencies found.831*/832if ((new = get_replication(newroot, B_TRUE)) == NULL) {833free(current);834return (-1);835}836837/*838* Check to see if the new vdev spec matches the replication level of839* the current pool.840*/841ret = 0;842if (current != NULL) {843if (is_raidz_mirror(current, new, &raidz, &mirror) ||844is_raidz_mirror(new, current, &raidz, &mirror)) {845if (raidz->zprl_parity != mirror->zprl_children - 1) {846vdev_error(gettext(847"mismatched replication level: pool and "848"new vdev with different redundancy, %s "849"and %s vdevs, %llu vs. %llu (%llu-way)\n"),850raidz->zprl_type,851mirror->zprl_type,852(u_longlong_t)raidz->zprl_parity,853(u_longlong_t)mirror->zprl_children - 1,854(u_longlong_t)mirror->zprl_children);855ret = -1;856}857} else if (is_raidz_draid(current, new)) {858if (current->zprl_parity != new->zprl_parity) {859vdev_error(gettext(860"mismatched replication level: pool and "861"new vdev with different redundancy, %s "862"and %s vdevs, %llu vs. %llu\n"),863current->zprl_type,864new->zprl_type,865(u_longlong_t)current->zprl_parity,866(u_longlong_t)new->zprl_parity);867ret = -1;868}869} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {870vdev_error(gettext(871"mismatched replication level: pool uses %s "872"and new vdev is %s\n"),873current->zprl_type, new->zprl_type);874ret = -1;875} else if (current->zprl_parity != new->zprl_parity) {876vdev_error(gettext(877"mismatched replication level: pool uses %llu "878"device parity and new vdev uses %llu\n"),879(u_longlong_t)current->zprl_parity,880(u_longlong_t)new->zprl_parity);881ret = -1;882} else if (current->zprl_children != new->zprl_children) {883vdev_error(gettext(884"mismatched replication level: pool uses %llu-way "885"%s and new vdev uses %llu-way %s\n"),886(u_longlong_t)current->zprl_children,887current->zprl_type,888(u_longlong_t)new->zprl_children,889new->zprl_type);890ret = -1;891}892}893894free(new);895if (current != NULL)896free(current);897898return (ret);899}900901static int902zero_label(const char *path)903{904const int size = 4096;905char buf[size];906int err, fd;907908if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {909(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),910path, strerror(errno));911return (-1);912}913914memset(buf, 0, size);915err = write(fd, buf, size);916(void) fdatasync(fd);917(void) close(fd);918919if (err == -1) {920(void) fprintf(stderr, gettext("cannot zero first %d bytes "921"of '%s': %s\n"), size, path, strerror(errno));922return (-1);923}924925if (err != size) {926(void) fprintf(stderr, gettext("could only zero %d/%d bytes "927"of '%s'\n"), err, size, path);928return (-1);929}930931return (0);932}933934static void935lines_to_stderr(char *lines[], int lines_cnt)936{937int i;938for (i = 0; i < lines_cnt; i++) {939fprintf(stderr, "%s\n", lines[i]);940}941}942943/*944* Go through and find any whole disks in the vdev specification, labelling them945* as appropriate. When constructing the vdev spec, we were unable to open this946* device in order to provide a devid. Now that we have labelled the disk and947* know that slice 0 is valid, we can construct the devid now.948*949* If the disk was already labeled with an EFI label, we will have gotten the950* devid already (because we were able to open the whole disk). Otherwise, we951* need to get the devid after we label the disk.952*/953static int954make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing)955{956nvlist_t **child;957uint_t c, children;958const char *type, *path;959char devpath[MAXPATHLEN];960char udevpath[MAXPATHLEN];961uint64_t wholedisk;962struct stat64 statbuf;963int is_exclusive = 0;964int fd;965int ret;966967verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);968969if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,970&child, &children) != 0) {971972if (strcmp(type, VDEV_TYPE_DISK) != 0)973return (0);974975/*976* We have a disk device. If this is a whole disk write977* out the efi partition table, otherwise write zero's to978* the first 4k of the partition. This is to ensure that979* libblkid will not misidentify the partition due to a980* magic value left by the previous filesystem.981*/982verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));983verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,984&wholedisk));985986if (!wholedisk) {987/*988* Update device id string for mpath nodes (Linux only)989*/990if (is_mpath_whole_disk(path))991update_vdev_config_dev_strs(nv);992993if (!is_spare(NULL, path))994(void) zero_label(path);995return (0);996}997998if (realpath(path, devpath) == NULL) {999ret = errno;1000(void) fprintf(stderr,1001gettext("cannot resolve path '%s'\n"), path);1002return (ret);1003}10041005/*1006* Remove any previously existing symlink from a udev path to1007* the device before labeling the disk. This ensures that1008* only newly created links are used. Otherwise there is a1009* window between when udev deletes and recreates the link1010* during which access attempts will fail with ENOENT.1011*/1012strlcpy(udevpath, path, MAXPATHLEN);1013(void) zfs_append_partition(udevpath, MAXPATHLEN);10141015fd = open(devpath, O_RDWR|O_EXCL);1016if (fd == -1) {1017if (errno == EBUSY)1018is_exclusive = 1;1019#ifdef __FreeBSD__1020if (errno == EPERM)1021is_exclusive = 1;1022#endif1023} else {1024(void) close(fd);1025}10261027/*1028* If the partition exists, contains a valid spare label,1029* and is opened exclusively there is no need to partition1030* it. Hot spares have already been partitioned and are1031* held open exclusively by the kernel as a safety measure.1032*1033* If the provided path is for a /dev/disk/ device its1034* symbolic link will be removed, partition table created,1035* and then block until udev creates the new link.1036*/1037if (!is_exclusive && !is_spare(NULL, udevpath)) {1038char *devnode = strrchr(devpath, '/') + 1;1039char **lines = NULL;1040int lines_cnt = 0;10411042ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));1043if (ret == 0) {1044ret = lstat64(udevpath, &statbuf);1045if (ret == 0 && S_ISLNK(statbuf.st_mode))1046(void) unlink(udevpath);1047}10481049/*1050* When labeling a pool the raw device node name1051* is provided as it appears under /dev/.1052*1053* Note that 'zhp' will be NULL when we're creating a1054* pool.1055*/1056if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode,1057nv, zhp == NULL ? "create" :1058replacing ? "replace" : "add", &lines,1059&lines_cnt) != 0) {1060(void) fprintf(stderr,1061gettext(1062"Error preparing/labeling disk.\n"));1063if (lines_cnt > 0) {1064(void) fprintf(stderr,1065gettext("zfs_prepare_disk output:\n"));1066lines_to_stderr(lines, lines_cnt);1067}10681069libzfs_free_str_array(lines, lines_cnt);1070return (-1);1071}1072libzfs_free_str_array(lines, lines_cnt);10731074/*1075* Wait for udev to signal the device is available1076* by the provided path.1077*/1078ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);1079if (ret) {1080(void) fprintf(stderr,1081gettext("missing link: %s was "1082"partitioned but %s is missing\n"),1083devnode, udevpath);1084return (ret);1085}10861087ret = zero_label(udevpath);1088if (ret)1089return (ret);1090}10911092/*1093* Update the path to refer to the partition. The presence of1094* the 'whole_disk' field indicates to the CLI that we should1095* chop off the partition number when displaying the device in1096* future output.1097*/1098verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);10991100/*1101* Update device id strings for whole disks (Linux only)1102*/1103update_vdev_config_dev_strs(nv);11041105return (0);1106}11071108for (c = 0; c < children; c++)1109if ((ret = make_disks(zhp, child[c], replacing)) != 0)1110return (ret);11111112if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,1113&child, &children) == 0)1114for (c = 0; c < children; c++)1115if ((ret = make_disks(zhp, child[c], replacing)) != 0)1116return (ret);11171118if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,1119&child, &children) == 0)1120for (c = 0; c < children; c++)1121if ((ret = make_disks(zhp, child[c], replacing)) != 0)1122return (ret);11231124return (0);1125}11261127/*1128* Go through and find any devices that are in use. We rely on libdiskmgt for1129* the majority of this task.1130*/1131static boolean_t1132is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,1133boolean_t replacing, boolean_t isspare)1134{1135nvlist_t **child;1136uint_t c, children;1137const char *type, *path;1138int ret = 0;1139char buf[MAXPATHLEN];1140uint64_t wholedisk = B_FALSE;1141boolean_t anyinuse = B_FALSE;11421143verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);11441145if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,1146&child, &children) != 0) {11471148verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));1149if (strcmp(type, VDEV_TYPE_DISK) == 0)1150verify(!nvlist_lookup_uint64(nv,1151ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));11521153/*1154* As a generic check, we look to see if this is a replace of a1155* hot spare within the same pool. If so, we allow it1156* regardless of what libblkid or zpool_in_use() says.1157*/1158if (replacing) {1159(void) strlcpy(buf, path, sizeof (buf));1160if (wholedisk) {1161ret = zfs_append_partition(buf, sizeof (buf));1162if (ret == -1)1163return (-1);1164}11651166if (is_spare(config, buf))1167return (B_FALSE);1168}11691170if (strcmp(type, VDEV_TYPE_DISK) == 0)1171ret = check_device(path, force, isspare, wholedisk);11721173else if (strcmp(type, VDEV_TYPE_FILE) == 0)1174ret = check_file(path, force, isspare);11751176return (ret != 0);1177}11781179for (c = 0; c < children; c++)1180if (is_device_in_use(config, child[c], force, replacing,1181B_FALSE))1182anyinuse = B_TRUE;11831184if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,1185&child, &children) == 0)1186for (c = 0; c < children; c++)1187if (is_device_in_use(config, child[c], force, replacing,1188B_TRUE))1189anyinuse = B_TRUE;11901191if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,1192&child, &children) == 0)1193for (c = 0; c < children; c++)1194if (is_device_in_use(config, child[c], force, replacing,1195B_FALSE))1196anyinuse = B_TRUE;11971198return (anyinuse);1199}12001201/*1202* Returns the parity level extracted from a raidz or draid type.1203* If the parity cannot be determined zero is returned.1204*/1205static int1206get_parity(const char *type)1207{1208long parity = 0;1209const char *p;12101211if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) {1212p = type + strlen(VDEV_TYPE_RAIDZ);12131214if (*p == '\0') {1215/* when unspecified default to single parity */1216return (1);1217} else if (*p == '0') {1218/* no zero prefixes allowed */1219return (0);1220} else {1221/* 0-3, no suffixes allowed */1222char *end;1223errno = 0;1224parity = strtol(p, &end, 10);1225if (errno != 0 || *end != '\0' ||1226parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) {1227return (0);1228}1229}1230} else if (strncmp(type, VDEV_TYPE_DRAID,1231strlen(VDEV_TYPE_DRAID)) == 0) {1232p = type + strlen(VDEV_TYPE_DRAID);12331234if (*p == '\0' || *p == ':') {1235/* when unspecified default to single parity */1236return (1);1237} else if (*p == '0') {1238/* no zero prefixes allowed */1239return (0);1240} else {1241/* 0-3, allowed suffixes: '\0' or ':' */1242char *end;1243errno = 0;1244parity = strtol(p, &end, 10);1245if (errno != 0 ||1246parity < 1 || parity > VDEV_DRAID_MAXPARITY ||1247(*end != '\0' && *end != ':')) {1248return (0);1249}1250}1251}12521253return ((int)parity);1254}12551256/*1257* Assign the minimum and maximum number of devices allowed for1258* the specified type. On error NULL is returned, otherwise the1259* type prefix is returned (raidz, mirror, etc).1260*/1261static const char *1262is_grouping(const char *type, int *mindev, int *maxdev)1263{1264int nparity;12651266if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||1267strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) {1268nparity = get_parity(type);1269if (nparity == 0)1270return (NULL);1271if (mindev != NULL)1272*mindev = nparity + 1;1273if (maxdev != NULL)1274*maxdev = 255;12751276if (strncmp(type, VDEV_TYPE_RAIDZ,1277strlen(VDEV_TYPE_RAIDZ)) == 0) {1278return (VDEV_TYPE_RAIDZ);1279} else {1280return (VDEV_TYPE_DRAID);1281}1282}12831284if (maxdev != NULL)1285*maxdev = INT_MAX;12861287if (strcmp(type, "mirror") == 0) {1288if (mindev != NULL)1289*mindev = 2;1290return (VDEV_TYPE_MIRROR);1291}12921293if (strcmp(type, "spare") == 0) {1294if (mindev != NULL)1295*mindev = 1;1296return (VDEV_TYPE_SPARE);1297}12981299if (strcmp(type, "log") == 0) {1300if (mindev != NULL)1301*mindev = 1;1302return (VDEV_TYPE_LOG);1303}13041305if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||1306strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {1307if (mindev != NULL)1308*mindev = 1;1309return (type);1310}13111312if (strcmp(type, "cache") == 0) {1313if (mindev != NULL)1314*mindev = 1;1315return (VDEV_TYPE_L2CACHE);1316}13171318return (NULL);1319}13201321/*1322* Extract the configuration parameters encoded in the dRAID type and1323* use them to generate a dRAID configuration. The expected format is:1324*1325* draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]1326*1327* The intent is to be able to generate a good configuration when no1328* additional information is provided. The only mandatory component1329* of the 'type' is the 'draid' prefix. If a value is not provided1330* then reasonable defaults are used. The optional components may1331* appear in any order but the d/s/c suffix is required.1332*1333* Valid inputs:1334* - data: number of data devices per group (1-255)1335* - parity: number of parity blocks per group (1-3)1336* - spares: number of distributed spare (0-100)1337* - children: total number of devices (1-255)1338*1339* Examples:1340* - zpool create tank draid <devices...>1341* - zpool create tank draid2:8d:51c:2s <devices...>1342*/1343static int1344draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)1345{1346uint64_t nparity;1347uint64_t nspares = 0;1348uint64_t ndata = UINT64_MAX;1349uint64_t ngroups = 1;1350long value;13511352if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)1353return (EINVAL);13541355nparity = (uint64_t)get_parity(type);1356if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {1357fprintf(stderr,1358gettext("invalid dRAID parity level %llu; must be "1359"between 1 and %d\n"), (u_longlong_t)nparity,1360VDEV_DRAID_MAXPARITY);1361return (EINVAL);1362}13631364char *p = (char *)type;1365while ((p = strchr(p, ':')) != NULL) {1366char *end;13671368p = p + 1;1369errno = 0;13701371if (!isdigit(p[0])) {1372(void) fprintf(stderr, gettext("invalid dRAID "1373"syntax; expected [:<number><c|d|s>] not '%s'\n"),1374type);1375return (EINVAL);1376}13771378/* Expected non-zero value with c/d/s suffix */1379value = strtol(p, &end, 10);1380char suffix = tolower(*end);1381if (errno != 0 ||1382(suffix != 'c' && suffix != 'd' && suffix != 's')) {1383(void) fprintf(stderr, gettext("invalid dRAID "1384"syntax; expected [:<number><c|d|s>] not '%s'\n"),1385type);1386return (EINVAL);1387}13881389if (suffix == 'c') {1390if ((uint64_t)value != children) {1391fprintf(stderr,1392gettext("invalid number of dRAID children; "1393"%llu required but %llu provided\n"),1394(u_longlong_t)value,1395(u_longlong_t)children);1396return (EINVAL);1397}1398} else if (suffix == 'd') {1399ndata = (uint64_t)value;1400} else if (suffix == 's') {1401nspares = (uint64_t)value;1402} else {1403verify(0); /* Unreachable */1404}1405}14061407/*1408* When a specific number of data disks is not provided limit a1409* redundancy group to 8 data disks. This value was selected to1410* provide a reasonable tradeoff between capacity and performance.1411*/1412if (ndata == UINT64_MAX) {1413if (children > nspares + nparity) {1414ndata = MIN(children - nspares - nparity, 8);1415} else {1416fprintf(stderr, gettext("request number of "1417"distributed spares %llu and parity level %llu\n"1418"leaves no disks available for data\n"),1419(u_longlong_t)nspares, (u_longlong_t)nparity);1420return (EINVAL);1421}1422}14231424/* Verify the maximum allowed group size is never exceeded. */1425if (ndata == 0 || (ndata + nparity > children - nspares)) {1426fprintf(stderr, gettext("requested number of dRAID data "1427"disks per group %llu is too high,\nat most %llu disks "1428"are available for data\n"), (u_longlong_t)ndata,1429(u_longlong_t)(children - nspares - nparity));1430return (EINVAL);1431}14321433/*1434* Verify the requested number of spares can be satisfied.1435* An arbitrary limit of 100 distributed spares is applied.1436*/1437if (nspares > 100 || nspares > (children - (ndata + nparity))) {1438fprintf(stderr,1439gettext("invalid number of dRAID spares %llu; additional "1440"disks would be required\n"), (u_longlong_t)nspares);1441return (EINVAL);1442}14431444/* Verify the requested number children is sufficient. */1445if (children < (ndata + nparity + nspares)) {1446fprintf(stderr, gettext("%llu disks were provided, but at "1447"least %llu disks are required for this config\n"),1448(u_longlong_t)children,1449(u_longlong_t)(ndata + nparity + nspares));1450}14511452if (children > VDEV_DRAID_MAX_CHILDREN) {1453fprintf(stderr, gettext("%llu disks were provided, but "1454"dRAID only supports up to %u disks"),1455(u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);1456}14571458/*1459* Calculate the minimum number of groups required to fill a slice.1460* This is the LCM of the stripe width (ndata + nparity) and the1461* number of data drives (children - nspares).1462*/1463while (ngroups * (ndata + nparity) % (children - nspares) != 0)1464ngroups++;14651466/* Store the basic dRAID configuration. */1467fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);1468fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);1469fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);1470fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);14711472return (0);1473}14741475/*1476* Construct a syntactically valid vdev specification,1477* and ensure that all devices and files exist and can be opened.1478* Note: we don't bother freeing anything in the error paths1479* because the program is just going to exit anyway.1480*/1481static nvlist_t *1482construct_spec(nvlist_t *props, int argc, char **argv)1483{1484nvlist_t *nvroot, *nv, **top, **spares, **l2cache;1485int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;1486const char *type, *fulltype;1487boolean_t is_log, is_special, is_dedup, is_spare;1488boolean_t seen_logs;1489uint64_t ashift = 0;14901491if (props != NULL) {1492const char *value = NULL;14931494if (nvlist_lookup_string(props,1495zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {1496if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {1497(void) fprintf(stderr,1498gettext("ashift must be a number.\n"));1499return (NULL);1500}1501if (ashift != 0 &&1502(ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {1503(void) fprintf(stderr,1504gettext("invalid 'ashift=%" PRIu64 "' "1505"property: only values between %" PRId32 " "1506"and %" PRId32 " are allowed.\n"),1507ashift, ASHIFT_MIN, ASHIFT_MAX);1508return (NULL);1509}1510}1511}15121513top = NULL;1514toplevels = 0;1515spares = NULL;1516l2cache = NULL;1517nspares = 0;1518nlogs = 0;1519nl2cache = 0;1520is_log = is_special = is_dedup = is_spare = B_FALSE;1521seen_logs = B_FALSE;1522nvroot = NULL;15231524while (argc > 0) {1525fulltype = argv[0];1526nv = NULL;15271528/*1529* If it's a mirror, raidz, or draid the subsequent arguments1530* are its leaves -- until we encounter the next mirror,1531* raidz or draid.1532*/1533if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {1534nvlist_t **child = NULL;1535int c, children = 0;15361537if (strcmp(type, VDEV_TYPE_SPARE) == 0) {1538if (spares != NULL) {1539(void) fprintf(stderr,1540gettext("invalid vdev "1541"specification: 'spare' can be "1542"specified only once\n"));1543goto spec_out;1544}1545is_spare = B_TRUE;1546is_log = is_special = is_dedup = B_FALSE;1547}15481549if (strcmp(type, VDEV_TYPE_LOG) == 0) {1550if (seen_logs) {1551(void) fprintf(stderr,1552gettext("invalid vdev "1553"specification: 'log' can be "1554"specified only once\n"));1555goto spec_out;1556}1557seen_logs = B_TRUE;1558is_log = B_TRUE;1559is_special = is_dedup = is_spare = B_FALSE;1560argc--;1561argv++;1562/*1563* A log is not a real grouping device.1564* We just set is_log and continue.1565*/1566continue;1567}15681569if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {1570is_special = B_TRUE;1571is_log = is_dedup = is_spare = B_FALSE;1572argc--;1573argv++;1574continue;1575}15761577if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {1578is_dedup = B_TRUE;1579is_log = is_special = is_spare = B_FALSE;1580argc--;1581argv++;1582continue;1583}15841585if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {1586if (l2cache != NULL) {1587(void) fprintf(stderr,1588gettext("invalid vdev "1589"specification: 'cache' can be "1590"specified only once\n"));1591goto spec_out;1592}1593is_log = is_special = B_FALSE;1594is_dedup = is_spare = B_FALSE;1595}15961597if (is_log) {1598if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {1599(void) fprintf(stderr,1600gettext("invalid vdev "1601"specification: unsupported 'log' "1602"device: %s\n"), type);1603goto spec_out;1604}1605nlogs++;1606}16071608for (c = 1; c < argc; c++) {1609if (is_grouping(argv[c], NULL, NULL) != NULL)1610break;16111612children++;1613child = realloc(child,1614children * sizeof (nvlist_t *));1615if (child == NULL)1616zpool_no_memory();1617if ((nv = make_leaf_vdev(argv[c],1618!(is_log || is_special || is_dedup ||1619is_spare), ashift)) == NULL) {1620for (c = 0; c < children - 1; c++)1621nvlist_free(child[c]);1622free(child);1623goto spec_out;1624}16251626child[children - 1] = nv;1627}16281629if (children < mindev) {1630(void) fprintf(stderr, gettext("invalid vdev "1631"specification: %s requires at least %d "1632"devices\n"), argv[0], mindev);1633for (c = 0; c < children; c++)1634nvlist_free(child[c]);1635free(child);1636goto spec_out;1637}16381639if (children > maxdev) {1640(void) fprintf(stderr, gettext("invalid vdev "1641"specification: %s supports no more than "1642"%d devices\n"), argv[0], maxdev);1643for (c = 0; c < children; c++)1644nvlist_free(child[c]);1645free(child);1646goto spec_out;1647}16481649argc -= c;1650argv += c;16511652if (strcmp(type, VDEV_TYPE_SPARE) == 0) {1653spares = child;1654nspares = children;1655continue;1656} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {1657l2cache = child;1658nl2cache = children;1659continue;1660} else {1661/* create a top-level vdev with children */1662verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,16630) == 0);1664verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,1665type) == 0);1666verify(nvlist_add_uint64(nv,1667ZPOOL_CONFIG_IS_LOG, is_log) == 0);1668if (is_log) {1669verify(nvlist_add_string(nv,1670ZPOOL_CONFIG_ALLOCATION_BIAS,1671VDEV_ALLOC_BIAS_LOG) == 0);1672}1673if (is_special) {1674verify(nvlist_add_string(nv,1675ZPOOL_CONFIG_ALLOCATION_BIAS,1676VDEV_ALLOC_BIAS_SPECIAL) == 0);1677}1678if (is_dedup) {1679verify(nvlist_add_string(nv,1680ZPOOL_CONFIG_ALLOCATION_BIAS,1681VDEV_ALLOC_BIAS_DEDUP) == 0);1682}1683if (ashift > 0) {1684fnvlist_add_uint64(nv,1685ZPOOL_CONFIG_ASHIFT, ashift);1686}1687if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {1688verify(nvlist_add_uint64(nv,1689ZPOOL_CONFIG_NPARITY,1690mindev - 1) == 0);1691}1692if (strcmp(type, VDEV_TYPE_DRAID) == 0) {1693if (draid_config_by_type(nv,1694fulltype, children) != 0) {1695for (c = 0; c < children; c++)1696nvlist_free(child[c]);1697free(child);1698goto spec_out;1699}1700}1701verify(nvlist_add_nvlist_array(nv,1702ZPOOL_CONFIG_CHILDREN,1703(const nvlist_t **)child, children) == 0);17041705for (c = 0; c < children; c++)1706nvlist_free(child[c]);1707free(child);1708}1709} else {1710/*1711* We have a device. Pass off to make_leaf_vdev() to1712* construct the appropriate nvlist describing the vdev.1713*/1714if ((nv = make_leaf_vdev(argv[0], !(is_log ||1715is_special || is_dedup || is_spare),1716ashift)) == NULL)1717goto spec_out;17181719verify(nvlist_add_uint64(nv,1720ZPOOL_CONFIG_IS_LOG, is_log) == 0);1721if (is_log) {1722verify(nvlist_add_string(nv,1723ZPOOL_CONFIG_ALLOCATION_BIAS,1724VDEV_ALLOC_BIAS_LOG) == 0);1725nlogs++;1726}17271728if (is_special) {1729verify(nvlist_add_string(nv,1730ZPOOL_CONFIG_ALLOCATION_BIAS,1731VDEV_ALLOC_BIAS_SPECIAL) == 0);1732}1733if (is_dedup) {1734verify(nvlist_add_string(nv,1735ZPOOL_CONFIG_ALLOCATION_BIAS,1736VDEV_ALLOC_BIAS_DEDUP) == 0);1737}1738argc--;1739argv++;1740}17411742toplevels++;1743top = realloc(top, toplevels * sizeof (nvlist_t *));1744if (top == NULL)1745zpool_no_memory();1746top[toplevels - 1] = nv;1747}17481749if (toplevels == 0 && nspares == 0 && nl2cache == 0) {1750(void) fprintf(stderr, gettext("invalid vdev "1751"specification: at least one toplevel vdev must be "1752"specified\n"));1753goto spec_out;1754}17551756if (seen_logs && nlogs == 0) {1757(void) fprintf(stderr, gettext("invalid vdev specification: "1758"log requires at least 1 device\n"));1759goto spec_out;1760}17611762/*1763* Finally, create nvroot and add all top-level vdevs to it.1764*/1765verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);1766verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,1767VDEV_TYPE_ROOT) == 0);1768verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,1769(const nvlist_t **)top, toplevels) == 0);1770if (nspares != 0)1771verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,1772(const nvlist_t **)spares, nspares) == 0);1773if (nl2cache != 0)1774verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,1775(const nvlist_t **)l2cache, nl2cache) == 0);17761777spec_out:1778for (t = 0; t < toplevels; t++)1779nvlist_free(top[t]);1780for (t = 0; t < nspares; t++)1781nvlist_free(spares[t]);1782for (t = 0; t < nl2cache; t++)1783nvlist_free(l2cache[t]);17841785free(spares);1786free(l2cache);1787free(top);17881789return (nvroot);1790}17911792nvlist_t *1793split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,1794splitflags_t flags, int argc, char **argv)1795{1796nvlist_t *newroot = NULL, **child;1797uint_t c, children;17981799if (argc > 0) {1800if ((newroot = construct_spec(props, argc, argv)) == NULL) {1801(void) fprintf(stderr, gettext("Unable to build a "1802"pool from the specified devices\n"));1803return (NULL);1804}18051806if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) {1807nvlist_free(newroot);1808return (NULL);1809}18101811/* avoid any tricks in the spec */1812verify(nvlist_lookup_nvlist_array(newroot,1813ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);1814for (c = 0; c < children; c++) {1815const char *path;1816const char *type;1817int min, max;18181819verify(nvlist_lookup_string(child[c],1820ZPOOL_CONFIG_PATH, &path) == 0);1821if ((type = is_grouping(path, &min, &max)) != NULL) {1822(void) fprintf(stderr, gettext("Cannot use "1823"'%s' as a device for splitting\n"), type);1824nvlist_free(newroot);1825return (NULL);1826}1827}1828}18291830if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {1831nvlist_free(newroot);1832return (NULL);1833}18341835return (newroot);1836}18371838static int1839num_normal_vdevs(nvlist_t *nvroot)1840{1841nvlist_t **top;1842uint_t t, toplevels, normal = 0;18431844verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,1845&top, &toplevels) == 0);18461847for (t = 0; t < toplevels; t++) {1848uint64_t log = B_FALSE;18491850(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);1851if (log)1852continue;1853if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))1854continue;18551856normal++;1857}18581859return (normal);1860}18611862/*1863* Get and validate the contents of the given vdev specification. This ensures1864* that the nvlist returned is well-formed, that all the devices exist, and that1865* they are not currently in use by any other known consumer. The 'poolconfig'1866* parameter is the current configuration of the pool when adding devices1867* existing pool, and is used to perform additional checks, such as changing the1868* replication level of the pool. It can be 'NULL' to indicate that this is a1869* new pool. The 'force' flag controls whether devices should be forcefully1870* added, even if they appear in use.1871*/1872nvlist_t *1873make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,1874boolean_t replacing, boolean_t dryrun, int argc, char **argv)1875{1876nvlist_t *newroot;1877nvlist_t *poolconfig = NULL;1878is_force = force;18791880/*1881* Construct the vdev specification. If this is successful, we know1882* that we have a valid specification, and that all devices can be1883* opened.1884*/1885if ((newroot = construct_spec(props, argc, argv)) == NULL)1886return (NULL);18871888if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {1889nvlist_free(newroot);1890return (NULL);1891}18921893/*1894* Validate each device to make sure that it's not shared with another1895* subsystem. We do this even if 'force' is set, because there are some1896* uses (such as a dedicated dump device) that even '-f' cannot1897* override.1898*/1899if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {1900nvlist_free(newroot);1901return (NULL);1902}19031904/*1905* Check the replication level of the given vdevs and report any errors1906* found. We include the existing pool spec, if any, as we need to1907* catch changes against the existing replication level.1908*/1909if (check_rep && check_replication(poolconfig, newroot) != 0) {1910nvlist_free(newroot);1911return (NULL);1912}19131914/*1915* On pool create the new vdev spec must have one normal vdev.1916*/1917if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {1918vdev_error(gettext("at least one general top-level vdev must "1919"be specified\n"));1920nvlist_free(newroot);1921return (NULL);1922}19231924/*1925* Run through the vdev specification and label any whole disks found.1926*/1927if (!dryrun && make_disks(zhp, newroot, replacing) != 0) {1928nvlist_free(newroot);1929return (NULL);1930}19311932return (newroot);1933}193419351936