Path: blob/main/sys/contrib/openzfs/lib/libzutil/zutil_import.c
48375 views
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2015 RackTop Systems.
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2021, Colm Buckley <[email protected]>
 */

/*
 * Pool import support functions.
 *
 * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
 * these commands are expected to run in the global zone, we can assume
 * that the devices are all readable when called.
 *
 * To import a pool, we rely on reading the configuration information from the
 * ZFS label of each device. If we successfully read the label, then we
 * organize the configuration information in the following hierarchy:
 *
 *	pool guid -> toplevel vdev guid -> label txg
 *
 * Duplicate entries matching this same tuple will be discarded. Once we have
 * examined every device, we pick the best label txg config for each toplevel
 * vdev. We then arrange these toplevel vdevs into a complete pool config, and
 * update any paths that have changed.
Finally, we attempt to import the pool47* using our derived config, and record the results.48*/4950#ifdef HAVE_AIO_H51#include <aio.h>52#endif53#include <ctype.h>54#include <dirent.h>55#include <errno.h>56#include <libintl.h>57#include <libgen.h>58#include <stddef.h>59#include <stdlib.h>60#include <string.h>61#include <sys/stat.h>62#include <unistd.h>63#include <fcntl.h>64#include <sys/dktp/fdisk.h>65#include <sys/vdev_impl.h>66#include <sys/fs/zfs.h>6768#include <thread_pool.h>69#include <libzutil.h>70#include <libnvpair.h>7172#include "zutil_import.h"7374const char *75libpc_error_description(libpc_handle_t *hdl)76{77if (hdl->lpc_desc[0] != '\0')78return (hdl->lpc_desc);7980switch (hdl->lpc_error) {81case LPC_BADCACHE:82return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));83case LPC_BADPATH:84return (dgettext(TEXT_DOMAIN, "must be an absolute path"));85case LPC_NOMEM:86return (dgettext(TEXT_DOMAIN, "out of memory"));87case LPC_EACCESS:88return (dgettext(TEXT_DOMAIN, "some devices require root "89"privileges"));90case LPC_UNKNOWN:91return (dgettext(TEXT_DOMAIN, "unknown error"));92default:93assert(hdl->lpc_error == 0);94return (dgettext(TEXT_DOMAIN, "no error"));95}96}9798static __attribute__((format(printf, 2, 3))) void99zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...)100{101va_list ap;102103va_start(ap, fmt);104105(void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap);106hdl->lpc_desc_active = B_TRUE;107108va_end(ap);109}110111static void112zutil_verror(libpc_handle_t *hdl, lpc_error_t error, const char *fmt,113va_list ap)114{115char action[1024];116117(void) vsnprintf(action, sizeof (action), fmt, ap);118hdl->lpc_error = error;119120if (hdl->lpc_desc_active)121hdl->lpc_desc_active = B_FALSE;122else123hdl->lpc_desc[0] = '\0';124125if (hdl->lpc_printerr)126(void) fprintf(stderr, "%s: %s\n", action,127libpc_error_description(hdl));128}129130static __attribute__((format(printf, 3, 4))) int131zutil_error_fmt(libpc_handle_t *hdl, 
lpc_error_t error,132const char *fmt, ...)133{134va_list ap;135136va_start(ap, fmt);137138zutil_verror(hdl, error, fmt, ap);139140va_end(ap);141142return (-1);143}144145static int146zutil_error(libpc_handle_t *hdl, lpc_error_t error, const char *msg)147{148return (zutil_error_fmt(hdl, error, "%s", msg));149}150151static int152zutil_no_memory(libpc_handle_t *hdl)153{154zutil_error(hdl, LPC_NOMEM, "internal error");155exit(1);156}157158void *159zutil_alloc(libpc_handle_t *hdl, size_t size)160{161void *data;162163if ((data = calloc(1, size)) == NULL)164(void) zutil_no_memory(hdl);165166return (data);167}168169char *170zutil_strdup(libpc_handle_t *hdl, const char *str)171{172char *ret;173174if ((ret = strdup(str)) == NULL)175(void) zutil_no_memory(hdl);176177return (ret);178}179180static char *181zutil_strndup(libpc_handle_t *hdl, const char *str, size_t n)182{183char *ret;184185if ((ret = strndup(str, n)) == NULL)186(void) zutil_no_memory(hdl);187188return (ret);189}190191/*192* Intermediate structures used to gather configuration information.193*/194typedef struct config_entry {195uint64_t ce_txg;196nvlist_t *ce_config;197struct config_entry *ce_next;198} config_entry_t;199200typedef struct vdev_entry {201uint64_t ve_guid;202config_entry_t *ve_configs;203struct vdev_entry *ve_next;204} vdev_entry_t;205206typedef struct pool_entry {207uint64_t pe_guid;208vdev_entry_t *pe_vdevs;209struct pool_entry *pe_next;210} pool_entry_t;211212typedef struct name_entry {213char *ne_name;214uint64_t ne_guid;215uint64_t ne_order;216uint64_t ne_num_labels;217struct name_entry *ne_next;218} name_entry_t;219220typedef struct pool_list {221pool_entry_t *pools;222name_entry_t *names;223} pool_list_t;224225/*226* Go through and fix up any path and/or devid information for the given vdev227* configuration.228*/229static int230fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names)231{232nvlist_t **child;233uint_t c, children;234uint64_t guid;235name_entry_t *ne, *best;236const 
char *path;237238if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,239&child, &children) == 0) {240for (c = 0; c < children; c++)241if (fix_paths(hdl, child[c], names) != 0)242return (-1);243return (0);244}245246/*247* This is a leaf (file or disk) vdev. In either case, go through248* the name list and see if we find a matching guid. If so, replace249* the path and see if we can calculate a new devid.250*251* There may be multiple names associated with a particular guid, in252* which case we have overlapping partitions or multiple paths to the253* same disk. In this case we prefer to use the path name which254* matches the ZPOOL_CONFIG_PATH. If no matching entry is found we255* use the lowest order device which corresponds to the first match256* while traversing the ZPOOL_IMPORT_PATH search path.257*/258verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);259if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)260path = NULL;261262best = NULL;263for (ne = names; ne != NULL; ne = ne->ne_next) {264if (ne->ne_guid == guid) {265if (path == NULL) {266best = ne;267break;268}269270if ((strlen(path) == strlen(ne->ne_name)) &&271strncmp(path, ne->ne_name, strlen(path)) == 0) {272best = ne;273break;274}275276if (best == NULL) {277best = ne;278continue;279}280281/* Prefer paths with move vdev labels. */282if (ne->ne_num_labels > best->ne_num_labels) {283best = ne;284continue;285}286287/* Prefer paths earlier in the search order. 
*/288if (ne->ne_num_labels == best->ne_num_labels &&289ne->ne_order < best->ne_order) {290best = ne;291continue;292}293}294}295296if (best == NULL)297return (0);298299if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)300return (-1);301302update_vdev_config_dev_strs(nv);303304return (0);305}306307/*308* Add the given configuration to the list of known devices.309*/310static int311add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,312int order, int num_labels, nvlist_t *config)313{314uint64_t pool_guid, vdev_guid, top_guid, txg, state;315pool_entry_t *pe;316vdev_entry_t *ve;317config_entry_t *ce;318name_entry_t *ne;319320/*321* If this is a hot spare not currently in use or level 2 cache322* device, add it to the list of names to translate, but don't do323* anything else.324*/325if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,326&state) == 0 &&327(state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&328nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {329if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)330return (-1);331332if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {333free(ne);334return (-1);335}336ne->ne_guid = vdev_guid;337ne->ne_order = order;338ne->ne_num_labels = num_labels;339ne->ne_next = pl->names;340pl->names = ne;341342return (0);343}344345/*346* If we have a valid config but cannot read any of these fields, then347* it means we have a half-initialized label. In vdev_label_init()348* we write a label with txg == 0 so that we can identify the device349* in case the user refers to the same disk later on. 
If we fail to350* create the pool, we'll be left with a label in this state351* which should not be considered part of a valid pool.352*/353if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,354&pool_guid) != 0 ||355nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,356&vdev_guid) != 0 ||357nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,358&top_guid) != 0 ||359nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,360&txg) != 0 || txg == 0) {361return (0);362}363364/*365* First, see if we know about this pool. If not, then add it to the366* list of known pools.367*/368for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {369if (pe->pe_guid == pool_guid)370break;371}372373if (pe == NULL) {374if ((pe = zutil_alloc(hdl, sizeof (pool_entry_t))) == NULL) {375return (-1);376}377pe->pe_guid = pool_guid;378pe->pe_next = pl->pools;379pl->pools = pe;380}381382/*383* Second, see if we know about this toplevel vdev. Add it if its384* missing.385*/386for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {387if (ve->ve_guid == top_guid)388break;389}390391if (ve == NULL) {392if ((ve = zutil_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {393return (-1);394}395ve->ve_guid = top_guid;396ve->ve_next = pe->pe_vdevs;397pe->pe_vdevs = ve;398}399400/*401* Third, see if we have a config with a matching transaction group. If402* so, then we do nothing. Otherwise, add it to the list of known403* configs.404*/405for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {406if (ce->ce_txg == txg)407break;408}409410if (ce == NULL) {411if ((ce = zutil_alloc(hdl, sizeof (config_entry_t))) == NULL) {412return (-1);413}414ce->ce_txg = txg;415ce->ce_config = fnvlist_dup(config);416ce->ce_next = ve->ve_configs;417ve->ve_configs = ce;418}419420/*421* At this point we've successfully added our config to the list of422* known configs. 
The last thing to do is add the vdev guid -> path423* mappings so that we can fix up the configuration as necessary before424* doing the import.425*/426if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)427return (-1);428429if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {430free(ne);431return (-1);432}433434ne->ne_guid = vdev_guid;435ne->ne_order = order;436ne->ne_num_labels = num_labels;437ne->ne_next = pl->names;438pl->names = ne;439440return (0);441}442443static int444zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid,445boolean_t *isactive)446{447ASSERT(hdl->lpc_ops->pco_pool_active != NULL);448449int error = hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name,450guid, isactive);451452return (error);453}454455static nvlist_t *456zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig)457{458ASSERT(hdl->lpc_ops->pco_refresh_config != NULL);459460return (hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle,461tryconfig));462}463464/*465* Determine if the vdev id is a hole in the namespace.466*/467static boolean_t468vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)469{470int c;471472for (c = 0; c < holes; c++) {473474/* Top-level is a hole */475if (hole_array[c] == id)476return (B_TRUE);477}478return (B_FALSE);479}480481/*482* Convert our list of pools into the definitive set of configurations. We483* start by picking the best config for each toplevel vdev. Once that's done,484* we assemble the toplevel vdevs into a full config for the pool. 
We make a485* pass to fix up any incorrect paths, and then add it to the main list to486* return to the user.487*/488static nvlist_t *489get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,490nvlist_t *policy)491{492pool_entry_t *pe;493vdev_entry_t *ve;494config_entry_t *ce;495nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;496nvlist_t **spares, **l2cache;497uint_t i, nspares, nl2cache;498boolean_t config_seen;499uint64_t best_txg;500const char *name, *hostname = NULL;501uint64_t guid;502uint_t children = 0;503nvlist_t **child = NULL;504uint64_t *hole_array, max_id;505uint_t c;506boolean_t isactive;507nvlist_t *nvl;508boolean_t valid_top_config = B_FALSE;509510if (nvlist_alloc(&ret, 0, 0) != 0)511goto nomem;512513for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {514uint64_t id, max_txg = 0, hostid = 0;515uint_t holes = 0;516517if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)518goto nomem;519config_seen = B_FALSE;520521/*522* Iterate over all toplevel vdevs. 
Grab the pool configuration523* from the first one we find, and then go through the rest and524* add them as necessary to the 'vdevs' member of the config.525*/526for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {527528/*529* Determine the best configuration for this vdev by530* selecting the config with the latest transaction531* group.532*/533best_txg = 0;534for (ce = ve->ve_configs; ce != NULL;535ce = ce->ce_next) {536537if (ce->ce_txg > best_txg) {538tmp = ce->ce_config;539best_txg = ce->ce_txg;540}541}542543/*544* We rely on the fact that the max txg for the545* pool will contain the most up-to-date information546* about the valid top-levels in the vdev namespace.547*/548if (best_txg > max_txg) {549(void) nvlist_remove(config,550ZPOOL_CONFIG_VDEV_CHILDREN,551DATA_TYPE_UINT64);552(void) nvlist_remove(config,553ZPOOL_CONFIG_HOLE_ARRAY,554DATA_TYPE_UINT64_ARRAY);555556max_txg = best_txg;557hole_array = NULL;558holes = 0;559max_id = 0;560valid_top_config = B_FALSE;561562if (nvlist_lookup_uint64(tmp,563ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {564verify(nvlist_add_uint64(config,565ZPOOL_CONFIG_VDEV_CHILDREN,566max_id) == 0);567valid_top_config = B_TRUE;568}569570if (nvlist_lookup_uint64_array(tmp,571ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,572&holes) == 0) {573verify(nvlist_add_uint64_array(config,574ZPOOL_CONFIG_HOLE_ARRAY,575hole_array, holes) == 0);576}577}578579if (!config_seen) {580/*581* Copy the relevant pieces of data to the pool582* configuration:583*584* version585* pool guid586* name587* comment (if available)588* compatibility features (if available)589* pool state590* hostid (if available)591* hostname (if available)592*/593uint64_t state, version;594const char *comment = NULL;595const char *compatibility = NULL;596597version = fnvlist_lookup_uint64(tmp,598ZPOOL_CONFIG_VERSION);599fnvlist_add_uint64(config,600ZPOOL_CONFIG_VERSION, version);601guid = 
fnvlist_lookup_uint64(tmp,602ZPOOL_CONFIG_POOL_GUID);603fnvlist_add_uint64(config,604ZPOOL_CONFIG_POOL_GUID, guid);605name = fnvlist_lookup_string(tmp,606ZPOOL_CONFIG_POOL_NAME);607fnvlist_add_string(config,608ZPOOL_CONFIG_POOL_NAME, name);609610if (nvlist_lookup_string(tmp,611ZPOOL_CONFIG_COMMENT, &comment) == 0)612fnvlist_add_string(config,613ZPOOL_CONFIG_COMMENT, comment);614615if (nvlist_lookup_string(tmp,616ZPOOL_CONFIG_COMPATIBILITY,617&compatibility) == 0)618fnvlist_add_string(config,619ZPOOL_CONFIG_COMPATIBILITY,620compatibility);621622state = fnvlist_lookup_uint64(tmp,623ZPOOL_CONFIG_POOL_STATE);624fnvlist_add_uint64(config,625ZPOOL_CONFIG_POOL_STATE, state);626627hostid = 0;628if (nvlist_lookup_uint64(tmp,629ZPOOL_CONFIG_HOSTID, &hostid) == 0) {630fnvlist_add_uint64(config,631ZPOOL_CONFIG_HOSTID, hostid);632hostname = fnvlist_lookup_string(tmp,633ZPOOL_CONFIG_HOSTNAME);634fnvlist_add_string(config,635ZPOOL_CONFIG_HOSTNAME, hostname);636}637638config_seen = B_TRUE;639}640641/*642* Add this top-level vdev to the child array.643*/644verify(nvlist_lookup_nvlist(tmp,645ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);646verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,647&id) == 0);648649if (id >= children) {650nvlist_t **newchild;651652newchild = zutil_alloc(hdl, (id + 1) *653sizeof (nvlist_t *));654if (newchild == NULL)655goto nomem;656657for (c = 0; c < children; c++)658newchild[c] = child[c];659660free(child);661child = newchild;662children = id + 1;663}664if (nvlist_dup(nvtop, &child[id], 0) != 0)665goto nomem;666667}668669/*670* If we have information about all the top-levels then671* clean up the nvlist which we've constructed. 
This672* means removing any extraneous devices that are673* beyond the valid range or adding devices to the end674* of our array which appear to be missing.675*/676if (valid_top_config) {677if (max_id < children) {678for (c = max_id; c < children; c++)679nvlist_free(child[c]);680children = max_id;681} else if (max_id > children) {682nvlist_t **newchild;683684newchild = zutil_alloc(hdl, (max_id) *685sizeof (nvlist_t *));686if (newchild == NULL)687goto nomem;688689for (c = 0; c < children; c++)690newchild[c] = child[c];691692free(child);693child = newchild;694children = max_id;695}696}697698verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,699&guid) == 0);700701/*702* The vdev namespace may contain holes as a result of703* device removal. We must add them back into the vdev704* tree before we process any missing devices.705*/706if (holes > 0) {707ASSERT(valid_top_config);708709for (c = 0; c < children; c++) {710nvlist_t *holey;711712if (child[c] != NULL ||713!vdev_is_hole(hole_array, holes, c))714continue;715716if (nvlist_alloc(&holey, NV_UNIQUE_NAME,7170) != 0)718goto nomem;719720/*721* Holes in the namespace are treated as722* "hole" top-level vdevs and have a723* special flag set on them.724*/725if (nvlist_add_string(holey,726ZPOOL_CONFIG_TYPE,727VDEV_TYPE_HOLE) != 0 ||728nvlist_add_uint64(holey,729ZPOOL_CONFIG_ID, c) != 0 ||730nvlist_add_uint64(holey,731ZPOOL_CONFIG_GUID, 0ULL) != 0) {732nvlist_free(holey);733goto nomem;734}735child[c] = holey;736}737}738739/*740* Look for any missing top-level vdevs. If this is the case,741* create a faked up 'missing' vdev as a placeholder. 
We cannot742* simply compress the child array, because the kernel performs743* certain checks to make sure the vdev IDs match their location744* in the configuration.745*/746for (c = 0; c < children; c++) {747if (child[c] == NULL) {748nvlist_t *missing;749if (nvlist_alloc(&missing, NV_UNIQUE_NAME,7500) != 0)751goto nomem;752if (nvlist_add_string(missing,753ZPOOL_CONFIG_TYPE,754VDEV_TYPE_MISSING) != 0 ||755nvlist_add_uint64(missing,756ZPOOL_CONFIG_ID, c) != 0 ||757nvlist_add_uint64(missing,758ZPOOL_CONFIG_GUID, 0ULL) != 0) {759nvlist_free(missing);760goto nomem;761}762child[c] = missing;763}764}765766/*767* Put all of this pool's top-level vdevs into a root vdev.768*/769if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)770goto nomem;771if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,772VDEV_TYPE_ROOT) != 0 ||773nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||774nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||775nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,776(const nvlist_t **)child, children) != 0) {777nvlist_free(nvroot);778goto nomem;779}780781for (c = 0; c < children; c++)782nvlist_free(child[c]);783free(child);784children = 0;785child = NULL;786787/*788* Go through and fix up any paths and/or devids based on our789* known list of vdev GUID -> path mappings.790*/791if (fix_paths(hdl, nvroot, pl->names) != 0) {792nvlist_free(nvroot);793goto nomem;794}795796/*797* Add the root vdev to this pool's configuration.798*/799if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,800nvroot) != 0) {801nvlist_free(nvroot);802goto nomem;803}804nvlist_free(nvroot);805806/*807* zdb uses this path to report on active pools that were808* imported or created using -R.809*/810if (active_ok)811goto add_pool;812813/*814* Determine if this pool is currently active, in which case we815* can't actually import it.816*/817verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,818&name) == 0);819verify(nvlist_lookup_uint64(config, 
ZPOOL_CONFIG_POOL_GUID,820&guid) == 0);821822if (zutil_pool_active(hdl, name, guid, &isactive) != 0)823goto error;824825if (isactive) {826nvlist_free(config);827config = NULL;828continue;829}830831if (policy != NULL) {832if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,833policy) != 0)834goto nomem;835}836837if ((nvl = zutil_refresh_config(hdl, config)) == NULL) {838nvlist_free(config);839config = NULL;840continue;841}842843nvlist_free(config);844config = nvl;845846/*847* Go through and update the paths for spares, now that we have848* them.849*/850verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,851&nvroot) == 0);852if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,853&spares, &nspares) == 0) {854for (i = 0; i < nspares; i++) {855if (fix_paths(hdl, spares[i], pl->names) != 0)856goto nomem;857}858}859860/*861* Update the paths for l2cache devices.862*/863if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,864&l2cache, &nl2cache) == 0) {865for (i = 0; i < nl2cache; i++) {866if (fix_paths(hdl, l2cache[i], pl->names) != 0)867goto nomem;868}869}870871/*872* Restore the original information read from the actual label.873*/874(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,875DATA_TYPE_UINT64);876(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,877DATA_TYPE_STRING);878if (hostid != 0) {879verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,880hostid) == 0);881verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,882hostname) == 0);883}884885add_pool:886/*887* Add this pool to the list of configs.888*/889verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,890&name) == 0);891892if (nvlist_add_nvlist(ret, name, config) != 0)893goto nomem;894895nvlist_free(config);896config = NULL;897}898899return (ret);900901nomem:902(void) zutil_no_memory(hdl);903error:904nvlist_free(config);905nvlist_free(ret);906for (c = 0; c < children; c++)907nvlist_free(child[c]);908free(child);909910return (NULL);911}912913/*914* Return the offset of the given 
label.915*/916static uint64_t917label_offset(uint64_t size, int l)918{919ASSERT0(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t));920return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?9210 : size - VDEV_LABELS * sizeof (vdev_label_t)));922}923924/*925* The same description applies as to zpool_read_label below,926* except here we do it without aio, presumably because an aio call927* errored out in a way we think not using it could circumvent.928*/929static int930zpool_read_label_slow(int fd, nvlist_t **config, int *num_labels)931{932struct stat64 statbuf;933int l, count = 0;934vdev_phys_t *label;935nvlist_t *expected_config = NULL;936uint64_t expected_guid = 0, size;937938*config = NULL;939940if (fstat64_blk(fd, &statbuf) == -1)941return (0);942size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);943944label = (vdev_phys_t *)umem_alloc_aligned(sizeof (*label), PAGESIZE,945UMEM_DEFAULT);946if (label == NULL)947return (-1);948949for (l = 0; l < VDEV_LABELS; l++) {950uint64_t state, guid, txg;951off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;952953if (pread64(fd, label, sizeof (vdev_phys_t),954offset) != sizeof (vdev_phys_t))955continue;956957if (nvlist_unpack(label->vp_nvlist,958sizeof (label->vp_nvlist), config, 0) != 0)959continue;960961if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,962&guid) != 0 || guid == 0) {963nvlist_free(*config);964continue;965}966967if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,968&state) != 0 || state > POOL_STATE_L2CACHE) {969nvlist_free(*config);970continue;971}972973if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&974(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,975&txg) != 0 || txg == 0)) {976nvlist_free(*config);977continue;978}979980if (expected_guid) {981if (expected_guid == guid)982count++;983984nvlist_free(*config);985} else {986expected_config = *config;987expected_guid = guid;988count++;989}990}991992if (num_labels != NULL)993*num_labels = 
count;994995umem_free_aligned(label, sizeof (*label));996*config = expected_config;997998return (0);999}10001001/*1002* Given a file descriptor, read the label information and return an nvlist1003* describing the configuration, if there is one. The number of valid1004* labels found will be returned in num_labels when non-NULL.1005*/1006int1007zpool_read_label(int fd, nvlist_t **config, int *num_labels)1008{1009#ifndef HAVE_AIO_H1010return (zpool_read_label_slow(fd, config, num_labels));1011#else1012struct stat64 statbuf;1013struct aiocb aiocbs[VDEV_LABELS];1014struct aiocb *aiocbps[VDEV_LABELS];1015vdev_phys_t *labels;1016nvlist_t *expected_config = NULL;1017uint64_t expected_guid = 0, size;1018int error, l, count = 0;10191020*config = NULL;10211022if (fstat64_blk(fd, &statbuf) == -1)1023return (0);1024size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);10251026labels = (vdev_phys_t *)umem_alloc_aligned(1027VDEV_LABELS * sizeof (*labels), PAGESIZE, UMEM_DEFAULT);1028if (labels == NULL)1029return (-1);10301031memset(aiocbs, 0, sizeof (aiocbs));1032for (l = 0; l < VDEV_LABELS; l++) {1033off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;10341035aiocbs[l].aio_fildes = fd;1036aiocbs[l].aio_offset = offset;1037aiocbs[l].aio_buf = &labels[l];1038aiocbs[l].aio_nbytes = sizeof (vdev_phys_t);1039aiocbs[l].aio_lio_opcode = LIO_READ;1040aiocbps[l] = &aiocbs[l];1041}10421043if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) {1044int saved_errno = errno;1045boolean_t do_slow = B_FALSE;1046error = -1;10471048if (errno == EAGAIN || errno == EINTR || errno == EIO) {1049/*1050* A portion of the requests may have been submitted.1051* Clean them up.1052*/1053for (l = 0; l < VDEV_LABELS; l++) {1054errno = 0;1055switch (aio_error(&aiocbs[l])) {1056case EINVAL:1057break;1058case EINPROGRESS:1059/*1060* This shouldn't be possible to1061* encounter, die if we do.1062*/1063ASSERT(B_FALSE);1064zfs_fallthrough;1065case EREMOTEIO:1066/*1067* May be returned by an 
NVMe device1068* which is visible in /dev/ but due1069* to a low-level format change, or1070* other error, needs to be rescanned.1071* Try the slow method.1072*/1073zfs_fallthrough;1074case EAGAIN:1075case EOPNOTSUPP:1076case ENOSYS:1077do_slow = B_TRUE;1078zfs_fallthrough;1079case 0:1080default:1081(void) aio_return(&aiocbs[l]);1082}1083}1084}1085if (do_slow) {1086/*1087* At least some IO involved access unsafe-for-AIO1088* files. Let's try again, without AIO this time.1089*/1090error = zpool_read_label_slow(fd, config, num_labels);1091saved_errno = errno;1092}1093umem_free_aligned(labels, VDEV_LABELS * sizeof (*labels));1094errno = saved_errno;1095return (error);1096}10971098for (l = 0; l < VDEV_LABELS; l++) {1099uint64_t state, guid, txg;11001101if (aio_return(&aiocbs[l]) != sizeof (vdev_phys_t))1102continue;11031104if (nvlist_unpack(labels[l].vp_nvlist,1105sizeof (labels[l].vp_nvlist), config, 0) != 0)1106continue;11071108if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,1109&guid) != 0 || guid == 0) {1110nvlist_free(*config);1111continue;1112}11131114if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,1115&state) != 0 || state > POOL_STATE_L2CACHE) {1116nvlist_free(*config);1117continue;1118}11191120if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&1121(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,1122&txg) != 0 || txg == 0)) {1123nvlist_free(*config);1124continue;1125}11261127if (expected_guid) {1128if (expected_guid == guid)1129count++;11301131nvlist_free(*config);1132} else {1133expected_config = *config;1134expected_guid = guid;1135count++;1136}1137}11381139if (num_labels != NULL)1140*num_labels = count;11411142umem_free_aligned(labels, VDEV_LABELS * sizeof (*labels));1143*config = expected_config;11441145return (0);1146#endif1147}11481149/*1150* Sorted by full path and then vdev guid to allow for multiple entries with1151* the same full path name. 
This is required because it's possible to1152* have multiple block devices with labels that refer to the same1153* ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both1154* entries need to be added to the cache. Scenarios where this can occur1155* include overwritten pool labels, devices which are visible from multiple1156* hosts and multipath devices.1157*/1158int1159slice_cache_compare(const void *arg1, const void *arg2)1160{1161const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;1162const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;1163uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;1164uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;1165int rv;11661167rv = TREE_ISIGN(strcmp(nm1, nm2));1168if (rv)1169return (rv);11701171return (TREE_CMP(guid1, guid2));1172}11731174static int1175label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,1176uint64_t vdev_guid, const char **path, const char **devid)1177{1178nvlist_t **child;1179uint_t c, children;1180uint64_t guid;1181const char *val;1182int error;11831184if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,1185&child, &children) == 0) {1186for (c = 0; c < children; c++) {1187error = label_paths_impl(hdl, child[c],1188pool_guid, vdev_guid, path, devid);1189if (error)1190return (error);1191}1192return (0);1193}11941195if (nvroot == NULL)1196return (0);11971198error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);1199if ((error != 0) || (guid != vdev_guid))1200return (0);12011202error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);1203if (error == 0)1204*path = val;12051206error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);1207if (error == 0)1208*devid = val;12091210return (0);1211}12121213/*1214* Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID1215* and store these strings as config_path and devid_path respectively.1216* The returned pointers are only valid as long as label remains 
valid.1217*/1218int1219label_paths(libpc_handle_t *hdl, nvlist_t *label, const char **path,1220const char **devid)1221{1222nvlist_t *nvroot;1223uint64_t pool_guid;1224uint64_t vdev_guid;1225uint64_t state;12261227*path = NULL;1228*devid = NULL;1229if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid) != 0)1230return (ENOENT);12311232/*1233* In case of spare or l2cache, we directly return path/devid from the1234* label.1235*/1236if (!(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state)) &&1237(state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE)) {1238(void) nvlist_lookup_string(label, ZPOOL_CONFIG_PATH, path);1239(void) nvlist_lookup_string(label, ZPOOL_CONFIG_DEVID, devid);1240return (0);1241}12421243if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||1244nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid))1245return (ENOENT);12461247return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,1248devid));1249}12501251static void1252zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock,1253avl_tree_t *cache, const char *path, const char *name, int order)1254{1255avl_index_t where;1256rdsk_node_t *slice;12571258slice = zutil_alloc(hdl, sizeof (rdsk_node_t));1259if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {1260free(slice);1261return;1262}1263slice->rn_vdev_guid = 0;1264slice->rn_lock = lock;1265slice->rn_avl = cache;1266slice->rn_hdl = hdl;1267slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;1268slice->rn_labelpaths = B_FALSE;12691270pthread_mutex_lock(lock);1271if (avl_find(cache, slice, &where)) {1272free(slice->rn_name);1273free(slice);1274} else {1275avl_insert(cache, slice, where);1276}1277pthread_mutex_unlock(lock);1278}12791280static int1281zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock,1282avl_tree_t *cache, const char *dir, int order)1283{1284int error;1285char path[MAXPATHLEN];1286struct dirent64 *dp;1287DIR *dirp;12881289if (realpath(dir, 
path) == NULL) {1290error = errno;1291if (error == ENOENT)1292return (0);12931294zutil_error_aux(hdl, "%s", zfs_strerror(error));1295(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,1296"cannot resolve path '%s'"), dir);1297return (error);1298}12991300dirp = opendir(path);1301if (dirp == NULL) {1302error = errno;1303zutil_error_aux(hdl, "%s", zfs_strerror(error));1304(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,1305"cannot open '%s'"), path);1306return (error);1307}13081309while ((dp = readdir64(dirp)) != NULL) {1310const char *name = dp->d_name;1311if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)1312continue;13131314switch (dp->d_type) {1315case DT_UNKNOWN:1316case DT_BLK:1317case DT_LNK:1318#ifdef __FreeBSD__1319case DT_CHR:1320#endif1321case DT_REG:1322break;1323default:1324continue;1325}13261327zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,1328order);1329}13301331(void) closedir(dirp);1332return (0);1333}13341335static int1336zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock,1337avl_tree_t *cache, const char *dir, int order)1338{1339int error = 0;1340char path[MAXPATHLEN];1341char *d = NULL;1342ssize_t dl;1343const char *dpath, *name;13441345/*1346* Separate the directory and the basename.1347* We do this so that we can get the realpath of1348* the directory. 
We don't get the realpath on the1349* whole path because if it's a symlink, we want the1350* path of the symlink not where it points to.1351*/1352name = zfs_basename(dir);1353if ((dl = zfs_dirnamelen(dir)) == -1)1354dpath = ".";1355else1356dpath = d = zutil_strndup(hdl, dir, dl);13571358if (realpath(dpath, path) == NULL) {1359error = errno;1360if (error == ENOENT) {1361error = 0;1362goto out;1363}13641365zutil_error_aux(hdl, "%s", zfs_strerror(error));1366(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,1367"cannot resolve path '%s'"), dir);1368goto out;1369}13701371zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);13721373out:1374free(d);1375return (error);1376}13771378/*1379* Scan a list of directories for zfs devices.1380*/1381static int1382zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock,1383avl_tree_t **slice_cache, const char * const *dir, size_t dirs)1384{1385avl_tree_t *cache;1386rdsk_node_t *slice;1387void *cookie;1388int i, error;13891390*slice_cache = NULL;1391cache = zutil_alloc(hdl, sizeof (avl_tree_t));1392avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),1393offsetof(rdsk_node_t, rn_node));13941395for (i = 0; i < dirs; i++) {1396struct stat sbuf;13971398if (stat(dir[i], &sbuf) != 0) {1399error = errno;1400if (error == ENOENT)1401continue;14021403zutil_error_aux(hdl, "%s", zfs_strerror(error));1404(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(1405TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);1406goto error;1407}14081409/*1410* If dir[i] is a directory, we walk through it and add all1411* the entries to the cache. 
If it's not a directory, we just1412* add it to the cache.1413*/1414if (S_ISDIR(sbuf.st_mode)) {1415if ((error = zpool_find_import_scan_dir(hdl, lock,1416cache, dir[i], i)) != 0)1417goto error;1418} else {1419if ((error = zpool_find_import_scan_path(hdl, lock,1420cache, dir[i], i)) != 0)1421goto error;1422}1423}14241425*slice_cache = cache;1426return (0);14271428error:1429cookie = NULL;1430while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {1431free(slice->rn_name);1432free(slice);1433}1434free(cache);14351436return (error);1437}14381439/*1440* Given a list of directories to search, find all pools stored on disk. This1441* includes partial pools which are not available to import. If no args are1442* given (argc is 0), then the default directory (/dev/dsk) is searched.1443* poolname or guid (but not both) are provided by the caller when trying1444* to import a specific pool.1445*/1446static nvlist_t *1447zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg,1448pthread_mutex_t *lock, avl_tree_t *cache)1449{1450(void) lock;1451nvlist_t *ret = NULL;1452pool_list_t pools = { 0 };1453pool_entry_t *pe, *penext;1454vdev_entry_t *ve, *venext;1455config_entry_t *ce, *cenext;1456name_entry_t *ne, *nenext;1457rdsk_node_t *slice;1458void *cookie;1459tpool_t *t;14601461verify(iarg->poolname == NULL || iarg->guid == 0);14621463/*1464* Create a thread pool to parallelize the process of reading and1465* validating labels, a large number of threads can be used due to1466* minimal contention.1467*/1468long threads = 2 * sysconf(_SC_NPROCESSORS_ONLN);1469#ifdef HAVE_AIO_H1470long am;1471#ifdef _SC_AIO_LISTIO_MAX1472am = sysconf(_SC_AIO_LISTIO_MAX);1473if (am >= VDEV_LABELS)1474threads = MIN(threads, am / VDEV_LABELS);1475#endif1476#ifdef _SC_AIO_MAX1477am = sysconf(_SC_AIO_MAX);1478if (am >= VDEV_LABELS)1479threads = MIN(threads, am / VDEV_LABELS);1480#endif1481#endif1482t = tpool_create(1, threads, 0, NULL);1483for (slice = avl_first(cache); slice;1484(slice = 
avl_walk(cache, slice, AVL_AFTER)))1485(void) tpool_dispatch(t, zpool_open_func, slice);14861487tpool_wait(t);1488tpool_destroy(t);14891490/*1491* Process the cache, filtering out any entries which are not1492* for the specified pool then adding matching label configs.1493*/1494cookie = NULL;1495while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {1496if (slice->rn_config != NULL) {1497nvlist_t *config = slice->rn_config;1498boolean_t matched = B_TRUE;1499boolean_t aux = B_FALSE;1500int fd;15011502/*1503* Check if it's a spare or l2cache device. If it is,1504* we need to skip the name and guid check since they1505* don't exist on aux device label.1506*/1507if (iarg->poolname != NULL || iarg->guid != 0) {1508uint64_t state;1509aux = nvlist_lookup_uint64(config,1510ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&1511(state == POOL_STATE_SPARE ||1512state == POOL_STATE_L2CACHE);1513}15141515if (iarg->poolname != NULL && !aux) {1516const char *pname;15171518matched = nvlist_lookup_string(config,1519ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&1520strcmp(iarg->poolname, pname) == 0;1521} else if (iarg->guid != 0 && !aux) {1522uint64_t this_guid;15231524matched = nvlist_lookup_uint64(config,1525ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&1526iarg->guid == this_guid;1527}1528if (matched) {1529/*1530* Verify all remaining entries can be opened1531* exclusively. 
This will prune all underlying1532* multipath devices which otherwise could1533* result in the vdev appearing as UNAVAIL.1534*1535* Under zdb, this step isn't required and1536* would prevent a zdb -e of active pools with1537* no cachefile.1538*/1539fd = open(slice->rn_name,1540O_RDONLY | O_EXCL | O_CLOEXEC);1541if (fd >= 0 || iarg->can_be_active) {1542if (fd >= 0)1543close(fd);1544add_config(hdl, &pools,1545slice->rn_name, slice->rn_order,1546slice->rn_num_labels, config);1547}1548}1549nvlist_free(config);1550}1551free(slice->rn_name);1552free(slice);1553}1554avl_destroy(cache);1555free(cache);15561557ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);15581559for (pe = pools.pools; pe != NULL; pe = penext) {1560penext = pe->pe_next;1561for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {1562venext = ve->ve_next;1563for (ce = ve->ve_configs; ce != NULL; ce = cenext) {1564cenext = ce->ce_next;1565nvlist_free(ce->ce_config);1566free(ce);1567}1568free(ve);1569}1570free(pe);1571}15721573for (ne = pools.names; ne != NULL; ne = nenext) {1574nenext = ne->ne_next;1575free(ne->ne_name);1576free(ne);1577}15781579return (ret);1580}15811582/*1583* Given a config, discover the paths for the devices which1584* exist in the config.1585*/1586static int1587discover_cached_paths(libpc_handle_t *hdl, nvlist_t *nv,1588avl_tree_t *cache, pthread_mutex_t *lock)1589{1590const char *path = NULL;1591ssize_t dl;1592uint_t children;1593nvlist_t **child;15941595if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,1596&child, &children) == 0) {1597for (int c = 0; c < children; c++) {1598discover_cached_paths(hdl, child[c], cache, lock);1599}1600}16011602/*1603* Once we have the path, we need to add the directory to1604* our directory cache.1605*/1606if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {1607int ret;1608char c = '\0';1609if ((dl = zfs_dirnamelen(path)) == -1) {1610path = ".";1611} else {1612c = path[dl];1613((char *)path)[dl] = '\0';16141615}1616ret = 
zpool_find_import_scan_dir(hdl, lock, cache,1617path, 0);1618if (c != '\0')1619((char *)path)[dl] = c;16201621return (ret);1622}1623return (0);1624}16251626/*1627* Given a cache file, return the contents as a list of importable pools.1628* poolname or guid (but not both) are provided by the caller when trying1629* to import a specific pool.1630*/1631static nvlist_t *1632zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg)1633{1634char *buf;1635int fd;1636struct stat64 statbuf;1637nvlist_t *raw, *src, *dst;1638nvlist_t *pools;1639nvpair_t *elem;1640const char *name;1641uint64_t this_guid;1642boolean_t active;16431644verify(iarg->poolname == NULL || iarg->guid == 0);16451646if ((fd = open(iarg->cachefile, O_RDONLY | O_CLOEXEC)) < 0) {1647zutil_error_aux(hdl, "%s", zfs_strerror(errno));1648(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,1649"failed to open cache file"));1650return (NULL);1651}16521653if (fstat64(fd, &statbuf) != 0) {1654zutil_error_aux(hdl, "%s", zfs_strerror(errno));1655(void) close(fd);1656(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,1657"failed to get size of cache file"));1658return (NULL);1659}16601661if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) {1662(void) close(fd);1663return (NULL);1664}16651666if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {1667(void) close(fd);1668free(buf);1669(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,1670"failed to read cache file contents"));1671return (NULL);1672}16731674(void) close(fd);16751676if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {1677free(buf);1678(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,1679"invalid or corrupt cache file contents"));1680return (NULL);1681}16821683free(buf);16841685/*1686* Go through and get the current state of the pools and refresh their1687* state.1688*/1689if (nvlist_alloc(&pools, 0, 0) != 0) {1690(void) zutil_no_memory(hdl);1691nvlist_free(raw);1692return (NULL);1693}16941695elem = 
NULL;1696while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {1697src = fnvpair_value_nvlist(elem);16981699name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);1700if (iarg->poolname != NULL && strcmp(iarg->poolname, name) != 0)1701continue;17021703this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);1704if (iarg->guid != 0 && iarg->guid != this_guid)1705continue;17061707if (zutil_pool_active(hdl, name, this_guid, &active) != 0) {1708nvlist_free(raw);1709nvlist_free(pools);1710return (NULL);1711}17121713if (active)1714continue;17151716if (iarg->scan) {1717uint64_t saved_guid = iarg->guid;1718const char *saved_poolname = iarg->poolname;1719pthread_mutex_t lock;17201721/*1722* Create the device cache that will hold the1723* devices we will scan based on the cachefile.1724* This will get destroyed and freed by1725* zpool_find_import_impl.1726*/1727avl_tree_t *cache = zutil_alloc(hdl,1728sizeof (avl_tree_t));1729avl_create(cache, slice_cache_compare,1730sizeof (rdsk_node_t),1731offsetof(rdsk_node_t, rn_node));1732nvlist_t *nvroot = fnvlist_lookup_nvlist(src,1733ZPOOL_CONFIG_VDEV_TREE);17341735/*1736* We only want to find the pool with this_guid.1737* We will reset these values back later.1738*/1739iarg->guid = this_guid;1740iarg->poolname = NULL;17411742/*1743* We need to build up a cache of devices that exists1744* in the paths pointed to by the cachefile. This allows1745* us to preserve the device namespace that was1746* originally specified by the user but also lets us1747* scan devices in those directories in case they had1748* been renamed.1749*/1750pthread_mutex_init(&lock, NULL);1751discover_cached_paths(hdl, nvroot, cache, &lock);1752nvlist_t *nv = zpool_find_import_impl(hdl, iarg,1753&lock, cache);1754pthread_mutex_destroy(&lock);17551756/*1757* zpool_find_import_impl will return back1758* a list of pools that it found based on the1759* device cache. 
There should only be one pool1760* since we're looking for a specific guid.1761* We will use that pool to build up the final1762* pool nvlist which is returned back to the1763* caller.1764*/1765nvpair_t *pair = nvlist_next_nvpair(nv, NULL);1766if (pair == NULL)1767continue;1768fnvlist_add_nvlist(pools, nvpair_name(pair),1769fnvpair_value_nvlist(pair));17701771VERIFY0P(nvlist_next_nvpair(nv, pair));17721773iarg->guid = saved_guid;1774iarg->poolname = saved_poolname;1775continue;1776}17771778if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,1779iarg->cachefile) != 0) {1780(void) zutil_no_memory(hdl);1781nvlist_free(raw);1782nvlist_free(pools);1783return (NULL);1784}17851786update_vdevs_config_dev_sysfs_path(src);17871788if ((dst = zutil_refresh_config(hdl, src)) == NULL) {1789nvlist_free(raw);1790nvlist_free(pools);1791return (NULL);1792}17931794if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {1795(void) zutil_no_memory(hdl);1796nvlist_free(dst);1797nvlist_free(raw);1798nvlist_free(pools);1799return (NULL);1800}1801nvlist_free(dst);1802}1803nvlist_free(raw);1804return (pools);1805}18061807static nvlist_t *1808zpool_find_import(libpc_handle_t *hdl, importargs_t *iarg)1809{1810pthread_mutex_t lock;1811avl_tree_t *cache;1812nvlist_t *pools = NULL;18131814verify(iarg->poolname == NULL || iarg->guid == 0);1815pthread_mutex_init(&lock, NULL);18161817/*1818* Locate pool member vdevs by blkid or by directory scanning.1819* On success a newly allocated AVL tree which is populated with an1820* entry for each discovered vdev will be returned in the cache.1821* It's the caller's responsibility to consume and destroy this tree.1822*/1823if (iarg->scan || iarg->paths != 0) {1824size_t dirs = iarg->paths;1825const char * const *dir = (const char * const *)iarg->path;18261827if (dirs == 0)1828dir = zpool_default_search_paths(&dirs);18291830if (zpool_find_import_scan(hdl, &lock, &cache,1831dir, dirs) != 0) {1832pthread_mutex_destroy(&lock);1833return (NULL);1834}1835} else 
{1836if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) {1837pthread_mutex_destroy(&lock);1838return (NULL);1839}1840}18411842pools = zpool_find_import_impl(hdl, iarg, &lock, cache);1843pthread_mutex_destroy(&lock);1844return (pools);1845}184618471848nvlist_t *1849zpool_search_import(libpc_handle_t *hdl, importargs_t *import)1850{1851nvlist_t *pools = NULL;18521853verify(import->poolname == NULL || import->guid == 0);18541855if (import->cachefile != NULL)1856pools = zpool_find_import_cached(hdl, import);1857else1858pools = zpool_find_import(hdl, import);18591860if ((pools == NULL || nvlist_empty(pools)) &&1861hdl->lpc_open_access_error && geteuid() != 0) {1862(void) zutil_error(hdl, LPC_EACCESS, dgettext(TEXT_DOMAIN,1863"no pools found"));1864}18651866return (pools);1867}18681869static boolean_t1870pool_match(nvlist_t *cfg, const char *tgt)1871{1872uint64_t v, guid = strtoull(tgt, NULL, 0);1873const char *s;18741875if (guid != 0) {1876if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)1877return (v == guid);1878} else {1879if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)1880return (strcmp(s, tgt) == 0);1881}1882return (B_FALSE);1883}18841885int1886zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp,1887importargs_t *args)1888{1889nvlist_t *pools;1890nvlist_t *match = NULL;1891nvlist_t *config = NULL;1892char *sepp = NULL;1893int count = 0;1894char *targetdup = strdup(target);18951896if (targetdup == NULL)1897return (ENOMEM);18981899*configp = NULL;19001901if ((sepp = strpbrk(targetdup, "/@")) != NULL)1902*sepp = '\0';19031904pools = zpool_search_import(hdl, args);1905if (pools == NULL) {1906zutil_error_aux(hdl, dgettext(TEXT_DOMAIN, "no pools found"));1907(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,1908"failed to find config for pool '%s'"), targetdup);1909free(targetdup);1910return (ENOENT);1911}19121913nvpair_t *elem = NULL;1914while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) 
{1915VERIFY0(nvpair_value_nvlist(elem, &config));1916if (pool_match(config, targetdup)) {1917count++;1918if (match != NULL) {1919/* multiple matches found */1920continue;1921} else {1922match = fnvlist_dup(config);1923}1924}1925}1926fnvlist_free(pools);19271928if (count == 0) {1929zutil_error_aux(hdl, dgettext(TEXT_DOMAIN,1930"no matching pools"));1931(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,1932"failed to find config for pool '%s'"), targetdup);1933free(targetdup);1934return (ENOENT);1935}19361937if (count > 1) {1938zutil_error_aux(hdl, dgettext(TEXT_DOMAIN,1939"more than one matching pool"));1940(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,1941"failed to find config for pool '%s'"), targetdup);1942free(targetdup);1943fnvlist_free(match);1944return (EINVAL);1945}19461947*configp = match;1948free(targetdup);19491950return (0);1951}19521953/* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */1954static boolean_t1955vdev_is_leaf(nvlist_t *nv)1956{1957uint_t children = 0;1958nvlist_t **child;19591960(void) nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,1961&child, &children);19621963return (children == 0);1964}19651966/* Return if a vdev is a leaf vdev and a real device (disk or file) */1967static boolean_t1968vdev_is_real_leaf(nvlist_t *nv)1969{1970const char *type = NULL;1971if (!vdev_is_leaf(nv))1972return (B_FALSE);19731974(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type);1975if ((strcmp(type, VDEV_TYPE_DISK) == 0) ||1976(strcmp(type, VDEV_TYPE_FILE) == 0)) {1977return (B_TRUE);1978}19791980return (B_FALSE);1981}19821983/*1984* This function is called by our FOR_EACH_VDEV() macros.1985*1986* state: State machine status (stored inside of a (nvlist_t *))1987* nv: The current vdev nvlist_t we are iterating over.1988* last_nv: The previous vdev nvlist_t we returned to the user in1989* the last iteration of FOR_EACH_VDEV(). 
We use it1990* to find the next vdev nvlist_t we should return.1991* real_leaves_only: Only return leaf vdevs.1992*1993* Returns 1 if we found the next vdev nvlist_t for this iteration. 0 if1994* we're still searching for it.1995*/1996static int1997__for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,1998boolean_t real_leaves_only)1999{2000enum {FIRST_NV = 0, NEXT_IS_MATCH = 1, STOP_LOOKING = 2};20012002/* The very first entry in the NV list is a special case */2003if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) {2004if (real_leaves_only && !vdev_is_real_leaf(nv))2005return (0);20062007*((nvlist_t **)last_nv) = nv;2008*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;2009return (1);2010}20112012/*2013* We came across our last_nv, meaning the next one is the one we2014* want2015*/2016if (nv == *((nvlist_t **)last_nv)) {2017/* Next iteration of this function will return the nvlist_t */2018*((nvlist_t **)state) = (nvlist_t *)NEXT_IS_MATCH;2019return (0);2020}20212022/*2023* We marked NEXT_IS_MATCH on the previous iteration, so this is the one2024* we want.2025*/2026if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) {2027if (real_leaves_only && !vdev_is_real_leaf(nv))2028return (0);20292030*((nvlist_t **)last_nv) = nv;2031*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;2032return (1);2033}20342035return (0);2036}20372038int2039for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv)2040{2041return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_FALSE));2042}20432044int2045for_each_real_leaf_vdev_macro_helper_func(void *state, nvlist_t *nv,2046void *last_nv)2047{2048return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_TRUE));2049}20502051/*2052* Internal function for iterating over the vdevs.2053*2054* For each vdev, func() will be called and will be passed 'zhp' (which is2055* typically the zpool_handle_t cast as a void pointer), the vdev's nvlist, and2056* a user-defined data pointer).2057*2058* The 
return values from all the func() calls will be OR'd together and2059* returned.2060*/2061int2062for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func,2063void *data)2064{2065nvlist_t **child;2066uint_t c, children;2067int ret = 0;2068int i;2069const char *type;20702071const char *list[] = {2072ZPOOL_CONFIG_SPARES,2073ZPOOL_CONFIG_L2CACHE,2074ZPOOL_CONFIG_CHILDREN2075};20762077if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)2078return (ret);20792080/* Don't run our function on indirect vdevs */2081if (strcmp(type, VDEV_TYPE_INDIRECT) != 0) {2082ret |= func(zhp, nv, data);2083}20842085for (i = 0; i < ARRAY_SIZE(list); i++) {2086if (nvlist_lookup_nvlist_array(nv, list[i], &child,2087&children) == 0) {2088for (c = 0; c < children; c++) {2089uint64_t ishole = 0;20902091(void) nvlist_lookup_uint64(child[c],2092ZPOOL_CONFIG_IS_HOLE, &ishole);20932094if (ishole)2095continue;20962097ret |= for_each_vdev_cb(zhp, child[c],2098func, data);2099}2100}2101}21022103return (ret);2104}21052106/*2107* Given an ZPOOL_CONFIG_VDEV_TREE nvpair, iterate over all the vdevs, calling2108* func() for each one. func() is passed the vdev's nvlist and an optional2109* user-defined 'data' pointer.2110*/2111int2112for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data)2113{2114return (for_each_vdev_cb(NULL, nvroot, func, data));2115}211621172118