/* Source: sys/contrib/openzfs/lib/libzutil/zutil_import.c (FreeBSD vendor copy of OpenZFS) */
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright 2015 Nexenta Systems, Inc. All rights reserved.23* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.24* Copyright (c) 2012, 2018 by Delphix. All rights reserved.25* Copyright 2015 RackTop Systems.26* Copyright (c) 2016, Intel Corporation.27* Copyright (c) 2021, Colm Buckley <[email protected]>28*/2930/*31* Pool import support functions.32*33* Used by zpool, ztest, zdb, and zhack to locate importable configs. Since34* these commands are expected to run in the global zone, we can assume35* that the devices are all readable when called.36*37* To import a pool, we rely on reading the configuration information from the38* ZFS label of each device. If we successfully read the label, then we39* organize the configuration information in the following hierarchy:40*41* pool guid -> toplevel vdev guid -> label txg42*43* Duplicate entries matching this same tuple will be discarded. Once we have44* examined every device, we pick the best label txg config for each toplevel45* vdev. We then arrange these toplevel vdevs into a complete pool config, and46* update any paths that have changed. 
Finally, we attempt to import the pool47* using our derived config, and record the results.48*/4950#ifdef HAVE_AIO_H51#include <aio.h>52#endif53#include <ctype.h>54#include <dirent.h>55#include <errno.h>56#include <libintl.h>57#include <libgen.h>58#include <stddef.h>59#include <stdlib.h>60#include <string.h>61#include <sys/stat.h>62#include <unistd.h>63#include <fcntl.h>64#include <sys/dktp/fdisk.h>65#include <sys/vdev_impl.h>66#include <sys/fs/zfs.h>67#include <sys/taskq.h>6869#include <libzutil.h>70#include <libnvpair.h>7172#include "zutil_import.h"7374const char *75libpc_error_description(libpc_handle_t *hdl)76{77if (hdl->lpc_desc[0] != '\0')78return (hdl->lpc_desc);7980switch (hdl->lpc_error) {81case LPC_BADCACHE:82return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));83case LPC_BADPATH:84return (dgettext(TEXT_DOMAIN, "must be an absolute path"));85case LPC_NOMEM:86return (dgettext(TEXT_DOMAIN, "out of memory"));87case LPC_EACCESS:88return (dgettext(TEXT_DOMAIN, "some devices require root "89"privileges"));90case LPC_UNKNOWN:91return (dgettext(TEXT_DOMAIN, "unknown error"));92default:93assert(hdl->lpc_error == 0);94return (dgettext(TEXT_DOMAIN, "no error"));95}96}9798static __attribute__((format(printf, 2, 3))) void99zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...)100{101va_list ap;102103va_start(ap, fmt);104105(void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap);106hdl->lpc_desc_active = B_TRUE;107108va_end(ap);109}110111static void112zutil_verror(libpc_handle_t *hdl, lpc_error_t error, const char *fmt,113va_list ap)114{115char action[1024];116117(void) vsnprintf(action, sizeof (action), fmt, ap);118hdl->lpc_error = error;119120if (hdl->lpc_desc_active)121hdl->lpc_desc_active = B_FALSE;122else123hdl->lpc_desc[0] = '\0';124125if (hdl->lpc_printerr)126(void) fprintf(stderr, "%s: %s\n", action,127libpc_error_description(hdl));128}129130static __attribute__((format(printf, 3, 4))) int131zutil_error_fmt(libpc_handle_t *hdl, 
lpc_error_t error,132const char *fmt, ...)133{134va_list ap;135136va_start(ap, fmt);137138zutil_verror(hdl, error, fmt, ap);139140va_end(ap);141142return (-1);143}144145static int146zutil_error(libpc_handle_t *hdl, lpc_error_t error, const char *msg)147{148return (zutil_error_fmt(hdl, error, "%s", msg));149}150151static int152zutil_no_memory(libpc_handle_t *hdl)153{154zutil_error(hdl, LPC_NOMEM, "internal error");155exit(1);156}157158void *159zutil_alloc(libpc_handle_t *hdl, size_t size)160{161void *data;162163if ((data = calloc(1, size)) == NULL)164(void) zutil_no_memory(hdl);165166return (data);167}168169char *170zutil_strdup(libpc_handle_t *hdl, const char *str)171{172char *ret;173174if ((ret = strdup(str)) == NULL)175(void) zutil_no_memory(hdl);176177return (ret);178}179180static char *181zutil_strndup(libpc_handle_t *hdl, const char *str, size_t n)182{183char *ret;184185if ((ret = strndup(str, n)) == NULL)186(void) zutil_no_memory(hdl);187188return (ret);189}190191/*192* Intermediate structures used to gather configuration information.193*/194typedef struct config_entry {195uint64_t ce_txg;196nvlist_t *ce_config;197struct config_entry *ce_next;198} config_entry_t;199200typedef struct vdev_entry {201uint64_t ve_guid;202config_entry_t *ve_configs;203struct vdev_entry *ve_next;204} vdev_entry_t;205206typedef struct pool_entry {207uint64_t pe_guid;208vdev_entry_t *pe_vdevs;209struct pool_entry *pe_next;210} pool_entry_t;211212typedef struct name_entry {213char *ne_name;214uint64_t ne_guid;215uint64_t ne_order;216uint64_t ne_num_labels;217struct name_entry *ne_next;218} name_entry_t;219220typedef struct pool_list {221pool_entry_t *pools;222name_entry_t *names;223} pool_list_t;224225/*226* Go through and fix up any path and/or devid information for the given vdev227* configuration.228*/229static int230fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names)231{232nvlist_t **child;233uint_t c, children;234uint64_t guid;235name_entry_t *ne, *best;236const 
char *path;237238if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,239&child, &children) == 0) {240for (c = 0; c < children; c++)241if (fix_paths(hdl, child[c], names) != 0)242return (-1);243return (0);244}245246/*247* This is a leaf (file or disk) vdev. In either case, go through248* the name list and see if we find a matching guid. If so, replace249* the path and see if we can calculate a new devid.250*251* There may be multiple names associated with a particular guid, in252* which case we have overlapping partitions or multiple paths to the253* same disk. In this case we prefer to use the path name which254* matches the ZPOOL_CONFIG_PATH. If no matching entry is found we255* use the lowest order device which corresponds to the first match256* while traversing the ZPOOL_IMPORT_PATH search path.257*/258verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);259if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)260path = NULL;261262best = NULL;263for (ne = names; ne != NULL; ne = ne->ne_next) {264if (ne->ne_guid == guid) {265if (path == NULL) {266best = ne;267break;268}269270if ((strlen(path) == strlen(ne->ne_name)) &&271strncmp(path, ne->ne_name, strlen(path)) == 0) {272best = ne;273break;274}275276if (best == NULL) {277best = ne;278continue;279}280281/* Prefer paths with move vdev labels. */282if (ne->ne_num_labels > best->ne_num_labels) {283best = ne;284continue;285}286287/* Prefer paths earlier in the search order. 
*/288if (ne->ne_num_labels == best->ne_num_labels &&289ne->ne_order < best->ne_order) {290best = ne;291continue;292}293}294}295296if (best == NULL)297return (0);298299if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)300return (-1);301302update_vdev_config_dev_strs(nv);303304return (0);305}306307/*308* Add the given configuration to the list of known devices.309*/310static int311add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,312int order, int num_labels, nvlist_t *config)313{314uint64_t pool_guid, vdev_guid, top_guid, txg, state;315pool_entry_t *pe;316vdev_entry_t *ve;317config_entry_t *ce;318name_entry_t *ne;319320/*321* If this is a hot spare not currently in use or level 2 cache322* device, add it to the list of names to translate, but don't do323* anything else.324*/325if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,326&state) == 0 &&327(state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&328nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {329if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)330return (-1);331332if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {333free(ne);334return (-1);335}336ne->ne_guid = vdev_guid;337ne->ne_order = order;338ne->ne_num_labels = num_labels;339ne->ne_next = pl->names;340pl->names = ne;341342return (0);343}344345/*346* If we have a valid config but cannot read any of these fields, then347* it means we have a half-initialized label. In vdev_label_init()348* we write a label with txg == 0 so that we can identify the device349* in case the user refers to the same disk later on. 
If we fail to350* create the pool, we'll be left with a label in this state351* which should not be considered part of a valid pool.352*/353if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,354&pool_guid) != 0 ||355nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,356&vdev_guid) != 0 ||357nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,358&top_guid) != 0 ||359nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,360&txg) != 0 || txg == 0) {361return (0);362}363364/*365* First, see if we know about this pool. If not, then add it to the366* list of known pools.367*/368for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {369if (pe->pe_guid == pool_guid)370break;371}372373if (pe == NULL) {374if ((pe = zutil_alloc(hdl, sizeof (pool_entry_t))) == NULL) {375return (-1);376}377pe->pe_guid = pool_guid;378pe->pe_next = pl->pools;379pl->pools = pe;380}381382/*383* Second, see if we know about this toplevel vdev. Add it if its384* missing.385*/386for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {387if (ve->ve_guid == top_guid)388break;389}390391if (ve == NULL) {392if ((ve = zutil_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {393return (-1);394}395ve->ve_guid = top_guid;396ve->ve_next = pe->pe_vdevs;397pe->pe_vdevs = ve;398}399400/*401* Third, see if we have a config with a matching transaction group. If402* so, then we do nothing. Otherwise, add it to the list of known403* configs.404*/405for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {406if (ce->ce_txg == txg)407break;408}409410if (ce == NULL) {411if ((ce = zutil_alloc(hdl, sizeof (config_entry_t))) == NULL) {412return (-1);413}414ce->ce_txg = txg;415ce->ce_config = fnvlist_dup(config);416ce->ce_next = ve->ve_configs;417ve->ve_configs = ce;418}419420/*421* At this point we've successfully added our config to the list of422* known configs. 
The last thing to do is add the vdev guid -> path423* mappings so that we can fix up the configuration as necessary before424* doing the import.425*/426if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)427return (-1);428429if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {430free(ne);431return (-1);432}433434ne->ne_guid = vdev_guid;435ne->ne_order = order;436ne->ne_num_labels = num_labels;437ne->ne_next = pl->names;438pl->names = ne;439440return (0);441}442443static int444zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid,445boolean_t *isactive)446{447ASSERT(hdl->lpc_ops->pco_pool_active != NULL);448449int error = hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name,450guid, isactive);451452return (error);453}454455static nvlist_t *456zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig)457{458ASSERT(hdl->lpc_ops->pco_refresh_config != NULL);459460return (hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle,461tryconfig));462}463464/*465* Determine if the vdev id is a hole in the namespace.466*/467static boolean_t468vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)469{470int c;471472for (c = 0; c < holes; c++) {473474/* Top-level is a hole */475if (hole_array[c] == id)476return (B_TRUE);477}478return (B_FALSE);479}480481/*482* Convert our list of pools into the definitive set of configurations. We483* start by picking the best config for each toplevel vdev. Once that's done,484* we assemble the toplevel vdevs into a full config for the pool. 
We make a485* pass to fix up any incorrect paths, and then add it to the main list to486* return to the user.487*/488static nvlist_t *489get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,490nvlist_t *policy)491{492pool_entry_t *pe;493vdev_entry_t *ve;494config_entry_t *ce;495nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;496nvlist_t **spares, **l2cache;497uint_t i, nspares, nl2cache;498boolean_t config_seen;499uint64_t best_txg;500const char *name, *hostname = NULL;501uint64_t guid;502uint_t children = 0;503nvlist_t **child = NULL;504uint64_t *hole_array, max_id;505uint_t c;506boolean_t isactive;507nvlist_t *nvl;508boolean_t valid_top_config = B_FALSE;509510if (nvlist_alloc(&ret, 0, 0) != 0)511goto nomem;512513for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {514uint64_t id, max_txg = 0, hostid = 0;515uint_t holes = 0;516517if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)518goto nomem;519config_seen = B_FALSE;520521/*522* Iterate over all toplevel vdevs. 
Grab the pool configuration523* from the first one we find, and then go through the rest and524* add them as necessary to the 'vdevs' member of the config.525*/526for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {527528/*529* Determine the best configuration for this vdev by530* selecting the config with the latest transaction531* group.532*/533best_txg = 0;534for (ce = ve->ve_configs; ce != NULL;535ce = ce->ce_next) {536537if (ce->ce_txg > best_txg) {538tmp = ce->ce_config;539best_txg = ce->ce_txg;540}541}542543/*544* We rely on the fact that the max txg for the545* pool will contain the most up-to-date information546* about the valid top-levels in the vdev namespace.547*/548if (best_txg > max_txg) {549(void) nvlist_remove(config,550ZPOOL_CONFIG_VDEV_CHILDREN,551DATA_TYPE_UINT64);552(void) nvlist_remove(config,553ZPOOL_CONFIG_HOLE_ARRAY,554DATA_TYPE_UINT64_ARRAY);555556max_txg = best_txg;557hole_array = NULL;558holes = 0;559max_id = 0;560valid_top_config = B_FALSE;561562if (nvlist_lookup_uint64(tmp,563ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {564verify(nvlist_add_uint64(config,565ZPOOL_CONFIG_VDEV_CHILDREN,566max_id) == 0);567valid_top_config = B_TRUE;568}569570if (nvlist_lookup_uint64_array(tmp,571ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,572&holes) == 0) {573verify(nvlist_add_uint64_array(config,574ZPOOL_CONFIG_HOLE_ARRAY,575hole_array, holes) == 0);576}577}578579if (!config_seen) {580/*581* Copy the relevant pieces of data to the pool582* configuration:583*584* version585* pool guid586* name587* comment (if available)588* compatibility features (if available)589* pool state590* hostid (if available)591* hostname (if available)592*/593uint64_t state, version;594const char *comment = NULL;595const char *compatibility = NULL;596597version = fnvlist_lookup_uint64(tmp,598ZPOOL_CONFIG_VERSION);599fnvlist_add_uint64(config,600ZPOOL_CONFIG_VERSION, version);601guid = 
fnvlist_lookup_uint64(tmp,602ZPOOL_CONFIG_POOL_GUID);603fnvlist_add_uint64(config,604ZPOOL_CONFIG_POOL_GUID, guid);605name = fnvlist_lookup_string(tmp,606ZPOOL_CONFIG_POOL_NAME);607fnvlist_add_string(config,608ZPOOL_CONFIG_POOL_NAME, name);609610if (nvlist_lookup_string(tmp,611ZPOOL_CONFIG_COMMENT, &comment) == 0)612fnvlist_add_string(config,613ZPOOL_CONFIG_COMMENT, comment);614615if (nvlist_lookup_string(tmp,616ZPOOL_CONFIG_COMPATIBILITY,617&compatibility) == 0)618fnvlist_add_string(config,619ZPOOL_CONFIG_COMPATIBILITY,620compatibility);621622state = fnvlist_lookup_uint64(tmp,623ZPOOL_CONFIG_POOL_STATE);624fnvlist_add_uint64(config,625ZPOOL_CONFIG_POOL_STATE, state);626627hostid = 0;628if (nvlist_lookup_uint64(tmp,629ZPOOL_CONFIG_HOSTID, &hostid) == 0) {630fnvlist_add_uint64(config,631ZPOOL_CONFIG_HOSTID, hostid);632hostname = fnvlist_lookup_string(tmp,633ZPOOL_CONFIG_HOSTNAME);634fnvlist_add_string(config,635ZPOOL_CONFIG_HOSTNAME, hostname);636}637638config_seen = B_TRUE;639}640641/*642* Add this top-level vdev to the child array.643*/644verify(nvlist_lookup_nvlist(tmp,645ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);646verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,647&id) == 0);648649if (id >= children) {650nvlist_t **newchild;651652newchild = zutil_alloc(hdl, (id + 1) *653sizeof (nvlist_t *));654if (newchild == NULL)655goto nomem;656657for (c = 0; c < children; c++)658newchild[c] = child[c];659660free(child);661child = newchild;662children = id + 1;663}664if (nvlist_dup(nvtop, &child[id], 0) != 0)665goto nomem;666667}668669/*670* If we have information about all the top-levels then671* clean up the nvlist which we've constructed. 
This672* means removing any extraneous devices that are673* beyond the valid range or adding devices to the end674* of our array which appear to be missing.675*/676if (valid_top_config) {677if (max_id < children) {678for (c = max_id; c < children; c++)679nvlist_free(child[c]);680children = max_id;681} else if (max_id > children) {682nvlist_t **newchild;683684newchild = zutil_alloc(hdl, (max_id) *685sizeof (nvlist_t *));686if (newchild == NULL)687goto nomem;688689for (c = 0; c < children; c++)690newchild[c] = child[c];691692free(child);693child = newchild;694children = max_id;695}696}697698verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,699&guid) == 0);700701/*702* The vdev namespace may contain holes as a result of703* device removal. We must add them back into the vdev704* tree before we process any missing devices.705*/706if (holes > 0) {707ASSERT(valid_top_config);708709for (c = 0; c < children; c++) {710nvlist_t *holey;711712if (child[c] != NULL ||713!vdev_is_hole(hole_array, holes, c))714continue;715716if (nvlist_alloc(&holey, NV_UNIQUE_NAME,7170) != 0)718goto nomem;719720/*721* Holes in the namespace are treated as722* "hole" top-level vdevs and have a723* special flag set on them.724*/725if (nvlist_add_string(holey,726ZPOOL_CONFIG_TYPE,727VDEV_TYPE_HOLE) != 0 ||728nvlist_add_uint64(holey,729ZPOOL_CONFIG_ID, c) != 0 ||730nvlist_add_uint64(holey,731ZPOOL_CONFIG_GUID, 0ULL) != 0) {732nvlist_free(holey);733goto nomem;734}735child[c] = holey;736}737}738739/*740* Look for any missing top-level vdevs. If this is the case,741* create a faked up 'missing' vdev as a placeholder. 
We cannot742* simply compress the child array, because the kernel performs743* certain checks to make sure the vdev IDs match their location744* in the configuration.745*/746for (c = 0; c < children; c++) {747if (child[c] == NULL) {748nvlist_t *missing;749if (nvlist_alloc(&missing, NV_UNIQUE_NAME,7500) != 0)751goto nomem;752if (nvlist_add_string(missing,753ZPOOL_CONFIG_TYPE,754VDEV_TYPE_MISSING) != 0 ||755nvlist_add_uint64(missing,756ZPOOL_CONFIG_ID, c) != 0 ||757nvlist_add_uint64(missing,758ZPOOL_CONFIG_GUID, 0ULL) != 0) {759nvlist_free(missing);760goto nomem;761}762child[c] = missing;763}764}765766/*767* Put all of this pool's top-level vdevs into a root vdev.768*/769if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)770goto nomem;771if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,772VDEV_TYPE_ROOT) != 0 ||773nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||774nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||775nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,776(const nvlist_t **)child, children) != 0) {777nvlist_free(nvroot);778goto nomem;779}780781for (c = 0; c < children; c++)782nvlist_free(child[c]);783free(child);784children = 0;785child = NULL;786787/*788* Go through and fix up any paths and/or devids based on our789* known list of vdev GUID -> path mappings.790*/791if (fix_paths(hdl, nvroot, pl->names) != 0) {792nvlist_free(nvroot);793goto nomem;794}795796/*797* Add the root vdev to this pool's configuration.798*/799if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,800nvroot) != 0) {801nvlist_free(nvroot);802goto nomem;803}804nvlist_free(nvroot);805806/*807* zdb uses this path to report on active pools that were808* imported or created using -R.809*/810if (active_ok)811goto add_pool;812813/*814* Determine if this pool is currently active, in which case we815* can't actually import it.816*/817verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,818&name) == 0);819verify(nvlist_lookup_uint64(config, 
ZPOOL_CONFIG_POOL_GUID,820&guid) == 0);821822if (zutil_pool_active(hdl, name, guid, &isactive) != 0)823goto error;824825if (isactive) {826nvlist_free(config);827config = NULL;828continue;829}830831if (policy != NULL) {832if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,833policy) != 0)834goto nomem;835}836837if ((nvl = zutil_refresh_config(hdl, config)) == NULL) {838nvlist_free(config);839config = NULL;840continue;841}842843nvlist_free(config);844config = nvl;845846/*847* Go through and update the paths for spares, now that we have848* them.849*/850verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,851&nvroot) == 0);852if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,853&spares, &nspares) == 0) {854for (i = 0; i < nspares; i++) {855if (fix_paths(hdl, spares[i], pl->names) != 0)856goto nomem;857}858}859860/*861* Update the paths for l2cache devices.862*/863if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,864&l2cache, &nl2cache) == 0) {865for (i = 0; i < nl2cache; i++) {866if (fix_paths(hdl, l2cache[i], pl->names) != 0)867goto nomem;868}869}870871/*872* Restore the original information read from the actual label.873*/874(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,875DATA_TYPE_UINT64);876(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,877DATA_TYPE_STRING);878if (hostid != 0) {879verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,880hostid) == 0);881verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,882hostname) == 0);883}884885add_pool:886/*887* Add this pool to the list of configs.888*/889verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,890&name) == 0);891892if (nvlist_add_nvlist(ret, name, config) != 0)893goto nomem;894895nvlist_free(config);896config = NULL;897}898899return (ret);900901nomem:902(void) zutil_no_memory(hdl);903error:904nvlist_free(config);905nvlist_free(ret);906for (c = 0; c < children; c++)907nvlist_free(child[c]);908free(child);909910return (NULL);911}912913/*914* Return the offset of the given 
label.915*/916static uint64_t917label_offset(uint64_t size, int l)918{919ASSERT0(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t));920return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?9210 : size - VDEV_LABELS * sizeof (vdev_label_t)));922}923924/*925* The same description applies as to zpool_read_label below,926* except here we do it without aio, presumably because an aio call927* errored out in a way we think not using it could circumvent.928*/929static int930zpool_read_label_slow(int fd, nvlist_t **config, int *num_labels)931{932struct stat64 statbuf;933int l, count = 0;934vdev_phys_t *label;935nvlist_t *expected_config = NULL;936uint64_t expected_guid = 0, size;937938*config = NULL;939940if (fstat64_blk(fd, &statbuf) == -1)941return (0);942size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);943944label = (vdev_phys_t *)umem_alloc_aligned(sizeof (*label), PAGESIZE,945UMEM_DEFAULT);946if (label == NULL)947return (-1);948949for (l = 0; l < VDEV_LABELS; l++) {950uint64_t state, guid, txg;951off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;952953if (pread64(fd, label, sizeof (vdev_phys_t),954offset) != sizeof (vdev_phys_t))955continue;956957if (nvlist_unpack(label->vp_nvlist,958sizeof (label->vp_nvlist), config, 0) != 0)959continue;960961if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,962&guid) != 0 || guid == 0) {963nvlist_free(*config);964continue;965}966967if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,968&state) != 0 || state > POOL_STATE_L2CACHE) {969nvlist_free(*config);970continue;971}972973if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&974(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,975&txg) != 0 || txg == 0)) {976nvlist_free(*config);977continue;978}979980if (expected_guid) {981if (expected_guid == guid)982count++;983984nvlist_free(*config);985} else {986expected_config = *config;987expected_guid = guid;988count++;989}990}991992if (num_labels != NULL)993*num_labels = 
count;994995umem_free_aligned(label, sizeof (*label));996*config = expected_config;997998return (0);999}10001001/*1002* Given a file descriptor, read the label information and return an nvlist1003* describing the configuration, if there is one. The number of valid1004* labels found will be returned in num_labels when non-NULL.1005*/1006int1007zpool_read_label(int fd, nvlist_t **config, int *num_labels)1008{1009#ifndef HAVE_AIO_H1010return (zpool_read_label_slow(fd, config, num_labels));1011#else1012struct stat64 statbuf;1013struct aiocb aiocbs[VDEV_LABELS];1014struct aiocb *aiocbps[VDEV_LABELS];1015vdev_phys_t *labels;1016nvlist_t *expected_config = NULL;1017uint64_t expected_guid = 0, size;1018int error, l, count = 0;10191020*config = NULL;10211022if (fstat64_blk(fd, &statbuf) == -1)1023return (0);1024size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);10251026labels = (vdev_phys_t *)umem_alloc_aligned(1027VDEV_LABELS * sizeof (*labels), PAGESIZE, UMEM_DEFAULT);1028if (labels == NULL)1029return (-1);10301031memset(aiocbs, 0, sizeof (aiocbs));1032for (l = 0; l < VDEV_LABELS; l++) {1033off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;10341035aiocbs[l].aio_fildes = fd;1036aiocbs[l].aio_offset = offset;1037aiocbs[l].aio_buf = &labels[l];1038aiocbs[l].aio_nbytes = sizeof (vdev_phys_t);1039aiocbs[l].aio_lio_opcode = LIO_READ;1040aiocbps[l] = &aiocbs[l];1041}10421043if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) {1044int saved_errno = errno;1045boolean_t do_slow = B_FALSE;1046error = -1;10471048if (errno == EAGAIN || errno == EINTR || errno == EIO) {1049/*1050* A portion of the requests may have been submitted.1051* Clean them up.1052*/1053for (l = 0; l < VDEV_LABELS; l++) {1054errno = 0;1055switch (aio_error(&aiocbs[l])) {1056case EINVAL:1057break;1058case EINPROGRESS:1059/*1060* This shouldn't be possible to1061* encounter, die if we do.1062*/1063ASSERT(B_FALSE);1064zfs_fallthrough;1065case EREMOTEIO:1066/*1067* May be returned by an 
NVMe device1068* which is visible in /dev/ but due1069* to a low-level format change, or1070* other error, needs to be rescanned.1071* Try the slow method.1072*/1073zfs_fallthrough;1074case EAGAIN:1075case EOPNOTSUPP:1076case ENOSYS:1077do_slow = B_TRUE;1078zfs_fallthrough;1079case 0:1080default:1081(void) aio_return(&aiocbs[l]);1082}1083}1084}1085if (do_slow) {1086/*1087* At least some IO involved access unsafe-for-AIO1088* files. Let's try again, without AIO this time.1089*/1090error = zpool_read_label_slow(fd, config, num_labels);1091saved_errno = errno;1092}1093umem_free_aligned(labels, VDEV_LABELS * sizeof (*labels));1094errno = saved_errno;1095return (error);1096}10971098for (l = 0; l < VDEV_LABELS; l++) {1099uint64_t state, guid, txg;11001101if (aio_return(&aiocbs[l]) != sizeof (vdev_phys_t))1102continue;11031104if (nvlist_unpack(labels[l].vp_nvlist,1105sizeof (labels[l].vp_nvlist), config, 0) != 0)1106continue;11071108if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,1109&guid) != 0 || guid == 0) {1110nvlist_free(*config);1111continue;1112}11131114if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,1115&state) != 0 || state > POOL_STATE_L2CACHE) {1116nvlist_free(*config);1117continue;1118}11191120if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&1121(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,1122&txg) != 0 || txg == 0)) {1123nvlist_free(*config);1124continue;1125}11261127if (expected_guid) {1128if (expected_guid == guid)1129count++;11301131nvlist_free(*config);1132} else {1133expected_config = *config;1134expected_guid = guid;1135count++;1136}1137}11381139if (num_labels != NULL)1140*num_labels = count;11411142umem_free_aligned(labels, VDEV_LABELS * sizeof (*labels));1143*config = expected_config;11441145return (0);1146#endif1147}11481149/*1150* Sorted by full path and then vdev guid to allow for multiple entries with1151* the same full path name. 
This is required because it's possible to1152* have multiple block devices with labels that refer to the same1153* ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both1154* entries need to be added to the cache. Scenarios where this can occur1155* include overwritten pool labels, devices which are visible from multiple1156* hosts and multipath devices.1157*/1158int1159slice_cache_compare(const void *arg1, const void *arg2)1160{1161const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;1162const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;1163uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;1164uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;1165int rv;11661167rv = TREE_ISIGN(strcmp(nm1, nm2));1168if (rv)1169return (rv);11701171return (TREE_CMP(guid1, guid2));1172}11731174static int1175label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,1176uint64_t vdev_guid, const char **path, const char **devid)1177{1178nvlist_t **child;1179uint_t c, children;1180uint64_t guid;1181const char *val;1182int error;11831184if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,1185&child, &children) == 0) {1186for (c = 0; c < children; c++) {1187error = label_paths_impl(hdl, child[c],1188pool_guid, vdev_guid, path, devid);1189if (error)1190return (error);1191}1192return (0);1193}11941195if (nvroot == NULL)1196return (0);11971198error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);1199if ((error != 0) || (guid != vdev_guid))1200return (0);12011202error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);1203if (error == 0)1204*path = val;12051206error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);1207if (error == 0)1208*devid = val;12091210return (0);1211}12121213/*1214* Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID1215* and store these strings as config_path and devid_path respectively.1216* The returned pointers are only valid as long as label remains 
valid.1217*/1218int1219label_paths(libpc_handle_t *hdl, nvlist_t *label, const char **path,1220const char **devid)1221{1222nvlist_t *nvroot;1223uint64_t pool_guid;1224uint64_t vdev_guid;1225uint64_t state;12261227*path = NULL;1228*devid = NULL;1229if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid) != 0)1230return (ENOENT);12311232/*1233* In case of spare or l2cache, we directly return path/devid from the1234* label.1235*/1236if (!(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state)) &&1237(state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE)) {1238(void) nvlist_lookup_string(label, ZPOOL_CONFIG_PATH, path);1239(void) nvlist_lookup_string(label, ZPOOL_CONFIG_DEVID, devid);1240return (0);1241}12421243if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||1244nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid))1245return (ENOENT);12461247return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,1248devid));1249}12501251static void1252zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock,1253avl_tree_t *cache, const char *path, const char *name, int order)1254{1255avl_index_t where;1256rdsk_node_t *slice;12571258slice = zutil_alloc(hdl, sizeof (rdsk_node_t));1259if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {1260free(slice);1261return;1262}1263slice->rn_vdev_guid = 0;1264slice->rn_lock = lock;1265slice->rn_avl = cache;1266slice->rn_hdl = hdl;1267slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;1268slice->rn_labelpaths = B_FALSE;12691270pthread_mutex_lock(lock);1271if (avl_find(cache, slice, &where)) {1272free(slice->rn_name);1273free(slice);1274} else {1275avl_insert(cache, slice, where);1276}1277pthread_mutex_unlock(lock);1278}12791280static int1281zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock,1282avl_tree_t *cache, const char *dir, int order)1283{1284int error;1285char path[MAXPATHLEN];1286struct dirent64 *dp;1287DIR *dirp;12881289if (realpath(dir, 
path) == NULL) {1290error = errno;1291if (error == ENOENT)1292return (0);12931294zutil_error_aux(hdl, "%s", zfs_strerror(error));1295(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,1296"cannot resolve path '%s'"), dir);1297return (error);1298}12991300dirp = opendir(path);1301if (dirp == NULL) {1302error = errno;1303zutil_error_aux(hdl, "%s", zfs_strerror(error));1304(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,1305"cannot open '%s'"), path);1306return (error);1307}13081309while ((dp = readdir64(dirp)) != NULL) {1310const char *name = dp->d_name;1311if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)1312continue;13131314switch (dp->d_type) {1315case DT_UNKNOWN:1316case DT_BLK:1317case DT_LNK:1318#ifdef __FreeBSD__1319case DT_CHR:1320#endif1321case DT_REG:1322break;1323default:1324continue;1325}13261327zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,1328order);1329}13301331(void) closedir(dirp);1332return (0);1333}13341335static int1336zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock,1337avl_tree_t *cache, const char *dir, int order)1338{1339int error = 0;1340char path[MAXPATHLEN];1341char *d = NULL;1342ssize_t dl;1343const char *dpath, *name;13441345/*1346* Separate the directory and the basename.1347* We do this so that we can get the realpath of1348* the directory. 
We don't get the realpath on the1349* whole path because if it's a symlink, we want the1350* path of the symlink not where it points to.1351*/1352name = zfs_basename(dir);1353if ((dl = zfs_dirnamelen(dir)) == -1)1354dpath = ".";1355else1356dpath = d = zutil_strndup(hdl, dir, dl);13571358if (realpath(dpath, path) == NULL) {1359error = errno;1360if (error == ENOENT) {1361error = 0;1362goto out;1363}13641365zutil_error_aux(hdl, "%s", zfs_strerror(error));1366(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,1367"cannot resolve path '%s'"), dir);1368goto out;1369}13701371zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);13721373out:1374free(d);1375return (error);1376}13771378/*1379* Scan a list of directories for zfs devices.1380*/1381static int1382zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock,1383avl_tree_t **slice_cache, const char * const *dir, size_t dirs)1384{1385avl_tree_t *cache;1386rdsk_node_t *slice;1387void *cookie;1388int i, error;13891390*slice_cache = NULL;1391cache = zutil_alloc(hdl, sizeof (avl_tree_t));1392avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),1393offsetof(rdsk_node_t, rn_node));13941395for (i = 0; i < dirs; i++) {1396struct stat sbuf;13971398if (stat(dir[i], &sbuf) != 0) {1399error = errno;1400if (error == ENOENT)1401continue;14021403zutil_error_aux(hdl, "%s", zfs_strerror(error));1404(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(1405TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);1406goto error;1407}14081409/*1410* If dir[i] is a directory, we walk through it and add all1411* the entries to the cache. 
If it's not a directory, we just1412* add it to the cache.1413*/1414if (S_ISDIR(sbuf.st_mode)) {1415if ((error = zpool_find_import_scan_dir(hdl, lock,1416cache, dir[i], i)) != 0)1417goto error;1418} else {1419if ((error = zpool_find_import_scan_path(hdl, lock,1420cache, dir[i], i)) != 0)1421goto error;1422}1423}14241425*slice_cache = cache;1426return (0);14271428error:1429cookie = NULL;1430while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {1431free(slice->rn_name);1432free(slice);1433}1434free(cache);14351436return (error);1437}14381439/*1440* Given a list of directories to search, find all pools stored on disk. This1441* includes partial pools which are not available to import. If no args are1442* given (argc is 0), then the default directory (/dev/dsk) is searched.1443* poolname or guid (but not both) are provided by the caller when trying1444* to import a specific pool.1445*/1446static nvlist_t *1447zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg,1448pthread_mutex_t *lock, avl_tree_t *cache)1449{1450(void) lock;1451nvlist_t *ret = NULL;1452pool_list_t pools = { 0 };1453pool_entry_t *pe, *penext;1454vdev_entry_t *ve, *venext;1455config_entry_t *ce, *cenext;1456name_entry_t *ne, *nenext;1457rdsk_node_t *slice;1458void *cookie;1459taskq_t *tq;14601461verify(iarg->poolname == NULL || iarg->guid == 0);14621463/*1464* Create a thread pool to parallelize the process of reading and1465* validating labels, a large number of threads can be used due to1466* minimal contention.1467*/1468long threads = 2 * sysconf(_SC_NPROCESSORS_ONLN);1469#ifdef HAVE_AIO_H1470long am;1471#ifdef _SC_AIO_LISTIO_MAX1472am = sysconf(_SC_AIO_LISTIO_MAX);1473if (am >= VDEV_LABELS)1474threads = MIN(threads, am / VDEV_LABELS);1475#endif1476#ifdef _SC_AIO_MAX1477am = sysconf(_SC_AIO_MAX);1478if (am >= VDEV_LABELS)1479threads = MIN(threads, am / VDEV_LABELS);1480#endif1481#endif1482tq = taskq_create("zpool_find_import", threads, minclsyspri, 1, 
INT_MAX,1483TASKQ_DYNAMIC);1484for (slice = avl_first(cache); slice;1485(slice = avl_walk(cache, slice, AVL_AFTER)))1486(void) taskq_dispatch(tq, zpool_open_func, slice, TQ_SLEEP);14871488taskq_wait(tq);1489taskq_destroy(tq);14901491/*1492* Process the cache, filtering out any entries which are not1493* for the specified pool then adding matching label configs.1494*/1495cookie = NULL;1496while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {1497if (slice->rn_config != NULL) {1498nvlist_t *config = slice->rn_config;1499boolean_t matched = B_TRUE;1500boolean_t aux = B_FALSE;1501int fd;15021503/*1504* Check if it's a spare or l2cache device. If it is,1505* we need to skip the name and guid check since they1506* don't exist on aux device label.1507*/1508if (iarg->poolname != NULL || iarg->guid != 0) {1509uint64_t state;1510aux = nvlist_lookup_uint64(config,1511ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&1512(state == POOL_STATE_SPARE ||1513state == POOL_STATE_L2CACHE);1514}15151516if (iarg->poolname != NULL && !aux) {1517const char *pname;15181519matched = nvlist_lookup_string(config,1520ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&1521strcmp(iarg->poolname, pname) == 0;1522} else if (iarg->guid != 0 && !aux) {1523uint64_t this_guid;15241525matched = nvlist_lookup_uint64(config,1526ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&1527iarg->guid == this_guid;1528}1529if (matched) {1530/*1531* Verify all remaining entries can be opened1532* exclusively. 
This will prune all underlying1533* multipath devices which otherwise could1534* result in the vdev appearing as UNAVAIL.1535*1536* Under zdb, this step isn't required and1537* would prevent a zdb -e of active pools with1538* no cachefile.1539*/1540fd = open(slice->rn_name,1541O_RDONLY | O_EXCL | O_CLOEXEC);1542if (fd >= 0 || iarg->can_be_active) {1543if (fd >= 0)1544close(fd);1545add_config(hdl, &pools,1546slice->rn_name, slice->rn_order,1547slice->rn_num_labels, config);1548}1549}1550nvlist_free(config);1551}1552free(slice->rn_name);1553free(slice);1554}1555avl_destroy(cache);1556free(cache);15571558ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);15591560for (pe = pools.pools; pe != NULL; pe = penext) {1561penext = pe->pe_next;1562for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {1563venext = ve->ve_next;1564for (ce = ve->ve_configs; ce != NULL; ce = cenext) {1565cenext = ce->ce_next;1566nvlist_free(ce->ce_config);1567free(ce);1568}1569free(ve);1570}1571free(pe);1572}15731574for (ne = pools.names; ne != NULL; ne = nenext) {1575nenext = ne->ne_next;1576free(ne->ne_name);1577free(ne);1578}15791580return (ret);1581}15821583/*1584* Given a config, discover the paths for the devices which1585* exist in the config.1586*/1587static int1588discover_cached_paths(libpc_handle_t *hdl, nvlist_t *nv,1589avl_tree_t *cache, pthread_mutex_t *lock)1590{1591const char *path = NULL;1592ssize_t dl;1593uint_t children;1594nvlist_t **child;15951596if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,1597&child, &children) == 0) {1598for (int c = 0; c < children; c++) {1599discover_cached_paths(hdl, child[c], cache, lock);1600}1601}16021603/*1604* Once we have the path, we need to add the directory to1605* our directory cache.1606*/1607if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {1608int ret;1609char c = '\0';1610if ((dl = zfs_dirnamelen(path)) == -1) {1611path = ".";1612} else {1613c = path[dl];1614((char *)path)[dl] = '\0';16151616}1617ret = 
zpool_find_import_scan_dir(hdl, lock, cache,1618path, 0);1619if (c != '\0')1620((char *)path)[dl] = c;16211622return (ret);1623}1624return (0);1625}16261627/*1628* Given a cache file, return the contents as a list of importable pools.1629* poolname or guid (but not both) are provided by the caller when trying1630* to import a specific pool.1631*/1632static nvlist_t *1633zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg)1634{1635char *buf;1636int fd;1637struct stat64 statbuf;1638nvlist_t *raw, *src, *dst;1639nvlist_t *pools;1640nvpair_t *elem;1641const char *name;1642uint64_t this_guid;1643boolean_t active;16441645verify(iarg->poolname == NULL || iarg->guid == 0);16461647if ((fd = open(iarg->cachefile, O_RDONLY | O_CLOEXEC)) < 0) {1648zutil_error_aux(hdl, "%s", zfs_strerror(errno));1649(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,1650"failed to open cache file"));1651return (NULL);1652}16531654if (fstat64(fd, &statbuf) != 0) {1655zutil_error_aux(hdl, "%s", zfs_strerror(errno));1656(void) close(fd);1657(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,1658"failed to get size of cache file"));1659return (NULL);1660}16611662if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) {1663(void) close(fd);1664return (NULL);1665}16661667if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {1668(void) close(fd);1669free(buf);1670(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,1671"failed to read cache file contents"));1672return (NULL);1673}16741675(void) close(fd);16761677if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {1678free(buf);1679(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,1680"invalid or corrupt cache file contents"));1681return (NULL);1682}16831684free(buf);16851686/*1687* Go through and get the current state of the pools and refresh their1688* state.1689*/1690if (nvlist_alloc(&pools, 0, 0) != 0) {1691(void) zutil_no_memory(hdl);1692nvlist_free(raw);1693return (NULL);1694}16951696elem = 
NULL;1697while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {1698src = fnvpair_value_nvlist(elem);16991700name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);1701if (iarg->poolname != NULL && strcmp(iarg->poolname, name) != 0)1702continue;17031704this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);1705if (iarg->guid != 0 && iarg->guid != this_guid)1706continue;17071708if (zutil_pool_active(hdl, name, this_guid, &active) != 0) {1709nvlist_free(raw);1710nvlist_free(pools);1711return (NULL);1712}17131714if (active)1715continue;17161717if (iarg->scan) {1718uint64_t saved_guid = iarg->guid;1719const char *saved_poolname = iarg->poolname;1720pthread_mutex_t lock;17211722/*1723* Create the device cache that will hold the1724* devices we will scan based on the cachefile.1725* This will get destroyed and freed by1726* zpool_find_import_impl.1727*/1728avl_tree_t *cache = zutil_alloc(hdl,1729sizeof (avl_tree_t));1730avl_create(cache, slice_cache_compare,1731sizeof (rdsk_node_t),1732offsetof(rdsk_node_t, rn_node));1733nvlist_t *nvroot = fnvlist_lookup_nvlist(src,1734ZPOOL_CONFIG_VDEV_TREE);17351736/*1737* We only want to find the pool with this_guid.1738* We will reset these values back later.1739*/1740iarg->guid = this_guid;1741iarg->poolname = NULL;17421743/*1744* We need to build up a cache of devices that exists1745* in the paths pointed to by the cachefile. This allows1746* us to preserve the device namespace that was1747* originally specified by the user but also lets us1748* scan devices in those directories in case they had1749* been renamed.1750*/1751pthread_mutex_init(&lock, NULL);1752discover_cached_paths(hdl, nvroot, cache, &lock);1753nvlist_t *nv = zpool_find_import_impl(hdl, iarg,1754&lock, cache);1755pthread_mutex_destroy(&lock);17561757/*1758* zpool_find_import_impl will return back1759* a list of pools that it found based on the1760* device cache. 
There should only be one pool1761* since we're looking for a specific guid.1762* We will use that pool to build up the final1763* pool nvlist which is returned back to the1764* caller.1765*/1766nvpair_t *pair = nvlist_next_nvpair(nv, NULL);1767if (pair == NULL)1768continue;1769fnvlist_add_nvlist(pools, nvpair_name(pair),1770fnvpair_value_nvlist(pair));17711772VERIFY0P(nvlist_next_nvpair(nv, pair));17731774iarg->guid = saved_guid;1775iarg->poolname = saved_poolname;1776continue;1777}17781779if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,1780iarg->cachefile) != 0) {1781(void) zutil_no_memory(hdl);1782nvlist_free(raw);1783nvlist_free(pools);1784return (NULL);1785}17861787update_vdevs_config_dev_sysfs_path(src);17881789if ((dst = zutil_refresh_config(hdl, src)) == NULL) {1790nvlist_free(raw);1791nvlist_free(pools);1792return (NULL);1793}17941795if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {1796(void) zutil_no_memory(hdl);1797nvlist_free(dst);1798nvlist_free(raw);1799nvlist_free(pools);1800return (NULL);1801}1802nvlist_free(dst);1803}1804nvlist_free(raw);1805return (pools);1806}18071808static nvlist_t *1809zpool_find_import(libpc_handle_t *hdl, importargs_t *iarg)1810{1811pthread_mutex_t lock;1812avl_tree_t *cache;1813nvlist_t *pools = NULL;18141815verify(iarg->poolname == NULL || iarg->guid == 0);1816pthread_mutex_init(&lock, NULL);18171818/*1819* Locate pool member vdevs by blkid or by directory scanning.1820* On success a newly allocated AVL tree which is populated with an1821* entry for each discovered vdev will be returned in the cache.1822* It's the caller's responsibility to consume and destroy this tree.1823*/1824if (iarg->scan || iarg->paths != 0) {1825size_t dirs = iarg->paths;1826const char * const *dir = (const char * const *)iarg->path;18271828if (dirs == 0)1829dir = zpool_default_search_paths(&dirs);18301831if (zpool_find_import_scan(hdl, &lock, &cache,1832dir, dirs) != 0) {1833pthread_mutex_destroy(&lock);1834return (NULL);1835}1836} else 
{1837if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) {1838pthread_mutex_destroy(&lock);1839return (NULL);1840}1841}18421843pools = zpool_find_import_impl(hdl, iarg, &lock, cache);1844pthread_mutex_destroy(&lock);1845return (pools);1846}184718481849nvlist_t *1850zpool_search_import(libpc_handle_t *hdl, importargs_t *import)1851{1852nvlist_t *pools = NULL;18531854verify(import->poolname == NULL || import->guid == 0);18551856if (import->cachefile != NULL)1857pools = zpool_find_import_cached(hdl, import);1858else1859pools = zpool_find_import(hdl, import);18601861if ((pools == NULL || nvlist_empty(pools)) &&1862hdl->lpc_open_access_error && geteuid() != 0) {1863(void) zutil_error(hdl, LPC_EACCESS, dgettext(TEXT_DOMAIN,1864"no pools found"));1865}18661867return (pools);1868}18691870static boolean_t1871pool_match(nvlist_t *cfg, const char *tgt)1872{1873uint64_t v, guid = strtoull(tgt, NULL, 0);1874const char *s;18751876if (guid != 0) {1877if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)1878return (v == guid);1879} else {1880if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)1881return (strcmp(s, tgt) == 0);1882}1883return (B_FALSE);1884}18851886int1887zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp,1888importargs_t *args)1889{1890nvlist_t *pools;1891nvlist_t *match = NULL;1892nvlist_t *config = NULL;1893char *sepp = NULL;1894int count = 0;1895char *targetdup = strdup(target);18961897if (targetdup == NULL)1898return (ENOMEM);18991900*configp = NULL;19011902if ((sepp = strpbrk(targetdup, "/@")) != NULL)1903*sepp = '\0';19041905pools = zpool_search_import(hdl, args);1906if (pools == NULL) {1907zutil_error_aux(hdl, dgettext(TEXT_DOMAIN, "no pools found"));1908(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,1909"failed to find config for pool '%s'"), targetdup);1910free(targetdup);1911return (ENOENT);1912}19131914nvpair_t *elem = NULL;1915while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) 
{1916VERIFY0(nvpair_value_nvlist(elem, &config));1917if (pool_match(config, targetdup)) {1918count++;1919if (match != NULL) {1920/* multiple matches found */1921continue;1922} else {1923match = fnvlist_dup(config);1924}1925}1926}1927fnvlist_free(pools);19281929if (count == 0) {1930zutil_error_aux(hdl, dgettext(TEXT_DOMAIN,1931"no matching pools"));1932(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,1933"failed to find config for pool '%s'"), targetdup);1934free(targetdup);1935return (ENOENT);1936}19371938if (count > 1) {1939zutil_error_aux(hdl, dgettext(TEXT_DOMAIN,1940"more than one matching pool"));1941(void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN,1942"failed to find config for pool '%s'"), targetdup);1943free(targetdup);1944fnvlist_free(match);1945return (EINVAL);1946}19471948*configp = match;1949free(targetdup);19501951return (0);1952}19531954/* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */1955static boolean_t1956vdev_is_leaf(nvlist_t *nv)1957{1958uint_t children = 0;1959nvlist_t **child;19601961(void) nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,1962&child, &children);19631964return (children == 0);1965}19661967/* Return if a vdev is a leaf vdev and a real device (disk or file) */1968static boolean_t1969vdev_is_real_leaf(nvlist_t *nv)1970{1971const char *type = NULL;1972if (!vdev_is_leaf(nv))1973return (B_FALSE);19741975(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type);1976if ((strcmp(type, VDEV_TYPE_DISK) == 0) ||1977(strcmp(type, VDEV_TYPE_FILE) == 0)) {1978return (B_TRUE);1979}19801981return (B_FALSE);1982}19831984/*1985* This function is called by our FOR_EACH_VDEV() macros.1986*1987* state: State machine status (stored inside of a (nvlist_t *))1988* nv: The current vdev nvlist_t we are iterating over.1989* last_nv: The previous vdev nvlist_t we returned to the user in1990* the last iteration of FOR_EACH_VDEV(). 
We use it1991* to find the next vdev nvlist_t we should return.1992* real_leaves_only: Only return leaf vdevs.1993*1994* Returns 1 if we found the next vdev nvlist_t for this iteration. 0 if1995* we're still searching for it.1996*/1997static int1998__for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,1999boolean_t real_leaves_only)2000{2001enum {FIRST_NV = 0, NEXT_IS_MATCH = 1, STOP_LOOKING = 2};20022003/* The very first entry in the NV list is a special case */2004if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) {2005if (real_leaves_only && !vdev_is_real_leaf(nv))2006return (0);20072008*((nvlist_t **)last_nv) = nv;2009*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;2010return (1);2011}20122013/*2014* We came across our last_nv, meaning the next one is the one we2015* want2016*/2017if (nv == *((nvlist_t **)last_nv)) {2018/* Next iteration of this function will return the nvlist_t */2019*((nvlist_t **)state) = (nvlist_t *)NEXT_IS_MATCH;2020return (0);2021}20222023/*2024* We marked NEXT_IS_MATCH on the previous iteration, so this is the one2025* we want.2026*/2027if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) {2028if (real_leaves_only && !vdev_is_real_leaf(nv))2029return (0);20302031*((nvlist_t **)last_nv) = nv;2032*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;2033return (1);2034}20352036return (0);2037}20382039int2040for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv)2041{2042return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_FALSE));2043}20442045int2046for_each_real_leaf_vdev_macro_helper_func(void *state, nvlist_t *nv,2047void *last_nv)2048{2049return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_TRUE));2050}20512052/*2053* Internal function for iterating over the vdevs.2054*2055* For each vdev, func() will be called and will be passed 'zhp' (which is2056* typically the zpool_handle_t cast as a void pointer), the vdev's nvlist, and2057* a user-defined data pointer).2058*2059* The 
return values from all the func() calls will be OR'd together and2060* returned.2061*/2062int2063for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func,2064void *data)2065{2066nvlist_t **child;2067uint_t c, children;2068int ret = 0;2069int i;2070const char *type;20712072const char *list[] = {2073ZPOOL_CONFIG_SPARES,2074ZPOOL_CONFIG_L2CACHE,2075ZPOOL_CONFIG_CHILDREN2076};20772078if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)2079return (ret);20802081/* Don't run our function on indirect vdevs */2082if (strcmp(type, VDEV_TYPE_INDIRECT) != 0) {2083ret |= func(zhp, nv, data);2084}20852086for (i = 0; i < ARRAY_SIZE(list); i++) {2087if (nvlist_lookup_nvlist_array(nv, list[i], &child,2088&children) == 0) {2089for (c = 0; c < children; c++) {2090uint64_t ishole = 0;20912092(void) nvlist_lookup_uint64(child[c],2093ZPOOL_CONFIG_IS_HOLE, &ishole);20942095if (ishole)2096continue;20972098ret |= for_each_vdev_cb(zhp, child[c],2099func, data);2100}2101}2102}21032104return (ret);2105}21062107/*2108* Given an ZPOOL_CONFIG_VDEV_TREE nvpair, iterate over all the vdevs, calling2109* func() for each one. func() is passed the vdev's nvlist and an optional2110* user-defined 'data' pointer.2111*/2112int2113for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data)2114{2115return (for_each_vdev_cb(NULL, nvroot, func, data));2116}211721182119