Path: blob/main/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2018, loli10K <[email protected]>
 */

/*
 * The ZFS retire agent is responsible for managing hot spares across all pools.
 * When we see a device fault or a device removal, we try to open the associated
 * pool and look for any hot spares.  We iterate over any available hot spares
 * and attempt a 'zpool replace' for each one.
 *
 * For vdevs diagnosed as faulty, the agent is also responsible for proactively
 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
 */

#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <libzutil.h>
#include <libzfs.h>
#include <string.h>
#include <libgen.h>

#include "zfs_agents.h"
#include "fmd_api.h"


typedef struct zfs_retire_repaired {
	struct zfs_retire_repaired	*zrr_next;
	uint64_t			zrr_pool;
	uint64_t			zrr_vdev;
} zfs_retire_repaired_t;

typedef struct zfs_retire_data {
	libzfs_handle_t			*zrd_hdl;
	zfs_retire_repaired_t		*zrd_repaired;
} zfs_retire_data_t;

static void
zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
{
	zfs_retire_repaired_t *zrp;

	while ((zrp = zdp->zrd_repaired) != NULL) {
		zdp->zrd_repaired = zrp->zrr_next;
		fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
	}
}

/*
 * Find a pool with a matching GUID.
 */
typedef struct find_cbdata {
	uint64_t	cb_guid;
	zpool_handle_t	*cb_zhp;
	nvlist_t	*cb_vdev;
	uint64_t	cb_vdev_guid;
	uint64_t	cb_num_spares;
} find_cbdata_t;

static int
find_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;

	if (cbp->cb_guid ==
	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
		cbp->cb_zhp = zhp;
		return (1);
	}

	zpool_close(zhp);
	return (0);
}

/*
 * Find a vdev within a tree with a matching GUID.
 */
static nvlist_t *
find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid)
{
	uint64_t guid;
	nvlist_t **child;
	uint_t c, children;
	nvlist_t *ret;

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
	    guid == search_guid) {
		fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
		    "matched vdev %llu", guid);
		return (nv);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
			return (ret);
	}

	return (NULL);
}

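/*
 * zpool_iter() callback: if this pool's spare list contains the vdev GUID
 * recorded in cb_vdev_guid, ask the kernel to remove that spare (unless it
 * is already in the REMOVED state) and count it in cb_num_spares.
 */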
static int
remove_spares(zpool_handle_t *zhp, void *data)
{
	nvlist_t *config, *nvroot;
	nvlist_t **spares;
	uint_t nspares;
	char *devname;
	find_cbdata_t *cbp = data;
	uint64_t spareguid = 0;
	vdev_stat_t *vs;
	unsigned int c;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) {
		zpool_close(zhp);
		return (0);
	}

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0) {
		zpool_close(zhp);
		return (0);
	}

	for (int i = 0; i < nspares; i++) {
		if (nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == cbp->cb_vdev_guid) {
			devname = zpool_vdev_name(NULL, zhp, spares[i],
			    B_FALSE);
			nvlist_lookup_uint64_array(spares[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c);
			if (vs->vs_state != VDEV_STATE_REMOVED &&
			    zpool_vdev_remove_wanted(zhp, devname) == 0)
				cbp->cb_num_spares++;
			break;
		}
	}

	zpool_close(zhp);
	return (0);
}

/*
 * Given a vdev guid, find and remove all spares associated with it.
 */
static int
find_and_remove_spares(libzfs_handle_t *zhdl, uint64_t vdev_guid)
{
	find_cbdata_t cb;

	cb.cb_num_spares = 0;
	cb.cb_vdev_guid = vdev_guid;
	zpool_iter(zhdl, remove_spares, &cb);

	return (cb.cb_num_spares);
}

/*
 * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
 */
static zpool_handle_t *
find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
    nvlist_t **vdevp)
{
	find_cbdata_t cb;
	zpool_handle_t *zhp;
	nvlist_t *config, *nvroot;

	/*
	 * Find the corresponding pool and make sure the vdev still exists.
	 */
	cb.cb_guid = pool_guid;
	if (zpool_iter(zhdl, find_pool, &cb) != 1)
		return (NULL);

	zhp = cb.cb_zhp;
	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0) {
		zpool_close(zhp);
		return (NULL);
	}

	if (vdev_guid != 0) {
		if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) {
			zpool_close(zhp);
			return (NULL);
		}
	}

	return (zhp);
}

/*
 * Given a vdev, attempt to replace it with every known spare until one
 * succeeds or we run out of devices to try.
 * Return whether we were successful or not in replacing the device.
 */
static boolean_t
replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
{
	nvlist_t *config, *nvroot, *replacement;
	nvlist_t **spares;
	uint_t s, nspares;
	char *dev_name;
	zprop_source_t source;
	int ashift;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0)
		return (B_FALSE);

	/*
	 * Find out if there are any hot spares available in the pool.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (B_FALSE);

	/*
	 * Look up the "ashift" pool property; we may need it for the
	 * replacement.
	 */
	ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source);

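	/*
	 * Build the replacement description handed to zpool_vdev_attach():
	 * a root vdev whose single child is filled in below with the spare
	 * currently being tried.
	 */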
	replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT);

	dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);

	/*
	 * Try to replace each spare, ending when we successfully
	 * replace it.
	 */
	for (s = 0; s < nspares; s++) {
		boolean_t rebuild = B_FALSE;
		const char *spare_name, *type;

		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
		    &spare_name) != 0)
			continue;

		/* prefer sequential resilvering for distributed spares */
		if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
		    &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
			rebuild = B_TRUE;

		/* if set, add the "ashift" pool property to the spare nvlist */
		if (source != ZPROP_SRC_DEFAULT)
			(void) nvlist_add_uint64(spares[s],
			    ZPOOL_CONFIG_ASHIFT, ashift);

		(void) nvlist_add_nvlist_array(replacement,
		    ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spares[s], 1);

		fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
		    dev_name, zfs_basename(spare_name));

		if (zpool_vdev_attach(zhp, dev_name, spare_name,
		    replacement, B_TRUE, rebuild) == 0) {
			free(dev_name);
			nvlist_free(replacement);
			return (B_TRUE);
		}
	}

	free(dev_name);
	nvlist_free(replacement);

	return (B_FALSE);
}

/*
 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and
 * ASRU is now usable.  ZFS has found the device to be present and
 * functioning.
 */
static void
zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	zfs_retire_repaired_t *zrp;
	uint64_t pool_guid, vdev_guid;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		return;

	/*
	 * Before checking the state of the ASRU, go through and see if we've
	 * already made an attempt to repair this ASRU.  This list is cleared
	 * whenever we receive any kind of list event, and is designed to
	 * prevent us from generating a feedback loop when we attempt repairs
	 * against a faulted pool.  The problem is that checking the unusable
	 * state of the ASRU can involve opening the pool, which can post
	 * statechange events but otherwise leave the pool in the faulted
	 * state.  This list allows us to detect when a statechange event is
	 * due to our own request.
	 */
	for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
		if (zrp->zrr_pool == pool_guid &&
		    zrp->zrr_vdev == vdev_guid)
			return;
	}

	zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
	zrp->zrr_next = zdp->zrd_repaired;
	zrp->zrr_pool = pool_guid;
	zrp->zrr_vdev = vdev_guid;
	zdp->zrd_repaired = zrp;

	fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu",
	    vdev_guid, pool_guid);
}

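/*
 * Main event entry point for the agent.  Device-removal resources and
 * REMOVED/FAULTED statechange resources are handled inline (spare cleanup,
 * zpool_vdev_remove_wanted() and hot-spare replacement); HEALTHY statechange
 * resources and vdev_remove sysevents are recorded as repairs; fault and
 * repair lists are walked to fault, degrade, clear or spare the affected
 * vdevs.
 */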
static void
zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class)
{
	(void) ep;
	uint64_t pool_guid, vdev_guid;
	zpool_handle_t *zhp;
	nvlist_t *resource, *fault;
	nvlist_t **faults;
	uint_t f, nfaults;
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	libzfs_handle_t *zhdl = zdp->zrd_hdl;
	boolean_t fault_device, degrade_device;
	boolean_t is_repair;
	boolean_t l2arc = B_FALSE;
	boolean_t spare = B_FALSE;
	const char *scheme;
	nvlist_t *vdev = NULL;
	const char *uuid;
	int repair_done = 0;
	boolean_t retire;
	boolean_t is_disk;
	vdev_aux_t aux;
	uint64_t state = 0;
	vdev_stat_t *vs;
	unsigned int c;

	fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);

	(void) nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE,
	    &state);

	/*
	 * If this is a resource notifying us of device removal then simply
	 * check for an available spare and continue unless the device is a
	 * l2arc vdev, in which case we just offline it.
	 */
	if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
	    (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
	    (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) {
		const char *devtype;
		char *devname;
		boolean_t skip_removal = B_FALSE;

		if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    &devtype) == 0) {
			if (strcmp(devtype, VDEV_TYPE_SPARE) == 0)
				spare = B_TRUE;
			else if (strcmp(devtype, VDEV_TYPE_L2CACHE) == 0)
				l2arc = B_TRUE;
		}

		if (nvlist_lookup_uint64(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
			return;

		if (vdev_guid == 0) {
			fmd_hdl_debug(hdl, "Got a zero GUID");
			return;
		}

		if (spare) {
			int nspares = find_and_remove_spares(zhdl, vdev_guid);
			fmd_hdl_debug(hdl, "%d spares removed", nspares);
			return;
		}

		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
		    &pool_guid) != 0)
			return;

		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
		    &vdev)) == NULL)
			return;

		devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);

		nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
		    (uint64_t **)&vs, &c);

		if (vs->vs_state == VDEV_STATE_OFFLINE)
			return;

		/*
		 * If state removed is requested for an already removed vdev,
		 * it's a loopback event from spa_async_remove().  Just
		 * ignore it.
		 */
		if ((vs->vs_state == VDEV_STATE_REMOVED &&
		    state == VDEV_STATE_REMOVED)) {
			if (strcmp(class, "resource.fs.zfs.removed") == 0 &&
			    nvlist_exists(nvl, "by_kernel")) {
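				/*
				 * The removal was initiated by the kernel
				 * ("by_kernel"): skip asking it to remove the
				 * vdev again, but still fall through so a hot
				 * spare can be attached below.
				 */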
				skip_removal = B_TRUE;
			} else {
				return;
			}
		}

		/* Remove the vdev since device is unplugged */
		int remove_status = 0;
		if (!skip_removal && (l2arc ||
		    (strcmp(class, "resource.fs.zfs.removed") == 0))) {
			remove_status = zpool_vdev_remove_wanted(zhp, devname);
			fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'"
			    ", err:%d", devname, libzfs_errno(zhdl));
		}

		/* Replace the vdev with a spare if it's not a l2arc */
		if (!l2arc && !remove_status &&
		    (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
		    replace_with_spare(hdl, zhp, vdev) == B_FALSE)) {
			/* Could not handle with spare */
			fmd_hdl_debug(hdl, "no spare for '%s'", devname);
		}

		free(devname);
		zpool_close(zhp);
		return;
	}

	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
		return;

	/*
	 * Note: on Linux statechange events are more than just
	 * healthy ones so we need to confirm the actual state value.
	 */
	if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
	    state == VDEV_STATE_HEALTHY) {
		zfs_vdev_repair(hdl, nvl);
		return;
	}
	if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
		zfs_vdev_repair(hdl, nvl);
		return;
	}

	zfs_retire_clear_data(hdl, zdp);

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
		is_repair = B_TRUE;
	else
		is_repair = B_FALSE;

	/*
	 * We subscribe to zfs faults as well as all repair events.
	 */
	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
	    &faults, &nfaults) != 0)
		return;

	for (f = 0; f < nfaults; f++) {
		fault = faults[f];

		fault_device = B_FALSE;
		degrade_device = B_FALSE;
		is_disk = B_FALSE;

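		/*
		 * Respect the suspect's retire flag: if the diagnosis
		 * explicitly set FM_SUSPECT_RETIRE to false, leave this
		 * vdev alone.
		 */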
		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
		    &retire) == 0 && retire == 0)
			continue;

		/*
		 * While we subscribe to fault.fs.zfs.*, we only take action
		 * for faults targeting a specific vdev (open failure or SERD
		 * failure).  We also subscribe to fault.io.* events, so that
		 * faulty disks will be faulted in the ZFS configuration.
		 */
		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
			fault_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.vdev.checksum")) {
			degrade_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.vdev.slow_io")) {
			degrade_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.device")) {
			fault_device = B_FALSE;
		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
			is_disk = B_TRUE;
			fault_device = B_TRUE;
		} else {
			continue;
		}

		if (is_disk) {
			continue;
		} else {
			/*
			 * This is a ZFS fault.  Lookup the resource, and
			 * attempt to find the matching vdev.
			 */
			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
			    &resource) != 0 ||
			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
			    &scheme) != 0)
				continue;

			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
			    &pool_guid) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
			    &vdev_guid) != 0) {
				if (is_repair)
					vdev_guid = 0;
				else
					continue;
			}

			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
			    &vdev)) == NULL)
				continue;

			aux = VDEV_AUX_ERR_EXCEEDED;
		}

		if (vdev_guid == 0) {
			/*
			 * For pool-level repair events, clear the entire pool.
			 */
			fmd_hdl_debug(hdl, "zpool_clear of pool '%s'",
			    zpool_get_name(zhp));
			(void) zpool_clear(zhp, NULL, NULL);
			zpool_close(zhp);
			continue;
		}

		/*
		 * If this is a repair event, then mark the vdev as repaired and
		 * continue.
		 */
		if (is_repair) {
			repair_done = 1;
			fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu",
			    zpool_get_name(zhp), vdev_guid);
			(void) zpool_vdev_clear(zhp, vdev_guid);
			zpool_close(zhp);
			continue;
		}

		/*
		 * Actively fault the device if needed.
		 */
		if (fault_device)
			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
		if (degrade_device)
			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);

		if (fault_device || degrade_device)
			fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'",
			    fault_device ? "fault" : "degrade", vdev_guid,
			    zpool_get_name(zhp));

		/*
		 * Attempt to substitute a hot spare.
		 */
		(void) replace_with_spare(hdl, zhp, vdev);

		zpool_close(zhp);
	}

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
		fmd_case_uuresolved(hdl, uuid);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_retire_recv,	/* fmdo_recv */
	NULL,			/* fmdo_timeout */
	NULL,			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
};

void
_zfs_retire_init(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp;
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		libzfs_fini(zhdl);
		return;
	}

	zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
	zdp->zrd_hdl = zhdl;

	fmd_hdl_setspecific(hdl, zdp);
}

void
_zfs_retire_fini(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);

	if (zdp != NULL) {
		zfs_retire_clear_data(hdl, zdp);
		libzfs_fini(zdp->zrd_hdl);
		fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
	}
}