Path: blob/main/sys/contrib/openzfs/cmd/zinject/zinject.c
48380 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.23* Copyright (c) 2012, 2015 by Delphix. All rights reserved.24* Copyright (c) 2017, Intel Corporation.25* Copyright (c) 2023-2025, Klara, Inc.26*/2728/*29* ZFS Fault Injector30*31* This userland component takes a set of options and uses libzpool to translate32* from a user-visible object type and name to an internal representation.33* There are two basic types of faults: device faults and data faults.34*35*36* DEVICE FAULTS37*38* Errors can be injected into a particular vdev using the '-d' option. This39* option takes a path or vdev GUID to uniquely identify the device within a40* pool. There are four types of errors that can be injected, IO, ENXIO,41* ECHILD, and EILSEQ. These can be controlled through the '-e' option and the42* default is ENXIO. For EIO failures, any attempt to read data from the device43* will return EIO, but a subsequent attempt to reopen the device will succeed.44* For ENXIO failures, any attempt to read from the device will return EIO, but45* any attempt to reopen the device will also return ENXIO. The EILSEQ failures46* only apply to read operations (-T read) and will flip a bit after the device47* has read the original data.48*49* For label faults, the -L option must be specified. This allows faults50* to be injected into either the nvlist, uberblock, pad1, or pad2 region51* of all the labels for the specified device.52*53* This form of the command looks like:54*55* zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool56*57*58* DATA FAULTS59*60* We begin with a tuple of the form:61*62* <type,level,range,object>63*64* type A string describing the type of data to target. Each type65* implicitly describes how to interpret 'object'. Currently,66* the following values are supported:67*68* data User data for a file69* dnode Dnode for a file or directory70*71* The following MOS objects are special. Instead of injecting72* errors on a particular object or blkid, we inject errors across73* all objects of the given type.74*75* mos Any data in the MOS76* mosdir object directory77* config pool configuration78* bpobj blkptr list79* spacemap spacemap80* metaslab metaslab81* errlog persistent error log82*83* level Object level. Defaults to '0', not applicable to all types. If84* a range is given, this corresponds to the indirect block85* corresponding to the specific range.86*87* range A numerical range [start,end) within the object. Defaults to88* the full size of the file.89*90* object A string describing the logical location of the object. For91* files and directories (currently the only supported types),92* this is the path of the object on disk.93*94* This is translated, via libzpool, into the following internal representation:95*96* <type,objset,object,level,range>97*98* These types should be self-explanatory. This tuple is then passed to the99* kernel via a special ioctl() to initiate fault injection for the given100* object. Note that 'type' is not strictly necessary for fault injection, but101* is used when translating existing faults into a human-readable string.102*103*104* The command itself takes one of the forms:105*106* zinject107* zinject <-a | -u pool>108* zinject -c <id|all>109* zinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range]110* [-T iotype] [-t type object | -b bookmark pool]111* zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]112* [-r range] <object>113* zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool114*115* With no arguments, the command prints all currently registered injection116* handlers, with their numeric identifiers.117*118* The '-c' option will clear the given handler, or all handlers if 'all' is119* specified.120*121* The '-e' option takes a string describing the errno to simulate. This must122* be one of 'io', 'checksum', 'decompress', or 'decrypt'. In most cases this123* will result in the same behavior, but RAID-Z will produce a different set of124* ereports for this situation.125*126* The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is127* specified, then the ARC cache is flushed appropriately. If '-u' is128* specified, then the underlying SPA is unloaded. Either of these flags can be129* specified independently of any other handlers. The '-m' flag automatically130* does an unmount and remount of the underlying dataset to aid in flushing the131* cache.132*133* The '-f' flag controls the frequency of errors injected, expressed as a134* real number percentage between 0.0001 and 100. The default is 100.135*136* The <object> form is responsible for actually injecting the handler into the137* framework. It takes the arguments described above, translates them to the138* internal tuple using libzpool, and then issues an ioctl() to register the139* handler.140*141* The '-b' option can target a specific bookmark, regardless of whether a142* human-readable interface has been designed. It allows developers to specify143* a particular block by number.144*145* The '-E' option injects pipeline ready stage delays for the given object or146* bookmark. The delay is specified in milliseconds, and it supports I/O type147* and range filters.148*/149150#include <errno.h>151#include <fcntl.h>152#include <stdio.h>153#include <stdlib.h>154#include <string.h>155#include <strings.h>156#include <unistd.h>157158#include <sys/fs/zfs.h>159#include <sys/mount.h>160161#include <libzfs.h>162163#undef verify /* both libzfs.h and zfs_context.h want to define this */164165#include "zinject.h"166167libzfs_handle_t *g_zfs;168int zfs_fd;169170static const char *const errtable[TYPE_INVAL] = {171"data",172"dnode",173"mos",174"mosdir",175"metaslab",176"config",177"bpobj",178"spacemap",179"errlog",180"uber",181"nvlist",182"pad1",183"pad2"184};185186static err_type_t187name_to_type(const char *arg)188{189int i;190for (i = 0; i < TYPE_INVAL; i++)191if (strcmp(errtable[i], arg) == 0)192return (i);193194return (TYPE_INVAL);195}196197static const char *198type_to_name(uint64_t type)199{200switch (type) {201case DMU_OT_OBJECT_DIRECTORY:202return ("mosdir");203case DMU_OT_OBJECT_ARRAY:204return ("metaslab");205case DMU_OT_PACKED_NVLIST:206return ("config");207case DMU_OT_BPOBJ:208return ("bpobj");209case DMU_OT_SPACE_MAP:210return ("spacemap");211case DMU_OT_ERROR_LOG:212return ("errlog");213default:214return ("-");215}216}217218struct errstr {219int err;220const char *str;221};222static const struct errstr errstrtable[] = {223{ EIO, "io" },224{ ECKSUM, "checksum" },225{ EINVAL, "decompress" },226{ EACCES, "decrypt" },227{ ENXIO, "nxio" },228{ ECHILD, "dtl" },229{ EILSEQ, "corrupt" },230{ ENOSYS, "noop" },231{ 0, NULL },232};233234static int235str_to_err(const char *str)236{237for (int i = 0; errstrtable[i].str != NULL; i++)238if (strcasecmp(errstrtable[i].str, str) == 0)239return (errstrtable[i].err);240return (-1);241}242static const char *243err_to_str(int err)244{245for (int i = 0; errstrtable[i].str != NULL; i++)246if (errstrtable[i].err == err)247return (errstrtable[i].str);248return ("[unknown]");249}250251static const char *const iotypestrtable[ZINJECT_IOTYPES] = {252[ZINJECT_IOTYPE_NULL] = "null",253[ZINJECT_IOTYPE_READ] = "read",254[ZINJECT_IOTYPE_WRITE] = "write",255[ZINJECT_IOTYPE_FREE] = "free",256[ZINJECT_IOTYPE_CLAIM] = "claim",257[ZINJECT_IOTYPE_FLUSH] = "flush",258[ZINJECT_IOTYPE_TRIM] = "trim",259[ZINJECT_IOTYPE_ALL] = "all",260[ZINJECT_IOTYPE_PROBE] = "probe",261};262263static zinject_iotype_t264str_to_iotype(const char *arg)265{266for (uint_t iotype = 0; iotype < ZINJECT_IOTYPES; iotype++)267if (iotypestrtable[iotype] != NULL &&268strcasecmp(iotypestrtable[iotype], arg) == 0)269return (iotype);270return (ZINJECT_IOTYPES);271}272273static const char *274iotype_to_str(zinject_iotype_t iotype)275{276if (iotype >= ZINJECT_IOTYPES || iotypestrtable[iotype] == NULL)277return ("[unknown]");278return (iotypestrtable[iotype]);279}280281/*282* Print usage message.283*/284void285usage(void)286{287(void) printf(288"usage:\n"289"\n"290"\tzinject\n"291"\n"292"\t\tList all active injection records.\n"293"\n"294"\tzinject -c <id|all>\n"295"\n"296"\t\tClear the particular record (if given a numeric ID), or\n"297"\t\tall records if 'all' is specified.\n"298"\n"299"\tzinject -p <function name> pool\n"300"\t\tInject a panic fault at the specified function. Only \n"301"\t\tfunctions which call spa_vdev_config_exit(), or \n"302"\t\tspa_vdev_exit() will trigger a panic.\n"303"\n"304"\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"305"\t\t[-T <read|write|free|claim|flush|all>] [-f frequency] pool\n\n"306"\t\tInject a fault into a particular device or the device's\n"307"\t\tlabel. Label injection can either be 'nvlist', 'uber',\n "308"\t\t'pad1', or 'pad2'.\n"309"\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n"310"\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n"311"\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"312"\t\tdevice error injection to a percentage of the IOs.\n"313"\n"314"\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"315"\t\tPerform a specific action on a particular device.\n"316"\n"317"\tzinject -d device -D latency:lanes pool\n"318"\n"319"\t\tAdd an artificial delay to IO requests on a particular\n"320"\t\tdevice, such that the requests take a minimum of 'latency'\n"321"\t\tmilliseconds to complete. Each delay has an associated\n"322"\t\tnumber of 'lanes' which defines the number of concurrent\n"323"\t\tIO requests that can be processed.\n"324"\n"325"\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"326"\t\tthe device will only be able to service a single IO request\n"327"\t\tat a time with each request taking 10 ms to complete. So,\n"328"\t\tif only a single request is submitted every 10 ms, the\n"329"\t\taverage latency will be 10 ms; but if more than one request\n"330"\t\tis submitted every 10 ms, the average latency will be more\n"331"\t\tthan 10 ms.\n"332"\n"333"\t\tSimilarly, if a delay of 10 ms is specified to have two\n"334"\t\tlanes (-D 10:2), then the device will be able to service\n"335"\t\ttwo requests at a time, each with a minimum latency of\n"336"\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"337"\t\tthe average latency will be 10 ms; but if more than two\n"338"\t\trequests are submitted every 10 ms, the average latency\n"339"\t\twill be more than 10 ms.\n"340"\n"341"\t\tAlso note, these delays are additive. So two invocations\n"342"\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"343"\t\tof '-D 10:2'. This also means, one can specify multiple\n"344"\t\tlanes with differing target latencies. For example, an\n"345"\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"346"\t\tcreate 3 lanes on the device; one lane with a latency\n"347"\t\tof 10 ms and two lanes with a 25 ms latency.\n"348"\n"349"\tzinject -P import|export -s <seconds> pool\n"350"\t\tAdd an artificial delay to a future pool import or export,\n"351"\t\tsuch that the operation takes a minimum of supplied seconds\n"352"\t\tto complete.\n"353"\n"354"\tzinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range]\n"355"\t\t[-T iotype] [-t type object | -b bookmark pool]\n"356"\n"357"\t\tInject pipeline ready stage delays for the given object path\n"358"\t\t(data or dnode) or raw bookmark. The delay is specified in\n"359"\t\tmilliseconds.\n"360"\n"361"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"362"\t\tCause the pool to stop writing blocks yet not\n"363"\t\treport errors for a duration. Simulates buggy hardware\n"364"\t\tthat fails to honor cache flush requests.\n"365"\t\tDefault duration is 30 seconds. The machine is panicked\n"366"\t\tat the end of the duration.\n"367"\n"368"\tzinject -b objset:object:level:blkid pool\n"369"\n"370"\t\tInject an error into pool 'pool' with the numeric bookmark\n"371"\t\tspecified by the remaining tuple. Each number is in\n"372"\t\thexadecimal, and only one block can be specified.\n"373"\n"374"\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"375"\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"376"\n"377"\t\tInject an error into the object specified by the '-t' option\n"378"\t\tand the object descriptor. The 'object' parameter is\n"379"\t\tinterpreted depending on the '-t' option.\n"380"\n"381"\t\t-q\tQuiet mode. Only print out the handler number added.\n"382"\t\t-e\tInject a specific error. Must be one of 'io',\n"383"\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n"384"\t\t-C\tInject the given error only into specific DVAs. The\n"385"\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"386"\t\t\tseparated by commas (ex. '0,2').\n"387"\t\t-l\tInject error at a particular block level. Default is "388"0.\n"389"\t\t-m\tAutomatically remount underlying filesystem.\n"390"\t\t-r\tInject error over a particular logical range of an\n"391"\t\t\tobject. Will be translated to the appropriate blkid\n"392"\t\t\trange according to the object's properties.\n"393"\t\t-a\tFlush the ARC cache. Can be specified without any\n"394"\t\t\tassociated object.\n"395"\t\t-u\tUnload the associated pool. Can be specified with only\n"396"\t\t\ta pool object.\n"397"\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n"398"\t\t\ta percentage between 0.0001 and 100.\n"399"\n"400"\t-t data\t\tInject an error into the plain file contents of a\n"401"\t\t\tfile. The object must be specified as a complete path\n"402"\t\t\tto a file on a ZFS filesystem.\n"403"\n"404"\t-t dnode\tInject an error into the metadnode in the block\n"405"\t\t\tcorresponding to the dnode for a file or directory. The\n"406"\t\t\t'-r' option is incompatible with this mode. The object\n"407"\t\t\tis specified as a complete path to a file or directory\n"408"\t\t\ton a ZFS filesystem.\n"409"\n"410"\t-t <mos>\tInject errors into the MOS for objects of the given\n"411"\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n"412"\t\t\tspacemap, metaslab, errlog. The only valid <object> is\n"413"\t\t\tthe poolname.\n");414}415416static int417iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),418void *data)419{420zfs_cmd_t zc = {"\0"};421int ret;422423while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)424if ((ret = func((int)zc.zc_guid, zc.zc_name,425&zc.zc_inject_record, data)) != 0)426return (ret);427428if (errno != ENOENT) {429(void) fprintf(stderr, "Unable to list handlers: %s\n",430strerror(errno));431return (-1);432}433434return (0);435}436437static int438print_data_handler(int id, const char *pool, zinject_record_t *record,439void *data)440{441int *count = data;442443if (record->zi_guid != 0 || record->zi_func[0] != '\0' ||444record->zi_duration != 0) {445return (0);446}447448if (*count == 0) {449(void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s "450"%-15s %-6s %-15s\n", "ID", "POOL", "OBJSET", "OBJECT",451"TYPE", "LVL", "DVAs", "RANGE", "MATCH", "INJECT");452(void) printf("--- --------------- ------ "453"------ -------- --- ---- --------------- "454"------ ------\n");455}456457*count += 1;458459char rangebuf[32];460if (record->zi_start == 0 && record->zi_end == -1ULL)461snprintf(rangebuf, sizeof (rangebuf), "all");462else463snprintf(rangebuf, sizeof (rangebuf), "[%llu, %llu]",464(u_longlong_t)record->zi_start,465(u_longlong_t)record->zi_end);466467468(void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x %-15s "469"%6" PRIu64 " %6" PRIu64 "\n", id, pool,470(u_longlong_t)record->zi_objset,471(u_longlong_t)record->zi_object, type_to_name(record->zi_type),472record->zi_level, record->zi_dvas, rangebuf,473record->zi_match_count, record->zi_inject_count);474475return (0);476}477478static int479print_device_handler(int id, const char *pool, zinject_record_t *record,480void *data)481{482int *count = data;483484if (record->zi_guid == 0 || record->zi_func[0] != '\0')485return (0);486487if (record->zi_cmd == ZINJECT_DELAY_IO)488return (0);489490if (*count == 0) {491(void) printf("%3s %-15s %-16s %-5s %-10s %-9s "492"%-6s %-6s\n",493"ID", "POOL", "GUID", "TYPE", "ERROR", "FREQ",494"MATCH", "INJECT");495(void) printf(496"--- --------------- ---------------- "497"----- ---------- --------- "498"------ ------\n");499}500501*count += 1;502503double freq = record->zi_freq == 0 ? 100.0f :504(((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f;505506(void) printf("%3d %-15s %llx %-5s %-10s %8.4f%% "507"%6" PRIu64 " %6" PRIu64 "\n", id, pool,508(u_longlong_t)record->zi_guid,509iotype_to_str(record->zi_iotype), err_to_str(record->zi_error),510freq, record->zi_match_count, record->zi_inject_count);511512return (0);513}514515static int516print_delay_handler(int id, const char *pool, zinject_record_t *record,517void *data)518{519int *count = data;520521if (record->zi_guid == 0 || record->zi_func[0] != '\0')522return (0);523524if (record->zi_cmd != ZINJECT_DELAY_IO)525return (0);526527if (*count == 0) {528(void) printf("%3s %-15s %-16s %-10s %-5s %-9s "529"%-6s %-6s\n",530"ID", "POOL", "GUID", "DELAY (ms)", "LANES", "FREQ",531"MATCH", "INJECT");532(void) printf("--- --------------- ---------------- "533"---------- ----- --------- "534"------ ------\n");535}536537*count += 1;538539double freq = record->zi_freq == 0 ? 100.0f :540(((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f;541542(void) printf("%3d %-15s %llx %10llu %5llu %8.4f%% "543"%6" PRIu64 " %6" PRIu64 "\n", id, pool,544(u_longlong_t)record->zi_guid,545(u_longlong_t)NSEC2MSEC(record->zi_timer),546(u_longlong_t)record->zi_nlanes,547freq, record->zi_match_count, record->zi_inject_count);548549return (0);550}551552static int553print_panic_handler(int id, const char *pool, zinject_record_t *record,554void *data)555{556int *count = data;557558if (record->zi_func[0] == '\0')559return (0);560561if (*count == 0) {562(void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION");563(void) printf("--- --------------- ----------------\n");564}565566*count += 1;567568(void) printf("%3d %-15s %s\n", id, pool, record->zi_func);569570return (0);571}572573static int574print_pool_delay_handler(int id, const char *pool, zinject_record_t *record,575void *data)576{577int *count = data;578579if (record->zi_cmd != ZINJECT_DELAY_IMPORT &&580record->zi_cmd != ZINJECT_DELAY_EXPORT) {581return (0);582}583584if (*count == 0) {585(void) printf("%3s %-19s %-11s %s\n",586"ID", "POOL", "DELAY (sec)", "COMMAND");587(void) printf("--- ------------------- -----------"588" -------\n");589}590591*count += 1;592593(void) printf("%3d %-19s %-11llu %s\n",594id, pool, (u_longlong_t)record->zi_duration,595record->zi_cmd == ZINJECT_DELAY_IMPORT ? "import": "export");596597return (0);598}599600/*601* Print all registered error handlers. Returns the number of handlers602* registered.603*/604static int605print_all_handlers(void)606{607int count = 0, total = 0;608609(void) iter_handlers(print_device_handler, &count);610if (count > 0) {611total += count;612(void) printf("\n");613count = 0;614}615616(void) iter_handlers(print_delay_handler, &count);617if (count > 0) {618total += count;619(void) printf("\n");620count = 0;621}622623(void) iter_handlers(print_data_handler, &count);624if (count > 0) {625total += count;626(void) printf("\n");627count = 0;628}629630(void) iter_handlers(print_pool_delay_handler, &count);631if (count > 0) {632total += count;633(void) printf("\n");634count = 0;635}636637(void) iter_handlers(print_panic_handler, &count);638639return (count + total);640}641642static int643cancel_one_handler(int id, const char *pool, zinject_record_t *record,644void *data)645{646(void) pool, (void) record, (void) data;647zfs_cmd_t zc = {"\0"};648649zc.zc_guid = (uint64_t)id;650651if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {652(void) fprintf(stderr, "failed to remove handler %d: %s\n",653id, strerror(errno));654return (1);655}656657return (0);658}659660/*661* Remove all fault injection handlers.662*/663static int664cancel_all_handlers(void)665{666int ret = iter_handlers(cancel_one_handler, NULL);667668if (ret == 0)669(void) printf("removed all registered handlers\n");670671return (ret);672}673674/*675* Remove a specific fault injection handler.676*/677static int678cancel_handler(int id)679{680zfs_cmd_t zc = {"\0"};681682zc.zc_guid = (uint64_t)id;683684if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {685(void) fprintf(stderr, "failed to remove handler %d: %s\n",686id, strerror(errno));687return (1);688}689690(void) printf("removed handler %d\n", id);691692return (0);693}694695/*696* Register a new fault injection handler.697*/698static int699register_handler(const char *pool, int flags, zinject_record_t *record,700int quiet)701{702zfs_cmd_t zc = {"\0"};703704(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));705zc.zc_inject_record = *record;706zc.zc_guid = flags;707708if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) {709const char *errmsg = strerror(errno);710711switch (errno) {712case EDOM:713errmsg = "block level exceeds max level of object";714break;715case EEXIST:716if (record->zi_cmd == ZINJECT_DELAY_IMPORT)717errmsg = "pool already imported";718if (record->zi_cmd == ZINJECT_DELAY_EXPORT)719errmsg = "a handler already exists";720break;721case ENOENT:722/* import delay injector running on older zfs module */723if (record->zi_cmd == ZINJECT_DELAY_IMPORT)724errmsg = "import delay injector not supported";725break;726default:727break;728}729(void) fprintf(stderr, "failed to add handler: %s\n", errmsg);730return (1);731}732733if (flags & ZINJECT_NULL)734return (0);735736if (quiet) {737(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);738} else {739boolean_t show_object = B_FALSE;740boolean_t show_iotype = B_FALSE;741(void) printf("Added handler %llu with the following "742"properties:\n", (u_longlong_t)zc.zc_guid);743(void) printf(" pool: %s\n", pool);744if (record->zi_guid) {745(void) printf(" vdev: %llx\n",746(u_longlong_t)record->zi_guid);747show_iotype = B_TRUE;748} else if (record->zi_func[0] != '\0') {749(void) printf(" panic function: %s\n",750record->zi_func);751} else if (record->zi_duration > 0) {752(void) printf(" time: %lld seconds\n",753(u_longlong_t)record->zi_duration);754} else if (record->zi_duration < 0) {755(void) printf(" txgs: %lld \n",756(u_longlong_t)-record->zi_duration);757} else if (record->zi_timer > 0) {758(void) printf(" timer: %lld ms\n",759(u_longlong_t)NSEC2MSEC(record->zi_timer));760if (record->zi_cmd == ZINJECT_DELAY_READY) {761show_object = B_TRUE;762show_iotype = B_TRUE;763}764} else {765show_object = B_TRUE;766}767if (show_iotype) {768(void) printf("iotype: %s\n",769iotype_to_str(record->zi_iotype));770}771if (show_object) {772(void) printf("objset: %llu\n",773(u_longlong_t)record->zi_objset);774(void) printf("object: %llu\n",775(u_longlong_t)record->zi_object);776(void) printf(" type: %llu\n",777(u_longlong_t)record->zi_type);778(void) printf(" level: %d\n", record->zi_level);779if (record->zi_start == 0 &&780record->zi_end == -1ULL)781(void) printf(" range: all\n");782else783(void) printf(" range: [%llu, %llu)\n",784(u_longlong_t)record->zi_start,785(u_longlong_t)record->zi_end);786(void) printf(" dvas: 0x%x\n", record->zi_dvas);787}788}789790return (0);791}792793static int794perform_action(const char *pool, zinject_record_t *record, int cmd)795{796zfs_cmd_t zc = {"\0"};797798ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);799(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));800zc.zc_guid = record->zi_guid;801zc.zc_cookie = cmd;802803if (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)804return (0);805806return (1);807}808809static int810parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)811{812unsigned long scan_delay;813unsigned long scan_nlanes;814815if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2)816return (1);817818/*819* We explicitly disallow a delay of zero here, because we key820* off this value being non-zero in translate_device(), to821* determine if the fault is a ZINJECT_DELAY_IO fault or not.822*/823if (scan_delay == 0)824return (1);825826/*827* The units for the CLI delay parameter is milliseconds, but828* the data passed to the kernel is interpreted as nanoseconds.829* Thus we scale the milliseconds to nanoseconds here, and this830* nanosecond value is used to pass the delay to the kernel.831*/832*delay = MSEC2NSEC(scan_delay);833*nlanes = scan_nlanes;834835return (0);836}837838static int839parse_frequency(const char *str, uint32_t *percent)840{841double val;842char *post;843844val = strtod(str, &post);845if (post == NULL || *post != '\0')846return (EINVAL);847848/* valid range is [0.0001, 100.0] */849val /= 100.0f;850if (val < 0.000001f || val > 1.0f)851return (ERANGE);852853/* convert to an integer for use by kernel */854*percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX));855856return (0);857}858859/*860* This function converts a string specifier for DVAs into a bit mask.861* The dva's provided by the user should be 0 indexed and separated by862* a comma. For example:863* "1" -> 0b0010 (0x2)864* "0,1" -> 0b0011 (0x3)865* "0,1,2" -> 0b0111 (0x7)866*/867static int868parse_dvas(const char *str, uint32_t *dvas_out)869{870const char *c = str;871uint32_t mask = 0;872boolean_t need_delim = B_FALSE;873874/* max string length is 5 ("0,1,2") */875if (strlen(str) > 5 || strlen(str) == 0)876return (EINVAL);877878while (*c != '\0') {879switch (*c) {880case '0':881case '1':882case '2':883/* check for pipe between DVAs */884if (need_delim)885return (EINVAL);886887/* check if this DVA has been set already */888if (mask & (1 << ((*c) - '0')))889return (EINVAL);890891mask |= (1 << ((*c) - '0'));892need_delim = B_TRUE;893break;894case ',':895need_delim = B_FALSE;896break;897default:898/* check for invalid character */899return (EINVAL);900}901c++;902}903904/* check for dangling delimiter */905if (!need_delim)906return (EINVAL);907908*dvas_out = mask;909return (0);910}911912int913main(int argc, char **argv)914{915int c;916char *range = NULL;917char *cancel = NULL;918char *end;919char *raw = NULL;920char *device = NULL;921int level = 0;922int quiet = 0;923int error = 0;924int domount = 0;925int io_type = ZINJECT_IOTYPE_ALL;926int action = VDEV_STATE_UNKNOWN;927err_type_t type = TYPE_INVAL;928err_type_t label = TYPE_INVAL;929zinject_record_t record = { 0 };930char pool[MAXNAMELEN] = "";931char dataset[MAXNAMELEN] = "";932zfs_handle_t *zhp = NULL;933int nowrites = 0;934int dur_txg = 0;935int dur_secs = 0;936int ret;937int flags = 0;938uint32_t dvas = 0;939hrtime_t ready_delay = -1;940941if ((g_zfs = libzfs_init()) == NULL) {942(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));943return (1);944}945946libzfs_print_on_error(g_zfs, B_TRUE);947948if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {949(void) fprintf(stderr, "failed to open ZFS device\n");950libzfs_fini(g_zfs);951return (1);952}953954if (argc == 1) {955/*956* No arguments. Print the available handlers. If there are no957* available handlers, direct the user to '-h' for help958* information.959*/960if (print_all_handlers() == 0) {961(void) printf("No handlers registered.\n");962(void) printf("Run 'zinject -h' for usage "963"information.\n");964}965libzfs_fini(g_zfs);966return (0);967}968969while ((c = getopt(argc, argv,970":aA:b:C:d:D:E:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {971switch (c) {972case 'a':973flags |= ZINJECT_FLUSH_ARC;974break;975case 'A':976if (strcasecmp(optarg, "degrade") == 0) {977action = VDEV_STATE_DEGRADED;978} else if (strcasecmp(optarg, "fault") == 0) {979action = VDEV_STATE_FAULTED;980} else {981(void) fprintf(stderr, "invalid action '%s': "982"must be 'degrade' or 'fault'\n", optarg);983usage();984libzfs_fini(g_zfs);985return (1);986}987break;988case 'b':989raw = optarg;990break;991case 'c':992cancel = optarg;993break;994case 'C':995ret = parse_dvas(optarg, &dvas);996if (ret != 0) {997(void) fprintf(stderr, "invalid DVA list '%s': "998"DVAs should be 0 indexed and separated by "999"commas.\n", optarg);1000usage();1001libzfs_fini(g_zfs);1002return (1);1003}1004break;1005case 'd':1006device = optarg;1007break;1008case 'D':1009errno = 0;1010ret = parse_delay(optarg, &record.zi_timer,1011&record.zi_nlanes);1012if (ret != 0) {10131014(void) fprintf(stderr, "invalid i/o delay "1015"value: '%s'\n", optarg);1016usage();1017libzfs_fini(g_zfs);1018return (1);1019}1020break;1021case 'e':1022error = str_to_err(optarg);1023if (error < 0) {1024(void) fprintf(stderr, "invalid error type "1025"'%s': must be one of: io decompress "1026"decrypt nxio dtl corrupt noop\n",1027optarg);1028usage();1029libzfs_fini(g_zfs);1030return (1);1031}1032break;1033case 'f':1034ret = parse_frequency(optarg, &record.zi_freq);1035if (ret != 0) {1036(void) fprintf(stderr, "%sfrequency value must "1037"be in the range [0.0001, 100.0]\n",1038ret == EINVAL ? "invalid value: " :1039ret == ERANGE ? "out of range: " : "");1040libzfs_fini(g_zfs);1041return (1);1042}1043break;1044case 'F':1045record.zi_failfast = B_TRUE;1046break;1047case 'g':1048dur_txg = 1;1049record.zi_duration = (int)strtol(optarg, &end, 10);1050if (record.zi_duration <= 0 || *end != '\0') {1051(void) fprintf(stderr, "invalid duration '%s': "1052"must be a positive integer\n", optarg);1053usage();1054libzfs_fini(g_zfs);1055return (1);1056}1057/* store duration of txgs as its negative */1058record.zi_duration *= -1;1059break;1060case 'h':1061usage();1062libzfs_fini(g_zfs);1063return (0);1064case 'I':1065/* default duration, if one hasn't yet been defined */1066nowrites = 1;1067if (dur_secs == 0 && dur_txg == 0)1068record.zi_duration = 30;1069break;1070case 'l':1071level = (int)strtol(optarg, &end, 10);1072if (*end != '\0') {1073(void) fprintf(stderr, "invalid level '%s': "1074"must be an integer\n", optarg);1075usage();1076libzfs_fini(g_zfs);1077return (1);1078}1079break;1080case 'm':1081domount = 1;1082break;1083case 'p':1084(void) strlcpy(record.zi_func, optarg,1085sizeof (record.zi_func));1086record.zi_cmd = ZINJECT_PANIC;1087break;1088case 'P':1089if (strcasecmp(optarg, "import") == 0) {1090record.zi_cmd = ZINJECT_DELAY_IMPORT;1091} else if (strcasecmp(optarg, "export") == 0) {1092record.zi_cmd = ZINJECT_DELAY_EXPORT;1093} else {1094(void) fprintf(stderr, "invalid command '%s': "1095"must be 'import' or 'export'\n", optarg);1096usage();1097libzfs_fini(g_zfs);1098return (1);1099}1100break;1101case 'q':1102quiet = 1;1103break;1104case 'r':1105range = optarg;1106flags |= ZINJECT_CALC_RANGE;1107break;1108case 's':1109dur_secs = 1;1110record.zi_duration = (int)strtol(optarg, &end, 10);1111if (record.zi_duration <= 0 || *end != '\0') {1112(void) fprintf(stderr, "invalid duration '%s': "1113"must be a positive integer\n", optarg);1114usage();1115libzfs_fini(g_zfs);1116return (1);1117}1118break;1119case 'T':1120io_type = str_to_iotype(optarg);1121if (io_type == ZINJECT_IOTYPES) {1122(void) fprintf(stderr, "invalid I/O type "1123"'%s': must be 'read', 'write', 'free', "1124"'claim', 'flush' or 'all'\n", optarg);1125usage();1126libzfs_fini(g_zfs);1127return (1);1128}1129break;1130case 't':1131if ((type = name_to_type(optarg)) == TYPE_INVAL &&1132!MOS_TYPE(type)) {1133(void) fprintf(stderr, "invalid type '%s'\n",1134optarg);1135usage();1136libzfs_fini(g_zfs);1137return (1);1138}1139break;1140case 'u':1141flags |= ZINJECT_UNLOAD_SPA;1142break;1143case 'E':1144ready_delay = MSEC2NSEC(strtol(optarg, &end, 10));1145if (ready_delay <= 0 || *end != '\0') {1146(void) fprintf(stderr, "invalid delay '%s': "1147"must be a positive duration\n", optarg);1148usage();1149libzfs_fini(g_zfs);1150return (1);1151}1152record.zi_cmd = ZINJECT_DELAY_READY;1153record.zi_timer = ready_delay;1154break;1155case 'L':1156if ((label = name_to_type(optarg)) == TYPE_INVAL &&1157!LABEL_TYPE(type)) {1158(void) fprintf(stderr, "invalid label type "1159"'%s'\n", optarg);1160usage();1161libzfs_fini(g_zfs);1162return (1);1163}1164break;1165case ':':1166(void) fprintf(stderr, "option -%c requires an "1167"operand\n", optopt);1168usage();1169libzfs_fini(g_zfs);1170return (1);1171case '?':1172(void) fprintf(stderr, "invalid option '%c'\n",1173optopt);1174usage();1175libzfs_fini(g_zfs);1176return (2);1177}1178}11791180argc -= optind;1181argv += optind;11821183if (record.zi_duration != 0 && record.zi_cmd == 0)1184record.zi_cmd = ZINJECT_IGNORED_WRITES;11851186if (cancel != NULL) {1187/*1188* '-c' is invalid with any other options.1189*/1190if (raw != NULL || range != NULL || type != TYPE_INVAL ||1191level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||1192record.zi_freq > 0 || dvas != 0 || ready_delay >= 0) {1193(void) fprintf(stderr, "cancel (-c) incompatible with "1194"any other options\n");1195usage();1196libzfs_fini(g_zfs);1197return (2);1198}1199if (argc != 0) {1200(void) fprintf(stderr, "extraneous argument to '-c'\n");1201usage();1202libzfs_fini(g_zfs);1203return (2);1204}12051206if (strcmp(cancel, "all") == 0) {1207return (cancel_all_handlers());1208} else {1209int id = (int)strtol(cancel, &end, 10);1210if (*end != '\0') {1211(void) fprintf(stderr, "invalid handle id '%s':"1212" must be an integer or 'all'\n", cancel);1213usage();1214libzfs_fini(g_zfs);1215return (1);1216}1217return (cancel_handler(id));1218}1219}12201221if (device != NULL) {1222/*1223* Device (-d) injection uses a completely different mechanism1224* for doing injection, so handle it separately here.1225*/1226if (raw != NULL || range != NULL || type != TYPE_INVAL ||1227level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||1228dvas != 0 || ready_delay >= 0) {1229(void) fprintf(stderr, "device (-d) incompatible with "1230"data error injection\n");1231usage();1232libzfs_fini(g_zfs);1233return (2);1234}12351236if (argc != 1) {1237(void) fprintf(stderr, "device (-d) injection requires "1238"a single pool name\n");1239usage();1240libzfs_fini(g_zfs);1241return (2);1242}12431244(void) strlcpy(pool, argv[0], sizeof (pool));1245dataset[0] = '\0';12461247if (error == ECKSUM) {1248(void) fprintf(stderr, "device error type must be "1249"'io', 'nxio' or 'corrupt'\n");1250libzfs_fini(g_zfs);1251return (1);1252}12531254if (error == EILSEQ &&1255(record.zi_freq == 0 || io_type != ZINJECT_IOTYPE_READ)) {1256(void) fprintf(stderr, "device corrupt errors require "1257"io type read and a frequency value\n");1258libzfs_fini(g_zfs);1259return (1);1260}12611262record.zi_iotype = io_type;1263if (translate_device(pool, device, label, &record) != 0) {1264libzfs_fini(g_zfs);1265return (1);1266}12671268if (record.zi_nlanes) {1269switch (io_type) {1270case ZINJECT_IOTYPE_READ:1271case ZINJECT_IOTYPE_WRITE:1272case ZINJECT_IOTYPE_ALL:1273break;1274default:1275(void) fprintf(stderr, "I/O type for a delay "1276"must be 'read' or 'write'\n");1277usage();1278libzfs_fini(g_zfs);1279return (1);1280}1281}12821283if (!error)1284error = ENXIO;12851286if (action != VDEV_STATE_UNKNOWN)1287return (perform_action(pool, &record, action));12881289} else if (raw != NULL) {1290if (range != NULL || type != TYPE_INVAL || level != 0 ||1291record.zi_cmd != ZINJECT_UNINITIALIZED ||1292record.zi_freq > 0 || dvas != 0) {1293(void) fprintf(stderr, "raw (-b) format with "1294"any other options\n");1295usage();1296libzfs_fini(g_zfs);1297return (2);1298}12991300if (argc != 1) {1301(void) fprintf(stderr, "raw (-b) format expects a "1302"single pool name\n");1303usage();1304libzfs_fini(g_zfs);1305return (2);1306}13071308(void) strlcpy(pool, argv[0], sizeof (pool));1309dataset[0] = '\0';13101311if (error == ENXIO) {1312(void) fprintf(stderr, "data error type must be "1313"'checksum' or 'io'\n");1314libzfs_fini(g_zfs);1315return (1);1316}13171318if (record.zi_cmd == ZINJECT_UNINITIALIZED) {1319record.zi_cmd = ZINJECT_DATA_FAULT;1320if (!error)1321error = EIO;1322} else if (error != 0) {1323(void) fprintf(stderr, "error type -e incompatible "1324"with delay injection\n");1325libzfs_fini(g_zfs);1326return (1);1327} else {1328record.zi_iotype = io_type;1329}13301331if (translate_raw(raw, &record) != 0) {1332libzfs_fini(g_zfs);1333return (1);1334}1335} else if (record.zi_cmd == ZINJECT_PANIC) {1336if (raw != NULL || range != NULL || type != TYPE_INVAL ||1337level != 0 || device != NULL || record.zi_freq > 0 ||1338dvas != 0) {1339(void) fprintf(stderr, "%s incompatible with other "1340"options\n", "import|export delay (-P)");1341usage();1342libzfs_fini(g_zfs);1343return (2);1344}13451346if (argc < 1 || argc > 2) {1347(void) fprintf(stderr, "panic (-p) injection requires "1348"a single pool name and an optional id\n");1349usage();1350libzfs_fini(g_zfs);1351return (2);1352}13531354(void) strlcpy(pool, argv[0], sizeof (pool));1355if (argv[1] != NULL)1356record.zi_type = atoi(argv[1]);1357dataset[0] = '\0';1358} else if (record.zi_cmd == ZINJECT_DELAY_IMPORT ||1359record.zi_cmd == ZINJECT_DELAY_EXPORT) {1360if (raw != NULL || range != NULL || type != TYPE_INVAL ||1361level != 0 || device != NULL || record.zi_freq > 0 ||1362dvas != 0) {1363(void) fprintf(stderr, "%s incompatible with other "1364"options\n", "import|export delay (-P)");1365usage();1366libzfs_fini(g_zfs);1367return (2);1368}13691370if (argc != 1 || record.zi_duration <= 0) {1371(void) fprintf(stderr, "import|export delay (-P) "1372"injection requires a duration (-s) and a single "1373"pool name\n");1374usage();1375libzfs_fini(g_zfs);1376return (2);1377}13781379(void) strlcpy(pool, argv[0], sizeof (pool));1380} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {1381if (raw != NULL || range != NULL || type != TYPE_INVAL ||1382level != 0 || record.zi_freq > 0 || dvas != 0) {1383(void) fprintf(stderr, "hardware failure (-I) "1384"incompatible with other options\n");1385usage();1386libzfs_fini(g_zfs);1387return (2);1388}13891390if (nowrites == 0) {1391(void) fprintf(stderr, "-s or -g meaningless "1392"without -I (ignore writes)\n");1393usage();1394libzfs_fini(g_zfs);1395return (2);1396} else if (dur_secs && dur_txg) {1397(void) fprintf(stderr, "choose a duration either "1398"in seconds (-s) or a number of txgs (-g) "1399"but not both\n");1400usage();1401libzfs_fini(g_zfs);1402return (2);1403} else if (argc != 1) {1404(void) fprintf(stderr, "ignore writes (-I) "1405"injection requires a single pool name\n");1406usage();1407libzfs_fini(g_zfs);1408return (2);1409}14101411(void) strlcpy(pool, argv[0], sizeof (pool));1412dataset[0] = '\0';1413} else if (type == TYPE_INVAL) {1414if (flags == 0) {1415(void) fprintf(stderr, "at least one of '-b', '-d', "1416"'-t', '-a', '-p', '-I' or '-u' "1417"must be specified\n");1418usage();1419libzfs_fini(g_zfs);1420return (2);1421}14221423if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {1424(void) strlcpy(pool, argv[0], sizeof (pool));1425dataset[0] = '\0';1426} else if (argc != 0) {1427(void) fprintf(stderr, "extraneous argument for "1428"'-f'\n");1429usage();1430libzfs_fini(g_zfs);1431return (2);1432}14331434flags |= ZINJECT_NULL;1435} else {1436if (argc != 1) {1437(void) fprintf(stderr, "missing object\n");1438usage();1439libzfs_fini(g_zfs);1440return (2);1441}14421443if (error == ENXIO || error == EILSEQ) {1444(void) fprintf(stderr, "data error type must be "1445"'checksum' or 'io'\n");1446libzfs_fini(g_zfs);1447return (1);1448}14491450if (dvas != 0) {1451if (error == EACCES || error == EINVAL) {1452(void) fprintf(stderr, "the '-C' option may "1453"not be used with logical data errors "1454"'decrypt' and 'decompress'\n");1455libzfs_fini(g_zfs);1456return (1);1457}14581459record.zi_dvas = dvas;1460}14611462if (record.zi_cmd != ZINJECT_UNINITIALIZED && error != 0) {1463(void) fprintf(stderr, "error type -e incompatible "1464"with delay injection\n");1465libzfs_fini(g_zfs);1466return (1);1467}14681469if (error == EACCES) {1470if (type != TYPE_DATA) {1471(void) fprintf(stderr, "decryption errors "1472"may only be injected for 'data' types\n");1473libzfs_fini(g_zfs);1474return (1);1475}14761477record.zi_cmd = ZINJECT_DECRYPT_FAULT;1478/*1479* Internally, ZFS actually uses ECKSUM for decryption1480* errors since EACCES is used to indicate the key was1481* not found.1482*/1483error = ECKSUM;1484} else if (record.zi_cmd == ZINJECT_UNINITIALIZED) {1485record.zi_cmd = ZINJECT_DATA_FAULT;1486if (!error)1487error = EIO;1488} else {1489record.zi_iotype = io_type;1490}14911492if (translate_record(type, argv[0], range, level, &record, pool,1493dataset) != 0) {1494libzfs_fini(g_zfs);1495return (1);1496}1497}14981499/*1500* If this is pool-wide metadata, unmount everything. The ioctl() will1501* unload the pool, so that we trigger spa-wide reopen of metadata next1502* time we access the pool.1503*/1504if (dataset[0] != '\0' && domount) {1505if ((zhp = zfs_open(g_zfs, dataset,1506ZFS_TYPE_DATASET)) == NULL) {1507libzfs_fini(g_zfs);1508return (1);1509}1510if (zfs_unmount(zhp, NULL, 0) != 0) {1511libzfs_fini(g_zfs);1512return (1);1513}1514}15151516record.zi_error = error;15171518ret = register_handler(pool, flags, &record, quiet);15191520if (dataset[0] != '\0' && domount)1521ret = (zfs_mount(zhp, NULL, 0) != 0);15221523libzfs_fini(g_zfs);15241525return (ret);1526}152715281529