Path: blob/main/sys/contrib/openzfs/tests/zfs-tests/cmd/draid.c
48529 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright (c) 2018 Intel Corporation.23* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.24*/2526#include <stdio.h>27#include <zlib.h>28#include <zfs_fletcher.h>29#include <sys/vdev_draid.h>30#include <sys/nvpair.h>31#include <sys/stat.h>3233/*34* The number of rows to generate for new permutation maps.35*/36#define MAP_ROWS_DEFAULT 2563738/*39* Key values for dRAID maps when stored as nvlists.40*/41#define MAP_SEED "seed"42#define MAP_CHECKSUM "checksum"43#define MAP_WORST_RATIO "worst_ratio"44#define MAP_AVG_RATIO "avg_ratio"45#define MAP_CHILDREN "children"46#define MAP_NPERMS "nperms"47#define MAP_PERMS "perms"4849static void50draid_usage(void)51{52(void) fprintf(stderr,53"usage: draid command args ...\n"54"Available commands are:\n"55"\n"56"\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"57"\tdraid verify [-rv] FILE\n"58"\tdraid dump [-v] [-m min] [-n max] FILE\n"59"\tdraid table FILE\n"60"\tdraid merge FILE SRC SRC...\n");61exit(1);62}6364static int65read_map(const char *filename, nvlist_t **allcfgs)66{67int block_size = 131072;68int buf_size = 131072;69int tmp_size, error;70char *tmp_buf;7172struct stat64 stat;73if (lstat64(filename, &stat) != 0)74return (errno);7576if (stat.st_size == 0 ||77!(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {78return (EINVAL);79}8081gzFile fp = gzopen(filename, "rb");82if (fp == Z_NULL)83return (errno);8485char *buf = malloc(buf_size);86if (buf == NULL) {87(void) gzclose(fp);88return (ENOMEM);89}9091ssize_t rc, bytes = 0;92while (!gzeof(fp)) {93rc = gzread(fp, buf + bytes, block_size);94if ((rc < 0) || (rc == 0 && !gzeof(fp))) {95free(buf);96(void) gzerror(fp, &error);97(void) gzclose(fp);98return (error);99} else {100bytes += rc;101102if (bytes + block_size >= buf_size) {103tmp_size = 2 * buf_size;104tmp_buf = malloc(tmp_size);105if (tmp_buf == NULL) {106free(buf);107(void) gzclose(fp);108return (ENOMEM);109}110111memcpy(tmp_buf, buf, bytes);112free(buf);113buf = tmp_buf;114buf_size = tmp_size;115}116}117}118119(void) gzclose(fp);120121error = nvlist_unpack(buf, bytes, allcfgs, 0);122free(buf);123124return (error);125}126127/*128* Read a map from the specified filename. A file contains multiple maps129* which are indexed by the number of children. The caller is responsible130* for freeing the configuration returned.131*/132static int133read_map_key(const char *filename, const char *key, nvlist_t **cfg)134{135nvlist_t *allcfgs, *foundcfg = NULL;136int error;137138error = read_map(filename, &allcfgs);139if (error != 0)140return (error);141142(void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg);143if (foundcfg != NULL) {144nvlist_dup(foundcfg, cfg, KM_SLEEP);145error = 0;146} else {147error = ENOENT;148}149150nvlist_free(allcfgs);151152return (error);153}154155/*156* Write all mappings to the map file.157*/158static int159write_map(const char *filename, nvlist_t *allcfgs)160{161size_t buflen = 0;162int error;163164error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);165if (error)166return (error);167168char *buf = malloc(buflen);169if (buf == NULL)170return (ENOMEM);171172error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);173if (error) {174free(buf);175return (error);176}177178/*179* Atomically update the file using a temporary file and the180* traditional unlink then rename steps. This code provides181* no locking, it only guarantees the packed nvlist on disk182* is updated atomically and is internally consistent.183*/184char *tmpname = calloc(1, MAXPATHLEN);185if (tmpname == NULL) {186free(buf);187return (ENOMEM);188}189190snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);191192int fd = mkstemp(tmpname);193if (fd < 0) {194error = errno;195free(buf);196free(tmpname);197return (error);198}199(void) close(fd);200201gzFile fp = gzopen(tmpname, "w9b");202if (fp == Z_NULL) {203error = errno;204free(buf);205free(tmpname);206return (error);207}208209ssize_t rc, bytes = 0;210while (bytes < buflen) {211size_t size = MIN(buflen - bytes, 131072);212rc = gzwrite(fp, buf + bytes, size);213if (rc < 0) {214free(buf);215(void) gzerror(fp, &error);216(void) gzclose(fp);217(void) unlink(tmpname);218free(tmpname);219return (error);220} else if (rc == 0) {221break;222} else {223bytes += rc;224}225}226227free(buf);228(void) gzclose(fp);229230if (bytes != buflen) {231(void) unlink(tmpname);232free(tmpname);233return (EIO);234}235236/*237* Unlink the previous config file and replace it with the updated238* version. If we're able to unlink the file then directory is239* writable by us and the subsequent rename should never fail.240*/241error = unlink(filename);242if (error != 0 && errno != ENOENT) {243error = errno;244(void) unlink(tmpname);245free(tmpname);246return (error);247}248249error = rename(tmpname, filename);250if (error != 0) {251error = errno;252(void) unlink(tmpname);253free(tmpname);254return (error);255}256257free(tmpname);258259return (0);260}261262/*263* Add the dRAID map to the file and write it out.264*/265static int266write_map_key(const char *filename, char *key, draid_map_t *map,267double worst_ratio, double avg_ratio)268{269nvlist_t *nv_cfg, *allcfgs;270int error;271272/*273* Add the configuration to an existing or new file. The new274* configuration will replace an existing configuration with the275* same key if it has a lower ratio and is therefore better.276*/277error = read_map(filename, &allcfgs);278if (error == ENOENT) {279allcfgs = fnvlist_alloc();280} else if (error != 0) {281return (error);282}283284error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);285if (error == 0) {286uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,287MAP_WORST_RATIO);288double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;289290if (worst_ratio < nv_worst_ratio) {291/* Replace old map with the more balanced new map. */292fnvlist_remove(allcfgs, key);293} else {294/* The old map is preferable, keep it. */295nvlist_free(allcfgs);296return (EEXIST);297}298}299300nvlist_t *cfg = fnvlist_alloc();301fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);302fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);303fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);304fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);305fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,306map->dm_children * map->dm_nperms * sizeof (uint8_t));307308fnvlist_add_uint64(cfg, MAP_WORST_RATIO,309(uint64_t)(worst_ratio * 1000.0));310fnvlist_add_uint64(cfg, MAP_AVG_RATIO,311(uint64_t)(avg_ratio * 1000.0));312313error = nvlist_add_nvlist(allcfgs, key, cfg);314if (error == 0)315error = write_map(filename, allcfgs);316317nvlist_free(cfg);318nvlist_free(allcfgs);319return (error);320}321322static void323dump_map(draid_map_t *map, const char *key, double worst_ratio,324double avg_ratio, int verbose)325{326if (verbose == 0) {327return;328} else if (verbose == 1) {329printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "330"avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,331worst_ratio, avg_ratio);332return;333} else {334printf(" \"%s\":\n"335" seed: 0x%016llx\n"336" checksum: 0x%016llx\n"337" worst_ratio: %2.03f\n"338" avg_ratio: %2.03f\n"339" children: %llu\n"340" nperms: %llu\n",341key, (u_longlong_t)map->dm_seed,342(u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,343(u_longlong_t)map->dm_children,344(u_longlong_t)map->dm_nperms);345346if (verbose > 2) {347printf(" perms = {\n");348for (int i = 0; i < map->dm_nperms; i++) {349printf(" { ");350for (int j = 0; j < map->dm_children; j++) {351printf("%3d%s ", map->dm_perms[352i * map->dm_children + j],353j < map->dm_children - 1 ?354"," : "");355}356printf(" },\n");357}358printf(" }\n");359} else if (verbose == 2) {360printf(" draid_perms = <omitted>\n");361}362}363}364365static void366dump_map_nv(const char *key, nvlist_t *cfg, int verbose)367{368draid_map_t map;369uint_t c;370371uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);372uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);373374map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);375map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);376map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);377map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);378map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c);379380dump_map(&map, key, (double)worst_ratio / 1000.0,381avg_ratio / 1000.0, verbose);382}383384/*385* Print a summary of the mapping.386*/387static int388dump_map_key(const char *filename, const char *key, int verbose)389{390nvlist_t *cfg;391int error;392393error = read_map_key(filename, key, &cfg);394if (error != 0)395return (error);396397dump_map_nv(key, cfg, verbose);398399return (0);400}401402/*403* Allocate a new permutation map for evaluation.404*/405static int406alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,407draid_map_t **mapp)408{409draid_map_t *map;410int error;411412map = malloc(sizeof (draid_map_t));413if (map == NULL)414return (ENOMEM);415416map->dm_children = children;417map->dm_nperms = nperms;418map->dm_seed = seed;419map->dm_checksum = 0;420421error = vdev_draid_generate_perms(map, &map->dm_perms);422if (error) {423free(map);424return (error);425}426427*mapp = map;428429return (0);430}431432/*433* Allocate the fixed permutation map for N children.434*/435static int436alloc_fixed_map(uint64_t children, draid_map_t **mapp)437{438const draid_map_t *fixed_map;439draid_map_t *map;440int error;441442error = vdev_draid_lookup_map(children, &fixed_map);443if (error)444return (error);445446map = malloc(sizeof (draid_map_t));447if (map == NULL)448return (ENOMEM);449450memcpy(map, fixed_map, sizeof (draid_map_t));451VERIFY3U(map->dm_checksum, !=, 0);452453error = vdev_draid_generate_perms(map, &map->dm_perms);454if (error) {455free(map);456return (error);457}458459*mapp = map;460461return (0);462}463464/*465* Free a permutation map.466*/467static void468free_map(draid_map_t *map)469{470free(map->dm_perms);471free(map);472}473474/*475* Check if dev is in the provided list of faulted devices.476*/477static inline boolean_t478is_faulted(int *faulted_devs, int nfaulted, int dev)479{480for (int i = 0; i < nfaulted; i++)481if (faulted_devs[i] == dev)482return (B_TRUE);483484return (B_FALSE);485}486487/*488* Evaluate how resilvering I/O will be distributed given a list of faulted489* vdevs. As a simplification we assume one IO is sufficient to repair each490* damaged device in a group.491*/492static double493eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,494int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)495{496uint64_t children = map->dm_children;497uint64_t ngroups = 1;498uint64_t ndisks = children - nspares;499500/*501* Calculate the minimum number of groups required to fill a slice.502*/503while (ngroups * (groupwidth) % (children - nspares) != 0)504ngroups++;505506int *ios = calloc(map->dm_children, sizeof (uint64_t));507508ASSERT3P(ios, !=, NULL);509510/* Resilver all rows */511for (int i = 0; i < map->dm_nperms; i++) {512uint8_t *row = &map->dm_perms[i * map->dm_children];513514/* Resilver all groups with faulted drives */515for (int j = 0; j < ngroups; j++) {516uint64_t spareidx = map->dm_children - nspares;517boolean_t repair_needed = B_FALSE;518519/* See if any devices in this group are faulted */520uint64_t groupstart = (j * groupwidth) % ndisks;521522for (int k = 0; k < groupwidth; k++) {523uint64_t groupidx = (groupstart + k) % ndisks;524525repair_needed = is_faulted(faulted_devs,526nfaulted, row[groupidx]);527if (repair_needed)528break;529}530531if (repair_needed == B_FALSE)532continue;533534/*535* This group is degraded. Calculate the number of536* reads the non-faulted drives require and the number537* of writes to the distributed hot spare for this row.538*/539for (int k = 0; k < groupwidth; k++) {540uint64_t groupidx = (groupstart + k) % ndisks;541542if (!is_faulted(faulted_devs, nfaulted,543row[groupidx])) {544ios[row[groupidx]]++;545} else if (nspares > 0) {546while (is_faulted(faulted_devs,547nfaulted, row[spareidx])) {548spareidx++;549}550551ASSERT3U(spareidx, <, map->dm_children);552ios[row[spareidx]]++;553spareidx++;554}555}556}557}558559*min_child_ios = INT_MAX;560*max_child_ios = 0;561562/*563* Find the drives with fewest and most required I/O. These values564* are used to calculate the imbalance ratio. To avoid returning an565* infinite value for permutations which have children that perform566* no IO a floor of 1 IO per child is set. This ensures a meaningful567* ratio is returned for comparison and it is not an uncommon when568* there are a large number of children.569*/570for (int i = 0; i < map->dm_children; i++) {571572if (is_faulted(faulted_devs, nfaulted, i)) {573ASSERT0(ios[i]);574continue;575}576577if (ios[i] == 0)578ios[i] = 1;579580if (ios[i] < *min_child_ios)581*min_child_ios = ios[i];582583if (ios[i] > *max_child_ios)584*max_child_ios = ios[i];585}586587ASSERT3S(*min_child_ios, !=, INT_MAX);588ASSERT3S(*max_child_ios, !=, 0);589590double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);591592free(ios);593594return (ratio);595}596597/*598* Evaluate the quality of the permutation mapping by considering possible599* device failures. Returns the imbalance ratio for the worst mapping which600* is defined to be the largest number of child IOs over the fewest number601* child IOs. A value of 1.0 indicates the mapping is perfectly balance and602* all children perform an equal amount of work during reconstruction.603*/604static void605eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)606{607uint64_t children = map->dm_children;608double worst_ratio = 1.0;609double sum = 0;610int worst_min_ios = 0, worst_max_ios = 0;611int n = 0;612613/*614* When there are only 2 children there can be no distributed615* spare and no resilver to evaluate. Default to a ratio of 1.0616* for this degenerate case.617*/618if (children == VDEV_DRAID_MIN_CHILDREN) {619*worst_ratiop = 1.0;620*avg_ratiop = 1.0;621return;622}623624/*625* Score the mapping as if it had either 1 or 2 distributed spares.626*/627for (int nspares = 1; nspares <= 2; nspares++) {628uint64_t faults = nspares;629630/*631* Score groupwidths up to 19. This value was chosen as the632* largest reasonable width (16d+3p). dRAID pools may be still633* be created with wider stripes but they are not considered in634* this analysis in order to optimize for the most common cases.635*/636for (uint64_t groupwidth = 2;637groupwidth <= MIN(children - nspares, 19);638groupwidth++) {639int faulted_devs[2];640int min_ios, max_ios;641642/*643* Score possible devices faults. This is limited644* to exactly one fault per distributed spare for645* the purposes of this similation.646*/647for (int f1 = 0; f1 < children; f1++) {648faulted_devs[0] = f1;649double ratio;650651if (faults == 1) {652ratio = eval_resilver(map, groupwidth,653nspares, faulted_devs, faults,654&min_ios, &max_ios);655656if (ratio > worst_ratio) {657worst_ratio = ratio;658worst_min_ios = min_ios;659worst_max_ios = max_ios;660}661662sum += ratio;663n++;664} else if (faults == 2) {665for (int f2 = f1 + 1; f2 < children;666f2++) {667faulted_devs[1] = f2;668669ratio = eval_resilver(map,670groupwidth, nspares,671faulted_devs, faults,672&min_ios, &max_ios);673674if (ratio > worst_ratio) {675worst_ratio = ratio;676worst_min_ios = min_ios;677worst_max_ios = max_ios;678}679680sum += ratio;681n++;682}683}684}685}686}687688*worst_ratiop = worst_ratio;689*avg_ratiop = sum / n;690691/*692* Log the min/max io values for particularly unbalanced maps.693* Since the maps are generated entirely randomly these are possible694* be exceedingly unlikely. We log it for possible investigation.695*/696if (worst_ratio > 100.0) {697dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);698printf("worst_min_ios=%d worst_max_ios=%d\n",699worst_min_ios, worst_max_ios);700}701}702703static int704eval_maps(uint64_t children, int passes, uint64_t *map_seed,705draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)706{707draid_map_t *best_map = NULL;708double best_worst_ratio = 1000.0;709double best_avg_ratio = 1000.0;710711/*712* Perform the requested number of passes evaluating randomly713* generated permutation maps. Only the best version is kept.714*/715for (int i = 0; i < passes; i++) {716double worst_ratio, avg_ratio;717draid_map_t *map;718int error;719720/*721* Calculate the next seed and generate a new candidate map.722*/723error = alloc_new_map(children, MAP_ROWS_DEFAULT,724vdev_draid_rand(map_seed), &map);725if (error) {726if (best_map != NULL)727free_map(best_map);728return (error);729}730731/*732* Consider maps with a lower worst_ratio to be of higher733* quality. Some maps may have a lower avg_ratio but they734* are discarded since they might include some particularly735* imbalanced permutations. The average is tracked to in736* order to get a sense of the average permutation quality.737*/738eval_decluster(map, &worst_ratio, &avg_ratio);739740if (best_map == NULL || worst_ratio < best_worst_ratio) {741742if (best_map != NULL)743free_map(best_map);744745best_map = map;746best_worst_ratio = worst_ratio;747best_avg_ratio = avg_ratio;748} else {749free_map(map);750}751}752753/*754* After determining the best map generate a checksum over the full755* permutation array. This checksum is verified when opening a dRAID756* pool to ensure the generated in memory permutations are correct.757*/758zio_cksum_t cksum;759fletcher_4_native_varsize(best_map->dm_perms,760sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,761&cksum);762best_map->dm_checksum = cksum.zc_word[0];763764*best_mapp = best_map;765*best_ratiop = best_worst_ratio;766*avg_ratiop = best_avg_ratio;767768return (0);769}770771static int772draid_generate(int argc, char *argv[])773{774char filename[MAXPATHLEN] = {0};775uint64_t map_seed[2];776int c, fd, error, verbose = 0, passes = 1, continuous = 0;777int min_children = VDEV_DRAID_MIN_CHILDREN;778int max_children = VDEV_DRAID_MAX_CHILDREN;779int restarts = 0;780781while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {782switch (c) {783case 'c':784continuous++;785break;786case 'm':787min_children = (int)strtol(optarg, NULL, 0);788if (min_children < VDEV_DRAID_MIN_CHILDREN) {789(void) fprintf(stderr, "A minimum of 2 "790"children are required.\n");791return (1);792}793794break;795case 'n':796max_children = (int)strtol(optarg, NULL, 0);797if (max_children > VDEV_DRAID_MAX_CHILDREN) {798(void) fprintf(stderr, "A maximum of %d "799"children are allowed.\n",800VDEV_DRAID_MAX_CHILDREN);801return (1);802}803break;804case 'p':805passes = (int)strtol(optarg, NULL, 0);806break;807case 'v':808/*809* 0 - Only log when a better map is added to the file.810* 1 - Log the current best map for each child count.811* Minimal output on a single summary line.812* 2 - Log the current best map for each child count.813* More verbose includes most map fields.814* 3 - Log the current best map for each child count.815* Very verbose all fields including the full map.816*/817verbose++;818break;819case ':':820(void) fprintf(stderr,821"missing argument for '%c' option\n", optopt);822draid_usage();823break;824case '?':825(void) fprintf(stderr, "invalid option '%c'\n",826optopt);827draid_usage();828break;829}830}831832if (argc > optind)833strlcpy(filename, argv[optind], sizeof (filename));834else {835(void) fprintf(stderr, "A FILE must be specified.\n");836return (1);837}838839restart:840/*841* Start with a fresh seed from /dev/urandom.842*/843fd = open("/dev/urandom", O_RDONLY);844if (fd < 0) {845printf("Unable to open /dev/urandom: %s\n:", strerror(errno));846return (1);847} else {848ssize_t bytes = sizeof (map_seed);849ssize_t bytes_read = 0;850851while (bytes_read < bytes) {852ssize_t rc = read(fd, ((char *)map_seed) + bytes_read,853bytes - bytes_read);854if (rc < 0) {855printf("Unable to read /dev/urandom: %s\n:",856strerror(errno));857close(fd);858return (1);859}860bytes_read += rc;861}862863(void) close(fd);864}865866if (restarts == 0)867printf("Writing generated mappings to '%s':\n", filename);868869/*870* Generate maps for all requested child counts. The best map for871* each child count is written out to the specified file. If the file872* already contains a better mapping this map will not be added.873*/874for (uint64_t children = min_children;875children <= max_children; children++) {876char key[8] = { 0 };877draid_map_t *map;878double worst_ratio = 1000.0;879double avg_ratio = 1000.0;880881error = eval_maps(children, passes, map_seed, &map,882&worst_ratio, &avg_ratio);883if (error) {884printf("Error eval_maps(): %s\n", strerror(error));885return (1);886}887888if (worst_ratio < 1.0 || avg_ratio < 1.0) {889printf("Error ratio < 1.0: worst_ratio = %2.03f "890"avg_ratio = %2.03f\n", worst_ratio, avg_ratio);891return (1);892}893894snprintf(key, 7, "%llu", (u_longlong_t)children);895error = write_map_key(filename, key, map, worst_ratio,896avg_ratio);897if (error == 0) {898/* The new map was added to the file. */899dump_map(map, key, worst_ratio, avg_ratio,900MAX(verbose, 1));901} else if (error == EEXIST) {902/* The existing map was preferable and kept. */903if (verbose > 0)904dump_map_key(filename, key, verbose);905} else {906printf("Error write_map_key(): %s\n", strerror(error));907return (1);908}909910free_map(map);911}912913/*914* When the continuous option is set restart at the minimum number of915* children instead of exiting. This option is useful as a mechanism916* to continuous try and refine the discovered permutations.917*/918if (continuous) {919restarts++;920printf("Restarting by request (-c): %d\n", restarts);921goto restart;922}923924return (0);925}926927/*928* Verify each map in the file by generating its in-memory permutation array929* and comfirming its checksum is correct.930*/931static int932draid_verify(int argc, char *argv[])933{934char filename[MAXPATHLEN] = {0};935int n = 0, c, error, verbose = 1;936int check_ratios = 0;937938while ((c = getopt(argc, argv, ":rv")) != -1) {939switch (c) {940case 'r':941check_ratios++;942break;943case 'v':944verbose++;945break;946case ':':947(void) fprintf(stderr,948"missing argument for '%c' option\n", optopt);949draid_usage();950break;951case '?':952(void) fprintf(stderr, "invalid option '%c'\n",953optopt);954draid_usage();955break;956}957}958959if (argc > optind) {960char *abspath = malloc(MAXPATHLEN);961if (abspath == NULL)962return (ENOMEM);963964if (realpath(argv[optind], abspath) != NULL)965strlcpy(filename, abspath, sizeof (filename));966else967strlcpy(filename, argv[optind], sizeof (filename));968969free(abspath);970} else {971(void) fprintf(stderr, "A FILE must be specified.\n");972return (1);973}974975printf("Verifying permutation maps: '%s'\n", filename);976977/*978* Lookup hardcoded permutation map for each valid number of children979* and verify a generated map has the correct checksum. Then compare980* the generated map values with the nvlist map values read from the981* reference file to cross-check the permutation.982*/983for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;984children <= VDEV_DRAID_MAX_CHILDREN;985children++) {986draid_map_t *map;987char key[8] = {0};988989snprintf(key, 8, "%llu", (u_longlong_t)children);990991error = alloc_fixed_map(children, &map);992if (error) {993printf("Error alloc_fixed_map() failed: %s\n",994error == ECKSUM ? "Invalid checksum" :995strerror(error));996return (1);997}998999uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;1000uint8_t *nv_perms;1001nvlist_t *cfg;1002uint_t c;10031004error = read_map_key(filename, key, &cfg);1005if (error != 0) {1006printf("Error read_map_key() failed: %s\n",1007strerror(error));1008free_map(map);1009return (1);1010}10111012nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);1013nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);1014nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);1015nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);1016nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);10171018/*1019* Compare draid_map_t and nvlist reference values.1020*/1021if (map->dm_seed != nv_seed) {1022printf("Error different seeds: 0x%016llx != "1023"0x%016llx\n", (u_longlong_t)map->dm_seed,1024(u_longlong_t)nv_seed);1025error = EINVAL;1026}10271028if (map->dm_checksum != nv_checksum) {1029printf("Error different checksums: 0x%016llx "1030"!= 0x%016llx\n",1031(u_longlong_t)map->dm_checksum,1032(u_longlong_t)nv_checksum);1033error = EINVAL;1034}10351036if (map->dm_children != nv_children) {1037printf("Error different children: %llu "1038"!= %llu\n", (u_longlong_t)map->dm_children,1039(u_longlong_t)nv_children);1040error = EINVAL;1041}10421043if (map->dm_nperms != nv_nperms) {1044printf("Error different nperms: %llu "1045"!= %llu\n", (u_longlong_t)map->dm_nperms,1046(u_longlong_t)nv_nperms);1047error = EINVAL;1048}10491050for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {1051if (map->dm_perms[i] != nv_perms[i]) {1052printf("Error different perms[%llu]: "1053"%d != %d\n", (u_longlong_t)i,1054(int)map->dm_perms[i],1055(int)nv_perms[i]);1056error = EINVAL;1057break;1058}1059}10601061/*1062* For good measure recalculate the worst and average1063* ratios and confirm they match the nvlist values.1064*/1065if (check_ratios) {1066uint64_t nv_worst_ratio, nv_avg_ratio;1067double worst_ratio, avg_ratio;10681069eval_decluster(map, &worst_ratio, &avg_ratio);10701071nv_worst_ratio = fnvlist_lookup_uint64(cfg,1072MAP_WORST_RATIO);1073nv_avg_ratio = fnvlist_lookup_uint64(cfg,1074MAP_AVG_RATIO);10751076if (worst_ratio < 1.0 || avg_ratio < 1.0) {1077printf("Error ratio out of range %2.03f, "1078"%2.03f\n", worst_ratio, avg_ratio);1079error = EINVAL;1080}10811082if ((uint64_t)(worst_ratio * 1000.0) !=1083nv_worst_ratio) {1084printf("Error different worst_ratio %2.03f "1085"!= %2.03f\n", (double)nv_worst_ratio /10861000.0, worst_ratio);1087error = EINVAL;1088}10891090if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {1091printf("Error different average_ratio %2.03f "1092"!= %2.03f\n", (double)nv_avg_ratio /10931000.0, avg_ratio);1094error = EINVAL;1095}1096}10971098if (error) {1099free_map(map);1100nvlist_free(cfg);1101return (1);1102}11031104if (verbose > 0) {1105printf("- %llu children: good\n",1106(u_longlong_t)children);1107}1108n++;11091110free_map(map);1111nvlist_free(cfg);1112}11131114if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {1115printf("Error permutation maps missing: %d / %d checked\n",1116n, VDEV_DRAID_MAX_CHILDREN - 1);1117return (1);1118}11191120printf("Successfully verified %d / %d permutation maps\n",1121n, VDEV_DRAID_MAX_CHILDREN - 1);11221123return (0);1124}11251126/*1127* Dump the contents of the specified mapping(s) for inspection.1128*/1129static int1130draid_dump(int argc, char *argv[])1131{1132char filename[MAXPATHLEN] = {0};1133int c, error, verbose = 1;1134int min_children = VDEV_DRAID_MIN_CHILDREN;1135int max_children = VDEV_DRAID_MAX_CHILDREN;11361137while ((c = getopt(argc, argv, ":vm:n:")) != -1) {1138switch (c) {1139case 'm':1140min_children = (int)strtol(optarg, NULL, 0);1141if (min_children < 2) {1142(void) fprintf(stderr, "A minimum of 2 "1143"children are required.\n");1144return (1);1145}11461147break;1148case 'n':1149max_children = (int)strtol(optarg, NULL, 0);1150if (max_children > VDEV_DRAID_MAX_CHILDREN) {1151(void) fprintf(stderr, "A maximum of %d "1152"children are allowed.\n",1153VDEV_DRAID_MAX_CHILDREN);1154return (1);1155}1156break;1157case 'v':1158verbose++;1159break;1160case ':':1161(void) fprintf(stderr,1162"missing argument for '%c' option\n", optopt);1163draid_usage();1164break;1165case '?':1166(void) fprintf(stderr, "invalid option '%c'\n",1167optopt);1168draid_usage();1169break;1170}1171}11721173if (argc > optind)1174strlcpy(filename, argv[optind], sizeof (filename));1175else {1176(void) fprintf(stderr, "A FILE must be specified.\n");1177return (1);1178}11791180/*1181* Dump maps for the requested child counts.1182*/1183for (uint64_t children = min_children;1184children <= max_children; children++) {1185char key[8] = { 0 };11861187snprintf(key, 7, "%llu", (u_longlong_t)children);1188error = dump_map_key(filename, key, verbose);1189if (error) {1190printf("Error dump_map_key(): %s\n", strerror(error));1191return (1);1192}1193}11941195return (0);1196}11971198/*1199* Print all of the mappings as a C formatted draid_map_t array. This table1200* is found in the module/zcommon/zfs_draid.c file and is the definitive1201* source for all mapping used by dRAID. It cannot be updated without1202* changing the dRAID on disk format.1203*/1204static int1205draid_table(int argc, char *argv[])1206{1207char filename[MAXPATHLEN] = {0};1208int error;12091210if (argc > optind)1211strlcpy(filename, argv[optind], sizeof (filename));1212else {1213(void) fprintf(stderr, "A FILE must be specified.\n");1214return (1);1215}12161217printf("static const draid_map_t "1218"draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");12191220for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;1221children <= VDEV_DRAID_MAX_CHILDREN;1222children++) {1223uint64_t seed, checksum, nperms, avg_ratio;1224nvlist_t *cfg;1225char key[8] = {0};12261227snprintf(key, 8, "%llu", (u_longlong_t)children);12281229error = read_map_key(filename, key, &cfg);1230if (error != 0) {1231printf("Error read_map_key() failed: %s\n",1232strerror(error));1233return (1);1234}12351236seed = fnvlist_lookup_uint64(cfg, MAP_SEED);1237checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);1238children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);1239nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);1240avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);12411242printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"1243"/* %2.03f */\n", (u_longlong_t)children,1244(u_longlong_t)nperms, (u_longlong_t)seed,1245(u_longlong_t)checksum, (double)avg_ratio / 1000.0);12461247nvlist_free(cfg);1248}12491250printf("};\n");12511252return (0);1253}12541255static int1256draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)1257{1258nvlist_t *srccfgs;1259nvpair_t *elem = NULL;1260int error, merged = 0;12611262error = read_map(srcfilename, &srccfgs);1263if (error != 0)1264return (error);12651266while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {1267uint64_t nv_worst_ratio;1268uint64_t allcfg_worst_ratio;1269nvlist_t *cfg, *allcfg;1270const char *key;12711272switch (nvpair_type(elem)) {1273case DATA_TYPE_NVLIST:12741275(void) nvpair_value_nvlist(elem, &cfg);1276key = nvpair_name(elem);12771278nv_worst_ratio = fnvlist_lookup_uint64(cfg,1279MAP_WORST_RATIO);12801281error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);1282if (error == 0) {1283allcfg_worst_ratio = fnvlist_lookup_uint64(1284allcfg, MAP_WORST_RATIO);12851286if (nv_worst_ratio < allcfg_worst_ratio) {1287fnvlist_remove(allcfgs, key);1288fnvlist_add_nvlist(allcfgs, key, cfg);1289merged++;1290}1291} else if (error == ENOENT) {1292fnvlist_add_nvlist(allcfgs, key, cfg);1293merged++;1294} else {1295return (error);1296}12971298break;1299default:1300continue;1301}1302}13031304nvlist_free(srccfgs);13051306*mergedp = merged;13071308return (0);1309}13101311/*1312* Merge the best map for each child count found in the listed files into1313* a new file. This allows 'draid generate' to be run in parallel and for1314* the results maps to be combined.1315*/1316static int1317draid_merge(int argc, char *argv[])1318{1319char filename[MAXPATHLEN] = {0};1320int c, error, total_merged = 0;1321nvlist_t *allcfgs;13221323while ((c = getopt(argc, argv, ":")) != -1) {1324switch (c) {1325case ':':1326(void) fprintf(stderr,1327"missing argument for '%c' option\n", optopt);1328draid_usage();1329break;1330case '?':1331(void) fprintf(stderr, "invalid option '%c'\n",1332optopt);1333draid_usage();1334break;1335}1336}13371338if (argc < 4) {1339(void) fprintf(stderr,1340"A FILE and multiple SRCs must be specified.\n");1341return (1);1342}13431344strlcpy(filename, argv[optind], sizeof (filename));1345optind++;13461347error = read_map(filename, &allcfgs);1348if (error == ENOENT) {1349allcfgs = fnvlist_alloc();1350} else if (error != 0) {1351printf("Error read_map(): %s\n", strerror(error));1352return (error);1353}13541355while (optind < argc) {1356char srcfilename[MAXPATHLEN] = {0};1357int merged = 0;13581359strlcpy(srcfilename, argv[optind], sizeof (srcfilename));13601361error = draid_merge_impl(allcfgs, srcfilename, &merged);1362if (error) {1363printf("Error draid_merge_impl(): %s\n",1364strerror(error));1365nvlist_free(allcfgs);1366return (1);1367}13681369total_merged += merged;1370printf("Merged %d key(s) from '%s' into '%s'\n", merged,1371srcfilename, filename);13721373optind++;1374}13751376if (total_merged > 0)1377write_map(filename, allcfgs);13781379printf("Merged a total of %d key(s) into '%s'\n", total_merged,1380filename);13811382nvlist_free(allcfgs);13831384return (0);1385}13861387int1388main(int argc, char *argv[])1389{1390if (argc < 2)1391draid_usage();13921393char *subcommand = argv[1];13941395if (strcmp(subcommand, "generate") == 0) {1396return (draid_generate(argc - 1, argv + 1));1397} else if (strcmp(subcommand, "verify") == 0) {1398return (draid_verify(argc - 1, argv + 1));1399} else if (strcmp(subcommand, "dump") == 0) {1400return (draid_dump(argc - 1, argv + 1));1401} else if (strcmp(subcommand, "table") == 0) {1402return (draid_table(argc - 1, argv + 1));1403} else if (strcmp(subcommand, "merge") == 0) {1404return (draid_merge(argc - 1, argv + 1));1405} else {1406draid_usage();1407}1408}140914101411