Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/tests/zfs-tests/cmd/draid.c
48529 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
/*
23
* Copyright (c) 2018 Intel Corporation.
24
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
25
*/
26
27
#include <stdio.h>
28
#include <zlib.h>
29
#include <zfs_fletcher.h>
30
#include <sys/vdev_draid.h>
31
#include <sys/nvpair.h>
32
#include <sys/stat.h>
33
34
/*
35
* The number of rows to generate for new permutation maps.
36
*/
37
#define MAP_ROWS_DEFAULT 256
38
39
/*
40
* Key values for dRAID maps when stored as nvlists.
41
*/
42
#define MAP_SEED "seed"
43
#define MAP_CHECKSUM "checksum"
44
#define MAP_WORST_RATIO "worst_ratio"
45
#define MAP_AVG_RATIO "avg_ratio"
46
#define MAP_CHILDREN "children"
47
#define MAP_NPERMS "nperms"
48
#define MAP_PERMS "perms"
49
50
/*
 * Print the command summary on stderr and terminate the process with
 * exit status 1.  This function does not return.
 */
static void
draid_usage(void)
{
	static const char usage_text[] =
	    "usage: draid command args ...\n"
	    "Available commands are:\n"
	    "\n"
	    "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
	    "\tdraid verify [-rv] FILE\n"
	    "\tdraid dump [-v] [-m min] [-n max] FILE\n"
	    "\tdraid table FILE\n"
	    "\tdraid merge FILE SRC SRC...\n";

	(void) fprintf(stderr, "%s", usage_text);
	exit(1);
}
64
65
static int
66
read_map(const char *filename, nvlist_t **allcfgs)
67
{
68
int block_size = 131072;
69
int buf_size = 131072;
70
int tmp_size, error;
71
char *tmp_buf;
72
73
struct stat64 stat;
74
if (lstat64(filename, &stat) != 0)
75
return (errno);
76
77
if (stat.st_size == 0 ||
78
!(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {
79
return (EINVAL);
80
}
81
82
gzFile fp = gzopen(filename, "rb");
83
if (fp == Z_NULL)
84
return (errno);
85
86
char *buf = malloc(buf_size);
87
if (buf == NULL) {
88
(void) gzclose(fp);
89
return (ENOMEM);
90
}
91
92
ssize_t rc, bytes = 0;
93
while (!gzeof(fp)) {
94
rc = gzread(fp, buf + bytes, block_size);
95
if ((rc < 0) || (rc == 0 && !gzeof(fp))) {
96
free(buf);
97
(void) gzerror(fp, &error);
98
(void) gzclose(fp);
99
return (error);
100
} else {
101
bytes += rc;
102
103
if (bytes + block_size >= buf_size) {
104
tmp_size = 2 * buf_size;
105
tmp_buf = malloc(tmp_size);
106
if (tmp_buf == NULL) {
107
free(buf);
108
(void) gzclose(fp);
109
return (ENOMEM);
110
}
111
112
memcpy(tmp_buf, buf, bytes);
113
free(buf);
114
buf = tmp_buf;
115
buf_size = tmp_size;
116
}
117
}
118
}
119
120
(void) gzclose(fp);
121
122
error = nvlist_unpack(buf, bytes, allcfgs, 0);
123
free(buf);
124
125
return (error);
126
}
127
128
/*
129
* Read a map from the specified filename. A file contains multiple maps
130
* which are indexed by the number of children. The caller is responsible
131
* for freeing the configuration returned.
132
*/
133
static int
134
read_map_key(const char *filename, const char *key, nvlist_t **cfg)
135
{
136
nvlist_t *allcfgs, *foundcfg = NULL;
137
int error;
138
139
error = read_map(filename, &allcfgs);
140
if (error != 0)
141
return (error);
142
143
(void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg);
144
if (foundcfg != NULL) {
145
nvlist_dup(foundcfg, cfg, KM_SLEEP);
146
error = 0;
147
} else {
148
error = ENOENT;
149
}
150
151
nvlist_free(allcfgs);
152
153
return (error);
154
}
155
156
/*
157
* Write all mappings to the map file.
158
*/
159
static int
160
write_map(const char *filename, nvlist_t *allcfgs)
161
{
162
size_t buflen = 0;
163
int error;
164
165
error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
166
if (error)
167
return (error);
168
169
char *buf = malloc(buflen);
170
if (buf == NULL)
171
return (ENOMEM);
172
173
error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
174
if (error) {
175
free(buf);
176
return (error);
177
}
178
179
/*
180
* Atomically update the file using a temporary file and the
181
* traditional unlink then rename steps. This code provides
182
* no locking, it only guarantees the packed nvlist on disk
183
* is updated atomically and is internally consistent.
184
*/
185
char *tmpname = calloc(1, MAXPATHLEN);
186
if (tmpname == NULL) {
187
free(buf);
188
return (ENOMEM);
189
}
190
191
snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);
192
193
int fd = mkstemp(tmpname);
194
if (fd < 0) {
195
error = errno;
196
free(buf);
197
free(tmpname);
198
return (error);
199
}
200
(void) close(fd);
201
202
gzFile fp = gzopen(tmpname, "w9b");
203
if (fp == Z_NULL) {
204
error = errno;
205
free(buf);
206
free(tmpname);
207
return (error);
208
}
209
210
ssize_t rc, bytes = 0;
211
while (bytes < buflen) {
212
size_t size = MIN(buflen - bytes, 131072);
213
rc = gzwrite(fp, buf + bytes, size);
214
if (rc < 0) {
215
free(buf);
216
(void) gzerror(fp, &error);
217
(void) gzclose(fp);
218
(void) unlink(tmpname);
219
free(tmpname);
220
return (error);
221
} else if (rc == 0) {
222
break;
223
} else {
224
bytes += rc;
225
}
226
}
227
228
free(buf);
229
(void) gzclose(fp);
230
231
if (bytes != buflen) {
232
(void) unlink(tmpname);
233
free(tmpname);
234
return (EIO);
235
}
236
237
/*
238
* Unlink the previous config file and replace it with the updated
239
* version. If we're able to unlink the file then directory is
240
* writable by us and the subsequent rename should never fail.
241
*/
242
error = unlink(filename);
243
if (error != 0 && errno != ENOENT) {
244
error = errno;
245
(void) unlink(tmpname);
246
free(tmpname);
247
return (error);
248
}
249
250
error = rename(tmpname, filename);
251
if (error != 0) {
252
error = errno;
253
(void) unlink(tmpname);
254
free(tmpname);
255
return (error);
256
}
257
258
free(tmpname);
259
260
return (0);
261
}
262
263
/*
264
* Add the dRAID map to the file and write it out.
265
*/
266
static int
267
write_map_key(const char *filename, char *key, draid_map_t *map,
268
double worst_ratio, double avg_ratio)
269
{
270
nvlist_t *nv_cfg, *allcfgs;
271
int error;
272
273
/*
274
* Add the configuration to an existing or new file. The new
275
* configuration will replace an existing configuration with the
276
* same key if it has a lower ratio and is therefore better.
277
*/
278
error = read_map(filename, &allcfgs);
279
if (error == ENOENT) {
280
allcfgs = fnvlist_alloc();
281
} else if (error != 0) {
282
return (error);
283
}
284
285
error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
286
if (error == 0) {
287
uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
288
MAP_WORST_RATIO);
289
double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
290
291
if (worst_ratio < nv_worst_ratio) {
292
/* Replace old map with the more balanced new map. */
293
fnvlist_remove(allcfgs, key);
294
} else {
295
/* The old map is preferable, keep it. */
296
nvlist_free(allcfgs);
297
return (EEXIST);
298
}
299
}
300
301
nvlist_t *cfg = fnvlist_alloc();
302
fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
303
fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
304
fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
305
fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
306
fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,
307
map->dm_children * map->dm_nperms * sizeof (uint8_t));
308
309
fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
310
(uint64_t)(worst_ratio * 1000.0));
311
fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
312
(uint64_t)(avg_ratio * 1000.0));
313
314
error = nvlist_add_nvlist(allcfgs, key, cfg);
315
if (error == 0)
316
error = write_map(filename, allcfgs);
317
318
nvlist_free(cfg);
319
nvlist_free(allcfgs);
320
return (error);
321
}
322
323
static void
324
dump_map(draid_map_t *map, const char *key, double worst_ratio,
325
double avg_ratio, int verbose)
326
{
327
if (verbose == 0) {
328
return;
329
} else if (verbose == 1) {
330
printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
331
"avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
332
worst_ratio, avg_ratio);
333
return;
334
} else {
335
printf(" \"%s\":\n"
336
" seed: 0x%016llx\n"
337
" checksum: 0x%016llx\n"
338
" worst_ratio: %2.03f\n"
339
" avg_ratio: %2.03f\n"
340
" children: %llu\n"
341
" nperms: %llu\n",
342
key, (u_longlong_t)map->dm_seed,
343
(u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
344
(u_longlong_t)map->dm_children,
345
(u_longlong_t)map->dm_nperms);
346
347
if (verbose > 2) {
348
printf(" perms = {\n");
349
for (int i = 0; i < map->dm_nperms; i++) {
350
printf(" { ");
351
for (int j = 0; j < map->dm_children; j++) {
352
printf("%3d%s ", map->dm_perms[
353
i * map->dm_children + j],
354
j < map->dm_children - 1 ?
355
"," : "");
356
}
357
printf(" },\n");
358
}
359
printf(" }\n");
360
} else if (verbose == 2) {
361
printf(" draid_perms = <omitted>\n");
362
}
363
}
364
}
365
366
static void
367
dump_map_nv(const char *key, nvlist_t *cfg, int verbose)
368
{
369
draid_map_t map;
370
uint_t c;
371
372
uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
373
uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
374
375
map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
376
map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
377
map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
378
map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
379
map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c);
380
381
dump_map(&map, key, (double)worst_ratio / 1000.0,
382
avg_ratio / 1000.0, verbose);
383
}
384
385
/*
386
* Print a summary of the mapping.
387
*/
388
static int
389
dump_map_key(const char *filename, const char *key, int verbose)
390
{
391
nvlist_t *cfg;
392
int error;
393
394
error = read_map_key(filename, key, &cfg);
395
if (error != 0)
396
return (error);
397
398
dump_map_nv(key, cfg, verbose);
399
400
return (0);
401
}
402
403
/*
404
* Allocate a new permutation map for evaluation.
405
*/
406
static int
407
alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
408
draid_map_t **mapp)
409
{
410
draid_map_t *map;
411
int error;
412
413
map = malloc(sizeof (draid_map_t));
414
if (map == NULL)
415
return (ENOMEM);
416
417
map->dm_children = children;
418
map->dm_nperms = nperms;
419
map->dm_seed = seed;
420
map->dm_checksum = 0;
421
422
error = vdev_draid_generate_perms(map, &map->dm_perms);
423
if (error) {
424
free(map);
425
return (error);
426
}
427
428
*mapp = map;
429
430
return (0);
431
}
432
433
/*
434
* Allocate the fixed permutation map for N children.
435
*/
436
static int
437
alloc_fixed_map(uint64_t children, draid_map_t **mapp)
438
{
439
const draid_map_t *fixed_map;
440
draid_map_t *map;
441
int error;
442
443
error = vdev_draid_lookup_map(children, &fixed_map);
444
if (error)
445
return (error);
446
447
map = malloc(sizeof (draid_map_t));
448
if (map == NULL)
449
return (ENOMEM);
450
451
memcpy(map, fixed_map, sizeof (draid_map_t));
452
VERIFY3U(map->dm_checksum, !=, 0);
453
454
error = vdev_draid_generate_perms(map, &map->dm_perms);
455
if (error) {
456
free(map);
457
return (error);
458
}
459
460
*mapp = map;
461
462
return (0);
463
}
464
465
/*
466
* Free a permutation map.
467
*/
468
static void
469
free_map(draid_map_t *map)
470
{
471
free(map->dm_perms);
472
free(map);
473
}
474
475
/*
476
* Check if dev is in the provided list of faulted devices.
477
*/
478
static inline boolean_t
479
is_faulted(int *faulted_devs, int nfaulted, int dev)
480
{
481
for (int i = 0; i < nfaulted; i++)
482
if (faulted_devs[i] == dev)
483
return (B_TRUE);
484
485
return (B_FALSE);
486
}
487
488
/*
489
* Evaluate how resilvering I/O will be distributed given a list of faulted
490
* vdevs. As a simplification we assume one IO is sufficient to repair each
491
* damaged device in a group.
492
*/
493
static double
494
eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
495
int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
496
{
497
uint64_t children = map->dm_children;
498
uint64_t ngroups = 1;
499
uint64_t ndisks = children - nspares;
500
501
/*
502
* Calculate the minimum number of groups required to fill a slice.
503
*/
504
while (ngroups * (groupwidth) % (children - nspares) != 0)
505
ngroups++;
506
507
int *ios = calloc(map->dm_children, sizeof (uint64_t));
508
509
ASSERT3P(ios, !=, NULL);
510
511
/* Resilver all rows */
512
for (int i = 0; i < map->dm_nperms; i++) {
513
uint8_t *row = &map->dm_perms[i * map->dm_children];
514
515
/* Resilver all groups with faulted drives */
516
for (int j = 0; j < ngroups; j++) {
517
uint64_t spareidx = map->dm_children - nspares;
518
boolean_t repair_needed = B_FALSE;
519
520
/* See if any devices in this group are faulted */
521
uint64_t groupstart = (j * groupwidth) % ndisks;
522
523
for (int k = 0; k < groupwidth; k++) {
524
uint64_t groupidx = (groupstart + k) % ndisks;
525
526
repair_needed = is_faulted(faulted_devs,
527
nfaulted, row[groupidx]);
528
if (repair_needed)
529
break;
530
}
531
532
if (repair_needed == B_FALSE)
533
continue;
534
535
/*
536
* This group is degraded. Calculate the number of
537
* reads the non-faulted drives require and the number
538
* of writes to the distributed hot spare for this row.
539
*/
540
for (int k = 0; k < groupwidth; k++) {
541
uint64_t groupidx = (groupstart + k) % ndisks;
542
543
if (!is_faulted(faulted_devs, nfaulted,
544
row[groupidx])) {
545
ios[row[groupidx]]++;
546
} else if (nspares > 0) {
547
while (is_faulted(faulted_devs,
548
nfaulted, row[spareidx])) {
549
spareidx++;
550
}
551
552
ASSERT3U(spareidx, <, map->dm_children);
553
ios[row[spareidx]]++;
554
spareidx++;
555
}
556
}
557
}
558
}
559
560
*min_child_ios = INT_MAX;
561
*max_child_ios = 0;
562
563
/*
564
* Find the drives with fewest and most required I/O. These values
565
* are used to calculate the imbalance ratio. To avoid returning an
566
* infinite value for permutations which have children that perform
567
* no IO a floor of 1 IO per child is set. This ensures a meaningful
568
* ratio is returned for comparison and it is not an uncommon when
569
* there are a large number of children.
570
*/
571
for (int i = 0; i < map->dm_children; i++) {
572
573
if (is_faulted(faulted_devs, nfaulted, i)) {
574
ASSERT0(ios[i]);
575
continue;
576
}
577
578
if (ios[i] == 0)
579
ios[i] = 1;
580
581
if (ios[i] < *min_child_ios)
582
*min_child_ios = ios[i];
583
584
if (ios[i] > *max_child_ios)
585
*max_child_ios = ios[i];
586
}
587
588
ASSERT3S(*min_child_ios, !=, INT_MAX);
589
ASSERT3S(*max_child_ios, !=, 0);
590
591
double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
592
593
free(ios);
594
595
return (ratio);
596
}
597
598
/*
599
* Evaluate the quality of the permutation mapping by considering possible
600
* device failures. Returns the imbalance ratio for the worst mapping which
601
* is defined to be the largest number of child IOs over the fewest number
602
* child IOs. A value of 1.0 indicates the mapping is perfectly balance and
603
* all children perform an equal amount of work during reconstruction.
604
*/
605
static void
606
eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
607
{
608
uint64_t children = map->dm_children;
609
double worst_ratio = 1.0;
610
double sum = 0;
611
int worst_min_ios = 0, worst_max_ios = 0;
612
int n = 0;
613
614
/*
615
* When there are only 2 children there can be no distributed
616
* spare and no resilver to evaluate. Default to a ratio of 1.0
617
* for this degenerate case.
618
*/
619
if (children == VDEV_DRAID_MIN_CHILDREN) {
620
*worst_ratiop = 1.0;
621
*avg_ratiop = 1.0;
622
return;
623
}
624
625
/*
626
* Score the mapping as if it had either 1 or 2 distributed spares.
627
*/
628
for (int nspares = 1; nspares <= 2; nspares++) {
629
uint64_t faults = nspares;
630
631
/*
632
* Score groupwidths up to 19. This value was chosen as the
633
* largest reasonable width (16d+3p). dRAID pools may be still
634
* be created with wider stripes but they are not considered in
635
* this analysis in order to optimize for the most common cases.
636
*/
637
for (uint64_t groupwidth = 2;
638
groupwidth <= MIN(children - nspares, 19);
639
groupwidth++) {
640
int faulted_devs[2];
641
int min_ios, max_ios;
642
643
/*
644
* Score possible devices faults. This is limited
645
* to exactly one fault per distributed spare for
646
* the purposes of this similation.
647
*/
648
for (int f1 = 0; f1 < children; f1++) {
649
faulted_devs[0] = f1;
650
double ratio;
651
652
if (faults == 1) {
653
ratio = eval_resilver(map, groupwidth,
654
nspares, faulted_devs, faults,
655
&min_ios, &max_ios);
656
657
if (ratio > worst_ratio) {
658
worst_ratio = ratio;
659
worst_min_ios = min_ios;
660
worst_max_ios = max_ios;
661
}
662
663
sum += ratio;
664
n++;
665
} else if (faults == 2) {
666
for (int f2 = f1 + 1; f2 < children;
667
f2++) {
668
faulted_devs[1] = f2;
669
670
ratio = eval_resilver(map,
671
groupwidth, nspares,
672
faulted_devs, faults,
673
&min_ios, &max_ios);
674
675
if (ratio > worst_ratio) {
676
worst_ratio = ratio;
677
worst_min_ios = min_ios;
678
worst_max_ios = max_ios;
679
}
680
681
sum += ratio;
682
n++;
683
}
684
}
685
}
686
}
687
}
688
689
*worst_ratiop = worst_ratio;
690
*avg_ratiop = sum / n;
691
692
/*
693
* Log the min/max io values for particularly unbalanced maps.
694
* Since the maps are generated entirely randomly these are possible
695
* be exceedingly unlikely. We log it for possible investigation.
696
*/
697
if (worst_ratio > 100.0) {
698
dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
699
printf("worst_min_ios=%d worst_max_ios=%d\n",
700
worst_min_ios, worst_max_ios);
701
}
702
}
703
704
static int
705
eval_maps(uint64_t children, int passes, uint64_t *map_seed,
706
draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
707
{
708
draid_map_t *best_map = NULL;
709
double best_worst_ratio = 1000.0;
710
double best_avg_ratio = 1000.0;
711
712
/*
713
* Perform the requested number of passes evaluating randomly
714
* generated permutation maps. Only the best version is kept.
715
*/
716
for (int i = 0; i < passes; i++) {
717
double worst_ratio, avg_ratio;
718
draid_map_t *map;
719
int error;
720
721
/*
722
* Calculate the next seed and generate a new candidate map.
723
*/
724
error = alloc_new_map(children, MAP_ROWS_DEFAULT,
725
vdev_draid_rand(map_seed), &map);
726
if (error) {
727
if (best_map != NULL)
728
free_map(best_map);
729
return (error);
730
}
731
732
/*
733
* Consider maps with a lower worst_ratio to be of higher
734
* quality. Some maps may have a lower avg_ratio but they
735
* are discarded since they might include some particularly
736
* imbalanced permutations. The average is tracked to in
737
* order to get a sense of the average permutation quality.
738
*/
739
eval_decluster(map, &worst_ratio, &avg_ratio);
740
741
if (best_map == NULL || worst_ratio < best_worst_ratio) {
742
743
if (best_map != NULL)
744
free_map(best_map);
745
746
best_map = map;
747
best_worst_ratio = worst_ratio;
748
best_avg_ratio = avg_ratio;
749
} else {
750
free_map(map);
751
}
752
}
753
754
/*
755
* After determining the best map generate a checksum over the full
756
* permutation array. This checksum is verified when opening a dRAID
757
* pool to ensure the generated in memory permutations are correct.
758
*/
759
zio_cksum_t cksum;
760
fletcher_4_native_varsize(best_map->dm_perms,
761
sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
762
&cksum);
763
best_map->dm_checksum = cksum.zc_word[0];
764
765
*best_mapp = best_map;
766
*best_ratiop = best_worst_ratio;
767
*avg_ratiop = best_avg_ratio;
768
769
return (0);
770
}
771
772
static int
773
draid_generate(int argc, char *argv[])
774
{
775
char filename[MAXPATHLEN] = {0};
776
uint64_t map_seed[2];
777
int c, fd, error, verbose = 0, passes = 1, continuous = 0;
778
int min_children = VDEV_DRAID_MIN_CHILDREN;
779
int max_children = VDEV_DRAID_MAX_CHILDREN;
780
int restarts = 0;
781
782
while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
783
switch (c) {
784
case 'c':
785
continuous++;
786
break;
787
case 'm':
788
min_children = (int)strtol(optarg, NULL, 0);
789
if (min_children < VDEV_DRAID_MIN_CHILDREN) {
790
(void) fprintf(stderr, "A minimum of 2 "
791
"children are required.\n");
792
return (1);
793
}
794
795
break;
796
case 'n':
797
max_children = (int)strtol(optarg, NULL, 0);
798
if (max_children > VDEV_DRAID_MAX_CHILDREN) {
799
(void) fprintf(stderr, "A maximum of %d "
800
"children are allowed.\n",
801
VDEV_DRAID_MAX_CHILDREN);
802
return (1);
803
}
804
break;
805
case 'p':
806
passes = (int)strtol(optarg, NULL, 0);
807
break;
808
case 'v':
809
/*
810
* 0 - Only log when a better map is added to the file.
811
* 1 - Log the current best map for each child count.
812
* Minimal output on a single summary line.
813
* 2 - Log the current best map for each child count.
814
* More verbose includes most map fields.
815
* 3 - Log the current best map for each child count.
816
* Very verbose all fields including the full map.
817
*/
818
verbose++;
819
break;
820
case ':':
821
(void) fprintf(stderr,
822
"missing argument for '%c' option\n", optopt);
823
draid_usage();
824
break;
825
case '?':
826
(void) fprintf(stderr, "invalid option '%c'\n",
827
optopt);
828
draid_usage();
829
break;
830
}
831
}
832
833
if (argc > optind)
834
strlcpy(filename, argv[optind], sizeof (filename));
835
else {
836
(void) fprintf(stderr, "A FILE must be specified.\n");
837
return (1);
838
}
839
840
restart:
841
/*
842
* Start with a fresh seed from /dev/urandom.
843
*/
844
fd = open("/dev/urandom", O_RDONLY);
845
if (fd < 0) {
846
printf("Unable to open /dev/urandom: %s\n:", strerror(errno));
847
return (1);
848
} else {
849
ssize_t bytes = sizeof (map_seed);
850
ssize_t bytes_read = 0;
851
852
while (bytes_read < bytes) {
853
ssize_t rc = read(fd, ((char *)map_seed) + bytes_read,
854
bytes - bytes_read);
855
if (rc < 0) {
856
printf("Unable to read /dev/urandom: %s\n:",
857
strerror(errno));
858
close(fd);
859
return (1);
860
}
861
bytes_read += rc;
862
}
863
864
(void) close(fd);
865
}
866
867
if (restarts == 0)
868
printf("Writing generated mappings to '%s':\n", filename);
869
870
/*
871
* Generate maps for all requested child counts. The best map for
872
* each child count is written out to the specified file. If the file
873
* already contains a better mapping this map will not be added.
874
*/
875
for (uint64_t children = min_children;
876
children <= max_children; children++) {
877
char key[8] = { 0 };
878
draid_map_t *map;
879
double worst_ratio = 1000.0;
880
double avg_ratio = 1000.0;
881
882
error = eval_maps(children, passes, map_seed, &map,
883
&worst_ratio, &avg_ratio);
884
if (error) {
885
printf("Error eval_maps(): %s\n", strerror(error));
886
return (1);
887
}
888
889
if (worst_ratio < 1.0 || avg_ratio < 1.0) {
890
printf("Error ratio < 1.0: worst_ratio = %2.03f "
891
"avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
892
return (1);
893
}
894
895
snprintf(key, 7, "%llu", (u_longlong_t)children);
896
error = write_map_key(filename, key, map, worst_ratio,
897
avg_ratio);
898
if (error == 0) {
899
/* The new map was added to the file. */
900
dump_map(map, key, worst_ratio, avg_ratio,
901
MAX(verbose, 1));
902
} else if (error == EEXIST) {
903
/* The existing map was preferable and kept. */
904
if (verbose > 0)
905
dump_map_key(filename, key, verbose);
906
} else {
907
printf("Error write_map_key(): %s\n", strerror(error));
908
return (1);
909
}
910
911
free_map(map);
912
}
913
914
/*
915
* When the continuous option is set restart at the minimum number of
916
* children instead of exiting. This option is useful as a mechanism
917
* to continuous try and refine the discovered permutations.
918
*/
919
if (continuous) {
920
restarts++;
921
printf("Restarting by request (-c): %d\n", restarts);
922
goto restart;
923
}
924
925
return (0);
926
}
927
928
/*
929
* Verify each map in the file by generating its in-memory permutation array
930
* and comfirming its checksum is correct.
931
*/
932
static int
933
draid_verify(int argc, char *argv[])
934
{
935
char filename[MAXPATHLEN] = {0};
936
int n = 0, c, error, verbose = 1;
937
int check_ratios = 0;
938
939
while ((c = getopt(argc, argv, ":rv")) != -1) {
940
switch (c) {
941
case 'r':
942
check_ratios++;
943
break;
944
case 'v':
945
verbose++;
946
break;
947
case ':':
948
(void) fprintf(stderr,
949
"missing argument for '%c' option\n", optopt);
950
draid_usage();
951
break;
952
case '?':
953
(void) fprintf(stderr, "invalid option '%c'\n",
954
optopt);
955
draid_usage();
956
break;
957
}
958
}
959
960
if (argc > optind) {
961
char *abspath = malloc(MAXPATHLEN);
962
if (abspath == NULL)
963
return (ENOMEM);
964
965
if (realpath(argv[optind], abspath) != NULL)
966
strlcpy(filename, abspath, sizeof (filename));
967
else
968
strlcpy(filename, argv[optind], sizeof (filename));
969
970
free(abspath);
971
} else {
972
(void) fprintf(stderr, "A FILE must be specified.\n");
973
return (1);
974
}
975
976
printf("Verifying permutation maps: '%s'\n", filename);
977
978
/*
979
* Lookup hardcoded permutation map for each valid number of children
980
* and verify a generated map has the correct checksum. Then compare
981
* the generated map values with the nvlist map values read from the
982
* reference file to cross-check the permutation.
983
*/
984
for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
985
children <= VDEV_DRAID_MAX_CHILDREN;
986
children++) {
987
draid_map_t *map;
988
char key[8] = {0};
989
990
snprintf(key, 8, "%llu", (u_longlong_t)children);
991
992
error = alloc_fixed_map(children, &map);
993
if (error) {
994
printf("Error alloc_fixed_map() failed: %s\n",
995
error == ECKSUM ? "Invalid checksum" :
996
strerror(error));
997
return (1);
998
}
999
1000
uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
1001
uint8_t *nv_perms;
1002
nvlist_t *cfg;
1003
uint_t c;
1004
1005
error = read_map_key(filename, key, &cfg);
1006
if (error != 0) {
1007
printf("Error read_map_key() failed: %s\n",
1008
strerror(error));
1009
free_map(map);
1010
return (1);
1011
}
1012
1013
nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1014
nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1015
nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1016
nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1017
nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
1018
1019
/*
1020
* Compare draid_map_t and nvlist reference values.
1021
*/
1022
if (map->dm_seed != nv_seed) {
1023
printf("Error different seeds: 0x%016llx != "
1024
"0x%016llx\n", (u_longlong_t)map->dm_seed,
1025
(u_longlong_t)nv_seed);
1026
error = EINVAL;
1027
}
1028
1029
if (map->dm_checksum != nv_checksum) {
1030
printf("Error different checksums: 0x%016llx "
1031
"!= 0x%016llx\n",
1032
(u_longlong_t)map->dm_checksum,
1033
(u_longlong_t)nv_checksum);
1034
error = EINVAL;
1035
}
1036
1037
if (map->dm_children != nv_children) {
1038
printf("Error different children: %llu "
1039
"!= %llu\n", (u_longlong_t)map->dm_children,
1040
(u_longlong_t)nv_children);
1041
error = EINVAL;
1042
}
1043
1044
if (map->dm_nperms != nv_nperms) {
1045
printf("Error different nperms: %llu "
1046
"!= %llu\n", (u_longlong_t)map->dm_nperms,
1047
(u_longlong_t)nv_nperms);
1048
error = EINVAL;
1049
}
1050
1051
for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
1052
if (map->dm_perms[i] != nv_perms[i]) {
1053
printf("Error different perms[%llu]: "
1054
"%d != %d\n", (u_longlong_t)i,
1055
(int)map->dm_perms[i],
1056
(int)nv_perms[i]);
1057
error = EINVAL;
1058
break;
1059
}
1060
}
1061
1062
/*
1063
* For good measure recalculate the worst and average
1064
* ratios and confirm they match the nvlist values.
1065
*/
1066
if (check_ratios) {
1067
uint64_t nv_worst_ratio, nv_avg_ratio;
1068
double worst_ratio, avg_ratio;
1069
1070
eval_decluster(map, &worst_ratio, &avg_ratio);
1071
1072
nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1073
MAP_WORST_RATIO);
1074
nv_avg_ratio = fnvlist_lookup_uint64(cfg,
1075
MAP_AVG_RATIO);
1076
1077
if (worst_ratio < 1.0 || avg_ratio < 1.0) {
1078
printf("Error ratio out of range %2.03f, "
1079
"%2.03f\n", worst_ratio, avg_ratio);
1080
error = EINVAL;
1081
}
1082
1083
if ((uint64_t)(worst_ratio * 1000.0) !=
1084
nv_worst_ratio) {
1085
printf("Error different worst_ratio %2.03f "
1086
"!= %2.03f\n", (double)nv_worst_ratio /
1087
1000.0, worst_ratio);
1088
error = EINVAL;
1089
}
1090
1091
if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
1092
printf("Error different average_ratio %2.03f "
1093
"!= %2.03f\n", (double)nv_avg_ratio /
1094
1000.0, avg_ratio);
1095
error = EINVAL;
1096
}
1097
}
1098
1099
if (error) {
1100
free_map(map);
1101
nvlist_free(cfg);
1102
return (1);
1103
}
1104
1105
if (verbose > 0) {
1106
printf("- %llu children: good\n",
1107
(u_longlong_t)children);
1108
}
1109
n++;
1110
1111
free_map(map);
1112
nvlist_free(cfg);
1113
}
1114
1115
if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
1116
printf("Error permutation maps missing: %d / %d checked\n",
1117
n, VDEV_DRAID_MAX_CHILDREN - 1);
1118
return (1);
1119
}
1120
1121
printf("Successfully verified %d / %d permutation maps\n",
1122
n, VDEV_DRAID_MAX_CHILDREN - 1);
1123
1124
return (0);
1125
}
1126
1127
/*
1128
* Dump the contents of the specified mapping(s) for inspection.
1129
*/
1130
static int
1131
draid_dump(int argc, char *argv[])
1132
{
1133
char filename[MAXPATHLEN] = {0};
1134
int c, error, verbose = 1;
1135
int min_children = VDEV_DRAID_MIN_CHILDREN;
1136
int max_children = VDEV_DRAID_MAX_CHILDREN;
1137
1138
while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
1139
switch (c) {
1140
case 'm':
1141
min_children = (int)strtol(optarg, NULL, 0);
1142
if (min_children < 2) {
1143
(void) fprintf(stderr, "A minimum of 2 "
1144
"children are required.\n");
1145
return (1);
1146
}
1147
1148
break;
1149
case 'n':
1150
max_children = (int)strtol(optarg, NULL, 0);
1151
if (max_children > VDEV_DRAID_MAX_CHILDREN) {
1152
(void) fprintf(stderr, "A maximum of %d "
1153
"children are allowed.\n",
1154
VDEV_DRAID_MAX_CHILDREN);
1155
return (1);
1156
}
1157
break;
1158
case 'v':
1159
verbose++;
1160
break;
1161
case ':':
1162
(void) fprintf(stderr,
1163
"missing argument for '%c' option\n", optopt);
1164
draid_usage();
1165
break;
1166
case '?':
1167
(void) fprintf(stderr, "invalid option '%c'\n",
1168
optopt);
1169
draid_usage();
1170
break;
1171
}
1172
}
1173
1174
if (argc > optind)
1175
strlcpy(filename, argv[optind], sizeof (filename));
1176
else {
1177
(void) fprintf(stderr, "A FILE must be specified.\n");
1178
return (1);
1179
}
1180
1181
/*
1182
* Dump maps for the requested child counts.
1183
*/
1184
for (uint64_t children = min_children;
1185
children <= max_children; children++) {
1186
char key[8] = { 0 };
1187
1188
snprintf(key, 7, "%llu", (u_longlong_t)children);
1189
error = dump_map_key(filename, key, verbose);
1190
if (error) {
1191
printf("Error dump_map_key(): %s\n", strerror(error));
1192
return (1);
1193
}
1194
}
1195
1196
return (0);
1197
}
1198
1199
/*
1200
* Print all of the mappings as a C formatted draid_map_t array. This table
1201
* is found in the module/zcommon/zfs_draid.c file and is the definitive
1202
* source for all mapping used by dRAID. It cannot be updated without
1203
* changing the dRAID on disk format.
1204
*/
1205
static int
1206
draid_table(int argc, char *argv[])
1207
{
1208
char filename[MAXPATHLEN] = {0};
1209
int error;
1210
1211
if (argc > optind)
1212
strlcpy(filename, argv[optind], sizeof (filename));
1213
else {
1214
(void) fprintf(stderr, "A FILE must be specified.\n");
1215
return (1);
1216
}
1217
1218
printf("static const draid_map_t "
1219
"draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
1220
1221
for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
1222
children <= VDEV_DRAID_MAX_CHILDREN;
1223
children++) {
1224
uint64_t seed, checksum, nperms, avg_ratio;
1225
nvlist_t *cfg;
1226
char key[8] = {0};
1227
1228
snprintf(key, 8, "%llu", (u_longlong_t)children);
1229
1230
error = read_map_key(filename, key, &cfg);
1231
if (error != 0) {
1232
printf("Error read_map_key() failed: %s\n",
1233
strerror(error));
1234
return (1);
1235
}
1236
1237
seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1238
checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1239
children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1240
nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1241
avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
1242
1243
printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
1244
"/* %2.03f */\n", (u_longlong_t)children,
1245
(u_longlong_t)nperms, (u_longlong_t)seed,
1246
(u_longlong_t)checksum, (double)avg_ratio / 1000.0);
1247
1248
nvlist_free(cfg);
1249
}
1250
1251
printf("};\n");
1252
1253
return (0);
1254
}
1255
1256
static int
1257
draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
1258
{
1259
nvlist_t *srccfgs;
1260
nvpair_t *elem = NULL;
1261
int error, merged = 0;
1262
1263
error = read_map(srcfilename, &srccfgs);
1264
if (error != 0)
1265
return (error);
1266
1267
while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
1268
uint64_t nv_worst_ratio;
1269
uint64_t allcfg_worst_ratio;
1270
nvlist_t *cfg, *allcfg;
1271
const char *key;
1272
1273
switch (nvpair_type(elem)) {
1274
case DATA_TYPE_NVLIST:
1275
1276
(void) nvpair_value_nvlist(elem, &cfg);
1277
key = nvpair_name(elem);
1278
1279
nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1280
MAP_WORST_RATIO);
1281
1282
error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
1283
if (error == 0) {
1284
allcfg_worst_ratio = fnvlist_lookup_uint64(
1285
allcfg, MAP_WORST_RATIO);
1286
1287
if (nv_worst_ratio < allcfg_worst_ratio) {
1288
fnvlist_remove(allcfgs, key);
1289
fnvlist_add_nvlist(allcfgs, key, cfg);
1290
merged++;
1291
}
1292
} else if (error == ENOENT) {
1293
fnvlist_add_nvlist(allcfgs, key, cfg);
1294
merged++;
1295
} else {
1296
return (error);
1297
}
1298
1299
break;
1300
default:
1301
continue;
1302
}
1303
}
1304
1305
nvlist_free(srccfgs);
1306
1307
*mergedp = merged;
1308
1309
return (0);
1310
}
1311
1312
/*
1313
* Merge the best map for each child count found in the listed files into
1314
* a new file. This allows 'draid generate' to be run in parallel and for
1315
* the results maps to be combined.
1316
*/
1317
static int
1318
draid_merge(int argc, char *argv[])
1319
{
1320
char filename[MAXPATHLEN] = {0};
1321
int c, error, total_merged = 0;
1322
nvlist_t *allcfgs;
1323
1324
while ((c = getopt(argc, argv, ":")) != -1) {
1325
switch (c) {
1326
case ':':
1327
(void) fprintf(stderr,
1328
"missing argument for '%c' option\n", optopt);
1329
draid_usage();
1330
break;
1331
case '?':
1332
(void) fprintf(stderr, "invalid option '%c'\n",
1333
optopt);
1334
draid_usage();
1335
break;
1336
}
1337
}
1338
1339
if (argc < 4) {
1340
(void) fprintf(stderr,
1341
"A FILE and multiple SRCs must be specified.\n");
1342
return (1);
1343
}
1344
1345
strlcpy(filename, argv[optind], sizeof (filename));
1346
optind++;
1347
1348
error = read_map(filename, &allcfgs);
1349
if (error == ENOENT) {
1350
allcfgs = fnvlist_alloc();
1351
} else if (error != 0) {
1352
printf("Error read_map(): %s\n", strerror(error));
1353
return (error);
1354
}
1355
1356
while (optind < argc) {
1357
char srcfilename[MAXPATHLEN] = {0};
1358
int merged = 0;
1359
1360
strlcpy(srcfilename, argv[optind], sizeof (srcfilename));
1361
1362
error = draid_merge_impl(allcfgs, srcfilename, &merged);
1363
if (error) {
1364
printf("Error draid_merge_impl(): %s\n",
1365
strerror(error));
1366
nvlist_free(allcfgs);
1367
return (1);
1368
}
1369
1370
total_merged += merged;
1371
printf("Merged %d key(s) from '%s' into '%s'\n", merged,
1372
srcfilename, filename);
1373
1374
optind++;
1375
}
1376
1377
if (total_merged > 0)
1378
write_map(filename, allcfgs);
1379
1380
printf("Merged a total of %d key(s) into '%s'\n", total_merged,
1381
filename);
1382
1383
nvlist_free(allcfgs);
1384
1385
return (0);
1386
}
1387
1388
/*
 * Program entry point.  The first argument selects the subcommand to run
 * and the matching handler's return value becomes the return value of
 * main().  Each handler receives the argument vector shifted by one, so
 * that the subcommand name occupies its argv[0].  draid_usage() is called
 * when no subcommand was given or when it is not recognized.
 */
int
main(int argc, char *argv[])
{
	if (argc < 2)
		draid_usage();

	const char *cmd = argv[1];
	int sub_argc = argc - 1;
	char **sub_argv = argv + 1;

	if (strcmp(cmd, "generate") == 0)
		return (draid_generate(sub_argc, sub_argv));

	if (strcmp(cmd, "verify") == 0)
		return (draid_verify(sub_argc, sub_argv));

	if (strcmp(cmd, "dump") == 0)
		return (draid_dump(sub_argc, sub_argv));

	if (strcmp(cmd, "table") == 0)
		return (draid_table(sub_argc, sub_argv));

	if (strcmp(cmd, "merge") == 0)
		return (draid_merge(sub_argc, sub_argv));

	/* Not one of the known subcommands. */
	draid_usage();
}
1410
1411