Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/cddl/boot/zfs/zfssubr.c
48253 views
1
/*
2
* CDDL HEADER START
3
*
4
* The contents of this file are subject to the terms of the
5
* Common Development and Distribution License (the "License").
6
* You may not use this file except in compliance with the License.
7
*
8
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9
* or http://www.opensolaris.org/os/licensing.
10
* See the License for the specific language governing permissions
11
* and limitations under the License.
12
*
13
* When distributing Covered Code, include this CDDL HEADER in each
14
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15
* If applicable, add the following below this CDDL HEADER, with the
16
* fields enclosed by brackets "[]" replaced with your own identifying
17
* information: Portions Copyright [yyyy] [name of copyright owner]
18
*
19
* CDDL HEADER END
20
*/
21
/*
22
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23
* Use is subject to license terms.
24
*/
25
26
#include <sys/cdefs.h>
27
#include <lz4.h>
28
29
/* CRC64 table for the ZAP hash; lazily populated by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

#ifndef ASSERT3S	/* Proxy for all the assert defines */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)
#endif

/*
 * Boot code has no way to recover from a fatal error: report it and
 * spin forever.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)
43
44
static void
45
zfs_init_crc(void)
46
{
47
int i, j;
48
uint64_t *ct;
49
50
/*
51
* Calculate the crc64 table (used for the zap hash
52
* function).
53
*/
54
if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
55
memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
56
for (i = 0; i < 256; i++)
57
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
58
*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
59
}
60
}
61
62
/*
 * "off" checksum vector: unconditionally produces an all-zero checksum,
 * ignoring the buffer contents.  buf/size/ctx_template are unused.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
68
69
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Allocate / free a per-pool context template (used by salted checksums). */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;
89
90
/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder (0 = native, 1 = byteswap) */
	zio_checksum_t *ci_func[2];
	/* optional per-pool context template hooks; NULL for most algorithms */
	zio_checksum_tmpl_init_t *ci_tmpl_init;
	zio_checksum_tmpl_free_t *ci_tmpl_free;
	zio_checksum_flags_t ci_flags;
	const char *ci_name;	/* descriptive name */
} zio_checksum_info_t;
101
102
#include "blkptr.c"
103
104
#include "fletcher.c"
105
#include "blake3_zfs.c"
106
#include "sha256.c"
107
#include "skein_zfs.c"
108
109
#ifdef HAS_ZSTD_ZFS
110
extern int zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
111
size_t d_len, int n);
112
#endif
113
114
/*
 * Checksum vectors, indexed by the on-disk ZIO_CHECKSUM_* value
 * (see BP_GET_CHECKSUM() use in zio_checksum_verify()).
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
	{{zio_checksum_blake3_native, zio_checksum_blake3_byteswap},
	    zio_checksum_blake3_tmpl_init, zio_checksum_blake3_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}
};
150
151
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t *ci_compress;	/* compression function */
	zio_decompress_func_t *ci_decompress;	/* decompression function */
	int ci_level;				/* level parameter */
	const char *ci_name;			/* algorithm name */
} zio_compress_info_t;
168
169
#include "lzjb.c"
170
#include "zle.c"
171
#include "gzip.c"
172
173
/*
 * Compression vectors.  The boot code only ever decompresses, so every
 * ci_compress slot is NULL.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, gzip_decompress, 1, "gzip-1"},
	{NULL, gzip_decompress, 2, "gzip-2"},
	{NULL, gzip_decompress, 3, "gzip-3"},
	{NULL, gzip_decompress, 4, "gzip-4"},
	{NULL, gzip_decompress, 5, "gzip-5"},
	{NULL, gzip_decompress, 6, "gzip-6"},
	{NULL, gzip_decompress, 7, "gzip-7"},
	{NULL, gzip_decompress, 8, "gzip-8"},
	{NULL, gzip_decompress, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
#ifdef HAS_ZSTD_ZFS
	{NULL, zfs_zstd_decompress_buf, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};
197
198
/*
 * Byte-swap an array of 64-bit words in place.
 *
 * vbuf:  buffer to swap
 * size:  buffer length in bytes; must be a multiple of 8
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	/*
	 * The index must be size_t: the original 'int' index was compared
	 * against a size_t count (signed/unsigned mismatch) and would
	 * overflow for buffers larger than INT_MAX words.
	 */
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
210
211
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	/* Fourth checksum word is unused for gang verifiers. */
	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
225
226
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
236
237
/*
238
* Calls the template init function of a checksum which supports context
239
* templates and installs the template into the spa_t.
240
*/
241
static void
242
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
243
{
244
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
245
246
if (ci->ci_tmpl_init == NULL)
247
return;
248
249
if (spa->spa_cksum_tmpls[checksum] != NULL)
250
return;
251
252
if (spa->spa_cksum_tmpls[checksum] == NULL) {
253
spa->spa_cksum_tmpls[checksum] =
254
ci->ci_tmpl_init(&spa->spa_cksum_salt);
255
}
256
}
257
258
/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void __unused
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			/* Clear the slot so a later init can repopulate it. */
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}
276
277
/*
 * Verify the checksum of the data buffer against the block pointer bp.
 *
 * Returns 0 on success, EINVAL for an unknown or table-less checksum
 * algorithm, and EIO on checksum mismatch.  spa may be NULL, in which
 * case no context template is used (salted checksums will not verify).
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		/* Lazily create the per-pool template for salted checksums. */
		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* The embedded checksum trailer sits at the end of the buffer. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the embedded
		 * checksum while computing, then restore the original.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}
343
344
static int
345
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
346
void *dest, uint64_t destsize)
347
{
348
zio_compress_info_t *ci;
349
350
if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
351
printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
352
return (EIO);
353
}
354
355
ci = &zio_compress_table[cpfunc];
356
if (!ci->ci_decompress) {
357
printf("ZFS: unsupported compression algorithm %s\n",
358
ci->ci_name);
359
return (EIO);
360
}
361
362
return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
363
}
364
365
static uint64_t
366
zap_hash(uint64_t salt, const char *name)
367
{
368
const uint8_t *cp;
369
uint8_t c;
370
uint64_t crc = salt;
371
372
ASSERT(crc != 0);
373
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
374
for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
375
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
376
377
/*
378
* Only use 28 bits, since we need 4 bits in the cookie for the
379
* collision differentiator. We MUST use the high bits, since
380
* those are the onces that we first pay attention to when
381
* chosing the bucket.
382
*/
383
crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
384
385
return (crc);
386
}
387
388
/* One column (one child vdev's slice) of a RAID-Z I/O. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/* Geometry of a single RAID-Z I/O: parity columns first, then data. */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	/* Trailing variable-length column array (pre-C99 [1] idiom). */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
413
414
/* Indices of the parity columns at the front of rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* GF(2^8) multiply-by-2 (and by-4) of a single byte. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
440
441
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
/* vdev_raidz_pow2[i] == 2^i in GF(2^8); note entry 255 wraps back to 0x01. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
479
/* vdev_raidz_log2[x] == log base 2 of x in GF(2^8); log2[0] is unused. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
513
514
/*
515
* Multiply a given number by 2 raised to the given power.
516
*/
517
static uint8_t
518
vdev_raidz_exp2(uint8_t a, int exp)
519
{
520
if (a == 0)
521
return (0);
522
523
ASSERT(exp >= 0);
524
ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
525
526
exp += vdev_raidz_log2[a];
527
if (exp > 255)
528
exp -= 255;
529
530
return (vdev_raidz_pow2[exp]);
531
}
532
533
static void
534
vdev_raidz_generate_parity_p(raidz_map_t *rm)
535
{
536
uint64_t *p, *src, ccount, i;
537
uint64_t pcount __unused;
538
int c;
539
540
pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
541
542
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
543
src = rm->rm_col[c].rc_data;
544
p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
545
ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
546
547
if (c == rm->rm_firstdatacol) {
548
ASSERT(ccount == pcount);
549
for (i = 0; i < ccount; i++, src++, p++) {
550
*p = *src;
551
}
552
} else {
553
ASSERT(ccount <= pcount);
554
for (i = 0; i < ccount; i++, src++, p++) {
555
*p ^= *src;
556
}
557
}
558
}
559
}
560
561
/*
 * Generate P (XOR) and Q (GF(2^8) multiply-accumulate) parity into the
 * first two columns.  Short trailing columns are treated as zero-filled.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes P and Q; pad with zeros. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
612
613
/*
 * Generate P, Q and R parity (triple parity) into the first three columns.
 * R uses the multiply-by-4 variant of the Q recurrence.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes all three parities. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
673
674
/*
675
* Generate RAID parity in the first virtual columns according to the number of
676
* parity columns available.
677
*/
678
static void
679
vdev_raidz_generate_parity(raidz_map_t *rm)
680
{
681
switch (rm->rm_firstdatacol) {
682
case 1:
683
vdev_raidz_generate_parity_p(rm);
684
break;
685
case 2:
686
vdev_raidz_generate_parity_pq(rm);
687
break;
688
case 3:
689
vdev_raidz_generate_parity_pqr(rm);
690
break;
691
default:
692
panic("invalid RAID-Z configuration");
693
}
694
}
695
696
/* BEGIN CSTYLED */
697
/*
698
* In the general case of reconstruction, we must solve the system of linear
699
 * equations defined by the coefficients used to generate parity as well as
700
* the contents of the data and parity disks. This can be expressed with
701
* vectors for the original data (D) and the actual data (d) and parity (p)
702
* and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
703
*
704
* __ __ __ __
705
* | | __ __ | p_0 |
706
* | V | | D_0 | | p_m-1 |
707
* | | x | : | = | d_0 |
708
* | I | | D_n-1 | | : |
709
* | | ~~ ~~ | d_n-1 |
710
* ~~ ~~ ~~ ~~
711
*
712
* I is simply a square identity matrix of size n, and V is a vandermonde
713
 * matrix defined by the coefficients we chose for the various parity columns
714
* (1, 2, 4). Note that these values were chosen both for simplicity, speedy
715
* computation as well as linear separability.
716
*
717
* __ __ __ __
718
* | 1 .. 1 1 1 | | p_0 |
719
* | 2^n-1 .. 4 2 1 | __ __ | : |
720
* | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
721
* | 1 .. 0 0 0 | | D_1 | | d_0 |
722
* | 0 .. 0 0 0 | x | D_2 | = | d_1 |
723
* | : : : : | | : | | d_2 |
724
* | 0 .. 1 0 0 | | D_n-1 | | : |
725
* | 0 .. 0 1 0 | ~~ ~~ | : |
726
* | 0 .. 0 0 1 | | d_n-1 |
727
* ~~ ~~ ~~ ~~
728
*
729
* Note that I, V, d, and p are known. To compute D, we must invert the
730
* matrix and use the known data and parity values to reconstruct the unknown
731
* data values. We begin by removing the rows in V|I and d|p that correspond
732
* to failed or missing columns; we then make V|I square (n x n) and d|p
733
* sized n by removing rows corresponding to unused parity from the bottom up
734
* to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
735
* using Gauss-Jordan elimination. In the example below we use m=3 parity
736
* columns, n=8 data columns, with errors in d_1, d_2, and p_1:
737
* __ __
738
* | 1 1 1 1 1 1 1 1 |
739
* | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
740
* | 19 205 116 29 64 16 4 1 | / /
741
* | 1 0 0 0 0 0 0 0 | / /
742
* | 0 1 0 0 0 0 0 0 | <--' /
743
* (V|I) = | 0 0 1 0 0 0 0 0 | <---'
744
* | 0 0 0 1 0 0 0 0 |
745
* | 0 0 0 0 1 0 0 0 |
746
* | 0 0 0 0 0 1 0 0 |
747
* | 0 0 0 0 0 0 1 0 |
748
* | 0 0 0 0 0 0 0 1 |
749
* ~~ ~~
750
* __ __
751
* | 1 1 1 1 1 1 1 1 |
752
* | 128 64 32 16 8 4 2 1 |
753
* | 19 205 116 29 64 16 4 1 |
754
* | 1 0 0 0 0 0 0 0 |
755
* | 0 1 0 0 0 0 0 0 |
756
* (V|I)' = | 0 0 1 0 0 0 0 0 |
757
* | 0 0 0 1 0 0 0 0 |
758
* | 0 0 0 0 1 0 0 0 |
759
* | 0 0 0 0 0 1 0 0 |
760
* | 0 0 0 0 0 0 1 0 |
761
* | 0 0 0 0 0 0 0 1 |
762
* ~~ ~~
763
*
764
* Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
765
* have carefully chosen the seed values 1, 2, and 4 to ensure that this
766
* matrix is not singular.
767
* __ __
768
* | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
769
* | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
770
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
771
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
772
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
773
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
774
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
775
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
776
* ~~ ~~
777
* __ __
778
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
779
* | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
780
* | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
781
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
782
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
783
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
784
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
785
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
786
* ~~ ~~
787
* __ __
788
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
789
* | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
790
* | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
791
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
792
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
793
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
794
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
795
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
796
* ~~ ~~
797
* __ __
798
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
799
* | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
800
* | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
801
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
802
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
803
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
804
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
805
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
806
* ~~ ~~
807
* __ __
808
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
809
* | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
810
* | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
811
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
812
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
813
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
814
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
815
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
816
* ~~ ~~
817
* __ __
818
* | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
819
* | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
820
* | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
821
* | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
822
* | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
823
* | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
824
* | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
825
* | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
826
* ~~ ~~
827
* __ __
828
* | 0 0 1 0 0 0 0 0 |
829
* | 167 100 5 41 159 169 217 208 |
830
* | 166 100 4 40 158 168 216 209 |
831
* (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
832
* | 0 0 0 0 1 0 0 0 |
833
* | 0 0 0 0 0 1 0 0 |
834
* | 0 0 0 0 0 0 1 0 |
835
* | 0 0 0 0 0 0 0 1 |
836
* ~~ ~~
837
*
838
* We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
839
* of the missing data.
840
*
841
* As is apparent from the example above, the only non-trivial rows in the
842
* inverse matrix correspond to the data disks that we're trying to
843
* reconstruct. Indeed, those are the only rows we need as the others would
844
* only be useful for reconstructing data known or assumed to be valid. For
845
* that reason, we only build the coefficients in the rows that correspond to
846
* targeted columns.
847
*/
848
/* END CSTYLED */
849
850
/*
 * Build the rows of the dispersal (Vandermonde) matrix corresponding to
 * the missing parity columns.  map[i] selects the parity flavor
 * (0 = P, 1 = Q, 2 = R); row entries are powers of 2 in GF(2^8).
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start from 2^(map[i]*n) and step down, mod 255. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
879
880
/*
 * Invert the nmissing x nmissing submatrix of interest in place using
 * Gauss-Jordan elimination over GF(2^8).  rows holds the working matrix,
 * invrows receives the inverse rows; used maps matrix columns back to
 * raidz columns (parity entries first, then data).
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
973
974
/*
 * Reconstruct the missing data columns as D = (V|I)'^-1 x (d|p)' using the
 * inverse rows computed by vdev_raidz_matrix_invert().  Multiplication in
 * GF(2^8) is done via the log/antilog tables; results are accumulated by
 * XOR across the n surviving columns listed in used[].
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* Scratch space holding the logs of all inverse-row entries. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	/* Carve the scratch buffer into one row of n logs per missing column. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute logs of the inverse matrix entries. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* GF multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column assigns; later ones XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
1053
1054
/*
 * General-case reconstruction: rebuild up to rm_firstdatacol missing
 * columns using matrix inversion over GF(2^8).
 *
 * rm    - the RAID-Z map to repair
 * tgts  - sorted array of column indices (parity and/or data) to rebuild
 * ntgts - number of entries in tgts
 *
 * Returns a bitmask ("code") of the parity columns that were used for
 * the reconstruction; it is also returned when allocation fails, since
 * the caller uses it only to know which parity was consumed.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	/* Number of data (non-parity) columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Single allocation carved up into the row and inverse-row
	 * matrices plus the list of columns actually used.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The parity columns selected above come first in 'used'... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ...followed by every surviving data column, in order. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}
1163
1164
static int
1165
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1166
{
1167
int tgts[VDEV_RAIDZ_MAXPARITY];
1168
int ntgts;
1169
int i, c;
1170
int code;
1171
int nbadparity, nbaddata;
1172
1173
/*
1174
* The tgts list must already be sorted.
1175
*/
1176
for (i = 1; i < nt; i++) {
1177
ASSERT(t[i] > t[i - 1]);
1178
}
1179
1180
nbadparity = rm->rm_firstdatacol;
1181
nbaddata = rm->rm_cols - nbadparity;
1182
ntgts = 0;
1183
for (i = 0, c = 0; c < rm->rm_cols; c++) {
1184
if (i < nt && c == t[i]) {
1185
tgts[ntgts++] = c;
1186
i++;
1187
} else if (rm->rm_col[c].rc_error != 0) {
1188
tgts[ntgts++] = c;
1189
} else if (c >= rm->rm_firstdatacol) {
1190
nbaddata--;
1191
} else {
1192
nbadparity--;
1193
}
1194
}
1195
1196
ASSERT(ntgts >= nt);
1197
ASSERT(nbaddata >= 0);
1198
ASSERT(nbaddata + nbadparity == ntgts);
1199
1200
code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1201
ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1202
ASSERT(code > 0);
1203
return (code);
1204
}
1205
1206
/*
 * Compute the RAID-Z layout of a logical block and allocate the map that
 * describes it.
 *
 * data       - caller's buffer that will receive/hold the logical data
 * offset     - device-logical byte offset of the block
 * size       - logical size of the block in bytes
 * unit_shift - log2 of the minimum allocation unit (ashift)
 * dcols      - total number of child vdevs (data + parity)
 * nparity    - number of parity columns (1-3)
 *
 * Parity column buffers are allocated here; data columns point into the
 * caller's buffer.  Returns NULL on allocation failure.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* Starting sector number and sector count of the block. */
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	/* First child to hold a piece, and byte offset on each child. */
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* q full rows of data plus r leftover data sectors. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	/* bc = number of "big" columns that carry one extra sector. */
	bc = (r == 0 ? 0 : r + nparity);
	/* Total sectors including parity. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		/* Block fits in a single partial row. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	/* rm_col[] is a flexible-style trailing array sized for scols. */
	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Lay out each column: device index, offset, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped to the next row of children. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get their own buffers; unwind on failure. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	/* First data column points at the start of the caller's buffer... */
	rm->rm_col[c].rc_data = data;

	/* ...and the rest follow contiguously behind it. */
	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1333
1334
static void
1335
vdev_raidz_map_free(raidz_map_t *rm)
1336
{
1337
int c;
1338
1339
for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1340
free(rm->rm_col[c].rc_data);
1341
1342
free(rm);
1343
}
1344
1345
static vdev_t *
1346
vdev_child(vdev_t *pvd, uint64_t devidx)
1347
{
1348
vdev_t *cvd;
1349
1350
STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1351
if (cvd->v_id == devidx)
1352
break;
1353
}
1354
1355
return (cvd);
1356
}
1357
1358
/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	/*
	 * The boot code has no error-injection machinery, so this reduces
	 * to a plain checksum check of the assembled data against the
	 * block pointer.  'size' is unused here.
	 */
	return (zio_checksum_verify(spa, bp, data));
}
1368
1369
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/*
	 * Snapshot each parity column we actually read successfully.
	 * NOTE: the second loop below must use exactly the same skip
	 * condition; otherwise it could read an orig[] slot this loop
	 * never filled in.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	/* Recompute parity from the data columns, in place. */
	vdev_raidz_generate_parity(rm);

	/*
	 * Compare each saved copy with the regenerated parity.  A NULL
	 * orig[c] (allocation failure above) is treated as a mismatch so
	 * we err on the side of reporting a checksum error.
	 */
	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}
1410
1411
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * Returns the parity-usage code of a successful reconstruction, ENOMEM on
 * allocation failure, or 0 if every combination failed the checksum.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore has one extra slot on each side so tgts[-1]/tgts[n] exist. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try 1, then 2, ... additional assumed-bad columns. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/*
		 * One scratch buffer per assumed-bad column, sized by
		 * column 0 (the largest column in the map).
		 */
		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		/* Enumerate every n-combination of error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				/* Mark the guessed-bad columns as such. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/* Advance to the next combination (odometer style). */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The loop above exited with n one past the last attempted size;
	 * step back so the cleanup frees exactly the buffers allocated.
	 */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
1565
1566
/*
 * Read and, if necessary, reconstruct a logical block from a RAID-Z vdev.
 *
 * vd     - the RAID-Z top-level vdev
 * bp     - block pointer used for checksum verification
 * data   - destination buffer for the logical data
 * offset - device-logical byte offset of the block
 * bytes  - logical size of the block
 *
 * Returns 0 on success, ENOMEM/EIO/ECKSUM on failure.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors __unused;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			/* Child absent or unhealthy: record and move on. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Always read data columns; read parity too once any data
		 * column is known missing (rm_missingdata > 0).
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Re-tally error state; we may come back here after a full re-read. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1830
1831