Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/cddl/boot/zfs/zfsimpl.h
105977 views
1
/*-
2
* Copyright (c) 2002 McAfee, Inc.
3
* All rights reserved.
4
*
5
* This software was developed for the FreeBSD Project by Marshall
6
* Kirk McKusick and McAfee Research, the Security Research Division of
7
* McAfee, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as
8
* part of the DARPA CHATS research program
9
*
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
12
* are met:
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in the
17
* documentation and/or other materials provided with the distribution.
18
*
19
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29
* SUCH DAMAGE.
30
*/
31
/*
32
* CDDL HEADER START
33
*
34
* The contents of this file are subject to the terms of the
35
* Common Development and Distribution License (the "License").
36
* You may not use this file except in compliance with the License.
37
*
38
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
39
* or http://www.opensolaris.org/os/licensing.
40
* See the License for the specific language governing permissions
41
* and limitations under the License.
42
*
43
* When distributing Covered Code, include this CDDL HEADER in each
44
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
45
* If applicable, add the following below this CDDL HEADER, with the
46
* fields enclosed by brackets "[]" replaced with your own identifying
47
* information: Portions Copyright [yyyy] [name of copyright owner]
48
*
49
* CDDL HEADER END
50
*/
51
/*
52
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
53
* Use is subject to license terms.
54
*/
55
/*
56
* Copyright 2013 by Saso Kiselkov. All rights reserved.
57
*/
58
/*
59
* Copyright (c) 2020 by Delphix. All rights reserved.
60
*/
61
62
#include <sys/queue.h>

#ifndef _ZFSIMPL_H_
#define	_ZFSIMPL_H_

/* Maximum length of pool, dataset and object names. */
#define	MAXNAMELEN	256

/* Solaris lint annotation macro; expands to nothing in this environment. */
#define	_NOTE(s)
70
71
/*
 * AVL comparator helpers: each evaluates to the sign of the comparison
 * (-1, 0, or 1), the convention expected by AVL compare callbacks.
 */
#define	AVL_ISIGN(a)	(((a) > 0) - ((a) < 0))
#define	AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define	AVL_PCMP(a, b)	\
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

/*
 * boolean_t is supplied by the host headers when NEED_SOLARIS_BOOLEAN is
 * defined; otherwise provide the Solaris-compatible definition here.
 */
#if !defined(NEED_SOLARIS_BOOLEAN)
typedef enum { B_FALSE, B_TRUE } boolean_t;
#endif

/* CRC64 table */
#define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */

/*
 * Macros for various sorts of alignment and rounding when the alignment
 * is known to be a power of 2.
 */
#define	P2ALIGN(x, align)		((x) & -(align))
#define	P2PHASE(x, align)		((x) & ((align) - 1))
#define	P2NPHASE(x, align)		(-(x) & ((align) - 1))
#define	P2ROUNDUP(x, align)		(-(-(x) & -(align)))
#define	P2END(x, align)			(-(~(x) & -(align)))
#define	P2PHASEUP(x, align, phase)	((phase) - (((phase) - (x)) & -(align)))
#define	P2BOUNDARY(off, len, align) \
	(((off) ^ ((off) + (len) - 1)) > (align) - 1)
#define	IS_P2ALIGNED(v, a)	((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)

/*
 * General-purpose 32-bit and 64-bit bitfield encodings.
 */
#define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
#define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))

#define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
#define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)

/* XOR trick: cancel the old field value while encoding the new one. */
#define	BF32_SET(x, low, len, val) \
	((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
#define	BF64_SET(x, low, len, val) \
	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))

/* Get/set with a shift and bias applied ("SB" = shift/bias). */
#define	BF32_GET_SB(x, low, len, shift, bias) \
	((BF32_GET(x, low, len) + (bias)) << (shift))
#define	BF64_GET_SB(x, low, len, shift, bias) \
	((BF64_GET(x, low, len) + (bias)) << (shift))

#define	BF32_SET_SB(x, low, len, shift, bias, val) \
	BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
#define	BF64_SET_SB(x, low, len, shift, bias, val) \
	BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
124
125
/*
 * Macros to reverse byte order.  Each narrower macro masks its operand,
 * so the wider ones compose safely on a 64-bit operand.
 */
#define	BSWAP_8(x)	((x) & 0xff)
#define	BSWAP_16(x)	((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
#define	BSWAP_32(x)	((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
#define	BSWAP_64(x)	((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))

/* Fundamental ZFS block-size limits, expressed as powers of two. */
#define	SPA_MINBLOCKSHIFT	9
#define	SPA_OLDMAXBLOCKSHIFT	17
#define	SPA_MAXBLOCKSHIFT	24
#define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
#define	SPA_OLDMAXBLOCKSIZE	(1ULL << SPA_OLDMAXBLOCKSHIFT)
#define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)

/*
 * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
 * The ASIZE encoding should be at least 64 times larger (6 more bits)
 * to support up to 4-way RAID-Z mirror mode with worst-case gang block
 * overhead, three DVAs per bp, plus one more bit in case we do anything
 * else that expands the ASIZE.
 */
#define	SPA_LSIZEBITS	16	/* LSIZE up to 32M (2^16 * 512) */
#define	SPA_PSIZEBITS	16	/* PSIZE up to 32M (2^16 * 512) */
#define	SPA_ASIZEBITS	24	/* ASIZE up to 64 times larger */
150
151
/*
 * All SPA data is represented by 128-bit data virtual addresses (DVAs).
 * The members of the dva_t should be considered opaque outside the SPA.
 */
typedef struct dva {
	uint64_t	dva_word[2];
} dva_t;

/*
 * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
 */
typedef struct zio_cksum {
	uint64_t	zc_word[4];
} zio_cksum_t;

/*
 * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
 * secret and is suitable for use in MAC algorithms as the key.
 */
typedef struct zio_cksum_salt {
	uint8_t		zcs_bytes[32];
} zio_cksum_salt_t;
173
174
/*
175
* Each block is described by its DVAs, time of birth, checksum, etc.
176
* The word-by-word, bit-by-bit layout of the blkptr is as follows:
177
*
178
* 64 56 48 40 32 24 16 8 0
179
* +-------+-------+-------+-------+-------+-------+-------+-------+
180
* 0 | vdev1 | GRID | ASIZE |
181
* +-------+-------+-------+-------+-------+-------+-------+-------+
182
* 1 |G| offset1 |
183
* +-------+-------+-------+-------+-------+-------+-------+-------+
184
* 2 | vdev2 | GRID | ASIZE |
185
* +-------+-------+-------+-------+-------+-------+-------+-------+
186
* 3 |G| offset2 |
187
* +-------+-------+-------+-------+-------+-------+-------+-------+
188
* 4 | vdev3 | GRID | ASIZE |
189
* +-------+-------+-------+-------+-------+-------+-------+-------+
190
* 5 |G| offset3 |
191
* +-------+-------+-------+-------+-------+-------+-------+-------+
192
* 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
193
* +-------+-------+-------+-------+-------+-------+-------+-------+
194
* 7 | padding |
195
* +-------+-------+-------+-------+-------+-------+-------+-------+
196
* 8 | padding |
197
* +-------+-------+-------+-------+-------+-------+-------+-------+
198
* 9 | physical birth txg |
199
* +-------+-------+-------+-------+-------+-------+-------+-------+
200
* a | logical birth txg |
201
* +-------+-------+-------+-------+-------+-------+-------+-------+
202
* b | fill count |
203
* +-------+-------+-------+-------+-------+-------+-------+-------+
204
* c | checksum[0] |
205
* +-------+-------+-------+-------+-------+-------+-------+-------+
206
* d | checksum[1] |
207
* +-------+-------+-------+-------+-------+-------+-------+-------+
208
* e | checksum[2] |
209
* +-------+-------+-------+-------+-------+-------+-------+-------+
210
* f | checksum[3] |
211
* +-------+-------+-------+-------+-------+-------+-------+-------+
212
*
213
* Legend:
214
*
215
* vdev virtual device ID
216
* offset offset into virtual device
217
* LSIZE logical size
218
* PSIZE physical size (after compression)
219
* ASIZE allocated size (including RAID-Z parity and gang block headers)
220
* GRID RAID-Z layout information (reserved for future use)
221
* cksum checksum function
222
* comp compression function
223
* G gang block indicator
224
* B byteorder (endianness)
225
* D dedup
226
* X encryption (on version 30, which is not supported)
227
* E blkptr_t contains embedded data (see below)
228
* lvl level of indirection
229
* type DMU object type
230
* phys birth txg of block allocation; zero if same as logical birth txg
231
* log. birth transaction group in which the block was logically born
232
* fill count number of non-zero blocks under this bp
233
* checksum[4] 256-bit checksum of the data this bp describes
234
*/
235
236
/*
237
* "Embedded" blkptr_t's don't actually point to a block, instead they
238
* have a data payload embedded in the blkptr_t itself. See the comment
239
* in blkptr.c for more details.
240
*
241
* The blkptr_t is laid out as follows:
242
*
243
* 64 56 48 40 32 24 16 8 0
244
* +-------+-------+-------+-------+-------+-------+-------+-------+
245
* 0 | payload |
246
* 1 | payload |
247
* 2 | payload |
248
* 3 | payload |
249
* 4 | payload |
250
* 5 | payload |
251
* +-------+-------+-------+-------+-------+-------+-------+-------+
252
* 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
253
* +-------+-------+-------+-------+-------+-------+-------+-------+
254
* 7 | payload |
255
* 8 | payload |
256
* 9 | payload |
257
* +-------+-------+-------+-------+-------+-------+-------+-------+
258
* a | logical birth txg |
259
* +-------+-------+-------+-------+-------+-------+-------+-------+
260
* b | payload |
261
* c | payload |
262
* d | payload |
263
* e | payload |
264
* f | payload |
265
* +-------+-------+-------+-------+-------+-------+-------+-------+
266
*
267
* Legend:
268
*
269
* payload contains the embedded data
270
* B (byteorder) byteorder (endianness)
271
* D (dedup) padding (set to zero)
272
* X encryption (set to zero; see above)
273
* E (embedded) set to one
274
* lvl indirection level
275
* type DMU object type
276
* etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
277
* comp compression function of payload
278
* PSIZE size of payload after compression, in bytes
279
* LSIZE logical size of payload, in bytes
280
* note that 25 bits is enough to store the largest
281
* "normal" BP's LSIZE (2^16 * 2^9) in bytes
282
* log. birth transaction group in which the block was logically born
283
*
284
* Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
285
* bp's they are stored in units of SPA_MINBLOCKSHIFT.
286
* Generally, the generic BP_GET_*() macros can be used on embedded BP's.
287
* The B, D, X, lvl, type, and comp fields are stored the same as with normal
288
* BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
289
* be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
290
* other macros, as they assert that they are only used on BP's of the correct
291
* "embedded-ness".
292
*/
293
294
/*
 * Accessors for the fields of an embedded block pointer.  Each asserts
 * that the bp really is embedded (E bit set) before touching blk_prop.
 * NOTE(review): ASSERT and BP_IS_EMBEDDED are provided elsewhere in this
 * environment.
 */
#define	BPE_GET_ETYPE(bp)	\
	(ASSERT(BP_IS_EMBEDDED(bp)), \
	BF64_GET((bp)->blk_prop, 40, 8))
#define	BPE_SET_ETYPE(bp, t)	do { \
	ASSERT(BP_IS_EMBEDDED(bp)); \
	BF64_SET((bp)->blk_prop, 40, 8, t); \
_NOTE(CONSTCOND) } while (0)

/* Embedded LSIZE/PSIZE are stored in bytes, biased by one (no shift). */
#define	BPE_GET_LSIZE(bp)	\
	(ASSERT(BP_IS_EMBEDDED(bp)), \
	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
#define	BPE_SET_LSIZE(bp, x)	do { \
	ASSERT(BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)

#define	BPE_GET_PSIZE(bp)	\
	(ASSERT(BP_IS_EMBEDDED(bp)), \
	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
#define	BPE_SET_PSIZE(bp, x)	do { \
	ASSERT(BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
317
318
/* How the payload of an embedded block pointer is to be interpreted. */
typedef enum bp_embedded_type {
	BP_EMBEDDED_TYPE_DATA,		/* payload is file data */
	BP_EMBEDDED_TYPE_RESERVED,	/* Reserved for an unintegrated feature. */
	NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
} bp_embedded_type_t;
323
324
/* An embedded bp carries BPE_NUM_WORDS 64-bit words of payload. */
#define	BPE_NUM_WORDS	14
#define	BPE_PAYLOAD_SIZE	(BPE_NUM_WORDS * sizeof (uint64_t))
/* Every bp word except blk_prop and blk_birth holds payload. */
#define	BPE_IS_PAYLOADWORD(bp, wp) \
	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)

#define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes */
#define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp */
331
332
typedef struct blkptr {
333
dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
334
uint64_t blk_prop; /* size, compression, type, etc */
335
uint64_t blk_pad[2]; /* Extra space for the future */
336
uint64_t blk_phys_birth; /* txg when block was allocated */
337
uint64_t blk_birth; /* transaction group at birth */
338
uint64_t blk_fill; /* fill count */
339
zio_cksum_t blk_cksum; /* 256-bit checksum */
340
} blkptr_t;
341
342
/*
 * Macros to get and set fields in a DVA.  ASIZE and OFFSET are stored
 * on disk in units of 512-byte sectors (SPA_MINBLOCKSHIFT); these
 * accessors convert to and from bytes.
 */
#define	DVA_GET_ASIZE(dva)	\
	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
#define	DVA_SET_ASIZE(dva, x)	\
	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
	SPA_MINBLOCKSHIFT, 0, x)

#define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
#define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)

#define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, 32)
#define	DVA_SET_VDEV(dva, x)	BF64_SET((dva)->dva_word[0], 32, 32, x)

#define	DVA_GET_OFFSET(dva)	\
	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
#define	DVA_SET_OFFSET(dva, x)	\
	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)

/* Bit 63 of word 1 is the gang-block indicator ("G" in the diagram). */
#define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
#define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
364
365
/*
 * Macros to get and set fields in a bp.  For an embedded bp, LSIZE comes
 * from the embedded encoding (0 for non-data payloads); otherwise
 * LSIZE/PSIZE are stored biased by one in units of 512-byte sectors.
 */
#define	BP_GET_LSIZE(bp)	\
	(BP_IS_EMBEDDED(bp) ?	\
	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define	BP_SET_LSIZE(bp, x)	do { \
	ASSERT(!BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, \
	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)

/*
 * PSIZE occupies bits 16..31.  Use SPA_PSIZEBITS here (the original used
 * SPA_LSIZEBITS, which was only correct because the two happen to be equal).
 */
#define	BP_GET_PSIZE(bp)	\
	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	BP_SET_PSIZE(bp, x)	\
	BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)

#define	BP_GET_COMPRESS(bp)	BF64_GET((bp)->blk_prop, 32, 7)
#define	BP_SET_COMPRESS(bp, x)	BF64_SET((bp)->blk_prop, 32, 7, x)

#define	BP_GET_CHECKSUM(bp)	BF64_GET((bp)->blk_prop, 40, 8)
#define	BP_SET_CHECKSUM(bp, x)	BF64_SET((bp)->blk_prop, 40, 8, x)

#define	BP_GET_TYPE(bp)		BF64_GET((bp)->blk_prop, 48, 8)
#define	BP_SET_TYPE(bp, x)	BF64_SET((bp)->blk_prop, 48, 8, x)

#define	BP_GET_LEVEL(bp)	BF64_GET((bp)->blk_prop, 56, 5)
#define	BP_SET_LEVEL(bp, x)	BF64_SET((bp)->blk_prop, 56, 5, x)

/* The E bit (bit 39) marks a bp whose payload is embedded in the bp. */
#define	BP_IS_EMBEDDED(bp)	BF64_GET((bp)->blk_prop, 39, 1)

#define	BP_GET_DEDUP(bp)	BF64_GET((bp)->blk_prop, 62, 1)
#define	BP_SET_DEDUP(bp, x)	BF64_SET((bp)->blk_prop, 62, 1, x)

#define	BP_GET_BYTEORDER(bp)	BF64_GET((bp)->blk_prop, 63, 1)
#define	BP_SET_BYTEORDER(bp, x)	BF64_SET((bp)->blk_prop, 63, 1, x)
399
400
/* Physical birth txg falls back to the logical birth txg when zero. */
#define	BP_PHYSICAL_BIRTH(bp)	\
	((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)

/* Store phys birth as 0 on disk when it equals the logical birth txg. */
#define	BP_SET_BIRTH(bp, logical, physical)	\
{ \
	ASSERT(!BP_IS_EMBEDDED(bp)); \
	(bp)->blk_birth = (logical); \
	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
}

/* Embedded bps always report a fill count of 1. */
#define	BP_GET_FILL(bp)	\
	((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)

#define	BP_SET_FILL(bp, fill)	\
{ \
	(bp)->blk_fill = fill; \
}
417
418
/* Total allocated size: sum of the ASIZEs of all three DVAs. */
#define	BP_GET_ASIZE(bp)	\
	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
	DVA_GET_ASIZE(&(bp)->blk_dva[2]))

/*
 * Uncompressed size: metadata and indirect blocks use PSIZE, data blocks
 * use LSIZE.  The stray trailing semicolon in the original definition has
 * been removed; it made the macro unusable inside expressions.
 */
#define	BP_GET_UCSIZE(bp)	\
	((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))

/* Number of DVAs actually in use (nonzero ASIZE). */
#define	BP_GET_NDVAS(bp)	\
	(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))

#define	DVA_EQUAL(dva1, dva2)	\
	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
	(dva1)->dva_word[0] == (dva2)->dva_word[0])

/* Branch-free 256-bit checksum comparison: OR of the word differences. */
#define	ZIO_CHECKSUM_EQUAL(zc1, zc2) \
	(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
	((zc1).zc_word[1] - (zc2).zc_word[1]) | \
	((zc1).zc_word[2] - (zc2).zc_word[2]) | \
	((zc1).zc_word[3] - (zc2).zc_word[3])))
440
441
442
#define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)

#define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
{ \
	(zcp)->zc_word[0] = w0; \
	(zcp)->zc_word[1] = w1; \
	(zcp)->zc_word[2] = w2; \
	(zcp)->zc_word[3] = w3; \
}

/* The first DVA is the bp's identity for equality and gang checks. */
#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
#define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
#define	DVA_IS_EMPTY(dva)	((dva)->dva_word[0] == 0ULL &&	\
				(dva)->dva_word[1] == 0ULL)
/* A hole is a bp whose identity DVA is entirely zero. */
#define	BP_IS_HOLE(bp)		DVA_IS_EMPTY(BP_IDENTITY(bp))
#define	BP_IS_OLDER(bp, txg)	(!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
458
459
/* Zero every field of a bp, including the pad words and checksum. */
#define	BP_ZERO(bp)	\
{ \
	(bp)->blk_dva[0].dva_word[0] = 0; \
	(bp)->blk_dva[0].dva_word[1] = 0; \
	(bp)->blk_dva[1].dva_word[0] = 0; \
	(bp)->blk_dva[1].dva_word[1] = 0; \
	(bp)->blk_dva[2].dva_word[0] = 0; \
	(bp)->blk_dva[2].dva_word[1] = 0; \
	(bp)->blk_prop = 0; \
	(bp)->blk_pad[0] = 0; \
	(bp)->blk_pad[1] = 0; \
	(bp)->blk_phys_birth = 0; \
	(bp)->blk_birth = 0; \
	(bp)->blk_fill = 0; \
	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}

/* On-disk byteorder flag: 0 = big-endian, 1 = little-endian host. */
#if BYTE_ORDER == _BIG_ENDIAN
#define	ZFS_HOST_BYTEORDER	(0ULL)
#else
#define	ZFS_HOST_BYTEORDER	(1ULL)
#endif

#define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)

/*
 * NOTE(review): redundant duplicate definitions of BPE_NUM_WORDS,
 * BPE_PAYLOAD_SIZE and BPE_IS_PAYLOADWORD that appeared here were removed;
 * the canonical definitions appear earlier in this file.
 */
487
488
/*
489
* Embedded checksum
490
*/
491
#define ZEC_MAGIC 0x210da7ab10c7a11ULL
492
493
typedef struct zio_eck {
494
uint64_t zec_magic; /* for validation, endianness */
495
zio_cksum_t zec_cksum; /* 256-bit checksum */
496
} zio_eck_t;
497
498
/*
 * Gang block headers are self-checksumming and contain an array
 * of block pointers.
 */
#define	SPA_OLD_GANGBLOCKSIZE	SPA_MINBLOCKSIZE

#define	VDEV_RAIDZ_MAXPARITY	3

#define	VDEV_PAD_SIZE		(8 << 10)
/*
 * 2 padding areas (vl_pad1 and vl_be) to skip.  Parenthesized so the
 * expansion survives surrounding operators (e.g. x / VDEV_SKIP_SIZE).
 */
#define	VDEV_SKIP_SIZE		(VDEV_PAD_SIZE * 2)
#define	VDEV_PHYS_SIZE		(112 << 10)
#define	VDEV_UBERBLOCK_RING	(128 << 10)

/*
 * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
 * ring when MMP is enabled.
 */
#define	MMP_BLOCKS_PER_LABEL	1

/* The largest uberblock we support is 8k. */
#define	MAX_UBERBLOCK_SHIFT	(13)
#define	VDEV_UBERBLOCK_SHIFT(vd) \
	MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT)
#define	VDEV_UBERBLOCK_COUNT(vd) \
	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define	VDEV_UBERBLOCK_OFFSET(vd, n) \
	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
#define	VDEV_UBERBLOCK_SIZE(vd)	(1ULL << VDEV_UBERBLOCK_SHIFT(vd))

/* Same computation from a raw ashift, without a vdev at hand. */
#define	ASHIFT_UBERBLOCK_SHIFT(ashift) \
	MIN(MAX(ashift, UBERBLOCK_SHIFT), \
	MAX_UBERBLOCK_SHIFT)
#define	ASHIFT_UBERBLOCK_SIZE(ashift) \
	(1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift))
533
534
typedef struct vdev_phys {
535
char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
536
zio_eck_t vp_zbt;
537
} vdev_phys_t;
538
539
/* Format of the boot environment payload stored in each label. */
typedef enum vbe_vers {
	/* The bootenv file is stored as ascii text in the envblock */
	VB_RAW = 0,

	/*
	 * The bootenv file is converted to an nvlist and then packed into the
	 * envblock.
	 */
	VB_NVLIST = 1
} vbe_vers_t;
549
550
typedef struct vdev_boot_envblock {
551
uint64_t vbe_version;
552
char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
553
sizeof (zio_eck_t)];
554
zio_eck_t vbe_zbt;
555
} vdev_boot_envblock_t;
556
557
_Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE,
558
"bad size for vdev_boot_envblock_t");
559
560
typedef struct vdev_label {
561
char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
562
vdev_boot_envblock_t vl_be; /* 8K */
563
vdev_phys_t vl_vdev_phys; /* 112K */
564
char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
565
} vdev_label_t; /* 256K total */
566
567
/*
 * vdev_dirty() flags
 */
#define	VDD_METASLAB	0x01
#define	VDD_DTL		0x02

/*
 * Size and offset of embedded boot loader region on each label.
 * The total size of the first two labels plus the boot area is 4MB.
 */
#define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
#define	VDEV_BOOT_SIZE		(7ULL << 19)	/* 3.5M */

/*
 * Size of label regions at the start and end of each leaf device.
 */
#define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
#define	VDEV_LABEL_END_SIZE	(2 * sizeof (vdev_label_t))
#define	VDEV_LABELS		4
586
587
/* Checksum algorithms; the order fixes the on-disk numbering. */
enum zio_checksum {
	ZIO_CHECKSUM_INHERIT = 0,
	ZIO_CHECKSUM_ON,
	ZIO_CHECKSUM_OFF,
	ZIO_CHECKSUM_LABEL,
	ZIO_CHECKSUM_GANG_HEADER,
	ZIO_CHECKSUM_ZILOG,
	ZIO_CHECKSUM_FLETCHER_2,
	ZIO_CHECKSUM_FLETCHER_4,
	ZIO_CHECKSUM_SHA256,
	ZIO_CHECKSUM_ZILOG2,
	ZIO_CHECKSUM_NOPARITY,
	ZIO_CHECKSUM_SHA512,
	ZIO_CHECKSUM_SKEIN,
	ZIO_CHECKSUM_EDONR,
	ZIO_CHECKSUM_BLAKE3,
	ZIO_CHECKSUM_FUNCTIONS
};

/* checksum=on resolves to fletcher4; the property default is "on". */
#define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
#define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
608
609
/* Compression algorithms; the order fixes the on-disk numbering. */
enum zio_compress {
	ZIO_COMPRESS_INHERIT = 0,
	ZIO_COMPRESS_ON,
	ZIO_COMPRESS_OFF,
	ZIO_COMPRESS_LZJB,
	ZIO_COMPRESS_EMPTY,
	ZIO_COMPRESS_GZIP_1,
	ZIO_COMPRESS_GZIP_2,
	ZIO_COMPRESS_GZIP_3,
	ZIO_COMPRESS_GZIP_4,
	ZIO_COMPRESS_GZIP_5,
	ZIO_COMPRESS_GZIP_6,
	ZIO_COMPRESS_GZIP_7,
	ZIO_COMPRESS_GZIP_8,
	ZIO_COMPRESS_GZIP_9,
	ZIO_COMPRESS_ZLE,
	ZIO_COMPRESS_LZ4,
	ZIO_COMPRESS_ZSTD,
	ZIO_COMPRESS_FUNCTIONS
};
629
630
/*
 * Zstd compression levels.  Positive levels 1..19 map directly; "fast"
 * (negative) levels are encoded above ZIO_ZSTD_LEVEL_RESERVE to leave
 * room for new positive levels.
 */
enum zio_zstd_levels {
	ZIO_ZSTD_LEVEL_INHERIT = 0,
	ZIO_ZSTD_LEVEL_1,
#define	ZIO_ZSTD_LEVEL_MIN	ZIO_ZSTD_LEVEL_1
	ZIO_ZSTD_LEVEL_2,
	ZIO_ZSTD_LEVEL_3,
#define	ZIO_ZSTD_LEVEL_DEFAULT	ZIO_ZSTD_LEVEL_3
	ZIO_ZSTD_LEVEL_4,
	ZIO_ZSTD_LEVEL_5,
	ZIO_ZSTD_LEVEL_6,
	ZIO_ZSTD_LEVEL_7,
	ZIO_ZSTD_LEVEL_8,
	ZIO_ZSTD_LEVEL_9,
	ZIO_ZSTD_LEVEL_10,
	ZIO_ZSTD_LEVEL_11,
	ZIO_ZSTD_LEVEL_12,
	ZIO_ZSTD_LEVEL_13,
	ZIO_ZSTD_LEVEL_14,
	ZIO_ZSTD_LEVEL_15,
	ZIO_ZSTD_LEVEL_16,
	ZIO_ZSTD_LEVEL_17,
	ZIO_ZSTD_LEVEL_18,
	ZIO_ZSTD_LEVEL_19,
#define	ZIO_ZSTD_LEVEL_MAX	ZIO_ZSTD_LEVEL_19
	ZIO_ZSTD_LEVEL_RESERVE = 101, /* Leave room for new positive levels */
	ZIO_ZSTD_LEVEL_FAST, /* Fast levels are negative */
	ZIO_ZSTD_LEVEL_FAST_1,
#define	ZIO_ZSTD_LEVEL_FAST_DEFAULT	ZIO_ZSTD_LEVEL_FAST_1
	ZIO_ZSTD_LEVEL_FAST_2,
	ZIO_ZSTD_LEVEL_FAST_3,
	ZIO_ZSTD_LEVEL_FAST_4,
	ZIO_ZSTD_LEVEL_FAST_5,
	ZIO_ZSTD_LEVEL_FAST_6,
	ZIO_ZSTD_LEVEL_FAST_7,
	ZIO_ZSTD_LEVEL_FAST_8,
	ZIO_ZSTD_LEVEL_FAST_9,
	ZIO_ZSTD_LEVEL_FAST_10,
	ZIO_ZSTD_LEVEL_FAST_20,
	ZIO_ZSTD_LEVEL_FAST_30,
	ZIO_ZSTD_LEVEL_FAST_40,
	ZIO_ZSTD_LEVEL_FAST_50,
	ZIO_ZSTD_LEVEL_FAST_60,
	ZIO_ZSTD_LEVEL_FAST_70,
	ZIO_ZSTD_LEVEL_FAST_80,
	ZIO_ZSTD_LEVEL_FAST_90,
	ZIO_ZSTD_LEVEL_FAST_100,
	ZIO_ZSTD_LEVEL_FAST_500,
	ZIO_ZSTD_LEVEL_FAST_1000,
#define	ZIO_ZSTD_LEVEL_FAST_MAX	ZIO_ZSTD_LEVEL_FAST_1000
	ZIO_ZSTD_LEVEL_AUTO = 251, /* Reserved for future use */
	ZIO_ZSTD_LEVEL_LEVELS
};

/* compression=on resolves to lzjb; the property default is "off". */
#define	ZIO_COMPRESS_ON_VALUE	ZIO_COMPRESS_LZJB
#define	ZIO_COMPRESS_DEFAULT	ZIO_COMPRESS_OFF
685
686
/*
 * On-disk version number.
 */
#define	SPA_VERSION_1			1ULL
#define	SPA_VERSION_2			2ULL
#define	SPA_VERSION_3			3ULL
#define	SPA_VERSION_4			4ULL
#define	SPA_VERSION_5			5ULL
#define	SPA_VERSION_6			6ULL
#define	SPA_VERSION_7			7ULL
#define	SPA_VERSION_8			8ULL
#define	SPA_VERSION_9			9ULL
#define	SPA_VERSION_10			10ULL
#define	SPA_VERSION_11			11ULL
#define	SPA_VERSION_12			12ULL
#define	SPA_VERSION_13			13ULL
#define	SPA_VERSION_14			14ULL
#define	SPA_VERSION_15			15ULL
#define	SPA_VERSION_16			16ULL
#define	SPA_VERSION_17			17ULL
#define	SPA_VERSION_18			18ULL
#define	SPA_VERSION_19			19ULL
#define	SPA_VERSION_20			20ULL
#define	SPA_VERSION_21			21ULL
#define	SPA_VERSION_22			22ULL
#define	SPA_VERSION_23			23ULL
#define	SPA_VERSION_24			24ULL
#define	SPA_VERSION_25			25ULL
#define	SPA_VERSION_26			26ULL
#define	SPA_VERSION_27			27ULL
#define	SPA_VERSION_28			28ULL
#define	SPA_VERSION_5000		5000ULL

/*
 * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
 * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
 * and do the appropriate changes. Also bump the version number in
 * usr/src/grub/capability.
 */
#define	SPA_VERSION			SPA_VERSION_5000
#define	SPA_VERSION_STRING		"5000"

/*
 * Symbolic names for the changes that caused a SPA_VERSION switch.
 * Used in the code when checking for presence or absence of a feature.
 * Feel free to define multiple symbolic names for each version if there
 * were multiple changes to on-disk structures during that version.
 *
 * NOTE: When checking the current SPA_VERSION in your code, be sure
 *	to use spa_version() since it reports the version of the
 *	last synced uberblock.  Checking the in-flight version can
 *	be dangerous in some cases.
 */
#define	SPA_VERSION_INITIAL		SPA_VERSION_1
#define	SPA_VERSION_DITTO_BLOCKS	SPA_VERSION_2
#define	SPA_VERSION_SPARES		SPA_VERSION_3
#define	SPA_VERSION_RAID6		SPA_VERSION_3
#define	SPA_VERSION_BPLIST_ACCOUNT	SPA_VERSION_3
#define	SPA_VERSION_RAIDZ_DEFLATE	SPA_VERSION_3
#define	SPA_VERSION_DNODE_BYTES		SPA_VERSION_3
#define	SPA_VERSION_ZPOOL_HISTORY	SPA_VERSION_4
#define	SPA_VERSION_GZIP_COMPRESSION	SPA_VERSION_5
#define	SPA_VERSION_BOOTFS		SPA_VERSION_6
#define	SPA_VERSION_SLOGS		SPA_VERSION_7
#define	SPA_VERSION_DELEGATED_PERMS	SPA_VERSION_8
#define	SPA_VERSION_FUID		SPA_VERSION_9
#define	SPA_VERSION_REFRESERVATION	SPA_VERSION_9
#define	SPA_VERSION_REFQUOTA		SPA_VERSION_9
#define	SPA_VERSION_UNIQUE_ACCURATE	SPA_VERSION_9
#define	SPA_VERSION_L2CACHE		SPA_VERSION_10
#define	SPA_VERSION_NEXT_CLONES		SPA_VERSION_11
#define	SPA_VERSION_ORIGIN		SPA_VERSION_11
#define	SPA_VERSION_DSL_SCRUB		SPA_VERSION_11
#define	SPA_VERSION_SNAP_PROPS		SPA_VERSION_12
#define	SPA_VERSION_USED_BREAKDOWN	SPA_VERSION_13
#define	SPA_VERSION_PASSTHROUGH_X	SPA_VERSION_14
#define	SPA_VERSION_USERSPACE		SPA_VERSION_15
#define	SPA_VERSION_STMF_PROP		SPA_VERSION_16
#define	SPA_VERSION_RAIDZ3		SPA_VERSION_17
#define	SPA_VERSION_USERREFS		SPA_VERSION_18
#define	SPA_VERSION_HOLES		SPA_VERSION_19
#define	SPA_VERSION_ZLE_COMPRESSION	SPA_VERSION_20
#define	SPA_VERSION_DEDUP		SPA_VERSION_21
#define	SPA_VERSION_RECVD_PROPS		SPA_VERSION_22
#define	SPA_VERSION_SLIM_ZIL		SPA_VERSION_23
#define	SPA_VERSION_SA			SPA_VERSION_24
#define	SPA_VERSION_SCAN		SPA_VERSION_25
#define	SPA_VERSION_DIR_CLONES		SPA_VERSION_26
#define	SPA_VERSION_DEADLISTS		SPA_VERSION_26
#define	SPA_VERSION_FAST_SNAP		SPA_VERSION_27
#define	SPA_VERSION_MULTI_REPLACE	SPA_VERSION_28
#define	SPA_VERSION_BEFORE_FEATURES	SPA_VERSION_28
#define	SPA_VERSION_FEATURES		SPA_VERSION_5000

/*
 * Supported versions: every legacy version up to 28, plus the feature-flags
 * sentinel 5000.  Versions 29..4999 were never valid on disk.
 */
#define	SPA_VERSION_IS_SUPPORTED(v) \
	(((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
	((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
783
784
/*
 * The following are configuration names used in the nvlist describing a pool's
 * configuration.
 */
#define	ZPOOL_CONFIG_VERSION		"version"
#define	ZPOOL_CONFIG_POOL_NAME		"name"
#define	ZPOOL_CONFIG_POOL_STATE		"state"
#define	ZPOOL_CONFIG_POOL_TXG		"txg"
#define	ZPOOL_CONFIG_POOL_GUID		"pool_guid"
#define	ZPOOL_CONFIG_CREATE_TXG		"create_txg"
#define	ZPOOL_CONFIG_TOP_GUID		"top_guid"
#define	ZPOOL_CONFIG_VDEV_TREE		"vdev_tree"
#define	ZPOOL_CONFIG_TYPE		"type"
#define	ZPOOL_CONFIG_CHILDREN		"children"
#define	ZPOOL_CONFIG_ID			"id"
#define	ZPOOL_CONFIG_GUID		"guid"
#define	ZPOOL_CONFIG_INDIRECT_OBJECT	"com.delphix:indirect_object"
#define	ZPOOL_CONFIG_INDIRECT_BIRTHS	"com.delphix:indirect_births"
#define	ZPOOL_CONFIG_PREV_INDIRECT_VDEV	"com.delphix:prev_indirect_vdev"
#define	ZPOOL_CONFIG_PATH		"path"
#define	ZPOOL_CONFIG_DEVID		"devid"
#define	ZPOOL_CONFIG_METASLAB_ARRAY	"metaslab_array"
#define	ZPOOL_CONFIG_METASLAB_SHIFT	"metaslab_shift"
#define	ZPOOL_CONFIG_ASHIFT		"ashift"
#define	ZPOOL_CONFIG_ASIZE		"asize"
#define	ZPOOL_CONFIG_DTL		"DTL"
#define	ZPOOL_CONFIG_STATS		"stats"
#define	ZPOOL_CONFIG_WHOLE_DISK		"whole_disk"
#define	ZPOOL_CONFIG_ERRCOUNT		"error_count"
#define	ZPOOL_CONFIG_NOT_PRESENT	"not_present"
#define	ZPOOL_CONFIG_SPARES		"spares"
#define	ZPOOL_CONFIG_IS_SPARE		"is_spare"
#define	ZPOOL_CONFIG_NPARITY		"nparity"
#define	ZPOOL_CONFIG_HOSTID		"hostid"
#define	ZPOOL_CONFIG_HOSTNAME		"hostname"
#define	ZPOOL_CONFIG_IS_LOG		"is_log"
#define	ZPOOL_CONFIG_TIMESTAMP		"timestamp" /* not stored on disk */
#define	ZPOOL_CONFIG_FEATURES_FOR_READ	"features_for_read"
#define	ZPOOL_CONFIG_VDEV_CHILDREN	"vdev_children"

/*
 * The persistent vdev state is stored as separate values rather than a single
 * 'vdev_state' entry. This is because a device can be in multiple states, such
 * as offline and degraded.
 */
#define	ZPOOL_CONFIG_OFFLINE		"offline"
#define	ZPOOL_CONFIG_FAULTED		"faulted"
#define	ZPOOL_CONFIG_DEGRADED		"degraded"
#define	ZPOOL_CONFIG_REMOVED		"removed"
#define	ZPOOL_CONFIG_FRU		"fru"
#define	ZPOOL_CONFIG_AUX_STATE		"aux_state"

/* Vdev type names as stored in the ZPOOL_CONFIG_TYPE nvlist entry. */
#define	VDEV_TYPE_ROOT			"root"
#define	VDEV_TYPE_MIRROR		"mirror"
#define	VDEV_TYPE_REPLACING		"replacing"
#define	VDEV_TYPE_RAIDZ			"raidz"
#define	VDEV_TYPE_DISK			"disk"
#define	VDEV_TYPE_FILE			"file"
#define	VDEV_TYPE_MISSING		"missing"
#define	VDEV_TYPE_HOLE			"hole"
#define	VDEV_TYPE_SPARE			"spare"
#define	VDEV_TYPE_LOG			"log"
#define	VDEV_TYPE_L2CACHE		"l2cache"
#define	VDEV_TYPE_INDIRECT		"indirect"
848
849
/*
 * This is needed in userland to report the minimum necessary device size.
 */
#define	SPA_MINDEVSIZE		(64ULL << 20)

/*
 * The location of the pool configuration repository, shared between kernel and
 * userland.
 */
#define	ZPOOL_CACHE		"/boot/zfs/zpool.cache"
859
860
/*
861
* vdev states are ordered from least to most healthy.
862
* A vdev that's CANT_OPEN or below is considered unusable.
863
*/
864
typedef enum vdev_state {
865
VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
866
VDEV_STATE_CLOSED, /* Not currently open */
867
VDEV_STATE_OFFLINE, /* Not allowed to open */
868
VDEV_STATE_REMOVED, /* Explicitly removed from system */
869
VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
870
VDEV_STATE_FAULTED, /* External request to fault device */
871
VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
872
VDEV_STATE_HEALTHY /* Presumed good */
873
} vdev_state_t;
874
875
/*
876
* vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
877
* of the vdev stats structure uses these constants to distinguish why.
878
*/
879
typedef enum vdev_aux {
880
VDEV_AUX_NONE, /* no error */
881
VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
882
VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
883
VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
884
VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
885
VDEV_AUX_TOO_SMALL, /* vdev size is too small */
886
VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
887
VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
888
VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
889
VDEV_AUX_SPARED /* hot spare used in another pool */
890
} vdev_aux_t;
891
892
/*
893
* pool state. The following states are written to disk as part of the normal
894
* SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are
895
* software abstractions used at various levels to communicate pool state.
896
*/
897
typedef enum pool_state {
898
POOL_STATE_ACTIVE = 0, /* In active use */
899
POOL_STATE_EXPORTED, /* Explicitly exported */
900
POOL_STATE_DESTROYED, /* Explicitly destroyed */
901
POOL_STATE_SPARE, /* Reserved for hot spare use */
902
POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
903
POOL_STATE_UNAVAIL, /* Internal libzfs state */
904
POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
905
} pool_state_t;
906
907
/*
908
* The uberblock version is incremented whenever an incompatible on-disk
909
* format change is made to the SPA, DMU, or ZAP.
910
*
911
* Note: the first two fields should never be moved. When a storage pool
912
* is opened, the uberblock must be read off the disk before the version
913
* can be checked. If the ub_version field is moved, we may not detect
914
* version mismatch. If the ub_magic field is moved, applications that
915
* expect the magic number in the first word won't work.
916
*/
917
#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
918
#define UBERBLOCK_SHIFT 10 /* up to 1K */
919
920
#define MMP_MAGIC 0xa11cea11 /* all-see-all */
921
922
#define MMP_INTERVAL_VALID_BIT 0x01
923
#define MMP_SEQ_VALID_BIT 0x02
924
#define MMP_FAIL_INT_VALID_BIT 0x04
925
926
#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
927
ubp->ub_mmp_magic == MMP_MAGIC)
928
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
929
MMP_INTERVAL_VALID_BIT))
930
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
931
MMP_SEQ_VALID_BIT))
932
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
933
MMP_FAIL_INT_VALID_BIT))
934
935
#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
936
>> 8)
937
#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
938
>> 32)
939
#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
940
>> 48)
941
942
typedef struct uberblock {
943
uint64_t ub_magic; /* UBERBLOCK_MAGIC */
944
uint64_t ub_version; /* SPA_VERSION */
945
uint64_t ub_txg; /* txg of last sync */
946
uint64_t ub_guid_sum; /* sum of all vdev guids */
947
uint64_t ub_timestamp; /* UTC time of last sync */
948
blkptr_t ub_rootbp; /* MOS objset_phys_t */
949
/* highest SPA_VERSION supported by software that wrote this txg */
950
uint64_t ub_software_version;
951
/* Maybe missing in uberblocks we read, but always written */
952
uint64_t ub_mmp_magic;
953
/*
954
* If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
955
* Otherwise, nanosec since last MMP write.
956
*/
957
uint64_t ub_mmp_delay;
958
959
/*
960
* The ub_mmp_config contains the multihost write interval, multihost
961
* fail intervals, sequence number for sub-second granularity, and
962
* valid bit mask. This layout is as follows:
963
*
964
* 64 56 48 40 32 24 16 8 0
965
* +-------+-------+-------+-------+-------+-------+-------+-------+
966
* 0 | Fail Intervals| Seq | Write Interval (ms) | VALID |
967
* +-------+-------+-------+-------+-------+-------+-------+-------+
968
*
969
* This allows a write_interval of (2^24/1000)s, over 4.5 hours
970
*
971
* VALID Bits:
972
* - 0x01 - Write Interval (ms)
973
* - 0x02 - Sequence number exists
974
* - 0x04 - Fail Intervals
975
* - 0xf8 - Reserved
976
*/
977
uint64_t ub_mmp_config;
978
979
/*
980
* ub_checkpoint_txg indicates two things about the current uberblock:
981
*
982
* 1] If it is not zero then this uberblock is a checkpoint. If it is
983
* zero, then this uberblock is not a checkpoint.
984
*
985
* 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
986
* the ub_txg that the uberblock had at the time we moved it to
987
* the MOS config.
988
*
989
* The field is set when we checkpoint the uberblock and continues to
990
* hold that value even after we've rewound (unlike the ub_txg that
991
* is reset to a higher value).
992
*
993
* Besides checks used to determine whether we are reopening the
994
* pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
995
* the value of the field is used to determine which ZIL blocks have
996
* been allocated according to the ms_sm when we are rewinding to a
997
* checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
998
* the ZIL block is not allocated [see uses of spa_min_claim_txg()].
999
*/
1000
uint64_t ub_checkpoint_txg;
1001
} uberblock_t;
1002
1003
/*
1004
* Flags.
1005
*/
1006
#define DNODE_MUST_BE_ALLOCATED 1
1007
#define DNODE_MUST_BE_FREE 2
1008
1009
/*
1010
* Fixed constants.
1011
*/
1012
#define DNODE_SHIFT 9 /* 512 bytes */
1013
#define DN_MIN_INDBLKSHIFT 12 /* 4k */
1014
#define DN_MAX_INDBLKSHIFT 17 /* 128k */
1015
#define DNODE_BLOCK_SHIFT 14 /* 16k */
1016
#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
1017
#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
1018
#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
1019
1020
/*
1021
* Derived constants.
1022
*/
1023
#define DNODE_MIN_SIZE (1 << DNODE_SHIFT)
1024
#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT)
1025
#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT)
1026
#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT)
1027
#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT)
1028
#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \
1029
(1 << SPA_BLKPTRSHIFT))
1030
#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)
1031
#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE))
1032
#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> \
1033
SPA_BLKPTRSHIFT)
1034
#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
1035
#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
1036
1037
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
1038
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
1039
#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
1040
1041
/* The +2 here is a cheesy way to round up */
1042
#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
1043
(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
1044
1045
#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
1046
(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
1047
1048
#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
1049
(dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
1050
1051
#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
1052
1053
/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
1054
#define DNODE_FLAG_USED_BYTES (1<<0)
1055
#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
1056
1057
/* Does dnode have a SA spill blkptr in bonus? */
1058
#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
1059
1060
typedef struct dnode_phys {
1061
uint8_t dn_type; /* dmu_object_type_t */
1062
uint8_t dn_indblkshift; /* ln2(indirect block size) */
1063
uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
1064
uint8_t dn_nblkptr; /* length of dn_blkptr */
1065
uint8_t dn_bonustype; /* type of data in bonus buffer */
1066
uint8_t dn_checksum; /* ZIO_CHECKSUM type */
1067
uint8_t dn_compress; /* ZIO_COMPRESS type */
1068
uint8_t dn_flags; /* DNODE_FLAG_* */
1069
uint16_t dn_datablkszsec; /* data block size in 512b sectors */
1070
uint16_t dn_bonuslen; /* length of dn_bonus */
1071
uint8_t dn_extra_slots; /* # of subsequent slots consumed */
1072
uint8_t dn_pad2[3];
1073
1074
/* accounting is protected by dn_dirty_mtx */
1075
uint64_t dn_maxblkid; /* largest allocated block ID */
1076
uint64_t dn_used; /* bytes (or sectors) of disk space */
1077
1078
uint64_t dn_pad3[4];
1079
1080
/*
1081
* The tail region is 448 bytes for a 512 byte dnode, and
1082
* correspondingly larger for larger dnode sizes. The spill
1083
* block pointer, when present, is always at the end of the tail
1084
* region. There are three ways this space may be used, using
1085
* a 512 byte dnode for this diagram:
1086
*
1087
* 0 64 128 192 256 320 384 448 (offset)
1088
* +---------------+---------------+---------------+-------+
1089
* | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / |
1090
* +---------------+---------------+---------------+-------+
1091
* | dn_blkptr[0] | dn_bonus[0..319] |
1092
* +---------------+-----------------------+---------------+
1093
* | dn_blkptr[0] | dn_bonus[0..191] | dn_spill |
1094
* +---------------+-----------------------+---------------+
1095
*/
1096
union {
1097
blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)];
1098
struct {
1099
blkptr_t __dn_ignore1;
1100
uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN];
1101
};
1102
struct {
1103
blkptr_t __dn_ignore2;
1104
uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN -
1105
sizeof (blkptr_t)];
1106
blkptr_t dn_spill;
1107
};
1108
};
1109
} dnode_phys_t;
1110
1111
#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
1112
(((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
1113
1114
typedef enum dmu_object_byteswap {
	DMU_BSWAP_UINT8,
	DMU_BSWAP_UINT16,
	DMU_BSWAP_UINT32,
	DMU_BSWAP_UINT64,
	DMU_BSWAP_ZAP,
	DMU_BSWAP_DNODE,
	DMU_BSWAP_OBJSET,
	DMU_BSWAP_ZNODE,
	DMU_BSWAP_OLDACL,
	DMU_BSWAP_ACL,
	/*
	 * Allocating a new byteswap type number makes the on-disk format
	 * incompatible with any other format that uses the same number.
	 *
	 * Data can usually be structured to work with one of the
	 * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
	 */
	DMU_BSWAP_NUMFUNCS
} dmu_object_byteswap_t;

/* Bit layout of DMU_OT()-constructed object types. */
#define DMU_OT_NEWTYPE 0x80
#define DMU_OT_METADATA 0x40
#define DMU_OT_BYTESWAP_MASK 0x3f

/*
 * Defines a uint8_t object type. Object types specify if the data
 * in the object is metadata (boolean) and how to byteswap the data
 * (dmu_object_byteswap_t).
 */
#define DMU_OT(byteswap, metadata) \
	(DMU_OT_NEWTYPE | \
	((metadata) ? DMU_OT_METADATA : 0) | \
	((byteswap) & DMU_OT_BYTESWAP_MASK))

typedef enum dmu_object_type {
	DMU_OT_NONE,
	/* general: */
	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
	DMU_OT_BPOBJ,			/* UINT64 */
	DMU_OT_BPOBJ_HDR,		/* UINT64 */
	/* spa: */
	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
	DMU_OT_SPACE_MAP,		/* UINT64 */
	/* zil: */
	DMU_OT_INTENT_LOG,		/* UINT64 */
	/* dmu: */
	DMU_OT_DNODE,			/* DNODE */
	DMU_OT_OBJSET,			/* OBJSET */
	/* dsl: */
	DMU_OT_DSL_DIR,			/* UINT64 */
	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
	DMU_OT_DSL_PROPS,		/* ZAP */
	DMU_OT_DSL_DATASET,		/* UINT64 */
	/* zpl: */
	DMU_OT_ZNODE,			/* ZNODE */
	DMU_OT_OLDACL,			/* Old ACL */
	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
	DMU_OT_MASTER_NODE,		/* ZAP */
	DMU_OT_UNLINKED_SET,		/* ZAP */
	/* zvol: */
	DMU_OT_ZVOL,			/* UINT8 */
	DMU_OT_ZVOL_PROP,		/* ZAP */
	/* other; for testing only! */
	DMU_OT_PLAIN_OTHER,		/* UINT8 */
	DMU_OT_UINT64_OTHER,		/* UINT64 */
	DMU_OT_ZAP_OTHER,		/* ZAP */
	/* new object types: */
	DMU_OT_ERROR_LOG,		/* ZAP */
	DMU_OT_SPA_HISTORY,		/* UINT8 */
	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
	DMU_OT_POOL_PROPS,		/* ZAP */
	DMU_OT_DSL_PERMS,		/* ZAP */
	DMU_OT_ACL,			/* ACL */
	DMU_OT_SYSACL,			/* SYSACL */
	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
	DMU_OT_NEXT_CLONES,		/* ZAP */
	DMU_OT_SCAN_QUEUE,		/* ZAP */
	DMU_OT_USERGROUP_USED,		/* ZAP */
	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
	DMU_OT_USERREFS,		/* ZAP */
	DMU_OT_DDT_ZAP,			/* ZAP */
	DMU_OT_DDT_STATS,		/* ZAP */
	DMU_OT_SA,			/* System attr */
	DMU_OT_SA_MASTER_NODE,		/* ZAP */
	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
	DMU_OT_SCAN_XLATE,		/* ZAP */
	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
	DMU_OT_DEADLIST,		/* ZAP */
	DMU_OT_DEADLIST_HDR,		/* UINT64 */
	DMU_OT_DSL_CLONES,		/* ZAP */
	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
	DMU_OT_NUMTYPES,

	/*
	 * Names for valid types declared with DMU_OT().
	 */
	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE)
} dmu_object_type_t;
1229
1230
typedef enum dmu_objset_type {
1231
DMU_OST_NONE,
1232
DMU_OST_META,
1233
DMU_OST_ZFS,
1234
DMU_OST_ZVOL,
1235
DMU_OST_OTHER, /* For testing only! */
1236
DMU_OST_ANY, /* Be careful! */
1237
DMU_OST_NUMTYPES
1238
} dmu_objset_type_t;
1239
1240
#define ZAP_MAXVALUELEN (1024 * 8)
1241
1242
/*
1243
* header for all bonus and spill buffers.
1244
* The header has a fixed portion with a variable number
1245
* of "lengths" depending on the number of variable sized
1246
* attribues which are determined by the "layout number"
1247
*/
1248
1249
#define SA_MAGIC 0x2F505A /* ZFS SA */
1250
typedef struct sa_hdr_phys {
1251
uint32_t sa_magic;
1252
uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
1253
uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
1254
/* ... Data follows the lengths. */
1255
} sa_hdr_phys_t;
1256
1257
/*
1258
* sa_hdr_phys -> sa_layout_info
1259
*
1260
* 16 10 0
1261
* +--------+-------+
1262
* | hdrsz |layout |
1263
* +--------+-------+
1264
*
1265
* Bits 0-10 are the layout number
1266
* Bits 11-16 are the size of the header.
1267
* The hdrsize is the number * 8
1268
*
1269
* For example.
1270
* hdrsz of 1 ==> 8 byte header
1271
* 2 ==> 16 byte header
1272
*
1273
*/
1274
1275
#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
1276
#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
1277
#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
1278
{ \
1279
BF32_SET_SB(x, 10, 6, 3, 0, size); \
1280
BF32_SET(x, 0, 10, num); \
1281
}
1282
1283
#define SA_ATTR_BSWAP(x) BF32_GET(x, 16, 8)
1284
#define SA_ATTR_LENGTH(x) BF32_GET(x, 24, 16)
1285
#define SA_ATTR_NUM(x) BF32_GET(x, 0, 16)
1286
#define SA_ATTR_ENCODE(x, attr, length, bswap) \
1287
{ \
1288
BF64_SET(x, 24, 16, length); \
1289
BF64_SET(x, 16, 8, bswap); \
1290
BF64_SET(x, 0, 16, attr); \
1291
}
1292
1293
#define SA_MODE_OFFSET 0
1294
#define SA_SIZE_OFFSET 8
1295
#define SA_GEN_OFFSET 16
1296
#define SA_UID_OFFSET 24
1297
#define SA_GID_OFFSET 32
1298
#define SA_PARENT_OFFSET 40
1299
#define SA_SYMLINK_OFFSET 160
1300
1301
#define SA_REGISTRY "REGISTRY"
1302
#define SA_LAYOUTS "LAYOUTS"
1303
1304
typedef enum sa_bswap_type {
1305
SA_UINT64_ARRAY,
1306
SA_UINT32_ARRAY,
1307
SA_UINT16_ARRAY,
1308
SA_UINT8_ARRAY,
1309
SA_ACL,
1310
} sa_bswap_type_t;
1311
1312
typedef uint16_t sa_attr_type_t;
1313
1314
#define ZIO_OBJSET_MAC_LEN 32
1315
1316
/*
1317
* Intent log header - this on disk structure holds fields to manage
1318
* the log. All fields are 64 bit to easily handle cross architectures.
1319
*/
1320
typedef struct zil_header {
1321
uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
1322
uint64_t zh_replay_seq; /* highest replayed sequence number */
1323
blkptr_t zh_log; /* log chain */
1324
uint64_t zh_claim_seq; /* highest claimed sequence number */
1325
uint64_t zh_pad[5];
1326
} zil_header_t;
1327
1328
#define OBJSET_PHYS_SIZE_V2 2048
#define OBJSET_PHYS_SIZE_V3 4096

typedef struct objset_phys {
	dnode_phys_t os_meta_dnode;
	zil_header_t os_zil_header;
	uint64_t os_type;	/* dmu_objset_type_t */
	uint64_t os_flags;
	uint8_t os_portable_mac[ZIO_OBJSET_MAC_LEN];
	uint8_t os_local_mac[ZIO_OBJSET_MAC_LEN];
	/* Padding sized so the V2 portion totals OBJSET_PHYS_SIZE_V2 bytes. */
	char os_pad0[OBJSET_PHYS_SIZE_V2 - sizeof (dnode_phys_t)*3 -
	    sizeof (zil_header_t) - sizeof (uint64_t)*2 -
	    2*ZIO_OBJSET_MAC_LEN];
	dnode_phys_t os_userused_dnode;
	dnode_phys_t os_groupused_dnode;
	dnode_phys_t os_projectused_dnode;
	/* Pad the V3 layout out to OBJSET_PHYS_SIZE_V3 bytes. */
	char os_pad1[OBJSET_PHYS_SIZE_V3 - OBJSET_PHYS_SIZE_V2 -
	    sizeof (dnode_phys_t)];
} objset_phys_t;

typedef struct space_map_phys {
	/* object number: not needed but kept for backwards compatibility */
	uint64_t smp_object;

	/* length of the object in bytes */
	uint64_t smp_length;

	/* space allocated from the map */
	int64_t smp_alloc;
} space_map_phys_t;

typedef enum {
	SM_ALLOC,
	SM_FREE
} maptype_t;
1363
1364
/* one-word entry constants */
#define SM_DEBUG_PREFIX 2
#define SM_OFFSET_BITS 47
#define SM_RUN_BITS 15

/* two-word entry constants */
#define SM2_PREFIX 3
#define SM2_OFFSET_BITS 63
#define SM2_RUN_BITS 36

/* Top two bits of a space-map word select the entry format. */
#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2)
#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2)

/* Debug-entry fields (action, sync pass, txg). */
#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2)
#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2)
#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)

/* One-word entry fields; run length is stored biased by 1. */
#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS)
#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS)
#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1)
#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS)
#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL)

/* Two-word entry fields (run/vdev in word 0, type/offset in word 1). */
#define SM2_RUN_DECODE(x) (BF64_DECODE(x, 24, SM2_RUN_BITS) + 1)
#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 24, SM2_RUN_BITS)
#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, 24)
#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, 24)
#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1)
#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1)
#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS)
#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS)
#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL)
#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL)
1403
1404
typedef enum dd_used {
	DD_USED_HEAD,
	DD_USED_SNAP,
	DD_USED_CHILD,
	DD_USED_CHILD_RSRV,
	DD_USED_REFRSRV,
	DD_USED_NUM
} dd_used_t;

#define DD_FLAG_USED_BREAKDOWN (1 << 0)

typedef struct dsl_dir_phys {
	uint64_t dd_creation_time;	/* not actually used */
	uint64_t dd_head_dataset_obj;
	uint64_t dd_parent_obj;
	uint64_t dd_clone_parent_obj;
	uint64_t dd_child_dir_zapobj;
	/*
	 * how much space our children are accounting for; for leaf
	 * datasets, == physical space used by fs + snaps
	 */
	uint64_t dd_used_bytes;
	uint64_t dd_compressed_bytes;
	uint64_t dd_uncompressed_bytes;
	/* Administrative quota setting */
	uint64_t dd_quota;
	/* Administrative reservation setting */
	uint64_t dd_reserved;
	uint64_t dd_props_zapobj;
	uint64_t dd_pad[1];
	uint64_t dd_flags;
	uint64_t dd_used_breakdown[DD_USED_NUM];
	uint64_t dd_clones;
	uint64_t dd_pad1[13];	/* pad out to 256 bytes for good measure */
} dsl_dir_phys_t;

typedef struct dsl_dataset_phys {
	uint64_t ds_dir_obj;
	uint64_t ds_prev_snap_obj;
	uint64_t ds_prev_snap_txg;
	uint64_t ds_next_snap_obj;
	uint64_t ds_snapnames_zapobj;	/* zap obj of snaps; ==0 for snaps */
	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
	uint64_t ds_creation_time;	/* seconds since 1970 */
	uint64_t ds_creation_txg;
	uint64_t ds_deadlist_obj;
	uint64_t ds_used_bytes;
	uint64_t ds_compressed_bytes;
	uint64_t ds_uncompressed_bytes;
	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
	/*
	 * The ds_fsid_guid is a 56-bit ID that can change to avoid
	 * collisions. The ds_guid is a 64-bit ID that will never
	 * change, so there is a small probability that it will collide.
	 */
	uint64_t ds_fsid_guid;
	uint64_t ds_guid;
	uint64_t ds_flags;
	blkptr_t ds_bp;
	uint64_t ds_next_clones_obj;	/* DMU_OT_DSL_CLONES */
	uint64_t ds_props_obj;		/* DMU_OT_DSL_PROPS for snaps */
	uint64_t ds_userrefs_obj;	/* DMU_OT_USERREFS */
	uint64_t ds_pad[5];	/* pad out to 320 bytes for good measure */
} dsl_dataset_phys_t;

typedef struct dsl_deadlist_phys {
	uint64_t dl_used;
	uint64_t dl_comp;
	uint64_t dl_uncomp;
	uint64_t dl_pad[37];	/* pad out to 320b for future expansion */
} dsl_deadlist_phys_t;

/* Size of the original (pre-bpo_num_freed) bpobj_phys layout. */
#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t))

typedef struct bpobj_phys {
	uint64_t bpo_num_blkptrs;
	uint64_t bpo_bytes;
	uint64_t bpo_comp;
	uint64_t bpo_uncomp;
	uint64_t bpo_subobjs;
	uint64_t bpo_num_subobjs;
	uint64_t bpo_num_freed;
} bpobj_phys_t;
1487
1488
/*
 * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
 */
#define DMU_POOL_DIRECTORY_OBJECT 1
#define DMU_POOL_CONFIG "config"
#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
#define DMU_POOL_ROOT_DATASET "root_dataset"
#define DMU_POOL_SYNC_BPLIST "sync_bplist"
#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
#define DMU_POOL_ERRLOG_LAST "errlog_last"
#define DMU_POOL_SPARES "spares"
#define DMU_POOL_DEFLATE "deflate"
#define DMU_POOL_HISTORY "history"
#define DMU_POOL_PROPS "pool_props"
#define DMU_POOL_FREE_BPOBJ "free_bpobj"
#define DMU_POOL_BPTREE_OBJ "bptree_obj"
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
#define DMU_POOL_REMOVING "com.delphix:removing"
#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"

#define ZAP_MAGIC 0x2F52AB2ABULL

#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_block_shift)

#define ZAP_MAXCD (uint32_t)(-1)
#define ZAP_HASHBITS 28
/* Microzap entries are fixed 64-byte records. */
#define MZAP_ENT_LEN 64
#define MZAP_ENT_MAX \
	((MZAP_MAX_BLKSZ - sizeof(mzap_phys_t)) / sizeof(mzap_ent_phys_t) + 1)
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
#define MZAP_MAX_BLKSZ SPA_OLDMAXBLOCKSIZE

typedef struct mzap_ent_phys {
	uint64_t mze_value;
	uint32_t mze_cd;
	uint16_t mze_pad;	/* in case we want to chain them someday */
	char mze_name[MZAP_NAME_LEN];
} mzap_ent_phys_t;

typedef struct mzap_phys {
	uint64_t mz_block_type;	/* ZBT_MICRO */
	uint64_t mz_salt;
	uint64_t mz_normflags;
	uint64_t mz_pad[5];
	mzap_ent_phys_t mz_chunk[1];
	/* actually variable size depending on block size */
} mzap_phys_t;
1541
1542
/*
 * The (fat) zap is stored in one object. It is an array of
 * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
 *
 * ptrtbl fits in first block:
 * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
 *
 * ptrtbl too big for first block:
 * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
 *
 */

/* mz_block_type / zap_block_type / lh_block_type discriminators. */
#define ZBT_LEAF ((1ULL << 63) + 0)
#define ZBT_HEADER ((1ULL << 63) + 1)
#define ZBT_MICRO ((1ULL << 63) + 3)
/* any other values are ptrtbl blocks */

/*
 * the embedded pointer table takes up half a block:
 * block size / entry size (2^3) / 2
 */
#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)

/*
 * The embedded pointer table starts half-way through the block. Since
 * the pointer table itself is half the block, it starts at (64-bit)
 * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
 */
#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
	((uint64_t *)(zap)->zap_phys) \
	[(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]

/* Use the top n bits of the hash as the pointer-table index. */
#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))

/*
 * TAKE NOTE:
 * If zap_phys_t is modified, zap_byteswap() must be modified.
 */
typedef struct zap_phys {
	uint64_t zap_block_type;	/* ZBT_HEADER */
	uint64_t zap_magic;		/* ZAP_MAGIC */

	struct zap_table_phys {
		uint64_t zt_blk;	/* starting block number */
		uint64_t zt_numblks;	/* number of blocks */
		uint64_t zt_shift;	/* bits to index it */
		uint64_t zt_nextblk;	/* next (larger) copy start block */
		uint64_t zt_blks_copied; /* number source blocks copied */
	} zap_ptrtbl;

	uint64_t zap_freeblk;		/* the next free block */
	uint64_t zap_num_leafs;		/* number of leafs */
	uint64_t zap_num_entries;	/* number of entries */
	uint64_t zap_salt;		/* salt to stir into hash function */
	uint64_t zap_normflags;		/* flags for u8_textprep_str() */
	uint64_t zap_flags;		/* zap_flags_t */
	/*
	 * This structure is followed by padding, and then the embedded
	 * pointer table. The embedded pointer table takes up second
	 * half of the block. It is accessed using the
	 * ZAP_EMBEDDED_PTRTBL_ENT() macro.
	 */
} zap_phys_t;

typedef struct zap_table_phys zap_table_phys_t;

struct spa;
/* In-core handle used by the boot code to walk a fat zap. */
typedef struct fat_zap {
	int zap_block_shift;		/* block size shift */
	zap_phys_t *zap_phys;
	const struct spa *zap_spa;
	const dnode_phys_t *zap_dnode;
} fat_zap_t;
1615
1616
#define ZAP_LEAF_MAGIC 0x2AB1EAF

/* chunk size = 24 bytes */
#define ZAP_LEAF_CHUNKSIZE 24

/*
 * The amount of space available for chunks is:
 * block size (1<<l->l_bs) - hash entry size (2) * number of hash
 * entries - header space (2*chunksize)
 */
#define ZAP_LEAF_NUMCHUNKS(l) \
	(((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
	ZAP_LEAF_CHUNKSIZE - 2)

/*
 * The amount of space within the chunk available for the array is:
 * chunk size - space for type (1) - space for next pointer (2)
 */
#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)

/* Number of array chunks needed to hold 'bytes' (rounds up). */
#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
	(((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)

/*
 * Low water mark: when there are only this many chunks free, start
 * growing the ptrtbl. Ideally, this should be larger than a
 * "reasonably-sized" entry. 20 chunks is more than enough for the
 * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
 * while still being only around 3% for 16k blocks.
 */
#define ZAP_LEAF_LOW_WATER (20)

/*
 * The leaf hash table has block size / 2^5 (32) number of entries,
 * which should be more than enough for the maximum number of entries,
 * which is less than block size / CHUNKSIZE (24) / minimum number of
 * chunks per entry (3).
 */
#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))

/*
 * The chunks start immediately after the hash table. The end of the
 * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
 * chunk_t.
 */
#define ZAP_LEAF_CHUNK(l, idx) \
	((zap_leaf_chunk_t *)(void *) \
	((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)

/* Index into the leaf hash table for hash value h. */
#define ZAP_LEAF_HASH(l, h) \
	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
	((h) >> \
	(64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
#define ZAP_LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[ZAP_LEAF_HASH(l, h)])

/*
 * Values of the one-byte chunk-type discriminator; chosen high so they
 * never collide with chain indices.
 */
typedef enum zap_chunk_type {
	ZAP_CHUNK_FREE = 253,
	ZAP_CHUNK_ENTRY = 252,
	ZAP_CHUNK_ARRAY = 251,
	ZAP_CHUNK_TYPE_MAX = 250
} zap_chunk_type_t;
1679
1680
/*
1681
* TAKE NOTE:
1682
* If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
1683
*/
1684
typedef struct zap_leaf_phys {
1685
struct zap_leaf_header {
1686
uint64_t lh_block_type; /* ZBT_LEAF */
1687
uint64_t lh_pad1;
1688
uint64_t lh_prefix; /* hash prefix of this leaf */
1689
uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
1690
uint16_t lh_nfree; /* number free chunks */
1691
uint16_t lh_nentries; /* number of entries */
1692
uint16_t lh_prefix_len; /* num bits used to id this */
1693
1694
/* above is accessable to zap, below is zap_leaf private */
1695
1696
uint16_t lh_freelist; /* chunk head of free list */
1697
uint8_t lh_pad2[12];
1698
} l_hdr; /* 2 24-byte chunks */
1699
1700
/*
1701
* The header is followed by a hash table with
1702
* ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
1703
* followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
1704
* zap_leaf_chunk structures. These structures are accessed
1705
* with the ZAP_LEAF_CHUNK() macro.
1706
*/
1707
1708
uint16_t l_hash[1];
1709
} zap_leaf_phys_t;
1710
1711
typedef union zap_leaf_chunk {
1712
struct zap_leaf_entry {
1713
uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
1714
uint8_t le_value_intlen; /* size of ints */
1715
uint16_t le_next; /* next entry in hash chain */
1716
uint16_t le_name_chunk; /* first chunk of the name */
1717
uint16_t le_name_numints; /* bytes in name, incl null */
1718
uint16_t le_value_chunk; /* first chunk of the value */
1719
uint16_t le_value_numints; /* value length in ints */
1720
uint32_t le_cd; /* collision differentiator */
1721
uint64_t le_hash; /* hash value of the name */
1722
} l_entry;
1723
struct zap_leaf_array {
1724
uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
1725
uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
1726
uint16_t la_next; /* next blk or CHAIN_END */
1727
} l_array;
1728
struct zap_leaf_free {
1729
uint8_t lf_type; /* always ZAP_CHUNK_FREE */
1730
uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
1731
uint16_t lf_next; /* next in free list, or CHAIN_END */
1732
} l_free;
1733
} zap_leaf_chunk_t;
1734
1735
typedef struct zap_leaf {
1736
int l_bs; /* block size shift */
1737
zap_leaf_phys_t *l_phys;
1738
} zap_leaf_t;
1739
1740
/* Upper bounds on ZAP entry name and value sizes. */
#define ZAP_MAXNAMELEN 256
#define ZAP_MAXVALUELEN (1024 * 8)
/*
 * ACE (access control entry) access-mask bits.  Several bits are
 * shared between the file and directory interpretations, as noted.
 */
#define ACE_READ_DATA		0x00000001	/* file: read data */
#define ACE_LIST_DIRECTORY	0x00000001	/* dir: list files */
#define ACE_WRITE_DATA		0x00000002	/* file: write data */
#define ACE_ADD_FILE		0x00000002	/* dir: create file */
#define ACE_APPEND_DATA		0x00000004	/* file: append data */
#define ACE_ADD_SUBDIRECTORY	0x00000004	/* dir: create subdir */
#define ACE_READ_NAMED_ATTRS	0x00000008	/* FILE_READ_EA */
#define ACE_WRITE_NAMED_ATTRS	0x00000010	/* FILE_WRITE_EA */
#define ACE_EXECUTE		0x00000020	/* file: execute */
#define ACE_TRAVERSE		0x00000020	/* dir: lookup name */
#define ACE_DELETE_CHILD	0x00000040	/* dir: unlink child */
#define ACE_READ_ATTRIBUTES	0x00000080	/* (all) stat, etc. */
#define ACE_WRITE_ATTRIBUTES	0x00000100	/* (all) utimes, etc. */
#define ACE_DELETE		0x00010000	/* (all) unlink self */
#define ACE_READ_ACL		0x00020000	/* (all) getsecattr */
#define ACE_WRITE_ACL		0x00040000	/* (all) setsecattr */
#define ACE_WRITE_OWNER		0x00080000	/* (all) chown */
#define ACE_SYNCHRONIZE		0x00100000	/* (all) */

/* ACE inheritance and identifier flags. */
#define ACE_FILE_INHERIT_ACE		0x0001
#define ACE_DIRECTORY_INHERIT_ACE	0x0002
#define ACE_NO_PROPAGATE_INHERIT_ACE	0x0004
#define ACE_INHERIT_ONLY_ACE		0x0008
#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG	0x0010
#define ACE_FAILED_ACCESS_ACE_FLAG	0x0020
#define ACE_IDENTIFIER_GROUP		0x0040
#define ACE_INHERITED_ACE		0x0080
#define ACE_OWNER			0x1000
#define ACE_GROUP			0x2000
#define ACE_EVERYONE			0x4000

/* ACE types. */
#define ACE_ACCESS_ALLOWED_ACE_TYPE	0x0000
#define ACE_ACCESS_DENIED_ACE_TYPE	0x0001
#define ACE_SYSTEM_AUDIT_ACE_TYPE	0x0002
#define ACE_SYSTEM_ALARM_ACE_TYPE	0x0003
/* Common header shared by all on-disk ZFS ACE variants. */
typedef struct zfs_ace_hdr {
	uint16_t z_type;	/* ACE_*_ACE_TYPE (allow/deny/audit/alarm) */
	uint16_t z_flags;	/* ACE_* inheritance/identifier flags */
	uint32_t z_access_mask;	/* ACE_* permission bits */
} zfs_ace_hdr_t;
/*
 * Define special zfs pflags (persistent per-znode flags; compare
 * znode_phys_t zp_flags below).
 */
#define ZFS_XATTR		0x1	/* is an extended attribute */
#define ZFS_INHERIT_ACE		0x2	/* ace has inheritable ACEs */
#define ZFS_ACL_TRIVIAL		0x4	/* files ACL is trivial */
#define ZFS_ACL_OBJ_ACE		0x8	/* ACL has CMPLX Object ACE */
#define ZFS_ACL_PROTECTED	0x10	/* ACL protected */
#define ZFS_ACL_DEFAULTED	0x20	/* ACL should be defaulted */
#define ZFS_ACL_AUTO_INHERIT	0x40	/* ACL should be inherited */
#define ZFS_BONUS_SCANSTAMP	0x80	/* Scanstamp in bonus area */
#define ZFS_NO_EXECS_DENIED	0x100	/* exec was given to everyone */

/* File-attribute flags stored in the upper 32 bits. */
#define ZFS_READONLY		0x0000000100000000ull
#define ZFS_HIDDEN		0x0000000200000000ull
#define ZFS_SYSTEM		0x0000000400000000ull
#define ZFS_ARCHIVE		0x0000000800000000ull
#define ZFS_IMMUTABLE		0x0000001000000000ull
#define ZFS_NOUNLINK		0x0000002000000000ull
#define ZFS_APPENDONLY		0x0000004000000000ull
#define ZFS_NODUMP		0x0000008000000000ull
#define ZFS_OPAQUE		0x0000010000000000ull
#define ZFS_AV_QUARANTINED	0x0000020000000000ull
#define ZFS_AV_MODIFIED		0x0000040000000000ull
#define ZFS_REPARSE		0x0000080000000000ull
#define ZFS_OFFLINE		0x0000100000000000ull
#define ZFS_SPARSE		0x0000200000000000ull
/* Object number of the filesystem master node. */
#define MASTER_NODE_OBJ 1

/*
 * special attributes for master node.
 * (These are the ZAP attribute names looked up in MASTER_NODE_OBJ.)
 */

#define ZFS_FSID		"FSID"
#define ZFS_UNLINKED_SET	"DELETE_QUEUE"
#define ZFS_ROOT_OBJ		"ROOT"
#define ZPL_VERSION_OBJ		"VERSION"
#define ZFS_PROP_BLOCKPERPAGE	"BLOCKPERPAGE"
#define ZFS_PROP_NOGROWBLOCKS	"NOGROWBLOCKS"
#define ZFS_SA_ATTRS		"SA_ATTRS"

#define ZFS_FLAG_BLOCKPERPAGE	0x1
#define ZFS_FLAG_NOGROWBLOCKS	0x2
/*
 * ZPL version - rev'd whenever an incompatible on-disk format change
 * occurs. Independent of SPA/DMU/ZAP versioning.
 */

#define ZPL_VERSION 1ULL
/*
 * The directory entry has the type (currently unused on Solaris) in the
 * top 4 bits, and the object number in the low 48 bits. The "middle"
 * 12 bits are unused.
 */
#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
/* Arguments parenthesized so expression operands expand safely. */
#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)(type) << 60) | (obj))
/* One in-bonus ACL entry (see zfs_znode_acl_t below). */
typedef struct ace {
	uid_t a_who;		/* uid or gid */
	uint32_t a_access_mask;	/* read,write,... */
	uint16_t a_flags;	/* see below */
	uint16_t a_type;	/* allow or deny */
} ace_t;
#define ACE_SLOT_CNT 6
1854
1855
typedef struct zfs_znode_acl {
1856
uint64_t z_acl_extern_obj; /* ext acl pieces */
1857
uint32_t z_acl_count; /* Number of ACEs */
1858
uint16_t z_acl_version; /* acl version */
1859
uint16_t z_acl_pad; /* pad */
1860
ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
1861
} zfs_znode_acl_t;
1862
1863
/*
1864
* This is the persistent portion of the znode. It is stored
1865
* in the "bonus buffer" of the file. Short symbolic links
1866
* are also stored in the bonus buffer.
1867
*/
1868
typedef struct znode_phys {
1869
uint64_t zp_atime[2]; /* 0 - last file access time */
1870
uint64_t zp_mtime[2]; /* 16 - last file modification time */
1871
uint64_t zp_ctime[2]; /* 32 - last file change time */
1872
uint64_t zp_crtime[2]; /* 48 - creation time */
1873
uint64_t zp_gen; /* 64 - generation (txg of creation) */
1874
uint64_t zp_mode; /* 72 - file mode bits */
1875
uint64_t zp_size; /* 80 - size of file */
1876
uint64_t zp_parent; /* 88 - directory parent (`..') */
1877
uint64_t zp_links; /* 96 - number of links to file */
1878
uint64_t zp_xattr; /* 104 - DMU object for xattrs */
1879
uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
1880
uint64_t zp_flags; /* 120 - persistent flags */
1881
uint64_t zp_uid; /* 128 - file owner */
1882
uint64_t zp_gid; /* 136 - owning group */
1883
uint64_t zp_pad[4]; /* 144 - future */
1884
zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
1885
/*
1886
* Data may pad out any remaining bytes in the znode buffer, eg:
1887
*
1888
* |<---------------------- dnode_phys (512) ------------------------>|
1889
* |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
1890
* |<---- znode (264) ---->|<---- data (56) ---->|
1891
*
1892
* At present, we only use this space to store symbolic links.
1893
*/
1894
} znode_phys_t;
1895
1896
/*
1897
* In-core vdev representation.
1898
*/
1899
struct vdev;
1900
struct spa;
1901
typedef int vdev_phys_read_t(struct vdev *, void *, off_t, void *, size_t);
1902
typedef int vdev_phys_write_t(struct vdev *, off_t, void *, size_t);
1903
typedef int vdev_read_t(struct vdev *, const blkptr_t *, void *, off_t, size_t);
1904
1905
typedef STAILQ_HEAD(vdev_list, vdev) vdev_list_t;
1906
1907
typedef struct vdev_indirect_mapping_entry_phys {
1908
/*
1909
* Decode with DVA_MAPPING_* macros.
1910
* Contains:
1911
* the source offset (low 63 bits)
1912
* the one-bit "mark", used for garbage collection (by zdb)
1913
*/
1914
uint64_t vimep_src;
1915
1916
/*
1917
* Note: the DVA's asize is 24 bits, and can thus store ranges
1918
* up to 8GB.
1919
*/
1920
dva_t vimep_dst;
1921
} vdev_indirect_mapping_entry_phys_t;
1922
1923
/* Accessors for the 63-bit source offset inside vimep_src. */
#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \
	BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0)
#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \
	BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x)
/*
 * This is stored in the bonus buffer of the mapping object, see comment of
 * vdev_indirect_config for more details.
 */
typedef struct vdev_indirect_mapping_phys {
	uint64_t vimp_max_offset;
	uint64_t vimp_bytes_mapped;
	uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */

	/*
	 * For each entry in the mapping object, this object contains an
	 * entry representing the number of bytes of that mapping entry
	 * that were no longer in use by the pool at the time this indirect
	 * vdev was last condensed.
	 */
	uint64_t vimp_counts_object;
} vdev_indirect_mapping_phys_t;

/* Size of the v0 (pre-counts) bonus buffer: the first three fields only. */
#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t))
typedef struct vdev_indirect_mapping {
1949
uint64_t vim_object;
1950
boolean_t vim_havecounts;
1951
1952
/* vim_entries segment offset currently in memory. */
1953
uint64_t vim_entry_offset;
1954
/* vim_entries segment size. */
1955
size_t vim_num_entries;
1956
1957
/* Needed by dnode_read() */
1958
const void *vim_spa;
1959
dnode_phys_t *vim_dn;
1960
1961
/*
1962
* An ordered array of mapping entries, sorted by source offset.
1963
* Note that vim_entries is needed during a removal (and contains
1964
* mappings that have been synced to disk so far) to handle frees
1965
* from the removing device.
1966
*/
1967
vdev_indirect_mapping_entry_phys_t *vim_entries;
1968
objset_phys_t *vim_objset;
1969
vdev_indirect_mapping_phys_t *vim_phys;
1970
} vdev_indirect_mapping_t;
1971
1972
/*
 * On-disk indirect vdev state.
 *
 * An indirect vdev is described exclusively in the MOS config of a pool.
 * The config for an indirect vdev includes several fields, which are
 * accessed in memory by a vdev_indirect_config_t.
 */
typedef struct vdev_indirect_config {
	/*
	 * Object (in MOS) which contains the indirect mapping. This object
	 * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
	 * vimep_src. The bonus buffer for this object is a
	 * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
	 * removal is initiated.
	 *
	 * Note that this object can be empty if none of the data on the vdev
	 * has been copied yet.
	 */
	uint64_t vic_mapping_object;

	/*
	 * Object (in MOS) which contains the birth times for the mapping
	 * entries. This object contains an array of
	 * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
	 * buffer for this object is a vdev_indirect_birth_phys_t. This object
	 * is allocated when a vdev removal is initiated.
	 *
	 * Note that this object can be empty if none of the vdev has yet been
	 * copied.
	 */
	uint64_t vic_births_object;

	/*
	 * This is the vdev ID which was removed previous to this vdev, or
	 * UINT64_MAX if there are no previously removed vdevs.
	 */
	uint64_t vic_prev_indirect_vdev;
} vdev_indirect_config_t;
typedef struct vdev {
2012
STAILQ_ENTRY(vdev) v_childlink; /* link in parent's child list */
2013
vdev_list_t v_children; /* children of this vdev */
2014
const char *v_name; /* vdev name */
2015
uint64_t v_guid; /* vdev guid */
2016
uint64_t v_label; /* label instantiated from (top vdev) */
2017
uint64_t v_txg; /* most recent transaction (top vdev) */
2018
uint64_t v_id; /* index in parent */
2019
uint64_t v_psize; /* physical device capacity */
2020
int v_ashift; /* offset to block shift */
2021
int v_nparity; /* # parity for raidz */
2022
struct vdev *v_top; /* parent vdev */
2023
size_t v_nchildren; /* # children */
2024
vdev_state_t v_state; /* current state */
2025
vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */
2026
vdev_phys_write_t *v_phys_write; /* write to raw leaf vdev */
2027
vdev_read_t *v_read; /* read from vdev */
2028
void *v_priv; /* data for read/write function */
2029
boolean_t v_islog;
2030
struct spa *v_spa; /* link to spa */
2031
/*
2032
* Values stored in the config for an indirect or removing vdev.
2033
*/
2034
vdev_indirect_config_t vdev_indirect_config;
2035
vdev_indirect_mapping_t *v_mapping;
2036
} vdev_t;
2037
2038
/*
2039
* In-core pool representation.
2040
*/
2041
typedef STAILQ_HEAD(spa_list, spa) spa_list_t;
2042
2043
typedef struct spa {
2044
STAILQ_ENTRY(spa) spa_link; /* link in global pool list */
2045
char *spa_name; /* pool name */
2046
uint64_t spa_guid; /* pool guid */
2047
struct uberblock *spa_uberblock; /* best uberblock so far */
2048
vdev_t *spa_root_vdev; /* toplevel vdev container */
2049
objset_phys_t *spa_mos; /* MOS for this pool */
2050
zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
2051
void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
2052
boolean_t spa_with_log; /* this pool has log */
2053
2054
struct uberblock spa_uberblock_master; /* best uberblock so far */
2055
objset_phys_t spa_mos_master; /* MOS for this pool */
2056
struct uberblock spa_uberblock_checkpoint; /* checkpoint uberblock */
2057
objset_phys_t spa_mos_checkpoint; /* Checkpoint MOS */
2058
void *spa_bootenv; /* bootenv from pool label */
2059
} spa_t;
2060
2061
/* IO related arguments. */
2062
typedef struct zio {
2063
spa_t *io_spa;
2064
blkptr_t *io_bp;
2065
void *io_data;
2066
uint64_t io_size;
2067
uint64_t io_offset;
2068
2069
/* Stuff for the vdev stack */
2070
vdev_t *io_vd;
2071
void *io_vsd;
2072
2073
int io_error;
2074
} zio_t;
2075
2076
/* Extract the compressed payload embedded in an EMBEDDED block pointer. */
extern void decode_embedded_bp_compressed(const blkptr_t *, void *);

#endif /* _ZFSIMPL_H_ */