GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/btree.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

#include <sys/btree.h>
#include <sys/bitops.h>
#include <sys/zfs_context.h>

kmem_cache_t *zfs_btree_leaf_cache;

/*
 * Control the extent of the verification that occurs when zfs_btree_verify is
 * called. Primarily used for debugging when extending the btree logic and
 * functionality. As the intensity is increased, new verification steps are
 * added. These steps are cumulative; intensity = 3 includes the intensity = 1
 * and intensity = 2 steps as well.
 *
 * Intensity 1: Verify that the tree's height is consistent throughout.
 * Intensity 2: Verify that a core node's children's parent pointers point
 * to the core node.
 * Intensity 3: Verify that the total number of elements in the tree matches the
 * sum of the number of elements in each node. Also verifies that each node's
 * count obeys the invariants (less than or equal to maximum value, greater than
 * or equal to half the maximum minus one).
 * Intensity 4: Verify that each element compares less than the element
 * immediately after it and greater than the one immediately before it using the
 * comparator function. For core nodes, also checks that each element is greater
 * than the last element in the first of the two nodes it separates, and less
 * than the first element in the second of the two nodes.
 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
 * of each node is poisoned appropriately. Note that poisoning always occurs if
 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
 * operation.
 *
 * Intensity 4 and 5 are particularly expensive to perform; the previous levels
 * are a few memory operations per node, while these levels require multiple
 * operations per element. In addition, when creating large btrees, these
 * operations are called at every step, resulting in extremely slow operation
 * (while the asymptotic complexity of the other steps is the same, the
 * importance of the constant factors cannot be denied).
 */
uint_t zfs_btree_verify_intensity = 0;

/*
 * Convenience functions to silence warnings from memcpy/memmove's
 * return values and change argument order to src, dest.
 */
static void
bcpy(const void *src, void *dest, size_t size)
{
	(void) memcpy(dest, src, size);
}

static void
bmov(const void *src, void *dest, size_t size)
{
	(void) memmove(dest, src, size);
}

static boolean_t
zfs_btree_is_core(struct zfs_btree_hdr *hdr)
{
	return (hdr->bth_first == -1);
}

#ifdef _ILP32
#define	BTREE_POISON 0xabadb10c
#else
#define	BTREE_POISON 0xabadb10cdeadbeef
#endif

static void
zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			node->btc_children[i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
	} else {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + hdr->bth_count) * size, 0x0f,
		    tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
		    (hdr->bth_first + hdr->bth_count) * size);
	}
#endif
}

static inline void
zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx, uint32_t count)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, >=, hdr->bth_count);
		ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
		ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 1; i <= count; i++) {
			node->btc_children[idx + i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + idx * size, 0x0f, count * size);
	} else {
		ASSERT3U(idx, <=, tree->bt_leaf_cap);
		ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + idx) * size, 0x0f, count * size);
	}
#endif
}

static inline void
zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, <, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
		VERIFY3P(node->btc_children[idx + 1], ==, cval);
		for (size_t i = 0; i < size; i++)
			VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
	} else {
		ASSERT3U(idx, <, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		if (idx >= tree->bt_leaf_cap - hdr->bth_first)
			return;
		for (size_t i = 0; i < size; i++) {
			VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
			    * size + i], ==, 0x0f);
		}
	}
#endif
}

void
zfs_btree_init(void)
{
	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
zfs_btree_fini(void)
{
	kmem_cache_destroy(zfs_btree_leaf_cache);
}

static void *
zfs_btree_leaf_alloc(zfs_btree_t *tree)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
	else
		return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
}

static void
zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		return (kmem_cache_free(zfs_btree_leaf_cache, ptr));
	else
		return (kmem_free(ptr, tree->bt_leaf_size));
}

void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf, size_t size)
{
	zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
	    BTREE_LEAF_SIZE);
}

static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where);

void
zfs_btree_create_custom(zfs_btree_t *tree,
    int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf,
    size_t size, size_t lsize)
{
	size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);

	ASSERT3U(size, <=, esize / 2);
	memset(tree, 0, sizeof (*tree));
	tree->bt_compar = compar;
	tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
	    zfs_btree_find_in_buf : bt_find_in_buf;
	tree->bt_elem_size = size;
	tree->bt_leaf_size = lsize;
	tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
	tree->bt_height = -1;
	tree->bt_bulk = NULL;
}
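
/*
 * Illustrative sketch (not part of the original source): how a consumer
 * might create and populate a btree of uint64_t values with this API.
 * The comparator contract matches what zfs_btree_find_in_buf expects:
 * negative if the first argument sorts earlier, positive if later, and
 * zero on equality. The names u64_compar and example_usage are
 * hypothetical.
 */
#if 0
static int
u64_compar(const void *a, const void *b)
{
	uint64_t va = *(const uint64_t *)a;
	uint64_t vb = *(const uint64_t *)b;
	return (va < vb ? -1 : va > vb ? 1 : 0);
}

static void
example_usage(void)
{
	zfs_btree_t bt;
	zfs_btree_index_t where;
	uint64_t val = 42;

	/* Passing NULL selects the default binary-search helper. */
	zfs_btree_create(&bt, u64_compar, NULL, sizeof (uint64_t));
	zfs_btree_add(&bt, &val);

	/* A successful find returns the element and fills in "where". */
	uint64_t *found = zfs_btree_find(&bt, &val, &where);
	ASSERT3P(found, !=, NULL);
}
#endif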

/*
 * Find value in the array of elements provided. Uses a simple binary search.
 */
static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where)
{
	uint32_t max = nelems;
	uint32_t min = 0;
	while (max > min) {
		uint32_t idx = (min + max) / 2;
		uint8_t *cur = buf + idx * tree->bt_elem_size;
		int comp = tree->bt_compar(cur, value);
		if (comp < 0) {
			min = idx + 1;
		} else if (comp > 0) {
			max = idx;
		} else {
			where->bti_offset = idx;
			where->bti_before = B_FALSE;
			return (cur);
		}
	}

	where->bti_offset = max;
	where->bti_before = B_TRUE;
	return (NULL);
}

/*
 * Find the given value in the tree. where may be passed as null to use as a
 * membership test or if the btree is being used as a map.
 */
void *
zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		if (where != NULL) {
			where->bti_node = NULL;
			where->bti_offset = 0;
		}
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}

	/*
	 * If we're in bulk-insert mode, we check the last spot in the tree
	 * and the last leaf in the tree before doing the normal search,
	 * because for most workloads the vast majority of finds in
	 * bulk-insert mode are to insert new elements.
	 */
	zfs_btree_index_t idx;
	size_t size = tree->bt_elem_size;
	if (tree->bt_bulk != NULL) {
		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
		int comp = tree->bt_compar(last_leaf->btl_elems +
		    (last_leaf->btl_hdr.bth_first +
		    last_leaf->btl_hdr.bth_count - 1) * size, value);
		if (comp < 0) {
			/*
			 * If what they're looking for is after the last
			 * element, it's not in the tree.
			 */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count;
				where->bti_before = B_TRUE;
			}
			return (NULL);
		} else if (comp == 0) {
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count - 1;
				where->bti_before = B_FALSE;
			}
			return (last_leaf->btl_elems +
			    (last_leaf->btl_hdr.bth_first +
			    last_leaf->btl_hdr.bth_count - 1) * size);
		}
		if (tree->bt_compar(last_leaf->btl_elems +
		    last_leaf->btl_hdr.bth_first * size, value) <= 0) {
			/*
			 * If what they're looking for is after the first
			 * element in the last leaf, it's in the last leaf or
			 * it's not in the tree.
			 */
			void *d = tree->bt_find_in_buf(tree,
			    last_leaf->btl_elems +
			    last_leaf->btl_hdr.bth_first * size,
			    last_leaf->btl_hdr.bth_count, value, &idx);

			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
				*where = idx;
			}
			return (d);
		}
	}

	zfs_btree_core_t *node = NULL;
	uint32_t child = 0;
	uint32_t depth = 0;

	/*
	 * Iterate down the tree, finding which child the value should be in
	 * by comparing with the separators.
	 */
	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
		ASSERT3P(node, !=, NULL);
		void *d = tree->bt_find_in_buf(tree, node->btc_elems,
		    node->btc_hdr.bth_count, value, &idx);
		EQUIV(d != NULL, !idx.bti_before);
		if (d != NULL) {
			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)node;
				*where = idx;
			}
			return (d);
		}
		ASSERT(idx.bti_before);
		child = idx.bti_offset;
	}

	/*
	 * The value is in this leaf, or it would be if it were in the
	 * tree. Find its proper location and return it.
	 */
	zfs_btree_leaf_t *leaf = (depth == 0 ?
	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
	void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
	    leaf->btl_hdr.bth_first * size,
	    leaf->btl_hdr.bth_count, value, &idx);

	if (where != NULL) {
		idx.bti_node = (zfs_btree_hdr_t *)leaf;
		*where = idx;
	}

	return (d);
}

/*
 * To explain the following functions, it is useful to understand the four
 * kinds of shifts used in btree operation. First, a shift is a movement of
 * elements within a node. It is used to create gaps for inserting new
 * elements and children, or cover gaps created when things are removed. A
 * shift has two fundamental properties, each of which can be one of two
 * values, making four types of shifts. There is the direction of the shift
 * (left or right) and the shape of the shift (parallelogram or isosceles
 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
 * applies to shifts of core nodes.
 *
 * The names derive from the following imagining of the layout of a node:
 *
 * Elements:  * * * * * * * ... * * *
 * Children: * * * * * * * * ... * * *
 *
 * This layout follows from the fact that the elements act as separators
 * between pairs of children, and that children root subtrees "below" the
 * current node. A left and right shift are fairly self-explanatory; a left
 * shift moves things to the left, while a right shift moves things to the
 * right. A parallelogram shift is a shift with the same number of elements
 * and children being moved, while a trapezoid shift is a shift that moves one
 * more child than elements. An example follows:
 *
 * A parallelogram shift could contain the following:
 *      _______________
 *      \* * * * \ * * * ... * * *
 *      * \ * * * *\ * * * ... * * *
 *      ---------------
 * A trapezoid shift could contain the following:
 *      ___________
 *      * / * * * \ * * * ... * * *
 *      * / * * * *\ * * * ... * * *
 *      ---------------
 *
 * Note that a parallelogram shift is always shaped like a "left-leaning"
 * parallelogram, where the starting index of the children being moved is
 * always one higher than the starting index of the elements being moved. No
 * "right-leaning" parallelogram shifts are needed (shifts where the starting
 * element index and starting child index being moved are the same) to achieve
 * any btree operations, so we ignore them.
 */

enum bt_shift_shape {
	BSS_TRAPEZOID,
	BSS_PARALLELOGRAM
};

enum bt_shift_direction {
	BSD_LEFT,
	BSD_RIGHT
};
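
/*
 * Illustrative sketch (not part of the original source): the two shift
 * shapes applied to a toy core node, using plain int arrays in place of
 * btc_elems and btc_children and a fixed offset of one spot. A
 * parallelogram shift moves "count" elements and "count" children
 * (starting one child further right); a trapezoid shift moves "count"
 * elements and "count + 1" children. The name toy_shift_left is
 * hypothetical.
 */
#if 0
static void
toy_shift_left(int *elems, int *children, uint32_t idx, uint32_t count,
    boolean_t trapezoid)
{
	/* Elements always move as a block of "count", one spot left. */
	memmove(elems + idx - 1, elems + idx, count * sizeof (int));

	/*
	 * A trapezoid starts at children[idx] and moves one extra child;
	 * a parallelogram starts one child further right.
	 */
	uint32_t c_idx = idx + (trapezoid ? 0 : 1);
	uint32_t c_count = count + (trapezoid ? 1 : 0);
	memmove(children + c_idx - 1, children + c_idx,
	    c_count * sizeof (int));
}
#endif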

/*
 * Shift elements and children in the provided core node by off spots. The
 * first element moved is idx, and count elements are moved. The shape of the
 * shift is determined by shape. The direction is determined by dir.
 */
static inline void
bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_shape shape,
    enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&node->btc_hdr));

	uint8_t *e_start = node->btc_elems + idx * size;
	uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
	    e_start + off * size);
	bmov(e_start, e_out, count * size);

	zfs_btree_hdr_t **c_start = node->btc_children + idx +
	    (shape == BSS_TRAPEZOID ? 0 : 1);
	zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
	    c_start + off);
	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bmov(c_start, c_out, c_count * sizeof (*c_start));
}

/*
 * Shift elements and children in the provided core node left by one spot.
 * The first element moved is idx, and count elements are moved. The shape
 * of the shift is determined by shape (trapezoid or parallelogram).
 */
static inline void
bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
}

/*
 * Shift elements and children in the provided core node right by one spot.
 * Starts with elements[idx] and children[idx] and one more child than element.
 */
static inline void
bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
}

/*
 * Shift elements and children in the provided leaf node by off spots.
 * The first element moved is idx, and count elements are moved. The direction
 * is determined by dir.
 */
static inline void
bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &node->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));

	if (count == 0)
		return;
	uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
	uint8_t *out = (dir == BSD_LEFT ? start - off * size :
	    start + off * size);
	bmov(start, out, count * size);
}

/*
 * Grow leaf for n new elements before idx.
 */
static void
bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	uint32_t capacity = tree->bt_leaf_cap;
	ASSERT3U(hdr->bth_count + n, <=, capacity);
	boolean_t cl = (hdr->bth_first >= n);
	boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);

	if (cl && (!cr || idx <= hdr->bth_count / 2)) {
		/* Grow left. */
		hdr->bth_first -= n;
		bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
	} else if (cr) {
		/* Grow right. */
		bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
		    BSD_RIGHT);
	} else {
		/* Grow both ways. */
		uint32_t fn = hdr->bth_first -
		    (capacity - (hdr->bth_count + n)) / 2;
		hdr->bth_first -= fn;
		bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
		bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
		    n - fn, BSD_RIGHT);
	}
	hdr->bth_count += n;
}

/*
 * Shrink leaf for n elements starting from idx.
 */
static void
bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	ASSERT3U(idx + n, <=, hdr->bth_count);

	if (idx <= (hdr->bth_count - n) / 2) {
		bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
		zfs_btree_poison_node_at(tree, hdr, 0, n);
		hdr->bth_first += n;
	} else {
		bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
		    BSD_LEFT);
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
	}
	hdr->bth_count -= n;
}

/*
 * Move children and elements from one core node to another. The shape
 * parameter behaves the same as it does in the shift logic.
 */
static inline void
bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
    enum bt_shift_shape shape)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&source->btc_hdr));
	ASSERT(zfs_btree_is_core(&dest->btc_hdr));

	bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
	    count * size);

	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    c_count * sizeof (*source->btc_children));
}

static inline void
bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
{
	size_t size = tree->bt_elem_size;
	ASSERT(!zfs_btree_is_core(&source->btl_hdr));
	ASSERT(!zfs_btree_is_core(&dest->btl_hdr));

	bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
	    dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
	    count * size);
}

/*
 * Find the first element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	for (node = hdr; zfs_btree_is_core(node);
	    node = ((zfs_btree_core_t *)node)->btc_children[0])
		;

	ASSERT(!zfs_btree_is_core(node));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = 0;
		where->bti_before = B_FALSE;
	}
	return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]);
}

/* Insert an element and a child into a core node at the given offset. */
static void
zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
    uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	ASSERT3P(par_hdr, ==, new_node->bth_parent);
	ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, par_hdr,
		    par_hdr->bth_count);
	}
	/* Shift existing elements and children */
	uint32_t count = par_hdr->bth_count - offset;
	bt_shift_core_right(tree, parent, offset, count,
	    BSS_PARALLELOGRAM);

	/* Insert new values */
	parent->btc_children[offset + 1] = new_node;
	bcpy(buf, parent->btc_elems + offset * size, size);
	par_hdr->bth_count++;
}

/*
 * Insert new_node into the parent of old_node directly after old_node, with
 * buf as the dividing element between the two.
 */
static void
zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
    zfs_btree_hdr_t *new_node, void *buf)
{
	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
	size_t size = tree->bt_elem_size;
	zfs_btree_core_t *parent = old_node->bth_parent;

	/*
	 * If this is the root node we were splitting, we create a new root
	 * and increase the height of the tree.
	 */
	if (parent == NULL) {
		ASSERT3P(old_node, ==, tree->bt_root);
		tree->bt_num_nodes++;
		zfs_btree_core_t *new_root =
		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
		    size, KM_SLEEP);
		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
		new_root_hdr->bth_parent = NULL;
		new_root_hdr->bth_first = -1;
		new_root_hdr->bth_count = 1;

		old_node->bth_parent = new_node->bth_parent = new_root;
		new_root->btc_children[0] = old_node;
		new_root->btc_children[1] = new_node;
		bcpy(buf, new_root->btc_elems, size);

		tree->bt_height++;
		tree->bt_root = new_root_hdr;
		zfs_btree_poison_node(tree, new_root_hdr);
		return;
	}

	/*
	 * Since we have the new separator, binary search for where to put
	 * new_node.
	 */
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	zfs_btree_index_t idx;
	ASSERT(zfs_btree_is_core(par_hdr));
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    par_hdr->bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	uint32_t offset = idx.bti_offset;
	ASSERT3U(offset, <=, par_hdr->bth_count);
	ASSERT3P(parent->btc_children[offset], ==, old_node);

	/*
	 * If the parent isn't full, shift things to accommodate our insertions
	 * and return.
	 */
	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
		zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
		return;
	}

	/*
	 * We need to split this core node into two. Currently there are
	 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
	 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
	 * current node, and the others will be moved to the new core node.
	 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
	 * will be used as the new separator in our parent, and the others
	 * will be split among the two core nodes.
	 *
	 * Usually we will split the node in half evenly, with
	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
	 * instead move only about a quarter of the elements (and children) to
	 * the new node. Since the average state after a long time is a 3/4
	 * full node, shortcutting directly to that state improves efficiency.
	 *
	 * We do this in two stages: first we split into two nodes, and then we
	 * reuse our existing logic to insert the new element and child.
	 */
	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
	    2 : 4)) - 1, 2);
	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
	tree->bt_num_nodes++;
	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
	    BTREE_CORE_ELEMS * size, KM_SLEEP);
	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
	new_par_hdr->bth_parent = par_hdr->bth_parent;
	new_par_hdr->bth_first = -1;
	new_par_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_par_hdr);

	par_hdr->bth_count = keep_count;

	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
	    0, BSS_TRAPEZOID);

	/* Store the new separator in a buffer. */
	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
	bcpy(parent->btc_elems + keep_count * size, tmp_buf,
	    size);
	zfs_btree_poison_node(tree, par_hdr);

	if (offset < keep_count) {
		/* Insert the new node into the left half */
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else if (offset > keep_count) {
		/* Insert the new node into the right half */
		new_node->bth_parent = new_parent;
		zfs_btree_insert_core_impl(tree, new_parent,
		    offset - keep_count - 1, new_node, buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else {
		/*
		 * Move the new separator into the right half, and replace it
		 * with buf. We also need to shift back the elements in the
		 * right half to accommodate new_node.
		 */
		bt_shift_core_right(tree, new_parent, 0, move_count,
		    BSS_TRAPEZOID);
		new_parent->btc_children[0] = new_node;
		bcpy(tmp_buf, new_parent->btc_elems, size);
		new_par_hdr->bth_count++;
	}
	kmem_free(tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
		new_parent->btc_children[i]->bth_parent = new_parent;

	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting.
	 */
	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
	    &new_parent->btc_hdr, buf);
}

/* Insert an element into a leaf node at the given offset. */
static void
zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    uint32_t idx, const void *value)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
		    leaf->btl_hdr.bth_count);
	}

	bt_grow_leaf(tree, leaf, idx, 1);
	uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
	bcpy(value, start, size);
}

static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);

/* Helper function for inserting a new value into leaf at the given index. */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    const void *value, uint32_t idx)
{
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * If the leaf isn't full, shift the elements after idx and insert
	 * value.
	 */
	if (leaf->btl_hdr.bth_count != capacity) {
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
		return;
	}

	/*
	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
	 * inserting, each is of size (capacity / 2). If we are bulk
	 * inserting, we move a quarter of the elements to the new node so
	 * inserts into the old node don't cause immediate splitting but the
	 * tree stays relatively dense. Since the average state after a long
	 * time is a 3/4 full node, shortcutting directly to that state
	 * improves efficiency. At the end of the bulk insertion process
	 * we'll need to go through and fix up any nodes (the last leaf and
	 * its ancestors, potentially) that are below the minimum.
	 *
	 * In either case, we're left with one extra element. The leftover
	 * element will become the new dividing element between the two nodes.
	 */
	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;
	ASSERT3U(keep_count, >=, 1);
	/* If we insert on the left, move one more to keep leaves balanced. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	tree->bt_num_nodes++;
	zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
	    (idx >= keep_count && idx <= keep_count + move_count / 2);
	new_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_hdr);

	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
		tree->bt_bulk = new_leaf;

	/* Copy the back part to the new leaf. */
	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);

	/* We store the new separator in a buffer we control for simplicity. */
	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
	    buf, size);

	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);

	if (idx < keep_count) {
		/* Insert into the existing leaf. */
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
	} else if (idx > keep_count) {
		/* Insert into the new leaf. */
		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
		    1, value);
	} else {
		/*
		 * Insert planned separator into the new leaf, and use
		 * the new value as the new separator.
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
		bcpy(value, buf, size);
	}

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting, but only of core nodes.
	 */
	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
	    buf);
	kmem_free(buf, size);
}

static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	void *buf;
	if (zfs_btree_is_core(hdr)) {
		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
	} else {
		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
		    hdr->bth_first * tree->bt_elem_size;
	}
	zfs_btree_index_t idx;
	zfs_btree_core_t *parent = hdr->bth_parent;
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
	return (idx.bti_offset);
}

/*
 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
 * nodes may violate the invariant that non-root nodes must be at least half
 * full. All nodes violating this invariant should be the last node in their
 * particular level. To correct the invariant, we take values from their left
 * neighbor until they are half full. They must have a left neighbor at their
 * level because the last node at a level is not the first node unless it's
 * the root.
 */
static void
zfs_btree_bulk_finish(zfs_btree_t *tree)
{
	ASSERT3P(tree->bt_bulk, !=, NULL);
	ASSERT3P(tree->bt_root, !=, NULL);
	zfs_btree_leaf_t *leaf = tree->bt_bulk;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	zfs_btree_core_t *parent = hdr->bth_parent;
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * The invariant doesn't apply to the root node; if that's the only
	 * node in the tree, we're done.
	 */
	if (parent == NULL) {
		tree->bt_bulk = NULL;
		return;
	}

	/* First, take elements to rebalance the leaf node. */
	if (hdr->bth_count < capacity / 2) {
		/*
		 * First, find the left neighbor. The simplest way to do this
		 * is to call zfs_btree_prev twice; the first time finds some
		 * ancestor of this node, and the second time finds the left
		 * neighbor. The ancestor found is the lowest common ancestor
		 * of leaf and the neighbor.
		 */
		zfs_btree_index_t idx = {
			.bti_node = hdr,
			.bti_offset = 0
		};
		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(zfs_btree_is_core(idx.bti_node));
		zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
		uint32_t common_idx = idx.bti_offset;

		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(!zfs_btree_is_core(idx.bti_node));
		zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
		zfs_btree_hdr_t *l_hdr = idx.bti_node;
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    leaf->btl_hdr.bth_count + i);
			}
		}

		/* First, shift elements in leaf back. */
		bt_grow_leaf(tree, leaf, 0, move_count);

		/* Next, move the separator from the common ancestor to leaf. */
		uint8_t *separator = common->btc_elems + common_idx * size;
		uint8_t *out = leaf->btl_elems +
		    (hdr->bth_first + move_count - 1) * size;
		bcpy(separator, out, size);

		/*
		 * Now we move elements from the tail of the left neighbor to
		 * fill the remaining spots in leaf.
		 */
		bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
		    (move_count - 1), move_count - 1, leaf, 0);

		/*
		 * Finally, move the new last element in the left neighbor to
		 * the separator.
		 */
		bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
		    l_hdr->bth_count - move_count) * size, separator, size);

		/* Adjust the node's counts, and we're done. */
		bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count,
		    move_count);

		ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);
	}

	/*
	 * Now we have to rebalance any ancestors of leaf that may also
	 * violate the invariant.
	 */
	capacity = BTREE_CORE_ELEMS;
	while (parent->btc_hdr.bth_parent != NULL) {
		zfs_btree_core_t *cur = parent;
		zfs_btree_hdr_t *hdr = &cur->btc_hdr;
		parent = hdr->bth_parent;
		/*
		 * If the invariant isn't violated, move on to the next
		 * ancestor.
		 */
		if (hdr->bth_count >= capacity / 2)
			continue;

		/*
		 * Because the smallest number of nodes we can move when
		 * splitting is 2, we never need to worry about not having a
		 * left sibling (a sibling is a neighbor with the same parent).
		 */
		uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
		ASSERT3U(parent_idx, >, 0);
		zfs_btree_core_t *l_neighbor =
		    (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    hdr->bth_count + i);
			}
		}
		/* First, shift things in the right node back. */
		bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
		    BSS_TRAPEZOID, BSD_RIGHT);

		/* Next, move the separator to the right node. */
		uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
		    size);
		uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
		bcpy(separator, e_out, size);

		/*
		 * Now, move elements and children from the left node to the
		 * right. We move one more child than elements.
		 */
		move_count--;
		uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
		bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
		    BSS_TRAPEZOID);

		/*
		 * Finally, move the last element in the left node to the
		 * separator's position.
		 */
		move_idx--;
		bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);

		l_neighbor->btc_hdr.bth_count -= move_count + 1;
		hdr->bth_count += move_count + 1;

		ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);

		zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);

		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			cur->btc_children[i]->bth_parent = cur;
	}

	tree->bt_bulk = NULL;
	zfs_btree_verify(tree);
}
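
/*
 * Illustrative sketch (not part of the original source): bulk-insert mode
 * is entered automatically when the first element lands in a fresh tree,
 * and it persists as long as every insert targets the last leaf, i.e. the
 * input arrives in sorted order. Any other insert (or a removal) first
 * calls zfs_btree_bulk_finish() to restore the half-full invariant. The
 * name example_sorted_load is hypothetical.
 */
#if 0
static void
example_sorted_load(zfs_btree_t *bt, const uint64_t *sorted, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		/* Appending in order keeps the tree in bulk-insert mode. */
		zfs_btree_add(bt, &sorted[i]);
	}
}
#endif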

/*
 * Insert value into tree at the location specified by where.
 */
void
zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
    const zfs_btree_index_t *where)
{
	zfs_btree_index_t idx = {0};

	/* If we're not inserting in the last leaf, end bulk insert mode. */
	if (tree->bt_bulk != NULL) {
		if (where->bti_node != &tree->bt_bulk->btl_hdr) {
			zfs_btree_bulk_finish(tree);
			VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
			where = &idx;
		}
	}

	tree->bt_num_elems++;
	/*
	 * If this is the first element in the tree, create a leaf root node
	 * and add the value to it.
	 */
	if (where->bti_node == NULL) {
		ASSERT3U(tree->bt_num_elems, ==, 1);
		ASSERT3S(tree->bt_height, ==, -1);
		ASSERT0P(tree->bt_root);
		ASSERT0(where->bti_offset);

		tree->bt_num_nodes++;
		zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
		tree->bt_root = &leaf->btl_hdr;
		tree->bt_height++;

		zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
		hdr->bth_parent = NULL;
		hdr->bth_first = 0;
		hdr->bth_count = 0;
		zfs_btree_poison_node(tree, hdr);

		zfs_btree_insert_into_leaf(tree, leaf, value, 0);
		tree->bt_bulk = leaf;
	} else if (!zfs_btree_is_core(where->bti_node)) {
		/*
		 * If we're inserting into a leaf, go directly to the helper
		 * function.
		 */
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)where->bti_node, value,
		    where->bti_offset);
	} else {
		/*
		 * If we're inserting into a core node, we can't just shift
		 * the existing element in that slot in the same node without
		 * breaking our ordering invariants. Instead we place the new
		 * value in the node at that spot and then insert the old
		 * separator into the first slot in the subtree to the right.
		 */
		zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;

		/*
		 * We can ignore bti_before, because either way the value
		 * should end up in bti_offset.
		 */
		uint32_t off = where->bti_offset;
		zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
		size_t size = tree->bt_elem_size;
		uint8_t *buf = kmem_alloc(size, KM_SLEEP);
		bcpy(node->btc_elems + off * size, buf, size);
		bcpy(value, node->btc_elems + off * size, size);

		/*
		 * Find the first slot in the subtree to the right, insert
		 * there.
		 */
		zfs_btree_index_t new_idx;
		VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=,
		    NULL);
		ASSERT0(new_idx.bti_offset);
		ASSERT(!zfs_btree_is_core(new_idx.bti_node));
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
		kmem_free(buf, size);
	}
	zfs_btree_verify(tree);
}

/*
 * Return the first element in the tree, and put its location in where if
 * non-null.
 */
void *
zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}
	return (zfs_btree_first_helper(tree, tree->bt_root, where));
}

/*
 * Find the last element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	for (node = hdr; zfs_btree_is_core(node); node =
	    ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
		;

	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = node->bth_count - 1;
		where->bti_before = B_FALSE;
	}
	return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) *
	    btree->bt_elem_size);
}

/*
 * Return the last element in the tree, and put its location in where if
 * non-null.
 */
void *
zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}
	return (zfs_btree_last_helper(tree, tree->bt_root, where));
}

/*
 * This function contains the logic to find the next node in the tree. A
 * helper function is used because there are multiple internal consumers of
 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
 * node after we've finished with it.
 */
static void *
zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx,
    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
{
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the next element of an element in a leaf,
		 * there are two cases. If the element isn't the last one in
		 * the leaf, we just return the next element in the leaf.
		 * Otherwise, we need to traverse up our parents until we
		 * find one where our ancestor isn't the last child of its
		 * parent. Once we do, the next element is the separator
		 * after our ancestor in its parent.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
		if (leaf->btl_hdr.bth_count > new_off) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = new_off;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    new_off) * tree->bt_elem_size);
		}

		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (done_func != NULL)
				done_func(tree, prev);
			if (i == hdr->bth_count) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + i * tree->bt_elem_size);
		}
		if (done_func != NULL)
			done_func(tree, prev);
		/*
		 * We've traversed all the way up and been at the end of the
		 * node every time, so this was the last element in the tree.
		 */
		return (NULL);
	}

	/* If we were before an element in a core node, return that element. */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	if (idx->bti_before) {
		out_idx->bti_before = B_FALSE;
		return (node->btc_elems + offset * tree->bt_elem_size);
	}

	/*
	 * The next element from one in a core node is the first element in
	 * the subtree just to the right of the separator.
	 */
	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
	return (zfs_btree_first_helper(tree, child, out_idx));
}

/*
 * Return the next valued node in the tree. The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
}
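
/*
 * Illustrative sketch (not part of the original source): in-order
 * traversal with zfs_btree_first/zfs_btree_next. Passing the same
 * zfs_btree_index_t for idx and out_idx is explicitly supported, so one
 * cursor suffices. The element type uint64_t and the name example_walk
 * are hypothetical.
 */
#if 0
static void
example_walk(zfs_btree_t *bt)
{
	zfs_btree_index_t where;
	for (uint64_t *elem = zfs_btree_first(bt, &where); elem != NULL;
	    elem = zfs_btree_next(bt, &where, &where)) {
		/* Visit *elem; elements arrive in comparator order. */
	}
}
#endif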
1315
1316
/*
1317
* Return the previous valued node in the tree. The same value can be safely
1318
* passed for idx and out_idx.
1319
*/
1320
void *
1321
zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
1322
zfs_btree_index_t *out_idx)
1323
{
1324
if (idx->bti_node == NULL) {
1325
ASSERT3S(tree->bt_height, ==, -1);
1326
return (NULL);
1327
}
1328
1329
uint32_t offset = idx->bti_offset;
1330
if (!zfs_btree_is_core(idx->bti_node)) {
1331
/*
1332
* When finding the previous element of an element in a leaf,
1333
* there are two cases. If the element isn't the first one in
1334
* the leaf, in which case we just return the previous element
1335
* in the leaf. Otherwise, we need to traverse up our parents
1336
* until we find one where our previous ancestor isn't the
1337
* first child. Once we do, the previous element is the
1338
* separator after our previous ancestor.
1339
*/
1340
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
1341
if (offset != 0) {
1342
out_idx->bti_node = &leaf->btl_hdr;
1343
out_idx->bti_offset = offset - 1;
1344
out_idx->bti_before = B_FALSE;
1345
return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
1346
offset - 1) * tree->bt_elem_size);
1347
}
1348
zfs_btree_hdr_t *prev = &leaf->btl_hdr;
1349
for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
1350
node != NULL; node = node->btc_hdr.bth_parent) {
1351
zfs_btree_hdr_t *hdr = &node->btc_hdr;
1352
ASSERT(zfs_btree_is_core(hdr));
1353
uint32_t i = zfs_btree_find_parent_idx(tree, prev);
1354
if (i == 0) {
1355
prev = hdr;
1356
continue;
1357
}
1358
out_idx->bti_node = hdr;
1359
out_idx->bti_offset = i - 1;
1360
out_idx->bti_before = B_FALSE;
1361
return (node->btc_elems + (i - 1) * tree->bt_elem_size);
1362
}
1363
/*
1364
* We've traversed all the way up and been at the start of the
1365
* node every time, so this was the first node in the tree.
1366
*/
1367
return (NULL);
1368
}
1369
1370
/*
1371
* The previous element from one in a core node is the last element in
1372
* the subtree just to the left of the separator.
1373
*/
1374
ASSERT(zfs_btree_is_core(idx->bti_node));
1375
zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
1376
zfs_btree_hdr_t *child = node->btc_children[offset];
1377
return (zfs_btree_last_helper(tree, child, out_idx));
1378
}
1379
1380
/*
1381
* Get the value at the provided index in the tree.
1382
*
1383
* Note that the value returned from this function can be mutated, but only
1384
* if it will not change the ordering of the element with respect to any other
1385
* elements that could be in the tree.
1386
*/
1387
void *
1388
zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
1389
{
1390
ASSERT(!idx->bti_before);
1391
size_t size = tree->bt_elem_size;
1392
if (!zfs_btree_is_core(idx->bti_node)) {
1393
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
1394
return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
1395
idx->bti_offset) * size);
1396
}
1397
zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
1398
return (node->btc_elems + idx->bti_offset * size);
1399
}
1400
1401
/* Add the given value to the tree. Must not already be in the tree. */
1402
void
1403
zfs_btree_add(zfs_btree_t *tree, const void *node)
1404
{
1405
zfs_btree_index_t where = {0};
1406
VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
1407
zfs_btree_add_idx(tree, node, &where);
1408
}
1409
1410
/* Helper function to free a tree node. */
1411
static void
1412
zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
1413
{
1414
tree->bt_num_nodes--;
1415
if (!zfs_btree_is_core(node)) {
1416
zfs_btree_leaf_free(tree, node);
1417
} else {
1418
kmem_free(node, sizeof (zfs_btree_core_t) +
1419
BTREE_CORE_ELEMS * tree->bt_elem_size);
1420
}
1421
}
1422
1423
/*
1424
* Remove the rm_hdr and the separator to its left from the parent node. The
1425
* buffer that rm_hdr was stored in may already be freed, so its contents
1426
* cannot be accessed.
1427
*/
1428
static void
1429
zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
1430
zfs_btree_hdr_t *rm_hdr)
1431
{
1432
size_t size = tree->bt_elem_size;
1433
uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
1434
zfs_btree_hdr_t *hdr = &node->btc_hdr;
1435
/*
1436
* If the node is the root node and rm_hdr is one of two children,
1437
* promote the other child to the root.
1438
*/
1439
if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
1440
ASSERT3U(hdr->bth_count, ==, 1);
1441
ASSERT3P(tree->bt_root, ==, node);
1442
ASSERT3P(node->btc_children[1], ==, rm_hdr);
1443
tree->bt_root = node->btc_children[0];
1444
node->btc_children[0]->bth_parent = NULL;
1445
zfs_btree_node_destroy(tree, hdr);
1446
tree->bt_height--;
1447
return;
1448
}
1449
1450
uint32_t idx;
1451
for (idx = 0; idx <= hdr->bth_count; idx++) {
1452
if (node->btc_children[idx] == rm_hdr)
1453
break;
1454
}
1455
ASSERT3U(idx, <=, hdr->bth_count);
1456
1457
/*
1458
* If the node is the root or it has more than the minimum number of
1459
* children, just remove the child and separator, and return.
1460
*/
1461
if (hdr->bth_parent == NULL ||
1462
hdr->bth_count > min_count) {
1463
/*
1464
* Shift the element and children to the right of rm_hdr to
1465
* the left by one spot.
1466
*/
1467
bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
1468
BSS_PARALLELOGRAM);
1469
hdr->bth_count--;
1470
zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
1471
return;
1472
}
1473
1474
ASSERT3U(hdr->bth_count, ==, min_count);
1475
1476
/*
1477
* Now we try to take a node from a neighbor. We check left, then
1478
* right. If the neighbor exists and has more than the minimum number
1479
* of elements, we move the separator between us and them to our
1480
* node, move their closest element (last for left, first for right)
1481
* to the separator, and move their closest child to our node. Along
1482
* the way we need to collapse the gap made by idx, and (for our right
1483
* neighbor) the gap made by removing their first element and child.
1484
*
1485
* Note: this logic currently doesn't support taking from a neighbor
1486
* that isn't a sibling (i.e. a neighbor with a different
1487
* parent). This isn't critical functionality, but may be worth
1488
* implementing in the future for completeness' sake.
1489
*/
1490
zfs_btree_core_t *parent = hdr->bth_parent;
1491
uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
1492
1493
zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
1494
parent->btc_children[parent_idx - 1]);
1495
if (l_hdr != NULL && l_hdr->bth_count > min_count) {
1496
/* We can take a node from the left neighbor. */
1497
ASSERT(zfs_btree_is_core(l_hdr));
1498
zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
1499
1500
/*
1501
* Start by shifting the elements and children in the current
1502
* node to the right by one spot.
1503
*/
1504
bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
1505
1506
/*
1507
* Move the separator between node and neighbor to the first
1508
* element slot in the current node.
1509
*/
1510
uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1511
size;
1512
bcpy(separator, node->btc_elems, size);
1513
1514
/* Move the last child of neighbor to our first child slot. */
1515
node->btc_children[0] =
1516
neighbor->btc_children[l_hdr->bth_count];
1517
node->btc_children[0]->bth_parent = node;
1518
1519
/* Move the last element of neighbor to the separator spot. */
1520
uint8_t *take_elem = neighbor->btc_elems +
1521
(l_hdr->bth_count - 1) * size;
1522
bcpy(take_elem, separator, size);
1523
l_hdr->bth_count--;
1524
zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
1525
return;
1526
}
1527
1528
zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
1529
NULL : parent->btc_children[parent_idx + 1]);
1530
if (r_hdr != NULL && r_hdr->bth_count > min_count) {
1531
/* We can take a node from the right neighbor. */
1532
ASSERT(zfs_btree_is_core(r_hdr));
1533
zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
1534
1535
/*
1536
* Shift elements in node left by one spot to overwrite rm_hdr
1537
* and the separator before it.
1538
*/
1539
bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
1540
BSS_PARALLELOGRAM);
1541
1542
/*
1543
* Move the separator between node and neighbor to the last
1544
* element spot in node.
1545
*/
1546
uint8_t *separator = parent->btc_elems + parent_idx * size;
1547
bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
1548
size);
1549
1550
/*
1551
* Move the first child of neighbor to the last child spot in
1552
* node.
1553
*/
1554
node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
1555
node->btc_children[hdr->bth_count]->bth_parent = node;
1556
1557
/* Move the first element of neighbor to the separator spot. */
1558
uint8_t *take_elem = neighbor->btc_elems;
1559
bcpy(take_elem, separator, size);
1560
r_hdr->bth_count--;
1561
1562
/*
1563
* Shift the elements and children of neighbor to cover the
1564
* stolen elements.
1565
*/
1566
bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
1567
BSS_TRAPEZOID);
1568
zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
1569
return;
1570
}
1571
1572
/*
1573
* In this case, neither of our neighbors can spare an element, so we
1574
* need to merge with one of them. We prefer the left one,
1575
* arbitrarily. Move the separator into the leftmost merging node
1576
* (which may be us or the left neighbor), and then move the right
1577
* merging node's elements. Once that's done, we go back and delete
1578
* the element we're removing. Finally, go into the parent and delete
1579
* the right merging node and the separator. This may cause further
1580
* merging.
1581
*/
	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
	uint32_t new_idx = idx;
	if (l_hdr != NULL) {
		keep_hdr = l_hdr;
		new_rm_hdr = hdr;
		new_idx += keep_hdr->bth_count + 1;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		keep_hdr = hdr;
		new_rm_hdr = r_hdr;
		parent_idx++;
	}

	ASSERT(zfs_btree_is_core(keep_hdr));
	ASSERT(zfs_btree_is_core(new_rm_hdr));

	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, keep_hdr,
			    keep_hdr->bth_count + i);
		}
	}

	/* Move the separator into the left node. */
	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
	bcpy(separator, e_out, size);
	keep_hdr->bth_count++;

	/* Move all our elements and children into the left node. */
	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
	    keep_hdr->bth_count, BSS_TRAPEZOID);

	uint32_t old_count = keep_hdr->bth_count;

	/* Update bookkeeping */
	keep_hdr->bth_count += new_rm_hdr->bth_count;
	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);

	/*
	 * Shift the elements and children to the right of rm_hdr to
	 * the left by one spot.
	 */
	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
	    BSS_PARALLELOGRAM);
	keep_hdr->bth_count--;

	/* Reparent all our children to point to the left node. */
	zfs_btree_hdr_t **new_start = keep->btc_children + old_count - 1;
	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
		new_start[i]->bth_parent = keep;
	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
	}
	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);

	new_rm_hdr->bth_count = 0;
	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
	zfs_btree_node_destroy(tree, new_rm_hdr);
}

/* Remove the element at the specific location. */
void
zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = where->bti_node;
	uint32_t idx = where->bti_offset;

	ASSERT(!where->bti_before);
	if (tree->bt_bulk != NULL) {
		/*
		 * Leave bulk insert mode. Note that our index would be
		 * invalid after we correct the tree, so we copy the value
		 * we're planning to remove and find it again after
		 * bulk_finish.
		 */
		uint8_t *value = zfs_btree_get(tree, where);
		uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
		bcpy(value, tmp, size);
		zfs_btree_bulk_finish(tree);
		VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
		kmem_free(tmp, size);
		hdr = where->bti_node;
		idx = where->bti_offset;
	}

	tree->bt_num_elems--;
	/*
	 * If the element happens to be in a core node, we move a leaf node's
	 * element into its place and then remove the leaf node element, so
	 * that the rebalance logic never needs to recurse both upwards and
	 * downwards.
	 */
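	/*
	 * Editorial illustration (element values hypothetical): to remove 7
	 * from a core node holding [3 7 12], the last element of the subtree
	 * to the left of 7 (its in-order predecessor, say 6) is copied over
	 * the 7, and the removal then proceeds on that 6 in its leaf.
	 */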
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
		void *new_value = zfs_btree_last_helper(tree, left_subtree,
		    where);
		ASSERT3P(new_value, !=, NULL);

		bcpy(new_value, node->btc_elems + idx * size, size);

		hdr = where->bti_node;
		idx = where->bti_offset;
		ASSERT(!where->bti_before);
	}

	/*
	 * First, we'll update the leaf's metadata. Then, we shift any
	 * elements after the idx to the left. After that, we rebalance if
	 * needed.
	 */
	ASSERT(!zfs_btree_is_core(hdr));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
	ASSERT3U(hdr->bth_count, >, 0);

	uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;

	/*
	 * If we're over the minimum size or this is the root, just delete
	 * the element and return.
	 */
	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
		bt_shrink_leaf(tree, leaf, idx, 1);
		if (hdr->bth_parent == NULL) {
			ASSERT0(tree->bt_height);
			if (hdr->bth_count == 0) {
				tree->bt_root = NULL;
				tree->bt_height--;
				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
			}
		}
		zfs_btree_verify(tree);
		return;
	}
	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take an element from a sibling. We check left, then
	 * right. If the sibling exists and has more than the minimum number
	 * of elements, we move the separator between us and them to our node
	 * and move their closest element (last for left, first for right) to
	 * the separator. Along the way we need to collapse the gap made by
	 * idx, and (for our right neighbor) the gap made by removing their
	 * first element.
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling. This isn't critical functionality, but may be
	 * worth implementing in the future for completeness' sake.
	 */
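	/*
	 * Editorial illustration (element values hypothetical): with a left
	 * sibling [1 2 3], separator 4, and our leaf [5 7] after 6 has been
	 * removed, the steal rotates the 4 down into our first slot and the
	 * 3 up into the separator, leaving [1 2] | 3 | [4 5 7].
	 */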
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take an element from the left neighbor. */
		ASSERT(!zfs_btree_is_core(l_hdr));
		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;

		/*
		 * Move our elements back by one spot to make room for the
		 * stolen element and overwrite the element being removed.
		 */
		bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);

		/* Move the separator to our first spot. */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);

		/* Move our neighbor's last element to the separator. */
		uint8_t *take_elem = neighbor->btl_elems +
		    (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);

		/* Delete our neighbor's last element. */
		bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
		zfs_btree_verify(tree);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take an element from the right neighbor. */
		ASSERT(!zfs_btree_is_core(r_hdr));
		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;

		/*
		 * Move our elements after the element being removed forwards
		 * by one spot to make room for the stolen element and
		 * overwrite the element being removed.
		 */
		bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
		    1, BSD_LEFT);

		/*
		 * Move the separator between us and the right neighbor to
		 * our last spot.
		 */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, leaf->btl_elems + (hdr->bth_first +
		    hdr->bth_count - 1) * size, size);

		/* Move our neighbor's first element to the separator. */
		uint8_t *take_elem = neighbor->btl_elems +
		    r_hdr->bth_first * size;
		bcpy(take_elem, separator, size);

		/* Delete our neighbor's first element. */
		bt_shrink_leaf(tree, neighbor, 0, 1);
		zfs_btree_verify(tree);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one,
	 * arbitrarily. After deleting the element being removed, we move the
	 * separator into the leftmost merging node (which may be us or the
	 * left neighbor), and then move the right merging node's elements in
	 * after it. Finally, we go into the parent and delete the right
	 * merging node and the separator. This may cause further merging.
	 */
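	/*
	 * Editorial arithmetic check: after the deletion below, the two
	 * merging leaves hold min_count and min_count - 1 elements, so with
	 * the separator the merged leaf holds min_count * 2 elements, which
	 * fits within bt_leaf_cap (min_count is (bt_leaf_cap / 2) - 1, as
	 * above) and matches the ASSERT below.
	 */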
	zfs_btree_hdr_t *rm_hdr, *k_hdr;
	if (l_hdr != NULL) {
		k_hdr = l_hdr;
		rm_hdr = hdr;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		k_hdr = hdr;
		rm_hdr = r_hdr;
		parent_idx++;
	}
	ASSERT(!zfs_btree_is_core(k_hdr));
	ASSERT(!zfs_btree_is_core(rm_hdr));
	ASSERT3U(k_hdr->bth_count, ==, min_count);
	ASSERT3U(rm_hdr->bth_count, ==, min_count);
	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, k_hdr,
			    k_hdr->bth_count + i);
		}
	}

	/*
	 * Remove the value from the node. It will go below the minimum,
	 * but we'll fix it in no time.
	 */
	bt_shrink_leaf(tree, leaf, idx, 1);

	/* Prepare space for elements to be moved from the right. */
	uint32_t k_count = k_hdr->bth_count;
	bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
	ASSERT3U(k_hdr->bth_count, ==, min_count * 2);

	/* Move the separator into the first open spot. */
	uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
	bcpy(separator, out, size);

	/* Move the right merging node's elements into the left one. */
	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);

	/* Remove the emptied node from the parent. */
	zfs_btree_remove_from_node(tree, parent, rm_hdr);
	zfs_btree_node_destroy(tree, rm_hdr);
	zfs_btree_verify(tree);
}

/* Remove the given value from the tree. */
void
zfs_btree_remove(zfs_btree_t *tree, const void *value)
{
	zfs_btree_index_t where = {0};
	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
	zfs_btree_remove_idx(tree, &where);
}
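
/*
 * Editorial usage sketch (following the example style below; "value" is the
 * caller's element): code that has already located an element with
 * zfs_btree_find() can pass the resulting index straight to
 * zfs_btree_remove_idx() and skip the second lookup that zfs_btree_remove()
 * would perform:
 *
 *	zfs_btree_index_t where;
 *	if (zfs_btree_find(tree, &value, &where) != NULL)
 *		zfs_btree_remove_idx(tree, &where);
 */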

/*
 * Return the number of elements in the tree. (Despite the name, this is the
 * element count, not the node count.)
 */
ulong_t
zfs_btree_numnodes(zfs_btree_t *tree)
{
	return (tree->bt_num_elems);
}

/*
 * This function is used to visit all the elements in the tree before
 * destroying the tree. This allows the calling code to perform any cleanup it
 * needs to do. This is more efficient than just removing the first element
 * over and over, because it avoids all of the rebalancing work. Once the
 * destroy_nodes() function has been called, no other btree operations are
 * valid until it returns NULL, at which point the only valid operation is
 * zfs_btree_destroy().
 *
 * example:
 *
 *	zfs_btree_index_t *cookie = NULL;
 *	my_data_t *node;
 *
 *	while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
 *		free(node->ptr);
 *	zfs_btree_destroy(tree);
 *
 */
void *
zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
{
	if (*cookie == NULL) {
		if (tree->bt_height == -1)
			return (NULL);
		*cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
		return (zfs_btree_first(tree, *cookie));
	}

	void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
	    zfs_btree_node_destroy);
	if (rval == NULL) {
		tree->bt_root = NULL;
		tree->bt_height = -1;
		tree->bt_num_elems = 0;
		kmem_free(*cookie, sizeof (**cookie));
		tree->bt_bulk = NULL;
	}
	return (rval);
}

static void
zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			zfs_btree_clear_helper(tree, btc->btc_children[i]);
	}

	zfs_btree_node_destroy(tree, hdr);
}
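
/*
 * Free every node in the tree without visiting the elements, leaving the
 * tree empty but initialized. Editorial note: unlike
 * zfs_btree_destroy_nodes(), this gives the caller no chance to clean up
 * per-element state, so it is only appropriate when the elements own no
 * external resources.
 */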
void
zfs_btree_clear(zfs_btree_t *tree)
{
	if (tree->bt_root == NULL) {
		ASSERT0(tree->bt_num_elems);
		return;
	}

	zfs_btree_clear_helper(tree, tree->bt_root);
	tree->bt_num_elems = 0;
	tree->bt_root = NULL;
	tree->bt_num_nodes = 0;
	tree->bt_height = -1;
	tree->bt_bulk = NULL;
}

void
zfs_btree_destroy(zfs_btree_t *tree)
{
	ASSERT0(tree->bt_num_elems);
	ASSERT0P(tree->bt_root);
}
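
/*
 * Editorial usage sketch: a tree whose elements need no per-element cleanup
 * can be torn down by clearing it first, since zfs_btree_destroy() asserts
 * that the tree is already empty:
 *
 *	zfs_btree_clear(tree);
 *	zfs_btree_destroy(tree);
 */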

/* Verify that every child of this node has the correct parent pointer. */
static void
zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	if (!zfs_btree_is_core(hdr))
		return;

	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
	for (uint32_t i = 0; i <= hdr->bth_count; i++) {
		VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
		zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
	}
}

/* Verify that every node has the correct parent pointer. */
static void
zfs_btree_verify_pointers(zfs_btree_t *tree)
{
	if (tree->bt_height == -1) {
		VERIFY0P(tree->bt_root);
		return;
	}
	VERIFY0P(tree->bt_root->bth_parent);
	zfs_btree_verify_pointers_helper(tree, tree->bt_root);
}

/*
 * Verify that the current node and all its children satisfy the count
 * invariants, and return the total count in the subtree rooted in this node.
 */
static uint64_t
zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	if (!zfs_btree_is_core(hdr)) {
		if (tree->bt_root != hdr && tree->bt_bulk &&
		    hdr != &tree->bt_bulk->btl_hdr) {
			VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
		}

		return (hdr->bth_count);
	} else {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		uint64_t ret = hdr->bth_count;
		if (tree->bt_root != hdr && tree->bt_bulk == NULL)
			VERIFY3U(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
			ret += zfs_btree_verify_counts_helper(tree,
			    node->btc_children[i]);
		}

		return (ret);
	}
}

/*
 * Verify that all nodes satisfy the invariants and that the total number of
 * elements is correct.
 */
static void
zfs_btree_verify_counts(zfs_btree_t *tree)
{
	EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
	if (tree->bt_height == -1) {
		return;
	}
	VERIFY3U(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
	    tree->bt_num_elems);
}

/*
 * Check that the subtree rooted at this node has a uniform height. Returns
 * the number of nodes under this node, to help verify bt_num_nodes.
 */
static uint64_t
zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    int32_t height)
{
	if (!zfs_btree_is_core(hdr)) {
		VERIFY0(height);
		return (1);
	}

	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
	uint64_t ret = 1;
	for (uint32_t i = 0; i <= hdr->bth_count; i++) {
		ret += zfs_btree_verify_height_helper(tree,
		    node->btc_children[i], height - 1);
	}
	return (ret);
}

/*
 * Check that the tree rooted at this node has a uniform height, and that the
 * bt_height in the tree is correct.
 */
static void
zfs_btree_verify_height(zfs_btree_t *tree)
{
	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
	if (tree->bt_height == -1) {
		return;
	}

	VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
	    tree->bt_height), ==, tree->bt_num_nodes);
}

/*
 * Check that the elements in this node are sorted, and that if this is a core
 * node, the separators are properly between the subtrees they separate and
 * that the children also satisfy this requirement.
 */
static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		for (uint32_t i = 1; i < hdr->bth_count; i++) {
			VERIFY3S(tree->bt_compar(leaf->btl_elems +
			    (hdr->bth_first + i - 1) * size,
			    leaf->btl_elems +
			    (hdr->bth_first + i) * size), ==, -1);
		}
		return;
	}

	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
	for (uint32_t i = 1; i < hdr->bth_count; i++) {
		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
		    node->btc_elems + i * size), ==, -1);
	}
	for (uint32_t i = 0; i < hdr->bth_count; i++) {
		uint8_t *left_child_last = NULL;
		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
		if (zfs_btree_is_core(left_child_hdr)) {
			zfs_btree_core_t *left_child =
			    (zfs_btree_core_t *)left_child_hdr;
			left_child_last = left_child->btc_elems +
			    (left_child_hdr->bth_count - 1) * size;
		} else {
			zfs_btree_leaf_t *left_child =
			    (zfs_btree_leaf_t *)left_child_hdr;
			left_child_last = left_child->btl_elems +
			    (left_child_hdr->bth_first +
			    left_child_hdr->bth_count - 1) * size;
		}
		int comp = tree->bt_compar(node->btc_elems + i * size,
		    left_child_last);
		if (comp <= 0) {
			panic("btree: compar returned %d (expected 1) at "
			    "%px %d: compar(%px, %px)", comp, node, i,
			    node->btc_elems + i * size, left_child_last);
		}

		uint8_t *right_child_first = NULL;
		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
		if (zfs_btree_is_core(right_child_hdr)) {
			zfs_btree_core_t *right_child =
			    (zfs_btree_core_t *)right_child_hdr;
			right_child_first = right_child->btc_elems;
		} else {
			zfs_btree_leaf_t *right_child =
			    (zfs_btree_leaf_t *)right_child_hdr;
			right_child_first = right_child->btl_elems +
			    right_child_hdr->bth_first * size;
		}
		comp = tree->bt_compar(node->btc_elems + i * size,
		    right_child_first);
		if (comp >= 0) {
			panic("btree: compar returned %d (expected -1) at "
			    "%px %d: compar(%px, %px)", comp, node, i,
			    node->btc_elems + i * size, right_child_first);
		}
	}
	for (uint32_t i = 0; i <= hdr->bth_count; i++)
		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
}

/* Check that all elements in the tree are in sorted order. */
static void
zfs_btree_verify_order(zfs_btree_t *tree)
{
	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
	if (tree->bt_height == -1) {
		return;
	}

	zfs_btree_verify_order_helper(tree, tree->bt_root);
}

#ifdef ZFS_DEBUG
/* Check that all unused memory is poisoned correctly. */
static void
zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		for (size_t i = 0; i < hdr->bth_first * size; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
		size_t esize = tree->bt_leaf_size -
		    offsetof(zfs_btree_leaf_t, btl_elems);
		for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
		    i < esize; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
	} else {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (size_t i = hdr->bth_count * size;
		    i < BTREE_CORE_ELEMS * size; i++)
			VERIFY3U(node->btc_elems[i], ==, 0x0f);

		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			VERIFY3P(node->btc_children[i], ==,
			    (zfs_btree_hdr_t *)BTREE_POISON);
		}

		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
			zfs_btree_verify_poison_helper(tree,
			    node->btc_children[i]);
		}
	}
}
#endif

/* Check that unused memory in the tree is still poisoned. */
static void
zfs_btree_verify_poison(zfs_btree_t *tree)
{
#ifdef ZFS_DEBUG
	if (tree->bt_height == -1)
		return;
	zfs_btree_verify_poison_helper(tree, tree->bt_root);
#endif
}

void
zfs_btree_verify(zfs_btree_t *tree)
{
	if (zfs_btree_verify_intensity == 0)
		return;
	zfs_btree_verify_height(tree);
	if (zfs_btree_verify_intensity == 1)
		return;
	zfs_btree_verify_pointers(tree);
	if (zfs_btree_verify_intensity == 2)
		return;
	zfs_btree_verify_counts(tree);
	if (zfs_btree_verify_intensity == 3)
		return;
	zfs_btree_verify_order(tree);

	if (zfs_btree_verify_intensity == 4)
		return;
	zfs_btree_verify_poison(tree);
}
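
/*
 * Editorial note: because zfs_btree_verify_intensity is a ZMOD_RW module
 * parameter (below), verification can be enabled at runtime. On Linux
 * builds the knob is expected to surface as something like
 * /sys/module/zfs/parameters/zfs_btree_verify_intensity, e.g.:
 *
 *	echo 3 > /sys/module/zfs/parameters/zfs_btree_verify_intensity
 */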
ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
	"Enable btree verification. Levels above 4 require ZFS be built "
	"with debugging");