GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>

/*
 * Deadlist concurrency:
 *
 * Deadlists can only be modified from the syncing thread.
 *
 * Except for dsl_deadlist_insert(), a deadlist can only be modified with the
 * dp_config_rwlock held with RW_WRITER.
 *
 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
 * be called concurrently, from open context, with the dp_config_rwlock held
 * with RW_READER.
 *
 * Therefore, we only need to provide locking between dsl_deadlist_insert() and
 * the accessors, protecting:
 *	dl_phys->dl_used,comp,uncomp
 * and protecting the dl_tree from being loaded.
 * The locking is provided by dl_lock.  Note that the bpobj_t provides its
 * own locking, and dl_oldfmt is immutable.
 */
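
/*
 * Illustrative sketch (an assumption for exposition, not code used below):
 * the rule above means an open-context accessor only needs dl_lock around
 * its reads of the dl_phys accounting fields, e.g.:
 *
 *	mutex_enter(&dl->dl_lock);
 *	used = dl->dl_phys->dl_used;
 *	mutex_exit(&dl->dl_lock);
 *
 * which is the pattern dsl_deadlist_space() below follows.
 */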

/*
 * Livelist Overview
 * =================
 *
 * Livelists use the same 'deadlist_t' struct as deadlists and are also used
 * to track blkptrs over the lifetime of a dataset. Livelists, however, belong
 * to clones and track the blkptrs that are clone-specific (were born after
 * the clone's creation). The exception is embedded block pointers, which are
 * not included in livelists because they do not need to be freed.
 *
 * When it comes time to delete the clone, the livelist provides a quick
 * reference as to what needs to be freed. For this reason, livelists also
 * track when clone-specific blkptrs are freed before deletion to prevent
 * double frees. Each blkptr in a livelist is marked as a FREE or an ALLOC,
 * and the deletion algorithm iterates backwards over the livelist, matching
 * FREE/ALLOC pairs and then freeing those ALLOCs which remain. Livelists
 * are also updated when blkptrs are remapped: the old version of the blkptr
 * is cancelled out with a FREE and the new version is tracked with an ALLOC.
 *
 * To bound the amount of memory required for deletion, livelists over a
 * certain size are spread over multiple entries. Entries are grouped by
 * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
 * be in the same entry. This allows us to delete livelists incrementally
 * over multiple syncs, one entry at a time.
 *
 * During the lifetime of the clone, livelists can get extremely large.
 * Their size is managed by periodic condensing (preemptively cancelling out
 * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
 * the shared space between the clone and its origin is so small that it
 * doesn't make sense to use livelists anymore.
 */
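
/*
 * Worked example (illustrative only, not drawn from the code below): suppose
 * a clone writes block B at txg 100 and overwrites it at txg 150, while
 * block C is written at txg 100 and never freed.  The livelist records
 * ALLOC(B), ALLOC(C), FREE(B).  Iterating backwards, FREE(B) is matched
 * against ALLOC(B) and cancelled, so only C, the ALLOC with no matching
 * FREE, is freed when the clone is destroyed.
 */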

/*
 * The threshold sublist size at which we create a new sub-livelist for the
 * next txg. However, since blkptrs of the same transaction group must be in
 * the same sub-list, the actual sublist size may exceed this. When picking
 * the size, we had to balance the fact that larger sublists mean fewer
 * sublists (decreasing the cost of insertion) against the consideration that
 * sublists will be loaded into memory and shouldn't take up an inordinate
 * amount of space. We settled on ~500000 entries, corresponding to roughly
 * 128M.
 */
uint64_t zfs_livelist_max_entries = 500000;

/*
 * We can approximate how much of a performance gain a livelist will give us
 * based on the percentage of blocks shared between the clone and its origin.
 * 0 percent shared means that the clone has completely diverged and that the
 * old method is maximally effective: every read from the block tree will
 * result in lots of frees. Livelists give us gains when they track blocks
 * scattered across the tree, when one read in the old method might only
 * result in a few frees. Once the clone has been overwritten enough,
 * writes are no longer sparse and we'll no longer get much of a benefit from
 * tracking them with a livelist. We chose a lower limit of 75 percent shared
 * (25 percent overwritten). This means that 1/4 of all block pointers will be
 * freed (e.g. each read frees 256, out of a max of 1024), so we expect
 * livelists to make deletion 4x faster. Once the amount of shared space drops
 * below this threshold, the clone will revert to the old deletion method.
 */
int zfs_livelist_min_percent_shared = 75;
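
/*
 * Worked reading of the 4x estimate above (numbers illustrative only): at
 * the 25 percent overwritten limit, a read covering a maximum of 1024 block
 * pointers yields about 1024 * 1/4 = 256 frees, versus 1024 when the clone
 * has fully diverged, so the old method does roughly 4x as much reading per
 * freed block; skipping those reads is where the expected 4x deletion
 * speedup for livelists comes from.
 */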

static int
dsl_deadlist_compare(const void *arg1, const void *arg2)
{
	const dsl_deadlist_entry_t *dle1 = arg1;
	const dsl_deadlist_entry_t *dle2 = arg2;

	return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
}

static int
dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
{
	const dsl_deadlist_cache_entry_t *dlce1 = arg1;
	const dsl_deadlist_cache_entry_t *dlce2 = arg2;

	return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
}

static void
dsl_deadlist_load_tree(dsl_deadlist_t *dl)
{
	zap_cursor_t zc;
	zap_attribute_t *za;
	int error;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	ASSERT(!dl->dl_oldfmt);
	if (dl->dl_havecache) {
		/*
		 * After loading the tree, the caller may modify the tree,
		 * e.g. to add or remove nodes, or to make a node no longer
		 * refer to the empty_bpobj.  These changes would make the
		 * dl_cache incorrect.  Therefore we discard the cache here,
		 * so that it can't become incorrect.
		 */
		dsl_deadlist_cache_entry_t *dlce;
		void *cookie = NULL;
		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
		    != NULL) {
			kmem_free(dlce, sizeof (*dlce));
		}
		avl_destroy(&dl->dl_cache);
		dl->dl_havecache = B_FALSE;
	}
	if (dl->dl_havetree)
		return;

	za = zap_attribute_alloc();
	avl_create(&dl->dl_tree, dsl_deadlist_compare,
	    sizeof (dsl_deadlist_entry_t),
	    offsetof(dsl_deadlist_entry_t, dle_node));
	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
	    (error = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
		dle->dle_mintxg = zfs_strtonum(za->za_name, NULL);

		/*
		 * Prefetch all the bpobj's so that we do that i/o
		 * in parallel.  Then open them all in a second pass.
		 */
		dle->dle_bpobj.bpo_object = za->za_first_integer;
		dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,
		    ZIO_PRIORITY_SYNC_READ);

		avl_add(&dl->dl_tree, dle);
	}
	VERIFY3U(error, ==, ENOENT);
	zap_cursor_fini(&zc);
	zap_attribute_free(za);

	for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
	    dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
		    dle->dle_bpobj.bpo_object));
	}
	dl->dl_havetree = B_TRUE;
}

/*
 * Load only the non-empty bpobj's into the dl_cache.  The cache is an analog
 * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP.  It
 * is used only for gathering space statistics.  The dl_cache has two
 * advantages over the dl_tree:
 *
 * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
 * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj
 * many times and to inquire about its (zero) space stats many times.
 *
 * 2. The dl_cache uses less memory than the dl_tree.  We only need to load
 * the dl_tree of snapshots when deleting a snapshot, after which we free the
 * dl_tree with dsl_deadlist_discard_tree().
 */
static void
dsl_deadlist_load_cache(dsl_deadlist_t *dl)
{
	zap_cursor_t zc;
	zap_attribute_t *za;
	int error;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	ASSERT(!dl->dl_oldfmt);
	if (dl->dl_havecache)
		return;

	uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
	    sizeof (dsl_deadlist_cache_entry_t),
	    offsetof(dsl_deadlist_cache_entry_t, dlce_node));
	za = zap_attribute_alloc();
	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
	    (error = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		if (za->za_first_integer == empty_bpobj)
			continue;
		dsl_deadlist_cache_entry_t *dlce =
		    kmem_zalloc(sizeof (*dlce), KM_SLEEP);
		dlce->dlce_mintxg = zfs_strtonum(za->za_name, NULL);

		/*
		 * Prefetch all the bpobj's so that we do that i/o
		 * in parallel.  Then open them all in a second pass.
		 */
		dlce->dlce_bpobj = za->za_first_integer;
		dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,
		    ZIO_PRIORITY_SYNC_READ);
		avl_add(&dl->dl_cache, dlce);
	}
	VERIFY3U(error, ==, ENOENT);
	zap_cursor_fini(&zc);
	zap_attribute_free(za);

	for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
	    dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
		bpobj_t bpo;
		VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));

		VERIFY0(bpobj_space(&bpo,
		    &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));
		bpobj_close(&bpo);
	}
	dl->dl_havecache = B_TRUE;
}

/*
 * Discard the tree to save memory.
 */
void
dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
{
	mutex_enter(&dl->dl_lock);

	if (!dl->dl_havetree) {
		mutex_exit(&dl->dl_lock);
		return;
	}
	dsl_deadlist_entry_t *dle;
	void *cookie = NULL;
	while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {
		bpobj_close(&dle->dle_bpobj);
		kmem_free(dle, sizeof (*dle));
	}
	avl_destroy(&dl->dl_tree);

	dl->dl_havetree = B_FALSE;
	mutex_exit(&dl->dl_lock);
}

void
dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
{
	dsl_deadlist_entry_t *dle;

	ASSERT(dsl_deadlist_is_open(dl));

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);
	mutex_exit(&dl->dl_lock);
	for (dle = avl_first(&dl->dl_tree); dle != NULL;
	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
		if (func(args, dle) != 0)
			break;
	}
}
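
/*
 * Usage sketch (hypothetical caller, for illustration only; not part of the
 * deadlist API defined here): a deadlist_iter_t callback receives the
 * caller's argument and each entry in mintxg order, and returns nonzero to
 * stop the walk, e.g.:
 *
 *	static int
 *	count_entries_cb(void *arg, dsl_deadlist_entry_t *dle)
 *	{
 *		(void) dle;
 *		(*(uint64_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0;
 *	dsl_deadlist_iterate(dl, count_entries_cb, &count);
 */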

int
dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	ASSERT(!dsl_deadlist_is_open(dl));

	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
	dl->dl_os = os;
	dl->dl_object = object;
	err = dmu_bonus_hold(os, object, dl, &dl->dl_dbuf);
	if (err != 0)
		return (err);
	dmu_object_info_from_db(dl->dl_dbuf, &doi);
	if (doi.doi_type == DMU_OT_BPOBJ) {
		dmu_buf_rele(dl->dl_dbuf, dl);
		dl->dl_dbuf = NULL;
		dl->dl_oldfmt = B_TRUE;
		return (bpobj_open(&dl->dl_bpobj, os, object));
	}

	dl->dl_oldfmt = B_FALSE;
	dl->dl_phys = dl->dl_dbuf->db_data;
	dl->dl_havetree = B_FALSE;
	dl->dl_havecache = B_FALSE;
	return (0);
}

boolean_t
dsl_deadlist_is_open(dsl_deadlist_t *dl)
{
	return (dl->dl_os != NULL);
}

void
dsl_deadlist_close(dsl_deadlist_t *dl)
{
	ASSERT(dsl_deadlist_is_open(dl));
	mutex_destroy(&dl->dl_lock);

	if (dl->dl_oldfmt) {
		dl->dl_oldfmt = B_FALSE;
		bpobj_close(&dl->dl_bpobj);
		dl->dl_os = NULL;
		dl->dl_object = 0;
		return;
	}

	if (dl->dl_havetree) {
		dsl_deadlist_entry_t *dle;
		void *cookie = NULL;
		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
		    != NULL) {
			bpobj_close(&dle->dle_bpobj);
			kmem_free(dle, sizeof (*dle));
		}
		avl_destroy(&dl->dl_tree);
	}
	if (dl->dl_havecache) {
		dsl_deadlist_cache_entry_t *dlce;
		void *cookie = NULL;
		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
		    != NULL) {
			kmem_free(dlce, sizeof (*dlce));
		}
		avl_destroy(&dl->dl_cache);
	}
	dmu_buf_rele(dl->dl_dbuf, dl);
	dl->dl_dbuf = NULL;
	dl->dl_phys = NULL;
	dl->dl_os = NULL;
	dl->dl_object = 0;
}

uint64_t
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
	    sizeof (dsl_deadlist_phys_t), tx));
}

void
dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
{
	dmu_object_info_t doi;
	zap_cursor_t zc;
	zap_attribute_t *za;
	int error;

	VERIFY0(dmu_object_info(os, dlobj, &doi));
	if (doi.doi_type == DMU_OT_BPOBJ) {
		bpobj_free(os, dlobj, tx);
		return;
	}

	za = zap_attribute_alloc();
	for (zap_cursor_init(&zc, os, dlobj);
	    (error = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t obj = za->za_first_integer;
		if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
			bpobj_decr_empty(os, tx);
		else
			bpobj_free(os, obj, tx);
	}
	VERIFY3U(error, ==, ENOENT);
	zap_cursor_fini(&zc);
	zap_attribute_free(za);
	VERIFY0(dmu_object_free(os, dlobj, tx));
}

static void
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	ASSERT(MUTEX_HELD(&dl->dl_lock));
	if (dle->dle_bpobj.bpo_object ==
	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);
		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, obj, tx));
	}
	bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
}

static void
dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj, dmu_tx_t *tx)
{
	ASSERT(MUTEX_HELD(&dl->dl_lock));
	if (dle->dle_bpobj.bpo_object !=
	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
	} else {
		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);
		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, obj, tx));
	}
}

/*
 * Prefetch metadata required for dle_enqueue_subobj().
 */
static void
dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj)
{
	if (dle->dle_bpobj.bpo_object !=
	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
		bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
}

void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;

	if (dl->dl_oldfmt) {
		bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
		return;
	}

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	dmu_buf_will_dirty(dl->dl_dbuf, tx);

	int sign = bp_freed ? -1 : +1;
	dl->dl_phys->dl_used +=
	    sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);

	dle_tofind.dle_mintxg = BP_GET_BIRTH(bp);
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	else
		dle = AVL_PREV(&dl->dl_tree, dle);

	if (dle == NULL) {
		zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
		    bp, (longlong_t)BP_GET_BIRTH(bp));
		dle = avl_first(&dl->dl_tree);
	}

	ASSERT3P(dle, !=, NULL);
	dle_enqueue(dl, dle, bp, bp_freed, tx);
	mutex_exit(&dl->dl_lock);
}

int
dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, B_FALSE, tx);
	return (0);
}

int
dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, B_TRUE, tx);
	return (0);
}

/*
 * Insert new key in deadlist, which must be > all current entries.
 * mintxg is not inclusive.
 */
void
dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	uint64_t obj;
	dsl_deadlist_entry_t *dle;

	if (dl->dl_oldfmt)
		return;

	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
	dle->dle_mintxg = mintxg;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
	avl_add(&dl->dl_tree, dle);

	VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object,
	    mintxg, obj, tx));
	mutex_exit(&dl->dl_lock);
}

/*
 * Remove this key, merging its entries into the previous key.
 */
void
dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle, *dle_prev;

	if (dl->dl_oldfmt)
		return;
	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
	ASSERT3P(dle, !=, NULL);
	dle_prev = AVL_PREV(&dl->dl_tree, dle);
	ASSERT3P(dle_prev, !=, NULL);

	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);

	avl_remove(&dl->dl_tree, dle);
	bpobj_close(&dle->dle_bpobj);
	kmem_free(dle, sizeof (*dle));

	VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
	mutex_exit(&dl->dl_lock);
}

/*
 * Remove a deadlist entry and all of its contents by removing the entry from
 * the deadlist's avl tree, freeing the entry's bpobj and adjusting the
 * deadlist's space accounting accordingly.
 */
void
dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	uint64_t used, comp, uncomp;
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	objset_t *os = dl->dl_os;

	if (dl->dl_oldfmt)
		return;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
	VERIFY3P(dle, !=, NULL);

	avl_remove(&dl->dl_tree, dle);
	VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used -= used;
	dl->dl_phys->dl_comp -= comp;
	dl->dl_phys->dl_uncomp -= uncomp;
	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
		bpobj_decr_empty(os, tx);
	} else {
		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
	}
	bpobj_close(&dle->dle_bpobj);
	kmem_free(dle, sizeof (*dle));
	mutex_exit(&dl->dl_lock);
}

/*
 * Clear out the contents of a deadlist_entry by freeing its bpobj,
 * replacing it with an empty bpobj and adjusting the deadlist's
 * space accounting.
 */
void
dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
    dmu_tx_t *tx)
{
	uint64_t new_obj, used, comp, uncomp;
	objset_t *os = dl->dl_os;

	mutex_enter(&dl->dl_lock);
	VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used -= used;
	dl->dl_phys->dl_comp -= comp;
	dl->dl_phys->dl_uncomp -= uncomp;
	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
		bpobj_decr_empty(os, tx);
	else
		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
	bpobj_close(&dle->dle_bpobj);
	new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
	VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
	    new_obj, tx));
	ASSERT(bpobj_is_empty(&dle->dle_bpobj));
	mutex_exit(&dl->dl_lock);
}

/*
 * Return the first entry in the deadlist's avl tree.
 */
dsl_deadlist_entry_t *
dsl_deadlist_first(dsl_deadlist_t *dl)
{
	dsl_deadlist_entry_t *dle;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);
	dle = avl_first(&dl->dl_tree);
	mutex_exit(&dl->dl_lock);

	return (dle);
}

/*
 * Return the last entry in the deadlist's avl tree.
 */
dsl_deadlist_entry_t *
dsl_deadlist_last(dsl_deadlist_t *dl)
{
	dsl_deadlist_entry_t *dle;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);
	dle = avl_last(&dl->dl_tree);
	mutex_exit(&dl->dl_lock);

	return (dle);
}

/*
 * Walk ds's snapshots to regenerate the ZAP & AVL.
 */
static void
dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
	dsl_deadlist_t dl = { 0 };
	dsl_pool_t *dp = dmu_objset_pool(os);

	VERIFY0(dsl_deadlist_open(&dl, os, dlobj));
	if (dl.dl_oldfmt) {
		dsl_deadlist_close(&dl);
		return;
	}

	while (mrs_obj != 0) {
		dsl_dataset_t *ds;
		VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
		dsl_deadlist_add_key(&dl,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
		dsl_dataset_rele(ds, FTAG);
	}
	dsl_deadlist_close(&dl);
}

uint64_t
dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
	dsl_deadlist_entry_t *dle;
	uint64_t newobj;

	newobj = dsl_deadlist_alloc(dl->dl_os, tx);

	if (dl->dl_oldfmt) {
		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
		return (newobj);
	}

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	for (dle = avl_first(&dl->dl_tree); dle;
	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
		uint64_t obj;

		if (dle->dle_mintxg >= maxtxg)
			break;

		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY0(zap_add_int_key(dl->dl_os, newobj,
		    dle->dle_mintxg, obj, tx));
	}
	mutex_exit(&dl->dl_lock);
	return (newobj);
}

void
dsl_deadlist_space(dsl_deadlist_t *dl,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(dsl_deadlist_is_open(dl));
	if (dl->dl_oldfmt) {
		VERIFY0(bpobj_space(&dl->dl_bpobj,
		    usedp, compp, uncompp));
		return;
	}

	mutex_enter(&dl->dl_lock);
	*usedp = dl->dl_phys->dl_used;
	*compp = dl->dl_phys->dl_comp;
	*uncompp = dl->dl_phys->dl_uncomp;
	mutex_exit(&dl->dl_lock);
}

/*
 * Return space used in the range (mintxg, maxtxg].
 * Includes maxtxg, does not include mintxg.
 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
 * UINT64_MAX).
 */
void
dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	dsl_deadlist_cache_entry_t *dlce;
	dsl_deadlist_cache_entry_t dlce_tofind;
	avl_index_t where;

	if (dl->dl_oldfmt) {
		VERIFY0(bpobj_space_range(&dl->dl_bpobj,
		    mintxg, maxtxg, usedp, compp, uncompp));
		return;
	}

	*usedp = *compp = *uncompp = 0;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_cache(dl);
	dlce_tofind.dlce_mintxg = mintxg;
	dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);

	/*
	 * If this mintxg doesn't exist, it may be an empty_bpobj which
	 * is omitted from the sparse tree.  Start at the next non-empty
	 * entry.
	 */
	if (dlce == NULL)
		dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);

	for (; dlce && dlce->dlce_mintxg < maxtxg;
	    dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
		*usedp += dlce->dlce_bytes;
		*compp += dlce->dlce_comp;
		*uncompp += dlce->dlce_uncomp;
	}

	mutex_exit(&dl->dl_lock);
}

static void
dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;
	uint64_t used, comp, uncomp;
	bpobj_t bpo;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
	VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
	bpobj_close(&bpo);

	dsl_deadlist_load_tree(dl);

	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used += used;
	dl->dl_phys->dl_comp += comp;
	dl->dl_phys->dl_uncomp += uncomp;

	dle_tofind.dle_mintxg = birth;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	dle_enqueue_subobj(dl, dle, obj, tx);
}

/*
 * Prefetch metadata required for dsl_deadlist_insert_bpobj().
 */
static void
dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	dsl_deadlist_load_tree(dl);

	dle_tofind.dle_mintxg = birth;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	dle_prefetch_subobj(dl, dle, obj);
}

static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, bp_freed, tx);
	return (0);
}

/*
 * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
 * an empty deadlist.
 */
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
	zap_cursor_t zc, pzc;
	zap_attribute_t *za, *pza;
	dmu_buf_t *bonus;
	dsl_deadlist_phys_t *dlp;
	dmu_object_info_t doi;
	int error, perror, i;

	VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
	if (doi.doi_type == DMU_OT_BPOBJ) {
		bpobj_t bpo;
		VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
		VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));
		bpobj_close(&bpo);
		return;
	}

	za = zap_attribute_alloc();
	pza = zap_attribute_alloc();

	mutex_enter(&dl->dl_lock);
	/*
	 * Prefetch up to 128 deadlists first and then more as we progress.
	 * The limit is a balance between ARC use and diminishing returns.
	 */
	for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
	    (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128;
	    zap_cursor_advance(&pzc), i++) {
		dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
		    zfs_strtonum(pza->za_name, NULL));
	}
	for (zap_cursor_init(&zc, dl->dl_os, obj);
	    (error = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
		    zfs_strtonum(za->za_name, NULL), tx);
		VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
		if (perror == 0) {
			dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
			    zfs_strtonum(pza->za_name, NULL));
			zap_cursor_advance(&pzc);
			perror = zap_cursor_retrieve(&pzc, pza);
		}
	}
	VERIFY3U(error, ==, ENOENT);
	zap_cursor_fini(&zc);
	zap_cursor_fini(&pzc);

	VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
	dlp = bonus->db_data;
	dmu_buf_will_dirty(bonus, tx);
	memset(dlp, 0, sizeof (*dlp));
	dmu_buf_rele(bonus, FTAG);
	mutex_exit(&dl->dl_lock);

	zap_attribute_free(za);
	zap_attribute_free(pza);
}

/*
 * Remove entries on dl that are born > mintxg, and put them on the bpobj.
 */
void
dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle, *pdle;
	avl_index_t where;
	int i;

	ASSERT(!dl->dl_oldfmt);

	mutex_enter(&dl->dl_lock);
	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dsl_deadlist_load_tree(dl);

	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
	/*
	 * Prefetch up to 128 deadlists first and then more as we progress.
	 * The limit is a balance between ARC use and diminishing returns.
	 */
	for (pdle = dle, i = 0; pdle && i < 128; i++) {
		bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
		pdle = AVL_NEXT(&dl->dl_tree, pdle);
	}
	while (dle) {
		uint64_t used, comp, uncomp;
		dsl_deadlist_entry_t *dle_next;

		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
		if (pdle) {
			bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
			pdle = AVL_NEXT(&dl->dl_tree, pdle);
		}

		VERIFY0(bpobj_space(&dle->dle_bpobj,
		    &used, &comp, &uncomp));
		ASSERT3U(dl->dl_phys->dl_used, >=, used);
		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
		dl->dl_phys->dl_used -= used;
		dl->dl_phys->dl_comp -= comp;
		dl->dl_phys->dl_uncomp -= uncomp;

		VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, tx));

		dle_next = AVL_NEXT(&dl->dl_tree, dle);
		avl_remove(&dl->dl_tree, dle);
		bpobj_close(&dle->dle_bpobj);
		kmem_free(dle, sizeof (*dle));
		dle = dle_next;
	}
	mutex_exit(&dl->dl_lock);
}

typedef struct livelist_entry {
	blkptr_t le_bp;
	uint32_t le_refcnt;
	avl_node_t le_node;
} livelist_entry_t;

static int
livelist_compare(const void *larg, const void *rarg)
{
	const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
	const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;

	/* Sort them according to dva[0] */
	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
	uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);

	if (l_dva0_vdev != r_dva0_vdev)
		return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));

	/* if vdevs are equal, sort by offsets. */
	uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
	uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
	return (TREE_CMP(l_dva0_offset, r_dva0_offset));
}

struct livelist_iter_arg {
	avl_tree_t *avl;
	bplist_t *to_free;
	zthr_t *t;
};

/*
 * Expects an AVL tree which is incrementally filled with FREE blkptrs
 * and used to match up ALLOC/FREE pairs.  ALLOC'd blkptrs without a
 * corresponding FREE are stored in the supplied bplist.
 *
 * Note that multiple FREE and ALLOC entries for the same blkptr may be
 * encountered when dedup or block cloning is involved.  For this reason we
 * keep a refcount for all the FREE entries of each blkptr and ensure that
 * each of those FREE entries has a corresponding ALLOC preceding it.
 */
static int
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	struct livelist_iter_arg *lia = arg;
	avl_tree_t *avl = lia->avl;
	bplist_t *to_free = lia->to_free;
	zthr_t *t = lia->t;
	ASSERT0P(tx);

	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
		return (SET_ERROR(EINTR));

	livelist_entry_t node;
	node.le_bp = *bp;
	livelist_entry_t *found = avl_find(avl, &node, NULL);
	if (found) {
		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
		ASSERT3U(BP_GET_CHECKSUM(bp), ==,
		    BP_GET_CHECKSUM(&found->le_bp));
		ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==,
		    BP_GET_PHYSICAL_BIRTH(&found->le_bp));
	}
	if (bp_freed) {
		if (found == NULL) {
			/* first free entry for this blkptr */
			livelist_entry_t *e =
			    kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
			e->le_bp = *bp;
			e->le_refcnt = 1;
			avl_add(avl, e);
		} else {
			/*
			 * Deduped or cloned block free.  We could assert the
			 * D bit for dedup, but there is no such bit for
			 * cloning.
			 */
			ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
			found->le_refcnt++;
		}
	} else {
		if (found == NULL) {
			/* block is currently marked as allocated */
			bplist_append(to_free, bp);
		} else {
			/* alloc matches a free entry */
			ASSERT3U(found->le_refcnt, !=, 0);
			found->le_refcnt--;
			if (found->le_refcnt == 0) {
				/* all tracked free pairs have been matched */
				avl_remove(avl, found);
				kmem_free(found, sizeof (livelist_entry_t));
			}
		}
	}
	return (0);
}

/*
 * Accepts a bpobj and a bplist.  Will insert into the bplist the blkptrs
 * which have an ALLOC entry but no matching FREE.
 */
int
dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
    uint64_t *size)
{
	avl_tree_t avl;
	avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
	    offsetof(livelist_entry_t, le_node));

	/* process the sublist */
	struct livelist_iter_arg arg = {
		.avl = &avl,
		.to_free = to_free,
		.t = t
	};
	int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
	VERIFY(err != 0 || avl_numnodes(&avl) == 0);

	void *cookie = NULL;
	livelist_entry_t *le = NULL;
	while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) {
		kmem_free(le, sizeof (livelist_entry_t));
	}
	avl_destroy(&avl);
	return (err);
}
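
/*
 * Usage sketch (hypothetical, for illustration only; the real callers live
 * elsewhere in the DSL/SPA code): a caller typically collects the unmatched
 * ALLOCs into a bplist and then frees them in syncing context, e.g.:
 *
 *	bplist_t to_free;
 *	uint64_t size = 0;
 *	bplist_create(&to_free);
 *	int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
 *	    NULL, &size);
 *	if (err == 0)
 *		bplist_iterate(&to_free, free_blocks_cb, cb_arg, tx);
 *	bplist_destroy(&to_free);
 *
 * where free_blocks_cb and cb_arg stand in for the caller's block-freeing
 * callback and its state.
 */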

ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW,
	"Size to start the next sub-livelist in a livelist");

ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,
	"Threshold at which livelist is disabled");