Path: blob/main/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
48383 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.23* Copyright (c) 2012, 2019 by Delphix. All rights reserved.24* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.25*/2627#include <sys/dmu.h>28#include <sys/zap.h>29#include <sys/zfs_context.h>30#include <sys/dsl_pool.h>31#include <sys/dsl_dataset.h>3233/*34* Deadlist concurrency:35*36* Deadlists can only be modified from the syncing thread.37*38* Except for dsl_deadlist_insert(), it can only be modified with the39* dp_config_rwlock held with RW_WRITER.40*41* The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can42* be called concurrently, from open context, with the dl_config_rwlock held43* with RW_READER.44*45* Therefore, we only need to provide locking between dsl_deadlist_insert() and46* the accessors, protecting:47* dl_phys->dl_used,comp,uncomp48* and protecting the dl_tree from being loaded.49* The locking is provided by dl_lock. Note that locking on the bpobj_t50* provides its own locking, and dl_oldfmt is immutable.51*/5253/*54* Livelist Overview55* ================56*57* Livelists use the same 'deadlist_t' struct as deadlists and are also used58* to track blkptrs over the lifetime of a dataset. Livelists however, belong59* to clones and track the blkptrs that are clone-specific (were born after60* the clone's creation). The exception is embedded block pointers which are61* not included in livelists because they do not need to be freed.62*63* When it comes time to delete the clone, the livelist provides a quick64* reference as to what needs to be freed. For this reason, livelists also track65* when clone-specific blkptrs are freed before deletion to prevent double66* frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the67* deletion algorithm iterates backwards over the livelist, matching68* FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists69* are also updated in the case when blkptrs are remapped: the old version70* of the blkptr is cancelled out with a FREE and the new version is tracked71* with an ALLOC.72*73* To bound the amount of memory required for deletion, livelists over a74* certain size are spread over multiple entries. Entries are grouped by75* birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will76* be in the same entry. This allows us to delete livelists incrementally77* over multiple syncs, one entry at a time.78*79* During the lifetime of the clone, livelists can get extremely large.80* Their size is managed by periodic condensing (preemptively cancelling out81* FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when82* the shared space between the clone and its origin is so small that it83* doesn't make sense to use livelists anymore.84*/8586/*87* The threshold sublist size at which we create a new sub-livelist for the88* next txg. However, since blkptrs of the same transaction group must be in89* the same sub-list, the actual sublist size may exceed this. When picking the90* size we had to balance the fact that larger sublists mean fewer sublists91* (decreasing the cost of insertion) against the consideration that sublists92* will be loaded into memory and shouldn't take up an inordinate amount of93* space. We settled on ~500000 entries, corresponding to roughly 128M.94*/95uint64_t zfs_livelist_max_entries = 500000;9697/*98* We can approximate how much of a performance gain a livelist will give us99* based on the percentage of blocks shared between the clone and its origin.100* 0 percent shared means that the clone has completely diverged and that the101* old method is maximally effective: every read from the block tree will102* result in lots of frees. Livelists give us gains when they track blocks103* scattered across the tree, when one read in the old method might only104* result in a few frees. Once the clone has been overwritten enough,105* writes are no longer sparse and we'll no longer get much of a benefit from106* tracking them with a livelist. We chose a lower limit of 75 percent shared107* (25 percent overwritten). This means that 1/4 of all block pointers will be108* freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists109* to make deletion 4x faster. Once the amount of shared space drops below this110* threshold, the clone will revert to the old deletion method.111*/112int zfs_livelist_min_percent_shared = 75;113114static int115dsl_deadlist_compare(const void *arg1, const void *arg2)116{117const dsl_deadlist_entry_t *dle1 = arg1;118const dsl_deadlist_entry_t *dle2 = arg2;119120return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));121}122123static int124dsl_deadlist_cache_compare(const void *arg1, const void *arg2)125{126const dsl_deadlist_cache_entry_t *dlce1 = arg1;127const dsl_deadlist_cache_entry_t *dlce2 = arg2;128129return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));130}131132static void133dsl_deadlist_load_tree(dsl_deadlist_t *dl)134{135zap_cursor_t zc;136zap_attribute_t *za;137int error;138139ASSERT(MUTEX_HELD(&dl->dl_lock));140141ASSERT(!dl->dl_oldfmt);142if (dl->dl_havecache) {143/*144* After loading the tree, the caller may modify the tree,145* e.g. to add or remove nodes, or to make a node no longer146* refer to the empty_bpobj. These changes would make the147* dl_cache incorrect. Therefore we discard the cache here,148* so that it can't become incorrect.149*/150dsl_deadlist_cache_entry_t *dlce;151void *cookie = NULL;152while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))153!= NULL) {154kmem_free(dlce, sizeof (*dlce));155}156avl_destroy(&dl->dl_cache);157dl->dl_havecache = B_FALSE;158}159if (dl->dl_havetree)160return;161162za = zap_attribute_alloc();163avl_create(&dl->dl_tree, dsl_deadlist_compare,164sizeof (dsl_deadlist_entry_t),165offsetof(dsl_deadlist_entry_t, dle_node));166for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);167(error = zap_cursor_retrieve(&zc, za)) == 0;168zap_cursor_advance(&zc)) {169dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);170dle->dle_mintxg = zfs_strtonum(za->za_name, NULL);171172/*173* Prefetch all the bpobj's so that we do that i/o174* in parallel. Then open them all in a second pass.175*/176dle->dle_bpobj.bpo_object = za->za_first_integer;177dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,178ZIO_PRIORITY_SYNC_READ);179180avl_add(&dl->dl_tree, dle);181}182VERIFY3U(error, ==, ENOENT);183zap_cursor_fini(&zc);184zap_attribute_free(za);185186for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);187dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {188VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,189dle->dle_bpobj.bpo_object));190}191dl->dl_havetree = B_TRUE;192}193194/*195* Load only the non-empty bpobj's into the dl_cache. The cache is an analog196* of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It197* is used only for gathering space statistics. The dl_cache has two198* advantages over the dl_tree:199*200* 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's201* mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj202* many times and to inquire about its (zero) space stats many times.203*204* 2. The dl_cache uses less memory than the dl_tree. We only need to load205* the dl_tree of snapshots when deleting a snapshot, after which we free the206* dl_tree with dsl_deadlist_discard_tree207*/208static void209dsl_deadlist_load_cache(dsl_deadlist_t *dl)210{211zap_cursor_t zc;212zap_attribute_t *za;213int error;214215ASSERT(MUTEX_HELD(&dl->dl_lock));216217ASSERT(!dl->dl_oldfmt);218if (dl->dl_havecache)219return;220221uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;222223avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,224sizeof (dsl_deadlist_cache_entry_t),225offsetof(dsl_deadlist_cache_entry_t, dlce_node));226za = zap_attribute_alloc();227for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);228(error = zap_cursor_retrieve(&zc, za)) == 0;229zap_cursor_advance(&zc)) {230if (za->za_first_integer == empty_bpobj)231continue;232dsl_deadlist_cache_entry_t *dlce =233kmem_zalloc(sizeof (*dlce), KM_SLEEP);234dlce->dlce_mintxg = zfs_strtonum(za->za_name, NULL);235236/*237* Prefetch all the bpobj's so that we do that i/o238* in parallel. Then open them all in a second pass.239*/240dlce->dlce_bpobj = za->za_first_integer;241dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,242ZIO_PRIORITY_SYNC_READ);243avl_add(&dl->dl_cache, dlce);244}245VERIFY3U(error, ==, ENOENT);246zap_cursor_fini(&zc);247zap_attribute_free(za);248249for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);250dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {251bpobj_t bpo;252VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));253254VERIFY0(bpobj_space(&bpo,255&dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));256bpobj_close(&bpo);257}258dl->dl_havecache = B_TRUE;259}260261/*262* Discard the tree to save memory.263*/264void265dsl_deadlist_discard_tree(dsl_deadlist_t *dl)266{267mutex_enter(&dl->dl_lock);268269if (!dl->dl_havetree) {270mutex_exit(&dl->dl_lock);271return;272}273dsl_deadlist_entry_t *dle;274void *cookie = NULL;275while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {276bpobj_close(&dle->dle_bpobj);277kmem_free(dle, sizeof (*dle));278}279avl_destroy(&dl->dl_tree);280281dl->dl_havetree = B_FALSE;282mutex_exit(&dl->dl_lock);283}284285void286dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)287{288dsl_deadlist_entry_t *dle;289290ASSERT(dsl_deadlist_is_open(dl));291292mutex_enter(&dl->dl_lock);293dsl_deadlist_load_tree(dl);294mutex_exit(&dl->dl_lock);295for (dle = avl_first(&dl->dl_tree); dle != NULL;296dle = AVL_NEXT(&dl->dl_tree, dle)) {297if (func(args, dle) != 0)298break;299}300}301302int303dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)304{305dmu_object_info_t doi;306int err;307308ASSERT(!dsl_deadlist_is_open(dl));309310mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);311dl->dl_os = os;312dl->dl_object = object;313err = dmu_bonus_hold(os, object, dl, &dl->dl_dbuf);314if (err != 0)315return (err);316dmu_object_info_from_db(dl->dl_dbuf, &doi);317if (doi.doi_type == DMU_OT_BPOBJ) {318dmu_buf_rele(dl->dl_dbuf, dl);319dl->dl_dbuf = NULL;320dl->dl_oldfmt = B_TRUE;321return (bpobj_open(&dl->dl_bpobj, os, object));322}323324dl->dl_oldfmt = B_FALSE;325dl->dl_phys = dl->dl_dbuf->db_data;326dl->dl_havetree = B_FALSE;327dl->dl_havecache = B_FALSE;328return (0);329}330331boolean_t332dsl_deadlist_is_open(dsl_deadlist_t *dl)333{334return (dl->dl_os != NULL);335}336337void338dsl_deadlist_close(dsl_deadlist_t *dl)339{340ASSERT(dsl_deadlist_is_open(dl));341mutex_destroy(&dl->dl_lock);342343if (dl->dl_oldfmt) {344dl->dl_oldfmt = B_FALSE;345bpobj_close(&dl->dl_bpobj);346dl->dl_os = NULL;347dl->dl_object = 0;348return;349}350351if (dl->dl_havetree) {352dsl_deadlist_entry_t *dle;353void *cookie = NULL;354while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))355!= NULL) {356bpobj_close(&dle->dle_bpobj);357kmem_free(dle, sizeof (*dle));358}359avl_destroy(&dl->dl_tree);360}361if (dl->dl_havecache) {362dsl_deadlist_cache_entry_t *dlce;363void *cookie = NULL;364while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))365!= NULL) {366kmem_free(dlce, sizeof (*dlce));367}368avl_destroy(&dl->dl_cache);369}370dmu_buf_rele(dl->dl_dbuf, dl);371dl->dl_dbuf = NULL;372dl->dl_phys = NULL;373dl->dl_os = NULL;374dl->dl_object = 0;375}376377uint64_t378dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)379{380if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)381return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));382return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,383sizeof (dsl_deadlist_phys_t), tx));384}385386void387dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)388{389dmu_object_info_t doi;390zap_cursor_t zc;391zap_attribute_t *za;392int error;393394VERIFY0(dmu_object_info(os, dlobj, &doi));395if (doi.doi_type == DMU_OT_BPOBJ) {396bpobj_free(os, dlobj, tx);397return;398}399400za = zap_attribute_alloc();401for (zap_cursor_init(&zc, os, dlobj);402(error = zap_cursor_retrieve(&zc, za)) == 0;403zap_cursor_advance(&zc)) {404uint64_t obj = za->za_first_integer;405if (obj == dmu_objset_pool(os)->dp_empty_bpobj)406bpobj_decr_empty(os, tx);407else408bpobj_free(os, obj, tx);409}410VERIFY3U(error, ==, ENOENT);411zap_cursor_fini(&zc);412zap_attribute_free(za);413VERIFY0(dmu_object_free(os, dlobj, tx));414}415416static void417dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,418const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)419{420ASSERT(MUTEX_HELD(&dl->dl_lock));421if (dle->dle_bpobj.bpo_object ==422dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {423uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);424bpobj_close(&dle->dle_bpobj);425bpobj_decr_empty(dl->dl_os, tx);426VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));427VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,428dle->dle_mintxg, obj, tx));429}430bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);431}432433static void434dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,435uint64_t obj, dmu_tx_t *tx)436{437ASSERT(MUTEX_HELD(&dl->dl_lock));438if (dle->dle_bpobj.bpo_object !=439dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {440bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);441} else {442bpobj_close(&dle->dle_bpobj);443bpobj_decr_empty(dl->dl_os, tx);444VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));445VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,446dle->dle_mintxg, obj, tx));447}448}449450/*451* Prefetch metadata required for dle_enqueue_subobj().452*/453static void454dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,455uint64_t obj)456{457if (dle->dle_bpobj.bpo_object !=458dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)459bpobj_prefetch_subobj(&dle->dle_bpobj, obj);460}461462void463dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,464dmu_tx_t *tx)465{466dsl_deadlist_entry_t dle_tofind;467dsl_deadlist_entry_t *dle;468avl_index_t where;469470if (dl->dl_oldfmt) {471bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);472return;473}474475mutex_enter(&dl->dl_lock);476dsl_deadlist_load_tree(dl);477478dmu_buf_will_dirty(dl->dl_dbuf, tx);479480int sign = bp_freed ? -1 : +1;481dl->dl_phys->dl_used +=482sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);483dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);484dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);485486dle_tofind.dle_mintxg = BP_GET_BIRTH(bp);487dle = avl_find(&dl->dl_tree, &dle_tofind, &where);488if (dle == NULL)489dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);490else491dle = AVL_PREV(&dl->dl_tree, dle);492493if (dle == NULL) {494zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",495bp, (longlong_t)BP_GET_BIRTH(bp));496dle = avl_first(&dl->dl_tree);497}498499ASSERT3P(dle, !=, NULL);500dle_enqueue(dl, dle, bp, bp_freed, tx);501mutex_exit(&dl->dl_lock);502}503504int505dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)506{507dsl_deadlist_t *dl = arg;508dsl_deadlist_insert(dl, bp, B_FALSE, tx);509return (0);510}511512int513dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)514{515dsl_deadlist_t *dl = arg;516dsl_deadlist_insert(dl, bp, B_TRUE, tx);517return (0);518}519520/*521* Insert new key in deadlist, which must be > all current entries.522* mintxg is not inclusive.523*/524void525dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)526{527uint64_t obj;528dsl_deadlist_entry_t *dle;529530if (dl->dl_oldfmt)531return;532533dle = kmem_alloc(sizeof (*dle), KM_SLEEP);534dle->dle_mintxg = mintxg;535536mutex_enter(&dl->dl_lock);537dsl_deadlist_load_tree(dl);538539obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);540VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));541avl_add(&dl->dl_tree, dle);542543VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object,544mintxg, obj, tx));545mutex_exit(&dl->dl_lock);546}547548/*549* Remove this key, merging its entries into the previous key.550*/551void552dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)553{554dsl_deadlist_entry_t dle_tofind;555dsl_deadlist_entry_t *dle, *dle_prev;556557if (dl->dl_oldfmt)558return;559mutex_enter(&dl->dl_lock);560dsl_deadlist_load_tree(dl);561562dle_tofind.dle_mintxg = mintxg;563dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);564ASSERT3P(dle, !=, NULL);565dle_prev = AVL_PREV(&dl->dl_tree, dle);566ASSERT3P(dle_prev, !=, NULL);567568dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);569570avl_remove(&dl->dl_tree, dle);571bpobj_close(&dle->dle_bpobj);572kmem_free(dle, sizeof (*dle));573574VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));575mutex_exit(&dl->dl_lock);576}577578/*579* Remove a deadlist entry and all of its contents by removing the entry from580* the deadlist's avl tree, freeing the entry's bpobj and adjusting the581* deadlist's space accounting accordingly.582*/583void584dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)585{586uint64_t used, comp, uncomp;587dsl_deadlist_entry_t dle_tofind;588dsl_deadlist_entry_t *dle;589objset_t *os = dl->dl_os;590591if (dl->dl_oldfmt)592return;593594mutex_enter(&dl->dl_lock);595dsl_deadlist_load_tree(dl);596597dle_tofind.dle_mintxg = mintxg;598dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);599VERIFY3P(dle, !=, NULL);600601avl_remove(&dl->dl_tree, dle);602VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));603VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));604dmu_buf_will_dirty(dl->dl_dbuf, tx);605dl->dl_phys->dl_used -= used;606dl->dl_phys->dl_comp -= comp;607dl->dl_phys->dl_uncomp -= uncomp;608if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {609bpobj_decr_empty(os, tx);610} else {611bpobj_free(os, dle->dle_bpobj.bpo_object, tx);612}613bpobj_close(&dle->dle_bpobj);614kmem_free(dle, sizeof (*dle));615mutex_exit(&dl->dl_lock);616}617618/*619* Clear out the contents of a deadlist_entry by freeing its bpobj,620* replacing it with an empty bpobj and adjusting the deadlist's621* space accounting622*/623void624dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,625dmu_tx_t *tx)626{627uint64_t new_obj, used, comp, uncomp;628objset_t *os = dl->dl_os;629630mutex_enter(&dl->dl_lock);631VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));632VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));633dmu_buf_will_dirty(dl->dl_dbuf, tx);634dl->dl_phys->dl_used -= used;635dl->dl_phys->dl_comp -= comp;636dl->dl_phys->dl_uncomp -= uncomp;637if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)638bpobj_decr_empty(os, tx);639else640bpobj_free(os, dle->dle_bpobj.bpo_object, tx);641bpobj_close(&dle->dle_bpobj);642new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);643VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));644VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,645new_obj, tx));646ASSERT(bpobj_is_empty(&dle->dle_bpobj));647mutex_exit(&dl->dl_lock);648}649650/*651* Return the first entry in deadlist's avl tree652*/653dsl_deadlist_entry_t *654dsl_deadlist_first(dsl_deadlist_t *dl)655{656dsl_deadlist_entry_t *dle;657658mutex_enter(&dl->dl_lock);659dsl_deadlist_load_tree(dl);660dle = avl_first(&dl->dl_tree);661mutex_exit(&dl->dl_lock);662663return (dle);664}665666/*667* Return the last entry in deadlist's avl tree668*/669dsl_deadlist_entry_t *670dsl_deadlist_last(dsl_deadlist_t *dl)671{672dsl_deadlist_entry_t *dle;673674mutex_enter(&dl->dl_lock);675dsl_deadlist_load_tree(dl);676dle = avl_last(&dl->dl_tree);677mutex_exit(&dl->dl_lock);678679return (dle);680}681682/*683* Walk ds's snapshots to regenerate generate ZAP & AVL.684*/685static void686dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,687uint64_t mrs_obj, dmu_tx_t *tx)688{689dsl_deadlist_t dl = { 0 };690dsl_pool_t *dp = dmu_objset_pool(os);691692VERIFY0(dsl_deadlist_open(&dl, os, dlobj));693if (dl.dl_oldfmt) {694dsl_deadlist_close(&dl);695return;696}697698while (mrs_obj != 0) {699dsl_dataset_t *ds;700VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));701dsl_deadlist_add_key(&dl,702dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);703mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;704dsl_dataset_rele(ds, FTAG);705}706dsl_deadlist_close(&dl);707}708709uint64_t710dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,711uint64_t mrs_obj, dmu_tx_t *tx)712{713dsl_deadlist_entry_t *dle;714uint64_t newobj;715716newobj = dsl_deadlist_alloc(dl->dl_os, tx);717718if (dl->dl_oldfmt) {719dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);720return (newobj);721}722723mutex_enter(&dl->dl_lock);724dsl_deadlist_load_tree(dl);725726for (dle = avl_first(&dl->dl_tree); dle;727dle = AVL_NEXT(&dl->dl_tree, dle)) {728uint64_t obj;729730if (dle->dle_mintxg >= maxtxg)731break;732733obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);734VERIFY0(zap_add_int_key(dl->dl_os, newobj,735dle->dle_mintxg, obj, tx));736}737mutex_exit(&dl->dl_lock);738return (newobj);739}740741void742dsl_deadlist_space(dsl_deadlist_t *dl,743uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)744{745ASSERT(dsl_deadlist_is_open(dl));746if (dl->dl_oldfmt) {747VERIFY0(bpobj_space(&dl->dl_bpobj,748usedp, compp, uncompp));749return;750}751752mutex_enter(&dl->dl_lock);753*usedp = dl->dl_phys->dl_used;754*compp = dl->dl_phys->dl_comp;755*uncompp = dl->dl_phys->dl_uncomp;756mutex_exit(&dl->dl_lock);757}758759/*760* return space used in the range (mintxg, maxtxg].761* Includes maxtxg, does not include mintxg.762* mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is763* UINT64_MAX).764*/765void766dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,767uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)768{769dsl_deadlist_cache_entry_t *dlce;770dsl_deadlist_cache_entry_t dlce_tofind;771avl_index_t where;772773if (dl->dl_oldfmt) {774VERIFY0(bpobj_space_range(&dl->dl_bpobj,775mintxg, maxtxg, usedp, compp, uncompp));776return;777}778779*usedp = *compp = *uncompp = 0;780781mutex_enter(&dl->dl_lock);782dsl_deadlist_load_cache(dl);783dlce_tofind.dlce_mintxg = mintxg;784dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);785786/*787* If this mintxg doesn't exist, it may be an empty_bpobj which788* is omitted from the sparse tree. Start at the next non-empty789* entry.790*/791if (dlce == NULL)792dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);793794for (; dlce && dlce->dlce_mintxg < maxtxg;795dlce = AVL_NEXT(&dl->dl_tree, dlce)) {796*usedp += dlce->dlce_bytes;797*compp += dlce->dlce_comp;798*uncompp += dlce->dlce_uncomp;799}800801mutex_exit(&dl->dl_lock);802}803804static void805dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,806dmu_tx_t *tx)807{808dsl_deadlist_entry_t dle_tofind;809dsl_deadlist_entry_t *dle;810avl_index_t where;811uint64_t used, comp, uncomp;812bpobj_t bpo;813814ASSERT(MUTEX_HELD(&dl->dl_lock));815816VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));817VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));818bpobj_close(&bpo);819820dsl_deadlist_load_tree(dl);821822dmu_buf_will_dirty(dl->dl_dbuf, tx);823dl->dl_phys->dl_used += used;824dl->dl_phys->dl_comp += comp;825dl->dl_phys->dl_uncomp += uncomp;826827dle_tofind.dle_mintxg = birth;828dle = avl_find(&dl->dl_tree, &dle_tofind, &where);829if (dle == NULL)830dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);831dle_enqueue_subobj(dl, dle, obj, tx);832}833834/*835* Prefetch metadata required for dsl_deadlist_insert_bpobj().836*/837static void838dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)839{840dsl_deadlist_entry_t dle_tofind;841dsl_deadlist_entry_t *dle;842avl_index_t where;843844ASSERT(MUTEX_HELD(&dl->dl_lock));845846dsl_deadlist_load_tree(dl);847848dle_tofind.dle_mintxg = birth;849dle = avl_find(&dl->dl_tree, &dle_tofind, &where);850if (dle == NULL)851dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);852dle_prefetch_subobj(dl, dle, obj);853}854855static int856dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,857dmu_tx_t *tx)858{859dsl_deadlist_t *dl = arg;860dsl_deadlist_insert(dl, bp, bp_freed, tx);861return (0);862}863864/*865* Merge the deadlist pointed to by 'obj' into dl. obj will be left as866* an empty deadlist.867*/868void869dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)870{871zap_cursor_t zc, pzc;872zap_attribute_t *za, *pza;873dmu_buf_t *bonus;874dsl_deadlist_phys_t *dlp;875dmu_object_info_t doi;876int error, perror, i;877878VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));879if (doi.doi_type == DMU_OT_BPOBJ) {880bpobj_t bpo;881VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));882VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));883bpobj_close(&bpo);884return;885}886887za = zap_attribute_alloc();888pza = zap_attribute_alloc();889890mutex_enter(&dl->dl_lock);891/*892* Prefetch up to 128 deadlists first and then more as we progress.893* The limit is a balance between ARC use and diminishing returns.894*/895for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;896(perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128;897zap_cursor_advance(&pzc), i++) {898dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,899zfs_strtonum(pza->za_name, NULL));900}901for (zap_cursor_init(&zc, dl->dl_os, obj);902(error = zap_cursor_retrieve(&zc, za)) == 0;903zap_cursor_advance(&zc)) {904dsl_deadlist_insert_bpobj(dl, za->za_first_integer,905zfs_strtonum(za->za_name, NULL), tx);906VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));907if (perror == 0) {908dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,909zfs_strtonum(pza->za_name, NULL));910zap_cursor_advance(&pzc);911perror = zap_cursor_retrieve(&pzc, pza);912}913}914VERIFY3U(error, ==, ENOENT);915zap_cursor_fini(&zc);916zap_cursor_fini(&pzc);917918VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));919dlp = bonus->db_data;920dmu_buf_will_dirty(bonus, tx);921memset(dlp, 0, sizeof (*dlp));922dmu_buf_rele(bonus, FTAG);923mutex_exit(&dl->dl_lock);924925zap_attribute_free(za);926zap_attribute_free(pza);927}928929/*930* Remove entries on dl that are born > mintxg, and put them on the bpobj.931*/932void933dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,934dmu_tx_t *tx)935{936dsl_deadlist_entry_t dle_tofind;937dsl_deadlist_entry_t *dle, *pdle;938avl_index_t where;939int i;940941ASSERT(!dl->dl_oldfmt);942943mutex_enter(&dl->dl_lock);944dmu_buf_will_dirty(dl->dl_dbuf, tx);945dsl_deadlist_load_tree(dl);946947dle_tofind.dle_mintxg = mintxg;948dle = avl_find(&dl->dl_tree, &dle_tofind, &where);949if (dle == NULL)950dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);951/*952* Prefetch up to 128 deadlists first and then more as we progress.953* The limit is a balance between ARC use and diminishing returns.954*/955for (pdle = dle, i = 0; pdle && i < 128; i++) {956bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);957pdle = AVL_NEXT(&dl->dl_tree, pdle);958}959while (dle) {960uint64_t used, comp, uncomp;961dsl_deadlist_entry_t *dle_next;962963bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);964if (pdle) {965bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);966pdle = AVL_NEXT(&dl->dl_tree, pdle);967}968969VERIFY0(bpobj_space(&dle->dle_bpobj,970&used, &comp, &uncomp));971ASSERT3U(dl->dl_phys->dl_used, >=, used);972ASSERT3U(dl->dl_phys->dl_comp, >=, comp);973ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);974dl->dl_phys->dl_used -= used;975dl->dl_phys->dl_comp -= comp;976dl->dl_phys->dl_uncomp -= uncomp;977978VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,979dle->dle_mintxg, tx));980981dle_next = AVL_NEXT(&dl->dl_tree, dle);982avl_remove(&dl->dl_tree, dle);983bpobj_close(&dle->dle_bpobj);984kmem_free(dle, sizeof (*dle));985dle = dle_next;986}987mutex_exit(&dl->dl_lock);988}989990typedef struct livelist_entry {991blkptr_t le_bp;992uint32_t le_refcnt;993avl_node_t le_node;994} livelist_entry_t;995996static int997livelist_compare(const void *larg, const void *rarg)998{999const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;1000const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;10011002/* Sort them according to dva[0] */1003uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);1004uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);10051006if (l_dva0_vdev != r_dva0_vdev)1007return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));10081009/* if vdevs are equal, sort by offsets. */1010uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);1011uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);1012return (TREE_CMP(l_dva0_offset, r_dva0_offset));1013}10141015struct livelist_iter_arg {1016avl_tree_t *avl;1017bplist_t *to_free;1018zthr_t *t;1019};10201021/*1022* Expects an AVL tree which is incrementally filled will FREE blkptrs1023* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a1024* corresponding FREE are stored in the supplied bplist.1025*1026* Note that multiple FREE and ALLOC entries for the same blkptr may be1027* encountered when dedup or block cloning is involved. For this reason we1028* keep a refcount for all the FREE entries of each blkptr and ensure that1029* each of those FREE entries has a corresponding ALLOC preceding it.1030*/1031static int1032dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,1033dmu_tx_t *tx)1034{1035struct livelist_iter_arg *lia = arg;1036avl_tree_t *avl = lia->avl;1037bplist_t *to_free = lia->to_free;1038zthr_t *t = lia->t;1039ASSERT0P(tx);10401041if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))1042return (SET_ERROR(EINTR));10431044livelist_entry_t node;1045node.le_bp = *bp;1046livelist_entry_t *found = avl_find(avl, &node, NULL);1047if (found) {1048ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));1049ASSERT3U(BP_GET_CHECKSUM(bp), ==,1050BP_GET_CHECKSUM(&found->le_bp));1051ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==,1052BP_GET_PHYSICAL_BIRTH(&found->le_bp));1053}1054if (bp_freed) {1055if (found == NULL) {1056/* first free entry for this blkptr */1057livelist_entry_t *e =1058kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);1059e->le_bp = *bp;1060e->le_refcnt = 1;1061avl_add(avl, e);1062} else {1063/*1064* Deduped or cloned block free. We could assert D bit1065* for dedup, but there is no such one for cloning.1066*/1067ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);1068found->le_refcnt++;1069}1070} else {1071if (found == NULL) {1072/* block is currently marked as allocated */1073bplist_append(to_free, bp);1074} else {1075/* alloc matches a free entry */1076ASSERT3U(found->le_refcnt, !=, 0);1077found->le_refcnt--;1078if (found->le_refcnt == 0) {1079/* all tracked free pairs have been matched */1080avl_remove(avl, found);1081kmem_free(found, sizeof (livelist_entry_t));1082}1083}1084}1085return (0);1086}10871088/*1089* Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs1090* which have an ALLOC entry but no matching FREE1091*/1092int1093dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,1094uint64_t *size)1095{1096avl_tree_t avl;1097avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),1098offsetof(livelist_entry_t, le_node));10991100/* process the sublist */1101struct livelist_iter_arg arg = {1102.avl = &avl,1103.to_free = to_free,1104.t = t1105};1106int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);1107VERIFY(err != 0 || avl_numnodes(&avl) == 0);11081109void *cookie = NULL;1110livelist_entry_t *le = NULL;1111while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) {1112kmem_free(le, sizeof (livelist_entry_t));1113}1114avl_destroy(&avl);1115return (err);1116}11171118ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW,1119"Size to start the next sub-livelist in a livelist");11201121ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,1122"Threshold at which livelist is disabled");112311241125