/* Path: sys/contrib/openzfs/module/zfs/bpobj.c */
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			/*
			 * First use of the feature: allocate the shared
			 * empty bpobj and record it in the pool directory.
			 */
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}

/*
 * Drop a reference on the shared empty bpobj; when the last reference
 * goes away, remove its pool-directory entry and free the object.
 */
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_EMPTY_BPOBJ)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}

/*
 * Allocate a new bpobj object.  The bonus-buffer (header) size depends on
 * which on-disk format the pool supports: V0 (no accounting), V1 (adds
 * comp/uncomp accounting), V2 (adds subobjs), or the full bpobj_phys_t
 * (adds freed-BP accounting, used by livelists).
 */
uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_LIVELIST))
		size = BPOBJ_SIZE_V2;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}

/*
 * Free a bpobj and (recursively) all of its subobjs.  Iterates the subobj
 * array backwards so each dbuf of the array is held only once.
 */
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	/* Number of subobj entries per block of the subobj array. */
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

/*
 * Open a bpobj: validate the object's type, hold its bonus buffer, and
 * fill in the in-core bpobj_t.  The bonus-buffer size tells us which
 * optional header fields (comp/subobj/freed accounting) are present.
 */
int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	memset(bpo, 0, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT0P(bpo->bpo_dbuf);
	ASSERT0P(bpo->bpo_phys);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}

boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
	return (bpo->bpo_object != 0);
}

void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}

static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
	ASSERT(MUTEX_HELD(&bpo->bpo_lock));
	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
	mutex_enter(&bpo->bpo_lock);
	boolean_t is_empty = bpobj_is_empty_impl(bpo);
	mutex_exit(&bpo->bpo_lock);
	return (is_empty);
}

/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space.  Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subjobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subojs, free it.
 */
typedef struct bpobj_info {
	bpobj_t *bpi_bpo;
	/*
	 * This object is a subobj of bpi_parent,
	 * at bpi_index in its subobj array.
	 */
	struct bpobj_info *bpi_parent;
	uint64_t bpi_index;
	/* How many of our subobj's are left to process. */
	uint64_t bpi_unprocessed_subobjs;
	/* True after having visited this bpo's directly referenced BPs. */
	boolean_t bpi_visited;
	list_node_t bpi_node;
} bpobj_info_t;

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
	bpi->bpi_bpo = bpo;
	bpi->bpi_parent = parent;
	bpi->bpi_index = index;
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
	}
	return (bpi);
}

/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{

	for (; bpi != NULL; bpi = bpi->bpi_parent) {
		bpobj_t *p = bpi->bpi_bpo;
		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
		p->bpo_phys->bpo_bytes -= freed;
		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
		if (p->bpo_havecomp) {
			p->bpo_phys->bpo_comp -= comp_freed;
			p->bpo_phys->bpo_uncomp -= uncomp_freed;
		}
	}
}

/*
 * Visit (and, if "free" is set, remove) the BPs directly stored in this
 * bpobj, from the last entry down to index "start".  Iterating backwards
 * lets the "free" case truncate the tail with a single dmu_free_range(),
 * and a sliding prefetch window (bounded by dmu_prefetch_max) is issued
 * ahead of the reads.
 */
static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
	int err = 0;
	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
	dmu_buf_t *dbuf = NULL;
	bpobj_t *bpo = bpi->bpi_bpo;

	int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
	uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
	    sizeof (blkptr_t);
	uint64_t ps = start * sizeof (blkptr_t);
	uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
	    ps);
	if (pe > pb) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
		    ZIO_PRIORITY_ASYNC_READ);
	}
	for (; i >= start; i--) {
		uint64_t offset = i * sizeof (blkptr_t);
		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
			    offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
			if (err)
				break;
			/* Advance the prefetch window behind this dbuf. */
			pe = pb;
			pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
			    dbuf->db_offset - dmu_prefetch_max : 0, ps);
			if (pe > pb) {
				dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
				    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
			}
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		blkptr_t *bparray = dbuf->db_data;
		blkptr_t *bp = &bparray[blkoff];

		boolean_t bp_freed = BP_GET_FREE(bp);
		err = func(arg, bp, bp_freed, tx);
		if (err)
			break;

		if (free) {
			/* FREE entries cancel out ALLOC entries. */
			int sign = bp_freed ? -1 : +1;
			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
			freed += sign * bp_get_dsize_sync(spa, bp);
			comp_freed += sign * BP_GET_PSIZE(bp);
			uncomp_freed += sign * BP_GET_UCSIZE(bp);
			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
			if (bp_freed) {
				ASSERT(bpo->bpo_havefreed);
				bpo->bpo_phys->bpo_num_freed--;
				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
			}
		}
	}
	if (free) {
		propagate_space_reduction(bpi, freed, comp_freed,
		    uncomp_freed, tx);
		VERIFY0(dmu_free_range(bpo->bpo_os,
		    bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    DMU_OBJECT_END, tx));
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	return (err);
}

/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo.  If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack.  By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count.  We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
	list_t stack;
	bpobj_info_t *bpi;
	int err = 0;

	/*
	 * Create a "stack" for us to work with without worrying about
	 * stack overflows.  Initialize it with the initial_bpo.
	 */
	list_create(&stack, sizeof (bpobj_info_t),
	    offsetof(bpobj_info_t, bpi_node));
	mutex_enter(&initial_bpo->bpo_lock);

	if (bpobj_size != NULL)
		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

	while ((bpi = list_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;

		ASSERT3P(bpo, !=, NULL);
		ASSERT(MUTEX_HELD(&bpo->bpo_lock));
		ASSERT(bpobj_is_open(bpo));

		if (free)
			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

		if (bpi->bpi_visited == B_FALSE) {
			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
			    free);
			bpi->bpi_visited = B_TRUE;
			if (err != 0)
				break;
		}
		/*
		 * We've finished with this bpo's directly-referenced BP's and
		 * it has no more unprocessed subobjs.  We can free its
		 * bpobj_info_t (unless it is the topmost, initial_bpo).
		 * If we are freeing from disk, we can also do that.
		 */
		if (bpi->bpi_unprocessed_subobjs == 0) {
			/*
			 * If there are no entries, there should
			 * be no bytes.
			 */
			if (bpobj_is_empty_impl(bpo)) {
				ASSERT0(bpo->bpo_phys->bpo_bytes);
				ASSERT0(bpo->bpo_phys->bpo_comp);
				ASSERT0(bpo->bpo_phys->bpo_uncomp);
			}

			/* The initial_bpo has no parent and is not closed. */
			if (bpi->bpi_parent != NULL) {
				if (free) {
					bpobj_t *p = bpi->bpi_parent->bpi_bpo;

					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
					    >, 0);
					ASSERT3U(bpi->bpi_index, ==,
					    p->bpo_phys->bpo_num_subobjs - 1);
					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
					    tx));

					p->bpo_phys->bpo_num_subobjs--;

					VERIFY0(dmu_free_range(p->bpo_os,
					    p->bpo_phys->bpo_subobjs,
					    bpi->bpi_index * sizeof (uint64_t),
					    sizeof (uint64_t), tx));

					/* eliminate the empty subobj list */
					if (bpo->bpo_havesubobj &&
					    bpo->bpo_phys->bpo_subobjs != 0) {
						ASSERT0(bpo->bpo_phys->
						    bpo_num_subobjs);
						err = dmu_object_free(
						    bpo->bpo_os,
						    bpo->bpo_phys->bpo_subobjs,
						    tx);
						if (err)
							break;
						bpo->bpo_phys->bpo_subobjs = 0;
					}
					err = dmu_object_free(p->bpo_os,
					    bpo->bpo_object, tx);
					if (err)
						break;
				}

				mutex_exit(&bpo->bpo_lock);
				bpobj_close(bpo);
				kmem_free(bpo, sizeof (bpobj_t));
			} else {
				mutex_exit(&bpo->bpo_lock);
			}

			/*
			 * Finished processing this bpo.  Unlock, and free
			 * our "stack" info.
			 */
			list_remove_head(&stack);
			kmem_free(bpi, sizeof (bpobj_info_t));
		} else {
			/*
			 * We have unprocessed subobjs.  Process the next one.
			 */
			ASSERT(bpo->bpo_havecomp);
			ASSERT0P(bpobj_size);

			/* Add the last subobj to stack. */
			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
			uint64_t offset = i * sizeof (uint64_t);

			uint64_t subobj;
			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
			    offset, sizeof (uint64_t), &subobj,
			    DMU_READ_NO_PREFETCH);
			if (err)
				break;

			bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
			    KM_SLEEP);
			err = bpobj_open(subbpo, bpo->bpo_os, subobj);
			if (err) {
				kmem_free(subbpo, sizeof (bpobj_t));
				break;
			}

			if (subbpo->bpo_havesubobj &&
			    subbpo->bpo_phys->bpo_subobjs != 0) {
				dmu_prefetch(subbpo->bpo_os,
				    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
				    ZIO_PRIORITY_ASYNC_READ);
			}

			list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
			mutex_enter(&subbpo->bpo_lock);
			bpi->bpi_unprocessed_subobjs--;
		}
	}
	/*
	 * Cleanup anything left on the "stack" after we left the loop.
	 * Every bpo on the stack is locked so we must remember to undo
	 * that now (in LIFO order).
	 */
	while ((bpi = list_remove_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;
		ASSERT(err != 0);
		ASSERT3P(bpo, !=, NULL);

		mutex_exit(&bpo->bpo_lock);

		/* do not free the initial_bpo */
		if (bpi->bpi_parent != NULL) {
			bpobj_close(bpi->bpi_bpo);
			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
		}
		kmem_free(bpi, sizeof (bpobj_info_t));
	}

	list_destroy(&stack);

	return (err);
}

/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}

/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj.  Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}

/*
 * Iterate over the blkptrs in the bpobj beginning at index start.  If func
 * returns nonzero, iteration will stop.  This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
	if (bpo->bpo_havesubobj)
		VERIFY0(bpo->bpo_phys->bpo_subobjs);
	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
	kmem_free(bpi, sizeof (bpobj_info_t));
	return (err);
}

/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                       +--------------+
 * | sub-bpobj    |----------------------> | subsubobj   |
 * +----+----+----+----+---------+----+   +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |   | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+   +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                       +--------------+
 * | sub-bpobj    |----------------------> | subsubobj   |
 * +----+----+----+----+---------+----+   +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |   | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+   +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                          ^                                ^
 * +--------------+         |             +--------------+   |
 * | sub-bpobj    |---------^------------> | subsubobj   |   ^
 * +----+----+----+         |             +-----+-----+--+   |
 * | BP | BP |-->-->-->-->-/              | OBJ | OBJ |-->-/
 * +----+----+                            +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                           ^
 * +--------------+                       +--------------+   |
 * | sub-bpobj    |----------------------> | subsubobj   |   ^
 * +----+----+----+----+---------+----+   +-----+-----+--+   |
 * | bp | bp | bp | bp |   ...   | bp |   | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+   +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                       |
 *       /--------------------------------------------------------------/
 *       v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	if (bpobj_is_empty(&subbpo)) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	dmu_object_info_t doi;

	if (bpo->bpo_phys->bpo_subobjs != 0) {
		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    &doi));
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	}

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly.  This reduces recursion in
	 * bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset > doi.doi_data_block_size) {
			copy_subsub = B_FALSE;
		}
	}

	/*
	 * If, in addition to having only one block of subobj's, subobj has
	 * only one block of bp's, then move subobj's bp's to bpo's bp list
	 * directly.  This reduces recursion in bpobj_iterate due to nested
	 * subobjs.
	 */
	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
		copy_bps = B_FALSE;
	}

	if (copy_subsub && subsubobjs != 0) {
		dmu_buf_t *subdb;
		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
		    0, FTAG, &subdb, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(subdb->db_size, >=,
		    numsubsub * sizeof (subobj));
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}
		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    numsubsub * sizeof (subobj), subdb->db_data, tx);
		dmu_buf_rele(subdb, FTAG);
		bpo->bpo_phys->bpo_num_subobjs += numsubsub;

		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
		subbpo.bpo_phys->bpo_subobjs = 0;
		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
	}

	if (copy_bps) {
		dmu_buf_t *bps;
		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

		ASSERT(copy_subsub);
		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
		    0, FTAG, &bps, 0));

		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
		dmu_write(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    numbps * sizeof (blkptr_t),
		    bps->db_data, tx);
		dmu_buf_rele(bps, FTAG);
		bpo->bpo_phys->bpo_num_blkptrs += numbps;

		bpobj_close(&subbpo);
		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
	} else {
		bpobj_close(&subbpo);
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}

		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    sizeof (subobj), &subobj, tx);
		bpo->bpo_phys->bpo_num_subobjs++;
	}

	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);

}

/*
 * Prefetch metadata required for bpobj_enqueue_subobj().
 */
void
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
{
	dmu_object_info_t doi;
	bpobj_t subbpo;
	uint64_t subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
		return;

	if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
		return;
	if (bpobj_is_empty(&subbpo)) {
		bpobj_close(&subbpo);
		return;
	}
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	bpobj_close(&subbpo);

	/*
	 * Mirror the copy_subsub/copy_bps decision logic of
	 * bpobj_enqueue_subobj() so we prefetch exactly the blocks
	 * that call will touch.
	 */
	if (subsubobjs != 0) {
		if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
			return;
		if (doi.doi_max_offset > doi.doi_data_block_size)
			copy_subsub = B_FALSE;
	}

	if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
		return;
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
		copy_bps = B_FALSE;

	if (copy_subsub && subsubobjs != 0) {
		if (bpo->bpo_phys->bpo_subobjs) {
			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
			    ZIO_PRIORITY_ASYNC_READ);
		}
		dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}

	if (copy_bps) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
		    ZIO_PRIORITY_ASYNC_READ);
		dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	} else if (bpo->bpo_phys->bpo_subobjs) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}
}

/*
 * Append a single BP (marked allocated or freed) to the bpobj and update
 * the header's space accounting.
 */
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * The bpobj will compress better without the payload.
		 *
		 * Note that we store EMBEDDED bp's because they have an
		 * uncompressed size, which must be accounted for.  An
		 * alternative would be to add their size to bpo_uncomp
		 * without storing the bp, but that would create additional
		 * complications: bpo_uncomp would be inconsistent with the
		 * set of BP's stored, and bpobj_iterate() wouldn't visit
		 * all the space accounted for in the bpobj.
		 */
		memset(&stored_bp, 0, sizeof (stored_bp));
		stored_bp.blk_prop = bp->blk_prop;
		BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
	} else if (!BP_GET_DEDUP(bp)) {
		/* The bpobj will compress better without the checksum */
		memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
	}

	stored_bp.blk_fill = 0;
	BP_SET_FREE(&stored_bp, bp_freed);

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
		ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	int sign = bp_freed ? -1 : +1;
	bpo->bpo_phys->bpo_bytes += sign *
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
	}
	if (bp_freed) {
		ASSERT(bpo->bpo_havefreed);
		bpo->bpo_phys->bpo_num_freed++;
	}
	mutex_exit(&bpo->bpo_lock);
}

struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	struct space_range_arg *sra = arg;

	if (BP_GET_BIRTH(bp) > sra->mintxg &&
	    BP_GET_BIRTH(bp) <= sra->maxtxg) {
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}

int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < logical birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	ASSERT(bpobj_is_open(bpo));

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}

/*
 * A bpobj_itor_t to append blkptrs to a bplist.  Note that while blkptrs in a
 * bpobj are designated as free or allocated that information is not preserved
 * in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	bplist_t *bpl = arg;
	bplist_append(bpl, bp);
	return (0);
}