Path: sys/contrib/openzfs/module/zfs/dmu_traverse.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>
#include <sys/zfeature.h>

static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
static int32_t send_holes_without_birth_time = 1;
static uint_t zfs_traverse_indirect_prefetch_limit = 32;

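/*
 * These tunables are exposed as module parameters via the
 * ZFS_MODULE_PARAM() declarations at the bottom of this file. As an
 * illustration (Linux builds; the parameter path is platform-specific),
 * the prefetch budget could be raised to 100MB at runtime with:
 *
 *	echo 104857600 > /sys/module/zfs/parameters/zfs_pd_bytes_max
 */
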
typedef struct prefetch_data {
	kmutex_t pd_mtx;
	kcondvar_t pd_cv;
	int32_t pd_bytes_fetched;
	int pd_flags;
	boolean_t pd_cancel;
	boolean_t pd_exited;
	zbookmark_phys_t pd_resume;
} prefetch_data_t;

typedef struct traverse_data {
	spa_t *td_spa;
	uint64_t td_objset;
	blkptr_t *td_rootbp;
	uint64_t td_min_txg;
	zbookmark_phys_t *td_resume;
	int td_flags;
	prefetch_data_t *td_pfd;
	boolean_t td_paused;
	uint64_t td_hole_birth_enabled_txg;
	blkptr_cb_t *td_func;
	void *td_arg;
	boolean_t td_realloc_possible;
} traverse_data_t;

static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
    const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
    uint64_t objset, uint64_t object);

static inline uint64_t
get_birth_time(traverse_data_t *td, const blkptr_t *bp)
{
	if (td->td_flags & TRAVERSE_LOGICAL)
		return (BP_GET_LOGICAL_BIRTH(bp));
	else
		return (BP_GET_BIRTH(bp));
}

static int
traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
    uint64_t claim_txg)
{
	traverse_data_t *td = arg;
	zbookmark_phys_t zb;

	if (BP_IS_HOLE(bp))
		return (0);

	if (claim_txg == 0 &&
	    get_birth_time(td, bp) >= spa_min_claim_txg(td->td_spa))
		return (-1);

	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);

	return (0);
}

static int
traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
    uint64_t claim_txg)
{
	traverse_data_t *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_phys_t zb;

		if (BP_IS_HOLE(bp))
			return (0);

		if (claim_txg == 0 || get_birth_time(td, bp) < claim_txg)
			return (0);

		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}

static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed; plus blocks that are already stable in read-only mode.
	 */
	if (claim_txg == 0 && spa_writeable(td->td_spa))
		return;

	zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
	zil_free(zilog);
}

typedef enum resume_skip {
	RESUME_SKIP_ALL,
	RESUME_SKIP_NONE,
	RESUME_SKIP_CHILDREN
} resume_skip_t;

/*
 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
 * the block indicated by zb does not need to be visited at all. Returns
 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
 * resume point. This indicates that this block should be visited but not its
 * children (since they must have been visited in a previous traversal).
 * Otherwise returns RESUME_SKIP_NONE.
 */
static resume_skip_t
resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	if (td->td_resume != NULL) {
		/*
		 * If we already visited this bp & everything below,
		 * don't bother doing it again.
		 */
		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
			return (RESUME_SKIP_ALL);

		if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
			if (td->td_flags & TRAVERSE_POST)
				return (RESUME_SKIP_CHILDREN);
		}
	}
	return (RESUME_SKIP_NONE);
}

/*
 * Returns B_TRUE if a prefetch read was issued, otherwise B_FALSE.
 */
static boolean_t
traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_PRESCIENT_PREFETCH;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;

	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
		return (B_FALSE);
	/*
	 * If this bp is before the resume point, it may have already been
	 * freed.
	 */
	if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
		return (B_FALSE);
	if (BP_IS_HOLE(bp) || get_birth_time(td, bp) <= td->td_min_txg)
		return (B_FALSE);
	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
		return (B_FALSE);
	ASSERT(!BP_IS_REDACTED(bp));

	if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
	return (B_TRUE);
}

static boolean_t
prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
{
	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp))
		return (B_FALSE);
	return (B_TRUE);
}

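/*
 * Core recursive visitor: invokes td_func on this block (in pre- and/or
 * post-order, depending on td_flags) and descends into indirect, dnode,
 * and objset blocks. On failure it records a resume point in td_resume
 * (when provided) so a later traversal can pick up at the first
 * unvisited level-0 block.
 */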
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (BP_GET_LOGICAL_BIRTH(bp) == 0) {
		/*
		 * Since this block has a birth time of 0 it must be one of
		 * two things: a hole created before the
		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
		 * which has always been a hole in an object.
		 *
		 * If a file is written sparsely, then the unwritten parts of
		 * the file were "always holes" -- that is, they have been
		 * holes since this object was allocated. However, we (and
		 * our callers) can not necessarily tell when an object was
		 * allocated. Therefore, if it's possible that this object
		 * was freed and then its object number reused, we need to
		 * visit all the holes with birth==0.
		 *
		 * If it isn't possible that the object number was reused,
		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
		 * all the blocks we will visit as part of this traversal,
		 * then this hole must have always existed, so we can skip
		 * it. We visit blocks born after (exclusive) td_min_txg.
		 *
		 * Note that the meta-dnode cannot be reallocated.
		 */
		if (!send_holes_without_birth_time &&
		    (!td->td_realloc_possible ||
		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
			return (0);
	} else if (get_birth_time(td, bp) <= td->td_min_txg) {
		return (0);
	}

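	/*
	 * Throttle against the prefetch thread: traverse_prefetcher()
	 * adds each prefetched block's size to pd_bytes_fetched, and we
	 * subtract it here once the block is actually visited. Waiting
	 * until the prefetcher has fetched at least this block's worth of
	 * data keeps the main traversal from racing ahead of the
	 * read-ahead stream.
	 */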
	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		uint64_t size = BP_GET_LSIZE(bp);
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_bytes_fetched >= 0);
		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
			cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_bytes_fetched -= size;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int32_t i, ptidx, pidx;
		uint32_t prefetchlimit;
		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		zbookmark_phys_t *czb;

		ASSERT(!BP_IS_PROTECTED(bp));

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;

		czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

		/*
		 * When performing a traversal it is beneficial to
		 * asynchronously read-ahead the upcoming indirect
		 * blocks since they will be needed shortly. However,
		 * since a 128k indirect (non-L0) block may contain up
		 * to 1024 128-byte block pointers, it's preferable to not
		 * prefetch them all at once. Issuing a large number of
		 * async reads may affect performance, and the earlier
		 * the indirect blocks are prefetched the less likely
		 * they are to still be resident in the ARC when needed.
		 * Therefore, prefetching indirect blocks is limited to
		 * zfs_traverse_indirect_prefetch_limit=32 blocks by
		 * default.
		 *
		 * pidx: index of the next block pointer to be prefetched.
		 * ptidx: index at which the next batch of prefetches is
		 *	triggered.
		 */
		ptidx = 0;
		pidx = 1;
		prefetchlimit = zfs_traverse_indirect_prefetch_limit;
		for (i = 0; i < epb; i++) {
			if (prefetchlimit && i == ptidx) {
				ASSERT3S(ptidx, <=, pidx);
				for (uint32_t prefetched = 0; pidx < epb &&
				    prefetched < prefetchlimit; pidx++) {
					SET_BOOKMARK(czb, zb->zb_objset,
					    zb->zb_object, zb->zb_level - 1,
					    zb->zb_blkid * epb + pidx);
					if (traverse_prefetch_metadata(td, dnp,
					    &((blkptr_t *)buf->b_data)[pidx],
					    czb) == B_TRUE) {
						prefetched++;
						if (prefetched ==
						    MAX(prefetchlimit / 2, 1))
							ptidx = pidx;
					}
				}
			}

			/* recursively visitbp() blocks below this */
			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp,
			    &((blkptr_t *)buf->b_data)[i], czb);
			if (err != 0)
				break;
		}

		kmem_free(czb, sizeof (zbookmark_phys_t));

	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
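		/*
		 * This is a block of on-disk dnodes. Each dnode occupies
		 * one or more 512-byte slots (dn_extra_slots is the number
		 * of extra slots a large dnode consumes), which is why the
		 * loops below step by dn_extra_slots + 1 rather than 1.
		 */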
		arc_flags_t flags = ARC_FLAG_WAIT;
		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
		int32_t i;
		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		dnode_phys_t *child_dnp;

		/*
		 * dnode blocks might have their bonus buffers encrypted, so
		 * we must be careful to honor TRAVERSE_NO_DECRYPT
		 */
		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		child_dnp = buf->b_data;

		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			prefetch_dnode_metadata(td, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			err = traverse_dnode(td, bp, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		osp = buf->b_data;
		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		/*
		 * See the block comment above for the goal of this variable.
		 * If the maxblkid of the meta-dnode is 0, then we know that
		 * we've never had more than DNODES_PER_BLOCK objects in the
		 * dataset, which means we can't have reused any object ids.
		 */
		if (osp->os_meta_dnode.dn_maxblkid == 0)
			td->td_realloc_possible = B_FALSE;

		if (OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				prefetch_dnode_metadata(td,
				    &osp->os_projectused_dnode,
				    zb->zb_objset, DMU_PROJECTUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
			    zb->zb_objset, DMU_USERUSED_OBJECT);
		}

		err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				err = traverse_dnode(td, bp,
				    &osp->os_projectused_dnode, zb->zb_objset,
				    DMU_PROJECTUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td, bp,
				    &osp->os_groupused_dnode, zb->zb_objset,
				    DMU_GROUPUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td, bp,
				    &osp->os_userused_dnode, zb->zb_objset,
				    DMU_USERUSED_OBJECT);
		}
	}

	if (buf)
		arc_buf_destroy(buf, &buf);

post:
	if (err == 0 && (td->td_flags & TRAVERSE_POST))
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);

	if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
		/*
		 * Ignore this disk error as requested by the HARD flag,
		 * and continue traversal.
		 */
		err = 0;
	}

	/*
	 * If we are stopping here, set td_resume.
	 */
	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
		td->td_resume->zb_objset = zb->zb_objset;
		td->td_resume->zb_object = zb->zb_object;
		td->td_resume->zb_level = 0;
		/*
		 * If we have stopped on an indirect block (e.g. due to
		 * i/o error), we have not visited anything below it.
		 * Set the bookmark to the first level-0 block that we need
		 * to visit. This way, the resuming code does not need to
		 * deal with resuming from indirect blocks.
		 *
		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
		 * to dereference it.
		 */
		td->td_resume->zb_blkid = zb->zb_blkid;
		if (zb->zb_level > 0) {
			td->td_resume->zb_blkid <<= zb->zb_level *
			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
		}
		td->td_paused = B_TRUE;
	}

	return (err);
}

static void
prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j;
	zbookmark_phys_t czb;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb);
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
	}
}

static int
traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j, err = 0;
	zbookmark_phys_t czb;

	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
	    object < td->td_resume->zb_object)
		return (0);

	if (td->td_flags & TRAVERSE_PRE) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
		if (err != 0)
			break;
	}

	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
	}

	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}
	return (err);
}

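/*
 * Callback used by the prefetch thread below. It issues speculative
 * arc_read()s for upcoming blocks, pacing itself so that no more than
 * zfs_pd_bytes_max bytes of prefetched data are outstanding ahead of
 * the main traversal.
 */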
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	(void) zilog, (void) dnp;
	prefetch_data_t *pfd = arg;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_PRESCIENT_PREFETCH;

	ASSERT(pfd->pd_bytes_fetched >= 0);
	if (zb->zb_level == ZB_DNODE_LEVEL)
		return (0);
	if (pfd->pd_cancel)
		return (SET_ERROR(EINTR));

	if (!prefetch_needed(pfd, bp))
		return (0);

	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
		cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    zio_flags, &aflags, zb);

	return (0);
}

static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	zbookmark_phys_t czb;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;
	td.td_pfd = NULL;
	td.td_resume = &td_main->td_pfd->pd_resume;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);

	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
	spl_fstrans_unmark(cookie);
}

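/*
 * Illustrative usage sketch (not part of this file): a minimal
 * blkptr_cb_t that counts non-hole blocks, driven by traverse_dataset().
 * The callback signature matches traverse_prefetcher() above; the
 * "count_cb" and "count" names are hypothetical, and ds is assumed to
 * be a held, non-changing dataset (e.g. a snapshot).
 *
 *	static int
 *	count_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 *	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 *	{
 *		uint64_t *count = arg;
 *
 *		if (!BP_IS_HOLE(bp))
 *			(*count)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0;
 *	(void) traverse_dataset(ds, 0,
 *	    TRAVERSE_PRE | TRAVERSE_PREFETCH_DATA, count_cb, &count);
 */
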
/*
 * NB: dataset must not be changing on-disk (e.g., is a snapshot or we are
 * in syncing context).
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	traverse_data_t *td;
	prefetch_data_t *pd;
	zbookmark_phys_t *czb;
	int err;

	ASSERT(ds == NULL || objset == ds->ds_object);
	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));

	td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
	pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
	czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

	td->td_spa = spa;
	td->td_objset = objset;
	td->td_rootbp = rootbp;
	td->td_min_txg = txg_start;
	td->td_resume = resume;
	td->td_func = func;
	td->td_arg = arg;
	td->td_pfd = pd;
	td->td_flags = flags;
	td->td_paused = B_FALSE;
	td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);

	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
		VERIFY(spa_feature_enabled_txg(spa,
		    SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
	} else {
		td->td_hole_birth_enabled_txg = UINT64_MAX;
	}

	pd->pd_flags = flags;
	if (resume != NULL)
		pd->pd_resume = *resume;
	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);

	SET_BOOKMARK(czb, td->td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;
		ASSERT(!BP_IS_REDACTED(rootbp));

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
		    BP_IS_PROTECTED(rootbp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
		    &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
		if (err != 0) {
			/*
			 * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
			 * continue on to traverse_visitbp() so that td_func
			 * can still be called in the pre stage; err will be
			 * reset to zero there.
			 */
			if (!(td->td_flags & TRAVERSE_HARD) ||
			    !(td->td_flags & TRAVERSE_PRE))
				goto out;
		} else {
			osp = buf->b_data;
			traverse_zil(td, &osp->os_zil_header);
			arc_buf_destroy(buf, &buf);
		}
	}

	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
	    taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread,
	    td, TQ_NOQUEUE) == TASKQID_INVALID)
		pd->pd_exited = B_TRUE;

	err = traverse_visitbp(td, NULL, rootbp, czb);

	mutex_enter(&pd->pd_mtx);
	pd->pd_cancel = B_TRUE;
	cv_broadcast(&pd->pd_cv);
	while (!pd->pd_exited)
		cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
	mutex_exit(&pd->pd_mtx);
out:
	mutex_destroy(&pd->pd_mtx);
	cv_destroy(&pd->pd_cv);

	kmem_free(czb, sizeof (zbookmark_phys_t));
	kmem_free(pd, sizeof (struct prefetch_data));
	kmem_free(td, sizeof (struct traverse_data));

	return (err);
}

/*
 * NB: dataset must not be changing on-disk (e.g., is a snapshot or we are
 * in syncing context).
 */
int
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
    zbookmark_phys_t *resume,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}

int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}

int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
	    blkptr, txg_start, resume, flags, func, arg));
}

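/*
 * Visits the MOS first, then every DSL dataset in the pool. Datasets
 * are discovered by scanning MOS objects whose bonus type is
 * DMU_OT_DSL_DATASET, and each dataset is traversed starting from its
 * ds_prev_snap_txg (when later than txg_start) so that blocks already
 * visited via an earlier snapshot are skipped.
 */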
/*
 * NB: pool must not be changing on-disk (e.g., from zdb or sync context).
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
	    txg_start, NULL, flags, func, arg);
	if (err != 0)
		return (err);

	/* visit each dataset */
	for (uint64_t obj = 1; err == 0;
	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err != 0) {
			if (hard)
				continue;
			break;
		}

		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			dsl_pool_config_enter(dp, FTAG);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				if (hard)
					continue;
				break;
			}
			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err != 0)
				break;
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err);
}

EXPORT_SYMBOL(traverse_dataset);
EXPORT_SYMBOL(traverse_pool);

ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
	"Max number of bytes to prefetch");

ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, UINT, ZMOD_RW,
	"Traverse prefetch number of blocks pointed by indirect block");

ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW,
	"Ignore hole_birth txg for zfs send");