Path: blob/main/sys/contrib/openzfs/cmd/zdb/zdb.c
107074 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/2122/*23* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.24* Copyright (c) 2011, 2019 by Delphix. All rights reserved.25* Copyright (c) 2014 Integros [integros.com]26* Copyright 2016 Nexenta Systems, Inc.27* Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.28* Copyright (c) 2015, 2017, Intel Corporation.29* Copyright (c) 2020 Datto Inc.30* Copyright (c) 2020, The FreeBSD Foundation [1]31*32* [1] Portions of this software were developed by Allan Jude33* under sponsorship from the FreeBSD Foundation.34* Copyright (c) 2021 Allan Jude35* Copyright (c) 2021 Toomas Soome <[email protected]>36* Copyright (c) 2023, 2024, Klara Inc.37* Copyright (c) 2023, Rob Norris <[email protected]>38*/3940#include <stdio.h>41#include <unistd.h>42#include <stdlib.h>43#include <ctype.h>44#include <getopt.h>45#include <openssl/evp.h>46#include <sys/zfs_context.h>47#include <sys/spa.h>48#include <sys/spa_impl.h>49#include <sys/dmu.h>50#include <sys/zap.h>51#include <sys/zap_impl.h>52#include <sys/fs/zfs.h>53#include <sys/zfs_znode.h>54#include <sys/zfs_sa.h>55#include <sys/sa.h>56#include 
<sys/sa_impl.h>57#include <sys/vdev.h>58#include <sys/vdev_impl.h>59#include <sys/metaslab_impl.h>60#include <sys/dmu_objset.h>61#include <sys/dsl_dir.h>62#include <sys/dsl_dataset.h>63#include <sys/dsl_pool.h>64#include <sys/dsl_bookmark.h>65#include <sys/dbuf.h>66#include <sys/zil.h>67#include <sys/zil_impl.h>68#include <sys/stat.h>69#include <sys/resource.h>70#include <sys/dmu_send.h>71#include <sys/dmu_traverse.h>72#include <sys/zio_checksum.h>73#include <sys/zio_compress.h>74#include <sys/zfs_fuid.h>75#include <sys/arc.h>76#include <sys/arc_impl.h>77#include <sys/ddt.h>78#include <sys/ddt_impl.h>79#include <sys/zfeature.h>80#include <sys/abd.h>81#include <sys/blkptr.h>82#include <sys/dsl_crypt.h>83#include <sys/dsl_scan.h>84#include <sys/btree.h>85#include <sys/brt.h>86#include <sys/brt_impl.h>87#include <zfs_comutil.h>88#include <sys/zstd/zstd.h>89#include <sys/backtrace.h>9091#include <libzpool.h>92#include <libnvpair.h>93#include <libzutil.h>94#include <libzfs_core.h>9596#include <libzdb.h>9798#include "zdb.h"99100101extern int reference_tracking_enable;102extern int zfs_recover;103extern uint_t zfs_vdev_async_read_max_active;104extern boolean_t spa_load_verify_dryrun;105extern boolean_t spa_mode_readable_spacemaps;106extern uint_t zfs_reconstruct_indirect_combinations_max;107extern uint_t zfs_btree_verify_intensity;108109enum {110ARG_ALLOCATED = 256,111ARG_BLOCK_BIN_MODE,112ARG_BLOCK_CLASSES,113};114115static const char cmdname[] = "zdb";116uint8_t dump_opt[512];117118typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);119120static uint64_t *zopt_metaslab = NULL;121static unsigned zopt_metaslab_args = 0;122123124static zopt_object_range_t *zopt_object_ranges = NULL;125static unsigned zopt_object_args = 0;126127static int flagbits[256];128129130static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */131static int leaked_objects = 0;132static zfs_range_tree_t *mos_refd_objs;133static spa_t *spa;134static objset_t 
*os;135static boolean_t kernel_init_done;136static boolean_t corruption_found = B_FALSE;137138static enum {139BIN_AUTO = 0,140BIN_PSIZE,141BIN_LSIZE,142BIN_ASIZE,143} block_bin_mode = BIN_AUTO;144145static enum {146CLASS_NORMAL = 1 << 1,147CLASS_SPECIAL = 1 << 2,148CLASS_DEDUP = 1 << 3,149CLASS_OTHER = 1 << 4,150} block_classes = 0;151152static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,153boolean_t);154static void mos_obj_refd(uint64_t);155static void mos_obj_refd_multiple(uint64_t);156static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,157dmu_tx_t *tx);158159160161static void zdb_print_blkptr(const blkptr_t *bp, int flags);162static void zdb_exit(int reason);163164typedef struct sublivelist_verify_block_refcnt {165/* block pointer entry in livelist being verified */166blkptr_t svbr_blk;167168/*169* Refcount gets incremented to 1 when we encounter the first170* FREE entry for the svfbr block pointer and a node for it171* is created in our ZDB verification/tracking metadata.172*173* As we encounter more FREE entries we increment this counter174* and similarly decrement it whenever we find the respective175* ALLOC entries for this block.176*177* When the refcount gets to 0 it means that all the FREE and178* ALLOC entries of this block have paired up and we no longer179* need to track it in our verification logic (e.g. 
the node180* containing this struct in our verification data structure181* should be freed).182*183* [refer to sublivelist_verify_blkptr() for the actual code]184*/185uint32_t svbr_refcnt;186} sublivelist_verify_block_refcnt_t;187188static int189sublivelist_block_refcnt_compare(const void *larg, const void *rarg)190{191const sublivelist_verify_block_refcnt_t *l = larg;192const sublivelist_verify_block_refcnt_t *r = rarg;193return (livelist_compare(&l->svbr_blk, &r->svbr_blk));194}195196static int197sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,198dmu_tx_t *tx)199{200ASSERT0P(tx);201struct sublivelist_verify *sv = arg;202sublivelist_verify_block_refcnt_t current = {203.svbr_blk = *bp,204205/*206* Start with 1 in case this is the first free entry.207* This field is not used for our B-Tree comparisons208* anyway.209*/210.svbr_refcnt = 1,211};212213zfs_btree_index_t where;214sublivelist_verify_block_refcnt_t *pair =215zfs_btree_find(&sv->sv_pair, ¤t, &where);216if (free) {217if (pair == NULL) {218/* first free entry for this block pointer */219zfs_btree_add(&sv->sv_pair, ¤t);220} else {221pair->svbr_refcnt++;222}223} else {224if (pair == NULL) {225/* block that is currently marked as allocated */226for (int i = 0; i < SPA_DVAS_PER_BP; i++) {227if (DVA_IS_EMPTY(&bp->blk_dva[i]))228break;229sublivelist_verify_block_t svb = {230.svb_dva = bp->blk_dva[i],231.svb_allocated_txg =232BP_GET_BIRTH(bp)233};234235if (zfs_btree_find(&sv->sv_leftover, &svb,236&where) == NULL) {237zfs_btree_add_idx(&sv->sv_leftover,238&svb, &where);239}240}241} else {242/* alloc matches a free entry */243pair->svbr_refcnt--;244if (pair->svbr_refcnt == 0) {245/* all allocs and frees have been matched */246zfs_btree_remove_idx(&sv->sv_pair, &where);247}248}249}250251return (0);252}253254static int255sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)256{257int err;258struct sublivelist_verify *sv = args;259260zfs_btree_create(&sv->sv_pair, 
sublivelist_block_refcnt_compare, NULL,261sizeof (sublivelist_verify_block_refcnt_t));262263err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,264sv, NULL);265266sublivelist_verify_block_refcnt_t *e;267zfs_btree_index_t *cookie = NULL;268while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {269char blkbuf[BP_SPRINTF_LEN];270snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),271&e->svbr_blk, B_TRUE);272(void) printf("\tERROR: %d unmatched FREE(s): %s\n",273e->svbr_refcnt, blkbuf);274corruption_found = B_TRUE;275}276zfs_btree_destroy(&sv->sv_pair);277278return (err);279}280281static int282livelist_block_compare(const void *larg, const void *rarg)283{284const sublivelist_verify_block_t *l = larg;285const sublivelist_verify_block_t *r = rarg;286287if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))288return (-1);289else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))290return (+1);291292if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))293return (-1);294else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))295return (+1);296297if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))298return (-1);299else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))300return (+1);301302return (0);303}304305/*306* Check for errors in a livelist while tracking all unfreed ALLOCs in the307* sublivelist_verify_t: sv->sv_leftover308*/309static void310livelist_verify(dsl_deadlist_t *dl, void *arg)311{312sublivelist_verify_t *sv = arg;313dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);314}315316/*317* Check for errors in the livelist entry and discard the intermediary318* data structures319*/320static int321sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)322{323(void) args;324sublivelist_verify_t sv;325zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,326sizeof (sublivelist_verify_block_t));327int err = sublivelist_verify_func(&sv, 
dle);328zfs_btree_clear(&sv.sv_leftover);329zfs_btree_destroy(&sv.sv_leftover);330return (err);331}332333typedef struct metaslab_verify {334/*335* Tree containing all the leftover ALLOCs from the livelists336* that are part of this metaslab.337*/338zfs_btree_t mv_livelist_allocs;339340/*341* Metaslab information.342*/343uint64_t mv_vdid;344uint64_t mv_msid;345uint64_t mv_start;346uint64_t mv_end;347348/*349* What's currently allocated for this metaslab.350*/351zfs_range_tree_t *mv_allocated;352} metaslab_verify_t;353354typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);355356typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,357void *arg);358359typedef struct unflushed_iter_cb_arg {360spa_t *uic_spa;361uint64_t uic_txg;362void *uic_arg;363zdb_log_sm_cb_t uic_cb;364} unflushed_iter_cb_arg_t;365366static int367iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)368{369unflushed_iter_cb_arg_t *uic = arg;370return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));371}372373static void374iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)375{376if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))377return;378379spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);380for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);381sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {382space_map_t *sm = NULL;383VERIFY0(space_map_open(&sm, spa_meta_objset(spa),384sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));385386unflushed_iter_cb_arg_t uic = {387.uic_spa = spa,388.uic_txg = sls->sls_txg,389.uic_arg = arg,390.uic_cb = cb391};392VERIFY0(space_map_iterate(sm, space_map_length(sm),393iterate_through_spacemap_logs_cb, &uic));394space_map_close(sm);395}396spa_config_exit(spa, SCL_CONFIG, FTAG);397}398399static void400verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,401uint64_t offset, uint64_t size)402{403sublivelist_verify_block_t svb = {{{0}}};404DVA_SET_VDEV(&svb.svb_dva, 
mv->mv_vdid);405DVA_SET_OFFSET(&svb.svb_dva, offset);406DVA_SET_ASIZE(&svb.svb_dva, 0);407zfs_btree_index_t where;408uint64_t end_offset = offset + size;409410/*411* Look for an exact match for spacemap entry in the livelist entries.412* Then, look for other livelist entries that fall within the range413* of the spacemap entry as it may have been condensed414*/415sublivelist_verify_block_t *found =416zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);417if (found == NULL) {418found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);419}420for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&421DVA_GET_OFFSET(&found->svb_dva) < end_offset;422found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {423if (found->svb_allocated_txg <= txg) {424(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "425"from TXG %llx FREED at TXG %llx\n",426(u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),427(u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),428(u_longlong_t)found->svb_allocated_txg,429(u_longlong_t)txg);430corruption_found = B_TRUE;431}432}433}434435static int436metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)437{438metaslab_verify_t *mv = arg;439uint64_t offset = sme->sme_offset;440uint64_t size = sme->sme_run;441uint64_t txg = sme->sme_txg;442443if (sme->sme_type == SM_ALLOC) {444if (zfs_range_tree_contains(mv->mv_allocated,445offset, size)) {446(void) printf("ERROR: DOUBLE ALLOC: "447"%llu [%llx:%llx] "448"%llu:%llu LOG_SM\n",449(u_longlong_t)txg, (u_longlong_t)offset,450(u_longlong_t)size, (u_longlong_t)mv->mv_vdid,451(u_longlong_t)mv->mv_msid);452corruption_found = B_TRUE;453} else {454zfs_range_tree_add(mv->mv_allocated,455offset, size);456}457} else {458if (!zfs_range_tree_contains(mv->mv_allocated,459offset, size)) {460(void) printf("ERROR: DOUBLE FREE: "461"%llu [%llx:%llx] "462"%llu:%llu LOG_SM\n",463(u_longlong_t)txg, (u_longlong_t)offset,464(u_longlong_t)size, 
(u_longlong_t)mv->mv_vdid,465(u_longlong_t)mv->mv_msid);466corruption_found = B_TRUE;467} else {468zfs_range_tree_remove(mv->mv_allocated,469offset, size);470}471}472473if (sme->sme_type != SM_ALLOC) {474/*475* If something is freed in the spacemap, verify that476* it is not listed as allocated in the livelist.477*/478verify_livelist_allocs(mv, txg, offset, size);479}480return (0);481}482483static int484spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,485uint64_t txg, void *arg)486{487metaslab_verify_t *mv = arg;488uint64_t offset = sme->sme_offset;489uint64_t vdev_id = sme->sme_vdev;490491vdev_t *vd = vdev_lookup_top(spa, vdev_id);492493/* skip indirect vdevs */494if (!vdev_is_concrete(vd))495return (0);496497if (vdev_id != mv->mv_vdid)498return (0);499500metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];501if (ms->ms_id != mv->mv_msid)502return (0);503504if (txg < metaslab_unflushed_txg(ms))505return (0);506507508ASSERT3U(txg, ==, sme->sme_txg);509return (metaslab_spacemap_validation_cb(sme, mv));510}511512static void513spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)514{515iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);516}517518static void519spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)520{521if (sm == NULL)522return;523524VERIFY0(space_map_iterate(sm, space_map_length(sm),525metaslab_spacemap_validation_cb, mv));526}527528static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);529530/*531* Transfer blocks from sv_leftover tree to the mv_livelist_allocs if532* they are part of that metaslab (mv_msid).533*/534static void535mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)536{537zfs_btree_index_t where;538sublivelist_verify_block_t *svb;539ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);540for (svb = zfs_btree_first(&sv->sv_leftover, &where);541svb != NULL;542svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {543if 
(DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)544continue;545546if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&547(DVA_GET_OFFSET(&svb->svb_dva) +548DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {549(void) printf("ERROR: Found block that crosses "550"metaslab boundary: <%llu:%llx:%llx>\n",551(u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),552(u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),553(u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));554corruption_found = B_TRUE;555continue;556}557558if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)559continue;560561if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)562continue;563564if ((DVA_GET_OFFSET(&svb->svb_dva) +565DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {566(void) printf("ERROR: Found block that crosses "567"metaslab boundary: <%llu:%llx:%llx>\n",568(u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),569(u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),570(u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));571corruption_found = B_TRUE;572continue;573}574575zfs_btree_add(&mv->mv_livelist_allocs, svb);576}577578for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);579svb != NULL;580svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {581zfs_btree_remove(&sv->sv_leftover, svb);582}583}584585/*586* [Livelist Check]587* Iterate through all the sublivelists and:588* - report leftover frees (**)589* - record leftover ALLOCs together with their TXG [see Cross Check]590*591* (**) Note: Double ALLOCs are valid in datasets that have dedup592* enabled. 
Similarly double FREEs are allowed as well but593* only if they pair up with a corresponding ALLOC entry once594* we our done with our sublivelist iteration.595*596* [Spacemap Check]597* for each metaslab:598* - iterate over spacemap and then the metaslab's entries in the599* spacemap log, then report any double FREEs and ALLOCs (do not600* blow up).601*602* [Cross Check]603* After finishing the Livelist Check phase and while being in the604* Spacemap Check phase, we find all the recorded leftover ALLOCs605* of the livelist check that are part of the metaslab that we are606* currently looking at in the Spacemap Check. We report any entries607* that are marked as ALLOCs in the livelists but have been actually608* freed (and potentially allocated again) after their TXG stamp in609* the spacemaps. Also report any ALLOCs from the livelists that610* belong to indirect vdevs (e.g. their vdev completed removal).611*612* Note that this will miss Log Spacemap entries that cancelled each other613* out before being flushed to the metaslab, so we are not guaranteed614* to match all erroneous ALLOCs.615*/616static void617livelist_metaslab_validate(spa_t *spa)618{619(void) printf("Verifying deleted livelist entries\n");620621sublivelist_verify_t sv;622zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,623sizeof (sublivelist_verify_block_t));624iterate_deleted_livelists(spa, livelist_verify, &sv);625626(void) printf("Verifying metaslab entries\n");627vdev_t *rvd = spa->spa_root_vdev;628for (uint64_t c = 0; c < rvd->vdev_children; c++) {629vdev_t *vd = rvd->vdev_child[c];630631if (!vdev_is_concrete(vd))632continue;633634for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {635metaslab_t *m = vd->vdev_ms[mid];636637(void) fprintf(stderr,638"\rverifying concrete vdev %llu, "639"metaslab %llu of %llu ...",640(longlong_t)vd->vdev_id,641(longlong_t)mid,642(longlong_t)vd->vdev_ms_count);643644uint64_t shift, start;645zfs_range_seg_type_t type 
=646metaslab_calculate_range_tree_type(vd, m,647&start, &shift);648metaslab_verify_t mv;649mv.mv_allocated = zfs_range_tree_create_flags(650NULL, type, NULL, start, shift,6510, "livelist_metaslab_validate:mv_allocated");652mv.mv_vdid = vd->vdev_id;653mv.mv_msid = m->ms_id;654mv.mv_start = m->ms_start;655mv.mv_end = m->ms_start + m->ms_size;656zfs_btree_create(&mv.mv_livelist_allocs,657livelist_block_compare, NULL,658sizeof (sublivelist_verify_block_t));659660mv_populate_livelist_allocs(&mv, &sv);661662spacemap_check_ms_sm(m->ms_sm, &mv);663spacemap_check_sm_log(spa, &mv);664665zfs_range_tree_vacate(mv.mv_allocated, NULL, NULL);666zfs_range_tree_destroy(mv.mv_allocated);667zfs_btree_clear(&mv.mv_livelist_allocs);668zfs_btree_destroy(&mv.mv_livelist_allocs);669}670}671(void) fprintf(stderr, "\n");672673/*674* If there are any segments in the leftover tree after we walked675* through all the metaslabs in the concrete vdevs then this means676* that we have segments in the livelists that belong to indirect677* vdevs and are marked as allocated.678*/679if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {680zfs_btree_destroy(&sv.sv_leftover);681return;682}683(void) printf("ERROR: Found livelist blocks marked as allocated "684"for indirect vdevs:\n");685corruption_found = B_TRUE;686687zfs_btree_index_t *where = NULL;688sublivelist_verify_block_t *svb;689while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=690NULL) {691int vdev_id = DVA_GET_VDEV(&svb->svb_dva);692ASSERT3U(vdev_id, <, rvd->vdev_children);693vdev_t *vd = rvd->vdev_child[vdev_id];694ASSERT(!vdev_is_concrete(vd));695(void) printf("<%d:%llx:%llx> TXG %llx\n",696vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),697(u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),698(u_longlong_t)svb->svb_allocated_txg);699}700(void) printf("\n");701zfs_btree_destroy(&sv.sv_leftover);702}703704/*705* These libumem hooks provide a reasonable set of defaults for the allocator's706* debugging facilities.707*/708const char 
*709_umem_debug_init(void)710{711return ("default,verbose"); /* $UMEM_DEBUG setting */712}713714const char *715_umem_logging_init(void)716{717return ("fail,contents"); /* $UMEM_LOGGING setting */718}719720static void721usage(void)722{723(void) fprintf(stderr,724"Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "725"[-I <inflight I/Os>]\n"726"\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"727"\t\t[-K <key>]\n"728"\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"729"\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"730"\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"731"\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"732"\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"733"\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"734"\t%s [-v] <bookmark>\n"735"\t%s -C [-A] [-U <cache>] [<poolname>]\n"736"\t%s -l [-Aqu] <device>\n"737"\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "738"[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"739"\t%s -O [-K <key>] <dataset> <path>\n"740"\t%s -r [-K <key>] <dataset> <path> <destination>\n"741"\t%s -r [-K <key>] -O <dataset> <object-id> <destination>\n"742"\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"743"\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"744"\t%s -E [-A] word0:word1:...:word15\n"745"\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "746"<poolname>\n\n",747cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,748cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);749750(void) fprintf(stderr, " Dataset name must include at least one "751"separator character '/' or '@'\n");752(void) fprintf(stderr, " If dataset name is specified, only that "753"dataset is dumped\n");754(void) fprintf(stderr, " If object numbers or object number "755"ranges are specified, only those\n"756" objects or ranges are dumped.\n\n");757(void) fprintf(stderr,758" Object ranges take the form 
<start>:<end>[:<flags>]\n"759" start Starting object number\n"760" end Ending object number, or -1 for no upper bound\n"761" flags Optional flags to select object types:\n"762" A All objects (this is the default)\n"763" d ZFS directories\n"764" f ZFS files \n"765" m SPA space maps\n"766" z ZAPs\n"767" - Negate effect of next flag\n\n");768(void) fprintf(stderr, " Options to control amount of output:\n");769(void) fprintf(stderr, " -b --block-stats "770"block statistics\n");771(void) fprintf(stderr, " --bin=(lsize|psize|asize) "772"bin blocks based on this size in all three columns\n");773(void) fprintf(stderr,774" --class=(normal|special|dedup|other)[,...]\n"775" only consider blocks from "776"these allocation classes\n");777(void) fprintf(stderr, " -B --backup "778"backup stream\n");779(void) fprintf(stderr, " -c --checksum "780"checksum all metadata (twice for all data) blocks\n");781(void) fprintf(stderr, " -C --config "782"config (or cachefile if alone)\n");783(void) fprintf(stderr, " -d --datasets "784"dataset(s)\n");785(void) fprintf(stderr, " -D --dedup-stats "786"dedup statistics\n");787(void) fprintf(stderr, " -E --embedded-block-pointer=INTEGER\n"788" decode and display block "789"from an embedded block pointer\n");790(void) fprintf(stderr, " -h --history "791"pool history\n");792(void) fprintf(stderr, " -i --intent-logs "793"intent logs\n");794(void) fprintf(stderr, " -l --label "795"read label contents\n");796(void) fprintf(stderr, " -k --checkpointed-state "797"examine the checkpointed state of the pool\n");798(void) fprintf(stderr, " -L --disable-leak-tracking "799"disable leak tracking (do not load spacemaps)\n");800(void) fprintf(stderr, " -m --metaslabs "801"metaslabs\n");802(void) fprintf(stderr, " -M --metaslab-groups "803"metaslab groups\n");804(void) fprintf(stderr, " -O --object-lookups "805"perform object lookups by path\n");806(void) fprintf(stderr, " -r --copy-object "807"copy an object by path to file\n");808(void) fprintf(stderr, " -R 
--read-block "809"read and display block from a device\n");810(void) fprintf(stderr, " -s --io-stats "811"report stats on zdb's I/O\n");812(void) fprintf(stderr, " -S --simulate-dedup "813"simulate dedup to measure effect\n");814(void) fprintf(stderr, " -v --verbose "815"verbose (applies to all others)\n");816(void) fprintf(stderr, " -y --livelist "817"perform livelist and metaslab validation on any livelists being "818"deleted\n\n");819(void) fprintf(stderr, " Below options are intended for use "820"with other options:\n");821(void) fprintf(stderr, " -A --ignore-assertions "822"ignore assertions (-A), enable panic recovery (-AA) or both "823"(-AAA)\n");824(void) fprintf(stderr, " -e --exported "825"pool is exported/destroyed/has altroot/not in a cachefile\n");826(void) fprintf(stderr, " -F --automatic-rewind "827"attempt automatic rewind within safe range of transaction "828"groups\n");829(void) fprintf(stderr, " -G --dump-debug-msg "830"dump zfs_dbgmsg buffer before exiting\n");831(void) fprintf(stderr, " -I --inflight=INTEGER "832"specify the maximum number of checksumming I/Os "833"[default is 200]\n");834(void) fprintf(stderr, " -K --key=KEY "835"decryption key for encrypted dataset\n");836(void) fprintf(stderr, " -o --option=\"NAME=VALUE\" "837"set the named tunable to the given value\n");838(void) fprintf(stderr, " -p --path==PATH "839"use one or more with -e to specify path to vdev dir\n");840(void) fprintf(stderr, " -P --parseable "841"print numbers in parseable form\n");842(void) fprintf(stderr, " -q --skip-label "843"don't print label contents\n");844(void) fprintf(stderr, " -t --txg=INTEGER "845"highest txg to use when searching for uberblocks\n");846(void) fprintf(stderr, " -T --brt-stats "847"BRT statistics\n");848(void) fprintf(stderr, " -u --uberblock "849"uberblock\n");850(void) fprintf(stderr, " -U --cachefile=PATH "851"use alternate cachefile\n");852(void) fprintf(stderr, " -V --verbatim "853"do verbatim import\n");854(void) fprintf(stderr, " -x 
--dump-blocks=PATH "855"dump all read blocks into specified directory\n");856(void) fprintf(stderr, " -X --extreme-rewind "857"attempt extreme rewind (does not work with dataset)\n");858(void) fprintf(stderr, " -Y --all-reconstruction "859"attempt all reconstruction combinations for split blocks\n");860(void) fprintf(stderr, " -Z --zstd-headers "861"show ZSTD headers \n");862(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "863"to make only that option verbose\n");864(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");865zdb_exit(2);866}867868static void869dump_debug_buffer(void)870{871ssize_t ret __attribute__((unused));872873if (!dump_opt['G'])874return;875/*876* We use write() instead of printf() so that this function877* is safe to call from a signal handler.878*/879ret = write(STDERR_FILENO, "\n", 1);880zfs_dbgmsg_print(STDERR_FILENO, "zdb");881}882883static void sig_handler(int signo)884{885struct sigaction action;886887libspl_backtrace(STDERR_FILENO);888dump_debug_buffer();889890/*891* Restore default action and re-raise signal so SIGSEGV and892* SIGABRT can trigger a core dump.893*/894action.sa_handler = SIG_DFL;895sigemptyset(&action.sa_mask);896action.sa_flags = 0;897(void) sigaction(signo, &action, NULL);898raise(signo);899}900901/*902* Called for usage errors that are discovered after a call to spa_open(),903* dmu_bonus_hold(), or pool_match(). 
abort() is called for other errors.904*/905906static void907fatal(const char *fmt, ...)908{909va_list ap;910911va_start(ap, fmt);912(void) fprintf(stderr, "%s: ", cmdname);913(void) vfprintf(stderr, fmt, ap);914va_end(ap);915(void) fprintf(stderr, "\n");916917dump_debug_buffer();918919zdb_exit(1);920}921922static void923dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)924{925(void) size;926nvlist_t *nv;927size_t nvsize = *(uint64_t *)data;928char *packed = umem_alloc(nvsize, UMEM_NOFAIL);929930VERIFY0(dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));931932VERIFY0(nvlist_unpack(packed, nvsize, &nv, 0));933934umem_free(packed, nvsize);935936dump_nvlist(nv, 8);937938nvlist_free(nv);939}940941static void942dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)943{944(void) os, (void) object, (void) size;945spa_history_phys_t *shp = data;946947if (shp == NULL)948return;949950(void) printf("\t\tpool_create_len = %llu\n",951(u_longlong_t)shp->sh_pool_create_len);952(void) printf("\t\tphys_max_off = %llu\n",953(u_longlong_t)shp->sh_phys_max_off);954(void) printf("\t\tbof = %llu\n",955(u_longlong_t)shp->sh_bof);956(void) printf("\t\teof = %llu\n",957(u_longlong_t)shp->sh_eof);958(void) printf("\t\trecords_lost = %llu\n",959(u_longlong_t)shp->sh_records_lost);960}961962static void963zdb_nicenum(uint64_t num, char *buf, size_t buflen)964{965if (dump_opt['P'])966(void) snprintf(buf, buflen, "%llu", (longlong_t)num);967else968nicenum(num, buf, buflen);969}970971static void972zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)973{974if (dump_opt['P'])975(void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);976else977zfs_nicebytes(bytes, buf, buflen);978}979980static const char histo_stars[] = "****************************************";981static const uint64_t histo_width = sizeof (histo_stars) - 1;982983static void984dump_histogram(const uint64_t *histo, int size, int offset)985{986int i;987int minidx = size - 
1;988int maxidx = 0;989uint64_t max = 0;990991for (i = 0; i < size; i++) {992if (histo[i] == 0)993continue;994if (histo[i] > max)995max = histo[i];996if (i > maxidx)997maxidx = i;998if (i < minidx)999minidx = i;1000}10011002if (max < histo_width)1003max = histo_width;10041005for (i = minidx; i <= maxidx; i++) {1006(void) printf("\t\t\t%3u: %6llu %s\n",1007i + offset, (u_longlong_t)histo[i],1008&histo_stars[(max - histo[i]) * histo_width / max]);1009}1010}10111012static void1013dump_zap_stats(objset_t *os, uint64_t object)1014{1015int error;1016zap_stats_t zs;10171018error = zap_get_stats(os, object, &zs);1019if (error)1020return;10211022if (zs.zs_ptrtbl_len == 0) {1023ASSERT(zs.zs_num_blocks == 1);1024(void) printf("\tmicrozap: %llu bytes, %llu entries\n",1025(u_longlong_t)zs.zs_blocksize,1026(u_longlong_t)zs.zs_num_entries);1027return;1028}10291030(void) printf("\tFat ZAP stats:\n");10311032(void) printf("\t\tPointer table:\n");1033(void) printf("\t\t\t%llu elements\n",1034(u_longlong_t)zs.zs_ptrtbl_len);1035(void) printf("\t\t\tzt_blk: %llu\n",1036(u_longlong_t)zs.zs_ptrtbl_zt_blk);1037(void) printf("\t\t\tzt_numblks: %llu\n",1038(u_longlong_t)zs.zs_ptrtbl_zt_numblks);1039(void) printf("\t\t\tzt_shift: %llu\n",1040(u_longlong_t)zs.zs_ptrtbl_zt_shift);1041(void) printf("\t\t\tzt_blks_copied: %llu\n",1042(u_longlong_t)zs.zs_ptrtbl_blks_copied);1043(void) printf("\t\t\tzt_nextblk: %llu\n",1044(u_longlong_t)zs.zs_ptrtbl_nextblk);10451046(void) printf("\t\tZAP entries: %llu\n",1047(u_longlong_t)zs.zs_num_entries);1048(void) printf("\t\tLeaf blocks: %llu\n",1049(u_longlong_t)zs.zs_num_leafs);1050(void) printf("\t\tTotal blocks: %llu\n",1051(u_longlong_t)zs.zs_num_blocks);1052(void) printf("\t\tzap_block_type: 0x%llx\n",1053(u_longlong_t)zs.zs_block_type);1054(void) printf("\t\tzap_magic: 0x%llx\n",1055(u_longlong_t)zs.zs_magic);1056(void) printf("\t\tzap_salt: 0x%llx\n",1057(u_longlong_t)zs.zs_salt);10581059(void) printf("\t\tLeafs with 2^n 
pointers:\n");1060dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);10611062(void) printf("\t\tBlocks with n*5 entries:\n");1063dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);10641065(void) printf("\t\tBlocks n/10 full:\n");1066dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);10671068(void) printf("\t\tEntries with n chunks:\n");1069dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);10701071(void) printf("\t\tBuckets with n entries:\n");1072dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);1073}10741075static void1076dump_none(objset_t *os, uint64_t object, void *data, size_t size)1077{1078(void) os, (void) object, (void) data, (void) size;1079}10801081static void1082dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)1083{1084(void) os, (void) object, (void) data, (void) size;1085(void) printf("\tUNKNOWN OBJECT TYPE\n");1086}10871088static void1089dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)1090{1091(void) os, (void) object, (void) data, (void) size;1092}10931094static void1095dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)1096{1097uint64_t *arr;1098uint64_t oursize;1099if (dump_opt['d'] < 6)1100return;11011102if (data == NULL) {1103dmu_object_info_t doi;11041105VERIFY0(dmu_object_info(os, object, &doi));1106size = doi.doi_max_offset;1107/*1108* We cap the size at 1 mebibyte here to prevent1109* allocation failures and nigh-infinite printing if the1110* object is extremely large.1111*/1112oursize = MIN(size, 1 << 20);1113arr = kmem_alloc(oursize, KM_SLEEP);11141115int err = dmu_read(os, object, 0, oursize, arr, 0);1116if (err != 0) {1117(void) printf("got error %u from dmu_read\n", err);1118kmem_free(arr, oursize);1119return;1120}1121} else {1122/*1123* Even though the allocation is already done in this code path,1124* we still cap the size to prevent excessive printing.1125*/1126oursize = MIN(size, 1 << 20);1127arr = 
data;1128}11291130if (size == 0) {1131if (data == NULL)1132kmem_free(arr, oursize);1133(void) printf("\t\t[]\n");1134return;1135}11361137(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);1138for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {1139if (i % 4 != 0)1140(void) printf(", %0llx", (u_longlong_t)arr[i]);1141else1142(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);1143}1144if (oursize != size)1145(void) printf(", ... ");1146(void) printf("]\n");11471148if (data == NULL)1149kmem_free(arr, oursize);1150}11511152static void1153dump_zap(objset_t *os, uint64_t object, void *data, size_t size)1154{1155(void) data, (void) size;1156zap_cursor_t zc;1157zap_attribute_t *attrp = zap_attribute_long_alloc();1158void *prop;1159unsigned i;11601161dump_zap_stats(os, object);1162(void) printf("\n");11631164for (zap_cursor_init(&zc, os, object);1165zap_cursor_retrieve(&zc, attrp) == 0;1166zap_cursor_advance(&zc)) {1167boolean_t key64 =1168!!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY);11691170if (key64)1171(void) printf("\t\t0x%010" PRIu64 "x = ",1172*(uint64_t *)attrp->za_name);1173else1174(void) printf("\t\t%s = ", attrp->za_name);11751176if (attrp->za_num_integers == 0) {1177(void) printf("\n");1178continue;1179}1180prop = umem_zalloc(attrp->za_num_integers *1181attrp->za_integer_length, UMEM_NOFAIL);11821183if (key64)1184(void) zap_lookup_uint64(os, object,1185(const uint64_t *)attrp->za_name, 1,1186attrp->za_integer_length, attrp->za_num_integers,1187prop);1188else1189(void) zap_lookup(os, object, attrp->za_name,1190attrp->za_integer_length, attrp->za_num_integers,1191prop);11921193if (attrp->za_integer_length == 1 && !key64) {1194if (strcmp(attrp->za_name,1195DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||1196strcmp(attrp->za_name,1197DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||1198strcmp(attrp->za_name, DSL_CRYPTO_KEY_IV) == 0 ||1199strcmp(attrp->za_name, DSL_CRYPTO_KEY_MAC) == 0 ||1200strcmp(attrp->za_name,1201DMU_POOL_CHECKSUM_SALT) == 0) {1202uint8_t *u8 = prop;12031204for (i = 
0; i < attrp->za_num_integers; i++) {1205(void) printf("%02x", u8[i]);1206}1207} else {1208(void) printf("%s", (char *)prop);1209}1210} else {1211for (i = 0; i < attrp->za_num_integers; i++) {1212switch (attrp->za_integer_length) {1213case 1:1214(void) printf("%u ",1215((uint8_t *)prop)[i]);1216break;1217case 2:1218(void) printf("%u ",1219((uint16_t *)prop)[i]);1220break;1221case 4:1222(void) printf("%u ",1223((uint32_t *)prop)[i]);1224break;1225case 8:1226(void) printf("%lld ",1227(u_longlong_t)((int64_t *)prop)[i]);1228break;1229}1230}1231}1232(void) printf("\n");1233umem_free(prop,1234attrp->za_num_integers * attrp->za_integer_length);1235}1236zap_cursor_fini(&zc);1237zap_attribute_free(attrp);1238}12391240static void1241dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)1242{1243bpobj_phys_t *bpop = data;1244uint64_t i;1245char bytes[32], comp[32], uncomp[32];12461247/* make sure the output won't get truncated */1248_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");1249_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");1250_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");12511252if (bpop == NULL)1253return;12541255zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));1256zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));1257zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));12581259(void) printf("\t\tnum_blkptrs = %llu\n",1260(u_longlong_t)bpop->bpo_num_blkptrs);1261(void) printf("\t\tbytes = %s\n", bytes);1262if (size >= BPOBJ_SIZE_V1) {1263(void) printf("\t\tcomp = %s\n", comp);1264(void) printf("\t\tuncomp = %s\n", uncomp);1265}1266if (size >= BPOBJ_SIZE_V2) {1267(void) printf("\t\tsubobjs = %llu\n",1268(u_longlong_t)bpop->bpo_subobjs);1269(void) printf("\t\tnum_subobjs = %llu\n",1270(u_longlong_t)bpop->bpo_num_subobjs);1271}1272if (size >= sizeof (*bpop)) {1273(void) printf("\t\tnum_freed = %llu\n",1274(u_longlong_t)bpop->bpo_num_freed);1275}12761277if (dump_opt['d'] < 5)1278return;12791280for 
(i = 0; i < bpop->bpo_num_blkptrs; i++) {1281char blkbuf[BP_SPRINTF_LEN];1282blkptr_t bp;12831284int err = dmu_read(os, object,1285i * sizeof (bp), sizeof (bp), &bp, 0);1286if (err != 0) {1287(void) printf("got error %u from dmu_read\n", err);1288break;1289}1290snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,1291BP_GET_FREE(&bp));1292(void) printf("\t%s\n", blkbuf);1293}1294}12951296static void1297dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)1298{1299(void) data, (void) size;1300dmu_object_info_t doi;1301int64_t i;13021303VERIFY0(dmu_object_info(os, object, &doi));1304uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);13051306int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);1307if (err != 0) {1308(void) printf("got error %u from dmu_read\n", err);1309kmem_free(subobjs, doi.doi_max_offset);1310return;1311}13121313int64_t last_nonzero = -1;1314for (i = 0; i < doi.doi_max_offset / 8; i++) {1315if (subobjs[i] != 0)1316last_nonzero = i;1317}13181319for (i = 0; i <= last_nonzero; i++) {1320(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);1321}1322kmem_free(subobjs, doi.doi_max_offset);1323}13241325static void1326dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)1327{1328(void) data, (void) size;1329dump_zap_stats(os, object);1330/* contents are printed elsewhere, properly decoded */1331}13321333static void1334dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)1335{1336(void) data, (void) size;1337zap_cursor_t zc;1338zap_attribute_t *attrp = zap_attribute_alloc();13391340dump_zap_stats(os, object);1341(void) printf("\n");13421343for (zap_cursor_init(&zc, os, object);1344zap_cursor_retrieve(&zc, attrp) == 0;1345zap_cursor_advance(&zc)) {1346(void) printf("\t\t%s = ", attrp->za_name);1347if (attrp->za_num_integers == 0) {1348(void) printf("\n");1349continue;1350}1351(void) printf(" %llx : 
[%d:%d:%d]\n",1352(u_longlong_t)attrp->za_first_integer,1353(int)ATTR_LENGTH(attrp->za_first_integer),1354(int)ATTR_BSWAP(attrp->za_first_integer),1355(int)ATTR_NUM(attrp->za_first_integer));1356}1357zap_cursor_fini(&zc);1358zap_attribute_free(attrp);1359}13601361static void1362dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)1363{1364(void) data, (void) size;1365zap_cursor_t zc;1366zap_attribute_t *attrp = zap_attribute_alloc();1367uint16_t *layout_attrs;1368unsigned i;13691370dump_zap_stats(os, object);1371(void) printf("\n");13721373for (zap_cursor_init(&zc, os, object);1374zap_cursor_retrieve(&zc, attrp) == 0;1375zap_cursor_advance(&zc)) {1376(void) printf("\t\t%s = [", attrp->za_name);1377if (attrp->za_num_integers == 0) {1378(void) printf("\n");1379continue;1380}13811382VERIFY(attrp->za_integer_length == 2);1383layout_attrs = umem_zalloc(attrp->za_num_integers *1384attrp->za_integer_length, UMEM_NOFAIL);13851386VERIFY(zap_lookup(os, object, attrp->za_name,1387attrp->za_integer_length,1388attrp->za_num_integers, layout_attrs) == 0);13891390for (i = 0; i != attrp->za_num_integers; i++)1391(void) printf(" %d ", (int)layout_attrs[i]);1392(void) printf("]\n");1393umem_free(layout_attrs,1394attrp->za_num_integers * attrp->za_integer_length);1395}1396zap_cursor_fini(&zc);1397zap_attribute_free(attrp);1398}13991400static void1401dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)1402{1403(void) data, (void) size;1404zap_cursor_t zc;1405zap_attribute_t *attrp = zap_attribute_long_alloc();1406const char *typenames[] = {1407/* 0 */ "not specified",1408/* 1 */ "FIFO",1409/* 2 */ "Character Device",1410/* 3 */ "3 (invalid)",1411/* 4 */ "Directory",1412/* 5 */ "5 (invalid)",1413/* 6 */ "Block Device",1414/* 7 */ "7 (invalid)",1415/* 8 */ "Regular File",1416/* 9 */ "9 (invalid)",1417/* 10 */ "Symbolic Link",1418/* 11 */ "11 (invalid)",1419/* 12 */ "Socket",1420/* 13 */ "Door",1421/* 14 */ "Event Port",1422/* 15 */ "15 
(invalid)",1423};14241425dump_zap_stats(os, object);1426(void) printf("\n");14271428for (zap_cursor_init(&zc, os, object);1429zap_cursor_retrieve(&zc, attrp) == 0;1430zap_cursor_advance(&zc)) {1431(void) printf("\t\t%s = %lld (type: %s)\n",1432attrp->za_name, ZFS_DIRENT_OBJ(attrp->za_first_integer),1433typenames[ZFS_DIRENT_TYPE(attrp->za_first_integer)]);1434}1435zap_cursor_fini(&zc);1436zap_attribute_free(attrp);1437}14381439static int1440get_dtl_refcount(vdev_t *vd)1441{1442int refcount = 0;14431444if (vd->vdev_ops->vdev_op_leaf) {1445space_map_t *sm = vd->vdev_dtl_sm;14461447if (sm != NULL &&1448sm->sm_dbuf->db_size == sizeof (space_map_phys_t))1449return (1);1450return (0);1451}14521453for (unsigned c = 0; c < vd->vdev_children; c++)1454refcount += get_dtl_refcount(vd->vdev_child[c]);1455return (refcount);1456}14571458static int1459get_metaslab_refcount(vdev_t *vd)1460{1461int refcount = 0;14621463if (vd->vdev_top == vd) {1464for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {1465space_map_t *sm = vd->vdev_ms[m]->ms_sm;14661467if (sm != NULL &&1468sm->sm_dbuf->db_size == sizeof (space_map_phys_t))1469refcount++;1470}1471}1472for (unsigned c = 0; c < vd->vdev_children; c++)1473refcount += get_metaslab_refcount(vd->vdev_child[c]);14741475return (refcount);1476}14771478static int1479get_obsolete_refcount(vdev_t *vd)1480{1481uint64_t obsolete_sm_object;1482int refcount = 0;14831484VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));1485if (vd->vdev_top == vd && obsolete_sm_object != 0) {1486dmu_object_info_t doi;1487VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,1488obsolete_sm_object, &doi));1489if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {1490refcount++;1491}1492} else {1493ASSERT0P(vd->vdev_obsolete_sm);1494ASSERT0(obsolete_sm_object);1495}1496for (unsigned c = 0; c < vd->vdev_children; c++) {1497refcount += get_obsolete_refcount(vd->vdev_child[c]);1498}14991500return (refcount);1501}15021503static 
int1504get_prev_obsolete_spacemap_refcount(spa_t *spa)1505{1506uint64_t prev_obj =1507spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;1508if (prev_obj != 0) {1509dmu_object_info_t doi;1510VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));1511if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {1512return (1);1513}1514}1515return (0);1516}15171518static int1519get_checkpoint_refcount(vdev_t *vd)1520{1521int refcount = 0;15221523if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&1524zap_contains(spa_meta_objset(vd->vdev_spa),1525vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)1526refcount++;15271528for (uint64_t c = 0; c < vd->vdev_children; c++)1529refcount += get_checkpoint_refcount(vd->vdev_child[c]);15301531return (refcount);1532}15331534static int1535get_log_spacemap_refcount(spa_t *spa)1536{1537return (avl_numnodes(&spa->spa_sm_logs_by_txg));1538}15391540static int1541verify_spacemap_refcounts(spa_t *spa)1542{1543uint64_t expected_refcount = 0;1544uint64_t actual_refcount;15451546(void) feature_get_refcount(spa,1547&spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],1548&expected_refcount);1549actual_refcount = get_dtl_refcount(spa->spa_root_vdev);1550actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);1551actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);1552actual_refcount += get_prev_obsolete_spacemap_refcount(spa);1553actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);1554actual_refcount += get_log_spacemap_refcount(spa);15551556if (expected_refcount != actual_refcount) {1557(void) printf("space map refcount mismatch: expected %lld != "1558"actual %lld\n",1559(longlong_t)expected_refcount,1560(longlong_t)actual_refcount);1561return (2);1562}1563return (0);1564}15651566static void1567dump_spacemap(objset_t *os, space_map_t *sm)1568{1569const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",1570"INVALID", "INVALID", "INVALID", "INVALID" };15711572if (sm == 
NULL)1573return;15741575(void) printf("space map object %llu:\n",1576(longlong_t)sm->sm_object);1577(void) printf(" smp_length = 0x%llx\n",1578(longlong_t)sm->sm_phys->smp_length);1579(void) printf(" smp_alloc = 0x%llx\n",1580(longlong_t)sm->sm_phys->smp_alloc);15811582if (dump_opt['d'] < 6 && dump_opt['m'] < 4)1583return;15841585/*1586* Print out the freelist entries in both encoded and decoded form.1587*/1588uint8_t mapshift = sm->sm_shift;1589int64_t alloc = 0;1590uint64_t word, entry_id = 0;1591for (uint64_t offset = 0; offset < space_map_length(sm);1592offset += sizeof (word)) {15931594VERIFY0(dmu_read(os, space_map_object(sm), offset,1595sizeof (word), &word, DMU_READ_PREFETCH));15961597if (sm_entry_is_debug(word)) {1598uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);1599uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);1600if (de_txg == 0) {1601(void) printf(1602"\t [%6llu] PADDING\n",1603(u_longlong_t)entry_id);1604} else {1605(void) printf(1606"\t [%6llu] %s: txg %llu pass %llu\n",1607(u_longlong_t)entry_id,1608ddata[SM_DEBUG_ACTION_DECODE(word)],1609(u_longlong_t)de_txg,1610(u_longlong_t)de_sync_pass);1611}1612entry_id++;1613continue;1614}16151616char entry_type;1617uint64_t entry_off, entry_run, entry_vdev;16181619if (sm_entry_is_single_word(word)) {1620entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?1621'A' : 'F';1622entry_off = (SM_OFFSET_DECODE(word) << mapshift) +1623sm->sm_start;1624entry_run = SM_RUN_DECODE(word) << mapshift;16251626(void) printf("\t [%6llu] %c "1627"range: %012llx-%012llx size: %08llx\n",1628(u_longlong_t)entry_id, entry_type,1629(u_longlong_t)entry_off,1630(u_longlong_t)(entry_off + entry_run - 1),1631(u_longlong_t)entry_run);1632} else {1633/* it is a two-word entry so we read another word */1634ASSERT(sm_entry_is_double_word(word));16351636uint64_t extra_word;1637offset += sizeof (extra_word);1638ASSERT3U(offset, <, space_map_length(sm));1639VERIFY0(dmu_read(os, space_map_object(sm), offset,1640sizeof (extra_word), 
&extra_word,1641DMU_READ_PREFETCH));16421643entry_run = SM2_RUN_DECODE(word) << mapshift;1644entry_vdev = SM2_VDEV_DECODE(word);1645entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?1646'A' : 'F';1647entry_off = (SM2_OFFSET_DECODE(extra_word) <<1648mapshift) + sm->sm_start;16491650if (zopt_metaslab_args == 0 ||1651zopt_metaslab[0] == entry_vdev) {1652(void) printf("\t [%6llu] %c "1653"range: %012llx-%012llx size: %08llx "1654"vdev: %llu\n",1655(u_longlong_t)entry_id, entry_type,1656(u_longlong_t)entry_off,1657(u_longlong_t)(entry_off + entry_run - 1),1658(u_longlong_t)entry_run,1659(u_longlong_t)entry_vdev);1660}1661}16621663if (entry_type == 'A')1664alloc += entry_run;1665else1666alloc -= entry_run;1667entry_id++;1668}1669if (alloc != space_map_allocated(sm)) {1670(void) printf("space_map_object alloc (%lld) INCONSISTENT "1671"with space map summary (%lld)\n",1672(longlong_t)space_map_allocated(sm), (longlong_t)alloc);1673}1674}16751676static void1677dump_metaslab_stats(metaslab_t *msp)1678{1679char maxbuf[32];1680zfs_range_tree_t *rt = msp->ms_allocatable;1681zfs_btree_t *t = &msp->ms_allocatable_by_size;1682int free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size;16831684/* max sure nicenum has enough space */1685_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");16861687zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));16881689(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",1690"segments", zfs_btree_numnodes(t), "maxsize", maxbuf,1691"freepct", free_pct);1692(void) printf("\tIn-memory histogram:\n");1693dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);1694}16951696static void1697dump_allocated(void *arg, uint64_t start, uint64_t size)1698{1699uint64_t *off = arg;1700if (*off != start)1701(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", *off,1702start - *off);1703*off = start + size;1704}17051706static void1707dump_metaslab(metaslab_t *msp)1708{1709vdev_t *vd = msp->ms_group->mg_vd;1710spa_t *spa 
= vd->vdev_spa;1711space_map_t *sm = msp->ms_sm;1712char freebuf[32];17131714zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,1715sizeof (freebuf));17161717(void) printf(1718"\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",1719(u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,1720(u_longlong_t)space_map_object(sm), freebuf);17211722if (dump_opt[ARG_ALLOCATED] ||1723(dump_opt['m'] > 2 && !dump_opt['L'])) {1724mutex_enter(&msp->ms_lock);1725VERIFY0(metaslab_load(msp));1726}17271728if (dump_opt['m'] > 2 && !dump_opt['L']) {1729zfs_range_tree_stat_verify(msp->ms_allocatable);1730dump_metaslab_stats(msp);1731}17321733if (dump_opt[ARG_ALLOCATED]) {1734uint64_t off = msp->ms_start;1735zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,1736&off);1737if (off != msp->ms_start + msp->ms_size)1738(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off,1739msp->ms_size - off);1740}17411742if (dump_opt['m'] > 1 && sm != NULL &&1743spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {1744/*1745* The space map histogram represents free space in chunks1746* of sm_shift (i.e. 
bucket 0 refers to 2^sm_shift).1747*/1748(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",1749(u_longlong_t)msp->ms_fragmentation);1750dump_histogram(sm->sm_phys->smp_histogram,1751SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);1752}17531754if (dump_opt[ARG_ALLOCATED] ||1755(dump_opt['m'] > 2 && !dump_opt['L'])) {1756metaslab_unload(msp);1757mutex_exit(&msp->ms_lock);1758}17591760if (vd->vdev_ops == &vdev_draid_ops)1761ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);1762else1763ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);17641765dump_spacemap(spa->spa_meta_objset, msp->ms_sm);17661767if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {1768(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",1769(u_longlong_t)metaslab_unflushed_txg(msp));1770}1771}17721773static void1774print_vdev_metaslab_header(vdev_t *vd)1775{1776vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;1777const char *bias_str = "";1778if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {1779bias_str = VDEV_ALLOC_BIAS_LOG;1780} else if (alloc_bias == VDEV_BIAS_SPECIAL) {1781bias_str = VDEV_ALLOC_BIAS_SPECIAL;1782} else if (alloc_bias == VDEV_BIAS_DEDUP) {1783bias_str = VDEV_ALLOC_BIAS_DEDUP;1784}17851786uint64_t ms_flush_data_obj = 0;1787if (vd->vdev_top_zap != 0) {1788int error = zap_lookup(spa_meta_objset(vd->vdev_spa),1789vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,1790sizeof (uint64_t), 1, &ms_flush_data_obj);1791if (error != ENOENT) {1792ASSERT0(error);1793}1794}17951796(void) printf("\tvdev %10llu\t%s metaslab shift %4llu",1797(u_longlong_t)vd->vdev_id, bias_str,1798(u_longlong_t)vd->vdev_ms_shift);17991800if (ms_flush_data_obj != 0) {1801(void) printf(" ms_unflushed_phys object %llu",1802(u_longlong_t)ms_flush_data_obj);1803}18041805(void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n",1806"metaslabs", (u_longlong_t)vd->vdev_ms_count,1807"offset", "spacemap", "free");1808(void) printf("\t%15s %19s %15s %12s\n",1809"---------------", 
"-------------------",1810"---------------", "------------");1811}18121813static void1814dump_metaslab_groups(spa_t *spa, boolean_t show_special)1815{1816vdev_t *rvd = spa->spa_root_vdev;1817metaslab_class_t *mc = spa_normal_class(spa);1818metaslab_class_t *smc = spa_special_class(spa);1819uint64_t fragmentation;18201821metaslab_class_histogram_verify(mc);18221823for (unsigned c = 0; c < rvd->vdev_children; c++) {1824vdev_t *tvd = rvd->vdev_child[c];1825metaslab_group_t *mg = tvd->vdev_mg;18261827if (mg == NULL || (mg->mg_class != mc &&1828(!show_special || mg->mg_class != smc)))1829continue;18301831metaslab_group_histogram_verify(mg);1832mg->mg_fragmentation = metaslab_group_fragmentation(mg);18331834(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"1835"fragmentation",1836(u_longlong_t)tvd->vdev_id,1837(u_longlong_t)tvd->vdev_ms_count);1838if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {1839(void) printf("%3s\n", "-");1840} else {1841(void) printf("%3llu%%\n",1842(u_longlong_t)mg->mg_fragmentation);1843}1844dump_histogram(mg->mg_histogram,1845ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);1846}18471848(void) printf("\tpool %s\tfragmentation", spa_name(spa));1849fragmentation = metaslab_class_fragmentation(mc);1850if (fragmentation == ZFS_FRAG_INVALID)1851(void) printf("\t%3s\n", "-");1852else1853(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);1854dump_histogram(mc->mc_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);1855}18561857static void1858print_vdev_indirect(vdev_t *vd)1859{1860vdev_indirect_config_t *vic = &vd->vdev_indirect_config;1861vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;1862vdev_indirect_births_t *vib = vd->vdev_indirect_births;18631864if (vim == NULL) {1865ASSERT0P(vib);1866return;1867}18681869ASSERT3U(vdev_indirect_mapping_object(vim), ==,1870vic->vic_mapping_object);1871ASSERT3U(vdev_indirect_births_object(vib), ==,1872vic->vic_births_object);18731874(void) printf("indirect births obj 
%llu:\n",1875(longlong_t)vic->vic_births_object);1876(void) printf(" vib_count = %llu\n",1877(longlong_t)vdev_indirect_births_count(vib));1878for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {1879vdev_indirect_birth_entry_phys_t *cur_vibe =1880&vib->vib_entries[i];1881(void) printf("\toffset %llx -> txg %llu\n",1882(longlong_t)cur_vibe->vibe_offset,1883(longlong_t)cur_vibe->vibe_phys_birth_txg);1884}1885(void) printf("\n");18861887(void) printf("indirect mapping obj %llu:\n",1888(longlong_t)vic->vic_mapping_object);1889(void) printf(" vim_max_offset = 0x%llx\n",1890(longlong_t)vdev_indirect_mapping_max_offset(vim));1891(void) printf(" vim_bytes_mapped = 0x%llx\n",1892(longlong_t)vdev_indirect_mapping_bytes_mapped(vim));1893(void) printf(" vim_count = %llu\n",1894(longlong_t)vdev_indirect_mapping_num_entries(vim));18951896if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)1897return;18981899uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);19001901for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {1902vdev_indirect_mapping_entry_phys_t *vimep =1903&vim->vim_entries[i];1904(void) printf("\t<%llx:%llx:%llx> -> "1905"<%llx:%llx:%llx> (%x obsolete)\n",1906(longlong_t)vd->vdev_id,1907(longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),1908(longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),1909(longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),1910(longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),1911(longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),1912counts[i]);1913}1914(void) printf("\n");19151916uint64_t obsolete_sm_object;1917VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));1918if (obsolete_sm_object != 0) {1919objset_t *mos = vd->vdev_spa->spa_meta_objset;1920(void) printf("obsolete space map object %llu:\n",1921(u_longlong_t)obsolete_sm_object);1922ASSERT(vd->vdev_obsolete_sm != NULL);1923ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,1924obsolete_sm_object);1925dump_spacemap(mos, vd->vdev_obsolete_sm);1926(void) 
printf("\n");1927}1928}19291930static void1931dump_metaslabs(spa_t *spa)1932{1933vdev_t *vd, *rvd = spa->spa_root_vdev;1934uint64_t m, c = 0, children = rvd->vdev_children;19351936(void) printf("\nMetaslabs:\n");19371938if (zopt_metaslab_args > 0) {1939c = zopt_metaslab[0];19401941if (c >= children)1942(void) fatal("bad vdev id: %llu", (u_longlong_t)c);19431944if (zopt_metaslab_args > 1) {1945vd = rvd->vdev_child[c];1946print_vdev_metaslab_header(vd);19471948for (m = 1; m < zopt_metaslab_args; m++) {1949if (zopt_metaslab[m] < vd->vdev_ms_count)1950dump_metaslab(1951vd->vdev_ms[zopt_metaslab[m]]);1952else1953(void) fprintf(stderr, "bad metaslab "1954"number %llu\n",1955(u_longlong_t)zopt_metaslab[m]);1956}1957(void) printf("\n");1958return;1959}1960children = c + 1;1961}1962for (; c < children; c++) {1963vd = rvd->vdev_child[c];1964print_vdev_metaslab_header(vd);19651966print_vdev_indirect(vd);19671968for (m = 0; m < vd->vdev_ms_count; m++)1969dump_metaslab(vd->vdev_ms[m]);1970(void) printf("\n");1971}1972}19731974static void1975dump_log_spacemaps(spa_t *spa)1976{1977if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))1978return;19791980(void) printf("\nLog Space Maps in Pool:\n");1981for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);1982sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {1983space_map_t *sm = NULL;1984VERIFY0(space_map_open(&sm, spa_meta_objset(spa),1985sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));19861987(void) printf("Log Spacemap object %llu txg %llu\n",1988(u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);1989dump_spacemap(spa->spa_meta_objset, sm);1990space_map_close(sm);1991}1992(void) printf("\n");1993}19941995static void1996dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,1997uint64_t index)1998{1999const ddt_key_t *ddk = &ddlwe->ddlwe_key;2000char blkbuf[BP_SPRINTF_LEN];2001blkptr_t blk;2002int p;20032004for (p = 0; p < DDT_NPHYS(ddt); p++) {2005const ddt_univ_phys_t *ddp = 
&ddlwe->ddlwe_phys;2006ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);20072008if (ddt_phys_birth(ddp, v) == 0)2009continue;2010ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);2011snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);2012(void) printf("index %llx refcnt %llu phys %d %s\n",2013(u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),2014p, blkbuf);2015}2016}20172018static void2019dump_dedup_ratio(const ddt_stat_t *dds)2020{2021double rL, rP, rD, D, dedup, compress, copies;20222023if (dds->dds_blocks == 0)2024return;20252026rL = (double)dds->dds_ref_lsize;2027rP = (double)dds->dds_ref_psize;2028rD = (double)dds->dds_ref_dsize;2029D = (double)dds->dds_dsize;20302031dedup = rD / D;2032compress = rL / rP;2033copies = rD / rP;20342035(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "2036"dedup * compress / copies = %.2f\n\n",2037dedup, compress, copies, dedup * compress / copies);2038}20392040static void2041dump_ddt_log(ddt_t *ddt)2042{2043if (ddt->ddt_version != DDT_VERSION_FDT ||2044!(ddt->ddt_flags & DDT_FLAG_LOG))2045return;20462047for (int n = 0; n < 2; n++) {2048ddt_log_t *ddl = &ddt->ddt_log[n];20492050char flagstr[64] = {0};2051if (ddl->ddl_flags > 0) {2052flagstr[0] = ' ';2053int c = 1;2054if (ddl->ddl_flags & DDL_FLAG_FLUSHING)2055c += strlcpy(&flagstr[c], " FLUSHING",2056sizeof (flagstr) - c);2057if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT)2058c += strlcpy(&flagstr[c], " CHECKPOINT",2059sizeof (flagstr) - c);2060if (ddl->ddl_flags &2061~(DDL_FLAG_FLUSHING|DDL_FLAG_CHECKPOINT))2062c += strlcpy(&flagstr[c], " UNKNOWN",2063sizeof (flagstr) - c);2064flagstr[1] = '[';2065flagstr[c] = ']';2066}20672068uint64_t count = avl_numnodes(&ddl->ddl_tree);20692070printf(DMU_POOL_DDT_LOG ": flags=0x%02x%s; obj=%llu; "2071"len=%llu; txg=%llu; entries=%llu\n",2072zio_checksum_table[ddt->ddt_checksum].ci_name, n,2073ddl->ddl_flags, flagstr,2074(u_longlong_t)ddl->ddl_object,2075(u_longlong_t)ddl->ddl_length,2076(u_longlong_t)ddl->ddl_first_txg, 
(u_longlong_t)count);20772078if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) {2079const ddt_key_t *ddk = &ddl->ddl_checkpoint;2080printf(" checkpoint: "2081"%016llx:%016llx:%016llx:%016llx:%016llx\n",2082(u_longlong_t)ddk->ddk_cksum.zc_word[0],2083(u_longlong_t)ddk->ddk_cksum.zc_word[1],2084(u_longlong_t)ddk->ddk_cksum.zc_word[2],2085(u_longlong_t)ddk->ddk_cksum.zc_word[3],2086(u_longlong_t)ddk->ddk_prop);2087}20882089if (count == 0 || dump_opt['D'] < 4)2090continue;20912092ddt_lightweight_entry_t ddlwe;2093uint64_t index = 0;2094for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);2095ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {2096DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);2097dump_ddt_entry(ddt, &ddlwe, index++);2098}2099}2100}21012102static void2103dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class)2104{2105char name[DDT_NAMELEN];2106ddt_lightweight_entry_t ddlwe;2107uint64_t walk = 0;2108dmu_object_info_t doi;2109uint64_t count, dspace, mspace;2110int error;21112112error = ddt_object_info(ddt, type, class, &doi);21132114if (error == ENOENT)2115return;2116ASSERT0(error);21172118error = ddt_object_count(ddt, type, class, &count);2119ASSERT0(error);2120if (count == 0)2121return;21222123dspace = doi.doi_physical_blocks_512 << 9;2124mspace = doi.doi_fill_count * doi.doi_data_block_size;21252126ddt_object_name(ddt, type, class, name);21272128(void) printf("%s: dspace=%llu; mspace=%llu; entries=%llu\n", name,2129(u_longlong_t)dspace, (u_longlong_t)mspace, (u_longlong_t)count);21302131if (dump_opt['D'] < 3)2132return;21332134(void) printf("%s: object=%llu\n", name,2135(u_longlong_t)ddt->ddt_object[type][class]);2136zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);21372138if (dump_opt['D'] < 4)2139return;21402141if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)2142return;21432144(void) printf("%s contents:\n\n", name);21452146while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0)2147dump_ddt_entry(ddt, &ddlwe, 
walk);21482149ASSERT3U(error, ==, ENOENT);21502151(void) printf("\n");2152}21532154static void2155dump_ddt(ddt_t *ddt)2156{2157if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)2158return;21592160char flagstr[64] = {0};2161if (ddt->ddt_flags > 0) {2162flagstr[0] = ' ';2163int c = 1;2164if (ddt->ddt_flags & DDT_FLAG_FLAT)2165c += strlcpy(&flagstr[c], " FLAT",2166sizeof (flagstr) - c);2167if (ddt->ddt_flags & DDT_FLAG_LOG)2168c += strlcpy(&flagstr[c], " LOG",2169sizeof (flagstr) - c);2170if (ddt->ddt_flags & ~DDT_FLAG_MASK)2171c += strlcpy(&flagstr[c], " UNKNOWN",2172sizeof (flagstr) - c);2173flagstr[1] = '[';2174flagstr[c] = ']';2175}21762177printf("DDT-%s: version=%llu [%s]; flags=0x%02llx%s; rootobj=%llu\n",2178zio_checksum_table[ddt->ddt_checksum].ci_name,2179(u_longlong_t)ddt->ddt_version,2180(ddt->ddt_version == 0) ? "LEGACY" :2181(ddt->ddt_version == 1) ? "FDT" : "UNKNOWN",2182(u_longlong_t)ddt->ddt_flags, flagstr,2183(u_longlong_t)ddt->ddt_dir_object);21842185for (ddt_type_t type = 0; type < DDT_TYPES; type++)2186for (ddt_class_t class = 0; class < DDT_CLASSES; class++)2187dump_ddt_object(ddt, type, class);21882189dump_ddt_log(ddt);2190}21912192static void2193dump_all_ddts(spa_t *spa)2194{2195ddt_histogram_t ddh_total = {{{0}}};2196ddt_stat_t dds_total = {0};21972198for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)2199dump_ddt(spa->spa_ddt[c]);22002201ddt_get_dedup_stats(spa, &dds_total);22022203if (dds_total.dds_blocks == 0) {2204(void) printf("All DDTs are empty\n");2205return;2206}22072208(void) printf("\n");22092210if (dump_opt['D'] > 1) {2211(void) printf("DDT histogram (aggregated over all DDTs):\n");2212ddt_get_dedup_histogram(spa, &ddh_total);2213zpool_dump_ddt(&dds_total, &ddh_total);2214}22152216dump_dedup_ratio(&dds_total);22172218/*2219* Dump a histogram of unique class entry age2220*/2221if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) {2222ddt_age_histo_t histogram;22232224(void) printf("DDT walk unique, 
building age histogram...\n");2225ddt_prune_walk(spa, 0, &histogram);22262227/*2228* print out histogram for unique entry class birth2229*/2230if (histogram.dah_entries > 0) {2231(void) printf("%5s %9s %4s\n",2232"age", "blocks", "amnt");2233(void) printf("%5s %9s %4s\n",2234"-----", "---------", "----");2235for (int i = 0; i < HIST_BINS; i++) {2236(void) printf("%5d %9d %4d%%\n", 1 << i,2237(int)histogram.dah_age_histo[i],2238(int)((histogram.dah_age_histo[i] * 100) /2239histogram.dah_entries));2240}2241}2242}2243}22442245static void2246dump_brt(spa_t *spa)2247{2248if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {2249printf("BRT: unsupported on this pool\n");2250return;2251}22522253if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {2254printf("BRT: empty\n");2255return;2256}22572258char count[32], used[32], saved[32];2259zdb_nicebytes(brt_get_used(spa), used, sizeof (used));2260zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));2261uint64_t ratio = brt_get_ratio(spa);2262printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved,2263(u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));22642265if (dump_opt['T'] < 2)2266return;22672268for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {2269brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];2270if (!brtvd->bv_initiated) {2271printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);2272continue;2273}22742275zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));2276zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));2277zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));2278printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",2279vdevid, count, used, saved);2280}22812282if (dump_opt['T'] < 3)2283return;22842285/* -TTT shows a per-vdev histograms; -TTTT shows all entries */2286boolean_t do_histo = dump_opt['T'] == 3;22872288char dva[64];22892290if (!do_histo)2291printf("\n%-16s %-10s\n", "DVA", "REFCNT");22922293for (uint64_t vdevid = 0; vdevid < 
spa->spa_brt_nvdevs; vdevid++) {2294brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];2295if (!brtvd->bv_initiated)2296continue;22972298uint64_t counts[64] = {};22992300zap_cursor_t zc;2301zap_attribute_t *za = zap_attribute_alloc();2302for (zap_cursor_init(&zc, spa->spa_meta_objset,2303brtvd->bv_mos_entries);2304zap_cursor_retrieve(&zc, za) == 0;2305zap_cursor_advance(&zc)) {2306uint64_t refcnt;2307VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,2308brtvd->bv_mos_entries,2309(const uint64_t *)za->za_name, 1,2310za->za_integer_length, za->za_num_integers,2311&refcnt));23122313if (do_histo)2314counts[highbit64(refcnt)]++;2315else {2316uint64_t offset =2317*(const uint64_t *)za->za_name;23182319snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx",2320vdevid, (u_longlong_t)offset);2321printf("%-16s %-10llu\n", dva,2322(u_longlong_t)refcnt);2323}2324}2325zap_cursor_fini(&zc);2326zap_attribute_free(za);23272328if (do_histo) {2329printf("\nBRT: vdev %" PRIu642330": DVAs with 2^n refcnts:\n", vdevid);2331dump_histogram(counts, 64, 0);2332}2333}2334}23352336static void2337dump_dtl_seg(void *arg, uint64_t start, uint64_t size)2338{2339char *prefix = arg;23402341(void) printf("%s [%llu,%llu) length %llu\n",2342prefix,2343(u_longlong_t)start,2344(u_longlong_t)(start + size),2345(u_longlong_t)(size));2346}23472348static void2349dump_dtl(vdev_t *vd, int indent)2350{2351spa_t *spa = vd->vdev_spa;2352boolean_t required;2353const char *name[DTL_TYPES] = { "missing", "partial", "scrub",2354"outage" };2355char prefix[256];23562357spa_vdev_state_enter(spa, SCL_NONE);2358required = vdev_dtl_required(vd);2359(void) spa_vdev_state_exit(spa, NULL, 0);23602361if (indent == 0)2362(void) printf("\nDirty time logs:\n\n");23632364(void) printf("\t%*s%s [%s]\n", indent, "",2365vd->vdev_path ? vd->vdev_path :2366vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),2367required ? 
"DTL-required" : "DTL-expendable");23682369for (int t = 0; t < DTL_TYPES; t++) {2370zfs_range_tree_t *rt = vd->vdev_dtl[t];2371if (zfs_range_tree_space(rt) == 0)2372continue;2373(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",2374indent + 2, "", name[t]);2375zfs_range_tree_walk(rt, dump_dtl_seg, prefix);2376if (dump_opt['d'] > 5 && vd->vdev_children == 0)2377dump_spacemap(spa->spa_meta_objset,2378vd->vdev_dtl_sm);2379}23802381for (unsigned c = 0; c < vd->vdev_children; c++)2382dump_dtl(vd->vdev_child[c], indent + 4);2383}23842385static void2386dump_history(spa_t *spa)2387{2388nvlist_t **events = NULL;2389char *buf;2390uint64_t resid, len, off = 0;2391uint_t num = 0;2392int error;2393char tbuf[30];23942395if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {2396(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",2397__func__);2398return;2399}24002401do {2402len = SPA_OLD_MAXBLOCKSIZE;24032404if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {2405(void) fprintf(stderr, "Unable to read history: "2406"error %d\n", error);2407free(buf);2408return;2409}24102411if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)2412break;24132414off -= resid;2415} while (len != 0);24162417(void) printf("\nHistory:\n");2418for (unsigned i = 0; i < num; i++) {2419boolean_t printed = B_FALSE;24202421if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {2422time_t tsec;2423struct tm t;24242425tsec = fnvlist_lookup_uint64(events[i],2426ZPOOL_HIST_TIME);2427(void) localtime_r(&tsec, &t);2428(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);2429} else {2430tbuf[0] = '\0';2431}24322433if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {2434(void) printf("%s %s\n", tbuf,2435fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));2436} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {2437uint64_t ievent;24382439ievent = fnvlist_lookup_uint64(events[i],2440ZPOOL_HIST_INT_EVENT);2441if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)2442goto next;24432444(void) printf(" %s [internal %s 
txg:%ju] %s\n",2445tbuf,2446zfs_history_event_names[ievent],2447fnvlist_lookup_uint64(events[i],2448ZPOOL_HIST_TXG),2449fnvlist_lookup_string(events[i],2450ZPOOL_HIST_INT_STR));2451} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {2452(void) printf("%s [txg:%ju] %s", tbuf,2453fnvlist_lookup_uint64(events[i],2454ZPOOL_HIST_TXG),2455fnvlist_lookup_string(events[i],2456ZPOOL_HIST_INT_NAME));24572458if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {2459(void) printf(" %s (%llu)",2460fnvlist_lookup_string(events[i],2461ZPOOL_HIST_DSNAME),2462(u_longlong_t)fnvlist_lookup_uint64(2463events[i],2464ZPOOL_HIST_DSID));2465}24662467(void) printf(" %s\n", fnvlist_lookup_string(events[i],2468ZPOOL_HIST_INT_STR));2469} else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {2470(void) printf("%s ioctl %s\n", tbuf,2471fnvlist_lookup_string(events[i],2472ZPOOL_HIST_IOCTL));24732474if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {2475(void) printf(" input:\n");2476dump_nvlist(fnvlist_lookup_nvlist(events[i],2477ZPOOL_HIST_INPUT_NVL), 8);2478}2479if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {2480(void) printf(" output:\n");2481dump_nvlist(fnvlist_lookup_nvlist(events[i],2482ZPOOL_HIST_OUTPUT_NVL), 8);2483}2484if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {2485(void) printf(" errno: %lld\n",2486(longlong_t)fnvlist_lookup_int64(events[i],2487ZPOOL_HIST_ERRNO));2488}2489} else {2490goto next;2491}24922493printed = B_TRUE;2494next:2495if (dump_opt['h'] > 1) {2496if (!printed)2497(void) printf("unrecognized record:\n");2498dump_nvlist(events[i], 2);2499}2500}2501free(buf);2502}25032504static void2505dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)2506{2507(void) os, (void) object, (void) data, (void) size;2508}25092510static uint64_t2511blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,2512const zbookmark_phys_t *zb)2513{2514if (dnp == NULL) {2515ASSERT(zb->zb_level < 0);2516if (zb->zb_object == 0)2517return (zb->zb_blkid);2518return 
(zb->zb_blkid * BP_GET_LSIZE(bp));2519}25202521ASSERT(zb->zb_level >= 0);25222523return ((zb->zb_blkid <<2524(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *2525dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);2526}25272528static void2529snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,2530const blkptr_t *bp)2531{2532static abd_t *pabd = NULL;2533void *buf;2534zio_t *zio;2535zfs_zstdhdr_t zstd_hdr;2536int error;25372538if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)2539return;25402541if (BP_IS_HOLE(bp))2542return;25432544if (BP_IS_EMBEDDED(bp)) {2545buf = malloc(SPA_MAXBLOCKSIZE);2546if (buf == NULL) {2547(void) fprintf(stderr, "out of memory\n");2548zdb_exit(1);2549}2550decode_embedded_bp_compressed(bp, buf);2551memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));2552free(buf);2553zstd_hdr.c_len = BE_32(zstd_hdr.c_len);2554zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);2555(void) snprintf(blkbuf + strlen(blkbuf),2556buflen - strlen(blkbuf),2557" ZSTD:size=%u:version=%u:level=%u:EMBEDDED",2558zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),2559zfs_get_hdrlevel(&zstd_hdr));2560return;2561}25622563if (!pabd)2564pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);2565zio = zio_root(spa, NULL, NULL, 0);25662567/* Decrypt but don't decompress so we can read the compression header */2568zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,2569ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,2570NULL));2571error = zio_wait(zio);2572if (error) {2573(void) fprintf(stderr, "read failed: %d\n", error);2574return;2575}2576buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));2577memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));2578zstd_hdr.c_len = BE_32(zstd_hdr.c_len);2579zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);25802581(void) snprintf(blkbuf + strlen(blkbuf),2582buflen - strlen(blkbuf),2583" ZSTD:size=%u:version=%u:level=%u:NORMAL",2584zstd_hdr.c_len, 
zfs_get_hdrversion(&zstd_hdr),2585zfs_get_hdrlevel(&zstd_hdr));25862587abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));2588}25892590static void2591snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,2592boolean_t bp_freed)2593{2594const dva_t *dva = bp->blk_dva;2595int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;2596int i;25972598if (dump_opt['b'] >= 6) {2599snprintf_blkptr(blkbuf, buflen, bp);2600if (bp_freed) {2601(void) snprintf(blkbuf + strlen(blkbuf),2602buflen - strlen(blkbuf), " %s", "FREE");2603}2604return;2605}26062607if (BP_IS_EMBEDDED(bp)) {2608(void) sprintf(blkbuf,2609"EMBEDDED et=%u %llxL/%llxP B=%llu",2610(int)BPE_GET_ETYPE(bp),2611(u_longlong_t)BPE_GET_LSIZE(bp),2612(u_longlong_t)BPE_GET_PSIZE(bp),2613(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));2614return;2615}26162617blkbuf[0] = '\0';26182619for (i = 0; i < ndvas; i++) {2620(void) snprintf(blkbuf + strlen(blkbuf),2621buflen - strlen(blkbuf), "%llu:%llx:%llx%s ",2622(u_longlong_t)DVA_GET_VDEV(&dva[i]),2623(u_longlong_t)DVA_GET_OFFSET(&dva[i]),2624(u_longlong_t)DVA_GET_ASIZE(&dva[i]),2625(DVA_GET_GANG(&dva[i]) ? 
"G" : ""));2626}26272628if (BP_IS_HOLE(bp)) {2629(void) snprintf(blkbuf + strlen(blkbuf),2630buflen - strlen(blkbuf),2631"%llxL B=%llu",2632(u_longlong_t)BP_GET_LSIZE(bp),2633(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));2634} else {2635(void) snprintf(blkbuf + strlen(blkbuf),2636buflen - strlen(blkbuf),2637"%llxL/%llxP F=%llu B=%llu/%llu",2638(u_longlong_t)BP_GET_LSIZE(bp),2639(u_longlong_t)BP_GET_PSIZE(bp),2640(u_longlong_t)BP_GET_FILL(bp),2641(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),2642(u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp));2643if (bp_freed)2644(void) snprintf(blkbuf + strlen(blkbuf),2645buflen - strlen(blkbuf), " %s", "FREE");2646(void) snprintf(blkbuf + strlen(blkbuf),2647buflen - strlen(blkbuf),2648" cksum=%016llx:%016llx:%016llx:%016llx",2649(u_longlong_t)bp->blk_cksum.zc_word[0],2650(u_longlong_t)bp->blk_cksum.zc_word[1],2651(u_longlong_t)bp->blk_cksum.zc_word[2],2652(u_longlong_t)bp->blk_cksum.zc_word[3]);2653}2654}26552656static u_longlong_t2657print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,2658const dnode_phys_t *dnp)2659{2660char blkbuf[BP_SPRINTF_LEN];2661u_longlong_t offset;2662int l;26632664offset = (u_longlong_t)blkid2offset(dnp, bp, zb);26652666(void) printf("%16llx ", offset);26672668ASSERT(zb->zb_level >= 0);26692670for (l = dnp->dn_nlevels - 1; l >= -1; l--) {2671if (l == zb->zb_level) {2672(void) printf("L%llx", (u_longlong_t)zb->zb_level);2673} else {2674(void) printf(" ");2675}2676}26772678snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);2679if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)2680snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);2681(void) printf("%s", blkbuf);26822683if (!BP_IS_EMBEDDED(bp)) {2684if (BP_GET_TYPE(bp) != dnp->dn_type) {2685(void) printf(" (ERROR: Block pointer type "2686"(%llu) does not match dnode type (%hhu))",2687BP_GET_TYPE(bp), dnp->dn_type);2688corruption_found = B_TRUE;2689}2690if (BP_GET_LEVEL(bp) != zb->zb_level) {2691(void) printf(" (ERROR: Block pointer 
level "2692"(%llu) does not match bookmark level (%lld))",2693BP_GET_LEVEL(bp), (longlong_t)zb->zb_level);2694corruption_found = B_TRUE;2695}2696}2697(void) printf("\n");26982699return (offset);2700}27012702static int2703visit_indirect(spa_t *spa, const dnode_phys_t *dnp,2704blkptr_t *bp, const zbookmark_phys_t *zb)2705{2706u_longlong_t offset;2707int err = 0;27082709if (BP_GET_BIRTH(bp) == 0)2710return (0);27112712offset = print_indirect(spa, bp, zb, dnp);27132714if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {2715arc_flags_t flags = ARC_FLAG_WAIT;2716int i;2717blkptr_t *cbp;2718int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;2719arc_buf_t *buf;2720uint64_t fill = 0;2721ASSERT(!BP_IS_REDACTED(bp));27222723err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,2724ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);2725if (err)2726return (err);2727ASSERT(buf->b_data);27282729/* recursively visit blocks below this */2730cbp = buf->b_data;2731for (i = 0; i < epb; i++, cbp++) {2732zbookmark_phys_t czb;27332734SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,2735zb->zb_level - 1,2736zb->zb_blkid * epb + i);2737err = visit_indirect(spa, dnp, cbp, &czb);2738if (err)2739break;2740fill += BP_GET_FILL(cbp);2741}2742if (!err) {2743if (fill != BP_GET_FILL(bp)) {2744(void) printf("%16llx: Block pointer "2745"fill (%llu) does not match calculated "2746"value (%llu)\n", offset, BP_GET_FILL(bp),2747(u_longlong_t)fill);2748corruption_found = B_TRUE;2749}2750}2751arc_buf_destroy(buf, &buf);2752}27532754return (err);2755}27562757static void2758dump_indirect(dnode_t *dn)2759{2760dnode_phys_t *dnp = dn->dn_phys;2761zbookmark_phys_t czb;27622763(void) printf("Indirect blocks:\n");27642765SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),2766dn->dn_object, dnp->dn_nlevels - 1, 0);2767for (int j = 0; j < dnp->dn_nblkptr; j++) {2768czb.zb_blkid = j;2769(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,2770&dnp->dn_blkptr[j], &czb);2771}27722773(void) printf("\n");2774}27752776static 
void
dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
{
	/*
	 * Object viewer for a DSL directory: prints every field of the
	 * dsl_dir_phys_t found in data, one per line.  A NULL data pointer
	 * prints nothing.  os and object are unused.
	 */
	(void) os, (void) object;
	dsl_dir_phys_t *dd = data;
	time_t created;
	char numbuf[32];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "nice truncated");

	if (dd == NULL)
		return;

	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));

	/* ctime() supplies the trailing newline for this line. */
	created = dd->dd_creation_time;
	(void) printf("\t\tcreation_time = %s", ctime(&created));
	(void) printf("\t\thead_dataset_obj = %llu\n",
	    (u_longlong_t)dd->dd_head_dataset_obj);
	(void) printf("\t\tparent_dir_obj = %llu\n",
	    (u_longlong_t)dd->dd_parent_obj);
	(void) printf("\t\torigin_obj = %llu\n",
	    (u_longlong_t)dd->dd_origin_obj);
	(void) printf("\t\tchild_dir_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_child_dir_zapobj);

	zdb_nicenum(dd->dd_used_bytes, numbuf, sizeof (numbuf));
	(void) printf("\t\tused_bytes = %s\n", numbuf);

	zdb_nicenum(dd->dd_compressed_bytes, numbuf, sizeof (numbuf));
	(void) printf("\t\tcompressed_bytes = %s\n", numbuf);

	zdb_nicenum(dd->dd_uncompressed_bytes, numbuf, sizeof (numbuf));
	(void) printf("\t\tuncompressed_bytes = %s\n", numbuf);

	zdb_nicenum(dd->dd_quota, numbuf, sizeof (numbuf));
	(void) printf("\t\tquota = %s\n", numbuf);

	zdb_nicenum(dd->dd_reserved, numbuf, sizeof (numbuf));
	(void) printf("\t\treserved = %s\n", numbuf);

	(void) printf("\t\tprops_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_props_zapobj);
	(void) printf("\t\tdeleg_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_deleg_zapobj);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)dd->dd_flags);

/* Print one slot of the used-space breakdown, labelled by its name. */
#define	SHOW_USED(which) \
	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], numbuf, \
	    sizeof (numbuf)); \
	(void) printf("\t\tused_breakdown[" #which "] = %s\n", numbuf)
	SHOW_USED(HEAD);
	SHOW_USED(SNAP);
	SHOW_USED(CHILD);
	SHOW_USED(CHILD_RSRV);
	SHOW_USED(REFRSRV);
#undef SHOW_USED
	(void) printf("\t\tclones = %llu\n",
	    (u_longlong_t)dd->dd_clones);
}

static void
dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dataset_phys_t *ds = data;
	time_t crtime;
	char used[32], compressed[32], uncompressed[32], unique[32];
	char blkbuf[BP_SPRINTF_LEN];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated");
	_Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ,
	    "compressed truncated");
	_Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ,
	    "uncompressed truncated");
	_Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated");

	if (ds == NULL)
		return;

	ASSERT(size == sizeof (*ds));
	crtime = ds->ds_creation_time;
	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
	    sizeof (uncompressed));
	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);

	(void) printf("\t\tdir_obj = %llu\n",
	    (u_longlong_t)ds->ds_dir_obj);
	(void) printf("\t\tprev_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_obj);
	(void) printf("\t\tprev_snap_txg = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_txg);
	(void) printf("\t\tnext_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_snap_obj);
	(void) printf("\t\tsnapnames_zapobj = %llu\n",
	    (u_longlong_t)ds->ds_snapnames_zapobj);
	(void) printf("\t\tnum_children = %llu\n",
	    (u_longlong_t)ds->ds_num_children);
	(void) printf("\t\tuserrefs_obj = %llu\n",
	    (u_longlong_t)ds->ds_userrefs_obj);
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\tcreation_txg = %llu\n",
	    (u_longlong_t)ds->ds_creation_txg);
	(void) printf("\t\tdeadlist_obj = %llu\n",
	    (u_longlong_t)ds->ds_deadlist_obj);
	(void) printf("\t\tused_bytes = %s\n", used);
	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
	(void) 
printf("\t\tuncompressed_bytes = %s\n", uncompressed);2884(void) printf("\t\tunique = %s\n", unique);2885(void) printf("\t\tfsid_guid = %llu\n",2886(u_longlong_t)ds->ds_fsid_guid);2887(void) printf("\t\tguid = %llu\n",2888(u_longlong_t)ds->ds_guid);2889(void) printf("\t\tflags = %llx\n",2890(u_longlong_t)ds->ds_flags);2891(void) printf("\t\tnext_clones_obj = %llu\n",2892(u_longlong_t)ds->ds_next_clones_obj);2893(void) printf("\t\tprops_obj = %llu\n",2894(u_longlong_t)ds->ds_props_obj);2895(void) printf("\t\tbp = %s\n", blkbuf);2896}28972898static int2899dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)2900{2901(void) arg, (void) tx;2902char blkbuf[BP_SPRINTF_LEN];29032904if (BP_GET_BIRTH(bp) != 0) {2905snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);2906(void) printf("\t%s\n", blkbuf);2907}2908return (0);2909}29102911static void2912dump_bptree(objset_t *os, uint64_t obj, const char *name)2913{2914char bytes[32];2915bptree_phys_t *bt;2916dmu_buf_t *db;29172918/* make sure nicenum has enough space */2919_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");29202921if (dump_opt['d'] < 3)2922return;29232924VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));2925bt = db->db_data;2926zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));2927(void) printf("\n %s: %llu datasets, %s\n",2928name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);2929dmu_buf_rele(db, FTAG);29302931if (dump_opt['d'] < 5)2932return;29332934(void) printf("\n");29352936(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);2937}29382939static int2940dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)2941{2942(void) arg, (void) tx;2943char blkbuf[BP_SPRINTF_LEN];29442945ASSERT(BP_GET_BIRTH(bp) != 0);2946snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);2947(void) printf("\t%s\n", blkbuf);2948return (0);2949}29502951static void2952dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)2953{2954char bytes[32];2955char 
comp[32];2956char uncomp[32];2957uint64_t i;29582959/* make sure nicenum has enough space */2960_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");2961_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");2962_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");29632964if (dump_opt['d'] < 3)2965return;29662967zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));2968if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {2969zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));2970zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));2971if (bpo->bpo_havefreed) {2972(void) printf(" %*s: object %llu, %llu local "2973"blkptrs, %llu freed, %llu subobjs in object %llu, "2974"%s (%s/%s comp)\n",2975indent * 8, name,2976(u_longlong_t)bpo->bpo_object,2977(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,2978(u_longlong_t)bpo->bpo_phys->bpo_num_freed,2979(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,2980(u_longlong_t)bpo->bpo_phys->bpo_subobjs,2981bytes, comp, uncomp);2982} else {2983(void) printf(" %*s: object %llu, %llu local "2984"blkptrs, %llu subobjs in object %llu, "2985"%s (%s/%s comp)\n",2986indent * 8, name,2987(u_longlong_t)bpo->bpo_object,2988(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,2989(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,2990(u_longlong_t)bpo->bpo_phys->bpo_subobjs,2991bytes, comp, uncomp);2992}29932994for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {2995uint64_t subobj;2996bpobj_t subbpo;2997int error;2998VERIFY0(dmu_read(bpo->bpo_os,2999bpo->bpo_phys->bpo_subobjs,3000i * sizeof (subobj), sizeof (subobj), &subobj, 0));3001error = bpobj_open(&subbpo, bpo->bpo_os, subobj);3002if (error != 0) {3003(void) printf("ERROR %u while trying to open "3004"subobj id %llu\n",3005error, (u_longlong_t)subobj);3006corruption_found = B_TRUE;3007continue;3008}3009dump_full_bpobj(&subbpo, "subobj", indent + 1);3010bpobj_close(&subbpo);3011}3012} else {3013if (bpo->bpo_havefreed) {3014(void) printf(" 
%*s: object %llu, %llu blkptrs, "3015"%llu freed, %s\n",3016indent * 8, name,3017(u_longlong_t)bpo->bpo_object,3018(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,3019(u_longlong_t)bpo->bpo_phys->bpo_num_freed,3020bytes);3021} else {3022(void) printf(" %*s: object %llu, %llu blkptrs, "3023"%s\n",3024indent * 8, name,3025(u_longlong_t)bpo->bpo_object,3026(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,3027bytes);3028}3029}30303031if (dump_opt['d'] < 5)3032return;303330343035if (indent == 0) {3036(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);3037(void) printf("\n");3038}3039}30403041static int3042dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,3043boolean_t print_list)3044{3045int err = 0;3046zfs_bookmark_phys_t prop;3047objset_t *mos = dp->dp_spa->spa_meta_objset;3048err = dsl_bookmark_lookup(dp, name, NULL, &prop);30493050if (err != 0) {3051return (err);3052}30533054(void) printf("\t#%s: ", strchr(name, '#') + 1);3055(void) printf("{guid: %llx creation_txg: %llu creation_time: "3056"%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,3057(u_longlong_t)prop.zbm_creation_txg,3058(u_longlong_t)prop.zbm_creation_time,3059(u_longlong_t)prop.zbm_redaction_obj);30603061IMPLY(print_list, print_redact);3062if (!print_redact || prop.zbm_redaction_obj == 0)3063return (0);30643065redaction_list_t *rl;3066VERIFY0(dsl_redaction_list_hold_obj(dp,3067prop.zbm_redaction_obj, FTAG, &rl));30683069redaction_list_phys_t *rlp = rl->rl_phys;3070(void) printf("\tRedacted:\n\t\tProgress: ");3071if (rlp->rlp_last_object != UINT64_MAX ||3072rlp->rlp_last_blkid != UINT64_MAX) {3073(void) printf("%llu %llu (incomplete)\n",3074(u_longlong_t)rlp->rlp_last_object,3075(u_longlong_t)rlp->rlp_last_blkid);3076} else {3077(void) printf("complete\n");3078}3079(void) printf("\t\tSnapshots: [");3080for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {3081if (i > 0)3082(void) printf(", ");3083(void) printf("%0llu",3084(u_longlong_t)rlp->rlp_snaps[i]);3085}3086(void) 
printf("]\n\t\tLength: %llu\n",3087(u_longlong_t)rlp->rlp_num_entries);30883089if (!print_list) {3090dsl_redaction_list_rele(rl, FTAG);3091return (0);3092}30933094if (rlp->rlp_num_entries == 0) {3095dsl_redaction_list_rele(rl, FTAG);3096(void) printf("\t\tRedaction List: []\n\n");3097return (0);3098}30993100redact_block_phys_t *rbp_buf;3101uint64_t size;3102dmu_object_info_t doi;31033104VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));3105size = doi.doi_max_offset;3106rbp_buf = kmem_alloc(size, KM_SLEEP);31073108err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,3109rbp_buf, 0);3110if (err != 0) {3111dsl_redaction_list_rele(rl, FTAG);3112kmem_free(rbp_buf, size);3113return (err);3114}31153116(void) printf("\t\tRedaction List: [{object: %llx, offset: "3117"%llx, blksz: %x, count: %llx}",3118(u_longlong_t)rbp_buf[0].rbp_object,3119(u_longlong_t)rbp_buf[0].rbp_blkid,3120(uint_t)(redact_block_get_size(&rbp_buf[0])),3121(u_longlong_t)redact_block_get_count(&rbp_buf[0]));31223123for (size_t i = 1; i < rlp->rlp_num_entries; i++) {3124(void) printf(",\n\t\t{object: %llx, offset: %llx, "3125"blksz: %x, count: %llx}",3126(u_longlong_t)rbp_buf[i].rbp_object,3127(u_longlong_t)rbp_buf[i].rbp_blkid,3128(uint_t)(redact_block_get_size(&rbp_buf[i])),3129(u_longlong_t)redact_block_get_count(&rbp_buf[i]));3130}3131dsl_redaction_list_rele(rl, FTAG);3132kmem_free(rbp_buf, size);3133(void) printf("]\n\n");3134return (0);3135}31363137static void3138dump_bookmarks(objset_t *os, int verbosity)3139{3140zap_cursor_t zc;3141zap_attribute_t *attrp;3142dsl_dataset_t *ds = dmu_objset_ds(os);3143dsl_pool_t *dp = spa_get_dsl(os->os_spa);3144objset_t *mos = os->os_spa->spa_meta_objset;3145if (verbosity < 4)3146return;3147attrp = zap_attribute_alloc();3148dsl_pool_config_enter(dp, FTAG);31493150for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);3151zap_cursor_retrieve(&zc, attrp) == 0;3152zap_cursor_advance(&zc)) {3153char osname[ZFS_MAX_DATASET_NAME_LEN];3154char 
buf[ZFS_MAX_DATASET_NAME_LEN];3155int len;3156dmu_objset_name(os, osname);3157len = snprintf(buf, sizeof (buf), "%s#%s", osname,3158attrp->za_name);3159VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);3160(void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);3161}3162zap_cursor_fini(&zc);3163dsl_pool_config_exit(dp, FTAG);3164zap_attribute_free(attrp);3165}31663167static void3168bpobj_count_refd(bpobj_t *bpo)3169{3170mos_obj_refd(bpo->bpo_object);31713172if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {3173mos_obj_refd(bpo->bpo_phys->bpo_subobjs);3174for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {3175uint64_t subobj;3176bpobj_t subbpo;3177int error;3178VERIFY0(dmu_read(bpo->bpo_os,3179bpo->bpo_phys->bpo_subobjs,3180i * sizeof (subobj), sizeof (subobj), &subobj, 0));3181error = bpobj_open(&subbpo, bpo->bpo_os, subobj);3182if (error != 0) {3183(void) printf("ERROR %u while trying to open "3184"subobj id %llu\n",3185error, (u_longlong_t)subobj);3186corruption_found = B_TRUE;3187continue;3188}3189bpobj_count_refd(&subbpo);3190bpobj_close(&subbpo);3191}3192}3193}31943195static int3196dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)3197{3198spa_t *spa = arg;3199uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;3200if (dle->dle_bpobj.bpo_object != empty_bpobj)3201bpobj_count_refd(&dle->dle_bpobj);3202return (0);3203}32043205static int3206dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)3207{3208ASSERT0P(arg);3209if (dump_opt['d'] >= 5) {3210char buf[128];3211(void) snprintf(buf, sizeof (buf),3212"mintxg %llu -> obj %llu",3213(longlong_t)dle->dle_mintxg,3214(longlong_t)dle->dle_bpobj.bpo_object);32153216dump_full_bpobj(&dle->dle_bpobj, buf, 0);3217} else {3218(void) printf("mintxg %llu -> obj %llu\n",3219(longlong_t)dle->dle_mintxg,3220(longlong_t)dle->dle_bpobj.bpo_object);3221}3222return (0);3223}32243225static void3226dump_blkptr_list(dsl_deadlist_t *dl, const char *name)3227{3228char bytes[32];3229char 
comp[32];3230char uncomp[32];3231char entries[32];3232spa_t *spa = dmu_objset_spa(dl->dl_os);3233uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;32343235if (dl->dl_oldfmt) {3236if (dl->dl_bpobj.bpo_object != empty_bpobj)3237bpobj_count_refd(&dl->dl_bpobj);3238} else {3239mos_obj_refd(dl->dl_object);3240dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);3241}32423243/* make sure nicenum has enough space */3244_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");3245_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");3246_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");3247_Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");32483249if (dump_opt['d'] < 3)3250return;32513252if (dl->dl_oldfmt) {3253dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);3254return;3255}32563257zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));3258zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));3259zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));3260zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));3261(void) printf("\n %s: %s (%s/%s comp), %s entries\n",3262name, bytes, comp, uncomp, entries);32633264if (dump_opt['d'] < 4)3265return;32663267(void) putchar('\n');32683269dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);3270}32713272static int3273verify_dd_livelist(objset_t *os)3274{3275uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;3276dsl_pool_t *dp = spa_get_dsl(os->os_spa);3277dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;32783279ASSERT(!dmu_objset_is_snapshot(os));3280if (!dsl_deadlist_is_open(&dd->dd_livelist))3281return (0);32823283/* Iterate through the livelist to check for duplicates */3284dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,3285NULL);32863287dsl_pool_config_enter(dp, FTAG);3288dsl_deadlist_space(&dd->dd_livelist, &ll_used,3289&ll_comp, &ll_uncomp);32903291dsl_dataset_t 
*origin_ds;3292ASSERT(dsl_pool_config_held(dp));3293VERIFY0(dsl_dataset_hold_obj(dp,3294dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));3295VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,3296&used, &comp, &uncomp));3297dsl_dataset_rele(origin_ds, FTAG);3298dsl_pool_config_exit(dp, FTAG);3299/*3300* It's possible that the dataset's uncomp space is larger than the3301* livelist's because livelists do not track embedded block pointers3302*/3303if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {3304char nice_used[32], nice_comp[32], nice_uncomp[32];3305(void) printf("Discrepancy in space accounting:\n");3306zdb_nicenum(used, nice_used, sizeof (nice_used));3307zdb_nicenum(comp, nice_comp, sizeof (nice_comp));3308zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));3309(void) printf("dir: used %s, comp %s, uncomp %s\n",3310nice_used, nice_comp, nice_uncomp);3311zdb_nicenum(ll_used, nice_used, sizeof (nice_used));3312zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));3313zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));3314(void) printf("livelist: used %s, comp %s, uncomp %s\n",3315nice_used, nice_comp, nice_uncomp);3316return (1);3317}3318return (0);3319}33203321static char *key_material = NULL;33223323static boolean_t3324zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)3325{3326uint64_t keyformat, salt, iters;3327int i;3328unsigned char c;3329FILE *f;33303331VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,3332zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),33331, &keyformat));33343335switch (keyformat) {3336case ZFS_KEYFORMAT_HEX:3337for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {3338if (!isxdigit(key_material[i]) ||3339!isxdigit(key_material[i+1]))3340return (B_FALSE);3341if (sscanf(&key_material[i], "%02hhx", &c) != 1)3342return (B_FALSE);3343key_out[i / 2] = c;3344}3345break;33463347case ZFS_KEYFORMAT_PASSPHRASE:3348VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,3349dd->dd_crypto_obj, 
zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),3350sizeof (uint64_t), 1, &salt));3351VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,3352dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),3353sizeof (uint64_t), 1, &iters));33543355if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),3356((uint8_t *)&salt), sizeof (uint64_t), iters,3357WRAPPING_KEY_LEN, key_out) != 1)3358return (B_FALSE);33593360break;33613362case ZFS_KEYFORMAT_RAW:3363if ((f = fopen(key_material, "r")) == NULL)3364return (B_FALSE);33653366if (fread(key_out, 1, WRAPPING_KEY_LEN, f) !=3367WRAPPING_KEY_LEN) {3368(void) fclose(f);3369return (B_FALSE);3370}33713372/* Check the key length */3373if (fgetc(f) != EOF) {3374(void) fclose(f);3375return (B_FALSE);3376}33773378(void) fclose(f);3379break;33803381default:3382fatal("no support for key format %u\n",3383(unsigned int) keyformat);3384}33853386return (B_TRUE);3387}33883389static char encroot[ZFS_MAX_DATASET_NAME_LEN];3390static boolean_t key_loaded = B_FALSE;33913392static void3393zdb_load_key(objset_t *os)3394{3395dsl_pool_t *dp;3396dsl_dir_t *dd, *rdd;3397uint8_t key[WRAPPING_KEY_LEN];3398uint64_t rddobj;3399int err;34003401dp = spa_get_dsl(os->os_spa);3402dd = os->os_dsl_dataset->ds_dir;34033404dsl_pool_config_enter(dp, FTAG);3405VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,3406DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj));3407VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd));3408dsl_dir_name(rdd, encroot);3409dsl_dir_rele(rdd, FTAG);34103411if (!zdb_derive_key(dd, key))3412fatal("couldn't derive encryption key");34133414dsl_pool_config_exit(dp, FTAG);34153416ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);34173418dsl_crypto_params_t *dcp;3419nvlist_t *crypto_args;34203421crypto_args = fnvlist_alloc();3422fnvlist_add_uint8_array(crypto_args, "wkeydata",3423(uint8_t *)key, WRAPPING_KEY_LEN);3424VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,3425NULL, crypto_args, 
&dcp));3426err = spa_keystore_load_wkey(encroot, dcp, B_FALSE);34273428dsl_crypto_params_free(dcp, (err != 0));3429fnvlist_free(crypto_args);34303431if (err != 0)3432fatal(3433"couldn't load encryption key for %s: %s",3434encroot, err == ZFS_ERR_CRYPTO_NOTSUP ?3435"crypto params not supported" : strerror(err));34363437ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);34383439printf("Unlocked encryption root: %s\n", encroot);3440key_loaded = B_TRUE;3441}34423443static void3444zdb_unload_key(void)3445{3446if (!key_loaded)3447return;34483449VERIFY0(spa_keystore_unload_wkey(encroot));3450key_loaded = B_FALSE;3451}34523453static avl_tree_t idx_tree;3454static avl_tree_t domain_tree;3455static boolean_t fuid_table_loaded;3456static objset_t *sa_os = NULL;3457static sa_attr_type_t *sa_attr_table = NULL;34583459static int3460open_objset(const char *path, const void *tag, objset_t **osp)3461{3462int err;3463uint64_t sa_attrs = 0;3464uint64_t version = 0;34653466VERIFY0P(sa_os);34673468/*3469* We can't own an objset if it's redacted. Therefore, we do this3470* dance: hold the objset, then acquire a long hold on its dataset, then3471* release the pool (which is held as part of holding the objset).3472*/34733474if (dump_opt['K']) {3475/* decryption requested, try to load keys */3476err = dmu_objset_hold(path, tag, osp);3477if (err != 0) {3478(void) fprintf(stderr, "failed to hold dataset "3479"'%s': %s\n",3480path, strerror(err));3481return (err);3482}3483dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);3484dsl_pool_rele(dmu_objset_pool(*osp), tag);34853486/* succeeds or dies */3487zdb_load_key(*osp);34883489/* release it all */3490dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);3491dsl_dataset_rele(dmu_objset_ds(*osp), tag);3492}34933494int ds_hold_flags = key_loaded ? 
DS_HOLD_FLAG_DECRYPT : 0;34953496err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);3497if (err != 0) {3498(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",3499path, strerror(err));3500return (err);3501}3502dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);3503dsl_pool_rele(dmu_objset_pool(*osp), tag);35043505if (dmu_objset_type(*osp) == DMU_OST_ZFS &&3506(key_loaded || !(*osp)->os_encrypted)) {3507(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,35088, 1, &version);3509if (version >= ZPL_VERSION_SA) {3510(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,35118, 1, &sa_attrs);3512}3513err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,3514&sa_attr_table);3515if (err != 0) {3516(void) fprintf(stderr, "sa_setup failed: %s\n",3517strerror(err));3518dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);3519dsl_dataset_rele_flags(dmu_objset_ds(*osp),3520ds_hold_flags, tag);3521*osp = NULL;3522}3523}3524sa_os = *osp;35253526return (err);3527}35283529static void3530close_objset(objset_t *os, const void *tag)3531{3532VERIFY3P(os, ==, sa_os);3533if (os->os_sa != NULL)3534sa_tear_down(os);3535dsl_dataset_long_rele(dmu_objset_ds(os), tag);3536dsl_dataset_rele_flags(dmu_objset_ds(os),3537key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag);3538sa_attr_table = NULL;3539sa_os = NULL;35403541zdb_unload_key();3542}35433544static void3545fuid_table_destroy(void)3546{3547if (fuid_table_loaded) {3548zfs_fuid_table_destroy(&idx_tree, &domain_tree);3549fuid_table_loaded = B_FALSE;3550}3551}35523553/*3554* Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on3555* a live pool are normally cleaned up during ddt_sync(). We can't do that (and3556* wouldn't want to anyway), but if we don't clean up the presence of stuff on3557* ddt_tree will trip asserts in ddt_table_free(). 
So, we clean up ourselves.3558*3559* Note that this is not a particularly efficient way to do this, but3560* ddt_remove() is the only public method that can do the work we need, and it3561* requires the right locks and etc to do the job. This is only ever called3562* during zdb shutdown so efficiency is not especially important.3563*/3564static void3565zdb_ddt_cleanup(spa_t *spa)3566{3567for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {3568ddt_t *ddt = spa->spa_ddt[c];3569if (!ddt)3570continue;35713572spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);3573ddt_enter(ddt);3574ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;3575while (dde) {3576next = AVL_NEXT(&ddt->ddt_tree, dde);3577dde->dde_io = NULL;3578ddt_remove(ddt, dde);3579dde = next;3580}3581ddt_exit(ddt);3582spa_config_exit(spa, SCL_CONFIG, FTAG);3583}3584}35853586static void3587zdb_exit(int reason)3588{3589if (spa != NULL)3590zdb_ddt_cleanup(spa);35913592if (os != NULL) {3593close_objset(os, FTAG);3594} else if (spa != NULL) {3595spa_close(spa, FTAG);3596}35973598fuid_table_destroy();35993600if (kernel_init_done)3601kernel_fini();36023603exit(reason);3604}36053606/*3607* print uid or gid information.3608* For normal POSIX id just the id is printed in decimal format.3609* For CIFS files with FUID the fuid is printed in hex followed by3610* the domain-rid string.3611*/3612static void3613print_idstr(uint64_t id, const char *id_type)3614{3615if (FUID_INDEX(id)) {3616const char *domain =3617zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));3618(void) printf("\t%s %llx [%s-%d]\n", id_type,3619(u_longlong_t)id, domain, (int)FUID_RID(id));3620} else {3621(void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);3622}36233624}36253626static void3627dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)3628{3629uint32_t uid_idx, gid_idx;36303631uid_idx = FUID_INDEX(uid);3632gid_idx = FUID_INDEX(gid);36333634/* Load domain table, if not already loaded */3635if (!fuid_table_loaded && (uid_idx || gid_idx)) 
{3636uint64_t fuid_obj;36373638/* first find the fuid object. It lives in the master node */3639VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,36408, 1, &fuid_obj));3641zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);3642(void) zfs_fuid_table_load(os, fuid_obj,3643&idx_tree, &domain_tree);3644fuid_table_loaded = B_TRUE;3645}36463647print_idstr(uid, "uid");3648print_idstr(gid, "gid");3649}36503651static void3652dump_znode_sa_xattr(sa_handle_t *hdl)3653{3654nvlist_t *sa_xattr;3655nvpair_t *elem = NULL;3656int sa_xattr_size = 0;3657int sa_xattr_entries = 0;3658int error;3659char *sa_xattr_packed;36603661error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);3662if (error || sa_xattr_size == 0)3663return;36643665sa_xattr_packed = malloc(sa_xattr_size);3666if (sa_xattr_packed == NULL)3667return;36683669error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],3670sa_xattr_packed, sa_xattr_size);3671if (error) {3672free(sa_xattr_packed);3673return;3674}36753676error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);3677if (error) {3678free(sa_xattr_packed);3679return;3680}36813682while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)3683sa_xattr_entries++;36843685(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",3686sa_xattr_size, sa_xattr_entries);3687while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {3688boolean_t can_print = !dump_opt['P'];3689uchar_t *value;3690uint_t cnt, idx;36913692(void) printf("\t\t%s = ", nvpair_name(elem));3693nvpair_value_byte_array(elem, &value, &cnt);36943695for (idx = 0; idx < cnt; ++idx) {3696if (!isprint(value[idx])) {3697can_print = B_FALSE;3698break;3699}3700}37013702for (idx = 0; idx < cnt; ++idx) {3703if (can_print)3704(void) putchar(value[idx]);3705else3706(void) printf("\\%3.3o", value[idx]);3707}3708(void) putchar('\n');3709}37103711nvlist_free(sa_xattr);3712free(sa_xattr_packed);3713}37143715static void3716dump_znode_symlink(sa_handle_t *hdl)3717{3718int sa_symlink_size = 0;3719char 
linktarget[MAXPATHLEN];3720int error;37213722error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);3723if (error || sa_symlink_size == 0) {3724return;3725}3726if (sa_symlink_size >= sizeof (linktarget)) {3727(void) printf("symlink size %d is too large\n",3728sa_symlink_size);3729return;3730}3731linktarget[sa_symlink_size] = '\0';3732if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],3733&linktarget, sa_symlink_size) == 0)3734(void) printf("\ttarget %s\n", linktarget);3735}37363737static void3738dump_znode(objset_t *os, uint64_t object, void *data, size_t size)3739{3740(void) data, (void) size;3741char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */3742sa_handle_t *hdl;3743uint64_t xattr, rdev, gen;3744uint64_t uid, gid, mode, fsize, parent, links;3745uint64_t pflags;3746uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];3747time_t z_crtime, z_atime, z_mtime, z_ctime;3748sa_bulk_attr_t bulk[12];3749int idx = 0;3750int error;37513752VERIFY3P(os, ==, sa_os);3753if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {3754(void) printf("Failed to get handle for SA znode\n");3755return;3756}37573758SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);3759SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);3760SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,3761&links, 8);3762SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);3763SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,3764&mode, 8);3765SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],3766NULL, &parent, 8);3767SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,3768&fsize, 8);3769SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,3770acctm, 16);3771SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,3772modtm, 16);3773SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,3774crtm, 16);3775SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,3776chgtm, 16);3777SA_ADD_BULK_ATTR(bulk, idx, 
sa_attr_table[ZPL_FLAGS], NULL,3778&pflags, 8);37793780if (sa_bulk_lookup(hdl, bulk, idx)) {3781(void) sa_handle_destroy(hdl);3782return;3783}37843785z_crtime = (time_t)crtm[0];3786z_atime = (time_t)acctm[0];3787z_mtime = (time_t)modtm[0];3788z_ctime = (time_t)chgtm[0];37893790if (dump_opt['d'] > 4) {3791error = zfs_obj_to_path(os, object, path, sizeof (path));3792if (error == ESTALE) {3793(void) snprintf(path, sizeof (path), "on delete queue");3794} else if (error != 0) {3795leaked_objects++;3796(void) snprintf(path, sizeof (path),3797"path not found, possibly leaked");3798}3799(void) printf("\tpath %s\n", path);3800}38013802if (S_ISLNK(mode))3803dump_znode_symlink(hdl);3804dump_uidgid(os, uid, gid);3805(void) printf("\tatime %s", ctime(&z_atime));3806(void) printf("\tmtime %s", ctime(&z_mtime));3807(void) printf("\tctime %s", ctime(&z_ctime));3808(void) printf("\tcrtime %s", ctime(&z_crtime));3809(void) printf("\tgen %llu\n", (u_longlong_t)gen);3810(void) printf("\tmode %llo\n", (u_longlong_t)mode);3811(void) printf("\tsize %llu\n", (u_longlong_t)fsize);3812(void) printf("\tparent %llu\n", (u_longlong_t)parent);3813(void) printf("\tlinks %llu\n", (u_longlong_t)links);3814(void) printf("\tpflags %llx\n", (u_longlong_t)pflags);3815if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {3816uint64_t projid;38173818if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,3819sizeof (uint64_t)) == 0)3820(void) printf("\tprojid %llu\n", (u_longlong_t)projid);3821}3822if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,3823sizeof (uint64_t)) == 0)3824(void) printf("\txattr %llu\n", (u_longlong_t)xattr);3825if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,3826sizeof (uint64_t)) == 0)3827(void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);3828dump_znode_sa_xattr(hdl);3829sa_handle_destroy(hdl);3830}38313832static void3833dump_acl(objset_t *os, uint64_t object, void *data, size_t size)3834{3835(void) os, (void) object, (void) data, (void) 
size;3836}38373838static void3839dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)3840{3841(void) os, (void) object, (void) data, (void) size;3842}38433844static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {3845dump_none, /* unallocated */3846dump_zap, /* object directory */3847dump_uint64, /* object array */3848dump_none, /* packed nvlist */3849dump_packed_nvlist, /* packed nvlist size */3850dump_none, /* bpobj */3851dump_bpobj, /* bpobj header */3852dump_none, /* SPA space map header */3853dump_none, /* SPA space map */3854dump_none, /* ZIL intent log */3855dump_dnode, /* DMU dnode */3856dump_dmu_objset, /* DMU objset */3857dump_dsl_dir, /* DSL directory */3858dump_zap, /* DSL directory child map */3859dump_zap, /* DSL dataset snap map */3860dump_zap, /* DSL props */3861dump_dsl_dataset, /* DSL dataset */3862dump_znode, /* ZFS znode */3863dump_acl, /* ZFS V0 ACL */3864dump_uint8, /* ZFS plain file */3865dump_zpldir, /* ZFS directory */3866dump_zap, /* ZFS master node */3867dump_zap, /* ZFS delete queue */3868dump_uint8, /* zvol object */3869dump_zap, /* zvol prop */3870dump_uint8, /* other uint8[] */3871dump_uint64, /* other uint64[] */3872dump_zap, /* other ZAP */3873dump_zap, /* persistent error log */3874dump_uint8, /* SPA history */3875dump_history_offsets, /* SPA history offsets */3876dump_zap, /* Pool properties */3877dump_zap, /* DSL permissions */3878dump_acl, /* ZFS ACL */3879dump_uint8, /* ZFS SYSACL */3880dump_none, /* FUID nvlist */3881dump_packed_nvlist, /* FUID nvlist size */3882dump_zap, /* DSL dataset next clones */3883dump_zap, /* DSL scrub queue */3884dump_zap, /* ZFS user/group/project used */3885dump_zap, /* ZFS user/group/project quota */3886dump_zap, /* snapshot refcount tags */3887dump_ddt_zap, /* DDT ZAP object */3888dump_zap, /* DDT statistics */3889dump_znode, /* SA object */3890dump_zap, /* SA Master Node */3891dump_sa_attrs, /* SA attribute registration */3892dump_sa_layouts, /* SA attribute layouts 
*/3893dump_zap, /* DSL scrub translations */3894dump_none, /* fake dedup BP */3895dump_zap, /* deadlist */3896dump_none, /* deadlist hdr */3897dump_zap, /* dsl clones */3898dump_bpobj_subobjs, /* bpobj subobjs */3899dump_unknown, /* Unknown type, must be last */3900};39013902static boolean_t3903match_object_type(dmu_object_type_t obj_type, uint64_t flags)3904{3905boolean_t match = B_TRUE;39063907switch (obj_type) {3908case DMU_OT_DIRECTORY_CONTENTS:3909if (!(flags & ZOR_FLAG_DIRECTORY))3910match = B_FALSE;3911break;3912case DMU_OT_PLAIN_FILE_CONTENTS:3913if (!(flags & ZOR_FLAG_PLAIN_FILE))3914match = B_FALSE;3915break;3916case DMU_OT_SPACE_MAP:3917if (!(flags & ZOR_FLAG_SPACE_MAP))3918match = B_FALSE;3919break;3920default:3921if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {3922if (!(flags & ZOR_FLAG_ZAP))3923match = B_FALSE;3924break;3925}39263927/*3928* If all bits except some of the supported flags are3929* set, the user combined the all-types flag (A) with3930* a negated flag to exclude some types (e.g. 
A-f to3931* show all object types except plain files).3932*/3933if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)3934match = B_FALSE;39353936break;3937}39383939return (match);3940}39413942static void3943dump_object(objset_t *os, uint64_t object, int verbosity,3944boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)3945{3946dmu_buf_t *db = NULL;3947dmu_object_info_t doi;3948dnode_t *dn;3949boolean_t dnode_held = B_FALSE;3950void *bonus = NULL;3951size_t bsize = 0;3952char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];3953char bonus_size[32];3954char aux[50];3955int error;39563957/* make sure nicenum has enough space */3958_Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");3959_Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");3960_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");3961_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");3962_Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,3963"bonus_size truncated");39643965if (*print_header) {3966(void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",3967"Object", "lvl", "iblk", "dblk", "dsize", "dnsize",3968"lsize", "%full", "type");3969*print_header = 0;3970}39713972if (object == 0) {3973dn = DMU_META_DNODE(os);3974dmu_object_info_from_dnode(dn, &doi);3975} else {3976/*3977* Encrypted datasets will have sensitive bonus buffers3978* encrypted. 
Therefore we cannot hold the bonus buffer and3979* must hold the dnode itself instead.3980*/3981error = dmu_object_info(os, object, &doi);3982if (error)3983fatal("dmu_object_info() failed, errno %u", error);39843985if (!key_loaded && os->os_encrypted &&3986DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {3987error = dnode_hold(os, object, FTAG, &dn);3988if (error)3989fatal("dnode_hold() failed, errno %u", error);3990dnode_held = B_TRUE;3991} else {3992error = dmu_bonus_hold(os, object, FTAG, &db);3993if (error)3994fatal("dmu_bonus_hold(%llu) failed, errno %u",3995object, error);3996bonus = db->db_data;3997bsize = db->db_size;3998dn = DB_DNODE((dmu_buf_impl_t *)db);3999}4000}40014002/*4003* Default to showing all object types if no flags were specified.4004*/4005if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&4006!match_object_type(doi.doi_type, flags))4007goto out;40084009if (dnode_slots_used)4010*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;40114012zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));4013zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));4014zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));4015zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));4016zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));4017zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));4018(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *4019doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ?4020DNODES_PER_BLOCK : 1) / doi.doi_max_offset);40214022aux[0] = '\0';40234024if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {4025(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),4026" (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));4027}40284029if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&4030ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {4031const char *compname = NULL;4032if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,4033ZIO_COMPRESS_RAW(os->os_compress, 
os->os_complevel),4034&compname) == 0) {4035(void) snprintf(aux + strlen(aux),4036sizeof (aux) - strlen(aux), " (Z=inherit=%s)",4037compname);4038} else {4039(void) snprintf(aux + strlen(aux),4040sizeof (aux) - strlen(aux),4041" (Z=inherit=%s-unknown)",4042ZDB_COMPRESS_NAME(os->os_compress));4043}4044} else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {4045(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),4046" (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));4047} else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {4048(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),4049" (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));4050}40514052(void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",4053(u_longlong_t)object, doi.doi_indirection, iblk, dblk,4054asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);40554056if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {4057(void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",4058"", "", "", "", "", "", bonus_size, "bonus",4059zdb_ot_name(doi.doi_bonus_type));4060}40614062if (verbosity >= 4) {4063(void) printf("\tdnode flags: %s%s%s%s\n",4064(dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?4065"USED_BYTES " : "",4066(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?4067"USERUSED_ACCOUNTED " : "",4068(dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?4069"USEROBJUSED_ACCOUNTED " : "",4070(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?4071"SPILL_BLKPTR" : "");4072(void) printf("\tdnode maxblkid: %llu\n",4073(longlong_t)dn->dn_phys->dn_maxblkid);40744075if (!dnode_held) {4076object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,4077object, bonus, bsize);4078} else {4079(void) printf("\t\t(bonus encrypted)\n");4080}40814082if (key_loaded ||4083(!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) {4084object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,4085NULL, 0);4086} else {4087(void) printf("\t\t(object 
encrypted)\n");4088}40894090*print_header = B_TRUE;4091}40924093if (verbosity >= 5) {4094if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {4095char blkbuf[BP_SPRINTF_LEN];4096snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),4097DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE);4098(void) printf("\nSpill block: %s\n", blkbuf);4099}4100dump_indirect(dn);4101}41024103if (verbosity >= 5) {4104/*4105* Report the list of segments that comprise the object.4106*/4107uint64_t start = 0;4108uint64_t end;4109uint64_t blkfill = 1;4110int minlvl = 1;41114112if (dn->dn_type == DMU_OT_DNODE) {4113minlvl = 0;4114blkfill = DNODES_PER_BLOCK;4115}41164117for (;;) {4118char segsize[32];4119/* make sure nicenum has enough space */4120_Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ,4121"segsize truncated");4122error = dnode_next_offset(dn,41230, &start, minlvl, blkfill, 0);4124if (error)4125break;4126end = start;4127error = dnode_next_offset(dn,4128DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);4129zdb_nicenum(end - start, segsize, sizeof (segsize));4130(void) printf("\t\tsegment [%016llx, %016llx)"4131" size %5s\n", (u_longlong_t)start,4132(u_longlong_t)end, segsize);4133if (error)4134break;4135start = end;4136}4137}41384139out:4140if (db != NULL)4141dmu_buf_rele(db, FTAG);4142if (dnode_held)4143dnode_rele(dn, FTAG);4144}41454146static void4147count_dir_mos_objects(dsl_dir_t *dd)4148{4149mos_obj_refd(dd->dd_object);4150mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);4151mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);4152mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);4153mos_obj_refd(dsl_dir_phys(dd)->dd_clones);41544155/*4156* The dd_crypto_obj can be referenced by multiple dsl_dir's.4157* Ignore the references after the first one.4158*/4159mos_obj_refd_multiple(dd->dd_crypto_obj);4160}41614162static void4163count_ds_mos_objects(dsl_dataset_t 
*ds)4164{4165mos_obj_refd(ds->ds_object);4166mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);4167mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);4168mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);4169mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);4170mos_obj_refd(ds->ds_bookmarks_obj);41714172if (!dsl_dataset_is_snapshot(ds)) {4173count_dir_mos_objects(ds->ds_dir);4174}4175}41764177static const char *const objset_types[DMU_OST_NUMTYPES] = {4178"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };41794180/*4181* Parse a string denoting a range of object IDs of the form4182* <start>[:<end>[:flags]], and store the results in zor.4183* Return 0 on success. On error, return 1 and update the msg4184* pointer to point to a descriptive error message.4185*/4186static int4187parse_object_range(char *range, zopt_object_range_t *zor, const char **msg)4188{4189uint64_t flags = 0;4190char *p, *s, *dup, *flagstr, *tmp = NULL;4191size_t len;4192int i;4193int rc = 0;41944195if (strchr(range, ':') == NULL) {4196zor->zor_obj_start = strtoull(range, &p, 0);4197if (*p != '\0') {4198*msg = "Invalid characters in object ID";4199rc = 1;4200}4201zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);4202zor->zor_obj_end = zor->zor_obj_start;4203return (rc);4204}42054206if (strchr(range, ':') == range) {4207*msg = "Invalid leading colon";4208rc = 1;4209return (rc);4210}42114212len = strlen(range);4213if (range[len - 1] == ':') {4214*msg = "Invalid trailing colon";4215rc = 1;4216return (rc);4217}42184219dup = strdup(range);4220s = strtok_r(dup, ":", &tmp);4221zor->zor_obj_start = strtoull(s, &p, 0);42224223if (*p != '\0') {4224*msg = "Invalid characters in start object ID";4225rc = 1;4226goto out;4227}42284229s = strtok_r(NULL, ":", &tmp);4230zor->zor_obj_end = strtoull(s, &p, 0);42314232if (*p != '\0') {4233*msg = "Invalid characters in end object ID";4234rc = 1;4235goto out;4236}42374238if (zor->zor_obj_start > zor->zor_obj_end) {4239*msg = "Start object ID may not 
exceed end object ID";4240rc = 1;4241goto out;4242}42434244s = strtok_r(NULL, ":", &tmp);4245if (s == NULL) {4246zor->zor_flags = ZOR_FLAG_ALL_TYPES;4247goto out;4248} else if (strtok_r(NULL, ":", &tmp) != NULL) {4249*msg = "Invalid colon-delimited field after flags";4250rc = 1;4251goto out;4252}42534254flagstr = s;4255for (i = 0; flagstr[i]; i++) {4256int bit;4257boolean_t negation = (flagstr[i] == '-');42584259if (negation) {4260i++;4261if (flagstr[i] == '\0') {4262*msg = "Invalid trailing negation operator";4263rc = 1;4264goto out;4265}4266}4267bit = flagbits[(uchar_t)flagstr[i]];4268if (bit == 0) {4269*msg = "Invalid flag";4270rc = 1;4271goto out;4272}4273if (negation)4274flags &= ~bit;4275else4276flags |= bit;4277}4278zor->zor_flags = flags;42794280zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);4281zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end);42824283out:4284free(dup);4285return (rc);4286}42874288static void4289dump_objset(objset_t *os)4290{4291dmu_objset_stats_t dds = { 0 };4292uint64_t object, object_count;4293uint64_t refdbytes, usedobjs, scratch;4294char numbuf[32];4295char blkbuf[BP_SPRINTF_LEN + 20];4296char osname[ZFS_MAX_DATASET_NAME_LEN];4297const char *type = "UNKNOWN";4298int verbosity = dump_opt['d'];4299boolean_t print_header;4300unsigned i;4301int error;4302uint64_t total_slots_used = 0;4303uint64_t max_slot_used = 0;4304uint64_t dnode_slots;4305uint64_t obj_start;4306uint64_t obj_end;4307uint64_t flags;43084309/* make sure nicenum has enough space */4310_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated");43114312dsl_pool_config_enter(dmu_objset_pool(os), FTAG);4313dmu_objset_fast_stat(os, &dds);4314dsl_pool_config_exit(dmu_objset_pool(os), FTAG);43154316print_header = B_TRUE;43174318if (dds.dds_type < DMU_OST_NUMTYPES)4319type = objset_types[dds.dds_type];43204321if (dds.dds_type == DMU_OST_META) {4322dds.dds_creation_txg = TXG_INITIAL;4323usedobjs = BP_GET_FILL(os->os_rootbp);4324refdbytes = 
dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->4325dd_used_bytes;4326} else {4327dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);4328}43294330ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));43314332zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));43334334if (verbosity >= 4) {4335(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");4336(void) snprintf_blkptr(blkbuf + strlen(blkbuf),4337sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);4338} else {4339blkbuf[0] = '\0';4340}43414342dmu_objset_name(os, osname);43434344(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "4345"%s, %llu objects%s%s\n",4346osname, type, (u_longlong_t)dmu_objset_id(os),4347(u_longlong_t)dds.dds_creation_txg,4348numbuf, (u_longlong_t)usedobjs, blkbuf,4349(dds.dds_inconsistent) ? " (inconsistent)" : "");43504351for (i = 0; i < zopt_object_args; i++) {4352obj_start = zopt_object_ranges[i].zor_obj_start;4353obj_end = zopt_object_ranges[i].zor_obj_end;4354flags = zopt_object_ranges[i].zor_flags;43554356object = obj_start;4357if (object == 0 || obj_start == obj_end)4358dump_object(os, object, verbosity, &print_header, NULL,4359flags);4360else4361object--;43624363while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&4364object <= obj_end) {4365dump_object(os, object, verbosity, &print_header, NULL,4366flags);4367}4368}43694370if (zopt_object_args > 0) {4371(void) printf("\n");4372return;4373}43744375if (dump_opt['i'] != 0 || verbosity >= 2)4376dump_intent_log(dmu_objset_zil(os));43774378if (dmu_objset_ds(os) != NULL) {4379dsl_dataset_t *ds = dmu_objset_ds(os);4380dump_blkptr_list(&ds->ds_deadlist, "Deadlist");4381if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&4382!dmu_objset_is_snapshot(os)) {4383dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");4384if (verify_dd_livelist(os) != 0)4385fatal("livelist is incorrect");4386}43874388if (dsl_dataset_remap_deadlist_exists(ds)) {4389(void) printf("ds_remap_deadlist:\n");4390dump_blkptr_list(&ds->ds_remap_deadlist, 
"Deadlist");4391}4392count_ds_mos_objects(ds);4393}43944395if (dmu_objset_ds(os) != NULL)4396dump_bookmarks(os, verbosity);43974398if (verbosity < 2)4399return;44004401if (BP_IS_HOLE(os->os_rootbp))4402return;44034404dump_object(os, 0, verbosity, &print_header, NULL, 0);4405object_count = 0;4406if (DMU_USERUSED_DNODE(os) != NULL &&4407DMU_USERUSED_DNODE(os)->dn_type != 0) {4408dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,4409NULL, 0);4410dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,4411NULL, 0);4412}44134414if (DMU_PROJECTUSED_DNODE(os) != NULL &&4415DMU_PROJECTUSED_DNODE(os)->dn_type != 0)4416dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,4417&print_header, NULL, 0);44184419object = 0;4420while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {4421dump_object(os, object, verbosity, &print_header, &dnode_slots,44220);4423object_count++;4424total_slots_used += dnode_slots;4425max_slot_used = object + dnode_slots - 1;4426}44274428(void) printf("\n");44294430(void) printf(" Dnode slots:\n");4431(void) printf("\tTotal used: %10llu\n",4432(u_longlong_t)total_slots_used);4433(void) printf("\tMax used: %10llu\n",4434(u_longlong_t)max_slot_used);4435(void) printf("\tPercent empty: %10lf\n",4436(double)(max_slot_used - total_slots_used)*100 /4437(double)max_slot_used);4438(void) printf("\n");44394440if (error != ESRCH) {4441(void) fprintf(stderr, "dmu_object_next() = %d\n", error);4442abort();4443}44444445ASSERT3U(object_count, ==, usedobjs);44464447if (leaked_objects != 0) {4448(void) printf("%d potentially leaked objects detected\n",4449leaked_objects);4450leaked_objects = 0;4451}4452}44534454static void4455dump_uberblock(uberblock_t *ub, const char *header, const char *footer)4456{4457time_t timestamp = ub->ub_timestamp;44584459(void) printf("%s", header ? 
header : "");4460(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);4461(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);4462(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);4463(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);4464(void) printf("\ttimestamp = %llu UTC = %s",4465(u_longlong_t)ub->ub_timestamp, ctime(×tamp));44664467char blkbuf[BP_SPRINTF_LEN];4468snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);4469(void) printf("\tbp = %s\n", blkbuf);44704471(void) printf("\tmmp_magic = %016llx\n",4472(u_longlong_t)ub->ub_mmp_magic);4473if (MMP_VALID(ub)) {4474(void) printf("\tmmp_delay = %0llu\n",4475(u_longlong_t)ub->ub_mmp_delay);4476if (MMP_SEQ_VALID(ub))4477(void) printf("\tmmp_seq = %u\n",4478(unsigned int) MMP_SEQ(ub));4479if (MMP_FAIL_INT_VALID(ub))4480(void) printf("\tmmp_fail = %u\n",4481(unsigned int) MMP_FAIL_INT(ub));4482if (MMP_INTERVAL_VALID(ub))4483(void) printf("\tmmp_write = %u\n",4484(unsigned int) MMP_INTERVAL(ub));4485/* After MMP_* to make summarize_uberblock_mmp cleaner */4486(void) printf("\tmmp_valid = %x\n",4487(unsigned int) ub->ub_mmp_config & 0xFF);4488}44894490if (dump_opt['u'] >= 4) {4491char blkbuf[BP_SPRINTF_LEN];4492snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);4493(void) printf("\trootbp = %s\n", blkbuf);4494}4495(void) printf("\tcheckpoint_txg = %llu\n",4496(u_longlong_t)ub->ub_checkpoint_txg);44974498(void) printf("\traidz_reflow state=%u off=%llu\n",4499(int)RRSS_GET_STATE(ub),4500(u_longlong_t)RRSS_GET_OFFSET(ub));45014502(void) printf("%s", footer ? 
footer : "");4503}45044505static void4506dump_config(spa_t *spa)4507{4508dmu_buf_t *db;4509size_t nvsize = 0;4510int error = 0;451145124513error = dmu_bonus_hold(spa->spa_meta_objset,4514spa->spa_config_object, FTAG, &db);45154516if (error == 0) {4517nvsize = *(uint64_t *)db->db_data;4518dmu_buf_rele(db, FTAG);45194520(void) printf("\nMOS Configuration:\n");4521dump_packed_nvlist(spa->spa_meta_objset,4522spa->spa_config_object, (void *)&nvsize, 1);4523} else {4524(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",4525(u_longlong_t)spa->spa_config_object, error);4526}4527}45284529static void4530dump_cachefile(const char *cachefile)4531{4532int fd;4533struct stat64 statbuf;4534char *buf;4535nvlist_t *config;45364537if ((fd = open64(cachefile, O_RDONLY)) < 0) {4538(void) printf("cannot open '%s': %s\n", cachefile,4539strerror(errno));4540zdb_exit(1);4541}45424543if (fstat64(fd, &statbuf) != 0) {4544(void) printf("failed to stat '%s': %s\n", cachefile,4545strerror(errno));4546zdb_exit(1);4547}45484549if ((buf = malloc(statbuf.st_size)) == NULL) {4550(void) fprintf(stderr, "failed to allocate %llu bytes\n",4551(u_longlong_t)statbuf.st_size);4552zdb_exit(1);4553}45544555if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {4556(void) fprintf(stderr, "failed to read %llu bytes\n",4557(u_longlong_t)statbuf.st_size);4558zdb_exit(1);4559}45604561(void) close(fd);45624563if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {4564(void) fprintf(stderr, "failed to unpack nvlist\n");4565zdb_exit(1);4566}45674568free(buf);45694570dump_nvlist(config, 0);45714572nvlist_free(config);4573}45744575/*4576* ZFS label nvlist stats4577*/4578typedef struct zdb_nvl_stats {4579int zns_list_count;4580int zns_leaf_count;4581size_t zns_leaf_largest;4582size_t zns_leaf_total;4583nvlist_t *zns_string;4584nvlist_t *zns_uint64;4585nvlist_t *zns_boolean;4586} zdb_nvl_stats_t;45874588static void4589collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)4590{4591nvlist_t *list, 
**array;4592nvpair_t *nvp = NULL;4593const char *name;4594uint_t i, items;45954596stats->zns_list_count++;45974598while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {4599name = nvpair_name(nvp);46004601switch (nvpair_type(nvp)) {4602case DATA_TYPE_STRING:4603fnvlist_add_string(stats->zns_string, name,4604fnvpair_value_string(nvp));4605break;4606case DATA_TYPE_UINT64:4607fnvlist_add_uint64(stats->zns_uint64, name,4608fnvpair_value_uint64(nvp));4609break;4610case DATA_TYPE_BOOLEAN:4611fnvlist_add_boolean(stats->zns_boolean, name);4612break;4613case DATA_TYPE_NVLIST:4614if (nvpair_value_nvlist(nvp, &list) == 0)4615collect_nvlist_stats(list, stats);4616break;4617case DATA_TYPE_NVLIST_ARRAY:4618if (nvpair_value_nvlist_array(nvp, &array, &items) != 0)4619break;46204621for (i = 0; i < items; i++) {4622collect_nvlist_stats(array[i], stats);46234624/* collect stats on leaf vdev */4625if (strcmp(name, "children") == 0) {4626size_t size;46274628(void) nvlist_size(array[i], &size,4629NV_ENCODE_XDR);4630stats->zns_leaf_total += size;4631if (size > stats->zns_leaf_largest)4632stats->zns_leaf_largest = size;4633stats->zns_leaf_count++;4634}4635}4636break;4637default:4638(void) printf("skip type %d!\n", (int)nvpair_type(nvp));4639}4640}4641}46424643static void4644dump_nvlist_stats(nvlist_t *nvl, size_t cap)4645{4646zdb_nvl_stats_t stats = { 0 };4647size_t size, sum = 0, total;4648size_t noise;46494650/* requires nvlist with non-unique names for stat collection */4651VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));4652VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));4653VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));4654VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));46554656(void) printf("\n\nZFS Label NVList Config Stats:\n");46574658VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));4659(void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n",4660(int)total, (int)(cap - total), 100.0 * total / cap);46614662collect_nvlist_stats(nvl, 
&stats);46634664VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR));4665size -= noise;4666sum += size;4667(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:",4668(int)fnvlist_num_pairs(stats.zns_uint64),4669(int)size, 100.0 * size / total);46704671VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR));4672size -= noise;4673sum += size;4674(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:",4675(int)fnvlist_num_pairs(stats.zns_string),4676(int)size, 100.0 * size / total);46774678VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR));4679size -= noise;4680sum += size;4681(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:",4682(int)fnvlist_num_pairs(stats.zns_boolean),4683(int)size, 100.0 * size / total);46844685size = total - sum; /* treat remainder as nvlist overhead */4686(void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",4687stats.zns_list_count, (int)size, 100.0 * size / total);46884689if (stats.zns_leaf_count > 0) {4690size_t average = stats.zns_leaf_total / stats.zns_leaf_count;46914692(void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",4693stats.zns_leaf_count, (int)average);4694(void) printf("%24d bytes largest\n",4695(int)stats.zns_leaf_largest);46964697if (dump_opt['l'] >= 3 && average > 0)4698(void) printf(" space for %d additional leaf vdevs\n",4699(int)((cap - total) / average));4700}4701(void) printf("\n");47024703nvlist_free(stats.zns_string);4704nvlist_free(stats.zns_uint64);4705nvlist_free(stats.zns_boolean);4706}47074708typedef struct cksum_record {4709zio_cksum_t cksum;4710boolean_t labels[VDEV_LABELS];4711avl_node_t link;4712} cksum_record_t;47134714static int4715cksum_record_compare(const void *x1, const void *x2)4716{4717const cksum_record_t *l = (cksum_record_t *)x1;4718const cksum_record_t *r = (cksum_record_t *)x2;4719int arraysize = ARRAY_SIZE(l->cksum.zc_word);4720int difference = 0;47214722for (int i = 0; i < arraysize; i++) {4723difference = TREE_CMP(l->cksum.zc_word[i], 
r->cksum.zc_word[i]);4724if (difference)4725break;4726}47274728return (difference);4729}47304731static cksum_record_t *4732cksum_record_alloc(zio_cksum_t *cksum, int l)4733{4734cksum_record_t *rec;47354736rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL);4737rec->cksum = *cksum;4738rec->labels[l] = B_TRUE;47394740return (rec);4741}47424743static cksum_record_t *4744cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)4745{4746cksum_record_t lookup = { .cksum = *cksum };4747avl_index_t where;47484749return (avl_find(tree, &lookup, &where));4750}47514752static cksum_record_t *4753cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)4754{4755cksum_record_t *rec;47564757rec = cksum_record_lookup(tree, cksum);4758if (rec) {4759rec->labels[l] = B_TRUE;4760} else {4761rec = cksum_record_alloc(cksum, l);4762avl_add(tree, rec);4763}47644765return (rec);4766}47674768static int4769first_label(cksum_record_t *rec)4770{4771for (int i = 0; i < VDEV_LABELS; i++)4772if (rec->labels[i])4773return (i);47744775return (-1);4776}47774778static void4779print_label_numbers(const char *prefix, const cksum_record_t *rec)4780{4781fputs(prefix, stdout);4782for (int i = 0; i < VDEV_LABELS; i++)4783if (rec->labels[i] == B_TRUE)4784printf("%d ", i);4785putchar('\n');4786}47874788#define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)47894790typedef struct zdb_label {4791vdev_label_t label;4792uint64_t label_offset;4793nvlist_t *config_nv;4794cksum_record_t *config;4795cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];4796boolean_t header_printed;4797boolean_t read_failed;4798boolean_t cksum_valid;4799} zdb_label_t;48004801static void4802print_label_header(zdb_label_t *label, int l)4803{48044805if (dump_opt['q'])4806return;48074808if (label->header_printed == B_TRUE)4809return;48104811(void) printf("------------------------------------\n");4812(void) printf("LABEL %d %s\n", l,4813label->cksum_valid ? 
"" : "(Bad label cksum)");4814(void) printf("------------------------------------\n");48154816label->header_printed = B_TRUE;4817}48184819static void4820print_l2arc_header(void)4821{4822(void) printf("------------------------------------\n");4823(void) printf("L2ARC device header\n");4824(void) printf("------------------------------------\n");4825}48264827static void4828print_l2arc_log_blocks(void)4829{4830(void) printf("------------------------------------\n");4831(void) printf("L2ARC device log blocks\n");4832(void) printf("------------------------------------\n");4833}48344835static void4836dump_l2arc_log_entries(uint64_t log_entries,4837l2arc_log_ent_phys_t *le, uint64_t i)4838{4839for (int j = 0; j < log_entries; j++) {4840dva_t dva = le[j].le_dva;4841(void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "4842"vdev: %llu, offset: %llu\n",4843(u_longlong_t)i, j + 1,4844(u_longlong_t)DVA_GET_ASIZE(&dva),4845(u_longlong_t)DVA_GET_VDEV(&dva),4846(u_longlong_t)DVA_GET_OFFSET(&dva));4847(void) printf("|\t\t\t\tbirth: %llu\n",4848(u_longlong_t)le[j].le_birth);4849(void) printf("|\t\t\t\tlsize: %llu\n",4850(u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));4851(void) printf("|\t\t\t\tpsize: %llu\n",4852(u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));4853(void) printf("|\t\t\t\tcompr: %llu\n",4854(u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));4855(void) printf("|\t\t\t\tcomplevel: %llu\n",4856(u_longlong_t)(&le[j])->le_complevel);4857(void) printf("|\t\t\t\ttype: %llu\n",4858(u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));4859(void) printf("|\t\t\t\tprotected: %llu\n",4860(u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));4861(void) printf("|\t\t\t\tprefetch: %llu\n",4862(u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));4863(void) printf("|\t\t\t\taddress: %llu\n",4864(u_longlong_t)le[j].le_daddr);4865(void) printf("|\t\t\t\tARC state: %llu\n",4866(u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop));4867(void) printf("|\n");4868}4869(void) 
printf("\n");4870}48714872static void4873dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps)4874{4875(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr);4876(void) printf("|\t\tpayload_asize: %llu\n",4877(u_longlong_t)lbps->lbp_payload_asize);4878(void) printf("|\t\tpayload_start: %llu\n",4879(u_longlong_t)lbps->lbp_payload_start);4880(void) printf("|\t\tlsize: %llu\n",4881(u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop));4882(void) printf("|\t\tasize: %llu\n",4883(u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop));4884(void) printf("|\t\tcompralgo: %llu\n",4885(u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop));4886(void) printf("|\t\tcksumalgo: %llu\n",4887(u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop));4888(void) printf("|\n\n");4889}48904891static void4892dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,4893l2arc_dev_hdr_phys_t *rebuild)4894{4895l2arc_log_blk_phys_t this_lb;4896uint64_t asize;4897l2arc_log_blkptr_t lbps[2];4898zio_cksum_t cksum;4899int failed = 0;4900l2arc_dev_t dev;49014902if (!dump_opt['q'])4903print_l2arc_log_blocks();4904memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));49054906dev.l2ad_evict = l2dhdr->dh_evict;4907dev.l2ad_start = l2dhdr->dh_start;4908dev.l2ad_end = l2dhdr->dh_end;49094910if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) {4911/* no log blocks to read */4912if (!dump_opt['q']) {4913(void) printf("No log blocks to read\n");4914(void) printf("\n");4915}4916return;4917} else {4918dev.l2ad_hand = lbps[0].lbp_daddr +4919L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);4920}49214922dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);49234924for (;;) {4925if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))4926break;49274928/* L2BLK_GET_PSIZE returns aligned size for log blocks */4929asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);4930if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {4931if (!dump_opt['q']) {4932(void) printf("Error while reading next log 
"4933"block\n\n");4934}4935break;4936}49374938fletcher_4_native_varsize(&this_lb, asize, &cksum);4939if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {4940failed++;4941if (!dump_opt['q']) {4942(void) printf("Invalid cksum\n");4943dump_l2arc_log_blkptr(&lbps[0]);4944}4945break;4946}49474948switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {4949case ZIO_COMPRESS_OFF:4950break;4951default: {4952abd_t *abd = abd_alloc_linear(asize, B_TRUE);4953abd_copy_from_buf_off(abd, &this_lb, 0, asize);4954abd_t dabd;4955abd_get_from_buf_struct(&dabd, &this_lb,4956sizeof (this_lb));4957int err = zio_decompress_data(L2BLK_GET_COMPRESS(4958(&lbps[0])->lbp_prop), abd, &dabd,4959asize, sizeof (this_lb), NULL);4960abd_free(&dabd);4961abd_free(abd);4962if (err != 0) {4963(void) printf("L2ARC block decompression "4964"failed\n");4965goto out;4966}4967break;4968}4969}49704971if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))4972byteswap_uint64_array(&this_lb, sizeof (this_lb));4973if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {4974if (!dump_opt['q'])4975(void) printf("Invalid log block magic\n\n");4976break;4977}49784979rebuild->dh_lb_count++;4980rebuild->dh_lb_asize += asize;4981if (dump_opt['l'] > 1 && !dump_opt['q']) {4982(void) printf("lb[%4llu]\tmagic: %llu\n",4983(u_longlong_t)rebuild->dh_lb_count,4984(u_longlong_t)this_lb.lb_magic);4985dump_l2arc_log_blkptr(&lbps[0]);4986}49874988if (dump_opt['l'] > 2 && !dump_opt['q'])4989dump_l2arc_log_entries(l2dhdr->dh_log_entries,4990this_lb.lb_entries,4991rebuild->dh_lb_count);49924993if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,4994lbps[0].lbp_payload_start, dev.l2ad_evict) &&4995!dev.l2ad_first)4996break;49974998lbps[0] = lbps[1];4999lbps[1] = this_lb.lb_prev_lbp;5000}5001out:5002if (!dump_opt['q']) {5003(void) printf("log_blk_count:\t %llu with valid cksum\n",5004(u_longlong_t)rebuild->dh_lb_count);5005(void) printf("\t\t %d with invalid cksum\n", failed);5006(void) printf("log_blk_asize:\t 
%llu\n\n",5007(u_longlong_t)rebuild->dh_lb_asize);5008}5009}50105011static int5012dump_l2arc_header(int fd)5013{5014l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0};5015int error = B_FALSE;50165017if (pread64(fd, &l2dhdr, sizeof (l2dhdr),5018VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {5019error = B_TRUE;5020} else {5021if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))5022byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));50235024if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)5025error = B_TRUE;5026}50275028if (error) {5029(void) printf("L2ARC device header not found\n\n");5030/* Do not return an error here for backward compatibility */5031return (0);5032} else if (!dump_opt['q']) {5033print_l2arc_header();50345035(void) printf(" magic: %llu\n",5036(u_longlong_t)l2dhdr.dh_magic);5037(void) printf(" version: %llu\n",5038(u_longlong_t)l2dhdr.dh_version);5039(void) printf(" pool_guid: %llu\n",5040(u_longlong_t)l2dhdr.dh_spa_guid);5041(void) printf(" flags: %llu\n",5042(u_longlong_t)l2dhdr.dh_flags);5043(void) printf(" start_lbps[0]: %llu\n",5044(u_longlong_t)5045l2dhdr.dh_start_lbps[0].lbp_daddr);5046(void) printf(" start_lbps[1]: %llu\n",5047(u_longlong_t)5048l2dhdr.dh_start_lbps[1].lbp_daddr);5049(void) printf(" log_blk_ent: %llu\n",5050(u_longlong_t)l2dhdr.dh_log_entries);5051(void) printf(" start: %llu\n",5052(u_longlong_t)l2dhdr.dh_start);5053(void) printf(" end: %llu\n",5054(u_longlong_t)l2dhdr.dh_end);5055(void) printf(" evict: %llu\n",5056(u_longlong_t)l2dhdr.dh_evict);5057(void) printf(" lb_asize_refcount: %llu\n",5058(u_longlong_t)l2dhdr.dh_lb_asize);5059(void) printf(" lb_count_refcount: %llu\n",5060(u_longlong_t)l2dhdr.dh_lb_count);5061(void) printf(" trim_action_time: %llu\n",5062(u_longlong_t)l2dhdr.dh_trim_action_time);5063(void) printf(" trim_state: %llu\n\n",5064(u_longlong_t)l2dhdr.dh_trim_state);5065}50665067dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild);5068/*5069* The total aligned size of log blocks and the number of log blocks5070* reported in the 
header of the device may be less than what zdb5071* reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().5072* This happens because dump_l2arc_log_blocks() lacks the memory5073* pressure valve that l2arc_rebuild() has. Thus, if we are on a system5074* with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize5075* and dh_lb_count will be lower to begin with than what exists on the5076* device. This is normal and zdb should not exit with an error. The5077* opposite case should never happen though, the values reported in the5078* header should never be higher than what dump_l2arc_log_blocks() and5079* l2arc_rebuild() report. If this happens there is a leak in the5080* accounting of log blocks.5081*/5082if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||5083l2dhdr.dh_lb_count > rebuild.dh_lb_count)5084return (1);50855086return (0);5087}50885089static void5090dump_config_from_label(zdb_label_t *label, size_t buflen, int l)5091{5092if (dump_opt['q'])5093return;50945095if ((dump_opt['l'] < 3) && (first_label(label->config) != l))5096return;50975098print_label_header(label, l);5099dump_nvlist(label->config_nv, 4);5100print_label_numbers(" labels = ", label->config);51015102if (dump_opt['l'] >= 2)5103dump_nvlist_stats(label->config_nv, buflen);5104}51055106#define ZDB_MAX_UB_HEADER_SIZE 3251075108static void5109dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)5110{51115112vdev_t vd;5113char header[ZDB_MAX_UB_HEADER_SIZE];51145115vd.vdev_ashift = ashift;5116vd.vdev_top = &vd;51175118for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {5119uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);5120uberblock_t *ub = (void *)((char *)&label->label + uoff);5121cksum_record_t *rec = label->uberblocks[i];51225123if (rec == NULL) {5124if (dump_opt['u'] >= 2) {5125print_label_header(label, label_num);5126(void) printf(" Uberblock[%d] invalid\n", i);5127}5128continue;5129}51305131if ((dump_opt['u'] < 3) && (first_label(rec) != 
label_num))5132continue;51335134if ((dump_opt['u'] < 4) &&5135(ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&5136(i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))5137continue;51385139print_label_header(label, label_num);5140(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,5141" Uberblock[%d]\n", i);5142dump_uberblock(ub, header, "");5143print_label_numbers(" labels = ", rec);5144}5145}51465147static char curpath[PATH_MAX];51485149/*5150* Iterate through the path components, recursively passing5151* current one's obj and remaining path until we find the obj5152* for the last one.5153*/5154static int5155dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)5156{5157int err;5158boolean_t header = B_TRUE;5159uint64_t child_obj;5160char *s;5161dmu_buf_t *db;5162dmu_object_info_t doi;51635164if ((s = strchr(name, '/')) != NULL)5165*s = '\0';5166err = zap_lookup(os, obj, name, 8, 1, &child_obj);51675168(void) strlcat(curpath, name, sizeof (curpath));51695170if (err != 0) {5171(void) fprintf(stderr, "failed to lookup %s: %s\n",5172curpath, strerror(err));5173return (err);5174}51755176child_obj = ZFS_DIRENT_OBJ(child_obj);5177err = sa_buf_hold(os, child_obj, FTAG, &db);5178if (err != 0) {5179(void) fprintf(stderr,5180"failed to get SA dbuf for obj %llu: %s\n",5181(u_longlong_t)child_obj, strerror(err));5182return (EINVAL);5183}5184dmu_object_info_from_db(db, &doi);5185sa_buf_rele(db, FTAG);51865187if (doi.doi_bonus_type != DMU_OT_SA &&5188doi.doi_bonus_type != DMU_OT_ZNODE) {5189(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",5190doi.doi_bonus_type, (u_longlong_t)child_obj);5191return (EINVAL);5192}51935194if (dump_opt['v'] > 6) {5195(void) printf("obj=%llu %s type=%d bonustype=%d\n",5196(u_longlong_t)child_obj, curpath, doi.doi_type,5197doi.doi_bonus_type);5198}51995200(void) strlcat(curpath, "/", sizeof (curpath));52015202switch (doi.doi_type) {5203case DMU_OT_DIRECTORY_CONTENTS:5204if (s != NULL && *(s + 1) != '\0')5205return 
(dump_path_impl(os, child_obj, s + 1, retobj));5206zfs_fallthrough;5207case DMU_OT_PLAIN_FILE_CONTENTS:5208if (retobj != NULL) {5209*retobj = child_obj;5210} else {5211dump_object(os, child_obj, dump_opt['v'], &header,5212NULL, 0);5213}5214return (0);5215default:5216(void) fprintf(stderr, "object %llu has non-file/directory "5217"type %d\n", (u_longlong_t)obj, doi.doi_type);5218break;5219}52205221return (EINVAL);5222}52235224/*5225* Dump the blocks for the object specified by path inside the dataset.5226*/5227static int5228dump_path(char *ds, char *path, uint64_t *retobj)5229{5230int err;5231objset_t *os;5232uint64_t root_obj;52335234err = open_objset(ds, FTAG, &os);5235if (err != 0)5236return (err);52375238err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);5239if (err != 0) {5240(void) fprintf(stderr, "can't lookup root znode: %s\n",5241strerror(err));5242close_objset(os, FTAG);5243return (EINVAL);5244}52455246(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);52475248err = dump_path_impl(os, root_obj, path, retobj);52495250close_objset(os, FTAG);5251return (err);5252}52535254static int5255dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)5256{5257const char *p = (const char *)buf;5258ssize_t nwritten;52595260(void) os;5261(void) arg;52625263/* Write the data out, handling short writes and signals. 
*/5264while ((nwritten = write(STDOUT_FILENO, p, len)) < len) {5265if (nwritten < 0) {5266if (errno == EINTR)5267continue;5268return (errno);5269}5270p += nwritten;5271len -= nwritten;5272}52735274return (0);5275}52765277static void5278dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)5279{5280boolean_t embed = B_FALSE;5281boolean_t large_block = B_FALSE;5282boolean_t compress = B_FALSE;5283boolean_t raw = B_FALSE;52845285const char *c;5286for (c = flagstr; c != NULL && *c != '\0'; c++) {5287switch (*c) {5288case 'e':5289embed = B_TRUE;5290break;5291case 'L':5292large_block = B_TRUE;5293break;5294case 'c':5295compress = B_TRUE;5296break;5297case 'w':5298raw = B_TRUE;5299break;5300default:5301fprintf(stderr, "dump_backup: invalid flag "5302"'%c'\n", *c);5303return;5304}5305}53065307if (isatty(STDOUT_FILENO)) {5308fprintf(stderr, "dump_backup: stream cannot be written "5309"to a terminal\n");5310return;5311}53125313offset_t off = 0;5314dmu_send_outparams_t out = {5315.dso_outfunc = dump_backup_bytes,5316.dso_dryrun = B_FALSE,5317};53185319int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,5320large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,5321&off, &out);5322if (err != 0) {5323fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",5324strerror(err));5325return;5326}5327}53285329static int5330zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)5331{5332int err = 0;5333uint64_t size, readsize, oursize, offset;5334ssize_t writesize;5335sa_handle_t *hdl;53365337(void) printf("Copying object %" PRIu64 " to file %s\n", srcobj,5338destfile);53395340VERIFY3P(os, ==, sa_os);5341if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) {5342(void) printf("Failed to get handle for SA znode\n");5343return (err);5344}5345if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) {5346(void) sa_handle_destroy(hdl);5347return (err);5348}5349(void) sa_handle_destroy(hdl);53505351(void) printf("Object %" PRIu64 " is %" 
PRIu64 " bytes\n", srcobj,5352size);5353if (size == 0) {5354return (EINVAL);5355}53565357int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);5358if (fd == -1)5359return (errno);5360/*5361* We cap the size at 1 mebibyte here to prevent5362* allocation failures and nigh-infinite printing if the5363* object is extremely large.5364*/5365oursize = MIN(size, 1 << 20);5366offset = 0;5367char *buf = kmem_alloc(oursize, KM_NOSLEEP);5368if (buf == NULL) {5369(void) close(fd);5370return (ENOMEM);5371}53725373while (offset < size) {5374readsize = MIN(size - offset, 1 << 20);5375err = dmu_read(os, srcobj, offset, readsize, buf, 0);5376if (err != 0) {5377(void) printf("got error %u from dmu_read\n", err);5378kmem_free(buf, oursize);5379(void) close(fd);5380return (err);5381}5382if (dump_opt['v'] > 3) {5383(void) printf("Read offset=%" PRIu64 " size=%" PRIu645384" error=%d\n", offset, readsize, err);5385}53865387writesize = write(fd, buf, readsize);5388if (writesize < 0) {5389err = errno;5390break;5391} else if (writesize != readsize) {5392/* Incomplete write */5393(void) fprintf(stderr, "Short write, only wrote %llu of"5394" %" PRIu64 " bytes, exiting...\n",5395(u_longlong_t)writesize, readsize);5396break;5397}53985399offset += readsize;5400}54015402(void) close(fd);54035404if (buf != NULL)5405kmem_free(buf, oursize);54065407return (err);5408}54095410static boolean_t5411label_cksum_valid(vdev_label_t *label, uint64_t offset)5412{5413zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];5414zio_cksum_t expected_cksum;5415zio_cksum_t actual_cksum;5416zio_cksum_t verifier;5417zio_eck_t *eck;5418int byteswap;54195420void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys);5421eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1;54225423offset += offsetof(vdev_label_t, vl_vdev_phys);5424ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0);54255426byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));5427if (byteswap)5428byteswap_uint64_array(&verifier, sizeof 
(zio_cksum_t));54295430expected_cksum = eck->zec_cksum;5431eck->zec_cksum = verifier;54325433abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE);5434ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum);5435abd_free(abd);54365437if (byteswap)5438byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t));54395440if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))5441return (B_TRUE);54425443return (B_FALSE);5444}54455446static int5447dump_label(const char *dev)5448{5449char path[MAXPATHLEN];5450zdb_label_t labels[VDEV_LABELS] = {{{{0}}}};5451uint64_t psize, ashift, l2cache;5452struct stat64 statbuf;5453boolean_t config_found = B_FALSE;5454boolean_t error = B_FALSE;5455boolean_t read_l2arc_header = B_FALSE;5456avl_tree_t config_tree;5457avl_tree_t uberblock_tree;5458void *node, *cookie;5459int fd;54605461/*5462* Check if we were given absolute path and use it as is.5463* Otherwise if the provided vdev name doesn't point to a file,5464* try prepending expected disk paths and partition numbers.5465*/5466(void) strlcpy(path, dev, sizeof (path));5467if (dev[0] != '/' && stat64(path, &statbuf) != 0) {5468int error;54695470error = zfs_resolve_shortname(dev, path, MAXPATHLEN);5471if (error == 0 && zfs_dev_is_whole_disk(path)) {5472if (zfs_append_partition(path, MAXPATHLEN) == -1)5473error = ENOENT;5474}54755476if (error || (stat64(path, &statbuf) != 0)) {5477(void) printf("failed to find device %s, try "5478"specifying absolute path instead\n", dev);5479return (1);5480}5481}54825483if ((fd = open64(path, O_RDONLY)) < 0) {5484(void) printf("cannot open '%s': %s\n", path, strerror(errno));5485zdb_exit(1);5486}54875488if (fstat64_blk(fd, &statbuf) != 0) {5489(void) printf("failed to stat '%s': %s\n", path,5490strerror(errno));5491(void) close(fd);5492zdb_exit(1);5493}54945495if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)5496(void) printf("failed to invalidate cache '%s' : %s\n", path,5497strerror(errno));54985499avl_create(&config_tree, 
cksum_record_compare,5500sizeof (cksum_record_t), offsetof(cksum_record_t, link));5501avl_create(&uberblock_tree, cksum_record_compare,5502sizeof (cksum_record_t), offsetof(cksum_record_t, link));55035504psize = statbuf.st_size;5505psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);5506ashift = SPA_MINBLOCKSHIFT;55075508/*5509* 1. Read the label from disk5510* 2. Verify label cksum5511* 3. Unpack the configuration and insert in config tree.5512* 4. Traverse all uberblocks and insert in uberblock tree.5513*/5514for (int l = 0; l < VDEV_LABELS; l++) {5515zdb_label_t *label = &labels[l];5516char *buf = label->label.vl_vdev_phys.vp_nvlist;5517size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);5518nvlist_t *config;5519cksum_record_t *rec;5520zio_cksum_t cksum;5521vdev_t vd;55225523label->label_offset = vdev_label_offset(psize, l, 0);55245525if (pread64(fd, &label->label, sizeof (label->label),5526label->label_offset) != sizeof (label->label)) {5527if (!dump_opt['q'])5528(void) printf("failed to read label %d\n", l);5529label->read_failed = B_TRUE;5530error = B_TRUE;5531continue;5532}55335534label->read_failed = B_FALSE;5535label->cksum_valid = label_cksum_valid(&label->label,5536label->label_offset);55375538if (nvlist_unpack(buf, buflen, &config, 0) == 0) {5539nvlist_t *vdev_tree = NULL;5540size_t size;55415542if ((nvlist_lookup_nvlist(config,5543ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||5544(nvlist_lookup_uint64(vdev_tree,5545ZPOOL_CONFIG_ASHIFT, &ashift) != 0))5546ashift = SPA_MINBLOCKSHIFT;55475548if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)5549size = buflen;55505551/* If the device is a cache device read the header. 
*/5552if (!read_l2arc_header) {5553if (nvlist_lookup_uint64(config,5554ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&5555l2cache == POOL_STATE_L2CACHE) {5556read_l2arc_header = B_TRUE;5557}5558}55595560fletcher_4_native_varsize(buf, size, &cksum);5561rec = cksum_record_insert(&config_tree, &cksum, l);55625563label->config = rec;5564label->config_nv = config;5565config_found = B_TRUE;5566} else {5567error = B_TRUE;5568}55695570vd.vdev_ashift = ashift;5571vd.vdev_top = &vd;55725573for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {5574uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);5575uberblock_t *ub = (void *)((char *)label + uoff);55765577if (uberblock_verify(ub))5578continue;55795580fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);5581rec = cksum_record_insert(&uberblock_tree, &cksum, l);55825583label->uberblocks[i] = rec;5584}5585}55865587/*5588* Dump the label and uberblocks.5589*/5590for (int l = 0; l < VDEV_LABELS; l++) {5591zdb_label_t *label = &labels[l];5592size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);55935594if (label->read_failed == B_TRUE)5595continue;55965597if (label->config_nv) {5598dump_config_from_label(label, buflen, l);5599} else {5600if (!dump_opt['q'])5601(void) printf("failed to unpack label %d\n", l);5602}56035604if (dump_opt['u'])5605dump_label_uberblocks(label, ashift, l);56065607nvlist_free(label->config_nv);5608}56095610/*5611* Dump the L2ARC header, if existent.5612*/5613if (read_l2arc_header)5614error |= dump_l2arc_header(fd);56155616cookie = NULL;5617while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)5618umem_free(node, sizeof (cksum_record_t));56195620cookie = NULL;5621while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)5622umem_free(node, sizeof (cksum_record_t));56235624avl_destroy(&config_tree);5625avl_destroy(&uberblock_tree);56265627(void) close(fd);56285629return (config_found == B_FALSE ? 2 :5630(error == B_TRUE ? 
1 : 0));5631}56325633static uint64_t dataset_feature_count[SPA_FEATURES];5634static uint64_t global_feature_count[SPA_FEATURES];5635static uint64_t remap_deadlist_count = 0;56365637static int5638dump_one_objset(const char *dsname, void *arg)5639{5640(void) arg;5641int error;5642objset_t *os;5643spa_feature_t f;56445645error = open_objset(dsname, FTAG, &os);5646if (error != 0)5647return (0);56485649for (f = 0; f < SPA_FEATURES; f++) {5650if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))5651continue;5652ASSERT(spa_feature_table[f].fi_flags &5653ZFEATURE_FLAG_PER_DATASET);5654dataset_feature_count[f]++;5655}56565657if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {5658remap_deadlist_count++;5659}56605661for (dsl_bookmark_node_t *dbn =5662avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;5663dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {5664mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);5665if (dbn->dbn_phys.zbm_redaction_obj != 0) {5666global_feature_count[5667SPA_FEATURE_REDACTION_BOOKMARKS]++;5668objset_t *mos = os->os_spa->spa_meta_objset;5669dnode_t *rl;5670VERIFY0(dnode_hold(mos,5671dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));5672if (rl->dn_have_spill) {5673global_feature_count[5674SPA_FEATURE_REDACTION_LIST_SPILL]++;5675}5676}5677if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)5678global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;5679}56805681if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&5682!dmu_objset_is_snapshot(os)) {5683global_feature_count[SPA_FEATURE_LIVELIST]++;5684}56855686dump_objset(os);5687close_objset(os, FTAG);5688fuid_table_destroy();5689return (0);5690}56915692/*5693* Block statistics.5694*/5695#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)5696typedef struct zdb_blkstats {5697uint64_t zb_asize;5698uint64_t zb_lsize;5699uint64_t zb_psize;5700uint64_t zb_count;5701uint64_t zb_gangs;5702uint64_t zb_ditto_samevdev;5703uint64_t zb_ditto_same_ms;5704uint64_t 
/*
 * State accumulated while zdb traverses and counts the blocks of a pool.
 * A pointer to this structure is the callback argument of the block
 * traversal and leak-detection routines in this file.
 */
typedef struct zdb_cb {
	/*
	 * Block statistics indexed [level][type].  zdb_count_block() adds
	 * each block to its own level and type as well as to the ZB_TOTAL
	 * level and the ZDB_OT_TOTAL type.
	 */
	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
	/* Space found in svr_allocd_segs by zdb_claim_removing(). */
	uint64_t	zcb_removing_size;
	/* Sum of the run lengths of checkpoint space map entries. */
	uint64_t	zcb_checkpoint_size;
	/* asize/count of second-or-later sightings of dedup'd blocks. */
	uint64_t	zcb_dedup_asize;
	uint64_t	zcb_dedup_blocks;
	/* asize/count of second-or-later sightings of cloned blocks. */
	uint64_t	zcb_clone_asize;
	uint64_t	zcb_clone_blocks;
	/*
	 * Power-of-two binned histograms: number of blocks and total bytes
	 * per bin, kept separately for psize, lsize and asize.
	 */
	uint64_t	zcb_psize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_lsize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_asize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_psize_len[SPA_MAX_FOR_16M];
	uint64_t	zcb_lsize_len[SPA_MAX_FOR_16M];
	uint64_t	zcb_asize_len[SPA_MAX_FOR_16M];
	/* Running byte totals of the blocks added to the histograms. */
	uint64_t	zcb_psize_total;
	uint64_t	zcb_lsize_total;
	uint64_t	zcb_asize_total;
	/* Embedded block pointers, counted by embedded type ... */
	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
	/* ... and by embedded type and payload size. */
	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
	    [BPE_PAYLOAD_SIZE + 1];
	/* Start time used by the progress report's throughput estimate. */
	uint64_t	zcb_start;
	/* Time at which the progress line was last printed. */
	hrtime_t	zcb_lastprint;
	/* Total asize against which "time remaining" is estimated. */
	uint64_t	zcb_totalasize;
	/* Number of failed reads, indexed by the zio error code. */
	uint64_t	zcb_errors[256];
	/* Reset to 0 by zdb_blkptr_cb(); other uses are outside this view. */
	int		zcb_readfails;
	/* Set to 1 once any non-speculative read has failed. */
	int		zcb_haderrors;
	/* The pool being examined. */
	spa_t		*zcb_spa;
	/* Obsolete counts per top-level vdev id, for indirect vdevs. */
	uint32_t	**zcb_vd_obsolete_counts;
	/* In-memory tree of zdb_brt_entry_t used to track cloned blocks. */
	avl_tree_t	zcb_brt;
	/* Clone tracking in zdb_count_block() only runs when this is set. */
	boolean_t	zcb_brt_is_active;
} zdb_cb_t;
vd->vdev_ms_shift;57695770return ((off1 >> ms_shift) == (off2 >> ms_shift));5771}57725773/*5774* Used to simplify reporting of the histogram data.5775*/5776typedef struct one_histo {5777const char *name;5778uint64_t *count;5779uint64_t *len;5780uint64_t cumulative;5781} one_histo_t;57825783/*5784* The number of separate histograms processed for psize, lsize and asize.5785*/5786#define NUM_HISTO 357875788/*5789* This routine will create a fixed column size output of three different5790* histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M5791* the count, length and cumulative length of the psize, lsize and5792* asize blocks.5793*5794* All three types of blocks are listed on a single line5795*5796* By default the table is printed in nicenumber format (e.g. 123K) but5797* if the '-P' parameter is specified then the full raw number (parseable)5798* is printed out.5799*/5800static void5801dump_size_histograms(zdb_cb_t *zcb)5802{5803/*5804* A temporary buffer that allows us to convert a number into5805* a string using zdb_nicenumber to allow either raw or human5806* readable numbers to be output.5807*/5808char numbuf[32];58095810/*5811* Define titles which are used in the headers of the tables5812* printed by this routine.5813*/5814const char blocksize_title1[] = "block";5815const char blocksize_title2[] = "size";5816const char count_title[] = "Count";5817const char length_title[] = "Size";5818const char cumulative_title[] = "Cum.";58195820/*5821* Setup the histogram arrays (psize, lsize, and asize).5822*/5823one_histo_t parm_histo[NUM_HISTO];58245825parm_histo[0].name = "psize";5826parm_histo[0].count = zcb->zcb_psize_count;5827parm_histo[0].len = zcb->zcb_psize_len;5828parm_histo[0].cumulative = 0;58295830parm_histo[1].name = "lsize";5831parm_histo[1].count = zcb->zcb_lsize_count;5832parm_histo[1].len = zcb->zcb_lsize_len;5833parm_histo[1].cumulative = 0;58345835parm_histo[2].name = "asize";5836parm_histo[2].count = zcb->zcb_asize_count;5837parm_histo[2].len = 
zcb->zcb_asize_len;5838parm_histo[2].cumulative = 0;583958405841(void) printf("\nBlock Size Histogram\n");5842switch (block_bin_mode) {5843case BIN_PSIZE:5844printf("(note: all categories are binned by %s)\n", "psize");5845break;5846case BIN_LSIZE:5847printf("(note: all categories are binned by %s)\n", "lsize");5848break;5849case BIN_ASIZE:5850printf("(note: all categories are binned by %s)\n", "asize");5851break;5852default:5853printf("(note: all categories are binned separately)\n");5854break;5855}5856if (block_classes != 0) {5857char buf[256] = "";5858if (block_classes & CLASS_NORMAL)5859strlcat(buf, "\"normal\", ", sizeof (buf));5860if (block_classes & CLASS_SPECIAL)5861strlcat(buf, "\"special\", ", sizeof (buf));5862if (block_classes & CLASS_DEDUP)5863strlcat(buf, "\"dedup\", ", sizeof (buf));5864if (block_classes & CLASS_OTHER)5865strlcat(buf, "\"other\", ", sizeof (buf));5866buf[strlen(buf)-2] = '\0';5867printf("(note: only blocks in these classes are counted: %s)\n",5868buf);5869}5870/*5871* Print the first line titles5872*/5873if (dump_opt['P'])5874(void) printf("\n%s\t", blocksize_title1);5875else5876(void) printf("\n%7s ", blocksize_title1);58775878for (int j = 0; j < NUM_HISTO; j++) {5879if (dump_opt['P']) {5880if (j < NUM_HISTO - 1) {5881(void) printf("%s\t\t\t", parm_histo[j].name);5882} else {5883/* Don't print trailing spaces */5884(void) printf(" %s", parm_histo[j].name);5885}5886} else {5887if (j < NUM_HISTO - 1) {5888/* Left aligned strings in the output */5889(void) printf("%-7s ",5890parm_histo[j].name);5891} else {5892/* Don't print trailing spaces */5893(void) printf("%s", parm_histo[j].name);5894}5895}5896}5897(void) printf("\n");58985899/*5900* Print the second line titles5901*/5902if (dump_opt['P']) {5903(void) printf("%s\t", blocksize_title2);5904} else {5905(void) printf("%7s ", blocksize_title2);5906}59075908for (int i = 0; i < NUM_HISTO; i++) {5909if (dump_opt['P']) {5910(void) printf("%s\t%s\t%s\t",5911count_title, length_title, 
cumulative_title);5912} else {5913(void) printf("%7s%7s%7s",5914count_title, length_title, cumulative_title);5915}5916}5917(void) printf("\n");59185919/*5920* Print the rows5921*/5922for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {59235924/*5925* Print the first column showing the blocksize5926*/5927zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));59285929if (dump_opt['P']) {5930printf("%s", numbuf);5931} else {5932printf("%7s:", numbuf);5933}59345935/*5936* Print the remaining set of 3 columns per size:5937* for psize, lsize and asize5938*/5939for (int j = 0; j < NUM_HISTO; j++) {5940parm_histo[j].cumulative += parm_histo[j].len[i];59415942zdb_nicenum(parm_histo[j].count[i],5943numbuf, sizeof (numbuf));5944if (dump_opt['P'])5945(void) printf("\t%s", numbuf);5946else5947(void) printf("%7s", numbuf);59485949zdb_nicenum(parm_histo[j].len[i],5950numbuf, sizeof (numbuf));5951if (dump_opt['P'])5952(void) printf("\t%s", numbuf);5953else5954(void) printf("%7s", numbuf);59555956zdb_nicenum(parm_histo[j].cumulative,5957numbuf, sizeof (numbuf));5958if (dump_opt['P'])5959(void) printf("\t%s", numbuf);5960else5961(void) printf("%7s", numbuf);5962}5963(void) printf("\n");5964}5965}59665967static void5968zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,5969dmu_object_type_t type)5970{5971int i;59725973ASSERT(type < ZDB_OT_TOTAL);59745975if (zilog && zil_bp_tree_add(zilog, bp) != 0)5976return;59775978/*5979* This flag controls if we will issue a claim for the block while5980* counting it, to ensure that all blocks are referenced in space maps.5981* We don't issue claims if we're not doing leak tracking, because it's5982* expensive if the user isn't interested. 
We also don't claim the5983* second or later occurences of cloned or dedup'd blocks, because we5984* already claimed them the first time.5985*/5986boolean_t do_claim = !dump_opt['L'];59875988spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);59895990blkptr_t tempbp;5991if (BP_GET_DEDUP(bp)) {5992/*5993* Dedup'd blocks are special. We need to count them, so we can5994* later uncount them when reporting leaked space, and we must5995* only claim them once.5996*5997* We use the existing dedup system to track what we've seen.5998* The first time we see a block, we do a ddt_lookup() to see5999* if it exists in the DDT. If we're doing leak tracking, we6000* claim the block at this time.6001*6002* Each time we see a block, we reduce the refcount in the6003* entry by one, and add to the size and count of dedup'd6004* blocks to report at the end.6005*/60066007ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);60086009ddt_enter(ddt);60106011/*6012* Find the block. This will create the entry in memory, but6013* we'll know if that happened by its refcount.6014*/6015ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);60166017/*6018* ddt_lookup() can return NULL if this block didn't exist6019* in the DDT and creating it would take the DDT over its6020* quota. Since we got the block from disk, it must exist in6021* the DDT, so this can't happen. However, when unique entries6022* are pruned, the dedup bit can be set with no corresponding6023* entry in the DDT.6024*/6025if (dde == NULL) {6026ddt_exit(ddt);6027goto skipped;6028}60296030/* Get the phys for this variant */6031ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);60326033/*6034* This entry may have multiple sets of DVAs. We must claim6035* each set the first time we see them in a real block on disk,6036* or count them on subsequent occurences. We don't have a6037* convenient way to track the first time we see each variant,6038* so we repurpose dde_io as a set of "seen" flag bits. 
We can6039* do this safely in zdb because it never writes, so it will6040* never have a writing zio for this block in that pointer.6041*/6042boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));6043if (!seen)6044dde->dde_io =6045(void *)(((uintptr_t)dde->dde_io) | (1 << v));60466047/* Consume a reference for this block. */6048if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)6049ddt_phys_decref(dde->dde_phys, v);60506051/*6052* If this entry has a single flat phys, it may have been6053* extended with additional DVAs at some time in its life.6054* This block might be from before it was fully extended, and6055* so have fewer DVAs.6056*6057* If this is the first time we've seen this block, and we6058* claimed it as-is, then we would miss the claim on some6059* number of DVAs, which would then be seen as leaked.6060*6061* In all cases, if we've had fewer DVAs, then the asize would6062* be too small, and would lead to the pool apparently using6063* more space than allocated.6064*6065* To handle this, we copy the canonical set of DVAs from the6066* entry back to the block pointer before we claim it.6067*/6068if (v == DDT_PHYS_FLAT) {6069ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==,6070ddt_phys_birth(dde->dde_phys, v));6071tempbp = *bp;6072ddt_bp_fill(dde->dde_phys, v, &tempbp,6073BP_GET_PHYSICAL_BIRTH(bp));6074bp = &tempbp;6075}60766077if (seen) {6078/*6079* The second or later time we see this block,6080* it's a duplicate and we count it.6081*/6082zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);6083zcb->zcb_dedup_blocks++;60846085/* Already claimed, don't do it again. */6086do_claim = B_FALSE;6087}60886089ddt_exit(ddt);6090} else if (zcb->zcb_brt_is_active &&6091brt_maybe_exists(zcb->zcb_spa, bp)) {6092/*6093* Cloned blocks are special. We need to count them, so we can6094* later uncount them when reporting leaked space, and we must6095* only claim them once.6096*6097* To do this, we keep our own in-memory BRT. 
For each block6098* we haven't seen before, we look it up in the real BRT and6099* if its there, we note it and its refcount then proceed as6100* normal. If we see the block again, we count it as a clone6101* and then give it no further consideration.6102*/6103zdb_brt_entry_t zbre_search, *zbre;6104avl_index_t where;61056106zbre_search.zbre_dva = bp->blk_dva[0];6107zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);6108if (zbre == NULL) {6109/* Not seen before; track it */6110uint64_t refcnt =6111brt_entry_get_refcount(zcb->zcb_spa, bp);6112if (refcnt > 0) {6113zbre = umem_zalloc(sizeof (zdb_brt_entry_t),6114UMEM_NOFAIL);6115zbre->zbre_dva = bp->blk_dva[0];6116zbre->zbre_refcount = refcnt;6117avl_insert(&zcb->zcb_brt, zbre, where);6118}6119} else {6120/*6121* Second or later occurrence, count it and take a6122* refcount.6123*/6124zcb->zcb_clone_asize += BP_GET_ASIZE(bp);6125zcb->zcb_clone_blocks++;61266127zbre->zbre_refcount--;6128if (zbre->zbre_refcount == 0) {6129avl_remove(&zcb->zcb_brt, zbre);6130umem_free(zbre, sizeof (zdb_brt_entry_t));6131}61326133/* Already claimed, don't do it again. */6134do_claim = B_FALSE;6135}6136}61376138skipped:6139for (i = 0; i < 4; i++) {6140int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;6141int t = (i & 1) ? 
type : ZDB_OT_TOTAL;6142int equal;6143zdb_blkstats_t *zb = &zcb->zcb_type[l][t];61446145zb->zb_asize += BP_GET_ASIZE(bp);6146zb->zb_lsize += BP_GET_LSIZE(bp);6147zb->zb_psize += BP_GET_PSIZE(bp);6148zb->zb_count++;61496150/*6151* The histogram is only big enough to record blocks up to6152* SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,6153* "other", bucket.6154*/6155unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;6156idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);6157zb->zb_psize_histogram[idx]++;61586159zb->zb_gangs += BP_COUNT_GANG(bp);61606161switch (BP_GET_NDVAS(bp)) {6162case 2:6163if (DVA_GET_VDEV(&bp->blk_dva[0]) ==6164DVA_GET_VDEV(&bp->blk_dva[1])) {6165zb->zb_ditto_samevdev++;61666167if (same_metaslab(zcb->zcb_spa,6168DVA_GET_VDEV(&bp->blk_dva[0]),6169DVA_GET_OFFSET(&bp->blk_dva[0]),6170DVA_GET_OFFSET(&bp->blk_dva[1])))6171zb->zb_ditto_same_ms++;6172}6173break;6174case 3:6175equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==6176DVA_GET_VDEV(&bp->blk_dva[1])) +6177(DVA_GET_VDEV(&bp->blk_dva[0]) ==6178DVA_GET_VDEV(&bp->blk_dva[2])) +6179(DVA_GET_VDEV(&bp->blk_dva[1]) ==6180DVA_GET_VDEV(&bp->blk_dva[2]));6181if (equal != 0) {6182zb->zb_ditto_samevdev++;61836184if (DVA_GET_VDEV(&bp->blk_dva[0]) ==6185DVA_GET_VDEV(&bp->blk_dva[1]) &&6186same_metaslab(zcb->zcb_spa,6187DVA_GET_VDEV(&bp->blk_dva[0]),6188DVA_GET_OFFSET(&bp->blk_dva[0]),6189DVA_GET_OFFSET(&bp->blk_dva[1])))6190zb->zb_ditto_same_ms++;6191else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==6192DVA_GET_VDEV(&bp->blk_dva[2]) &&6193same_metaslab(zcb->zcb_spa,6194DVA_GET_VDEV(&bp->blk_dva[0]),6195DVA_GET_OFFSET(&bp->blk_dva[0]),6196DVA_GET_OFFSET(&bp->blk_dva[2])))6197zb->zb_ditto_same_ms++;6198else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==6199DVA_GET_VDEV(&bp->blk_dva[2]) &&6200same_metaslab(zcb->zcb_spa,6201DVA_GET_VDEV(&bp->blk_dva[1]),6202DVA_GET_OFFSET(&bp->blk_dva[1]),6203DVA_GET_OFFSET(&bp->blk_dva[2])))6204zb->zb_ditto_same_ms++;6205}6206break;6207}6208}62096210spa_config_exit(zcb->zcb_spa, 
SCL_CONFIG, FTAG);62116212if (BP_IS_EMBEDDED(bp)) {6213zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;6214zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]6215[BPE_GET_PSIZE(bp)]++;6216return;6217}62186219if (block_classes != 0) {6220spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);62216222uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[0]);6223uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[0]);6224vdev_t *vd = vdev_lookup_top(zcb->zcb_spa, vdev);6225ASSERT(vd != NULL);6226metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];6227ASSERT(ms != NULL);6228metaslab_group_t *mg = ms->ms_group;6229ASSERT(mg != NULL);6230metaslab_class_t *mc = mg->mg_class;6231ASSERT(mc != NULL);62326233spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);62346235int class;6236if (mc == spa_normal_class(zcb->zcb_spa)) {6237class = CLASS_NORMAL;6238} else if (mc == spa_special_class(zcb->zcb_spa)) {6239class = CLASS_SPECIAL;6240} else if (mc == spa_dedup_class(zcb->zcb_spa)) {6241class = CLASS_DEDUP;6242} else {6243class = CLASS_OTHER;6244}62456246if (!(block_classes & class)) {6247goto hist_skipped;6248}6249}62506251/*6252* The binning histogram bins by powers of two up to6253* SPA_MAXBLOCKSIZE rather than creating bins for6254* every possible blocksize found in the pool.6255*/6256int bin;62576258/*6259* Binning strategy: each bin includes blocks up to and including6260* the given size (excluding blocks that fit into the previous bin).6261* This way, the "4K" bin includes blocks within the (2K; 4K] range.6262*/6263#define BIN(size) (highbit64((size) - 1))62646265switch (block_bin_mode) {6266case BIN_PSIZE: bin = BIN(BP_GET_PSIZE(bp)); break;6267case BIN_LSIZE: bin = BIN(BP_GET_LSIZE(bp)); break;6268case BIN_ASIZE: bin = BIN(BP_GET_ASIZE(bp)); break;6269case BIN_AUTO: break;6270default: PANIC("bad block_bin_mode"); abort();6271}62726273if (block_bin_mode == BIN_AUTO)6274bin = BIN(BP_GET_PSIZE(bp));62756276zcb->zcb_psize_count[bin]++;6277zcb->zcb_psize_len[bin] += 
BP_GET_PSIZE(bp);6278zcb->zcb_psize_total += BP_GET_PSIZE(bp);62796280if (block_bin_mode == BIN_AUTO)6281bin = BIN(BP_GET_LSIZE(bp));62826283zcb->zcb_lsize_count[bin]++;6284zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);6285zcb->zcb_lsize_total += BP_GET_LSIZE(bp);62866287if (block_bin_mode == BIN_AUTO)6288bin = BIN(BP_GET_ASIZE(bp));62896290zcb->zcb_asize_count[bin]++;6291zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);6292zcb->zcb_asize_total += BP_GET_ASIZE(bp);62936294#undef BIN62956296hist_skipped:6297if (!do_claim)6298return;62996300VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,6301spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,6302ZIO_FLAG_CANFAIL)));6303}63046305static void6306zdb_blkptr_done(zio_t *zio)6307{6308spa_t *spa = zio->io_spa;6309blkptr_t *bp = zio->io_bp;6310int ioerr = zio->io_error;6311zdb_cb_t *zcb = zio->io_private;6312zbookmark_phys_t *zb = &zio->io_bookmark;63136314mutex_enter(&spa->spa_scrub_lock);6315spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);6316cv_broadcast(&spa->spa_scrub_io_cv);63176318if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {6319char blkbuf[BP_SPRINTF_LEN];63206321zcb->zcb_haderrors = 1;6322zcb->zcb_errors[ioerr]++;63236324if (dump_opt['b'] >= 2)6325snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);6326else6327blkbuf[0] = '\0';63286329(void) printf("zdb_blkptr_cb: "6330"Got error %d reading "6331"<%llu, %llu, %lld, %llx> %s -- skipping\n",6332ioerr,6333(u_longlong_t)zb->zb_objset,6334(u_longlong_t)zb->zb_object,6335(u_longlong_t)zb->zb_level,6336(u_longlong_t)zb->zb_blkid,6337blkbuf);6338}6339mutex_exit(&spa->spa_scrub_lock);63406341abd_free(zio->io_abd);6342}63436344static int6345zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,6346const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)6347{6348zdb_cb_t *zcb = arg;6349dmu_object_type_t type;6350boolean_t is_metadata;63516352if (zb->zb_level == ZB_DNODE_LEVEL)6353return (0);63546355if (dump_opt['b'] >= 5 && BP_GET_BIRTH(bp) > 0) {6356char 
blkbuf[BP_SPRINTF_LEN];6357snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);6358(void) printf("objset %llu object %llu "6359"level %lld offset 0x%llx %s\n",6360(u_longlong_t)zb->zb_objset,6361(u_longlong_t)zb->zb_object,6362(longlong_t)zb->zb_level,6363(u_longlong_t)blkid2offset(dnp, bp, zb),6364blkbuf);6365}63666367if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))6368return (0);63696370type = BP_GET_TYPE(bp);63716372zdb_count_block(zcb, zilog, bp,6373(type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);63746375is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));63766377if (!BP_IS_EMBEDDED(bp) &&6378(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {6379size_t size = BP_GET_PSIZE(bp);6380abd_t *abd = abd_alloc(size, B_FALSE);6381int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;63826383/* If it's an intent log block, failure is expected. */6384if (zb->zb_level == ZB_ZIL_LEVEL)6385flags |= ZIO_FLAG_SPECULATIVE;63866387mutex_enter(&spa->spa_scrub_lock);6388while (spa->spa_load_verify_bytes > max_inflight_bytes)6389cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);6390spa->spa_load_verify_bytes += size;6391mutex_exit(&spa->spa_scrub_lock);63926393zio_nowait(zio_read(NULL, spa, bp, abd, size,6394zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));6395}63966397zcb->zcb_readfails = 0;63986399/* only call gethrtime() every 100 blocks */6400static int iters;6401if (++iters > 100)6402iters = 0;6403else6404return (0);64056406if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {6407uint64_t now = gethrtime();6408char buf[10];6409uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;6410uint64_t kb_per_sec =64111 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));6412uint64_t sec_remaining =6413(zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;64146415/* make sure nicenum has enough space */6416_Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated");64176418zfs_nicebytes(bytes, buf, sizeof (buf));6419(void) 
fprintf(stderr,6420"\r%5s completed (%4"PRIu64"MB/s) "6421"estimated time remaining: "6422"%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ",6423buf, kb_per_sec / 1024,6424sec_remaining / 60 / 60,6425sec_remaining / 60 % 60,6426sec_remaining % 60);64276428zcb->zcb_lastprint = now;6429}64306431return (0);6432}64336434static void6435zdb_leak(void *arg, uint64_t start, uint64_t size)6436{6437vdev_t *vd = arg;64386439(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",6440(u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);6441}64426443static metaslab_ops_t zdb_metaslab_ops = {6444NULL /* alloc */6445};64466447static int6448load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,6449uint64_t txg, void *arg)6450{6451spa_vdev_removal_t *svr = arg;64526453uint64_t offset = sme->sme_offset;6454uint64_t size = sme->sme_run;64556456/* skip vdevs we don't care about */6457if (sme->sme_vdev != svr->svr_vdev_id)6458return (0);64596460vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);6461metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];6462ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);64636464if (txg < metaslab_unflushed_txg(ms))6465return (0);64666467if (sme->sme_type == SM_ALLOC)6468zfs_range_tree_add(svr->svr_allocd_segs, offset, size);6469else6470zfs_range_tree_remove(svr->svr_allocd_segs, offset, size);64716472return (0);6473}64746475static void6476claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,6477uint64_t size, void *arg)6478{6479(void) inner_offset, (void) arg;64806481/*6482* This callback was called through a remap from6483* a device being removed. 
Therefore, the vdev that6484* this callback is applied to is a concrete6485* vdev.6486*/6487ASSERT(vdev_is_concrete(vd));64886489VERIFY0(metaslab_claim_impl(vd, offset, size,6490spa_min_claim_txg(vd->vdev_spa)));6491}64926493static void6494claim_segment_cb(void *arg, uint64_t offset, uint64_t size)6495{6496vdev_t *vd = arg;64976498vdev_indirect_ops.vdev_op_remap(vd, offset, size,6499claim_segment_impl_cb, NULL);6500}65016502/*6503* After accounting for all allocated blocks that are directly referenced,6504* we might have missed a reference to a block from a partially complete6505* (and thus unused) indirect mapping object. We perform a secondary pass6506* through the metaslabs we have already mapped and claim the destination6507* blocks.6508*/6509static void6510zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)6511{6512if (dump_opt['L'])6513return;65146515if (spa->spa_vdev_removal == NULL)6516return;65176518spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);65196520spa_vdev_removal_t *svr = spa->spa_vdev_removal;6521vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);6522vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;65236524ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));65256526zfs_range_tree_t *allocs = zfs_range_tree_create_flags(6527NULL, ZFS_RANGE_SEG64, NULL, 0, 0,65280, "zdb_claim_removing:allocs");6529for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {6530metaslab_t *msp = vd->vdev_ms[msi];65316532ASSERT0(zfs_range_tree_space(allocs));6533if (msp->ms_sm != NULL)6534VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));6535zfs_range_tree_vacate(allocs, zfs_range_tree_add,6536svr->svr_allocd_segs);6537}6538zfs_range_tree_destroy(allocs);65396540iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);65416542/*6543* Clear everything past what has been synced,6544* because we have not allocated mappings for6545* it yet.6546*/6547zfs_range_tree_clear(svr->svr_allocd_segs,6548vdev_indirect_mapping_max_offset(vim),6549vd->vdev_asize - 
vdev_indirect_mapping_max_offset(vim));65506551zcb->zcb_removing_size += zfs_range_tree_space(svr->svr_allocd_segs);6552zfs_range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);65536554spa_config_exit(spa, SCL_CONFIG, FTAG);6555}65566557static int6558increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,6559dmu_tx_t *tx)6560{6561(void) tx;6562zdb_cb_t *zcb = arg;6563spa_t *spa = zcb->zcb_spa;6564vdev_t *vd;6565const dva_t *dva = &bp->blk_dva[0];65666567ASSERT(!bp_freed);6568ASSERT(!dump_opt['L']);6569ASSERT3U(BP_GET_NDVAS(bp), ==, 1);65706571spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);6572vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));6573ASSERT3P(vd, !=, NULL);6574spa_config_exit(spa, SCL_VDEV, FTAG);65756576ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);6577ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);65786579vdev_indirect_mapping_increment_obsolete_count(6580vd->vdev_indirect_mapping,6581DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),6582zcb->zcb_vd_obsolete_counts[vd->vdev_id]);65836584return (0);6585}65866587static uint32_t *6588zdb_load_obsolete_counts(vdev_t *vd)6589{6590vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;6591spa_t *spa = vd->vdev_spa;6592spa_condensing_indirect_phys_t *scip =6593&spa->spa_condensing_indirect_phys;6594uint64_t obsolete_sm_object;6595uint32_t *counts;65966597VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));6598EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);6599counts = vdev_indirect_mapping_load_obsolete_counts(vim);6600if (vd->vdev_obsolete_sm != NULL) {6601vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,6602vd->vdev_obsolete_sm);6603}6604if (scip->scip_vdev == vd->vdev_id &&6605scip->scip_prev_obsolete_sm_object != 0) {6606space_map_t *prev_obsolete_sm = NULL;6607VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,6608scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 
0));6609vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,6610prev_obsolete_sm);6611space_map_close(prev_obsolete_sm);6612}6613return (counts);6614}66156616typedef struct checkpoint_sm_exclude_entry_arg {6617vdev_t *cseea_vd;6618uint64_t cseea_checkpoint_size;6619} checkpoint_sm_exclude_entry_arg_t;66206621static int6622checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)6623{6624checkpoint_sm_exclude_entry_arg_t *cseea = arg;6625vdev_t *vd = cseea->cseea_vd;6626metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];6627uint64_t end = sme->sme_offset + sme->sme_run;66286629ASSERT(sme->sme_type == SM_FREE);66306631/*6632* Since the vdev_checkpoint_sm exists in the vdev level6633* and the ms_sm space maps exist in the metaslab level,6634* an entry in the checkpoint space map could theoretically6635* cross the boundaries of the metaslab that it belongs.6636*6637* In reality, because of the way that we populate and6638* manipulate the checkpoint's space maps currently,6639* there shouldn't be any entries that cross metaslabs.6640* Hence the assertion below.6641*6642* That said, there is no fundamental requirement that6643* the checkpoint's space map entries should not cross6644* metaslab boundaries. 
So if needed we could add code6645* that handles metaslab-crossing segments in the future.6646*/6647VERIFY3U(sme->sme_offset, >=, ms->ms_start);6648VERIFY3U(end, <=, ms->ms_start + ms->ms_size);66496650/*6651* By removing the entry from the allocated segments we6652* also verify that the entry is there to begin with.6653*/6654mutex_enter(&ms->ms_lock);6655zfs_range_tree_remove(ms->ms_allocatable, sme->sme_offset,6656sme->sme_run);6657mutex_exit(&ms->ms_lock);66586659cseea->cseea_checkpoint_size += sme->sme_run;6660return (0);6661}66626663static void6664zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)6665{6666spa_t *spa = vd->vdev_spa;6667space_map_t *checkpoint_sm = NULL;6668uint64_t checkpoint_sm_obj;66696670/*6671* If there is no vdev_top_zap, we are in a pool whose6672* version predates the pool checkpoint feature.6673*/6674if (vd->vdev_top_zap == 0)6675return;66766677/*6678* If there is no reference of the vdev_checkpoint_sm in6679* the vdev_top_zap, then one of the following scenarios6680* is true:6681*6682* 1] There is no checkpoint6683* 2] There is a checkpoint, but no checkpointed blocks6684* have been freed yet6685* 3] The current vdev is indirect6686*6687* In these cases we return immediately.6688*/6689if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,6690VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)6691return;66926693VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,6694VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,6695&checkpoint_sm_obj));66966697checkpoint_sm_exclude_entry_arg_t cseea;6698cseea.cseea_vd = vd;6699cseea.cseea_checkpoint_size = 0;67006701VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),6702checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));67036704VERIFY0(space_map_iterate(checkpoint_sm,6705space_map_length(checkpoint_sm),6706checkpoint_sm_exclude_entry_cb, &cseea));6707space_map_close(checkpoint_sm);67086709zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;6710}67116712static 
void6713zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)6714{6715ASSERT(!dump_opt['L']);67166717vdev_t *rvd = spa->spa_root_vdev;6718for (uint64_t c = 0; c < rvd->vdev_children; c++) {6719ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);6720zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);6721}6722}67236724static int6725count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,6726uint64_t txg, void *arg)6727{6728int64_t *ualloc_space = arg;67296730uint64_t offset = sme->sme_offset;6731uint64_t vdev_id = sme->sme_vdev;67326733vdev_t *vd = vdev_lookup_top(spa, vdev_id);6734if (!vdev_is_concrete(vd))6735return (0);67366737metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];6738ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);67396740if (txg < metaslab_unflushed_txg(ms))6741return (0);67426743if (sme->sme_type == SM_ALLOC)6744*ualloc_space += sme->sme_run;6745else6746*ualloc_space -= sme->sme_run;67476748return (0);6749}67506751static int64_t6752get_unflushed_alloc_space(spa_t *spa)6753{6754if (dump_opt['L'])6755return (0);67566757int64_t ualloc_space = 0;6758iterate_through_spacemap_logs(spa, count_unflushed_space_cb,6759&ualloc_space);6760return (ualloc_space);6761}67626763static int6764load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)6765{6766maptype_t *uic_maptype = arg;67676768uint64_t offset = sme->sme_offset;6769uint64_t size = sme->sme_run;6770uint64_t vdev_id = sme->sme_vdev;67716772vdev_t *vd = vdev_lookup_top(spa, vdev_id);67736774/* skip indirect vdevs */6775if (!vdev_is_concrete(vd))6776return (0);67776778metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];67796780ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);6781ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);67826783if (txg < metaslab_unflushed_txg(ms))6784return (0);67856786if (*uic_maptype == sme->sme_type)6787zfs_range_tree_add(ms->ms_allocatable, offset, 
size);6788else6789zfs_range_tree_remove(ms->ms_allocatable, offset, size);67906791return (0);6792}67936794static void6795load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)6796{6797iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);6798}67996800static void6801load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)6802{6803vdev_t *rvd = spa->spa_root_vdev;6804for (uint64_t i = 0; i < rvd->vdev_children; i++) {6805vdev_t *vd = rvd->vdev_child[i];68066807ASSERT3U(i, ==, vd->vdev_id);68086809if (vd->vdev_ops == &vdev_indirect_ops)6810continue;68116812for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {6813metaslab_t *msp = vd->vdev_ms[m];68146815(void) fprintf(stderr,6816"\rloading concrete vdev %llu, "6817"metaslab %llu of %llu ...",6818(longlong_t)vd->vdev_id,6819(longlong_t)msp->ms_id,6820(longlong_t)vd->vdev_ms_count);68216822mutex_enter(&msp->ms_lock);6823zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);68246825/*6826* We don't want to spend the CPU manipulating the6827* size-ordered tree, so clear the range_tree ops.6828*/6829msp->ms_allocatable->rt_ops = NULL;68306831if (msp->ms_sm != NULL) {6832VERIFY0(space_map_load(msp->ms_sm,6833msp->ms_allocatable, maptype));6834}6835if (!msp->ms_loaded)6836msp->ms_loaded = B_TRUE;6837mutex_exit(&msp->ms_lock);6838}6839}68406841load_unflushed_to_ms_allocatables(spa, maptype);6842}68436844/*6845* vm_idxp is an in-out parameter which (for indirect vdevs) is the6846* index in vim_entries that has the first entry in this metaslab.6847* On return, it will be set to the first entry after this metaslab.6848*/6849static void6850load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,6851uint64_t *vim_idxp)6852{6853vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;68546855mutex_enter(&msp->ms_lock);6856zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);68576858/*6859* We don't want to spend the CPU manipulating the6860* size-ordered tree, so clear the range_tree 
ops.6861*/6862msp->ms_allocatable->rt_ops = NULL;68636864for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);6865(*vim_idxp)++) {6866vdev_indirect_mapping_entry_phys_t *vimep =6867&vim->vim_entries[*vim_idxp];6868uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);6869uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);6870ASSERT3U(ent_offset, >=, msp->ms_start);6871if (ent_offset >= msp->ms_start + msp->ms_size)6872break;68736874/*6875* Mappings do not cross metaslab boundaries,6876* because we create them by walking the metaslabs.6877*/6878ASSERT3U(ent_offset + ent_len, <=,6879msp->ms_start + msp->ms_size);6880zfs_range_tree_add(msp->ms_allocatable, ent_offset, ent_len);6881}68826883if (!msp->ms_loaded)6884msp->ms_loaded = B_TRUE;6885mutex_exit(&msp->ms_lock);6886}68876888static void6889zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)6890{6891ASSERT(!dump_opt['L']);68926893vdev_t *rvd = spa->spa_root_vdev;6894for (uint64_t c = 0; c < rvd->vdev_children; c++) {6895vdev_t *vd = rvd->vdev_child[c];68966897ASSERT3U(c, ==, vd->vdev_id);68986899if (vd->vdev_ops != &vdev_indirect_ops)6900continue;69016902/*6903* Note: we don't check for mapping leaks on6904* removing vdevs because their ms_allocatable's6905* are used to look for leaks in allocated space.6906*/6907zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);69086909/*6910* Normally, indirect vdevs don't have any6911* metaslabs. 
We want to set them up for6912* zio_claim().6913*/6914vdev_metaslab_group_create(vd);6915VERIFY0(vdev_metaslab_init(vd, 0));69166917vdev_indirect_mapping_t *vim __maybe_unused =6918vd->vdev_indirect_mapping;6919uint64_t vim_idx = 0;6920for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {69216922(void) fprintf(stderr,6923"\rloading indirect vdev %llu, "6924"metaslab %llu of %llu ...",6925(longlong_t)vd->vdev_id,6926(longlong_t)vd->vdev_ms[m]->ms_id,6927(longlong_t)vd->vdev_ms_count);69286929load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],6930&vim_idx);6931}6932ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));6933}6934}69356936static void6937zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)6938{6939zcb->zcb_spa = spa;69406941if (dump_opt['L'])6942return;69436944dsl_pool_t *dp = spa->spa_dsl_pool;6945vdev_t *rvd = spa->spa_root_vdev;69466947/*6948* We are going to be changing the meaning of the metaslab's6949* ms_allocatable. Ensure that the allocator doesn't try to6950* use the tree.6951*/6952spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;6953spa->spa_log_class->mc_ops = &zdb_metaslab_ops;6954spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;6955spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops;69566957zcb->zcb_vd_obsolete_counts =6958umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),6959UMEM_NOFAIL);69606961/*6962* For leak detection, we overload the ms_allocatable trees6963* to contain allocated segments instead of free segments.6964* As a result, we can't use the normal metaslab_load/unload6965* interfaces.6966*/6967zdb_leak_init_prepare_indirect_vdevs(spa, zcb);6968load_concrete_ms_allocatable_trees(spa, SM_ALLOC);69696970/*6971* On load_concrete_ms_allocatable_trees() we loaded all the6972* allocated entries from the ms_sm to the ms_allocatable for6973* each metaslab. 
If the pool has a checkpoint or is in the6974* middle of discarding a checkpoint, some of these blocks6975* may have been freed but their ms_sm may not have been6976* updated because they are referenced by the checkpoint. In6977* order to avoid false-positives during leak-detection, we6978* go through the vdev's checkpoint space map and exclude all6979* its entries from their relevant ms_allocatable.6980*6981* We also aggregate the space held by the checkpoint and add6982* it to zcb_checkpoint_size.6983*6984* Note that at this point we are also verifying that all the6985* entries on the checkpoint_sm are marked as allocated in6986* the ms_sm of their relevant metaslab.6987* [see comment in checkpoint_sm_exclude_entry_cb()]6988*/6989zdb_leak_init_exclude_checkpoint(spa, zcb);6990ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));69916992/* for cleaner progress output */6993(void) fprintf(stderr, "\n");69946995if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {6996ASSERT(spa_feature_is_enabled(spa,6997SPA_FEATURE_DEVICE_REMOVAL));6998(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,6999increment_indirect_mapping_cb, zcb, NULL);7000}7001}70027003static boolean_t7004zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)7005{7006boolean_t leaks = B_FALSE;7007vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;7008uint64_t total_leaked = 0;7009boolean_t are_precise = B_FALSE;70107011ASSERT(vim != NULL);70127013for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {7014vdev_indirect_mapping_entry_phys_t *vimep =7015&vim->vim_entries[i];7016uint64_t obsolete_bytes = 0;7017uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);7018metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];70197020/*7021* This is not very efficient but it's easy to7022* verify correctness.7023*/7024for (uint64_t inner_offset = 0;7025inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);7026inner_offset += 1ULL << vd->vdev_ashift) {7027if 
(zfs_range_tree_contains(msp->ms_allocatable,7028offset + inner_offset, 1ULL << vd->vdev_ashift)) {7029obsolete_bytes += 1ULL << vd->vdev_ashift;7030}7031}70327033int64_t bytes_leaked = obsolete_bytes -7034zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];7035ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,7036zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);70377038VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));7039if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {7040(void) printf("obsolete indirect mapping count "7041"mismatch on %llu:%llx:%llx : %llx bytes leaked\n",7042(u_longlong_t)vd->vdev_id,7043(u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),7044(u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),7045(u_longlong_t)bytes_leaked);7046}7047total_leaked += ABS(bytes_leaked);7048}70497050VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));7051if (!are_precise && total_leaked > 0) {7052int pct_leaked = total_leaked * 100 /7053vdev_indirect_mapping_bytes_mapped(vim);7054(void) printf("cannot verify obsolete indirect mapping "7055"counts of vdev %llu because precise feature was not "7056"enabled when it was removed: %d%% (%llx bytes) of mapping"7057"unreferenced\n",7058(u_longlong_t)vd->vdev_id, pct_leaked,7059(u_longlong_t)total_leaked);7060} else if (total_leaked > 0) {7061(void) printf("obsolete indirect mapping count mismatch "7062"for vdev %llu -- %llx total bytes mismatched\n",7063(u_longlong_t)vd->vdev_id,7064(u_longlong_t)total_leaked);7065leaks |= B_TRUE;7066}70677068vdev_indirect_mapping_free_obsolete_counts(vim,7069zcb->zcb_vd_obsolete_counts[vd->vdev_id]);7070zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;70717072return (leaks);7073}70747075static boolean_t7076zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)7077{7078if (dump_opt['L'])7079return (B_FALSE);70807081boolean_t leaks = B_FALSE;7082vdev_t *rvd = spa->spa_root_vdev;7083for (unsigned c = 0; c < rvd->vdev_children; c++) {7084vdev_t *vd = rvd->vdev_child[c];70857086if 
(zcb->zcb_vd_obsolete_counts[c] != NULL) {7087leaks |= zdb_check_for_obsolete_leaks(vd, zcb);7088}70897090for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {7091metaslab_t *msp = vd->vdev_ms[m];7092ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==7093spa_embedded_log_class(spa) ||7094msp->ms_group->mg_class ==7095spa_special_embedded_log_class(spa)) ?7096vd->vdev_log_mg : vd->vdev_mg);70977098/*7099* ms_allocatable has been overloaded7100* to contain allocated segments. Now that7101* we finished traversing all blocks, any7102* block that remains in the ms_allocatable7103* represents an allocated block that we7104* did not claim during the traversal.7105* Claimed blocks would have been removed7106* from the ms_allocatable. For indirect7107* vdevs, space remaining in the tree7108* represents parts of the mapping that are7109* not referenced, which is not a bug.7110*/7111if (vd->vdev_ops == &vdev_indirect_ops) {7112zfs_range_tree_vacate(msp->ms_allocatable,7113NULL, NULL);7114} else {7115zfs_range_tree_vacate(msp->ms_allocatable,7116zdb_leak, vd);7117}7118if (msp->ms_loaded) {7119msp->ms_loaded = B_FALSE;7120}7121}7122}71237124umem_free(zcb->zcb_vd_obsolete_counts,7125rvd->vdev_children * sizeof (uint32_t *));7126zcb->zcb_vd_obsolete_counts = NULL;71277128return (leaks);7129}71307131static int7132count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)7133{7134(void) tx;7135zdb_cb_t *zcb = arg;71367137if (dump_opt['b'] >= 5) {7138char blkbuf[BP_SPRINTF_LEN];7139snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);7140(void) printf("[%s] %s\n",7141"deferred free", blkbuf);7142}7143zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);7144return (0);7145}71467147/*7148* Iterate over livelists which have been destroyed by the user but7149* are still present in the MOS, waiting to be freed7150*/7151static void7152iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)7153{7154objset_t *mos = spa->spa_meta_objset;7155uint64_t zap_obj;7156int err = zap_lookup(mos, 
DMU_POOL_DIRECTORY_OBJECT,7157DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);7158if (err == ENOENT)7159return;7160ASSERT0(err);71617162zap_cursor_t zc;7163zap_attribute_t *attrp = zap_attribute_alloc();7164dsl_deadlist_t ll;7165/* NULL out os prior to dsl_deadlist_open in case it's garbage */7166ll.dl_os = NULL;7167for (zap_cursor_init(&zc, mos, zap_obj);7168zap_cursor_retrieve(&zc, attrp) == 0;7169(void) zap_cursor_advance(&zc)) {7170VERIFY0(dsl_deadlist_open(&ll, mos, attrp->za_first_integer));7171func(&ll, arg);7172dsl_deadlist_close(&ll);7173}7174zap_cursor_fini(&zc);7175zap_attribute_free(attrp);7176}71777178static int7179bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,7180dmu_tx_t *tx)7181{7182ASSERT(!bp_freed);7183return (count_block_cb(arg, bp, tx));7184}71857186static int7187livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)7188{7189zdb_cb_t *zbc = args;7190bplist_t blks;7191bplist_create(&blks);7192/* determine which blocks have been alloc'd but not freed */7193VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));7194/* count those blocks */7195(void) bplist_iterate(&blks, count_block_cb, zbc, NULL);7196bplist_destroy(&blks);7197return (0);7198}71997200static void7201livelist_count_blocks(dsl_deadlist_t *ll, void *arg)7202{7203dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);7204}72057206/*7207* Count the blocks in the livelists that have been destroyed by the user7208* but haven't yet been freed.7209*/7210static void7211deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)7212{7213iterate_deleted_livelists(spa, livelist_count_blocks, zbc);7214}72157216static void7217dump_livelist_cb(dsl_deadlist_t *ll, void *arg)7218{7219ASSERT0P(arg);7220global_feature_count[SPA_FEATURE_LIVELIST]++;7221dump_blkptr_list(ll, "Deleted Livelist");7222dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);7223}72247225/*7226* Print out, register object references to, and increment 
feature counts for7227* livelists that have been destroyed by the user but haven't yet been freed.7228*/7229static void7230deleted_livelists_dump_mos(spa_t *spa)7231{7232uint64_t zap_obj;7233objset_t *mos = spa->spa_meta_objset;7234int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,7235DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);7236if (err == ENOENT)7237return;7238mos_obj_refd(zap_obj);7239iterate_deleted_livelists(spa, dump_livelist_cb, NULL);7240}72417242static int7243zdb_brt_entry_compare(const void *zcn1, const void *zcn2)7244{7245const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;7246const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;7247int cmp;72487249cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));7250if (cmp == 0)7251cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));72527253return (cmp);7254}72557256static int7257dump_block_stats(spa_t *spa)7258{7259zdb_cb_t *zcb;7260zdb_blkstats_t *zb, *tzb;7261uint64_t norm_alloc, norm_space, total_alloc, total_found;7262int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |7263TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;7264boolean_t leaks = B_FALSE;7265int e, c, err;7266bp_embedded_type_t i;72677268ddt_prefetch_all(spa);72697270zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);72717272if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {7273avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,7274sizeof (zdb_brt_entry_t),7275offsetof(zdb_brt_entry_t, zbre_node));7276zcb->zcb_brt_is_active = B_TRUE;7277}72787279(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",7280(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",7281(dump_opt['c'] == 1) ? "metadata " : "",7282dump_opt['c'] ? "checksums " : "",7283(dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",7284!dump_opt['L'] ? "nothing leaked " : "");72857286/*7287* When leak detection is enabled we load all space maps as SM_ALLOC7288* maps, then traverse the pool claiming each block we discover. 
If7289* the pool is perfectly consistent, the segment trees will be empty7290* when we're done. Anything left over is a leak; any block we can't7291* claim (because it's not part of any space map) is a double7292* allocation, reference to a freed block, or an unclaimed log block.7293*7294* When leak detection is disabled (-L option) we still traverse the7295* pool claiming each block we discover, but we skip opening any space7296* maps.7297*/7298zdb_leak_init(spa, zcb);72997300/*7301* If there's a deferred-free bplist, process that first.7302*/7303(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,7304bpobj_count_block_cb, zcb, NULL);73057306if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {7307(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,7308bpobj_count_block_cb, zcb, NULL);7309}73107311zdb_claim_removing(spa, zcb);73127313if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {7314VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,7315spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,7316zcb, NULL));7317}73187319deleted_livelists_count_blocks(spa, zcb);73207321if (dump_opt['c'] > 1)7322flags |= TRAVERSE_PREFETCH_DATA;73237324zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));7325zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));7326zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));7327zcb->zcb_totalasize +=7328metaslab_class_get_alloc(spa_embedded_log_class(spa));7329zcb->zcb_totalasize +=7330metaslab_class_get_alloc(spa_special_embedded_log_class(spa));7331zcb->zcb_start = zcb->zcb_lastprint = gethrtime();7332err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);73337334/*7335* If we've traversed the data blocks then we need to wait for those7336* I/Os to complete. 
We leverage "The Godfather" zio to wait on7337* all async I/Os to complete.7338*/7339if (dump_opt['c']) {7340for (c = 0; c < max_ncpus; c++) {7341(void) zio_wait(spa->spa_async_zio_root[c]);7342spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,7343ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |7344ZIO_FLAG_GODFATHER);7345}7346}7347ASSERT0(spa->spa_load_verify_bytes);73487349/*7350* Done after zio_wait() since zcb_haderrors is modified in7351* zdb_blkptr_done()7352*/7353zcb->zcb_haderrors |= err;73547355if (zcb->zcb_haderrors) {7356(void) printf("\nError counts:\n\n");7357(void) printf("\t%5s %s\n", "errno", "count");7358for (e = 0; e < 256; e++) {7359if (zcb->zcb_errors[e] != 0) {7360(void) printf("\t%5d %llu\n",7361e, (u_longlong_t)zcb->zcb_errors[e]);7362}7363}7364}73657366/*7367* Report any leaked segments.7368*/7369leaks |= zdb_leak_fini(spa, zcb);73707371tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];73727373norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));7374norm_space = metaslab_class_get_space(spa_normal_class(spa));73757376total_alloc = norm_alloc +7377metaslab_class_get_alloc(spa_log_class(spa)) +7378metaslab_class_get_alloc(spa_embedded_log_class(spa)) +7379metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) +7380metaslab_class_get_alloc(spa_special_class(spa)) +7381metaslab_class_get_alloc(spa_dedup_class(spa)) +7382get_unflushed_alloc_space(spa);7383total_found =7384tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +7385zcb->zcb_removing_size + zcb->zcb_checkpoint_size;73867387if (total_found == total_alloc && !dump_opt['L']) {7388(void) printf("\n\tNo leaks (block sum matches space"7389" maps exactly)\n");7390} else if (!dump_opt['L']) {7391(void) printf("block traversal size %llu != alloc %llu "7392"(%s %lld)\n",7393(u_longlong_t)total_found,7394(u_longlong_t)total_alloc,7395(dump_opt['L']) ? 
"unreachable" : "leaked",7396(longlong_t)(total_alloc - total_found));7397}73987399if (tzb->zb_count == 0) {7400umem_free(zcb, sizeof (zdb_cb_t));7401return (2);7402}74037404(void) printf("\n");7405(void) printf("\t%-16s %14llu\n", "bp count:",7406(u_longlong_t)tzb->zb_count);7407(void) printf("\t%-16s %14llu\n", "ganged count:",7408(longlong_t)tzb->zb_gangs);7409(void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:",7410(u_longlong_t)tzb->zb_lsize,7411(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));7412(void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",7413"bp physical:", (u_longlong_t)tzb->zb_psize,7414(u_longlong_t)(tzb->zb_psize / tzb->zb_count),7415(double)tzb->zb_lsize / tzb->zb_psize);7416(void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",7417"bp allocated:", (u_longlong_t)tzb->zb_asize,7418(u_longlong_t)(tzb->zb_asize / tzb->zb_count),7419(double)tzb->zb_lsize / tzb->zb_asize);7420(void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n",7421"bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,7422(u_longlong_t)zcb->zcb_dedup_blocks,7423(double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);7424(void) printf("\t%-16s %14llu count: %6llu\n",7425"bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,7426(u_longlong_t)zcb->zcb_clone_blocks);7427(void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",7428(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);74297430if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {7431uint64_t alloc = metaslab_class_get_alloc(7432spa_special_class(spa));7433uint64_t space = metaslab_class_get_space(7434spa_special_class(spa));74357436(void) printf("\t%-16s %14llu used: %5.2f%%\n",7437"Special class", (u_longlong_t)alloc,7438100.0 * alloc / space);7439}74407441if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {7442uint64_t alloc = metaslab_class_get_alloc(7443spa_dedup_class(spa));7444uint64_t space = metaslab_class_get_space(7445spa_dedup_class(spa));74467447(void) 
printf("\t%-16s %14llu used: %5.2f%%\n",7448"Dedup class", (u_longlong_t)alloc,7449100.0 * alloc / space);7450}74517452if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {7453uint64_t alloc = metaslab_class_get_alloc(7454spa_embedded_log_class(spa));7455uint64_t space = metaslab_class_get_space(7456spa_embedded_log_class(spa));74577458(void) printf("\t%-16s %14llu used: %5.2f%%\n",7459"Embedded log class", (u_longlong_t)alloc,7460100.0 * alloc / space);7461}74627463if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor7464!= NULL) {7465uint64_t alloc = metaslab_class_get_alloc(7466spa_special_embedded_log_class(spa));7467uint64_t space = metaslab_class_get_space(7468spa_special_embedded_log_class(spa));74697470(void) printf("\t%-16s %14llu used: %5.2f%%\n",7471"Special embedded log", (u_longlong_t)alloc,7472100.0 * alloc / space);7473}74747475for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {7476if (zcb->zcb_embedded_blocks[i] == 0)7477continue;7478(void) printf("\n");7479(void) printf("\tadditional, non-pointer bps of type %u: "7480"%10llu\n",7481i, (u_longlong_t)zcb->zcb_embedded_blocks[i]);74827483if (dump_opt['b'] >= 3) {7484(void) printf("\t number of (compressed) bytes: "7485"number of bps\n");7486dump_histogram(zcb->zcb_embedded_histogram[i],7487sizeof (zcb->zcb_embedded_histogram[i]) /7488sizeof (zcb->zcb_embedded_histogram[i][0]), 0);7489}7490}74917492if (tzb->zb_ditto_samevdev != 0) {7493(void) printf("\tDittoed blocks on same vdev: %llu\n",7494(longlong_t)tzb->zb_ditto_samevdev);7495}7496if (tzb->zb_ditto_same_ms != 0) {7497(void) printf("\tDittoed blocks in same metaslab: %llu\n",7498(longlong_t)tzb->zb_ditto_same_ms);7499}75007501for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {7502vdev_t *vd = spa->spa_root_vdev->vdev_child[v];7503vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;75047505if (vim == NULL) {7506continue;7507}75087509char 
mem[32];7510zdb_nicenum(vdev_indirect_mapping_num_entries(vim),7511mem, vdev_indirect_mapping_size(vim));75127513(void) printf("\tindirect vdev id %llu has %llu segments "7514"(%s in memory)\n",7515(longlong_t)vd->vdev_id,7516(longlong_t)vdev_indirect_mapping_num_entries(vim), mem);7517}75187519if (dump_opt['b'] >= 2) {7520int l, t, level;7521char csize[32], lsize[32], psize[32], asize[32];7522char avg[32], gang[32];7523(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"7524"\t avg\t comp\t%%Total\tType\n");75257526zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t),7527UMEM_NOFAIL);75287529for (t = 0; t <= ZDB_OT_TOTAL; t++) {7530const char *typename;75317532/* make sure nicenum has enough space */7533_Static_assert(sizeof (csize) >= NN_NUMBUF_SZ,7534"csize truncated");7535_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ,7536"lsize truncated");7537_Static_assert(sizeof (psize) >= NN_NUMBUF_SZ,7538"psize truncated");7539_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ,7540"asize truncated");7541_Static_assert(sizeof (avg) >= NN_NUMBUF_SZ,7542"avg truncated");7543_Static_assert(sizeof (gang) >= NN_NUMBUF_SZ,7544"gang truncated");75457546if (t < DMU_OT_NUMTYPES)7547typename = dmu_ot[t].ot_name;7548else7549typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];75507551if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) {7552(void) printf("%6s\t%5s\t%5s\t%5s"7553"\t%5s\t%5s\t%6s\t%s\n",7554"-",7555"-",7556"-",7557"-",7558"-",7559"-",7560"-",7561typename);7562continue;7563}75647565for (l = ZB_TOTAL - 1; l >= -1; l--) {7566level = (l == -1 ? 
ZB_TOTAL : l);7567zb = &zcb->zcb_type[level][t];75687569if (zb->zb_asize == 0)7570continue;75717572if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES &&7573(level > 0 || DMU_OT_IS_METADATA(t))) {7574mdstats->zb_count += zb->zb_count;7575mdstats->zb_lsize += zb->zb_lsize;7576mdstats->zb_psize += zb->zb_psize;7577mdstats->zb_asize += zb->zb_asize;7578mdstats->zb_gangs += zb->zb_gangs;7579}75807581if (dump_opt['b'] < 3 && level != ZB_TOTAL)7582continue;75837584if (level == 0 && zb->zb_asize ==7585zcb->zcb_type[ZB_TOTAL][t].zb_asize)7586continue;75877588zdb_nicenum(zb->zb_count, csize,7589sizeof (csize));7590zdb_nicenum(zb->zb_lsize, lsize,7591sizeof (lsize));7592zdb_nicenum(zb->zb_psize, psize,7593sizeof (psize));7594zdb_nicenum(zb->zb_asize, asize,7595sizeof (asize));7596zdb_nicenum(zb->zb_asize / zb->zb_count, avg,7597sizeof (avg));7598zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));75997600(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"7601"\t%5.2f\t%6.2f\t",7602csize, lsize, psize, asize, avg,7603(double)zb->zb_lsize / zb->zb_psize,7604100.0 * zb->zb_asize / tzb->zb_asize);76057606if (level == ZB_TOTAL)7607(void) printf("%s\n", typename);7608else7609(void) printf(" L%d %s\n",7610level, typename);76117612if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {7613(void) printf("\t number of ganged "7614"blocks: %s\n", gang);7615}76167617if (dump_opt['b'] >= 4) {7618(void) printf("psize "7619"(in 512-byte sectors): "7620"number of blocks\n");7621dump_histogram(zb->zb_psize_histogram,7622PSIZE_HISTO_SIZE, 0);7623}7624}7625}7626zdb_nicenum(mdstats->zb_count, csize,7627sizeof (csize));7628zdb_nicenum(mdstats->zb_lsize, lsize,7629sizeof (lsize));7630zdb_nicenum(mdstats->zb_psize, psize,7631sizeof (psize));7632zdb_nicenum(mdstats->zb_asize, asize,7633sizeof (asize));7634zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg,7635sizeof (avg));7636zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang));76377638(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"7639"\t%5.2f\t%6.2f\t",7640csize, lsize, psize, asize, 
avg,7641(double)mdstats->zb_lsize / mdstats->zb_psize,7642100.0 * mdstats->zb_asize / tzb->zb_asize);7643(void) printf("%s\n", "Metadata Total");76447645/* Output a table summarizing block sizes in the pool */7646if (dump_opt['b'] >= 2) {7647dump_size_histograms(zcb);7648}76497650umem_free(mdstats, sizeof (zfs_blkstat_t));7651}76527653(void) printf("\n");76547655if (leaks) {7656umem_free(zcb, sizeof (zdb_cb_t));7657return (2);7658}76597660if (zcb->zcb_haderrors) {7661umem_free(zcb, sizeof (zdb_cb_t));7662return (3);7663}76647665umem_free(zcb, sizeof (zdb_cb_t));7666return (0);7667}76687669typedef struct zdb_ddt_entry {7670/* key must be first for ddt_key_compare */7671ddt_key_t zdde_key;7672uint64_t zdde_ref_blocks;7673uint64_t zdde_ref_lsize;7674uint64_t zdde_ref_psize;7675uint64_t zdde_ref_dsize;7676avl_node_t zdde_node;7677} zdb_ddt_entry_t;76787679static int7680zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,7681const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)7682{7683(void) zilog, (void) dnp;7684avl_tree_t *t = arg;7685avl_index_t where;7686zdb_ddt_entry_t *zdde, zdde_search;76877688if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||7689BP_IS_EMBEDDED(bp))7690return (0);76917692if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {7693(void) printf("traversing objset %llu, %llu objects, "7694"%lu blocks so far\n",7695(u_longlong_t)zb->zb_objset,7696(u_longlong_t)BP_GET_FILL(bp),7697avl_numnodes(t));7698}76997700if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||7701BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))7702return (0);77037704ddt_key_fill(&zdde_search.zdde_key, bp);77057706zdde = avl_find(t, &zdde_search, &where);77077708if (zdde == NULL) {7709zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);7710zdde->zdde_key = zdde_search.zdde_key;7711avl_insert(t, zdde, where);7712}77137714zdde->zdde_ref_blocks += 1;7715zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);7716zdde->zdde_ref_psize += 
BP_GET_PSIZE(bp);7717zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);77187719return (0);7720}77217722static void7723dump_simulated_ddt(spa_t *spa)7724{7725avl_tree_t t;7726void *cookie = NULL;7727zdb_ddt_entry_t *zdde;7728ddt_histogram_t ddh_total = {{{0}}};7729ddt_stat_t dds_total = {0};77307731avl_create(&t, ddt_key_compare,7732sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));77337734spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);77357736(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |7737TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);77387739spa_config_exit(spa, SCL_CONFIG, FTAG);77407741while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {7742uint64_t refcnt = zdde->zdde_ref_blocks;7743ASSERT(refcnt != 0);77447745ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1];77467747dds->dds_blocks += zdde->zdde_ref_blocks / refcnt;7748dds->dds_lsize += zdde->zdde_ref_lsize / refcnt;7749dds->dds_psize += zdde->zdde_ref_psize / refcnt;7750dds->dds_dsize += zdde->zdde_ref_dsize / refcnt;77517752dds->dds_ref_blocks += zdde->zdde_ref_blocks;7753dds->dds_ref_lsize += zdde->zdde_ref_lsize;7754dds->dds_ref_psize += zdde->zdde_ref_psize;7755dds->dds_ref_dsize += zdde->zdde_ref_dsize;77567757umem_free(zdde, sizeof (*zdde));7758}77597760avl_destroy(&t);77617762ddt_histogram_total(&dds_total, &ddh_total);77637764(void) printf("Simulated DDT histogram:\n");77657766zpool_dump_ddt(&dds_total, &ddh_total);77677768dump_dedup_ratio(&dds_total);7769}77707771static int7772verify_device_removal_feature_counts(spa_t *spa)7773{7774uint64_t dr_feature_refcount = 0;7775uint64_t oc_feature_refcount = 0;7776uint64_t indirect_vdev_count = 0;7777uint64_t precise_vdev_count = 0;7778uint64_t obsolete_counts_object_count = 0;7779uint64_t obsolete_sm_count = 0;7780uint64_t obsolete_counts_count = 0;7781uint64_t scip_count = 0;7782uint64_t obsolete_bpobj_count = 0;7783int ret = 0;77847785spa_condensing_indirect_phys_t *scip 
=7786&spa->spa_condensing_indirect_phys;7787if (scip->scip_next_mapping_object != 0) {7788vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];7789ASSERT(scip->scip_prev_obsolete_sm_object != 0);7790ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);77917792(void) printf("Condensing indirect vdev %llu: new mapping "7793"object %llu, prev obsolete sm %llu\n",7794(u_longlong_t)scip->scip_vdev,7795(u_longlong_t)scip->scip_next_mapping_object,7796(u_longlong_t)scip->scip_prev_obsolete_sm_object);7797if (scip->scip_prev_obsolete_sm_object != 0) {7798space_map_t *prev_obsolete_sm = NULL;7799VERIFY0(space_map_open(&prev_obsolete_sm,7800spa->spa_meta_objset,7801scip->scip_prev_obsolete_sm_object,78020, vd->vdev_asize, 0));7803dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);7804(void) printf("\n");7805space_map_close(prev_obsolete_sm);7806}78077808scip_count += 2;7809}78107811for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {7812vdev_t *vd = spa->spa_root_vdev->vdev_child[i];7813vdev_indirect_config_t *vic = &vd->vdev_indirect_config;78147815if (vic->vic_mapping_object != 0) {7816ASSERT(vd->vdev_ops == &vdev_indirect_ops ||7817vd->vdev_removing);7818indirect_vdev_count++;78197820if (vd->vdev_indirect_mapping->vim_havecounts) {7821obsolete_counts_count++;7822}7823}78247825boolean_t are_precise;7826VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));7827if (are_precise) {7828ASSERT(vic->vic_mapping_object != 0);7829precise_vdev_count++;7830}78317832uint64_t obsolete_sm_object;7833VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));7834if (obsolete_sm_object != 0) {7835ASSERT(vic->vic_mapping_object != 0);7836obsolete_sm_count++;7837}7838}78397840(void) feature_get_refcount(spa,7841&spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],7842&dr_feature_refcount);7843(void) feature_get_refcount(spa,7844&spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],7845&oc_feature_refcount);78467847if (dr_feature_refcount != indirect_vdev_count) {7848ret = 
1;7849(void) printf("Number of indirect vdevs (%llu) " \7850"does not match feature count (%llu)\n",7851(u_longlong_t)indirect_vdev_count,7852(u_longlong_t)dr_feature_refcount);7853} else {7854(void) printf("Verified device_removal feature refcount " \7855"of %llu is correct\n",7856(u_longlong_t)dr_feature_refcount);7857}78587859if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,7860DMU_POOL_OBSOLETE_BPOBJ) == 0) {7861obsolete_bpobj_count++;7862}786378647865obsolete_counts_object_count = precise_vdev_count;7866obsolete_counts_object_count += obsolete_sm_count;7867obsolete_counts_object_count += obsolete_counts_count;7868obsolete_counts_object_count += scip_count;7869obsolete_counts_object_count += obsolete_bpobj_count;7870obsolete_counts_object_count += remap_deadlist_count;78717872if (oc_feature_refcount != obsolete_counts_object_count) {7873ret = 1;7874(void) printf("Number of obsolete counts objects (%llu) " \7875"does not match feature count (%llu)\n",7876(u_longlong_t)obsolete_counts_object_count,7877(u_longlong_t)oc_feature_refcount);7878(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "7879"ob:%llu rd:%llu\n",7880(u_longlong_t)precise_vdev_count,7881(u_longlong_t)obsolete_sm_count,7882(u_longlong_t)obsolete_counts_count,7883(u_longlong_t)scip_count,7884(u_longlong_t)obsolete_bpobj_count,7885(u_longlong_t)remap_deadlist_count);7886} else {7887(void) printf("Verified indirect_refcount feature refcount " \7888"of %llu is correct\n",7889(u_longlong_t)oc_feature_refcount);7890}7891return (ret);7892}78937894static void7895zdb_set_skip_mmp(char *target)7896{7897spa_t *spa;78987899/*7900* Disable the activity check to allow examination of7901* active pools.7902*/7903spa_namespace_enter(FTAG);7904if ((spa = spa_lookup(target)) != NULL) {7905spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;7906}7907spa_namespace_exit(FTAG);7908}79097910#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"7911/*7912* Import the checkpointed state of the pool specified by the 
target7913* parameter as readonly. The function also accepts a pool config7914* as an optional parameter, else it attempts to infer the config by7915* the name of the target pool.7916*7917* Note that the checkpointed state's pool name will be the name of7918* the original pool with the above suffix appended to it. In addition,7919* if the target is not a pool name (e.g. a path to a dataset) then7920* the new_path parameter is populated with the updated path to7921* reflect the fact that we are looking into the checkpointed state.7922*7923* The function returns a newly-allocated copy of the name of the7924* pool containing the checkpointed state. When this copy is no7925* longer needed it should be freed with free(3C). Same thing7926* applies to the new_path parameter if allocated.7927*/7928static char *7929import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa,7930char **new_path)7931{7932int error = 0;7933char *poolname, *bogus_name = NULL;7934boolean_t freecfg = B_FALSE;79357936/* If the target is not a pool, the extract the pool name */7937char *path_start = strchr(target, '/');7938if (target_is_spa || path_start == NULL) {7939poolname = target;7940} else {7941size_t poolname_len = path_start - target;7942poolname = strndup(target, poolname_len);7943}79447945if (cfg == NULL) {7946zdb_set_skip_mmp(poolname);7947error = spa_get_stats(poolname, &cfg, NULL, 0);7948if (error != 0) {7949fatal("Tried to read config of pool \"%s\" but "7950"spa_get_stats() failed with error %d\n",7951poolname, error);7952}7953freecfg = B_TRUE;7954}79557956if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {7957if (target != poolname)7958free(poolname);7959return (NULL);7960}7961fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);79627963error = spa_import(bogus_name, cfg, NULL,7964ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |7965ZFS_IMPORT_SKIP_MMP);7966if (freecfg)7967nvlist_free(cfg);7968if (error != 0) {7969fatal("Tried to import pool 
\"%s\" but spa_import() failed "7970"with error %d\n", bogus_name, error);7971}79727973if (new_path != NULL && !target_is_spa) {7974if (asprintf(new_path, "%s%s", bogus_name,7975path_start != NULL ? path_start : "") == -1) {7976free(bogus_name);7977if (!target_is_spa && path_start != NULL)7978free(poolname);7979return (NULL);7980}7981}79827983if (target != poolname)7984free(poolname);79857986return (bogus_name);7987}79887989typedef struct verify_checkpoint_sm_entry_cb_arg {7990vdev_t *vcsec_vd;79917992/* the following fields are only used for printing progress */7993uint64_t vcsec_entryid;7994uint64_t vcsec_num_entries;7995} verify_checkpoint_sm_entry_cb_arg_t;79967997#define ENTRIES_PER_PROGRESS_UPDATE 1000079987999static int8000verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)8001{8002verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;8003vdev_t *vd = vcsec->vcsec_vd;8004metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];8005uint64_t end = sme->sme_offset + sme->sme_run;80068007ASSERT(sme->sme_type == SM_FREE);80088009if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {8010(void) fprintf(stderr,8011"\rverifying vdev %llu, space map entry %llu of %llu ...",8012(longlong_t)vd->vdev_id,8013(longlong_t)vcsec->vcsec_entryid,8014(longlong_t)vcsec->vcsec_num_entries);8015}8016vcsec->vcsec_entryid++;80178018/*8019* See comment in checkpoint_sm_exclude_entry_cb()8020*/8021VERIFY3U(sme->sme_offset, >=, ms->ms_start);8022VERIFY3U(end, <=, ms->ms_start + ms->ms_size);80238024/*8025* The entries in the vdev_checkpoint_sm should be marked as8026* allocated in the checkpointed state of the pool, therefore8027* their respective ms_allocateable trees should not contain them.8028*/8029mutex_enter(&ms->ms_lock);8030zfs_range_tree_verify_not_present(ms->ms_allocatable,8031sme->sme_offset, sme->sme_run);8032mutex_exit(&ms->ms_lock);80338034return (0);8035}80368037/*8038* Verify that all segments in the vdev_checkpoint_sm are allocated8039* 
according to the checkpoint's ms_sm (i.e. are not in the checkpoint's8040* ms_allocatable).8041*8042* Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of8043* each vdev in the current state of the pool to the metaslab space maps8044* (ms_sm) of the checkpointed state of the pool.8045*8046* Note that the function changes the state of the ms_allocatable8047* trees of the current spa_t. The entries of these ms_allocatable8048* trees are cleared out and then repopulated from with the free8049* entries of their respective ms_sm space maps.8050*/8051static void8052verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)8053{8054vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;8055vdev_t *current_rvd = current->spa_root_vdev;80568057load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);80588059for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {8060vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];8061vdev_t *current_vd = current_rvd->vdev_child[c];80628063space_map_t *checkpoint_sm = NULL;8064uint64_t checkpoint_sm_obj;80658066if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {8067/*8068* Since we don't allow device removal in a pool8069* that has a checkpoint, we expect that all removed8070* vdevs were removed from the pool before the8071* checkpoint.8072*/8073ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);8074continue;8075}80768077/*8078* If the checkpoint space map doesn't exist, then nothing8079* here is checkpointed so there's nothing to verify.8080*/8081if (current_vd->vdev_top_zap == 0 ||8082zap_contains(spa_meta_objset(current),8083current_vd->vdev_top_zap,8084VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)8085continue;80868087VERIFY0(zap_lookup(spa_meta_objset(current),8088current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,8089sizeof (uint64_t), 1, &checkpoint_sm_obj));80908091VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),8092checkpoint_sm_obj, 0, 
current_vd->vdev_asize,8093current_vd->vdev_ashift));80948095verify_checkpoint_sm_entry_cb_arg_t vcsec;8096vcsec.vcsec_vd = ckpoint_vd;8097vcsec.vcsec_entryid = 0;8098vcsec.vcsec_num_entries =8099space_map_length(checkpoint_sm) / sizeof (uint64_t);8100VERIFY0(space_map_iterate(checkpoint_sm,8101space_map_length(checkpoint_sm),8102verify_checkpoint_sm_entry_cb, &vcsec));8103if (dump_opt['m'] > 3)8104dump_spacemap(current->spa_meta_objset, checkpoint_sm);8105space_map_close(checkpoint_sm);8106}81078108/*8109* If we've added vdevs since we took the checkpoint, ensure8110* that their checkpoint space maps are empty.8111*/8112if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {8113for (uint64_t c = ckpoint_rvd->vdev_children;8114c < current_rvd->vdev_children; c++) {8115vdev_t *current_vd = current_rvd->vdev_child[c];8116VERIFY0P(current_vd->vdev_checkpoint_sm);8117}8118}81198120/* for cleaner progress output */8121(void) fprintf(stderr, "\n");8122}81238124/*8125* Verifies that all space that's allocated in the checkpoint is8126* still allocated in the current version, by checking that everything8127* in checkpoint's ms_allocatable (which is actually allocated, not8128* allocatable/free) is not present in current's ms_allocatable.8129*8130* Note that the function changes the state of the ms_allocatable8131* trees of both spas when called. The entries of all ms_allocatable8132* trees are cleared out and then repopulated from their respective8133* ms_sm space maps. 
In the checkpointed state we load the allocated8134* entries, and in the current state we load the free entries.8135*/8136static void8137verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)8138{8139vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;8140vdev_t *current_rvd = current->spa_root_vdev;81418142load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);8143load_concrete_ms_allocatable_trees(current, SM_FREE);81448145for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {8146vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];8147vdev_t *current_vd = current_rvd->vdev_child[i];81488149if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {8150/*8151* See comment in verify_checkpoint_vdev_spacemaps()8152*/8153ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);8154continue;8155}81568157for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {8158metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];8159metaslab_t *current_msp = current_vd->vdev_ms[m];81608161(void) fprintf(stderr,8162"\rverifying vdev %llu of %llu, "8163"metaslab %llu of %llu ...",8164(longlong_t)current_vd->vdev_id,8165(longlong_t)current_rvd->vdev_children,8166(longlong_t)current_vd->vdev_ms[m]->ms_id,8167(longlong_t)current_vd->vdev_ms_count);81688169/*8170* We walk through the ms_allocatable trees that8171* are loaded with the allocated blocks from the8172* ms_sm spacemaps of the checkpoint. 
For each8173* one of these ranges we ensure that none of them8174* exists in the ms_allocatable trees of the8175* current state which are loaded with the ranges8176* that are currently free.8177*8178* This way we ensure that none of the blocks that8179* are part of the checkpoint were freed by mistake.8180*/8181zfs_range_tree_walk(ckpoint_msp->ms_allocatable,8182(zfs_range_tree_func_t *)8183zfs_range_tree_verify_not_present,8184current_msp->ms_allocatable);8185}8186}81878188/* for cleaner progress output */8189(void) fprintf(stderr, "\n");8190}81918192static void8193verify_checkpoint_blocks(spa_t *spa)8194{8195ASSERT(!dump_opt['L']);81968197spa_t *checkpoint_spa;8198char *checkpoint_pool;8199int error = 0;82008201/*8202* We import the checkpointed state of the pool (under a different8203* name) so we can do verification on it against the current state8204* of the pool.8205*/8206checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE,8207NULL);8208ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);82098210error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);8211if (error != 0) {8212fatal("Tried to open pool \"%s\" but spa_open() failed with "8213"error %d\n", checkpoint_pool, error);8214}82158216/*8217* Ensure that ranges in the checkpoint space maps of each vdev8218* are allocated according to the checkpointed state's metaslab8219* space maps.8220*/8221verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);82228223/*8224* Ensure that allocated ranges in the checkpoint's metaslab8225* space maps remain allocated in the metaslab space maps of8226* the current state.8227*/8228verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);82298230/*8231* Once we are done, we get rid of the checkpointed state.8232*/8233spa_close(checkpoint_spa, FTAG);8234free(checkpoint_pool);8235}82368237static void8238dump_leftover_checkpoint_blocks(spa_t *spa)8239{8240vdev_t *rvd = spa->spa_root_vdev;82418242for (uint64_t i = 0; i < rvd->vdev_children; i++) {8243vdev_t *vd = 
rvd->vdev_child[i];82448245space_map_t *checkpoint_sm = NULL;8246uint64_t checkpoint_sm_obj;82478248if (vd->vdev_top_zap == 0)8249continue;82508251if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,8252VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)8253continue;82548255VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,8256VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,8257sizeof (uint64_t), 1, &checkpoint_sm_obj));82588259VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),8260checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));8261dump_spacemap(spa->spa_meta_objset, checkpoint_sm);8262space_map_close(checkpoint_sm);8263}8264}82658266static int8267verify_checkpoint(spa_t *spa)8268{8269uberblock_t checkpoint;8270int error;82718272if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))8273return (0);82748275error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,8276DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),8277sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);82788279if (error == ENOENT && !dump_opt['L']) {8280/*8281* If the feature is active but the uberblock is missing8282* then we must be in the middle of discarding the8283* checkpoint.8284*/8285(void) printf("\nPartially discarded checkpoint "8286"state found:\n");8287if (dump_opt['m'] > 3)8288dump_leftover_checkpoint_blocks(spa);8289return (0);8290} else if (error != 0) {8291(void) printf("lookup error %d when looking for "8292"checkpointed uberblock in MOS\n", error);8293return (error);8294}8295dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");82968297if (checkpoint.ub_checkpoint_txg == 0) {8298(void) printf("\nub_checkpoint_txg not set in checkpointed "8299"uberblock\n");8300error = 3;8301}83028303if (error == 0 && !dump_opt['L'])8304verify_checkpoint_blocks(spa);83058306return (error);8307}83088309static void8310mos_leaks_cb(void *arg, uint64_t start, uint64_t size)8311{8312(void) arg;8313for (uint64_t i = start; i < size; i++) {8314(void) printf("MOS object 
%llu referenced but not allocated\n",8315(u_longlong_t)i);8316}8317}83188319static void8320mos_obj_refd(uint64_t obj)8321{8322if (obj != 0 && mos_refd_objs != NULL)8323zfs_range_tree_add(mos_refd_objs, obj, 1);8324}83258326/*8327* Call on a MOS object that may already have been referenced.8328*/8329static void8330mos_obj_refd_multiple(uint64_t obj)8331{8332if (obj != 0 && mos_refd_objs != NULL &&8333!zfs_range_tree_contains(mos_refd_objs, obj, 1))8334zfs_range_tree_add(mos_refd_objs, obj, 1);8335}83368337static void8338mos_leak_vdev_top_zap(vdev_t *vd)8339{8340uint64_t ms_flush_data_obj;8341int error = zap_lookup(spa_meta_objset(vd->vdev_spa),8342vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,8343sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);8344if (error == ENOENT)8345return;8346ASSERT0(error);83478348mos_obj_refd(ms_flush_data_obj);8349}83508351static void8352mos_leak_vdev(vdev_t *vd)8353{8354mos_obj_refd(vd->vdev_dtl_object);8355mos_obj_refd(vd->vdev_ms_array);8356mos_obj_refd(vd->vdev_indirect_config.vic_births_object);8357mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);8358mos_obj_refd(vd->vdev_leaf_zap);8359if (vd->vdev_checkpoint_sm != NULL)8360mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);8361if (vd->vdev_indirect_mapping != NULL) {8362mos_obj_refd(vd->vdev_indirect_mapping->8363vim_phys->vimp_counts_object);8364}8365if (vd->vdev_obsolete_sm != NULL)8366mos_obj_refd(vd->vdev_obsolete_sm->sm_object);83678368for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {8369metaslab_t *ms = vd->vdev_ms[m];8370mos_obj_refd(space_map_object(ms->ms_sm));8371}83728373if (vd->vdev_root_zap != 0)8374mos_obj_refd(vd->vdev_root_zap);83758376if (vd->vdev_top_zap != 0) {8377mos_obj_refd(vd->vdev_top_zap);8378mos_leak_vdev_top_zap(vd);8379}83808381for (uint64_t c = 0; c < vd->vdev_children; c++) {8382mos_leak_vdev(vd->vdev_child[c]);8383}8384}83858386static void8387mos_leak_log_spacemaps(spa_t *spa)8388{8389uint64_t spacemap_zap;8390int error = 
zap_lookup(spa_meta_objset(spa),8391DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,8392sizeof (spacemap_zap), 1, &spacemap_zap);8393if (error == ENOENT)8394return;8395ASSERT0(error);83968397mos_obj_refd(spacemap_zap);8398for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);8399sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))8400mos_obj_refd(sls->sls_sm_obj);8401}84028403static void8404errorlog_count_refd(objset_t *mos, uint64_t errlog)8405{8406zap_cursor_t zc;8407zap_attribute_t *za = zap_attribute_alloc();8408for (zap_cursor_init(&zc, mos, errlog);8409zap_cursor_retrieve(&zc, za) == 0;8410zap_cursor_advance(&zc)) {8411mos_obj_refd(za->za_first_integer);8412}8413zap_cursor_fini(&zc);8414zap_attribute_free(za);8415}84168417static int8418dump_mos_leaks(spa_t *spa)8419{8420int rv = 0;8421objset_t *mos = spa->spa_meta_objset;8422dsl_pool_t *dp = spa->spa_dsl_pool;84238424/* Visit and mark all referenced objects in the MOS */84258426mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);8427mos_obj_refd(spa->spa_pool_props_object);8428mos_obj_refd(spa->spa_config_object);8429mos_obj_refd(spa->spa_ddt_stat_object);8430mos_obj_refd(spa->spa_feat_desc_obj);8431mos_obj_refd(spa->spa_feat_enabled_txg_obj);8432mos_obj_refd(spa->spa_feat_for_read_obj);8433mos_obj_refd(spa->spa_feat_for_write_obj);8434mos_obj_refd(spa->spa_history);8435mos_obj_refd(spa->spa_errlog_last);8436mos_obj_refd(spa->spa_errlog_scrub);84378438if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {8439errorlog_count_refd(mos, spa->spa_errlog_last);8440errorlog_count_refd(mos, 
spa->spa_errlog_scrub);8441}84428443mos_obj_refd(spa->spa_all_vdev_zaps);8444mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);8445mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);8446mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);8447bpobj_count_refd(&spa->spa_deferred_bpobj);8448mos_obj_refd(dp->dp_empty_bpobj);8449bpobj_count_refd(&dp->dp_obsolete_bpobj);8450bpobj_count_refd(&dp->dp_free_bpobj);8451mos_obj_refd(spa->spa_l2cache.sav_object);8452mos_obj_refd(spa->spa_spares.sav_object);84538454if (spa->spa_syncing_log_sm != NULL)8455mos_obj_refd(spa->spa_syncing_log_sm->sm_object);8456mos_leak_log_spacemaps(spa);84578458mos_obj_refd(spa->spa_condensing_indirect_phys.8459scip_next_mapping_object);8460mos_obj_refd(spa->spa_condensing_indirect_phys.8461scip_prev_obsolete_sm_object);8462if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {8463vdev_indirect_mapping_t *vim =8464vdev_indirect_mapping_open(mos,8465spa->spa_condensing_indirect_phys.scip_next_mapping_object);8466mos_obj_refd(vim->vim_phys->vimp_counts_object);8467vdev_indirect_mapping_close(vim);8468}8469deleted_livelists_dump_mos(spa);84708471if (dp->dp_origin_snap != NULL) {8472dsl_dataset_t *ds;84738474dsl_pool_config_enter(dp, FTAG);8475VERIFY0(dsl_dataset_hold_obj(dp,8476dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,8477FTAG, &ds));8478count_ds_mos_objects(ds);8479dump_blkptr_list(&ds->ds_deadlist, "Deadlist");8480dsl_dataset_rele(ds, FTAG);8481dsl_pool_config_exit(dp, FTAG);84828483count_ds_mos_objects(dp->dp_origin_snap);8484dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");8485}8486count_dir_mos_objects(dp->dp_mos_dir);8487if (dp->dp_free_dir != NULL)8488count_dir_mos_objects(dp->dp_free_dir);8489if (dp->dp_leak_dir != NULL)8490count_dir_mos_objects(dp->dp_leak_dir);84918492mos_leak_vdev(spa->spa_root_vdev);84938494for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {8495ddt_t *ddt = spa->spa_ddt[c];8496if (!ddt || ddt->ddt_version == 
DDT_VERSION_UNCONFIGURED)8497continue;84988499/* DDT store objects */8500for (ddt_type_t type = 0; type < DDT_TYPES; type++) {8501for (ddt_class_t class = 0; class < DDT_CLASSES;8502class++) {8503mos_obj_refd(ddt->ddt_object[type][class]);8504}8505}85068507/* FDT container */8508if (ddt->ddt_version == DDT_VERSION_FDT)8509mos_obj_refd(ddt->ddt_dir_object);85108511/* FDT log objects */8512if (ddt->ddt_flags & DDT_FLAG_LOG) {8513mos_obj_refd(ddt->ddt_log[0].ddl_object);8514mos_obj_refd(ddt->ddt_log[1].ddl_object);8515}8516}85178518for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {8519brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];8520if (brtvd->bv_initiated) {8521mos_obj_refd(brtvd->bv_mos_brtvdev);8522mos_obj_refd(brtvd->bv_mos_entries);8523}8524}85258526/*8527* Visit all allocated objects and make sure they are referenced.8528*/8529uint64_t object = 0;8530while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {8531if (zfs_range_tree_contains(mos_refd_objs, object, 1)) {8532zfs_range_tree_remove(mos_refd_objs, object, 1);8533} else {8534dmu_object_info_t doi;8535const char *name;8536VERIFY0(dmu_object_info(mos, object, &doi));8537if (doi.doi_type & DMU_OT_NEWTYPE) {8538dmu_object_byteswap_t bswap =8539DMU_OT_BYTESWAP(doi.doi_type);8540name = dmu_ot_byteswap[bswap].ob_name;8541} else {8542name = dmu_ot[doi.doi_type].ot_name;8543}85448545(void) printf("MOS object %llu (%s) leaked\n",8546(u_longlong_t)object, name);8547rv = 2;8548}8549}8550(void) zfs_range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);8551if (!zfs_range_tree_is_empty(mos_refd_objs))8552rv = 2;8553zfs_range_tree_vacate(mos_refd_objs, NULL, NULL);8554zfs_range_tree_destroy(mos_refd_objs);8555return (rv);8556}85578558typedef struct log_sm_obsolete_stats_arg {8559uint64_t lsos_current_txg;85608561uint64_t lsos_total_entries;8562uint64_t lsos_valid_entries;85638564uint64_t lsos_sm_entries;8565uint64_t lsos_valid_sm_entries;8566} log_sm_obsolete_stats_arg_t;85678568static 
int8569log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,8570uint64_t txg, void *arg)8571{8572log_sm_obsolete_stats_arg_t *lsos = arg;85738574uint64_t offset = sme->sme_offset;8575uint64_t vdev_id = sme->sme_vdev;85768577if (lsos->lsos_current_txg == 0) {8578/* this is the first log */8579lsos->lsos_current_txg = txg;8580} else if (lsos->lsos_current_txg < txg) {8581/* we just changed log - print stats and reset */8582(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",8583(u_longlong_t)lsos->lsos_valid_sm_entries,8584(u_longlong_t)lsos->lsos_sm_entries,8585(u_longlong_t)lsos->lsos_current_txg);8586lsos->lsos_valid_sm_entries = 0;8587lsos->lsos_sm_entries = 0;8588lsos->lsos_current_txg = txg;8589}8590ASSERT3U(lsos->lsos_current_txg, ==, txg);85918592lsos->lsos_sm_entries++;8593lsos->lsos_total_entries++;85948595vdev_t *vd = vdev_lookup_top(spa, vdev_id);8596if (!vdev_is_concrete(vd))8597return (0);85988599metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];8600ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);86018602if (txg < metaslab_unflushed_txg(ms))8603return (0);8604lsos->lsos_valid_sm_entries++;8605lsos->lsos_valid_entries++;8606return (0);8607}86088609static void8610dump_log_spacemap_obsolete_stats(spa_t *spa)8611{8612if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))8613return;86148615log_sm_obsolete_stats_arg_t lsos = {0};86168617(void) printf("Log Space Map Obsolete Entry Statistics:\n");86188619iterate_through_spacemap_logs(spa,8620log_spacemap_obsolete_stats_cb, &lsos);86218622/* print stats for latest log */8623(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",8624(u_longlong_t)lsos.lsos_valid_sm_entries,8625(u_longlong_t)lsos.lsos_sm_entries,8626(u_longlong_t)lsos.lsos_current_txg);86278628(void) printf("%-8llu valid entries out of %-8llu - total\n\n",8629(u_longlong_t)lsos.lsos_valid_entries,8630(u_longlong_t)lsos.lsos_total_entries);8631}86328633static void8634dump_zpool(spa_t 
*spa)8635{8636dsl_pool_t *dp = spa_get_dsl(spa);8637int rc = 0;86388639if (dump_opt['y']) {8640livelist_metaslab_validate(spa);8641}86428643if (dump_opt['S']) {8644dump_simulated_ddt(spa);8645return;8646}86478648if (!dump_opt['e'] && dump_opt['C'] > 1) {8649(void) printf("\nCached configuration:\n");8650dump_nvlist(spa->spa_config, 8);8651}86528653if (dump_opt['C'])8654dump_config(spa);86558656if (dump_opt['u'])8657dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");86588659if (dump_opt['D'])8660dump_all_ddts(spa);86618662if (dump_opt['T'])8663dump_brt(spa);86648665if (dump_opt['d'] > 2 || dump_opt['m'])8666dump_metaslabs(spa);8667if (dump_opt['M'])8668dump_metaslab_groups(spa, dump_opt['M'] > 1);8669if (dump_opt['d'] > 2 || dump_opt['m']) {8670dump_log_spacemaps(spa);8671dump_log_spacemap_obsolete_stats(spa);8672}86738674if (dump_opt['d'] || dump_opt['i']) {8675spa_feature_t f;8676mos_refd_objs = zfs_range_tree_create_flags(8677NULL, ZFS_RANGE_SEG64, NULL, 0, 0,86780, "dump_zpool:mos_refd_objs");8679dump_objset(dp->dp_meta_objset);86808681if (dump_opt['d'] >= 3) {8682dsl_pool_t *dp = spa->spa_dsl_pool;8683dump_full_bpobj(&spa->spa_deferred_bpobj,8684"Deferred frees", 0);8685if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {8686dump_full_bpobj(&dp->dp_free_bpobj,8687"Pool snapshot frees", 0);8688}8689if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {8690ASSERT(spa_feature_is_enabled(spa,8691SPA_FEATURE_DEVICE_REMOVAL));8692dump_full_bpobj(&dp->dp_obsolete_bpobj,8693"Pool obsolete blocks", 0);8694}86958696if (spa_feature_is_active(spa,8697SPA_FEATURE_ASYNC_DESTROY)) {8698dump_bptree(spa->spa_meta_objset,8699dp->dp_bptree_obj,8700"Pool dataset frees");8701}8702dump_dtl(spa->spa_root_vdev, 0);8703}87048705for (spa_feature_t f = 0; f < SPA_FEATURES; f++)8706global_feature_count[f] = UINT64_MAX;8707global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;8708global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 
0;8709global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;8710global_feature_count[SPA_FEATURE_LIVELIST] = 0;87118712(void) dmu_objset_find(spa_name(spa), dump_one_objset,8713NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);87148715if (rc == 0 && !dump_opt['L'])8716rc = dump_mos_leaks(spa);87178718for (f = 0; f < SPA_FEATURES; f++) {8719uint64_t refcount;87208721uint64_t *arr;8722if (!(spa_feature_table[f].fi_flags &8723ZFEATURE_FLAG_PER_DATASET)) {8724if (global_feature_count[f] == UINT64_MAX)8725continue;8726if (!spa_feature_is_enabled(spa, f)) {8727ASSERT0(global_feature_count[f]);8728continue;8729}8730arr = global_feature_count;8731} else {8732if (!spa_feature_is_enabled(spa, f)) {8733ASSERT0(dataset_feature_count[f]);8734continue;8735}8736arr = dataset_feature_count;8737}8738if (feature_get_refcount(spa, &spa_feature_table[f],8739&refcount) == ENOTSUP)8740continue;8741if (arr[f] != refcount) {8742(void) printf("%s feature refcount mismatch: "8743"%lld consumers != %lld refcount\n",8744spa_feature_table[f].fi_uname,8745(longlong_t)arr[f], (longlong_t)refcount);8746rc = 2;8747} else {8748(void) printf("Verified %s feature refcount "8749"of %llu is correct\n",8750spa_feature_table[f].fi_uname,8751(longlong_t)refcount);8752}8753}87548755if (rc == 0)8756rc = verify_device_removal_feature_counts(spa);8757}87588759if (rc == 0 && (dump_opt['b'] || dump_opt['c']))8760rc = dump_block_stats(spa);87618762if (rc == 0)8763rc = verify_spacemap_refcounts(spa);87648765if (dump_opt['s'])8766show_pool_stats(spa);87678768if (dump_opt['h'])8769dump_history(spa);87708771if (rc == 0)8772rc = verify_checkpoint(spa);87738774if (rc != 0) {8775dump_debug_buffer();8776zdb_exit(rc);8777}8778}87798780#define ZDB_FLAG_CHECKSUM 0x00018781#define ZDB_FLAG_DECOMPRESS 0x00028782#define ZDB_FLAG_BSWAP 0x00048783#define ZDB_FLAG_GBH 0x00088784#define ZDB_FLAG_INDIRECT 0x00108785#define ZDB_FLAG_RAW 0x00208786#define ZDB_FLAG_PRINT_BLKPTR 0x00408787#define ZDB_FLAG_VERBOSE 0x008087888789static int 
flagbits[256];8790static char flagbitstr[16];87918792static void8793zdb_print_blkptr(const blkptr_t *bp, int flags)8794{8795char blkbuf[BP_SPRINTF_LEN];87968797if (flags & ZDB_FLAG_BSWAP)8798byteswap_uint64_array((void *)bp, sizeof (blkptr_t));87998800snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);8801(void) printf("%s\n", blkbuf);8802}88038804static void8805zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)8806{8807int i;88088809for (i = 0; i < nbps; i++)8810zdb_print_blkptr(&bp[i], flags);8811}88128813static void8814zdb_dump_gbh(void *buf, uint64_t size, int flags)8815{8816zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags);8817}88188819static void8820zdb_dump_block_raw(void *buf, uint64_t size, int flags)8821{8822if (flags & ZDB_FLAG_BSWAP)8823byteswap_uint64_array(buf, size);8824VERIFY(write(fileno(stdout), buf, size) == size);8825}88268827static void8828zdb_dump_block(char *label, void *buf, uint64_t size, int flags)8829{8830uint64_t *d = (uint64_t *)buf;8831unsigned nwords = size / sizeof (uint64_t);8832int do_bswap = !!(flags & ZDB_FLAG_BSWAP);8833unsigned i, j;8834const char *hdr;8835char *c;883688378838if (do_bswap)8839hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";8840else8841hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";88428843(void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);88448845#ifdef _ZFS_LITTLE_ENDIAN8846/* correct the endianness */8847do_bswap = !do_bswap;8848#endif8849for (i = 0; i < nwords; i += 2) {8850(void) printf("%06llx: %016llx %016llx ",8851(u_longlong_t)(i * sizeof (uint64_t)),8852(u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),8853(u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));88548855c = (char *)&d[i];8856for (j = 0; j < 2 * sizeof (uint64_t); j++)8857(void) printf("%c", isprint(c[j]) ? 
c[j] : '.');8858(void) printf("\n");8859}8860}88618862/*8863* There are two acceptable formats:8864* leaf_name - For example: c1t0d0 or /tmp/ztest.0a8865* child[.child]* - For example: 0.1.18866*8867* The second form can be used to specify arbitrary vdevs anywhere8868* in the hierarchy. For example, in a pool with a mirror of8869* RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .8870*/8871static vdev_t *8872zdb_vdev_lookup(vdev_t *vdev, const char *path)8873{8874char *s, *p, *q;8875unsigned i;88768877if (vdev == NULL)8878return (NULL);88798880/* First, assume the x.x.x.x format */8881i = strtoul(path, &s, 10);8882if (s == path || (s && *s != '.' && *s != '\0'))8883goto name;8884if (i >= vdev->vdev_children)8885return (NULL);88868887vdev = vdev->vdev_child[i];8888if (s && *s == '\0')8889return (vdev);8890return (zdb_vdev_lookup(vdev, s+1));88918892name:8893for (i = 0; i < vdev->vdev_children; i++) {8894vdev_t *vc = vdev->vdev_child[i];88958896if (vc->vdev_path == NULL) {8897vc = zdb_vdev_lookup(vc, path);8898if (vc == NULL)8899continue;8900else8901return (vc);8902}89038904p = strrchr(vc->vdev_path, '/');8905p = p ? 
p + 1 : vc->vdev_path;8906q = &vc->vdev_path[strlen(vc->vdev_path) - 2];89078908if (strcmp(vc->vdev_path, path) == 0)8909return (vc);8910if (strcmp(p, path) == 0)8911return (vc);8912if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)8913return (vc);8914}89158916return (NULL);8917}89188919static int8920name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)8921{8922dsl_dataset_t *ds;89238924dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);8925int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,8926NULL, &ds);8927if (error != 0) {8928(void) fprintf(stderr, "failed to hold objset %llu: %s\n",8929(u_longlong_t)objset_id, strerror(error));8930dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);8931return (error);8932}8933dsl_dataset_name(ds, outstr);8934dsl_dataset_rele(ds, NULL);8935dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);8936return (0);8937}89388939static boolean_t8940zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)8941{8942char *s0, *s1, *tmp = NULL;89438944if (sizes == NULL)8945return (B_FALSE);89468947s0 = strtok_r(sizes, "/", &tmp);8948if (s0 == NULL)8949return (B_FALSE);8950s1 = strtok_r(NULL, "/", &tmp);8951*lsize = strtoull(s0, NULL, 16);8952*psize = s1 ? strtoull(s1, NULL, 16) : *lsize;8953return (*lsize >= *psize && *psize > 0);8954}89558956#define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg))89578958static boolean_t8959try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,8960int flags, int cfunc, void *lbuf, void *lbuf2)8961{8962if (flags & ZDB_FLAG_VERBOSE) {8963(void) fprintf(stderr,8964"Trying %05llx -> %05llx (%s)\n",8965(u_longlong_t)psize,8966(u_longlong_t)lsize,8967zio_compress_table[cfunc].ci_name);8968}89698970/*8971* We set lbuf to all zeros and lbuf2 to all8972* ones, then decompress to both buffers and8973* compare their contents. 
This way we can8974* know if decompression filled exactly to8975* lsize or if it left some bytes unwritten.8976*/89778978memset(lbuf, 0x00, lsize);8979memset(lbuf2, 0xff, lsize);89808981abd_t labd, labd2;8982abd_get_from_buf_struct(&labd, lbuf, lsize);8983abd_get_from_buf_struct(&labd2, lbuf2, lsize);89848985boolean_t ret = B_FALSE;8986if (zio_decompress_data(cfunc, pabd,8987&labd, psize, lsize, NULL) == 0 &&8988zio_decompress_data(cfunc, pabd,8989&labd2, psize, lsize, NULL) == 0 &&8990memcmp(lbuf, lbuf2, lsize) == 0)8991ret = B_TRUE;89928993abd_free(&labd2);8994abd_free(&labd);89958996return (ret);8997}89988999static uint64_t9000zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,9001uint64_t psize, int flags)9002{9003(void) buf;9004uint64_t orig_lsize = lsize;9005boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL));9006/*9007* We don't know how the data was compressed, so just try9008* every decompress function at every inflated blocksize.9009*/9010void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);9011int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };9012int *cfuncp = cfuncs;9013uint64_t maxlsize = SPA_MAXBLOCKSIZE;9014uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |9015ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |9016ZIO_COMPRESS_MASK(ZLE);9017*cfuncp++ = ZIO_COMPRESS_LZ4;9018*cfuncp++ = ZIO_COMPRESS_LZJB;9019mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);9020/*9021* Every gzip level has the same decompressor, no need to9022* run it 9 times per bruteforce attempt.9023*/9024mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3);9025mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5);9026mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7);9027mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9);9028for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)9029if (((1ULL << c) & mask) == 0)9030*cfuncp++ = c;90319032/*9033* On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this9034* could 
take a while and we should let the user know9035* we are not stuck. On the other hand, printing progress9036* info gets old after a while. User can specify 'v' flag9037* to see the progression.9038*/9039if (lsize == psize)9040lsize += SPA_MINBLOCKSIZE;9041else9042maxlsize = lsize;90439044for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {9045for (cfuncp = cfuncs; *cfuncp; cfuncp++) {9046if (try_decompress_block(pabd, lsize, psize, flags,9047*cfuncp, lbuf, lbuf2)) {9048tryzle = B_FALSE;9049break;9050}9051}9052if (*cfuncp != 0)9053break;9054}9055if (tryzle) {9056for (lsize = orig_lsize; lsize <= maxlsize;9057lsize += SPA_MINBLOCKSIZE) {9058if (try_decompress_block(pabd, lsize, psize, flags,9059ZIO_COMPRESS_ZLE, lbuf, lbuf2)) {9060*cfuncp = ZIO_COMPRESS_ZLE;9061break;9062}9063}9064}9065umem_free(lbuf2, SPA_MAXBLOCKSIZE);90669067if (*cfuncp == ZIO_COMPRESS_ZLE) {9068printf("\nZLE decompression was selected. If you "9069"suspect the results are wrong,\ntry avoiding ZLE "9070"by setting and exporting ZDB_NO_ZLE=\"true\"\n");9071}90729073return (lsize > maxlsize ? -1 : lsize);9074}90759076/*9077* Read a block from a pool and print it out. 
The syntax of the9078* block descriptor is:9079*9080* pool:vdev_specifier:offset:[lsize/]psize[:flags]9081*9082* pool - The name of the pool you wish to read from9083* vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)9084* offset - offset, in hex, in bytes9085* size - Amount of data to read, in hex, in bytes9086* flags - A string of characters specifying options9087* b: Decode a blkptr at given offset within block9088* c: Calculate and display checksums9089* d: Decompress data before dumping9090* e: Byteswap data before dumping9091* g: Display data as a gang block header9092* i: Display as an indirect block9093* r: Dump raw data to stdout9094* v: Verbose9095*9096*/9097static void9098zdb_read_block(char *thing, spa_t *spa)9099{9100blkptr_t blk, *bp = &blk;9101dva_t *dva = bp->blk_dva;9102int flags = 0;9103uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;9104zio_t *zio;9105vdev_t *vd;9106abd_t *pabd;9107void *lbuf, *buf;9108char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL;9109const char *vdev, *errmsg = NULL;9110int i, len, error;9111boolean_t borrowed = B_FALSE, found = B_FALSE;91129113dup = strdup(thing);9114s = strtok_r(dup, ":", &tmp);9115vdev = s ?: "";9116s = strtok_r(NULL, ":", &tmp);9117offset = strtoull(s ? 
s : "", NULL, 16);9118sizes = strtok_r(NULL, ":", &tmp);9119s = strtok_r(NULL, ":", &tmp);9120flagstr = strdup(s ?: "");91219122if (!zdb_parse_block_sizes(sizes, &lsize, &psize))9123errmsg = "invalid size(s)";9124if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))9125errmsg = "size must be a multiple of sector size";9126if (!IS_P2ALIGNED(offset, DEV_BSIZE))9127errmsg = "offset must be a multiple of sector size";9128if (errmsg) {9129(void) printf("Invalid block specifier: %s - %s\n",9130thing, errmsg);9131goto done;9132}91339134tmp = NULL;9135for (s = strtok_r(flagstr, ":", &tmp);9136s != NULL;9137s = strtok_r(NULL, ":", &tmp)) {9138len = strlen(flagstr);9139for (i = 0; i < len; i++) {9140int bit = flagbits[(uchar_t)flagstr[i]];91419142if (bit == 0) {9143(void) printf("***Ignoring flag: %c\n",9144(uchar_t)flagstr[i]);9145continue;9146}9147found = B_TRUE;9148flags |= bit;91499150p = &flagstr[i + 1];9151if (*p != ':' && *p != '\0') {9152int j = 0, nextbit = flagbits[(uchar_t)*p];9153char *end, offstr[8] = { 0 };9154if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&9155(nextbit == 0)) {9156/* look ahead to isolate the offset */9157while (nextbit == 0 &&9158strchr(flagbitstr, *p) == NULL) {9159offstr[j] = *p;9160j++;9161if (i + j > strlen(flagstr))9162break;9163p++;9164nextbit = flagbits[(uchar_t)*p];9165}9166blkptr_offset = strtoull(offstr, &end,916716);9168i += j;9169} else if (nextbit == 0) {9170(void) printf("***Ignoring flag arg:"9171" '%c'\n", (uchar_t)*p);9172}9173}9174}9175}9176if (blkptr_offset % sizeof (blkptr_t)) {9177printf("Block pointer offset 0x%llx "9178"must be divisible by 0x%x\n",9179(longlong_t)blkptr_offset, (int)sizeof (blkptr_t));9180goto done;9181}9182if (found == B_FALSE && strlen(flagstr) > 0) {9183printf("Invalid flag arg: '%s'\n", flagstr);9184goto done;9185}91869187vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);9188if (vd == NULL) {9189(void) printf("***Invalid vdev: %s\n", vdev);9190goto done;9191} else {9192if 
(vd->vdev_path)9193(void) fprintf(stderr, "Found vdev: %s\n",9194vd->vdev_path);9195else9196(void) fprintf(stderr, "Found vdev type: %s\n",9197vd->vdev_ops->vdev_op_type);9198}91999200pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);9201lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);92029203BP_ZERO(bp);92049205DVA_SET_VDEV(&dva[0], vd->vdev_id);9206DVA_SET_OFFSET(&dva[0], offset);9207DVA_SET_GANG(&dva[0], 0);9208DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));92099210BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);92119212BP_SET_LSIZE(bp, lsize);9213BP_SET_PSIZE(bp, psize);9214BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);9215BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);9216BP_SET_TYPE(bp, DMU_OT_NONE);9217BP_SET_LEVEL(bp, 0);9218BP_SET_DEDUP(bp, 0);9219BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);92209221spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);9222zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);92239224if (vd == vd->vdev_top) {9225/*9226* Treat this as a normal block read.9227*/9228zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,9229ZIO_PRIORITY_SYNC_READ,9230ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));9231} else {9232/*9233* Treat this as a vdev child I/O.9234*/9235zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,9236psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,9237ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |9238ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,9239NULL, NULL));9240}92419242error = zio_wait(zio);9243spa_config_exit(spa, SCL_STATE, FTAG);92449245if (error) {9246(void) printf("Read of %s failed, error: %d\n", thing, error);9247goto out;9248}92499250uint64_t orig_lsize = lsize;9251buf = lbuf;9252if (flags & ZDB_FLAG_DECOMPRESS) {9253lsize = zdb_decompress_block(pabd, buf, lbuf,9254lsize, psize, flags);9255if (lsize == -1) {9256(void) printf("Decompress of %s failed\n", thing);9257goto out;9258}9259} else {9260buf = abd_borrow_buf_copy(pabd, lsize);9261borrowed = B_TRUE;9262}9263/*9264* Try to detect invalid block pointer. 
If invalid, try9265* decompressing.9266*/9267if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&9268!(flags & ZDB_FLAG_DECOMPRESS)) {9269const blkptr_t *b = (const blkptr_t *)(void *)9270((uintptr_t)buf + (uintptr_t)blkptr_offset);9271if (zfs_blkptr_verify(spa, b,9272BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) {9273abd_return_buf_copy(pabd, buf, lsize);9274borrowed = B_FALSE;9275buf = lbuf;9276lsize = zdb_decompress_block(pabd, buf,9277lbuf, lsize, psize, flags);9278b = (const blkptr_t *)(void *)9279((uintptr_t)buf + (uintptr_t)blkptr_offset);9280if (lsize == -1 || zfs_blkptr_verify(spa, b,9281BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {9282printf("invalid block pointer at this DVA\n");9283goto out;9284}9285}9286}92879288if (flags & ZDB_FLAG_PRINT_BLKPTR)9289zdb_print_blkptr((blkptr_t *)(void *)9290((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);9291else if (flags & ZDB_FLAG_RAW)9292zdb_dump_block_raw(buf, lsize, flags);9293else if (flags & ZDB_FLAG_INDIRECT)9294zdb_dump_indirect((blkptr_t *)buf,9295orig_lsize / sizeof (blkptr_t), flags);9296else if (flags & ZDB_FLAG_GBH)9297zdb_dump_gbh(buf, lsize, flags);9298else9299zdb_dump_block(thing, buf, lsize, flags);93009301/*9302* If :c was specified, iterate through the checksum table to9303* calculate and display each checksum for our specified9304* DVA and length.9305*/9306if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&9307!(flags & ZDB_FLAG_GBH)) {9308zio_t *czio;9309(void) printf("\n");9310for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;9311ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {93129313if ((zio_checksum_table[ck].ci_flags &9314ZCHECKSUM_FLAG_EMBEDDED) ||9315ck == ZIO_CHECKSUM_NOPARITY) {9316continue;9317}9318BP_SET_CHECKSUM(bp, ck);9319spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);9320czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);9321if (vd == vd->vdev_top) {9322zio_nowait(zio_read(czio, spa, bp, pabd, psize,9323NULL, NULL,9324ZIO_PRIORITY_SYNC_READ,9325ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW 
|9326ZIO_FLAG_DONT_RETRY, NULL));9327} else {9328zio_nowait(zio_vdev_child_io(czio, bp, vd,9329offset, pabd, psize, ZIO_TYPE_READ,9330ZIO_PRIORITY_SYNC_READ,9331ZIO_FLAG_DONT_PROPAGATE |9332ZIO_FLAG_DONT_RETRY |9333ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |9334ZIO_FLAG_SPECULATIVE |9335ZIO_FLAG_OPTIONAL, NULL, NULL));9336}9337error = zio_wait(czio);9338if (error == 0 || error == ECKSUM) {9339zio_t *ck_zio = zio_null(NULL, spa, NULL,9340NULL, NULL, 0);9341ck_zio->io_offset =9342DVA_GET_OFFSET(&bp->blk_dva[0]);9343ck_zio->io_bp = bp;9344zio_checksum_compute(ck_zio, ck, pabd, psize);9345printf(9346"%12s\t"9347"cksum=%016llx:%016llx:%016llx:%016llx\n",9348zio_checksum_table[ck].ci_name,9349(u_longlong_t)bp->blk_cksum.zc_word[0],9350(u_longlong_t)bp->blk_cksum.zc_word[1],9351(u_longlong_t)bp->blk_cksum.zc_word[2],9352(u_longlong_t)bp->blk_cksum.zc_word[3]);9353zio_wait(ck_zio);9354} else {9355printf("error %d reading block\n", error);9356}9357spa_config_exit(spa, SCL_STATE, FTAG);9358}9359}93609361if (borrowed)9362abd_return_buf_copy(pabd, buf, lsize);93639364out:9365abd_free(pabd);9366umem_free(lbuf, SPA_MAXBLOCKSIZE);9367done:9368free(flagstr);9369free(dup);9370}93719372static void9373zdb_embedded_block(char *thing)9374{9375blkptr_t bp = {{{{0}}}};9376unsigned long long *words = (void *)&bp;9377char *buf;9378int err;93799380err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"9381"%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",9382words + 0, words + 1, words + 2, words + 3,9383words + 4, words + 5, words + 6, words + 7,9384words + 8, words + 9, words + 10, words + 11,9385words + 12, words + 13, words + 14, words + 15);9386if (err != 16) {9387(void) fprintf(stderr, "invalid input format\n");9388zdb_exit(1);9389}9390ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);9391buf = malloc(SPA_MAXBLOCKSIZE);9392if (buf == NULL) {9393(void) fprintf(stderr, "out of memory\n");9394zdb_exit(1);9395}9396err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));9397if (err != 0) {9398(void) 
fprintf(stderr, "decode failed: %u\n", err);9399zdb_exit(1);9400}9401zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);9402free(buf);9403}94049405/* check for valid hex or decimal numeric string */9406static boolean_t9407zdb_numeric(char *str)9408{9409int i = 0, len;94109411len = strlen(str);9412if (len == 0)9413return (B_FALSE);9414if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0)9415i = 2;9416for (; i < len; i++) {9417if (!isxdigit(str[i]))9418return (B_FALSE);9419}9420return (B_TRUE);9421}94229423static int9424dummy_get_file_info(dmu_object_type_t bonustype, const void *data,9425zfs_file_info_t *zoi)9426{9427(void) data, (void) zoi;94289429if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)9430return (ENOENT);94319432(void) fprintf(stderr, "dummy_get_file_info: not implemented");9433abort();9434}94359436int9437main(int argc, char **argv)9438{9439int c;9440int dump_all = 1;9441int verbose = 0;9442int error = 0;9443char **searchdirs = NULL;9444int nsearch = 0;9445char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];9446nvlist_t *policy = NULL;9447uint64_t max_txg = UINT64_MAX;9448int64_t objset_id = -1;9449uint64_t object;9450int flags = ZFS_IMPORT_MISSING_LOG;9451int rewind = ZPOOL_NEVER_REWIND;9452char *spa_config_path_env, *objset_str;9453boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;9454nvlist_t *cfg = NULL;9455struct sigaction action;9456boolean_t force_import = B_FALSE;9457boolean_t config_path_console = B_FALSE;9458char pbuf[MAXPATHLEN];94599460dprintf_setup(&argc, argv);94619462/*9463* Set up signal handlers, so if we crash due to bad on-disk data we9464* can get more info. 
Unlike ztest, we don't bail out if we can't set9465* up signal handlers, because zdb is very useful without them.9466*/9467action.sa_handler = sig_handler;9468sigemptyset(&action.sa_mask);9469action.sa_flags = 0;9470if (sigaction(SIGSEGV, &action, NULL) < 0) {9471(void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n",9472strerror(errno));9473}9474if (sigaction(SIGABRT, &action, NULL) < 0) {9475(void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n",9476strerror(errno));9477}94789479/*9480* If there is an environment variable SPA_CONFIG_PATH it overrides9481* default spa_config_path setting. If -U flag is specified it will9482* override this environment variable settings once again.9483*/9484spa_config_path_env = getenv("SPA_CONFIG_PATH");9485if (spa_config_path_env != NULL)9486spa_config_path = spa_config_path_env;94879488/*9489* For performance reasons, we set this tunable down. We do so before9490* the arg parsing section so that the user can override this value if9491* they choose.9492*/9493zfs_btree_verify_intensity = 3;94949495struct option long_options[] = {9496{"ignore-assertions", no_argument, NULL, 'A'},9497{"block-stats", no_argument, NULL, 'b'},9498{"backup", no_argument, NULL, 'B'},9499{"checksum", no_argument, NULL, 'c'},9500{"config", no_argument, NULL, 'C'},9501{"datasets", no_argument, NULL, 'd'},9502{"dedup-stats", no_argument, NULL, 'D'},9503{"exported", no_argument, NULL, 'e'},9504{"embedded-block-pointer", no_argument, NULL, 'E'},9505{"automatic-rewind", no_argument, NULL, 'F'},9506{"dump-debug-msg", no_argument, NULL, 'G'},9507{"history", no_argument, NULL, 'h'},9508{"intent-logs", no_argument, NULL, 'i'},9509{"inflight", required_argument, NULL, 'I'},9510{"checkpointed-state", no_argument, NULL, 'k'},9511{"key", required_argument, NULL, 'K'},9512{"label", no_argument, NULL, 'l'},9513{"disable-leak-tracking", no_argument, NULL, 'L'},9514{"metaslabs", no_argument, NULL, 'm'},9515{"metaslab-groups", no_argument, NULL, 'M'},9516{"numeric", 
no_argument, NULL, 'N'},9517{"option", required_argument, NULL, 'o'},9518{"object-lookups", no_argument, NULL, 'O'},9519{"path", required_argument, NULL, 'p'},9520{"parseable", no_argument, NULL, 'P'},9521{"skip-label", no_argument, NULL, 'q'},9522{"copy-object", no_argument, NULL, 'r'},9523{"read-block", no_argument, NULL, 'R'},9524{"io-stats", no_argument, NULL, 's'},9525{"simulate-dedup", no_argument, NULL, 'S'},9526{"txg", required_argument, NULL, 't'},9527{"brt-stats", no_argument, NULL, 'T'},9528{"uberblock", no_argument, NULL, 'u'},9529{"cachefile", required_argument, NULL, 'U'},9530{"verbose", no_argument, NULL, 'v'},9531{"verbatim", no_argument, NULL, 'V'},9532{"dump-blocks", required_argument, NULL, 'x'},9533{"extreme-rewind", no_argument, NULL, 'X'},9534{"all-reconstruction", no_argument, NULL, 'Y'},9535{"livelist", no_argument, NULL, 'y'},9536{"zstd-headers", no_argument, NULL, 'Z'},9537{"allocated-map", no_argument, NULL,9538ARG_ALLOCATED},9539{"bin", required_argument, NULL,9540ARG_BLOCK_BIN_MODE},9541{"class", required_argument, NULL,9542ARG_BLOCK_CLASSES},9543{0, 0, 0, 0}9544};95459546while ((c = getopt_long(argc, argv,9547"AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",9548long_options, NULL)) != -1) {9549switch (c) {9550case 'b':9551case 'B':9552case 'c':9553case 'C':9554case 'd':9555case 'D':9556case 'E':9557case 'G':9558case 'h':9559case 'i':9560case 'l':9561case 'm':9562case 'M':9563case 'N':9564case 'O':9565case 'r':9566case 'R':9567case 's':9568case 'S':9569case 'T':9570case 'u':9571case 'y':9572case 'Z':9573case ARG_ALLOCATED:9574dump_opt[c]++;9575dump_all = 0;9576break;9577case 'A':9578case 'e':9579case 'F':9580case 'k':9581case 'L':9582case 'P':9583case 'q':9584case 'X':9585dump_opt[c]++;9586break;9587case 'Y':9588zfs_reconstruct_indirect_combinations_max = INT_MAX;9589zfs_deadman_enabled = 0;9590break;9591/* NB: Sort single match options below. 
*/9592case 'I':9593max_inflight_bytes = strtoull(optarg, NULL, 0);9594if (max_inflight_bytes == 0) {9595(void) fprintf(stderr, "maximum number "9596"of inflight bytes must be greater "9597"than 0\n");9598usage();9599}9600break;9601case 'K':9602dump_opt[c]++;9603key_material = strdup(optarg);9604/* redact key material in process table */9605while (*optarg != '\0') { *optarg++ = '*'; }9606break;9607case 'o':9608dump_opt[c]++;9609dump_all = 0;9610error = handle_tunable_option(optarg, B_FALSE);9611if (error != 0)9612zdb_exit(1);9613break;9614case 'p':9615if (searchdirs == NULL) {9616searchdirs = umem_alloc(sizeof (char *),9617UMEM_NOFAIL);9618} else {9619char **tmp = umem_alloc((nsearch + 1) *9620sizeof (char *), UMEM_NOFAIL);9621memcpy(tmp, searchdirs, nsearch *9622sizeof (char *));9623umem_free(searchdirs,9624nsearch * sizeof (char *));9625searchdirs = tmp;9626}9627searchdirs[nsearch++] = optarg;9628break;9629case 't':9630max_txg = strtoull(optarg, NULL, 0);9631if (max_txg < TXG_INITIAL) {9632(void) fprintf(stderr, "incorrect txg "9633"specified: %s\n", optarg);9634usage();9635}9636break;9637case 'U':9638config_path_console = B_TRUE;9639spa_config_path = optarg;9640if (spa_config_path[0] != '/') {9641(void) fprintf(stderr,9642"cachefile must be an absolute path "9643"(i.e. 
start with a slash)\n");9644usage();9645}9646break;9647case 'v':9648verbose++;9649break;9650case 'V':9651flags = ZFS_IMPORT_VERBATIM;9652break;9653case 'x':9654vn_dumpdir = optarg;9655break;9656case ARG_BLOCK_BIN_MODE:9657if (strcmp(optarg, "lsize") == 0) {9658block_bin_mode = BIN_LSIZE;9659} else if (strcmp(optarg, "psize") == 0) {9660block_bin_mode = BIN_PSIZE;9661} else if (strcmp(optarg, "asize") == 0) {9662block_bin_mode = BIN_ASIZE;9663} else {9664(void) fprintf(stderr,9665"--bin=\"%s\" must be one of \"lsize\", "9666"\"psize\" or \"asize\"\n", optarg);9667usage();9668}9669break;96709671case ARG_BLOCK_CLASSES: {9672char *buf = strdup(optarg), *tok = buf, *next,9673*save = NULL;96749675while ((next = strtok_r(tok, ",", &save)) != NULL) {9676tok = NULL;96779678if (strcmp(next, "normal") == 0) {9679block_classes |= CLASS_NORMAL;9680} else if (strcmp(next, "special") == 0) {9681block_classes |= CLASS_SPECIAL;9682} else if (strcmp(next, "dedup") == 0) {9683block_classes |= CLASS_DEDUP;9684} else if (strcmp(next, "other") == 0) {9685block_classes |= CLASS_OTHER;9686} else {9687(void) fprintf(stderr,9688"--class=\"%s\" must be a "9689"comma-separated list of either "9690"\"normal\", \"special\", "9691"\"asize\" or \"other\"; "9692"got \"%s\"\n",9693optarg, next);9694usage();9695}9696}96979698if (block_classes == 0) {9699(void) fprintf(stderr,9700"--class= must be a comma-separated "9701"list of either \"normal\", \"special\", "9702"\"asize\" or \"other\"; got empty\n");9703usage();9704}97059706free(buf);9707break;9708}9709default:9710usage();9711break;9712}9713}97149715if (!dump_opt['e'] && searchdirs != NULL) {9716(void) fprintf(stderr, "-p option requires use of -e\n");9717usage();9718}9719#if defined(_LP64)9720/*9721* ZDB does not typically re-read blocks; therefore limit the ARC9722* to 256 MB, which can be used entirely for metadata.9723*/9724zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;9725zfs_arc_max = 256 * 1024 * 1024;9726#endif97279728/*9729* "zdb -c" uses 
checksum-verifying scrub i/os which are async reads.9730* "zdb -b" uses traversal prefetch which uses async reads.9731* For good performance, let several of them be active at once.9732*/9733zfs_vdev_async_read_max_active = 10;97349735/*9736* Disable reference tracking for better performance.9737*/9738reference_tracking_enable = B_FALSE;97399740/*9741* Do not fail spa_load when spa_load_verify fails. This is needed9742* to load non-idle pools.9743*/9744spa_load_verify_dryrun = B_TRUE;97459746/*9747* ZDB should have ability to read spacemaps.9748*/9749spa_mode_readable_spacemaps = B_TRUE;97509751libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));9752zfs_recover = (dump_opt['A'] > 1);97539754if (dump_all)9755verbose = MAX(verbose, 1);97569757for (c = 0; c < 256; c++) {9758if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)9759dump_opt[c] = 1;9760if (dump_opt[c])9761dump_opt[c] += verbose;9762}97639764argc -= optind;9765argv += optind;9766if (argc < 2 && dump_opt['R'])9767usage();97689769target = argv[0];97709771/*9772* Automate cachefile9773*/9774if (!spa_config_path_env && !config_path_console && target &&9775libzfs_core_init() == 0) {9776char *pname = strdup(target);9777const char *value;9778nvlist_t *pnvl = NULL;9779nvlist_t *vnvl = NULL;97809781if (strpbrk(pname, "/@") != NULL)9782*strpbrk(pname, "/@") = '\0';97839784if (pname && lzc_get_props(pname, &pnvl) == 0) {9785if (nvlist_lookup_nvlist(pnvl, "cachefile",9786&vnvl) == 0) {9787value = fnvlist_lookup_string(vnvl,9788ZPROP_VALUE);9789} else {9790value = "-";9791}9792strlcpy(pbuf, value, sizeof (pbuf));9793if (pbuf[0] != '\0') {9794if (pbuf[0] == '/') {9795if (access(pbuf, F_OK) == 0)9796spa_config_path = pbuf;9797else9798force_import = B_TRUE;9799} else if ((strcmp(pbuf, "-") == 0 &&9800access(ZPOOL_CACHE, F_OK) != 0) ||9801strcmp(pbuf, "none") == 0) {9802force_import = 
B_TRUE;9803}9804}9805nvlist_free(vnvl);9806}98079808free(pname);9809nvlist_free(pnvl);9810libzfs_core_fini();9811}98129813dmu_objset_register_type(DMU_OST_ZFS, dummy_get_file_info);9814kernel_init(SPA_MODE_READ);9815kernel_init_done = B_TRUE;98169817if (dump_opt['E']) {9818if (argc != 1)9819usage();9820zdb_embedded_block(argv[0]);9821error = 0;9822goto fini;9823}98249825if (argc < 1) {9826if (!dump_opt['e'] && dump_opt['C']) {9827dump_cachefile(spa_config_path);9828error = 0;9829goto fini;9830}9831if (dump_opt['o'])9832/*9833* Avoid blasting tunable options off the top of the9834* screen.9835*/9836zdb_exit(1);9837usage();9838}98399840if (dump_opt['l']) {9841error = dump_label(argv[0]);9842goto fini;9843}98449845if (dump_opt['X'] || dump_opt['F'])9846rewind = ZPOOL_DO_REWIND |9847(dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);98489849/* -N implies -d */9850if (dump_opt['N'] && dump_opt['d'] == 0)9851dump_opt['d'] = dump_opt['N'];98529853if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||9854nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||9855nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)9856fatal("internal error: %s", strerror(ENOMEM));98579858error = 0;98599860if (strpbrk(target, "/@") != NULL) {9861size_t targetlen;98629863target_pool = strdup(target);9864*strpbrk(target_pool, "/@") = '\0';98659866target_is_spa = B_FALSE;9867targetlen = strlen(target);9868if (targetlen && target[targetlen - 1] == '/')9869target[targetlen - 1] = '\0';98709871/*9872* See if an objset ID was supplied (-d <pool>/<objset ID>).9873* To disambiguate tank/100, consider the 100 as objsetID9874* if -N was given, otherwise 100 is an objsetID iff9875* tank/100 as a named dataset fails on lookup.9876*/9877objset_str = strchr(target, '/');9878if (objset_str && strlen(objset_str) > 1 &&9879zdb_numeric(objset_str + 1)) {9880char *endptr;9881errno = 0;9882objset_str++;9883objset_id = strtoull(objset_str, &endptr, 0);9884/* dataset 0 is the same as opening the 
pool */9885if (errno == 0 && endptr != objset_str &&9886objset_id != 0) {9887if (dump_opt['N'])9888dataset_lookup = B_TRUE;9889}9890/* normal dataset name not an objset ID */9891if (endptr == objset_str) {9892objset_id = -1;9893}9894} else if (objset_str && !zdb_numeric(objset_str + 1) &&9895dump_opt['N']) {9896printf("Supply a numeric objset ID with -N\n");9897error = 2;9898goto fini;9899}9900} else {9901target_pool = target;9902}99039904if (dump_opt['e'] || force_import) {9905importargs_t args = { 0 };99069907/*9908* If path is not provided, search in /dev9909*/9910if (searchdirs == NULL) {9911searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL);9912searchdirs[nsearch++] = (char *)ZFS_DEVDIR;9913}99149915args.paths = nsearch;9916args.path = searchdirs;9917args.can_be_active = B_TRUE;99189919libpc_handle_t lpch = {9920.lpc_lib_handle = NULL,9921.lpc_ops = &libzpool_config_ops,9922.lpc_printerr = B_TRUE9923};9924error = zpool_find_config(&lpch, target_pool, &cfg, &args);99259926if (error == 0) {99279928if (nvlist_add_nvlist(cfg,9929ZPOOL_LOAD_POLICY, policy) != 0) {9930fatal("can't open '%s': %s",9931target, strerror(ENOMEM));9932}99339934if (dump_opt['C'] > 1) {9935(void) printf("\nConfiguration for import:\n");9936dump_nvlist(cfg, 8);9937}99389939/*9940* Disable the activity check to allow examination of9941* active pools.9942*/9943error = spa_import(target_pool, cfg, NULL,9944flags | ZFS_IMPORT_SKIP_MMP);9945}9946}99479948if (searchdirs != NULL) {9949umem_free(searchdirs, nsearch * sizeof (char *));9950searchdirs = NULL;9951}99529953/*9954* We need to make sure to process -O option or call9955* dump_path after the -e option has been processed,9956* which imports the pool to the namespace if it's9957* not in the cachefile.9958*/9959if (dump_opt['O'] && !dump_opt['r']) {9960if (argc != 2)9961usage();9962dump_opt['v'] = verbose + 3;9963error = dump_path(argv[0], argv[1], NULL);9964goto fini;9965}99669967if (dump_opt['r']) {9968target_is_spa = B_FALSE;9969if (argc 
!= 3)9970usage();9971dump_opt['v'] = verbose;9972if (dump_opt['O']) {9973object = strtoull(argv[1], NULL, 0);9974} else {9975error = dump_path(argv[0], argv[1], &object);9976}9977if (error != 0)9978fatal("internal error: %s", strerror(error));9979}99809981/*9982* import_checkpointed_state makes the assumption that the9983* target pool that we pass it is already part of the spa9984* namespace. Because of that we need to make sure to call9985* it always after the -e option has been processed, which9986* imports the pool to the namespace if it's not in the9987* cachefile.9988*/9989char *checkpoint_pool = NULL;9990char *checkpoint_target = NULL;9991if (dump_opt['k']) {9992checkpoint_pool = import_checkpointed_state(target, cfg,9993target_is_spa, &checkpoint_target);99949995if (checkpoint_target != NULL)9996target = checkpoint_target;9997}99989999if (cfg != NULL) {10000nvlist_free(cfg);10001cfg = NULL;10002}1000310004if (target_pool != target)10005free(target_pool);1000610007if (error == 0) {10008if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {10009ASSERT(checkpoint_pool != NULL);10010ASSERT0P(checkpoint_target);1001110012error = spa_open(checkpoint_pool, &spa, FTAG);10013if (error != 0) {10014fatal("Tried to open pool \"%s\" but "10015"spa_open() failed with error %d\n",10016checkpoint_pool, error);10017}1001810019} else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||10020objset_id == 0) {10021zdb_set_skip_mmp(target);10022error = spa_open_rewind(target, &spa, FTAG, policy,10023NULL);10024if (error) {10025/*10026* If we're missing the log device then10027* try opening the pool after clearing the10028* log state.10029*/10030spa_namespace_enter(FTAG);10031if ((spa = spa_lookup(target)) != NULL &&10032spa->spa_log_state == SPA_LOG_MISSING) {10033spa->spa_log_state = SPA_LOG_CLEAR;10034error = 0;10035}10036spa_namespace_exit(FTAG);1003710038if (!error) {10039error = spa_open_rewind(target, &spa,10040FTAG, policy, NULL);10041}10042}10043} else if 
(strpbrk(target, "#") != NULL) {10044dsl_pool_t *dp;10045error = dsl_pool_hold(target, FTAG, &dp);10046if (error != 0) {10047fatal("can't dump '%s': %s", target,10048strerror(error));10049}10050error = dump_bookmark(dp, target, B_TRUE, verbose > 1);10051dsl_pool_rele(dp, FTAG);10052if (error != 0) {10053fatal("can't dump '%s': %s", target,10054strerror(error));10055}10056goto fini;10057} else {10058target_pool = strdup(target);10059if (strpbrk(target, "/@") != NULL)10060*strpbrk(target_pool, "/@") = '\0';1006110062zdb_set_skip_mmp(target);10063/*10064* If -N was supplied, the user has indicated that10065* zdb -d <pool>/<objsetID> is in effect. Otherwise10066* we first assume that the dataset string is the10067* dataset name. If dmu_objset_hold fails with the10068* dataset string, and we have an objset_id, retry the10069* lookup with the objsetID.10070*/10071boolean_t retry = B_TRUE;10072retry_lookup:10073if (dataset_lookup == B_TRUE) {10074/*10075* Use the supplied id to get the name10076* for open_objset.10077*/10078error = spa_open(target_pool, &spa, FTAG);10079if (error == 0) {10080error = name_from_objset_id(spa,10081objset_id, dsname);10082spa_close(spa, FTAG);10083if (error == 0)10084target = dsname;10085}10086}10087if (error == 0) {10088if (objset_id > 0 && retry) {10089int err = dmu_objset_hold(target, FTAG,10090&os);10091if (err) {10092dataset_lookup = B_TRUE;10093retry = B_FALSE;10094goto retry_lookup;10095} else {10096dmu_objset_rele(os, FTAG);10097}10098}10099error = open_objset(target, FTAG, &os);10100}10101if (error == 0)10102spa = dmu_objset_spa(os);10103free(target_pool);10104}10105}10106nvlist_free(policy);1010710108if (error)10109fatal("can't open '%s': %s", target, strerror(error));1011010111/*10112* Set the pool failure mode to panic in order to prevent the pool10113* from suspending. 
A suspended I/O will have no way to resume and10114* can prevent the zdb(8) command from terminating as expected.10115*/10116if (spa != NULL)10117spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;1011810119argv++;10120argc--;10121if (dump_opt['r']) {10122error = zdb_copy_object(os, object, argv[1]);10123} else if (!dump_opt['R']) {10124flagbits['d'] = ZOR_FLAG_DIRECTORY;10125flagbits['f'] = ZOR_FLAG_PLAIN_FILE;10126flagbits['m'] = ZOR_FLAG_SPACE_MAP;10127flagbits['z'] = ZOR_FLAG_ZAP;10128flagbits['A'] = ZOR_FLAG_ALL_TYPES;1012910130if (argc > 0 && dump_opt['d']) {10131zopt_object_args = argc;10132zopt_object_ranges = calloc(zopt_object_args,10133sizeof (zopt_object_range_t));10134for (unsigned i = 0; i < zopt_object_args; i++) {10135int err;10136const char *msg = NULL;1013710138err = parse_object_range(argv[i],10139&zopt_object_ranges[i], &msg);10140if (err != 0)10141fatal("Bad object or range: '%s': %s\n",10142argv[i], msg ?: "");10143}10144} else if (argc > 0 && dump_opt['m']) {10145zopt_metaslab_args = argc;10146zopt_metaslab = calloc(zopt_metaslab_args,10147sizeof (uint64_t));10148for (unsigned i = 0; i < zopt_metaslab_args; i++) {10149errno = 0;10150zopt_metaslab[i] = strtoull(argv[i], NULL, 0);10151if (zopt_metaslab[i] == 0 && errno != 0)10152fatal("bad number %s: %s", argv[i],10153strerror(errno));10154}10155}10156if (dump_opt['B']) {10157dump_backup(target, objset_id,10158argc > 0 ? 
argv[0] : NULL);10159} else if (os != NULL) {10160dump_objset(os);10161} else if (zopt_object_args > 0 && !dump_opt['m']) {10162dump_objset(spa->spa_meta_objset);10163} else {10164dump_zpool(spa);10165}10166} else {10167flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;10168flagbits['c'] = ZDB_FLAG_CHECKSUM;10169flagbits['d'] = ZDB_FLAG_DECOMPRESS;10170flagbits['e'] = ZDB_FLAG_BSWAP;10171flagbits['g'] = ZDB_FLAG_GBH;10172flagbits['i'] = ZDB_FLAG_INDIRECT;10173flagbits['r'] = ZDB_FLAG_RAW;10174flagbits['v'] = ZDB_FLAG_VERBOSE;1017510176for (int i = 0; i < argc; i++)10177zdb_read_block(argv[i], spa);10178}1017910180if (dump_opt['k']) {10181free(checkpoint_pool);10182if (!target_is_spa)10183free(checkpoint_target);10184}1018510186fini:10187if (spa != NULL)10188zdb_ddt_cleanup(spa);1018910190if (os != NULL) {10191close_objset(os, FTAG);10192} else if (spa != NULL) {10193spa_close(spa, FTAG);10194}1019510196fuid_table_destroy();1019710198dump_debug_buffer();1019910200if (kernel_init_done)10201kernel_fini();1020210203if (corruption_found && error == 0)10204error = 3;1020510206return (error);10207}102081020910210