Path: blob/main/sys/contrib/openzfs/module/zfs/dnode.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/range_tree.h>
#include <sys/trace_zfs.h>
#include <sys/zfs_project.h>

dnode_stats_t dnode_stats = {
	{ "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
	{ "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
	{ "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
	{ "dnode_allocate", KSTAT_DATA_UINT64 },
	{ "dnode_reallocate", KSTAT_DATA_UINT64 },
	{ "dnode_buf_evict", KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
	{ "dnode_alloc_race", KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
	{ "dnode_move_invalid", KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck1", KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck2", KSTAT_DATA_UINT64 },
	{ "dnode_move_special", KSTAT_DATA_UINT64 },
	{ "dnode_move_handle", KSTAT_DATA_UINT64 },
	{ "dnode_move_rwlock", KSTAT_DATA_UINT64 },
	{ "dnode_move_active", KSTAT_DATA_UINT64 },
};

dnode_sums_t dnode_sums;

static kstat_t *dnode_ksp;
static kmem_cache_t *dnode_cache;

static dnode_phys_t dnode_phys_zero __maybe_unused;

int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;

#ifdef _KERNEL
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif /* _KERNEL */

static char *
rt_name(dnode_t *dn, const char *name)
{
	struct objset *os = dn->dn_objset;

	return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
	    spa_name(os->os_spa),
	    (u_longlong_t)(os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
	    (u_longlong_t)dn->dn_object,
	    name));
}
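
/*
 * Comparator for the per-dnode dn_dbufs AVL tree.  Dbufs sort first by
 * level, then by block id.  DB_MARKER and DB_SEARCH are sentinel states
 * used by iterators: a marker compares by its parent dbuf's address, and
 * a search sentinel sorts before any real dbuf with the same level and
 * block id.
 */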
static int
dbuf_compare(const void *x1, const void *x2)
{
	const dmu_buf_impl_t *d1 = x1;
	const dmu_buf_impl_t *d2 = x2;

	int cmp = TREE_CMP(d1->db_level, d2->db_level);
	if (likely(cmp))
		return (cmp);

	cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
	if (likely(cmp))
		return (cmp);

	if (d1->db_state == DB_MARKER) {
		ASSERT3S(d2->db_state, !=, DB_MARKER);
		return (TREE_PCMP(d1->db_parent, d2));
	} else if (d2->db_state == DB_MARKER) {
		ASSERT3S(d1->db_state, !=, DB_MARKER);
		return (TREE_PCMP(d1, d2->db_parent));
	}

	if (d1->db_state == DB_SEARCH) {
		ASSERT3S(d2->db_state, !=, DB_SEARCH);
		return (-1);
	} else if (d2->db_state == DB_SEARCH) {
		ASSERT3S(d1->db_state, !=, DB_SEARCH);
		return (1);
	}

	return (TREE_PCMP(d1, d2));
}

static int
dnode_cons(void *arg, void *unused, int kmflag)
{
	(void) unused, (void) kmflag;
	dnode_t *dn = arg;

	rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
	cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);

	/*
	 * Every dbuf has a reference, and dropping a tracked reference is
	 * O(number of references), so don't track dn_holds.
	 */
	zfs_refcount_create_untracked(&dn->dn_holds);
	zfs_refcount_create(&dn->dn_tx_holds);
	list_link_init(&dn->dn_link);

	memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type));
	memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr));
	memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels));
	memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift));
	memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype));
	memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk));
	memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen));
	memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz));
	memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid));

	for (int i = 0; i < TXG_SIZE; i++) {
		multilist_link_init(&dn->dn_dirty_link[i]);
		dn->dn_free_ranges[i] = NULL;
		list_create(&dn->dn_dirty_records[i],
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirtycnt = 0;
	dn->dn_bonus = NULL;
	dn->dn_have_spill = B_FALSE;
	dn->dn_zio = NULL;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dn->dn_dbufs_count = 0;
	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	dn->dn_moved = 0;
	return (0);
}
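
/*
 * Destructor: undoes dnode_cons().  The ASSERTs double-check that every
 * per-txg field was returned to its quiescent state before the dnode
 * goes back to the kmem cache.
 */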
static void
dnode_dest(void *arg, void *unused)
{
	(void) unused;
	dnode_t *dn = arg;

	rw_destroy(&dn->dn_struct_rwlock);
	mutex_destroy(&dn->dn_mtx);
	mutex_destroy(&dn->dn_dbufs_mtx);
	cv_destroy(&dn->dn_notxholds);
	cv_destroy(&dn->dn_nodnholds);
	zfs_refcount_destroy(&dn->dn_holds);
	zfs_refcount_destroy(&dn->dn_tx_holds);
	ASSERT(!list_link_active(&dn->dn_link));

	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
		ASSERT0P(dn->dn_free_ranges[i]);
		list_destroy(&dn->dn_dirty_records[i]);
		ASSERT0(dn->dn_next_nblkptr[i]);
		ASSERT0(dn->dn_next_nlevels[i]);
		ASSERT0(dn->dn_next_indblkshift[i]);
		ASSERT0(dn->dn_next_bonustype[i]);
		ASSERT0(dn->dn_rm_spillblk[i]);
		ASSERT0(dn->dn_next_bonuslen[i]);
		ASSERT0(dn->dn_next_blksz[i]);
		ASSERT0(dn->dn_next_maxblkid[i]);
	}

	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_free_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT0(dn->dn_dirtycnt);
	ASSERT0P(dn->dn_bonus);
	ASSERT(!dn->dn_have_spill);
	ASSERT0P(dn->dn_zio);
	ASSERT0(dn->dn_oldused);
	ASSERT0(dn->dn_oldflags);
	ASSERT0(dn->dn_olduid);
	ASSERT0(dn->dn_oldgid);
	ASSERT0(dn->dn_oldprojid);
	ASSERT0(dn->dn_newuid);
	ASSERT0(dn->dn_newgid);
	ASSERT0(dn->dn_newprojid);
	ASSERT0(dn->dn_id_flags);

	ASSERT0(dn->dn_dbufs_count);
	avl_destroy(&dn->dn_dbufs);
}

static int
dnode_kstats_update(kstat_t *ksp, int rw)
{
	dnode_stats_t *ds = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	ds->dnode_hold_dbuf_hold.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_dbuf_hold);
	ds->dnode_hold_dbuf_read.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_dbuf_read);
	ds->dnode_hold_alloc_hits.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_hits);
	ds->dnode_hold_alloc_misses.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_misses);
	ds->dnode_hold_alloc_interior.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_interior);
	ds->dnode_hold_alloc_lock_retry.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry);
	ds->dnode_hold_alloc_lock_misses.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses);
	ds->dnode_hold_alloc_type_none.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_type_none);
	ds->dnode_hold_free_hits.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_hits);
	ds->dnode_hold_free_misses.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_misses);
	ds->dnode_hold_free_lock_misses.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_lock_misses);
	ds->dnode_hold_free_lock_retry.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_lock_retry);
	ds->dnode_hold_free_refcount.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_refcount);
	ds->dnode_hold_free_overflow.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_overflow);
	ds->dnode_free_interior_lock_retry.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_free_interior_lock_retry);
	ds->dnode_allocate.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_allocate);
	ds->dnode_reallocate.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_reallocate);
	ds->dnode_buf_evict.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_buf_evict);
	ds->dnode_alloc_next_chunk.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_alloc_next_chunk);
	ds->dnode_alloc_race.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_alloc_race);
	ds->dnode_alloc_next_block.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_alloc_next_block);
	ds->dnode_move_invalid.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_invalid);
	ds->dnode_move_recheck1.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_recheck1);
	ds->dnode_move_recheck2.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_recheck2);
	ds->dnode_move_special.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_special);
	ds->dnode_move_handle.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_handle);
	ds->dnode_move_rwlock.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_rwlock);
	ds->dnode_move_active.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_active);
	return (0);
}
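
/*
 * Module setup/teardown: dnode_init() creates the dnode kmem cache,
 * registers the dnode_move() callback with it, initializes the wmsum
 * counters, and exports them through the "dnodestats" kstat;
 * dnode_fini() tears all of this down.
 */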
void
dnode_init(void)
{
	ASSERT0P(dnode_cache);
	dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
	kmem_cache_set_move(dnode_cache, dnode_move);

	wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0);
	wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_hits, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_misses, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0);
	wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0);
	wmsum_init(&dnode_sums.dnode_allocate, 0);
	wmsum_init(&dnode_sums.dnode_reallocate, 0);
	wmsum_init(&dnode_sums.dnode_buf_evict, 0);
	wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0);
	wmsum_init(&dnode_sums.dnode_alloc_race, 0);
	wmsum_init(&dnode_sums.dnode_alloc_next_block, 0);
	wmsum_init(&dnode_sums.dnode_move_invalid, 0);
	wmsum_init(&dnode_sums.dnode_move_recheck1, 0);
	wmsum_init(&dnode_sums.dnode_move_recheck2, 0);
	wmsum_init(&dnode_sums.dnode_move_special, 0);
	wmsum_init(&dnode_sums.dnode_move_handle, 0);
	wmsum_init(&dnode_sums.dnode_move_rwlock, 0);
	wmsum_init(&dnode_sums.dnode_move_active, 0);

	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (dnode_ksp != NULL) {
		dnode_ksp->ks_data = &dnode_stats;
		dnode_ksp->ks_update = dnode_kstats_update;
		kstat_install(dnode_ksp);
	}
}

void
dnode_fini(void)
{
	if (dnode_ksp != NULL) {
		kstat_delete(dnode_ksp);
		dnode_ksp = NULL;
	}

	wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold);
	wmsum_fini(&dnode_sums.dnode_hold_dbuf_read);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_hits);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_misses);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_interior);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none);
	wmsum_fini(&dnode_sums.dnode_hold_free_hits);
	wmsum_fini(&dnode_sums.dnode_hold_free_misses);
	wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses);
	wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry);
	wmsum_fini(&dnode_sums.dnode_hold_free_refcount);
	wmsum_fini(&dnode_sums.dnode_hold_free_overflow);
	wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry);
	wmsum_fini(&dnode_sums.dnode_allocate);
	wmsum_fini(&dnode_sums.dnode_reallocate);
	wmsum_fini(&dnode_sums.dnode_buf_evict);
	wmsum_fini(&dnode_sums.dnode_alloc_next_chunk);
	wmsum_fini(&dnode_sums.dnode_alloc_race);
	wmsum_fini(&dnode_sums.dnode_alloc_next_block);
	wmsum_fini(&dnode_sums.dnode_move_invalid);
	wmsum_fini(&dnode_sums.dnode_move_recheck1);
	wmsum_fini(&dnode_sums.dnode_move_recheck2);
	wmsum_fini(&dnode_sums.dnode_move_special);
	wmsum_fini(&dnode_sums.dnode_move_handle);
	wmsum_fini(&dnode_sums.dnode_move_rwlock);
	wmsum_fini(&dnode_sums.dnode_move_active);

	kmem_cache_destroy(dnode_cache);
	dnode_cache = NULL;
}
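
/*
 * Consistency checks, compiled only into debug builds.  The cheap checks
 * always run; the expensive ones below the early return also require
 * ZFS_DEBUG_DNODE_VERIFY to be set in zfs_flags.
 */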
#ifdef ZFS_DEBUG
void
dnode_verify(dnode_t *dn)
{
	int drop_struct_lock = FALSE;

	ASSERT(dn->dn_phys);
	ASSERT(dn->dn_objset);
	ASSERT(dn->dn_handle->dnh_dnode == dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));

	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
		return;

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}
	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
		int i;
		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
		if (dn->dn_datablkshift) {
			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
		}
		ASSERT3U(dn->dn_nlevels, <=, 30);
		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
		ASSERT3U(dn->dn_nblkptr, >=, 1);
		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
		ASSERT3U(dn->dn_datablksz, ==,
		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
		    dn->dn_bonuslen, <=, max_bonuslen);
		for (i = 0; i < TXG_SIZE; i++) {
			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
		}
	}
	if (dn->dn_phys->dn_type != DMU_OT_NONE)
		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
	if (dn->dn_dbuf != NULL) {
		ASSERT3P(dn->dn_phys, ==,
		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
	}
	if (drop_struct_lock)
		rw_exit(&dn->dn_struct_rwlock);
}
#endif
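
/*
 * Byteswap an on-disk dnode in place, so that pools written on a host of
 * the opposite endianness can be read.
 */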
void
dnode_byteswap(dnode_phys_t *dnp)
{
	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
	int i;

	if (dnp->dn_type == DMU_OT_NONE) {
		memset(dnp, 0, sizeof (dnode_phys_t));
		return;
	}

	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
	dnp->dn_used = BSWAP_64(dnp->dn_used);

	/*
	 * dn_nblkptr is only one byte, so it's OK to read it in either
	 * byte order.  We can't read dn_bonuslen.
	 */
	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
		buf64[i] = BSWAP_64(buf64[i]);

	/*
	 * OK to check dn_bonuslen for zero, because it won't matter if
	 * we have the wrong byte order.  This is necessary because the
	 * dnode dnode is smaller than a regular dnode.
	 */
	if (dnp->dn_bonuslen != 0) {
		dmu_object_byteswap_t byteswap;
		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
		dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
		    DN_MAX_BONUS_LEN(dnp));
	}

	/* Swap SPILL block if we have one */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
}

void
dnode_buf_byteswap(void *vbuf, size_t size)
{
	int i = 0;

	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
	ASSERT0((size & (sizeof (dnode_phys_t)-1)));

	while (i < size) {
		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
		dnode_byteswap(dnp);

		i += DNODE_MIN_SIZE;
		if (dnp->dn_type != DMU_OT_NONE)
			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
	}
}
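
/*
 * Change the size of the in-core bonus buffer.  The new length is also
 * recorded in dn_next_bonuslen[] so the on-disk dnode is updated when
 * the txg syncs.
 */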
void
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);

	dnode_setdirty(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));

	if (newsize < dn->dn_bonuslen) {
		/* clear any data after the end of the new size */
		size_t diff = dn->dn_bonuslen - newsize;
		char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
		memset(data_end, 0, diff);
	}

	dn->dn_bonuslen = newsize;
	if (newsize == 0)
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
	else
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
	rw_exit(&dn->dn_struct_rwlock);
}

void
dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	dnode_setdirty(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dn->dn_bonustype = newtype;
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
	rw_exit(&dn->dn_struct_rwlock);
}

void
dnode_set_storage_type(dnode_t *dn, dmu_object_type_t newtype)
{
	/*
	 * This is not in the dnode_phys, but it should be, and perhaps one day
	 * will.  For now we require it be set after taking a hold.
	 */
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	dn->dn_storage_type = newtype;
}

void
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
	dnode_setdirty(dn, tx);
	dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
	dn->dn_have_spill = B_FALSE;
}

static void
dnode_setdblksz(dnode_t *dn, int size)
{
	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
	dn->dn_datablksz = size;
	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}
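
/*
 * Construct the in-core dnode for an on-disk dnode_phys_t.  The caller
 * must hold the slot's zrlock; the dnode only becomes a candidate for
 * dnode_move() once dn_objset is assigned at the end.
 */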
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
    uint64_t object, dnode_handle_t *dnh)
{
	dnode_t *dn;

	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
	dn->dn_moved = 0;

	/*
	 * Defer setting dn_objset until the dnode is ready to be a candidate
	 * for the dnode_move() callback.
	 */
	dn->dn_object = object;
	dn->dn_dbuf = db;
	dn->dn_handle = dnh;
	dn->dn_phys = dnp;

	if (dnp->dn_datablkszsec) {
		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	} else {
		dn->dn_datablksz = 0;
		dn->dn_datablkszsec = 0;
		dn->dn_datablkshift = 0;
	}
	dn->dn_indblkshift = dnp->dn_indblkshift;
	dn->dn_nlevels = dnp->dn_nlevels;
	dn->dn_type = dnp->dn_type;
	dn->dn_nblkptr = dnp->dn_nblkptr;
	dn->dn_checksum = dnp->dn_checksum;
	dn->dn_compress = dnp->dn_compress;
	dn->dn_bonustype = dnp->dn_bonustype;
	dn->dn_bonuslen = dnp->dn_bonuslen;
	dn->dn_num_slots = dnp->dn_extra_slots + 1;
	dn->dn_maxblkid = dnp->dn_maxblkid;
	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
	dn->dn_id_flags = 0;

	dn->dn_storage_type = DMU_OT_NONE;

	dmu_zfetch_init(&dn->dn_zfetch, dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));

	mutex_enter(&os->os_lock);

	/*
	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
	 * signifies that the special dnodes have no references from
	 * their children (the entries in os_dnodes).  This allows
	 * dnode_destroy() to easily determine if the last child has
	 * been removed and then complete eviction of the objset.
	 */
	if (!DMU_OBJECT_IS_SPECIAL(object))
		list_insert_head(&os->os_dnodes, dn);
	membar_producer();

	/*
	 * Everything else must be valid before assigning dn_objset
	 * makes the dnode eligible for dnode_move().
	 */
	dn->dn_objset = os;

	dnh->dnh_dnode = dn;
	mutex_exit(&os->os_lock);

	arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);

	return (dn);
}

/*
 * Caller must be holding the dnode handle, which is released upon return.
 */
static void
dnode_destroy(dnode_t *dn)
{
	objset_t *os = dn->dn_objset;
	boolean_t complete_os_eviction = B_FALSE;

	ASSERT0((dn->dn_id_flags & DN_ID_NEW_EXIST));

	mutex_enter(&os->os_lock);
	POINTER_INVALIDATE(&dn->dn_objset);
	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		list_remove(&os->os_dnodes, dn);
		complete_os_eviction =
		    list_is_empty(&os->os_dnodes) &&
		    list_link_active(&os->os_evicting_node);
	}
	mutex_exit(&os->os_lock);

	/* the dnode can no longer move, so we can release the handle */
	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
		zrl_remove(&dn->dn_handle->dnh_zrlock);

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirtycnt = 0;

	if (dn->dn_bonus != NULL) {
		mutex_enter(&dn->dn_bonus->db_mtx);
		dbuf_destroy(dn->dn_bonus);
		dn->dn_bonus = NULL;
	}
	dn->dn_zio = NULL;

	dn->dn_have_spill = B_FALSE;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dn->dn_storage_type = DMU_OT_NONE;

	dmu_zfetch_fini(&dn->dn_zfetch);
	kmem_cache_free(dnode_cache, dn);
	arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);

	if (complete_os_eviction)
		dmu_objset_evict_done(os);
}
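
/*
 * Allocate a new object: the dnode must currently be DMU_OT_NONE with no
 * dirty state, as the ASSERTs below verify.
 */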
void
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
	int i;

	ASSERT3U(dn_slots, >, 0);
	ASSERT3U(dn_slots << DNODE_SHIFT, <=,
	    spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
	ASSERT3U(blocksize, <=,
	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	if (blocksize == 0)
		blocksize = 1 << zfs_default_bs;
	else
		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);

	if (ibs == 0)
		ibs = zfs_default_ibs;

	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);

	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
	    dn->dn_objset, (u_longlong_t)dn->dn_object,
	    (u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots);
	DNODE_STAT_BUMP(dnode_allocate);

	ASSERT(dn->dn_type == DMU_OT_NONE);
	ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)));
	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
	ASSERT(ot != DMU_OT_NONE);
	ASSERT(DMU_OT_IS_VALID(ot));
	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
	    (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) ||
	    (bonustype != DMU_OT_NONE && bonuslen != 0));
	ASSERT(DMU_OT_IS_VALID(bonustype));
	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
	ASSERT(dn->dn_type == DMU_OT_NONE);
	ASSERT0(dn->dn_maxblkid);
	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
	ASSERT(avl_is_empty(&dn->dn_dbufs));

	for (i = 0; i < TXG_SIZE; i++) {
		ASSERT0(dn->dn_next_nblkptr[i]);
		ASSERT0(dn->dn_next_nlevels[i]);
		ASSERT0(dn->dn_next_indblkshift[i]);
		ASSERT0(dn->dn_next_bonuslen[i]);
		ASSERT0(dn->dn_next_bonustype[i]);
		ASSERT0(dn->dn_rm_spillblk[i]);
		ASSERT0(dn->dn_next_blksz[i]);
		ASSERT0(dn->dn_next_maxblkid[i]);
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
		ASSERT0P(dn->dn_free_ranges[i]);
	}

	dn->dn_type = ot;
	dnode_setdblksz(dn, blocksize);
	dn->dn_indblkshift = ibs;
	dn->dn_nlevels = 1;
	dn->dn_num_slots = dn_slots;
	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
		dn->dn_nblkptr = 1;
	else {
		dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
		    SPA_BLKPTRSHIFT));
	}

	dn->dn_bonustype = bonustype;
	dn->dn_bonuslen = bonuslen;
	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
	dn->dn_compress = ZIO_COMPRESS_INHERIT;

	dn->dn_free_txg = 0;
	dn->dn_dirtycnt = 0;

	dn->dn_allocated_txg = tx->tx_txg;
	dn->dn_id_flags = 0;

	dnode_setdirty(dn, tx);
	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
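
/*
 * Reallocate an existing object in place with a new type, blocksize and
 * bonus layout.  Changing the blocksize requires that the object's data
 * already be freed (dn_maxblkid == 0).
 */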
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dn_slots,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	int nblkptr;

	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
	ASSERT3U(blocksize, <=,
	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
	ASSERT(tx->tx_txg != 0);
	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
	    (bonustype == DMU_OT_SA && bonuslen == 0));
	ASSERT(DMU_OT_IS_VALID(bonustype));
	ASSERT3U(bonuslen, <=,
	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
	ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));

	dnode_free_interior_slots(dn);
	DNODE_STAT_BUMP(dnode_reallocate);

	/* clean up any unreferenced dbufs */
	dnode_evict_dbufs(dn);

	dn->dn_id_flags = 0;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dnode_setdirty(dn, tx);
	if (dn->dn_datablksz != blocksize) {
		/* change blocksize */
		ASSERT0(dn->dn_maxblkid);
		ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
		    dnode_block_freed(dn, 0));

		dnode_setdblksz(dn, blocksize);
		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize;
	}
	if (dn->dn_bonuslen != bonuslen)
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen;

	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
		nblkptr = 1;
	else
		nblkptr = MIN(DN_MAX_NBLKPTR,
		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
		    SPA_BLKPTRSHIFT));
	if (dn->dn_bonustype != bonustype)
		dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
	if (dn->dn_nblkptr != nblkptr)
		dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}

	rw_exit(&dn->dn_struct_rwlock);

	/* change type */
	dn->dn_type = ot;

	/* change bonus size and type */
	mutex_enter(&dn->dn_mtx);
	dn->dn_bonustype = bonustype;
	dn->dn_bonuslen = bonuslen;
	dn->dn_num_slots = dn_slots;
	dn->dn_nblkptr = nblkptr;
	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
	dn->dn_compress = ZIO_COMPRESS_INHERIT;
	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);

	/* fix up the bonus db_size */
	if (dn->dn_bonus) {
		dn->dn_bonus->db.db_size =
		    DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
	}

	dn->dn_allocated_txg = tx->tx_txg;
	mutex_exit(&dn->dn_mtx);
}
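
/*
 * Support for relocating a dnode in memory so the kmem cache can be
 * defragmented.  dnode_move_impl() copies every field to the new
 * address and invalidates the old dnode; dnode_move() calls it with
 * os_lock and the handle's zrlock held.
 */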
#ifdef _KERNEL
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
{
	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));

	/* Copy fields. */
	ndn->dn_objset = odn->dn_objset;
	ndn->dn_object = odn->dn_object;
	ndn->dn_dbuf = odn->dn_dbuf;
	ndn->dn_handle = odn->dn_handle;
	ndn->dn_phys = odn->dn_phys;
	ndn->dn_type = odn->dn_type;
	ndn->dn_bonuslen = odn->dn_bonuslen;
	ndn->dn_bonustype = odn->dn_bonustype;
	ndn->dn_nblkptr = odn->dn_nblkptr;
	ndn->dn_checksum = odn->dn_checksum;
	ndn->dn_compress = odn->dn_compress;
	ndn->dn_nlevels = odn->dn_nlevels;
	ndn->dn_indblkshift = odn->dn_indblkshift;
	ndn->dn_datablkshift = odn->dn_datablkshift;
	ndn->dn_datablkszsec = odn->dn_datablkszsec;
	ndn->dn_datablksz = odn->dn_datablksz;
	ndn->dn_maxblkid = odn->dn_maxblkid;
	ndn->dn_num_slots = odn->dn_num_slots;
	memcpy(ndn->dn_next_type, odn->dn_next_type,
	    sizeof (odn->dn_next_type));
	memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr,
	    sizeof (odn->dn_next_nblkptr));
	memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels,
	    sizeof (odn->dn_next_nlevels));
	memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift,
	    sizeof (odn->dn_next_indblkshift));
	memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype,
	    sizeof (odn->dn_next_bonustype));
	memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk,
	    sizeof (odn->dn_rm_spillblk));
	memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen,
	    sizeof (odn->dn_next_bonuslen));
	memcpy(ndn->dn_next_blksz, odn->dn_next_blksz,
	    sizeof (odn->dn_next_blksz));
	memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid,
	    sizeof (odn->dn_next_maxblkid));
	for (int i = 0; i < TXG_SIZE; i++) {
		list_move_tail(&ndn->dn_dirty_records[i],
		    &odn->dn_dirty_records[i]);
	}
	memcpy(ndn->dn_free_ranges, odn->dn_free_ranges,
	    sizeof (odn->dn_free_ranges));
	ndn->dn_allocated_txg = odn->dn_allocated_txg;
	ndn->dn_free_txg = odn->dn_free_txg;
	ndn->dn_assigned_txg = odn->dn_assigned_txg;
	ndn->dn_dirtycnt = odn->dn_dirtycnt;
	ASSERT0(zfs_refcount_count(&odn->dn_tx_holds));
	zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
	ASSERT(avl_is_empty(&ndn->dn_dbufs));
	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
	ndn->dn_dbufs_count = odn->dn_dbufs_count;
	ndn->dn_bonus = odn->dn_bonus;
	ndn->dn_have_spill = odn->dn_have_spill;
	ndn->dn_zio = odn->dn_zio;
	ndn->dn_oldused = odn->dn_oldused;
	ndn->dn_oldflags = odn->dn_oldflags;
	ndn->dn_olduid = odn->dn_olduid;
	ndn->dn_oldgid = odn->dn_oldgid;
	ndn->dn_oldprojid = odn->dn_oldprojid;
	ndn->dn_newuid = odn->dn_newuid;
	ndn->dn_newgid = odn->dn_newgid;
	ndn->dn_newprojid = odn->dn_newprojid;
	ndn->dn_id_flags = odn->dn_id_flags;
	ndn->dn_storage_type = odn->dn_storage_type;
	dmu_zfetch_init(&ndn->dn_zfetch, ndn);

	/*
	 * Update back pointers.  Updating the handle fixes the back pointer of
	 * every descendant dbuf as well as the bonus dbuf.
	 */
	ASSERT(ndn->dn_handle->dnh_dnode == odn);
	ndn->dn_handle->dnh_dnode = ndn;

	/*
	 * Invalidate the original dnode by clearing all of its back pointers.
	 */
	odn->dn_dbuf = NULL;
	odn->dn_handle = NULL;
	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));
	odn->dn_dbufs_count = 0;
	odn->dn_bonus = NULL;
	dmu_zfetch_fini(&odn->dn_zfetch);

	/*
	 * Set the low bit of the objset pointer to ensure that dnode_move()
	 * recognizes the dnode as invalid in any subsequent callback.
	 */
	POINTER_INVALIDATE(&odn->dn_objset);

	/*
	 * Satisfy the destructor.
	 */
	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&odn->dn_dirty_records[i],
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
		odn->dn_free_ranges[i] = NULL;
		odn->dn_next_nlevels[i] = 0;
		odn->dn_next_indblkshift[i] = 0;
		odn->dn_next_bonustype[i] = 0;
		odn->dn_rm_spillblk[i] = 0;
		odn->dn_next_bonuslen[i] = 0;
		odn->dn_next_blksz[i] = 0;
	}
	odn->dn_allocated_txg = 0;
	odn->dn_free_txg = 0;
	odn->dn_assigned_txg = 0;
	odn->dn_dirtycnt = 0;
	odn->dn_have_spill = B_FALSE;
	odn->dn_zio = NULL;
	odn->dn_oldused = 0;
	odn->dn_oldflags = 0;
	odn->dn_olduid = 0;
	odn->dn_oldgid = 0;
	odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	odn->dn_newuid = 0;
	odn->dn_newgid = 0;
	odn->dn_newprojid = ZFS_DEFAULT_PROJID;
	odn->dn_id_flags = 0;
	odn->dn_storage_type = DMU_OT_NONE;

	/*
	 * Mark the dnode.
	 */
	ndn->dn_moved = 1;
	odn->dn_moved = (uint8_t)-1;
}
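
/*
 * kmem move callback.  Returns KMEM_CBRC_YES once the dnode has been
 * copied, KMEM_CBRC_LATER or KMEM_CBRC_DONT_KNOW when the move cannot
 * (yet) be proven safe, and KMEM_CBRC_NO for dnodes that must never
 * move.
 */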
static kmem_cbrc_t
dnode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	dnode_t *odn = buf, *ndn = newbuf;
	objset_t *os;
	int64_t refcount;
	uint32_t dbufs;

#ifndef USE_DNODE_HANDLE
	/*
	 * We can't move dnodes if dbufs reference them directly without
	 * using handles and respective locking. Unless USE_DNODE_HANDLE
	 * is defined the code below is only to make sure it still builds,
	 * but it should never be used, since it is unsafe.
	 */
#ifdef ZFS_DEBUG
	PANIC("dnode_move() called without USE_DNODE_HANDLE");
#endif
	return (KMEM_CBRC_NO);
#endif

	/*
	 * The dnode is on the objset's list of known dnodes if the objset
	 * pointer is valid. We set the low bit of the objset pointer when
	 * freeing the dnode to invalidate it, and the memory patterns written
	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
	 * A newly created dnode sets the objset pointer last of all to indicate
	 * that the dnode is known and in a valid state to be moved by this
	 * function.
	 */
	os = odn->dn_objset;
	if (!POINTER_IS_VALID(os)) {
		DNODE_STAT_BUMP(dnode_move_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Ensure that the objset does not go away during the move.
	 */
	rw_enter(&os_lock, RW_WRITER);
	if (os != odn->dn_objset) {
		rw_exit(&os_lock);
		DNODE_STAT_BUMP(dnode_move_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the dnode is still valid, then so is the objset.  We know that no
	 * valid objset can be freed while we hold os_lock, so we can safely
	 * ensure that the objset remains in use.
	 */
	mutex_enter(&os->os_lock);

	/*
	 * Recheck the objset pointer in case the dnode was removed just before
	 * acquiring the lock.
	 */
	if (os != odn->dn_objset) {
		mutex_exit(&os->os_lock);
		rw_exit(&os_lock);
		DNODE_STAT_BUMP(dnode_move_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold os->os_lock, the dnode
	 * cannot be freed and fields within the dnode can be safely accessed.
	 * The objset listing this dnode cannot go away as long as this dnode is
	 * on its list.
	 */
	rw_exit(&os_lock);
	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_special);
		return (KMEM_CBRC_NO);
	}
	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */

	/*
	 * Lock the dnode handle to prevent the dnode from obtaining any new
	 * holds. This also prevents the descendant dbufs and the bonus dbuf
	 * from accessing the dnode, so that we can discount their holds. The
	 * handle is safe to access because we know that while the dnode cannot
	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
	 * safely move any dnode referenced only by dbufs.
	 */
	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_handle);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
	 * We need to guarantee that there is a hold for every dbuf in order to
	 * determine whether the dnode is actively referenced. Falsely matching
	 * a dbuf to an active hold would lead to an unsafe move. It's possible
	 * that a thread already having an active dnode hold is about to add a
	 * dbuf, and we can't compare hold and dbuf counts while the add is in
	 * progress.
	 */
	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
		zrl_exit(&odn->dn_handle->dnh_zrlock);
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_rwlock);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * A dbuf may be removed (evicted) without an active dnode hold. In that
	 * case, the dbuf count is decremented under the handle lock before the
	 * dbuf's hold is released. This order ensures that if we count the hold
	 * after the dbuf is removed but before its hold is released, we will
	 * treat the unmatched hold as active and exit safely. If we count the
	 * hold before the dbuf is removed, the hold is discounted, and the
	 * removal is blocked until the move completes.
	 */
	refcount = zfs_refcount_count(&odn->dn_holds);
	ASSERT(refcount >= 0);
	dbufs = DN_DBUFS_COUNT(odn);

	/* We can't have more dbufs than dnode holds. */
	ASSERT3U(dbufs, <=, refcount);
	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
	    uint32_t, dbufs);

	if (refcount > dbufs) {
		rw_exit(&odn->dn_struct_rwlock);
		zrl_exit(&odn->dn_handle->dnh_zrlock);
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_active);
		return (KMEM_CBRC_LATER);
	}

	rw_exit(&odn->dn_struct_rwlock);

	/*
	 * At this point we know that anyone with a hold on the dnode is not
	 * actively referencing it. The dnode is known and in a valid state to
	 * move.  We're holding the locks needed to execute the critical
	 * section.
	 */
	dnode_move_impl(odn, ndn);

	list_link_replace(&odn->dn_link, &ndn->dn_link);
	/* If the dnode was safe to move, the refcount cannot have changed. */
	ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
	ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
	mutex_exit(&os->os_lock);

	return (KMEM_CBRC_YES);
}
#endif /* _KERNEL */
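
/*
 * Helpers for the per-slot zrlocks in a dnode block's dnode_children_t;
 * "idx" is the first slot and "slots" the number of consecutive slots
 * to operate on.
 */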
static void
dnode_slots_hold(dnode_children_t *children, int idx, int slots)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];
		zrl_add(&dnh->dnh_zrlock);
	}
}

static void
dnode_slots_rele(dnode_children_t *children, int idx, int slots)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];

		if (zrl_is_locked(&dnh->dnh_zrlock))
			zrl_exit(&dnh->dnh_zrlock);
		else
			zrl_remove(&dnh->dnh_zrlock);
	}
}

static int
dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];

		if (!zrl_tryenter(&dnh->dnh_zrlock)) {
			for (int j = idx; j < i; j++) {
				dnh = &children->dnc_children[j];
				zrl_exit(&dnh->dnh_zrlock);
			}

			return (0);
		}
	}

	return (1);
}

static void
dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];
		dnh->dnh_dnode = ptr;
	}
}

static boolean_t
dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	/*
	 * If all dnode slots are either already free or
	 * evictable return B_TRUE.
	 */
	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];
		dnode_t *dn = dnh->dnh_dnode;

		if (dn == DN_SLOT_FREE) {
			continue;
		} else if (DN_SLOT_IS_PTR(dn)) {
			mutex_enter(&dn->dn_mtx);
			boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
			    dn->dn_dirtycnt == 0 &&
			    zfs_refcount_is_zero(&dn->dn_holds));
			mutex_exit(&dn->dn_mtx);

			if (!can_free)
				return (B_FALSE);
			else
				continue;
		} else {
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}

static uint_t
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
	uint_t reclaimed = 0;

	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];

		ASSERT(zrl_is_locked(&dnh->dnh_zrlock));

		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
			dnode_destroy(dnh->dnh_dnode);
			dnh->dnh_dnode = DN_SLOT_FREE;
			reclaimed++;
		}
	}

	return (reclaimed);
}

void
dnode_free_interior_slots(dnode_t *dn)
{
	dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
	int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
	int idx = (dn->dn_object & (epb - 1)) + 1;
	int slots = dn->dn_num_slots - 1;

	if (slots == 0)
		return;

	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	while (!dnode_slots_tryenter(children, idx, slots)) {
		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
		kpreempt(KPREEMPT_SYNC);
	}

	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
	dnode_slots_rele(children, idx, slots);
}

void
dnode_special_close(dnode_handle_t *dnh)
{
	dnode_t *dn = dnh->dnh_dnode;

	/*
	 * Ensure dnode_rele_and_unlock() has released dn_mtx, after final
	 * zfs_refcount_remove()
	 */
	mutex_enter(&dn->dn_mtx);
	if (zfs_refcount_count(&dn->dn_holds) > 0)
		cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
	mutex_exit(&dn->dn_mtx);
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);

	ASSERT(dn->dn_dbuf == NULL ||
	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
	zrl_add(&dnh->dnh_zrlock);
	dnode_destroy(dn); /* implicit zrl_remove() */
	zrl_destroy(&dnh->dnh_zrlock);
	dnh->dnh_dnode = NULL;
}

void
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
    dnode_handle_t *dnh)
{
	dnode_t *dn;

	zrl_init(&dnh->dnh_zrlock);
	VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));

	dn = dnode_create(os, dnp, NULL, object, dnh);
	DNODE_VERIFY(dn);

	zrl_exit(&dnh->dnh_zrlock);
}

static void
dnode_buf_evict_async(void *dbu)
{
	dnode_children_t *dnc = dbu;

	DNODE_STAT_BUMP(dnode_buf_evict);

	for (int i = 0; i < dnc->dnc_count; i++) {
		dnode_handle_t *dnh = &dnc->dnc_children[i];
		dnode_t *dn;

		/*
		 * The dnode handle lock guards against the dnode moving to
		 * another valid address, so there is no need here to guard
		 * against changes to or from NULL.
		 */
		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			zrl_destroy(&dnh->dnh_zrlock);
			dnh->dnh_dnode = DN_SLOT_UNINIT;
			continue;
		}

		zrl_add(&dnh->dnh_zrlock);
		dn = dnh->dnh_dnode;
		/*
		 * If there are holds on this dnode, then there should
		 * be holds on the dnode's containing dbuf as well; thus
		 * it wouldn't be eligible for eviction and this function
		 * would not have been called.
		 */
		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));

		dnode_destroy(dn); /* implicit zrl_remove() for first slot */
		zrl_destroy(&dnh->dnh_zrlock);
		dnh->dnh_dnode = DN_SLOT_UNINIT;
	}
	kmem_free(dnc, sizeof (dnode_children_t) +
	    dnc->dnc_count * sizeof (dnode_handle_t));
}

/*
 * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
 * to ensure the hole at the specified object offset is large enough to
 * hold the dnode being created. The slots parameter is also used to ensure
 * a dnode does not span multiple dnode blocks. In both of these cases, if
 * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
 * are only possible when using DNODE_MUST_BE_FREE.
 *
 * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
 * dnode_hold_impl() will check if the requested dnode is already consumed
 * as an extra dnode slot by a large dnode, in which case it returns
 * ENOENT.
 *
 * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
 * return whether the hold would succeed or not. tag and dnp should be set
 * to NULL in this case.
 *
 * errors:
 * EINVAL - Invalid object number or flags.
 * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
 * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
 *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
 *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
 * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
 *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
 * EIO    - I/O error when reading the meta dnode dbuf.
 *
 * succeeds even for free dnodes.
 */
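
/*
 * For example, a typical hold on an allocated object looks like:
 *
 *	dnode_t *dn;
 *	int err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
 *	    FTAG, &dn);
 *	if (err == 0)
 *		dnode_rele(dn, FTAG);
 */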
int
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
    const void *tag, dnode_t **dnp)
{
	int epb, idx, err;
	int drop_struct_lock = FALSE;
	int type;
	uint64_t blk;
	dnode_t *mdn, *dn;
	dmu_buf_impl_t *db;
	dnode_children_t *dnc;
	dnode_phys_t *dn_block;
	dnode_handle_t *dnh;

	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
	IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));

	/*
	 * If you are holding the spa config lock as writer, you shouldn't
	 * be asking the DMU to do *anything* unless it's the root pool
	 * which may require us to read from the root filesystem while
	 * holding some (not all) of the locks as writer.
	 */
	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
	    (spa_is_root(os->os_spa) &&
	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));

	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));

	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
	    object == DMU_PROJECTUSED_OBJECT) {
		if (object == DMU_USERUSED_OBJECT)
			dn = DMU_USERUSED_DNODE(os);
		else if (object == DMU_GROUPUSED_OBJECT)
			dn = DMU_GROUPUSED_DNODE(os);
		else
			dn = DMU_PROJECTUSED_DNODE(os);
		if (dn == NULL)
			return (SET_ERROR(ENOENT));
		type = dn->dn_type;
		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
			return (SET_ERROR(ENOENT));
		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
			return (SET_ERROR(EEXIST));
		DNODE_VERIFY(dn);
		/* Don't actually hold if dry run, just return 0 */
		if (!(flag & DNODE_DRY_RUN)) {
			(void) zfs_refcount_add(&dn->dn_holds, tag);
			*dnp = dn;
		}
		return (0);
	}

	if (object == 0 || object >= DN_MAX_OBJECT)
		return (SET_ERROR(EINVAL));

	mdn = DMU_META_DNODE(os);
	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);

	DNODE_VERIFY(mdn);

	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
	db = dbuf_hold(mdn, blk, FTAG);
	if (drop_struct_lock)
		rw_exit(&mdn->dn_struct_rwlock);
	if (db == NULL) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
		return (SET_ERROR(EIO));
	}

	/*
	 * We do not need to decrypt to read the dnode so it doesn't matter
	 * if we get the encrypted or decrypted version.
	 */
	err = dbuf_read(db, NULL, DB_RF_CANFAIL |
	    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
	if (err) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
		dbuf_rele(db, FTAG);
		return (err);
	}

	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
	epb = db->db.db_size >> DNODE_SHIFT;

	idx = object & (epb - 1);
	dn_block = (dnode_phys_t *)db->db.db_data;

	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
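
	/*
	 * The slot status for this dnode block lives in a dnode_children_t
	 * attached to the dbuf as its user data; the first holder allocates
	 * it below and may lose the installation race to another thread.
	 */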
	dnc = dmu_buf_get_user(&db->db);
	dnh = NULL;
	if (dnc == NULL) {
		dnode_children_t *winner;
		int skip = 0;

		dnc = kmem_zalloc(sizeof (dnode_children_t) +
		    epb * sizeof (dnode_handle_t), KM_SLEEP);
		dnc->dnc_count = epb;
		dnh = &dnc->dnc_children[0];

		/* Initialize dnode slot status from dnode_phys_t */
		for (int i = 0; i < epb; i++) {
			zrl_init(&dnh[i].dnh_zrlock);

			if (skip) {
				skip--;
				continue;
			}

			if (dn_block[i].dn_type != DMU_OT_NONE) {
				int interior = dn_block[i].dn_extra_slots;

				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
				dnode_set_slots(dnc, i + 1, interior,
				    DN_SLOT_INTERIOR);
				skip = interior;
			} else {
				dnh[i].dnh_dnode = DN_SLOT_FREE;
				skip = 0;
			}
		}

		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
		    dnode_buf_evict_async, NULL);
		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
		if (winner != NULL) {

			for (int i = 0; i < epb; i++)
				zrl_destroy(&dnh[i].dnh_zrlock);

			kmem_free(dnc, sizeof (dnode_children_t) +
			    epb * sizeof (dnode_handle_t));
			dnc = winner;
		}
	}

	ASSERT(dnc->dnc_count == epb);

	if (flag & DNODE_MUST_BE_ALLOCATED) {
		slots = 1;

		dnode_slots_hold(dnc, idx, slots);
		dnh = &dnc->dnc_children[idx];

		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			dn = dnh->dnh_dnode;
		} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
			DNODE_STAT_BUMP(dnode_hold_alloc_interior);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(EEXIST));
		} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
			DNODE_STAT_BUMP(dnode_hold_alloc_misses);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOENT));
		} else {
			dnode_slots_rele(dnc, idx, slots);
			while (!dnode_slots_tryenter(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
				kpreempt(KPREEMPT_SYNC);
			}

			/*
			 * Someone else won the race and called dnode_create()
			 * after we checked DN_SLOT_IS_PTR() above but before
			 * we acquired the lock.
			 */
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
				dn = dnh->dnh_dnode;
			} else {
				dn = dnode_create(os, dn_block + idx, db,
				    object, dnh);
				dmu_buf_add_user_size(&db->db,
				    sizeof (dnode_t));
			}
		}

		mutex_enter(&dn->dn_mtx);
		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOENT));
		}

		/* Don't actually hold if dry run, just return 0 */
		if (flag & DNODE_DRY_RUN) {
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (0);
		}

		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
	} else if (flag & DNODE_MUST_BE_FREE) {

		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
			DNODE_STAT_BUMP(dnode_hold_free_overflow);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		dnode_slots_hold(dnc, idx, slots);

		if (!dnode_check_slots_free(dnc, idx, slots)) {
			DNODE_STAT_BUMP(dnode_hold_free_misses);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		dnode_slots_rele(dnc, idx, slots);
		while (!dnode_slots_tryenter(dnc, idx, slots)) {
			DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
			kpreempt(KPREEMPT_SYNC);
		}

		if (!dnode_check_slots_free(dnc, idx, slots)) {
{1659DNODE_STAT_BUMP(dnode_hold_free_lock_misses);1660dnode_slots_rele(dnc, idx, slots);1661dbuf_rele(db, FTAG);1662return (SET_ERROR(ENOSPC));1663}16641665/*1666* Allocated but otherwise free dnodes which would1667* be in the interior of a multi-slot dnodes need1668* to be freed. Single slot dnodes can be safely1669* re-purposed as a performance optimization.1670*/1671if (slots > 1) {1672uint_t reclaimed =1673dnode_reclaim_slots(dnc, idx + 1, slots - 1);1674if (reclaimed > 0)1675dmu_buf_sub_user_size(&db->db,1676reclaimed * sizeof (dnode_t));1677}16781679dnh = &dnc->dnc_children[idx];1680if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {1681dn = dnh->dnh_dnode;1682} else {1683dn = dnode_create(os, dn_block + idx, db,1684object, dnh);1685dmu_buf_add_user_size(&db->db, sizeof (dnode_t));1686}16871688mutex_enter(&dn->dn_mtx);1689if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {1690DNODE_STAT_BUMP(dnode_hold_free_refcount);1691mutex_exit(&dn->dn_mtx);1692dnode_slots_rele(dnc, idx, slots);1693dbuf_rele(db, FTAG);1694return (SET_ERROR(EEXIST));1695}16961697/* Don't actually hold if dry run, just return 0 */1698if (flag & DNODE_DRY_RUN) {1699mutex_exit(&dn->dn_mtx);1700dnode_slots_rele(dnc, idx, slots);1701dbuf_rele(db, FTAG);1702return (0);1703}17041705dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);1706DNODE_STAT_BUMP(dnode_hold_free_hits);1707} else {1708dbuf_rele(db, FTAG);1709return (SET_ERROR(EINVAL));1710}17111712ASSERT0(dn->dn_free_txg);17131714if (zfs_refcount_add(&dn->dn_holds, tag) == 1)1715dbuf_add_ref(db, dnh);17161717mutex_exit(&dn->dn_mtx);17181719/* Now we can rely on the hold to prevent the dnode from moving. */1720dnode_slots_rele(dnc, idx, slots);17211722DNODE_VERIFY(dn);1723ASSERT3P(dnp, !=, NULL);1724ASSERT3P(dn->dn_dbuf, ==, db);1725ASSERT3U(dn->dn_object, ==, object);1726dbuf_rele(db, FTAG);17271728*dnp = dn;1729return (0);1730}17311732/*1733* Return held dnode if the object is allocated, NULL if not.1734*/1735int1736dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp)1737{1738return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,1739dnp));1740}17411742/*1743* Can only add a reference if there is already at least one1744* reference on the dnode. Returns FALSE if unable to add a1745* new reference.1746*/1747static boolean_t1748dnode_add_ref_locked(dnode_t *dn, const void *tag)1749{1750ASSERT(MUTEX_HELD(&dn->dn_mtx));1751if (zfs_refcount_is_zero(&dn->dn_holds))1752return (FALSE);1753VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));1754return (TRUE);1755}17561757boolean_t1758dnode_add_ref(dnode_t *dn, const void *tag)1759{1760mutex_enter(&dn->dn_mtx);1761boolean_t r = dnode_add_ref_locked(dn, tag);1762mutex_exit(&dn->dn_mtx);1763return (r);1764}17651766void1767dnode_rele(dnode_t *dn, const void *tag)1768{1769mutex_enter(&dn->dn_mtx);1770dnode_rele_and_unlock(dn, tag, B_FALSE);1771}17721773void1774dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting)1775{1776uint64_t refs;1777/* Get while the hold prevents the dnode from moving. 
	dmu_buf_impl_t *db = dn->dn_dbuf;
	dnode_handle_t *dnh = dn->dn_handle;

	refs = zfs_refcount_remove(&dn->dn_holds, tag);
	if (refs == 0)
		cv_broadcast(&dn->dn_nodnholds);
	mutex_exit(&dn->dn_mtx);
	/* dnode could get destroyed at this point, so don't use it anymore */

	/*
	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
	 * indirectly by dbuf_rele() while relying on the dnode handle to
	 * prevent the dnode from moving, since releasing the last hold could
	 * result in the dnode's parent dbuf evicting its dnode handles. For
	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
	 * other direct or indirect hold on the dnode must first drop the dnode
	 * handle.
	 */
#ifdef ZFS_DEBUG
	ASSERT(refs > 0 || zrl_owner(&dnh->dnh_zrlock) != curthread);
#endif

	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
	if (refs == 0 && db != NULL) {
		/*
		 * Another thread could add a hold to the dnode handle in
		 * dnode_hold_impl() while holding the parent dbuf. Since the
		 * hold on the parent dbuf prevents the handle from being
		 * destroyed, the hold on the handle is OK. We can't yet assert
		 * that the handle has zero references, but that will be
		 * asserted anyway when the handle gets destroyed.
		 */
		mutex_enter(&db->db_mtx);
		dbuf_rele_and_unlock(db, dnh, evicting);
	}
}

/*
 * Test whether we can create a dnode at the specified location.
 */
int
dnode_try_claim(objset_t *os, uint64_t object, int slots)
{
	return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
	    slots, NULL, NULL));
}
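
/*
 * For example, dnode_try_claim(os, object, 2) returns 0 when a 2-slot
 * dnode could be created at "object", without actually taking a hold.
 */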

/*
 * Test if the dnode is dirty, or carrying uncommitted records.
 *
 * dn_dirtycnt is the number of txgs this dnode is dirty on.  It's incremented
 * in dnode_setdirty() the first time the dnode is dirtied on a txg, and
 * decremented in either dnode_rele_task() or userquota_updates_task() when the
 * txg is synced out.
 */
boolean_t
dnode_is_dirty(dnode_t *dn)
{
	mutex_enter(&dn->dn_mtx);
	boolean_t dirty = (dn->dn_dirtycnt != 0);
	mutex_exit(&dn->dn_mtx);
	return (dirty);
}

void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;

	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		dsl_dataset_dirty(os->os_dsl_dataset, tx);
		return;
	}

	DNODE_VERIFY(dn);

#ifdef ZFS_DEBUG
	mutex_enter(&dn->dn_mtx);
	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
	mutex_exit(&dn->dn_mtx);
#endif

	/*
	 * Determine old uid/gid when necessary
	 */
	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);

	multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK];
	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);

	/*
	 * If we are already marked dirty, we're done.
	 */
	if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
		multilist_sublist_unlock(mls);
		return;
	}

	ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
	    !avl_is_empty(&dn->dn_dbufs));
	ASSERT(dn->dn_datablksz != 0);
	ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]);
	ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]);
	ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]);

	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
	    (u_longlong_t)dn->dn_object, (u_longlong_t)txg);

	multilist_sublist_insert_head(mls, dn);

	multilist_sublist_unlock(mls);

	/*
	 * The dnode maintains a hold on its containing dbuf as
	 * long as there are holds on it.  Each instantiated child
	 * dbuf maintains a hold on the dnode.  When the last child
	 * drops its hold, the dnode will drop its hold on the
	 * containing dbuf.  We add a "dirty hold" here so that the
	 * dnode will hang around after we finish processing its
	 * children.
	 */
	mutex_enter(&dn->dn_mtx);
	VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg));
	dn->dn_dirtycnt++;
	ASSERT3U(dn->dn_dirtycnt, <=, 3);
	mutex_exit(&dn->dn_mtx);

	(void) dbuf_dirty(dn->dn_dbuf, tx);

	dsl_dataset_dirty(os->os_dsl_dataset, tx);
}

void
dnode_free(dnode_t *dn, dmu_tx_t *tx)
{
	mutex_enter(&dn->dn_mtx);
	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
		mutex_exit(&dn->dn_mtx);
		return;
	}
	dn->dn_free_txg = tx->tx_txg;
	mutex_exit(&dn->dn_mtx);

	dnode_setdirty(dn, tx);
}
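
/*
 * The dn_next_* and dn_dirty_link arrays touched above are indexed by
 * txg & TXG_MASK: each of the TXG_SIZE transaction groups that can be
 * in flight at once gets its own slot, recycled when that txg syncs.
 * For example, with TXG_SIZE == 4 (TXG_MASK == 3), txg 1034 uses slot
 * 1034 & 3 == 2, which is not reused before txg 1038.
 */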

/*
 * Try to change the block size for the indicated dnode.  This can only
 * succeed if there are no blocks allocated or dirty beyond the first block.
 */
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	int err;

	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	if (size == 0)
		size = SPA_MINBLOCKSIZE;
	else
		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);

	if (ibs == dn->dn_indblkshift)
		ibs = 0;

	if (size == dn->dn_datablksz && ibs == 0)
		return (0);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	/* Check for any allocated blocks beyond the first */
	if (dn->dn_maxblkid != 0)
		goto fail;

	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = avl_first(&dn->dn_dbufs); db != NULL;
	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
		    db->db_blkid != DMU_SPILL_BLKID) {
			mutex_exit(&dn->dn_dbufs_mtx);
			goto fail;
		}
	}
	mutex_exit(&dn->dn_dbufs_mtx);

	if (ibs && dn->dn_nlevels != 1)
		goto fail;

	dnode_setdirty(dn, tx);
	if (size != dn->dn_datablksz) {
		/* resize the old block */
		err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
		if (err == 0) {
			dbuf_new_size(db, size, tx);
		} else if (err != ENOENT) {
			goto fail;
		}

		dnode_setdblksz(dn, size);
		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
		if (db)
			dbuf_rele(db, FTAG);
	}
	if (ibs) {
		dn->dn_indblkshift = ibs;
		dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
	}

	rw_exit(&dn->dn_struct_rwlock);
	return (0);

fail:
	rw_exit(&dn->dn_struct_rwlock);
	return (SET_ERROR(ENOTSUP));
}

static void
dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
{
	uint64_t txgoff = tx->tx_txg & TXG_MASK;
	int old_nlevels = dn->dn_nlevels;
	dmu_buf_impl_t *db;
	list_t *list;
	dbuf_dirty_record_t *new, *dr, *dr_next;

	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT3U(new_nlevels, >, dn->dn_nlevels);
	dn->dn_nlevels = new_nlevels;

	ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
	dn->dn_next_nlevels[txgoff] = new_nlevels;

	/* dirty the left indirects */
	db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
	ASSERT(db != NULL);
	new = dbuf_dirty(db, tx);
	dbuf_rele(db, FTAG);

	/* transfer the dirty records to the new indirect */
	mutex_enter(&dn->dn_mtx);
	mutex_enter(&new->dt.di.dr_mtx);
	list = &dn->dn_dirty_records[txgoff];
	for (dr = list_head(list); dr; dr = dr_next) {
		dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);

		IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1);
		if (dr->dr_dbuf == NULL ||
		    (dr->dr_dbuf->db_level == old_nlevels - 1 &&
		    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) {
			list_remove(&dn->dn_dirty_records[txgoff], dr);
			list_insert_tail(&new->dt.di.dr_children, dr);
			dr->dr_parent = new;
		}
	}
	mutex_exit(&new->dt.di.dr_mtx);
	mutex_exit(&dn->dn_mtx);
}

int
dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
{
	int ret = 0;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	if (dn->dn_nlevels == nlevels) {
		ret = 0;
		goto out;
	} else if (nlevels < dn->dn_nlevels) {
		ret = SET_ERROR(EINVAL);
		goto out;
	}

	dnode_set_nlevels_impl(dn, nlevels, tx);

out:
	rw_exit(&dn->dn_struct_rwlock);
	return (ret);
}
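
/*
 * A concrete example of the dnode_set_blksz() rules above (illustrative
 * numbers): a file that has only ever written into block 0 still has
 * dn_maxblkid == 0, so its block size may be grown, e.g. from 4K to
 * 128K.  Once a second block exists, or an indirect block shift change
 * is requested while dn_nlevels != 1, the call fails with ENOTSUP and
 * the old geometry is kept.
 */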

/* read-holding callers must not rely on the lock being continuously held */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
    boolean_t force)
{
	int epbs, new_nlevels;
	uint64_t sz;

	ASSERT(blkid != DMU_BONUS_BLKID);

	ASSERT(have_read ?
	    RW_READ_HELD(&dn->dn_struct_rwlock) :
	    RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * if we have a read-lock, check to see if we need to do any work
	 * before upgrading to a write-lock.
	 */
	if (have_read) {
		if (blkid <= dn->dn_maxblkid)
			return;

		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
			rw_exit(&dn->dn_struct_rwlock);
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		}
	}

	/*
	 * Raw sends (indicated by the force flag) require that we take the
	 * given blkid even if the value is lower than the current value.
	 */
	if (!force && blkid <= dn->dn_maxblkid)
		goto out;

	/*
	 * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
	 * to indicate that this field is set.  This allows us to set the
	 * maxblkid to 0 on an existing object in dnode_sync().
	 */
	dn->dn_maxblkid = blkid;
	dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
	    blkid | DMU_NEXT_MAXBLKID_SET;

	/*
	 * Compute the number of levels necessary to support the new maxblkid.
	 * Raw sends will ensure nlevels is set correctly for us.
	 */
	new_nlevels = 1;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	for (sz = dn->dn_nblkptr;
	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
		new_nlevels++;

	ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);

	if (!force) {
		if (new_nlevels > dn->dn_nlevels)
			dnode_set_nlevels_impl(dn, new_nlevels, tx);
	} else {
		ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
	}

out:
	if (have_read)
		rw_downgrade(&dn->dn_struct_rwlock);
}

static void
dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
	if (db != NULL) {
		dmu_buf_will_dirty(&db->db, tx);
		dbuf_rele(db, FTAG);
	}
}
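
/*
 * Worked example of the nlevels computation in dnode_new_blkid() above
 * (illustrative numbers): with a 128K indirect block, epbs ==
 * 17 - SPA_BLKPTRSHIFT == 10, i.e. 1024 block pointers per indirect
 * block.  With dn_nblkptr == 3, one level covers blkids 0-2, two levels
 * cover 3 << 10 == 3072 blkids, and three levels 3 << 20.  For
 * blkid == 5000 the loop steps sz through 3, 3072, 3145728 and stops at
 * new_nlevels == 3, since 3072 <= 5000 < 3145728.
 */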

/*
 * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
 * and end_blkid.
 */
static void
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db_search;
	dmu_buf_impl_t *db;
	avl_index_t where;

	db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);

	mutex_enter(&dn->dn_dbufs_mtx);

	db_search->db_level = 1;
	db_search->db_blkid = start_blkid + 1;
	db_search->db_state = DB_SEARCH;
	for (;;) {

		db = avl_find(&dn->dn_dbufs, db_search, &where);
		if (db == NULL)
			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

		if (db == NULL || db->db_level != 1 ||
		    db->db_blkid >= end_blkid) {
			break;
		}

		/*
		 * Set up the next blkid we want to search for.
		 */
		db_search->db_blkid = db->db_blkid + 1;
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/*
		 * If the dbuf transitions to DB_EVICTING while we're trying
		 * to dirty it, then we will be unable to discover it in
		 * the dbuf hash table.  This will result in a call to
		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
		 * lock.  To avoid a deadlock, we drop the lock before
		 * dirtying the level-1 dbuf.
		 */
		mutex_exit(&dn->dn_dbufs_mtx);
		dnode_dirty_l1(dn, db->db_blkid, tx);
		mutex_enter(&dn->dn_dbufs_mtx);
	}

#ifdef ZFS_DEBUG
	/*
	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
	 */
	db_search->db_level = 1;
	db_search->db_blkid = start_blkid + 1;
	db_search->db_state = DB_SEARCH;
	db = avl_find(&dn->dn_dbufs, db_search, &where);
	if (db == NULL)
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_level != 1 || db->db_blkid >= end_blkid)
			break;
		if (db->db_state != DB_EVICTING)
			ASSERT(db->db_dirtycnt > 0);
	}
#endif
	kmem_free(db_search, sizeof (dmu_buf_impl_t));
	mutex_exit(&dn->dn_dbufs_mtx);
}

static void
dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	int res;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE,
	    FTAG, &db);
	rw_exit(&dn->dn_struct_rwlock);
	if (res == 0) {
		db_lock_type_t dblt;
		boolean_t dirty;

		dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
		/* don't dirty if not on disk and not dirty */
		dirty = !list_is_empty(&db->db_dirty_records) ||
		    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
		dmu_buf_unlock_parent(db, dblt, FTAG);
		if (dirty) {
			caddr_t data;

			dmu_buf_will_dirty(&db->db, tx);
			data = db->db.db_data;
			memset(data + blkoff, 0, len);
		}
		dbuf_rele(db, FTAG);
	}
}
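
/*
 * Worked example of the head/tail alignment in dnode_free_range() below
 * (illustrative numbers): with blksz == 128K (131072), a call with
 * off == 100000 and len == 200000 zeroes the head, P2NPHASE(100000,
 * 131072) == 31072 bytes at the end of block 0; zeroes the tail,
 * P2PHASE(168928, 131072) == 37856 bytes at the start of block 2; and
 * frees only the fully covered block 1 through the range tree.
 */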

void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
	uint64_t blkoff, blkid, nblks;
	int blksz, blkshift, head, tail;
	int trunc = FALSE;
	int epbs;

	blksz = dn->dn_datablksz;
	blkshift = dn->dn_datablkshift;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	if (len == DMU_OBJECT_END) {
		len = UINT64_MAX - off;
		trunc = TRUE;
	}

	/*
	 * First, block align the region to free:
	 */
	if (ISP2(blksz)) {
		head = P2NPHASE(off, blksz);
		blkoff = P2PHASE(off, blksz);
		if ((off >> blkshift) > dn->dn_maxblkid)
			return;
	} else {
		ASSERT0(dn->dn_maxblkid);
		if (off == 0 && len >= blksz) {
			/*
			 * Freeing the whole block; fast-track this request.
			 */
			blkid = 0;
			nblks = 1;
			if (dn->dn_nlevels > 1) {
				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
				dnode_dirty_l1(dn, 0, tx);
				rw_exit(&dn->dn_struct_rwlock);
			}
			goto done;
		} else if (off >= blksz) {
			/* Freeing past end-of-data */
			return;
		} else {
			/* Freeing part of the block. */
			head = blksz - off;
			ASSERT3U(head, >, 0);
		}
		blkoff = off;
	}
	/* zero out any partial block data at the start of the range */
	if (head) {
		ASSERT3U(blkoff + head, ==, blksz);
		if (len < head)
			head = len;
		dnode_partial_zero(dn, off, blkoff, head, tx);
		off += head;
		len -= head;
	}

	/* If the range was less than one block, we're done */
	if (len == 0)
		return;

	/* If the remaining range is past end of file, we're done */
	if ((off >> blkshift) > dn->dn_maxblkid)
		return;

	ASSERT(ISP2(blksz));
	if (trunc)
		tail = 0;
	else
		tail = P2PHASE(len, blksz);

	ASSERT0(P2PHASE(off, blksz));
	/* zero out any partial block data at the end of the range */
	if (tail) {
		if (len < tail)
			tail = len;
		dnode_partial_zero(dn, off + len, 0, tail, tx);
		len -= tail;
	}

	/* If the range did not include a full block, we are done */
	if (len == 0)
		return;

	ASSERT(IS_P2ALIGNED(off, blksz));
	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
	blkid = off >> blkshift;
	nblks = len >> blkshift;
	if (trunc)
		nblks += 1;

	/*
	 * Dirty all the indirect blocks in this range.  Note that only
	 * the first and last indirect blocks can actually be written
	 * (if they were partially freed) -- they must be dirtied, even if
	 * they do not exist on disk yet.  The interior blocks will
	 * be freed by free_children(), so they will not actually be written.
	 * Even though these interior blocks will not be written, we
	 * dirty them for two reasons:
	 *
	 *  - It ensures that the indirect blocks remain in memory until
	 *    syncing context.  (They have already been prefetched by
	 *    dmu_tx_hold_free(), so we don't have to worry about reading
	 *    them serially here.)
	 *
	 *  - The dirty space accounting will put pressure on the txg sync
	 *    mechanism to begin syncing, and to delay transactions if there
	 *    is a large amount of freeing.  Even though these indirect
	 *    blocks will not be written, we could need to write the same
	 *    amount of space if we copy the freed BPs into deadlists.
	 */
	if (dn->dn_nlevels > 1) {
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		uint64_t first, last;

		first = blkid >> epbs;
		dnode_dirty_l1(dn, first, tx);
		if (trunc)
			last = dn->dn_maxblkid >> epbs;
		else
			last = (blkid + nblks - 1) >> epbs;
		if (last != first)
			dnode_dirty_l1(dn, last, tx);

		dnode_dirty_l1range(dn, first, last, tx);

		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		for (uint64_t i = first + 1; i < last; i++) {
			/*
			 * Set i to the blockid of the next non-hole
			 * level-1 indirect block at or after i.  Note
			 * that dnode_next_offset() operates in terms of
			 * level-0-equivalent bytes.
			 */
			uint64_t ibyte = i << shift;
			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
			    &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (i >= last)
				break;

			/*
			 * Normally we should not see an error, either
			 * from dnode_next_offset() or dbuf_hold_level()
			 * (except for ESRCH from dnode_next_offset).
			 * If there is an i/o error, then when we read
			 * this block in syncing context, it will use
			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
			 * to the "failmode" property.  dnode_next_offset()
			 * doesn't have a flag to indicate MUSTSUCCEED.
			 */
			if (err != 0)
				break;

			dnode_dirty_l1(dn, i, tx);
		}
		rw_exit(&dn->dn_struct_rwlock);
	}

done:
	/*
	 * Add this range to the dnode range list.
	 * We will finish up this free operation in the syncing phase.
	 */
	mutex_enter(&dn->dn_mtx);
	{
		int txgoff = tx->tx_txg & TXG_MASK;
		if (dn->dn_free_ranges[txgoff] == NULL) {
			dn->dn_free_ranges[txgoff] =
			    zfs_range_tree_create_flags(
			    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
			    ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
		}
		zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
		zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
	}
	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
	    (u_longlong_t)blkid, (u_longlong_t)nblks,
	    (u_longlong_t)tx->tx_txg);
	mutex_exit(&dn->dn_mtx);

	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
	dnode_setdirty(dn, tx);
}
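
/*
 * Illustrative use of the DMU_OBJECT_END handling above: a truncation
 * of the object's data to 'size' bytes can be expressed as
 *
 *	dnode_free_range(dn, size, DMU_OBJECT_END, tx);
 *
 * which sets trunc, frees every block from 'size' through the end of
 * the object, and zero-fills the remainder of the block containing the
 * cut point.
 */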

static boolean_t
dnode_spill_freed(dnode_t *dn)
{
	int i;

	mutex_enter(&dn->dn_mtx);
	for (i = 0; i < TXG_SIZE; i++) {
		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
			break;
	}
	mutex_exit(&dn->dn_mtx);
	return (i < TXG_SIZE);
}

/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
	int i;

	if (blkid == DMU_BONUS_BLKID)
		return (FALSE);

	if (dn->dn_free_txg)
		return (TRUE);

	if (blkid == DMU_SPILL_BLKID)
		return (dnode_spill_freed(dn));

	mutex_enter(&dn->dn_mtx);
	for (i = 0; i < TXG_SIZE; i++) {
		if (dn->dn_free_ranges[i] != NULL &&
		    zfs_range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
			break;
	}
	mutex_exit(&dn->dn_mtx);
	return (i < TXG_SIZE);
}

/* call from syncing context when we actually write/free space for this dnode */
void
dnode_diduse_space(dnode_t *dn, int64_t delta)
{
	uint64_t space;
	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
	    dn, dn->dn_phys,
	    (u_longlong_t)dn->dn_phys->dn_used,
	    (longlong_t)delta);

	mutex_enter(&dn->dn_mtx);
	space = DN_USED_BYTES(dn->dn_phys);
	if (delta > 0) {
		ASSERT3U(space + delta, >=, space); /* no overflow */
	} else {
		ASSERT3U(space, >=, -delta); /* no underflow */
	}
	space += delta;
	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
		ASSERT0((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES));
		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
	} else {
		dn->dn_phys->dn_used = space;
		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
	}
	mutex_exit(&dn->dn_mtx);
}
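
/*
 * Example of the two encodings above (illustrative numbers): on a pool
 * older than SPA_VERSION_DNODE_BYTES, a charge of 1536 bytes is stored
 * as dn_used == 3 in 512-byte sectors (DEV_BSHIFT == 9); on newer
 * pools it is stored as dn_used == 1536 with DNODE_FLAG_USED_BYTES set,
 * which is the flag DN_USED_BYTES() checks when converting back.
 */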

/*
 * Scans a block at the indicated "level" looking for a hole or data,
 * depending on 'flags'.
 *
 * If level > 0, then we are scanning an indirect block looking at its
 * pointers.  If level == 0, then we are looking at a block of dnodes.
 *
 * If we don't find what we are looking for in the block, we return ESRCH.
 * Otherwise, return with *offset pointing to the beginning (if searching
 * forwards) or end (if searching backwards) of the range covered by the
 * block pointer we matched on (or dnode).
 *
 * The basic search algorithm used below by dnode_next_offset() is to
 * use this function to search up the block tree (widen the search) until
 * we find something (i.e., we don't return ESRCH) and then search back
 * down the tree (narrow the search) until we reach our original search
 * level.
 */
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
    int lvl, uint64_t blkfill, uint64_t txg)
{
	dmu_buf_impl_t *db = NULL;
	void *data = NULL;
	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t epb = 1ULL << epbs;
	uint64_t minfill, maxfill;
	boolean_t hole;
	int i, inc, error, span;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	hole = ((flags & DNODE_FIND_HOLE) != 0);
	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
	ASSERT(txg == 0 || !hole);

	if (lvl == dn->dn_phys->dn_nlevels) {
		error = 0;
		epb = dn->dn_phys->dn_nblkptr;
		data = dn->dn_phys->dn_blkptr;
		if (dn->dn_dbuf != NULL)
			rw_enter(&dn->dn_dbuf->db_rwlock, RW_READER);
		else if (dmu_objset_ds(dn->dn_objset) != NULL)
			rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
			    RW_READER, FTAG);
	} else {
		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
		if (error) {
			if (error != ENOENT)
				return (error);
			if (hole)
				return (0);
			/*
			 * This can only happen when we are searching up
			 * the block tree for data.  We don't really need to
			 * adjust the offset, as we will just end up looking
			 * at the pointer to this block in its parent, and
			 * it's going to be unallocated, so we will skip over
			 * it.
			 */
			return (SET_ERROR(ESRCH));
		}
		error = dbuf_read(db, NULL,
		    DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
		    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
		if (error) {
			dbuf_rele(db, FTAG);
			return (error);
		}
		data = db->db.db_data;
		rw_enter(&db->db_rwlock, RW_READER);
	}

	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
	    BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
	    BP_IS_HOLE(db->db_blkptr))) {
		/*
		 * This can only happen when we are searching up the tree
		 * and these conditions mean that we need to keep climbing.
		 */
		error = SET_ERROR(ESRCH);
	} else if (lvl == 0) {
		dnode_phys_t *dnp = data;

		ASSERT(dn->dn_type == DMU_OT_DNODE);
		ASSERT(!(flags & DNODE_FIND_BACKWARDS));

		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
				break;
		}

		if (i == blkfill)
			error = SET_ERROR(ESRCH);

		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
		    (i << DNODE_SHIFT);
	} else {
		blkptr_t *bp = data;
		uint64_t start = *offset;
		span = (lvl - 1) * epbs + dn->dn_datablkshift;
		minfill = 0;
		maxfill = blkfill << ((lvl - 1) * epbs);

		if (hole)
			maxfill--;
		else
			minfill++;

		if (span >= 8 * sizeof (*offset)) {
			/* This only happens on the highest indirection level */
			ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
			*offset = 0;
		} else {
			*offset = *offset >> span;
		}

		for (i = BF64_GET(*offset, 0, epbs);
		    i >= 0 && i < epb; i += inc) {
			if (BP_GET_FILL(&bp[i]) >= minfill &&
			    BP_GET_FILL(&bp[i]) <= maxfill &&
			    (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
				break;
			if (inc > 0 || *offset > 0)
				*offset += inc;
		}

		if (span >= 8 * sizeof (*offset)) {
			*offset = start;
		} else {
			*offset = *offset << span;
		}

		if (inc < 0) {
			/* traversing backwards; position offset at the end */
			if (span < 8 * sizeof (*offset))
				*offset = MIN(*offset + (1ULL << span) - 1,
				    start);
		} else if (*offset < start) {
			*offset = start;
		}
		if (i < 0 || i >= epb)
			error = SET_ERROR(ESRCH);
	}

	if (db != NULL) {
		rw_exit(&db->db_rwlock);
		dbuf_rele(db, FTAG);
	} else {
		if (dn->dn_dbuf != NULL)
			rw_exit(&dn->dn_dbuf->db_rwlock);
		else if (dmu_objset_ds(dn->dn_objset) != NULL)
			rrw_exit(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
			    FTAG);
	}

	return (error);
}
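
/*
 * Worked example of the span arithmetic above (illustrative numbers):
 * with dn_datablkshift == 17 (128K data blocks) and epbs == 10, a scan
 * at lvl == 1 uses span == 17, so each block pointer examined covers
 * 2^17 bytes of file data; at lvl == 2, span == 27 and each pointer
 * covers 1024 data blocks.  *offset is shifted down by span to index
 * into the block and shifted back up to report the matched position.
 */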

/*
 * Adjust *offset to the next (or previous) block byte offset at lvl.
 * Returns FALSE if *offset would overflow or underflow.
 */
static boolean_t
dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
{
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	int span = lvl * epbs + dn->dn_datablkshift;
	uint64_t blkid, maxblkid;

	if (span >= 8 * sizeof (uint64_t))
		return (B_FALSE);

	blkid = *offset >> span;
	maxblkid = 1ULL << (8 * sizeof (*offset) - span);
	if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
		*offset = (blkid + 1) << span;
	else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
		*offset = (blkid << span) - 1;
	else
		return (B_FALSE);

	return (B_TRUE);
}
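
/*
 * For example (illustrative numbers): at lvl == 0 with 128K data
 * blocks, span == 17, so a forward step moves *offset from anywhere in
 * block N to exactly (N + 1) << 17, and a backward step to the last
 * byte of block N - 1.  The maxblkid bound keeps the adjusted offset
 * from wrapping around the top of the 64-bit offset space.
 */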

/*
 * Find the next hole, data, or sparse region at or after *offset.
 * The value 'blkfill' tells us how many items we expect to find
 * in an L0 data block; this value is 1 for normal objects,
 * DNODES_PER_BLOCK for the meta dnode, and some fraction of
 * DNODES_PER_BLOCK when searching for sparse regions thereof.
 *
 * Examples:
 *
 * dnode_next_offset(dn, flags, offset, 1, 1, 0);
 *	Finds the next/previous hole/data in a file.
 *	Used in dmu_offset_next().
 *
 * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
 *	Finds the next free/allocated dnode in an objset's meta-dnode.
 *	Only finds objects that have new contents since txg (i.e.
 *	bonus buffer changes and content removal are ignored).
 *	Used in dmu_object_next().
 *
 * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
 *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
 *	Used in dmu_object_alloc().
 */
int
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
    int minlvl, uint64_t blkfill, uint64_t txg)
{
	uint64_t matched = *offset;
	int lvl, maxlvl;
	int error = 0;

	if (!(flags & DNODE_FIND_HAVELOCK))
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	if (dn->dn_phys->dn_nlevels == 0) {
		error = SET_ERROR(ESRCH);
		goto out;
	}

	if (dn->dn_datablkshift == 0) {
		if (*offset < dn->dn_datablksz) {
			if (flags & DNODE_FIND_HOLE)
				*offset = dn->dn_datablksz;
		} else {
			error = SET_ERROR(ESRCH);
		}
		goto out;
	}

	maxlvl = dn->dn_phys->dn_nlevels;

	for (lvl = minlvl; lvl <= maxlvl; ) {
		error = dnode_next_offset_level(dn,
		    flags, offset, lvl, blkfill, txg);
		if (error == 0 && lvl > minlvl) {
			--lvl;
			matched = *offset;
		} else if (error == ESRCH && lvl < maxlvl &&
		    dnode_next_block(dn, flags, &matched, lvl)) {
			/*
			 * Continue search at next/prev offset in lvl+1 block.
			 *
			 * Usually we only search upwards at the start of the
			 * search as higher level blocks point at a matching
			 * minlvl block in most cases, but we backtrack if not.
			 *
			 * This can happen for txg > 0 searches if the block
			 * contains only BPs/dnodes freed at that txg.  It also
			 * happens if we are still syncing out the tree, and
			 * some BP's at higher levels are not updated yet.
			 *
			 * We must adjust offset to avoid coming back to the
			 * same offset and getting stuck looping forever.  This
			 * also deals with the case where offset is already at
			 * the beginning or end of the object.
			 */
			++lvl;
			*offset = matched;
		} else {
			break;
		}
	}

	/*
	 * There's always a "virtual hole" at the end of the object, even
	 * if all BP's which physically exist are non-holes.
	 */
	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
		error = 0;
	}

out:
	if (!(flags & DNODE_FIND_HAVELOCK))
		rw_exit(&dn->dn_struct_rwlock);

	return (error);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dnode_hold);
EXPORT_SYMBOL(dnode_rele);
EXPORT_SYMBOL(dnode_set_nlevels);
EXPORT_SYMBOL(dnode_set_blksz);
EXPORT_SYMBOL(dnode_free_range);
EXPORT_SYMBOL(dnode_evict_dbufs);
EXPORT_SYMBOL(dnode_evict_bonus);
#endif

ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW,
	"Default dnode block shift");
ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW,
	"Default dnode indirect block shift");