Path: blob/main/sys/contrib/openzfs/module/zfs/dnode_sync.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_recv.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/range_tree.h>
#include <sys/zfeature.h>

static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	int txgoff = tx->tx_txg & TXG_MASK;
	int nblkptr = dn->dn_phys->dn_nblkptr;
	int old_toplvl = dn->dn_phys->dn_nlevels - 1;
	int new_level = dn->dn_next_nlevels[txgoff];
	int i;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	/* this dnode can't be paged out because it's dirty */
	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
	ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);

	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
	ASSERT(db != NULL);

	dn->dn_phys->dn_nlevels = new_level;
	dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
	    (u_longlong_t)dn->dn_object, dn->dn_phys->dn_nlevels);

	/*
	 * Lock ordering requires that we hold the children's db_mutexes (by
	 * calling dbuf_find()) before holding the parent's db_rwlock. The lock
	 * order is imposed by dbuf_read's steps of "grab the lock to protect
	 * db_parent, get db_parent, hold db_parent's db_rwlock".
	 */
	dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
	ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
	for (i = 0; i < nblkptr; i++) {
		children[i] = dbuf_find(dn->dn_objset, dn->dn_object,
		    old_toplvl, i, NULL);
	}

	/* transfer dnode's block pointers to new indirect block */
	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
	if (dn->dn_dbuf != NULL)
		rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
	rw_enter(&db->db_rwlock, RW_WRITER);
	ASSERT(db->db.db_data);
	ASSERT(arc_released(db->db_buf));
	ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
	memcpy(db->db.db_data, dn->dn_phys->dn_blkptr,
	    sizeof (blkptr_t) * nblkptr);
	arc_buf_freeze(db->db_buf);

	/* set dbuf's parent pointers to new indirect buf */
	for (i = 0; i < nblkptr; i++) {
		dmu_buf_impl_t *child = children[i];

		if (child == NULL)
			continue;
#ifdef ZFS_DEBUG
		DB_DNODE_ENTER(child);
		ASSERT3P(DB_DNODE(child), ==, dn);
		DB_DNODE_EXIT(child);
#endif	/* DEBUG */
		if (child->db_parent && child->db_parent != dn->dn_dbuf) {
			ASSERT(child->db_parent->db_level == db->db_level);
			ASSERT(child->db_blkptr !=
			    &dn->dn_phys->dn_blkptr[child->db_blkid]);
			mutex_exit(&child->db_mtx);
			continue;
		}
		ASSERT(child->db_parent == NULL ||
		    child->db_parent == dn->dn_dbuf);

		child->db_parent = db;
		dbuf_add_ref(db, child);
		if (db->db.db_data)
			child->db_blkptr = (blkptr_t *)db->db.db_data + i;
		else
			child->db_blkptr = NULL;
		dprintf_dbuf_bp(child, child->db_blkptr,
		    "changed db_blkptr to new indirect %s", "");

		mutex_exit(&child->db_mtx);
	}

	memset(dn->dn_phys->dn_blkptr, 0, sizeof (blkptr_t) * nblkptr);

	rw_exit(&db->db_rwlock);
	if (dn->dn_dbuf != NULL)
		rw_exit(&dn->dn_dbuf->db_rwlock);

	dbuf_rele(db, FTAG);

	rw_exit(&dn->dn_struct_rwlock);
}
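
/*
 * Illustrative example (an added sketch, not from the code above): an
 * object whose dnode embeds nblkptr = 3 block pointers and grows from
 * nlevels = 1 to nlevels = 2.  dnode_increase_indirection() copies
 * dn_blkptr[0..2] into a newly held level-1 indirect dbuf, reparents any
 * cached level-0 dbufs onto it, and zeroes the dnode's embedded copies;
 * when that indirect block is later written out, its own block pointer
 * lands in dn_blkptr[0].
 */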

static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	uint64_t bytesfreed = 0;

	dprintf("ds=%p obj=%llx num=%d\n", ds, (u_longlong_t)dn->dn_object,
	    num);

	for (int i = 0; i < num; i++, bp++) {
		if (BP_IS_HOLE(bp))
			continue;

		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));

		/*
		 * Save some useful information on the holes being
		 * punched, including logical size, type, and indirection
		 * level. Retaining birth time enables detection of when
		 * holes are punched for reducing the number of free
		 * records transmitted during a zfs send.
		 */

		uint64_t lsize = BP_GET_LSIZE(bp);
		dmu_object_type_t type = BP_GET_TYPE(bp);
		uint64_t lvl = BP_GET_LEVEL(bp);

		memset(bp, 0, sizeof (blkptr_t));

		if (spa_feature_is_active(dn->dn_objset->os_spa,
		    SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, type);
			BP_SET_LEVEL(bp, lvl);
			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
		}
	}
	dnode_diduse_space(dn, -bytesfreed);
}
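
/*
 * For example (illustrative): with the hole_birth feature active, an
 * incremental "zfs send -i A B" can compare a punched hole's birth txg
 * against snapshot A and skip emitting FREE records for holes that
 * already existed in A, instead of retransmitting every hole in the
 * object.
 */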

#ifdef ZFS_DEBUG
static void
free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	uint64_t off, num, i, j;
	unsigned int epbs;
	int err;
	uint64_t txg = tx->tx_txg;
	dnode_t *dn;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	off = start - (db->db_blkid << epbs);
	num = end - start + 1;

	ASSERT3U(dn->dn_phys->dn_indblkshift, >=, SPA_BLKPTRSHIFT);
	ASSERT3U(end + 1, >=, start);
	ASSERT3U(start, >=, (db->db_blkid << epbs));
	ASSERT3U(db->db_level, >, 0);
	ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
	ASSERT(db->db_blkptr != NULL);

	for (i = off; i < off+num; i++) {
		uint64_t *buf;
		dmu_buf_impl_t *child;
		dbuf_dirty_record_t *dr;

		ASSERT(db->db_level == 1);

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		err = dbuf_hold_impl(dn, db->db_level - 1,
		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
		rw_exit(&dn->dn_struct_rwlock);
		if (err == ENOENT)
			continue;
		ASSERT0(err);
		ASSERT0(child->db_level);
		dr = dbuf_find_dirty_eq(child, txg);

		/* data_old better be zeroed */
		if (dr) {
			buf = dr->dt.dl.dr_data->b_data;
			for (j = 0; j < child->db.db_size >> 3; j++) {
				if (buf[j] != 0) {
					panic("freed data not zero: "
					    "child=%p i=%llu off=%llu "
					    "num=%llu\n",
					    (void *)child, (u_longlong_t)i,
					    (u_longlong_t)off,
					    (u_longlong_t)num);
				}
			}
		}

		/*
		 * db_data better be zeroed unless it's dirty in a
		 * future txg.
		 */
		mutex_enter(&child->db_mtx);
		buf = child->db.db_data;
		if (buf != NULL && child->db_state != DB_FILL &&
		    list_is_empty(&child->db_dirty_records)) {
			for (j = 0; j < child->db.db_size >> 3; j++) {
				if (buf[j] != 0) {
					panic("freed data not zero: "
					    "child=%p i=%llu off=%llu "
					    "num=%llu\n",
					    (void *)child, (u_longlong_t)i,
					    (u_longlong_t)off,
					    (u_longlong_t)num);
				}
			}
		}
		mutex_exit(&child->db_mtx);

		dbuf_rele(child, FTAG);
	}
	DB_DNODE_EXIT(db);
}
#endif

/*
 * We don't usually free the indirect blocks here. If in one txg we have a
 * free_range and a write to the same indirect block, it's important that we
 * preserve the hole's birth times. Therefore, we don't free any indirect
 * blocks in free_children(). If an indirect block happens to turn into all
 * holes, it will be freed by dbuf_write_children_ready, which happens at a
 * point in the syncing process where we know for certain the contents of the
 * indirect block.
 *
 * However, if we're freeing a dnode, its space accounting must go to zero
 * before we actually try to free the dnode, or we will trip an assertion. In
 * addition, we know the case described above cannot occur, because the dnode
 * is being freed. Therefore, we free the indirect blocks immediately in that
 * case.
 */
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
    boolean_t free_indirects, dmu_tx_t *tx)
{
	dnode_t *dn;
	blkptr_t *bp;
	dmu_buf_impl_t *subdb;
	uint64_t start, end, dbstart, dbend;
	unsigned int epbs, shift, i;

	/*
	 * There is a small possibility that this block will not be cached:
	 *   1 - if level > 1 and there are no children with level <= 1
	 *   2 - if this block was evicted since we read it from
	 *	 dmu_tx_hold_free().
	 */
	if (db->db_state != DB_CACHED)
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);

	/*
	 * If we modify this indirect block, and we are not freeing the
	 * dnode (!free_indirects), then this indirect block needs to get
	 * written to disk by dbuf_write().  If it is dirty, we know it will
	 * be written (otherwise, we would have incorrect on-disk state
	 * because the space would be freed but still referenced by the BP
	 * in this indirect block).  Therefore we VERIFY that it is
	 * dirty.
	 *
	 * Our VERIFY covers some cases that do not actually have to be
	 * dirty, but the open-context code happens to dirty.  E.g. if the
	 * blocks we are freeing are all holes, because in that case, we
	 * are only freeing part of this indirect block, so it is an
	 * ancestor of the first or last block to be freed.  The first and
	 * last L1 indirect blocks are always dirtied by dnode_free_range().
	 */
	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
	VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
	dmu_buf_unlock_parent(db, dblt, FTAG);

	dbuf_release_bp(db);
	bp = db->db.db_data;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	ASSERT3U(epbs, <, 31);
	shift = (db->db_level - 1) * epbs;
	dbstart = db->db_blkid << epbs;
	start = blkid >> shift;
	if (dbstart < start) {
		bp += start - dbstart;
	} else {
		start = dbstart;
	}
	dbend = ((db->db_blkid + 1) << epbs) - 1;
	end = (blkid + nblks - 1) >> shift;
	if (dbend <= end)
		end = dbend;

	ASSERT3U(start, <=, end);
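
	/*
	 * Worked example (illustrative): with 128K indirect blocks,
	 * epbs = 17 - SPA_BLKPTRSHIFT = 10, so each indirect block maps
	 * 1024 children.  For a level-1 dbuf with db_blkid = 2, shift is
	 * 0 and [dbstart, dbend] is [2048, 3071]; freeing L0 blkids
	 * 2100..2199 clamps [start, end] to [2100, 2199] and advances bp
	 * 52 entries into this block's array of block pointers.
	 */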

	if (db->db_level == 1) {
		FREE_VERIFY(db, start, end, tx);
		rw_enter(&db->db_rwlock, RW_WRITER);
		free_blocks(dn, bp, end - start + 1, tx);
		rw_exit(&db->db_rwlock);
	} else {
		for (uint64_t id = start; id <= end; id++, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
			    id, TRUE, FALSE, FTAG, &subdb));
			rw_exit(&dn->dn_struct_rwlock);
			ASSERT3P(bp, ==, subdb->db_blkptr);

			free_children(subdb, blkid, nblks, free_indirects, tx);
			dbuf_rele(subdb, FTAG);
		}
	}

	if (free_indirects) {
		rw_enter(&db->db_rwlock, RW_WRITER);
		for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
			ASSERT(BP_IS_HOLE(bp));
		memset(db->db.db_data, 0, db->db.db_size);
		free_blocks(dn, db->db_blkptr, 1, tx);
		rw_exit(&db->db_rwlock);
	}

	DB_DNODE_EXIT(db);
	arc_buf_freeze(db->db_buf);
}

/*
 * Traverse the indicated range of the provided file
 * and "free" all the blocks contained there.
 */
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
    boolean_t free_indirects, dmu_tx_t *tx)
{
	blkptr_t *bp = dn->dn_phys->dn_blkptr;
	int dnlevel = dn->dn_phys->dn_nlevels;
	boolean_t trunc = B_FALSE;

	if (blkid > dn->dn_phys->dn_maxblkid)
		return;

	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
	if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
		trunc = B_TRUE;
	}
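
	/*
	 * For example (illustrative): freeing blkid = 10 with
	 * nblks = 1000 on an object whose on-disk maxblkid is 99 clamps
	 * nblks to 99 - 10 + 1 = 90 and marks this free as a truncation.
	 */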

	/* There are no indirect blocks in the object */
	if (dnlevel == 1) {
		if (blkid >= dn->dn_phys->dn_nblkptr) {
			/* this range was never made persistent */
			return;
		}
		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
		free_blocks(dn, bp + blkid, nblks, tx);
	} else {
		int shift = (dnlevel - 1) *
		    (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
		int start = blkid >> shift;
		int end = (blkid + nblks - 1) >> shift;
		dmu_buf_impl_t *db;

		ASSERT(start < dn->dn_phys->dn_nblkptr);
		bp += start;
		for (int i = start; i <= end; i++, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
			    TRUE, FALSE, FTAG, &db));
			rw_exit(&dn->dn_struct_rwlock);
			free_children(db, blkid, nblks, free_indirects, tx);
			dbuf_rele(db, FTAG);
		}
	}

	/*
	 * Do not truncate the maxblkid if we are performing a raw
	 * receive. The raw receive sets the maxblkid manually and
	 * must not be overridden. Usually, the last DRR_FREE record
	 * will be at the maxblkid, because the source system sets
	 * the maxblkid when truncating. However, if the last block
	 * was freed by overwriting with zeros and being compressed
	 * away to a hole, the source system will generate a DRR_FREE
	 * record while leaving the maxblkid after the end of that
	 * record. In this case we need to leave the maxblkid as
	 * indicated in the DRR_OBJECT record, so that it matches the
	 * source system, ensuring that the cryptographic hashes will
	 * match.
	 */
	if (trunc && !dn->dn_objset->os_raw_receive) {
		uint64_t off __maybe_unused;
		dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;

		off = (dn->dn_phys->dn_maxblkid + 1) *
		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT(off < dn->dn_phys->dn_maxblkid ||
		    dn->dn_phys->dn_maxblkid == 0 ||
		    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
	}
}

typedef struct dnode_sync_free_range_arg {
	dnode_t		*dsfra_dnode;
	dmu_tx_t	*dsfra_tx;
	boolean_t	dsfra_free_indirects;
} dnode_sync_free_range_arg_t;

static void
dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
{
	dnode_sync_free_range_arg_t *dsfra = arg;
	dnode_t *dn = dsfra->dsfra_dnode;

	mutex_exit(&dn->dn_mtx);
	dnode_sync_free_range_impl(dn, blkid, nblks,
	    dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
	mutex_enter(&dn->dn_mtx);
}

/*
 * Try to kick all the dnode's dbufs out of the cache...
 */
void
dnode_evict_dbufs(dnode_t *dn)
{
	dmu_buf_impl_t *db_marker;
	dmu_buf_impl_t *db, *db_next;

	db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);

	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {

#ifdef ZFS_DEBUG
		DB_DNODE_ENTER(db);
		ASSERT3P(DB_DNODE(db), ==, dn);
		DB_DNODE_EXIT(db);
#endif	/* DEBUG */

		mutex_enter(&db->db_mtx);
		if (db->db_state != DB_EVICTING &&
		    zfs_refcount_is_zero(&db->db_holds)) {
			db_marker->db_level = db->db_level;
			db_marker->db_blkid = db->db_blkid;
			/*
			 * Insert a MARKER node with the same level and blkid.
			 * And to resolve any ties in dbuf_compare() use the
			 * pointer of the dbuf that we are evicting.  Pass the
			 * address in db_parent.
			 */
			db_marker->db_state = DB_MARKER;
			db_marker->db_parent = (void *)((uintptr_t)db - 1);
			avl_insert_here(&dn->dn_dbufs, db_marker, db,
			    AVL_BEFORE);
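
			/*
			 * Illustrative note: dbuf_compare() orders dbufs by
			 * (db_level, db_blkid) and, per the comment above,
			 * breaks ties for markers using the pointer stashed
			 * in db_parent, so (uintptr_t)db - 1 sorts the
			 * marker immediately before the dbuf being evicted.
			 */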

			/*
			 * We need to use the "marker" dbuf rather than
			 * simply getting the next dbuf, because
			 * dbuf_destroy() may actually remove multiple dbufs.
			 * It can call itself recursively on the parent dbuf,
			 * which may also be removed from dn_dbufs.  The code
			 * flow would look like:
			 *
			 * dbuf_destroy():
			 *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
			 *	if (!cacheable || pending_evict)
			 *	  dbuf_destroy()
			 */
			dbuf_destroy(db);

			db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
			avl_remove(&dn->dn_dbufs, db_marker);
		} else {
			db->db_pending_evict = TRUE;
			db->db_partial_read = FALSE;
			mutex_exit(&db->db_mtx);
			db_next = AVL_NEXT(&dn->dn_dbufs, db);
		}
	}
	mutex_exit(&dn->dn_dbufs_mtx);

	kmem_free(db_marker, sizeof (dmu_buf_impl_t));

	dnode_evict_bonus(dn);
}

void
dnode_evict_bonus(dnode_t *dn)
{
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_bonus != NULL) {
		if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) {
			mutex_enter(&dn->dn_bonus->db_mtx);
			dbuf_destroy(dn->dn_bonus);
			dn->dn_bonus = NULL;
		} else {
			dn->dn_bonus->db_pending_evict = TRUE;
		}
	}
	rw_exit(&dn->dn_struct_rwlock);
}

static void
dnode_undirty_dbufs(list_t *list)
{
	dbuf_dirty_record_t *dr;

	while ((dr = list_head(list))) {
		dmu_buf_impl_t *db = dr->dr_dbuf;
		uint64_t txg = dr->dr_txg;

		if (db->db_level != 0)
			dnode_undirty_dbufs(&dr->dt.di.dr_children);

		mutex_enter(&db->db_mtx);
		/* XXX - use dbuf_undirty()? */
		list_remove(list, dr);
		ASSERT(list_head(&db->db_dirty_records) == dr);
		list_remove_head(&db->db_dirty_records);
		ASSERT(list_is_empty(&db->db_dirty_records));
		db->db_dirtycnt -= 1;
		if (db->db_level == 0) {
			ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
			    dr->dt.dl.dr_data == db->db_buf);
			dbuf_unoverride(dr);
		} else {
			mutex_destroy(&dr->dt.di.dr_mtx);
			list_destroy(&dr->dt.di.dr_children);
		}
		kmem_cache_free(dbuf_dirty_kmem_cache, dr);
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
	}
}
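
/*
 * Illustrative note: the (void *)(uintptr_t)txg tag passed to
 * dbuf_rele_and_unlock() above matches the hold that dbuf_dirty() takes
 * on a dbuf when its dirty record is created, so undirtying releases
 * exactly that reference.
 */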

static void
dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
{
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * Our contents should have been freed in dnode_sync() by the
	 * free range record inserted by the caller of dnode_free().
	 */
	ASSERT0(DN_USED_BYTES(dn->dn_phys));
	ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));

	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
	dnode_evict_dbufs(dn);

	/*
	 * XXX - It would be nice to assert this, but we may still
	 * have residual holds from async evictions from the arc...
	 *
	 * zfs_obj_to_path() also depends on this being
	 * commented out.
	 *
	 * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1);
	 */

	/* Undirty next bits */
	dn->dn_next_nlevels[txgoff] = 0;
	dn->dn_next_indblkshift[txgoff] = 0;
	dn->dn_next_blksz[txgoff] = 0;
	dn->dn_next_maxblkid[txgoff] = 0;

	/* ASSERT(blkptrs are zero); */
	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
	ASSERT(dn->dn_type != DMU_OT_NONE);

	ASSERT(dn->dn_free_txg > 0);
	if (dn->dn_allocated_txg != dn->dn_free_txg)
		dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
	memset(dn->dn_phys, 0, sizeof (dnode_phys_t) * dn->dn_num_slots);
	dnode_free_interior_slots(dn);

	mutex_enter(&dn->dn_mtx);
	dn->dn_type = DMU_OT_NONE;
	dn->dn_maxblkid = 0;
	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_have_spill = B_FALSE;
	dn->dn_num_slots = 1;
	mutex_exit(&dn->dn_mtx);

	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);

	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
	/*
	 * Now that we've released our hold, the dnode may
	 * be evicted, so we mustn't access it.
	 */
}

/*
 * Write out the dnode's dirty buffers.
 * Does not wait for zio completions.
 */
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	dnode_phys_t *dnp = dn->dn_phys;
	int txgoff = tx->tx_txg & TXG_MASK;
	list_t *list = &dn->dn_dirty_records[txgoff];
	static const dnode_phys_t zerodn __maybe_unused = { 0 };
	boolean_t kill_spill = B_FALSE;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
	ASSERT(dnp->dn_type != DMU_OT_NONE ||
	    memcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
	DNODE_VERIFY(dn);

	ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));

	/*
	 * Do user accounting if it is enabled and this is not
	 * an encrypted receive.
	 */
	if (dmu_objset_userused_enabled(os) &&
	    !DMU_OBJECT_IS_SPECIAL(dn->dn_object) &&
	    (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
		dn->dn_oldflags = dn->dn_phys->dn_flags;
		dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
		if (dmu_objset_userobjused_enabled(dn->dn_objset))
			dn->dn_phys->dn_flags |=
			    DNODE_FLAG_USEROBJUSED_ACCOUNTED;
		mutex_exit(&dn->dn_mtx);
		dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
	} else if (!(os->os_encrypted && dmu_objset_is_receiving(os))) {
		/*
		 * Once we account for it, we should always account for it,
		 * except for the case of a raw receive.  We will not be able
		 * to account for it until the receiving dataset has been
		 * mounted.
		 */
		ASSERT(!(dn->dn_phys->dn_flags &
		    DNODE_FLAG_USERUSED_ACCOUNTED));
		ASSERT(!(dn->dn_phys->dn_flags &
		    DNODE_FLAG_USEROBJUSED_ACCOUNTED));
	}

	mutex_enter(&dn->dn_mtx);
	if (dn->dn_allocated_txg == tx->tx_txg) {
		/* The dnode is newly allocated or reallocated */
		if (dnp->dn_type == DMU_OT_NONE) {
			/* this is a first alloc, not a realloc */
			dnp->dn_nlevels = 1;
			dnp->dn_nblkptr = dn->dn_nblkptr;
		}

		dnp->dn_type = dn->dn_type;
		dnp->dn_bonustype = dn->dn_bonustype;
		dnp->dn_bonuslen = dn->dn_bonuslen;
	}

	dnp->dn_extra_slots = dn->dn_num_slots - 1;

	ASSERT(dnp->dn_nlevels > 1 ||
	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
	    BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	ASSERT(dnp->dn_nlevels < 2 ||
	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
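
	/*
	 * For example (illustrative): a one-level object with 128K data
	 * blocks has dn_datablkszsec = 256 (512-byte sectors), so the
	 * assertions above require BP_GET_LSIZE(&dnp->dn_blkptr[0]) to be
	 * 256 << SPA_MINBLOCKSHIFT = 128K unless the BP is a hole or
	 * embedded.
	 */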

	if (dn->dn_next_type[txgoff] != 0) {
		dnp->dn_type = dn->dn_type;
		dn->dn_next_type[txgoff] = 0;
	}

	if (dn->dn_next_blksz[txgoff] != 0) {
		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
		    SPA_MINBLOCKSIZE) == 0);
		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
		    dnp->dn_datablkszsec ||
		    !zfs_range_tree_is_empty(dn->dn_free_ranges[txgoff]));
		dnp->dn_datablkszsec =
		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
		dn->dn_next_blksz[txgoff] = 0;
	}

	if (dn->dn_next_bonuslen[txgoff] != 0) {
		if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
			dnp->dn_bonuslen = 0;
		else
			dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
		ASSERT(dnp->dn_bonuslen <=
		    DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
		dn->dn_next_bonuslen[txgoff] = 0;
	}

	if (dn->dn_next_bonustype[txgoff] != 0) {
		ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
		dn->dn_next_bonustype[txgoff] = 0;
	}

	boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
	    dn->dn_free_txg <= tx->tx_txg;

	/*
	 * Remove the spill block if we have been explicitly asked to
	 * remove it, or if the object is being removed.
	 */
	if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
			kill_spill = B_TRUE;
		dn->dn_rm_spillblk[txgoff] = 0;
	}

	if (dn->dn_next_indblkshift[txgoff] != 0) {
		ASSERT(dnp->dn_nlevels == 1);
		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
		dn->dn_next_indblkshift[txgoff] = 0;
	}

	/*
	 * Just take the live (open-context) values for checksum and compress.
	 * Strictly speaking it's a future leak, but nothing bad happens if we
	 * start using the new checksum or compress algorithm a little early.
	 */
	dnp->dn_checksum = dn->dn_checksum;
	dnp->dn_compress = dn->dn_compress;

	mutex_exit(&dn->dn_mtx);

	if (kill_spill) {
		free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
		mutex_enter(&dn->dn_mtx);
		dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/* process all the "freed" ranges in the file */
	if (dn->dn_free_ranges[txgoff] != NULL) {
		dnode_sync_free_range_arg_t dsfra;
		dsfra.dsfra_dnode = dn;
		dsfra.dsfra_tx = tx;
		dsfra.dsfra_free_indirects = freeing_dnode;
		mutex_enter(&dn->dn_mtx);
		if (freeing_dnode) {
			ASSERT(zfs_range_tree_contains(
			    dn->dn_free_ranges[txgoff], 0,
			    dn->dn_maxblkid + 1));
		}
		/*
		 * Because dnode_sync_free_range() must drop dn_mtx during its
		 * processing, using it as a callback to zfs_range_tree_vacate()
		 * is not safe.  No other operations (besides destroy) are
		 * allowed once zfs_range_tree_vacate() has begun, and dropping
		 * dn_mtx would leave a window open for another thread to
		 * observe that invalid (and unsafe) state.
		 */
		zfs_range_tree_walk(dn->dn_free_ranges[txgoff],
		    dnode_sync_free_range, &dsfra);
		zfs_range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL);
		zfs_range_tree_destroy(dn->dn_free_ranges[txgoff]);
		dn->dn_free_ranges[txgoff] = NULL;
		mutex_exit(&dn->dn_mtx);
	}

	if (freeing_dnode) {
		dn->dn_objset->os_freed_dnodes++;
		dnode_sync_free(dn, tx);
		return;
	}

	if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
		mutex_enter(&ds->ds_lock);
		ds->ds_feature_activation[SPA_FEATURE_LARGE_DNODE] =
		    (void *)B_TRUE;
		mutex_exit(&ds->ds_lock);
	}

	if (dn->dn_next_nlevels[txgoff]) {
		dnode_increase_indirection(dn, tx);
		dn->dn_next_nlevels[txgoff] = 0;
	}

	/*
	 * This must be done after dnode_sync_free_range()
	 * and dnode_increase_indirection().  See dnode_new_blkid()
	 * for an explanation of the high bit being set.
	 */
	if (dn->dn_next_maxblkid[txgoff]) {
		mutex_enter(&dn->dn_mtx);
		dnp->dn_maxblkid =
		    dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET;
		dn->dn_next_maxblkid[txgoff] = 0;
		mutex_exit(&dn->dn_mtx);
	}
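
	/*
	 * Illustrative note: DMU_NEXT_MAXBLKID_SET is presumably OR'd in
	 * so that an update to maxblkid is distinguishable from "not set
	 * this txg" even when the new maxblkid is 0; masking it off above
	 * recovers the real block id.
	 */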

	if (dn->dn_next_nblkptr[txgoff]) {
		/* this should only happen on a realloc */
		ASSERT(dn->dn_allocated_txg == tx->tx_txg);
		if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
			/* zero the new blkptrs we are gaining */
			memset(dnp->dn_blkptr + dnp->dn_nblkptr, 0,
			    sizeof (blkptr_t) *
			    (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
#ifdef ZFS_DEBUG
		} else {
			int i;
			ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
			/* the blkptrs we are losing better be unallocated */
			for (i = 0; i < dnp->dn_nblkptr; i++) {
				if (i >= dn->dn_next_nblkptr[txgoff])
					ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
			}
#endif
		}
		mutex_enter(&dn->dn_mtx);
		dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
		dn->dn_next_nblkptr[txgoff] = 0;
		mutex_exit(&dn->dn_mtx);
	}

	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);

	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		ASSERT0P(list_head(list));
		dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
	}

	ASSERT3U(dnp->dn_bonuslen, <=, DN_MAX_BONUS_LEN(dnp));

	/*
	 * Although we have dropped our reference to the dnode, it
	 * can't be evicted until it's written, and we haven't yet
	 * initiated the IO for the dnode's dbuf.  Additionally, the caller
	 * has already added a reference to the dnode because it's on the
	 * os_synced_dnodes list.
	 */
}