Path: blob/main/sys/contrib/openzfs/module/zfs/dbuf.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2019, Klara Inc.
 * Copyright (c) 2019, Allan Jude
 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
 */

#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>
#include <sys/trace_zfs.h>
#include <sys/callb.h>
#include <sys/abd.h>
#include <sys/brt.h>
#include <sys/vdev.h>
#include <cityhash.h>
#include <sys/spa_impl.h>
#include <sys/wmsum.h>
#include <sys/vdev_impl.h>

static kstat_t *dbuf_ksp;

typedef struct dbuf_stats {
	/*
	 * Various statistics about the size of the dbuf cache.
	 */
	kstat_named_t cache_count;
	kstat_named_t cache_size_bytes;
	kstat_named_t cache_size_bytes_max;
	/*
	 * Statistics regarding the bounds on the dbuf cache size.
	 */
	kstat_named_t cache_target_bytes;
	kstat_named_t cache_lowater_bytes;
	kstat_named_t cache_hiwater_bytes;
	/*
	 * Total number of dbuf cache evictions that have occurred.
	 */
	kstat_named_t cache_total_evicts;
	/*
	 * The distribution of dbuf levels in the dbuf cache and
	 * the total size of all dbufs at each level.
	 */
	kstat_named_t cache_levels[DN_MAX_LEVELS];
	kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
	/*
	 * Statistics about the dbuf hash table.
	 */
	kstat_named_t hash_hits;
	kstat_named_t hash_misses;
	kstat_named_t hash_collisions;
	kstat_named_t hash_elements;
	/*
	 * Number of sublists containing more than one dbuf in the dbuf
	 * hash table.
	 * Keep track of the longest hash chain.
	 */
	kstat_named_t hash_chains;
	kstat_named_t hash_chain_max;
	/*
	 * Number of times a dbuf_create() discovers that a dbuf was
	 * already created and in the dbuf hash table.
	 */
	kstat_named_t hash_insert_race;
	/*
	 * Number of entries in the hash table dbuf and mutex arrays.
	 */
	kstat_named_t hash_table_count;
	kstat_named_t hash_mutex_count;
	/*
	 * Statistics about the size of the metadata dbuf cache.
	 */
	kstat_named_t metadata_cache_count;
	kstat_named_t metadata_cache_size_bytes;
	kstat_named_t metadata_cache_size_bytes_max;
	/*
	 * For diagnostic purposes, this is incremented whenever we can't add
	 * something to the metadata cache because it's full, and instead put
	 * the data in the regular dbuf cache.
	 */
	kstat_named_t metadata_cache_overflow;
} dbuf_stats_t;

dbuf_stats_t dbuf_stats = {
	{ "cache_count",			KSTAT_DATA_UINT64 },
	{ "cache_size_bytes",			KSTAT_DATA_UINT64 },
	{ "cache_size_bytes_max",		KSTAT_DATA_UINT64 },
	{ "cache_target_bytes",			KSTAT_DATA_UINT64 },
	{ "cache_lowater_bytes",		KSTAT_DATA_UINT64 },
	{ "cache_hiwater_bytes",		KSTAT_DATA_UINT64 },
	{ "cache_total_evicts",			KSTAT_DATA_UINT64 },
	{ { "cache_levels_N",			KSTAT_DATA_UINT64 } },
	{ { "cache_levels_bytes_N",		KSTAT_DATA_UINT64 } },
	{ "hash_hits",				KSTAT_DATA_UINT64 },
	{ "hash_misses",			KSTAT_DATA_UINT64 },
	{ "hash_collisions",			KSTAT_DATA_UINT64 },
	{ "hash_elements",			KSTAT_DATA_UINT64 },
	{ "hash_chains",			KSTAT_DATA_UINT64 },
	{ "hash_chain_max",			KSTAT_DATA_UINT64 },
	{ "hash_insert_race",			KSTAT_DATA_UINT64 },
	{ "hash_table_count",			KSTAT_DATA_UINT64 },
	{ "hash_mutex_count",			KSTAT_DATA_UINT64 },
	{ "metadata_cache_count",		KSTAT_DATA_UINT64 },
	{ "metadata_cache_size_bytes",		KSTAT_DATA_UINT64 },
	{ "metadata_cache_size_bytes_max",	KSTAT_DATA_UINT64 },
	{ "metadata_cache_overflow",		KSTAT_DATA_UINT64 }
};

struct {
	wmsum_t cache_count;
	wmsum_t cache_total_evicts;
	wmsum_t cache_levels[DN_MAX_LEVELS];
	wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
	wmsum_t hash_hits;
	wmsum_t hash_misses;
	wmsum_t hash_collisions;
	wmsum_t hash_elements;
	wmsum_t hash_chains;
	wmsum_t hash_insert_race;
	wmsum_t metadata_cache_count;
	wmsum_t metadata_cache_overflow;
} dbuf_sums;

#define	DBUF_STAT_INCR(stat, val)	\
	wmsum_add(&dbuf_sums.stat, val)
#define	DBUF_STAT_DECR(stat, val)	\
	DBUF_STAT_INCR(stat, -(val))
#define	DBUF_STAT_BUMP(stat)		\
	DBUF_STAT_INCR(stat, 1)
#define	DBUF_STAT_BUMPDOWN(stat)	\
	DBUF_STAT_INCR(stat, -1)
#define	DBUF_STAT_MAX(stat, v) {					\
	uint64_t _m;							\
	while ((v) > (_m = dbuf_stats.stat.value.ui64) &&		\
	    (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
		continue;						\
}

static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_kmem_cache;
kmem_cache_t *dbuf_dirty_kmem_cache;
static taskq_t *dbu_evict_taskq;

static kthread_t *dbuf_cache_evict_thread;
static kmutex_t dbuf_evict_lock;
static kcondvar_t dbuf_evict_cv;
static boolean_t dbuf_evict_thread_exit;

/*
 * There are two dbuf caches; each dbuf can only be in one of them at a time.
 *
 * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
 *    from /sbin/zfs run faster.
The "metadata cache" specifically stores dbufs197* that represent the metadata that describes filesystems/snapshots/198* bookmarks/properties/etc. We only evict from this cache when we export a199* pool, to short-circuit as much I/O as possible for all administrative200* commands that need the metadata. There is no eviction policy for this201* cache, because we try to only include types in it which would occupy a202* very small amount of space per object but create a large impact on the203* performance of these commands. Instead, after it reaches a maximum size204* (which should only happen on very small memory systems with a very large205* number of filesystem objects), we stop taking new dbufs into the206* metadata cache, instead putting them in the normal dbuf cache.207*208* 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that209* are not currently held but have been recently released. These dbufs210* are not eligible for arc eviction until they are aged out of the cache.211* Dbufs that are aged out of the cache will be immediately destroyed and212* become eligible for arc eviction.213*214* Dbufs are added to these caches once the last hold is released. If a dbuf is215* later accessed and still exists in the dbuf cache, then it will be removed216* from the cache and later re-added to the head of the cache.217*218* If a given dbuf meets the requirements for the metadata cache, it will go219* there, otherwise it will be considered for the generic LRU dbuf cache. The220* caches and the refcounts tracking their sizes are stored in an array indexed221* by those caches' matching enum values (from dbuf_cached_state_t).222*/223typedef struct dbuf_cache {224multilist_t cache;225zfs_refcount_t size ____cacheline_aligned;226} dbuf_cache_t;227dbuf_cache_t dbuf_caches[DB_CACHE_MAX];228229/* Size limits for the caches */230static uint64_t dbuf_cache_max_bytes = UINT64_MAX;231static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;232233/* Set the default sizes of the caches to log2 fraction of arc size */234static uint_t dbuf_cache_shift = 5;235static uint_t dbuf_metadata_cache_shift = 6;236237/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */238static uint_t dbuf_mutex_cache_shift = 0;239240static unsigned long dbuf_cache_target_bytes(void);241static unsigned long dbuf_metadata_cache_target_bytes(void);242243/*244* The LRU dbuf cache uses a three-stage eviction policy:245* - A low water marker designates when the dbuf eviction thread246* should stop evicting from the dbuf cache.247* - When we reach the maximum size (aka mid water mark), we248* signal the eviction thread to run.249* - The high water mark indicates when the eviction thread250* is unable to keep up with the incoming load and eviction must251* happen in the context of the calling thread.252*253* The dbuf cache:254* (max size)255* low water mid water hi water256* +----------------------------------------+----------+----------+257* | | | |258* | | | |259* | | | |260* | | | |261* +----------------------------------------+----------+----------+262* stop signal evict263* evicting eviction directly264* thread265*266* The high and low water marks indicate the operating range for the eviction267* thread. The low water mark is, by default, 90% of the total size of the268* cache and the high water mark is at 110% (both of these percentages can be269* changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,270* respectively). 
 * The eviction thread will try to ensure that the cache remains
 * within this range by waking up every second and checking if the cache is
 * above the low water mark. The thread can also be woken up by callers adding
 * elements into the cache if the cache is larger than the mid water (i.e max
 * cache size). Once the eviction thread is woken up and eviction is required,
 * it will continue evicting buffers until it's able to reduce the cache size
 * to the low water mark. If the cache size continues to grow and hits the high
 * water mark, then callers adding elements to the cache will begin to evict
 * directly from the cache until the cache is no longer above the high water
 * mark.
 */

/*
 * The percentage above and below the maximum cache size.
 */
static uint_t dbuf_cache_hiwater_pct = 10;
static uint_t dbuf_cache_lowater_pct = 10;

static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	(void) unused, (void) kmflag;
	dmu_buf_impl_t *db = vdb;
	memset(db, 0, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
	rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	multilist_link_init(&db->db_cache_link);
	zfs_refcount_create(&db->db_holds);

	return (0);
}

static void
dbuf_dest(void *vdb, void *unused)
{
	(void) unused;
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	rw_destroy(&db->db_rwlock);
	cv_destroy(&db->db_changed);
	ASSERT(!multilist_link_active(&db->db_cache_link));
	zfs_refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

/*
 * We use Cityhash for this. It's fast, and has good hash properties without
 * requiring any large static buffers.
 */
static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
}

#define	DTRACE_SET_STATE(db, why) \
	DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db,	\
	    const char *, why)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
    uint64_t *hash_out)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv;
	uint64_t idx;
	dmu_buf_impl_t *db;

	hv = dbuf_hash(os, obj, level, blkid);
	idx = hv & h->hash_table_mask;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	if (hash_out != NULL)
		*hash_out = hv;
	return (NULL);
}

static dmu_buf_impl_t *
dbuf_find_bonus(objset_t *os, uint64_t object)
{
	dnode_t *dn;
	dmu_buf_impl_t *db = NULL;

	if (dnode_hold(os, object, FTAG, &dn) == 0) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		if (dn->dn_bonus != NULL) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
		}
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);
	}
	return (db);
}

/*
 * Insert an entry into the hash table.
 * If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid, idx;
	dmu_buf_impl_t *dbf;
	uint32_t i;

	blkid = db->db_blkid;
	ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
	idx = db->db_hash & h->hash_table_mask;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
	    dbf = dbf->db_hash_next, i++) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	if (i > 0) {
		DBUF_STAT_BUMP(hash_collisions);
		if (i == 1)
			DBUF_STAT_BUMP(hash_chains);

		DBUF_STAT_MAX(hash_chain_max, i);
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	DBUF_STAT_BUMP(hash_elements);

	return (NULL);
}

/*
 * This returns whether this dbuf should be stored in the metadata cache, which
 * is based on whether it's from one of the dnode types that store data related
 * to traversing dataset hierarchies.
 */
static boolean_t
dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
{
	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	dmu_object_type_t type = dn->dn_storage_type;
	if (type == DMU_OT_NONE)
		type = dn->dn_type;
	DB_DNODE_EXIT(db);

	/* Check if this dbuf is one of the types we care about */
	if (DMU_OT_IS_METADATA_CACHED(type)) {
		/* If we hit this, then we set something up wrong in dmu_ot */
		ASSERT(DMU_OT_IS_METADATA(type));

		/*
		 * Sanity check for small-memory systems: don't allocate too
		 * much memory for this purpose.
		 */
		if (zfs_refcount_count(
		    &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
		    dbuf_metadata_cache_target_bytes()) {
			DBUF_STAT_BUMP(metadata_cache_overflow);
			return (B_FALSE);
		}

		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Remove an entry from the hash table.
 * It must be in the EVICTING state.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t idx;
	dmu_buf_impl_t *dbf, **dbp;

	ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
	    db->db_blkid), ==, db->db_hash);
	idx = db->db_hash & h->hash_table_mask;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(zfs_refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	if (h->hash_table[idx] &&
	    h->hash_table[idx]->db_hash_next == NULL)
		DBUF_STAT_BUMPDOWN(hash_chains);
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	DBUF_STAT_BUMPDOWN(hash_elements);
}

typedef enum {
	DBVU_EVICTING,
	DBVU_NOT_EVICTING
} dbvu_verify_type_t;

static void
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
{
#ifdef ZFS_DEBUG
	int64_t holds;

	if (db->db_user == NULL)
		return;

	/* Only data blocks support the attachment of user data. */
	ASSERT0(db->db_level);

	/* Clients must resolve a dbuf before attaching user data. */
	ASSERT(db->db.db_data != NULL);
	ASSERT3U(db->db_state, ==, DB_CACHED);

	holds = zfs_refcount_count(&db->db_holds);
	if (verify_type == DBVU_EVICTING) {
		/*
		 * Immediate eviction occurs when holds == dirtycnt.
		 * For normal eviction buffers, holds is zero on
		 * eviction, except when dbuf_fix_old_data() calls
		 * dbuf_clear_data(). However, the hold count can grow
		 * during eviction even though db_mtx is held (see
		 * dmu_bonus_hold() for an example), so we can only
		 * test the generic invariant that holds >= dirtycnt.
		 */
		ASSERT3U(holds, >=, db->db_dirtycnt);
	} else {
		if (db->db_user_immediate_evict == TRUE)
			ASSERT3U(holds, >=, db->db_dirtycnt);
		else
			ASSERT3U(holds, >, 0);
	}
#endif
}

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	dmu_buf_user_t *dbu = db->db_user;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dbu == NULL)
		return;

	dbuf_verify_user(db, DBVU_EVICTING);
	db->db_user = NULL;

#ifdef ZFS_DEBUG
	if (dbu->dbu_clear_on_evict_dbufp != NULL)
		*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

	if (db->db_caching_status != DB_NO_CACHE) {
		/*
		 * This is a cached dbuf, so the size of the user data is
		 * included in its cached amount. We adjust it here because the
		 * user data has already been detached from the dbuf, and the
		 * sync functions are not supposed to touch it (the dbuf might
		 * not exist anymore by the time the sync functions run).
		 */
		uint64_t size = dbu->dbu_size;
		(void) zfs_refcount_remove_many(
		    &dbuf_caches[db->db_caching_status].size, size, dbu);
		if (db->db_caching_status == DB_DBUF_CACHE)
			DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
	}

	/*
	 * There are two eviction callbacks - one that we call synchronously
	 * and one that we invoke via a taskq. The async one is useful for
	 * avoiding lock order reversals and limiting stack depth.
	 *
	 * Note that if we have a sync callback but no async callback,
	 * it's likely that the sync callback will free the structure
	 * containing the dbu.
	 * In that case we need to take care to not
	 * dereference dbu after calling the sync evict func.
	 */
	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);

	if (dbu->dbu_evict_func_sync != NULL)
		dbu->dbu_evict_func_sync(dbu);

	if (has_async) {
		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
		    dbu, 0, &dbu->dbu_tqent);
	}
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	/*
	 * Consider indirect blocks and spill blocks to be meta data.
	 */
	if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

/*
 * We want to exclude buffers that are on a special allocation class from
 * L2ARC.
 */
boolean_t
dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)
{
	if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
	    (db->db_objset->os_secondary_cache ==
	    ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
		if (l2arc_exclude_special == 0)
			return (B_TRUE);

		/*
		 * bp must be checked in the event it was passed from
		 * dbuf_read_impl() as the result of the BP being set from
		 * a Direct I/O write in dbuf_read(). See comments in
		 * dbuf_read().
		 */
		blkptr_t *db_bp = bp == NULL ? db->db_blkptr : bp;

		if (db_bp == NULL || BP_IS_HOLE(db_bp))
			return (B_FALSE);
		uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);
		vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
		vdev_t *vd = NULL;

		if (vdev < rvd->vdev_children)
			vd = rvd->vdev_child[vdev];

		if (vd == NULL)
			return (B_TRUE);

		if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
		    vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
			return (B_TRUE);
	}
	return (B_FALSE);
}

static inline boolean_t
dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
{
	if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
	    (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
	    (level > 0 ||
	    DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
		if (l2arc_exclude_special == 0)
			return (B_TRUE);

		if (bp == NULL || BP_IS_HOLE(bp))
			return (B_FALSE);
		uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
		vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
		vdev_t *vd = NULL;

		if (vdev < rvd->vdev_children)
			vd = rvd->vdev_child[vdev];

		if (vd == NULL)
			return (B_TRUE);

		if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
		    vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
			return (B_TRUE);
	}
	return (B_FALSE);
}


/*
 * This function *must* return indices evenly distributed between all
 * sublists of the multilist. This is needed due to how the dbuf eviction
 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
 * distributed between all sublists and uses this assumption when
 * deciding which sublist to evict from and how much to evict from it.
 */
static unsigned int
dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
{
	dmu_buf_impl_t *db = obj;

	/*
	 * The assumption here, is the hash value for a given
	 * dmu_buf_impl_t will remain constant throughout its lifetime
	 * (i.e. its objset, object, level and blkid fields don't change).
	 * Thus, we don't need to store the dbuf's sublist index
	 * on insertion, as this index can be recalculated on removal.
	 *
	 * Also, the low order bits of the hash value are thought to be
	 * distributed evenly.
	 * Otherwise, in the case that the multilist
	 * has a power of two number of sublists, each sublists' usage
	 * would not be evenly distributed. In this context full 64bit
	 * division would be a waste of time, so limit it to 32 bits.
	 */
	return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid) %
	    multilist_get_num_sublists(ml));
}

/*
 * The target size of the dbuf cache can grow with the ARC target,
 * unless limited by the tunable dbuf_cache_max_bytes.
 */
static inline unsigned long
dbuf_cache_target_bytes(void)
{
	return (MIN(dbuf_cache_max_bytes,
	    arc_target_bytes() >> dbuf_cache_shift));
}

/*
 * The target size of the dbuf metadata cache can grow with the ARC target,
 * unless limited by the tunable dbuf_metadata_cache_max_bytes.
 */
static inline unsigned long
dbuf_metadata_cache_target_bytes(void)
{
	return (MIN(dbuf_metadata_cache_max_bytes,
	    arc_target_bytes() >> dbuf_metadata_cache_shift));
}

static inline uint64_t
dbuf_cache_hiwater_bytes(void)
{
	uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
	return (dbuf_cache_target +
	    (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
}

static inline uint64_t
dbuf_cache_lowater_bytes(void)
{
	uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
	return (dbuf_cache_target -
	    (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
}

static inline boolean_t
dbuf_cache_above_lowater(void)
{
	return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
	    dbuf_cache_lowater_bytes());
}

/*
 * Evict the oldest eligible dbuf from the dbuf cache.
 */
static void
dbuf_evict_one(void)
{
	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
	multilist_sublist_t *mls = multilist_sublist_lock_idx(
	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);

	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
		db = multilist_sublist_prev(mls, db);
	}

	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
	    multilist_sublist_t *, mls);

	if (db != NULL) {
		multilist_sublist_remove(mls, db);
		multilist_sublist_unlock(mls);
		uint64_t size = db->db.db_size;
		uint64_t usize = dmu_buf_user_size(&db->db);
		(void) zfs_refcount_remove_many(
		    &dbuf_caches[DB_DBUF_CACHE].size, size, db);
		(void) zfs_refcount_remove_many(
		    &dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user);
		DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
		DBUF_STAT_BUMPDOWN(cache_count);
		DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize);
		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
		db->db_caching_status = DB_NO_CACHE;
		dbuf_destroy(db);
		DBUF_STAT_BUMP(cache_total_evicts);
	} else {
		multilist_sublist_unlock(mls);
	}
}

/*
 * The dbuf evict thread is responsible for aging out dbufs from the
 * cache. Once the cache has reached its maximum size, dbufs are removed
 * and destroyed. The eviction thread will continue running until the size
 * of the dbuf cache is at or below the maximum size.
 * Once the dbuf is aged
 * out of the cache it is destroyed and becomes eligible for arc eviction.
 */
static __attribute__((noreturn)) void
dbuf_evict_thread(void *unused)
{
	(void) unused;
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);

	mutex_enter(&dbuf_evict_lock);
	while (!dbuf_evict_thread_exit) {
		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
			CALLB_CPR_SAFE_BEGIN(&cpr);
			(void) cv_timedwait_idle_hires(&dbuf_evict_cv,
			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
		}
		mutex_exit(&dbuf_evict_lock);

		/*
		 * Keep evicting as long as we're above the low water mark
		 * for the cache. We do this without holding the locks to
		 * minimize lock contention.
		 */
		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
			dbuf_evict_one();
		}

		mutex_enter(&dbuf_evict_lock);
	}

	dbuf_evict_thread_exit = B_FALSE;
	cv_broadcast(&dbuf_evict_cv);
	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
	thread_exit();
}

/*
 * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
 * If the dbuf cache is at its high water mark, then evict a dbuf from the
 * dbuf cache using the caller's context.
 */
static void
dbuf_evict_notify(uint64_t size)
{
	/*
	 * We check if we should evict without holding the dbuf_evict_lock,
	 * because it's OK to occasionally make the wrong decision here,
	 * and grabbing the lock results in massive lock contention.
	 */
	if (size > dbuf_cache_target_bytes()) {
		/*
		 * Avoid calling dbuf_evict_one() from memory reclaim context
		 * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.
		 * Memory reclaim threads can get stuck waiting for the dbuf
		 * hash lock.
		 */
		if (size > dbuf_cache_hiwater_bytes() &&
		    !current_is_reclaim_thread()) {
			dbuf_evict_one();
		}
		cv_signal(&dbuf_evict_cv);
	}
}

/*
 * Since dbuf cache size is a fraction of target ARC size, ARC calls this when
 * its target size is reduced due to memory pressure.
 */
void
dbuf_cache_reduce_target_size(void)
{
	uint64_t size = zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);

	if (size > dbuf_cache_target_bytes())
		cv_signal(&dbuf_evict_cv);
}

static int
dbuf_kstat_update(kstat_t *ksp, int rw)
{
	dbuf_stats_t *ds = ksp->ks_data;
	dbuf_hash_table_t *h = &dbuf_hash_table;

	if (rw == KSTAT_WRITE)
		return (SET_ERROR(EACCES));

	ds->cache_count.value.ui64 =
	    wmsum_value(&dbuf_sums.cache_count);
	ds->cache_size_bytes.value.ui64 =
	    zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
	ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
	ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
	ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
	ds->cache_total_evicts.value.ui64 =
	    wmsum_value(&dbuf_sums.cache_total_evicts);
	for (int i = 0; i < DN_MAX_LEVELS; i++) {
		ds->cache_levels[i].value.ui64 =
		    wmsum_value(&dbuf_sums.cache_levels[i]);
		ds->cache_levels_bytes[i].value.ui64 =
		    wmsum_value(&dbuf_sums.cache_levels_bytes[i]);
	}
	ds->hash_hits.value.ui64 =
	    wmsum_value(&dbuf_sums.hash_hits);
	ds->hash_misses.value.ui64 =
	    wmsum_value(&dbuf_sums.hash_misses);
	ds->hash_collisions.value.ui64 =
	    wmsum_value(&dbuf_sums.hash_collisions);
	ds->hash_elements.value.ui64 =
	    wmsum_value(&dbuf_sums.hash_elements);
	ds->hash_chains.value.ui64 =
	    wmsum_value(&dbuf_sums.hash_chains);
	ds->hash_insert_race.value.ui64 =
	    wmsum_value(&dbuf_sums.hash_insert_race);
	ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
	ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
	ds->metadata_cache_count.value.ui64 =
	    wmsum_value(&dbuf_sums.metadata_cache_count);
	ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
	    &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
	ds->metadata_cache_overflow.value.ui64 =
	    wmsum_value(&dbuf_sums.metadata_cache_overflow);
	return (0);
}

void
dbuf_init(void)
{
	uint64_t hmsize, hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;

	/*
	 * The hash table is big enough to fill one eighth of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
	 */
	while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
		hsize <<= 1;

	h->hash_table = NULL;
	while (h->hash_table == NULL) {
		h->hash_table_mask = hsize - 1;

		h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
		if (h->hash_table == NULL)
			hsize >>= 1;

		ASSERT3U(hsize, >=, 1ULL << 10);
	}

	/*
	 * The hash table buckets are protected by an array of mutexes where
	 * each mutex is responsible for protecting 128 buckets. A minimum
	 * array size of 8192 is targeted to avoid contention.
	 */
	if (dbuf_mutex_cache_shift == 0)
		hmsize = MAX(hsize >> 7, 1ULL << 13);
	else
		hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);

	h->hash_mutexes = NULL;
	while (h->hash_mutexes == NULL) {
		h->hash_mutex_mask = hmsize - 1;

		h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
		    KM_SLEEP);
		if (h->hash_mutexes == NULL)
			hmsize >>= 1;
	}

	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
	dbuf_dirty_kmem_cache = kmem_cache_create("dbuf_dirty_record_t",
	    sizeof (dbuf_dirty_record_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	for (int i = 0; i < hmsize; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL);

	dbuf_stats_init(h);

	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);

	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
		multilist_create(&dbuf_caches[dcs].cache,
		    sizeof (dmu_buf_impl_t),
		    offsetof(dmu_buf_impl_t, db_cache_link),
		    dbuf_cache_multilist_index_func);
		zfs_refcount_create(&dbuf_caches[dcs].size);
	}

	dbuf_evict_thread_exit = B_FALSE;
	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
	    NULL, 0, &p0, TS_RUN, minclsyspri);

	wmsum_init(&dbuf_sums.cache_count, 0);
	wmsum_init(&dbuf_sums.cache_total_evicts, 0);
	for (int i = 0; i < DN_MAX_LEVELS; i++) {
		wmsum_init(&dbuf_sums.cache_levels[i], 0);
		wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
	}
	wmsum_init(&dbuf_sums.hash_hits, 0);
	wmsum_init(&dbuf_sums.hash_misses, 0);
	wmsum_init(&dbuf_sums.hash_collisions, 0);
	wmsum_init(&dbuf_sums.hash_elements, 0);
	wmsum_init(&dbuf_sums.hash_chains, 0);
	wmsum_init(&dbuf_sums.hash_insert_race, 0);
	wmsum_init(&dbuf_sums.metadata_cache_count, 0);
	wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);

	dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
"misc",1038KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),1039KSTAT_FLAG_VIRTUAL);1040if (dbuf_ksp != NULL) {1041for (int i = 0; i < DN_MAX_LEVELS; i++) {1042snprintf(dbuf_stats.cache_levels[i].name,1043KSTAT_STRLEN, "cache_level_%d", i);1044dbuf_stats.cache_levels[i].data_type =1045KSTAT_DATA_UINT64;1046snprintf(dbuf_stats.cache_levels_bytes[i].name,1047KSTAT_STRLEN, "cache_level_%d_bytes", i);1048dbuf_stats.cache_levels_bytes[i].data_type =1049KSTAT_DATA_UINT64;1050}1051dbuf_ksp->ks_data = &dbuf_stats;1052dbuf_ksp->ks_update = dbuf_kstat_update;1053kstat_install(dbuf_ksp);1054}1055}10561057void1058dbuf_fini(void)1059{1060dbuf_hash_table_t *h = &dbuf_hash_table;10611062dbuf_stats_destroy();10631064for (int i = 0; i < (h->hash_mutex_mask + 1); i++)1065mutex_destroy(&h->hash_mutexes[i]);10661067vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));1068vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *1069sizeof (kmutex_t));10701071kmem_cache_destroy(dbuf_kmem_cache);1072kmem_cache_destroy(dbuf_dirty_kmem_cache);1073taskq_destroy(dbu_evict_taskq);10741075mutex_enter(&dbuf_evict_lock);1076dbuf_evict_thread_exit = B_TRUE;1077while (dbuf_evict_thread_exit) {1078cv_signal(&dbuf_evict_cv);1079cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);1080}1081mutex_exit(&dbuf_evict_lock);10821083mutex_destroy(&dbuf_evict_lock);1084cv_destroy(&dbuf_evict_cv);10851086for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {1087zfs_refcount_destroy(&dbuf_caches[dcs].size);1088multilist_destroy(&dbuf_caches[dcs].cache);1089}10901091if (dbuf_ksp != NULL) {1092kstat_delete(dbuf_ksp);1093dbuf_ksp = NULL;1094}10951096wmsum_fini(&dbuf_sums.cache_count);1097wmsum_fini(&dbuf_sums.cache_total_evicts);1098for (int i = 0; i < DN_MAX_LEVELS; i++) {1099wmsum_fini(&dbuf_sums.cache_levels[i]);1100wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);1101}1102wmsum_fini(&dbuf_sums.hash_hits);1103wmsum_fini(&dbuf_sums.hash_misses);1104wmsum_fini(&dbuf_sums.hash_collisions);1105wmsum_fini(&dbuf_sums.hash_elements);1106wmsum_fini(&dbuf_sums.hash_chains);1107wmsum_fini(&dbuf_sums.hash_insert_race);1108wmsum_fini(&dbuf_sums.metadata_cache_count);1109wmsum_fini(&dbuf_sums.metadata_cache_overflow);1110}11111112/*1113* Other stuff.1114*/11151116#ifdef ZFS_DEBUG1117static void1118dbuf_verify(dmu_buf_impl_t *db)1119{1120dnode_t *dn;1121dbuf_dirty_record_t *dr;1122uint32_t txg_prev;11231124ASSERT(MUTEX_HELD(&db->db_mtx));11251126if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))1127return;11281129ASSERT(db->db_objset != NULL);1130DB_DNODE_ENTER(db);1131dn = DB_DNODE(db);1132if (dn == NULL) {1133ASSERT0P(db->db_parent);1134ASSERT0P(db->db_blkptr);1135} else {1136ASSERT3U(db->db.db_object, ==, dn->dn_object);1137ASSERT3P(db->db_objset, ==, dn->dn_objset);1138ASSERT3U(db->db_level, <, dn->dn_nlevels);1139ASSERT(db->db_blkid == DMU_BONUS_BLKID ||1140db->db_blkid == DMU_SPILL_BLKID ||1141!avl_is_empty(&dn->dn_dbufs));1142}1143if (db->db_blkid == DMU_BONUS_BLKID) {1144ASSERT(dn != NULL);1145ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);1146ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);1147} else if (db->db_blkid == DMU_SPILL_BLKID) {1148ASSERT(dn != NULL);1149ASSERT0(db->db.db_offset);1150} else {1151ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);1152}11531154if ((dr = list_head(&db->db_dirty_records)) != NULL) {1155ASSERT(dr->dr_dbuf == db);1156txg_prev = dr->dr_txg;1157for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;1158dr = list_next(&db->db_dirty_records, dr)) {1159ASSERT(dr->dr_dbuf == 
			ASSERT(txg_prev > dr->dr_txg);
			txg_prev = dr->dr_txg;
		}
	}

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT0P(db->db_parent);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb __maybe_unused = db->db_parent->db.db_size >>
			    SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			ASSERT3P(db->db_blkptr, ==,
			    ((blkptr_t *)db->db_parent->db.db_data +
			    db->db_blkid % epb));
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 *
		 * There is an exception to this rule for indirect blocks; in
		 * this case, if the indirect block is a hole, we fill in a few
		 * fields on each of the child blocks (importantly, birth time)
		 * to prevent hole birth times from being lost when you
		 * partially fill in a hole.
		 */
		if (db->db_dirtycnt == 0) {
			if (db->db_level == 0) {
				uint64_t *buf = db->db.db_data;
				int i;

				for (i = 0; i < db->db.db_size >> 3; i++) {
					ASSERT0(buf[i]);
				}
			} else {
				blkptr_t *bps = db->db.db_data;
				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
				    db->db.db_size);
				/*
				 * We want to verify that all the blkptrs in the
				 * indirect block are holes, but we may have
				 * automatically set up a few fields for them.
				 * We iterate through each blkptr and verify
				 * they only have those fields set.
				 */
				for (int i = 0;
				    i < db->db.db_size / sizeof (blkptr_t);
				    i++) {
					blkptr_t *bp = &bps[i];
					ASSERT(ZIO_CHECKSUM_IS_ZERO(
					    &bp->blk_cksum));
					ASSERT(
					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[2]));
					ASSERT0(bp->blk_fill);
					ASSERT(!BP_IS_EMBEDDED(bp));
					ASSERT(BP_IS_HOLE(bp));
					ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
				}
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	ASSERT0P(db->db_buf);
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL) {
		db->db_state = DB_UNCACHED;
		DTRACE_SET_STATE(db, "clear data");
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
}

static arc_buf_t *
dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
{
	spa_t *spa = db->db_objset->os_spa;

	return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));
}

/*
 * Calculate which level n block references the data at the level 0 offset
 * provided.
 */
uint64_t
dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
{
	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
		/*
		 * The level n blkid is equal to the level 0 blkid divided by
		 * the number of level 0s in a level n block.
		 *
		 * The level 0 blkid is offset >> datablkshift =
		 * offset / 2^datablkshift.
		 *
		 * The number of level 0s in a level n is the number of block
		 * pointers in an indirect block, raised to the power of level.
		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
		 *
		 * Thus, the level n blkid is: offset /
		 * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
		 * = offset / 2^(datablkshift + level *
		 *   (indblkshift - SPA_BLKPTRSHIFT))
		 * = offset >> (datablkshift + level *
		 *   (indblkshift - SPA_BLKPTRSHIFT))
		 */

		const unsigned exp = dn->dn_datablkshift +
		    level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);

		if (exp >= 8 * sizeof (offset)) {
			/* This only happens on the highest indirection level */
			ASSERT3U(level, ==, dn->dn_nlevels - 1);
			return (0);
		}

		ASSERT3U(exp, <, 8 * sizeof (offset));

		return (offset >> exp);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

/*
 * This function is used to lock the parent of the provided dbuf. This should be
 * used when modifying or reading db_blkptr.
 */
db_lock_type_t
dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
{
	enum db_lock_type ret = DLT_NONE;
	if (db->db_parent != NULL) {
		rw_enter(&db->db_parent->db_rwlock, rw);
		ret = DLT_PARENT;
	} else if (dmu_objset_ds(db->db_objset) != NULL) {
		rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
		    tag);
		ret = DLT_OBJSET;
	}
	/*
	 * We only return a DLT_NONE lock when it's the top-most indirect block
	 * of the meta-dnode of the MOS.
	 */
	return (ret);
}

/*
 * We need to pass the lock type in because it's possible that the block will
 * move from being the topmost indirect block in a dnode (and thus, have no
 * parent) to not the top-most via an indirection increase.
 * This would cause a
 * panic if we didn't pass the lock type in.
 */
void
dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
{
	if (type == DLT_PARENT)
		rw_exit(&db->db_parent->db_rwlock);
	else if (type == DLT_OBJSET)
		rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
}

static void
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
    arc_buf_t *buf, void *vdb)
{
	(void) zb, (void) bp;
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);

	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(zfs_refcount_count(&db->db_holds) > 0);
	ASSERT0P(db->db_buf);
	ASSERT0P(db->db.db_data);
	if (buf == NULL) {
		/* i/o error */
		ASSERT(zio == NULL || zio->io_error != 0);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT0P(db->db_buf);
		db->db_state = DB_UNCACHED;
		DTRACE_SET_STATE(db, "i/o error");
	} else if (db->db_level == 0 && db->db_freed_in_flight) {
		/* freed in flight */
		ASSERT(zio == NULL || zio->io_error == 0);
		arc_release(buf, db);
		memset(buf->b_data, 0, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
		DTRACE_SET_STATE(db, "freed in flight");
	} else {
		/* success */
		ASSERT(zio == NULL || zio->io_error == 0);
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
		DTRACE_SET_STATE(db, "successful read");
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL, B_FALSE);
}

/*
 * Shortcut for performing reads on bonus dbufs. Returns
 * an error if we fail to verify the dnode associated with
 * a decrypted block. Otherwise success.
 */
static int
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
{
	void *db_data;
	int bonuslen, max_bonuslen;

	bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
	max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(DB_DNODE_HELD(db));
	ASSERT3U(bonuslen, <=, db->db.db_size);
	db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
	arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
	if (bonuslen < max_bonuslen)
		memset(db_data, 0, max_bonuslen);
	if (bonuslen)
		memcpy(db_data, DN_BONUS(dn->dn_phys), bonuslen);
	db->db.db_data = db_data;
	db->db_state = DB_CACHED;
	DTRACE_SET_STATE(db, "bonus buffer filled");
	return (0);
}

static void
dbuf_handle_indirect_hole(void *data, dnode_t *dn, blkptr_t *dbbp)
{
	blkptr_t *bps = data;
	uint32_t indbs = 1ULL << dn->dn_indblkshift;
	int n_bps = indbs >> SPA_BLKPTRSHIFT;

	for (int i = 0; i < n_bps; i++) {
		blkptr_t *bp = &bps[i];

		ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
		BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
		    dn->dn_datablksz : BP_GET_LSIZE(dbbp));
		BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
		BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
		BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
	}
}

/*
 * Handle reads on dbufs that are holes, if necessary. This function
 * requires that the dbuf's mutex is held.
 * Returns success (0) if action
 * was taken, ENOENT if no action was taken.
 */
static int
dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	arc_buf_t *db_data;

	int is_hole = bp == NULL || BP_IS_HOLE(bp);
	/*
	 * For level 0 blocks only, if the above check fails:
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (!is_hole && db->db_level == 0)
		is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);

	if (is_hole) {
		db_data = dbuf_alloc_arcbuf(db);
		memset(db_data->b_data, 0, db->db.db_size);

		if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
		    BP_GET_LOGICAL_BIRTH(bp) != 0) {
			dbuf_handle_indirect_hole(db_data->b_data, dn, bp);
		}
		dbuf_set_data(db, db_data);
		db->db_state = DB_CACHED;
		DTRACE_SET_STATE(db, "hole read satisfied");
		return (0);
	}
	return (ENOENT);
}

/*
 * This function ensures that, when doing a decrypting read of a block,
 * we make sure we have decrypted the dnode associated with it. We must do
 * this so that we ensure we are fully authenticating the checksum-of-MACs
 * tree from the root of the objset down to this block. Indirect blocks are
 * always verified against their secure checksum-of-MACs assuming that the
 * dnode containing them is correct. Now that we are doing a decrypting read,
 * we can be sure that the key is loaded and verify that assumption. This is
 * especially important considering that we always read encrypted dnode
 * blocks as raw data (without verifying their MACs) to start, and
 * decrypt / authenticate them when we need to read an encrypted bonus buffer.
 */
static int
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,
    dmu_flags_t flags)
{
	objset_t *os = db->db_objset;
	dmu_buf_impl_t *dndb;
	arc_buf_t *dnbuf;
	zbookmark_phys_t zb;
	int err;

	if ((flags & DMU_READ_NO_DECRYPT) != 0 ||
	    !os->os_encrypted || os->os_raw_receive ||
	    (dndb = dn->dn_dbuf) == NULL)
		return (0);

	dnbuf = dndb->db_buf;
	if (!arc_is_encrypted(dnbuf))
		return (0);

	mutex_enter(&dndb->db_mtx);

	/*
	 * Since dnode buffer is modified by sync process, there can be only
	 * one copy of it. It means we can not modify (decrypt) it while it
	 * is being written. I don't see how this may happen now, since
	 * encrypted dnode writes by receive should be completed before any
	 * plain-text reads due to txg wait, but better be safe than sorry.
	 */
	while (1) {
		if (!arc_is_encrypted(dnbuf)) {
			mutex_exit(&dndb->db_mtx);
			return (0);
		}
		dbuf_dirty_record_t *dr = dndb->db_data_pending;
		if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
			break;
		cv_wait(&dndb->db_changed, &dndb->db_mtx);
	}

	SET_BOOKMARK(&zb, dmu_objset_id(os),
	    DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
	err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);

	/*
	 * An error code of EACCES tells us that the key is still not
	 * available.
	 * This is ok if we are only reading authenticated
	 * (and therefore non-encrypted) blocks.
	 */
	if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
	    !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
	    (db->db_blkid == DMU_BONUS_BLKID &&
	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
		err = 0;

	mutex_exit(&dndb->db_mtx);

	return (err);
}

/*
 * Drops db_mtx and the parent lock specified by dblt and tag before
 * returning.
 */
static int
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
    db_lock_type_t dblt, blkptr_t *bp, const void *tag)
{
	zbookmark_phys_t zb;
	uint32_t aflags = ARC_FLAG_NOWAIT;
	int err, zio_flags;

	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT0P(db->db_buf);
	ASSERT(db->db_parent == NULL ||
	    RW_LOCK_HELD(&db->db_parent->db_rwlock));

	if (db->db_blkid == DMU_BONUS_BLKID) {
		err = dbuf_read_bonus(db, dn);
		goto early_unlock;
	}

	err = dbuf_read_hole(db, dn, bp);
	if (err == 0)
		goto early_unlock;

	ASSERT(bp != NULL);

	/*
	 * Any attempt to read a redacted block should result in an error. This
	 * will never happen under normal conditions, but can be useful for
	 * debugging purposes.
	 */
	if (BP_IS_REDACTED(bp)) {
		ASSERT(dsl_dataset_feature_is_active(
		    db->db_objset->os_dsl_dataset,
		    SPA_FEATURE_REDACTED_DATASETS));
		err = SET_ERROR(EIO);
		goto early_unlock;
	}

	SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
	    db->db.db_object, db->db_level, db->db_blkid);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
		spa_log_error(db->db_objset->os_spa, &zb,
		    BP_GET_PHYSICAL_BIRTH(bp));
		err = SET_ERROR(EIO);
		goto early_unlock;
	}

	db->db_state = DB_READ;
	DTRACE_SET_STATE(db, "read issued");
	mutex_exit(&db->db_mtx);

	if (!DBUF_IS_CACHEABLE(db))
		aflags |= ARC_FLAG_UNCACHED;
	else if (dbuf_is_l2cacheable(db, bp))
		aflags |= ARC_FLAG_L2CACHE;

	dbuf_add_ref(db, NULL);

	zio_flags = (flags & DB_RF_CANFAIL) ?
	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;

	if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	/*
	 * The zio layer will copy the provided blkptr later, but we need to
	 * do this now so that we can release the parent's rwlock. We have to
	 * do that now so that if dbuf_read_done is called synchronously (on
	 * an l1 cache hit) we don't acquire the db_mtx while holding the
	 * parent's rwlock, which would be a lock ordering violation.
	 */
	blkptr_t copy = *bp;
	dmu_buf_unlock_parent(db, dblt, tag);
	return (arc_read(zio, db->db_objset->os_spa, &copy,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
	    &aflags, &zb));

early_unlock:
	mutex_exit(&db->db_mtx);
	dmu_buf_unlock_parent(db, dblt, tag);
	return (err);
}

/*
 * This is our just-in-time copy function.
 * It makes a copy of buffers that
 * have been modified in a previous transaction group before we access them in
 * the current active group.
 *
 * This function is used in three places: when we are dirtying a buffer for the
 * first time in a txg, when we are freeing a range in a dnode that includes
 * this buffer, and when we are accessing a buffer which was received compressed
 * and later referenced in a WRITE_BYREF record.
 *
 * Note that when we are called from dbuf_free_range() we do not put a hold on
 * the buffer, we just traverse the active dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT0(db->db_level);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT3U(dr->dr_txg, >=, txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dnode_t *dn = DB_DNODE(db);
		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
		dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
		memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
		dnode_t *dn = DB_DNODE(db);
		int size = arc_buf_size(db->db_buf);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;
		enum zio_compress compress_type =
		    arc_get_compression(db->db_buf);
		uint8_t complevel = arc_get_complevel(db->db_buf);

		if (arc_is_encrypted(db->db_buf)) {
			boolean_t byteorder;
			uint8_t salt[ZIO_DATA_SALT_LEN];
			uint8_t iv[ZIO_DATA_IV_LEN];
			uint8_t mac[ZIO_DATA_MAC_LEN];

			arc_get_raw_params(db->db_buf, &byteorder, salt,
			    iv, mac);
			dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
			    dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
			    mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
			    compress_type, complevel);
		} else if (compress_type != ZIO_COMPRESS_OFF) {
			ASSERT3U(type, ==, ARC_BUFC_DATA);
			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
			    size, arc_buf_lsize(db->db_buf), compress_type,
			    complevel);
		} else {
			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
		}
		memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
	} else {
		db->db_buf = NULL;
		dbuf_clear_data(db);
	}
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)
{
	dnode_t *dn;
	boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
	int err;

	ASSERT(!zfs_refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/*
	 * Ensure that this block's dnode has been decrypted if the caller
	 * has requested decrypted data.
	 */
	err = dbuf_read_verify_dnode_crypt(db, dn, flags);
	if (err != 0)
		goto done;

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DMU_READ_NO_PREFETCH) == 0;

	mutex_enter(&db->db_mtx);
	if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
		db->db_pending_evict = B_FALSE;
	if (flags & DMU_PARTIAL_FIRST)
		db->db_partial_read = B_TRUE;
	else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))
		db->db_partial_read = B_FALSE;
	miss = (db->db_state != DB_CACHED);

	if (db->db_state == DB_READ || db->db_state == DB_FILL) {
		/*
		 * Another reader came in while the dbuf was in flight between
		 * UNCACHED and CACHED. Either a writer will finish filling
		 * the buffer, sending the dbuf to CACHED, or the first reader's
		 * request will reach the read_done callback and send the dbuf
		 * to CACHED. Otherwise, a failure occurred and the dbuf will
		 * be sent to UNCACHED.
		 */
		if (flags & DB_RF_NEVERWAIT) {
			mutex_exit(&db->db_mtx);
			DB_DNODE_EXIT(db);
			goto done;
		}
		do {
			ASSERT(db->db_state == DB_READ ||
			    (flags & DB_RF_HAVESTRUCT) == 0);
			DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
			    zio_t *, pio);
			cv_wait(&db->db_changed, &db->db_mtx);
		} while (db->db_state == DB_READ || db->db_state == DB_FILL);
		if (db->db_state == DB_UNCACHED) {
			err = SET_ERROR(EIO);
			mutex_exit(&db->db_mtx);
			DB_DNODE_EXIT(db);
			goto done;
		}
	}

	if (db->db_state == DB_CACHED) {
		/*
		 * If the arc buf is compressed or encrypted and the caller
		 * requested uncompressed data, we need to untransform it
		 * before returning. We also call arc_untransform() on any
		 * unauthenticated blocks, which will verify their MAC if
		 * the key is now available.
		 */
		if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&
		    (arc_is_encrypted(db->db_buf) ||
		    arc_is_unauthenticated(db->db_buf) ||
		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
			spa_t *spa = dn->dn_objset->os_spa;
			zbookmark_phys_t zb;

			SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
			    db->db.db_object, db->db_level, db->db_blkid);
			dbuf_fix_old_data(db, spa_syncing_txg(spa));
			err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
			dbuf_set_data(db, db->db_buf);
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL);
		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
		blkptr_t *bp;

		/*
		 * If a block clone or Direct I/O write has occurred we will
		 * get the dirty record's overridden BP so we get the most
		 * recent data.
		 */
		err = dmu_buf_get_bp_from_dbuf(db, &bp);

		if (!err) {
			if (pio == NULL && (db->db_state == DB_NOFILL ||
			    (bp != NULL && !BP_IS_HOLE(bp)))) {
				spa_t *spa = dn->dn_objset->os_spa;
				pio =
				    zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
				need_wait = B_TRUE;
			}

			err =
			    dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
		} else {
			mutex_exit(&db->db_mtx);
			dmu_buf_unlock_parent(db, dblt, FTAG);
		}
		/* dbuf_read_impl drops db_mtx and parent's rwlock. */
*/1857miss = (db->db_state != DB_CACHED);1858}18591860if (err == 0 && prefetch) {1861dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,1862flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||1863db->db_pending_evict);1864}1865DB_DNODE_EXIT(db);18661867/*1868* If we created a zio we must execute it to avoid leaking it, even if1869* it isn't attached to any work due to an error in dbuf_read_impl().1870*/1871if (need_wait) {1872if (err == 0)1873err = zio_wait(pio);1874else1875(void) zio_wait(pio);1876pio = NULL;1877}18781879done:1880if (miss)1881DBUF_STAT_BUMP(hash_misses);1882else1883DBUF_STAT_BUMP(hash_hits);1884if (pio && err != 0) {1885zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,1886ZIO_FLAG_CANFAIL);1887zio->io_error = err;1888zio_nowait(zio);1889}18901891return (err);1892}18931894static void1895dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)1896{1897ASSERT(!zfs_refcount_is_zero(&db->db_holds));1898ASSERT(db->db_blkid != DMU_BONUS_BLKID);1899mutex_enter(&db->db_mtx);1900if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))1901db->db_pending_evict = B_FALSE;1902db->db_partial_read = B_FALSE;1903while (db->db_state == DB_READ || db->db_state == DB_FILL)1904cv_wait(&db->db_changed, &db->db_mtx);1905if (db->db_state == DB_UNCACHED) {1906ASSERT0P(db->db_buf);1907ASSERT0P(db->db.db_data);1908dbuf_set_data(db, dbuf_alloc_arcbuf(db));1909db->db_state = DB_FILL;1910DTRACE_SET_STATE(db, "assigning filled buffer");1911} else if (db->db_state == DB_NOFILL) {1912dbuf_clear_data(db);1913} else {1914ASSERT3U(db->db_state, ==, DB_CACHED);1915}1916mutex_exit(&db->db_mtx);1917}19181919void1920dbuf_unoverride(dbuf_dirty_record_t *dr)1921{1922dmu_buf_impl_t *db = dr->dr_dbuf;1923blkptr_t *bp = &dr->dt.dl.dr_overridden_by;1924uint64_t txg = dr->dr_txg;19251926ASSERT(MUTEX_HELD(&db->db_mtx));19271928/*1929* This assert is valid because dmu_sync() expects to be called by1930* a zilog's get_data while holding a range lock. This call only1931* comes from dbuf_dirty() callers who must also hold a range lock.1932*/1933ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);1934ASSERT0(db->db_level);19351936if (db->db_blkid == DMU_BONUS_BLKID ||1937dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)1938return;19391940ASSERT(db->db_data_pending != dr);19411942/* free this block */1943if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)1944zio_free(db->db_objset->os_spa, txg, bp);19451946if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {1947ASSERT0P(dr->dt.dl.dr_data);1948dr->dt.dl.dr_data = db->db_buf;1949}1950dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;1951dr->dt.dl.dr_nopwrite = B_FALSE;1952dr->dt.dl.dr_brtwrite = B_FALSE;1953dr->dt.dl.dr_diowrite = B_FALSE;1954dr->dt.dl.dr_has_raw_params = B_FALSE;19551956/*1957* In the event that Direct I/O was used, we do not1958* need to release the buffer from the ARC.1959*1960* Release the already-written buffer, so we leave it in1961* a consistent dirty state. Note that all callers are1962* modifying the buffer, so they will immediately do1963* another (redundant) arc_release(). 
Therefore, leave1964* the buf thawed to save the effort of freezing &1965* immediately re-thawing it.1966*/1967if (dr->dt.dl.dr_data)1968arc_release(dr->dt.dl.dr_data, db);1969}19701971/*1972* Evict (if its unreferenced) or clear (if its referenced) any level-01973* data blocks in the free range, so that any future readers will find1974* empty blocks.1975*/1976void1977dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,1978dmu_tx_t *tx)1979{1980dmu_buf_impl_t *db_search;1981dmu_buf_impl_t *db, *db_next;1982uint64_t txg = tx->tx_txg;1983avl_index_t where;1984dbuf_dirty_record_t *dr;19851986if (end_blkid > dn->dn_maxblkid &&1987!(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))1988end_blkid = dn->dn_maxblkid;1989dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,1990(u_longlong_t)end_blkid);19911992db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);1993db_search->db_level = 0;1994db_search->db_blkid = start_blkid;1995db_search->db_state = DB_SEARCH;19961997mutex_enter(&dn->dn_dbufs_mtx);1998db = avl_find(&dn->dn_dbufs, db_search, &where);1999ASSERT0P(db);20002001db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);20022003for (; db != NULL; db = db_next) {2004db_next = AVL_NEXT(&dn->dn_dbufs, db);2005ASSERT(db->db_blkid != DMU_BONUS_BLKID);20062007if (db->db_level != 0 || db->db_blkid > end_blkid) {2008break;2009}2010ASSERT3U(db->db_blkid, >=, start_blkid);20112012/* found a level 0 buffer in the range */2013mutex_enter(&db->db_mtx);2014if (dbuf_undirty(db, tx)) {2015/* mutex has been dropped and dbuf destroyed */2016continue;2017}20182019if (db->db_state == DB_UNCACHED ||2020db->db_state == DB_NOFILL ||2021db->db_state == DB_EVICTING) {2022ASSERT0P(db->db.db_data);2023mutex_exit(&db->db_mtx);2024continue;2025}2026if (db->db_state == DB_READ || db->db_state == DB_FILL) {2027/* will be handled in dbuf_read_done or dbuf_rele */2028db->db_freed_in_flight = TRUE;2029mutex_exit(&db->db_mtx);2030continue;2031}2032if (zfs_refcount_count(&db->db_holds) == 0) {2033ASSERT(db->db_buf);2034dbuf_destroy(db);2035continue;2036}2037/* The dbuf is referenced */20382039dr = list_head(&db->db_dirty_records);2040if (dr != NULL) {2041if (dr->dr_txg == txg) {2042/*2043* This buffer is "in-use", re-adjust the file2044* size to reflect that this buffer may2045* contain new data when we sync.2046*/2047if (db->db_blkid != DMU_SPILL_BLKID &&2048db->db_blkid > dn->dn_maxblkid)2049dn->dn_maxblkid = db->db_blkid;2050dbuf_unoverride(dr);2051} else {2052/*2053* This dbuf is not dirty in the open context.2054* Either uncache it (if its not referenced in2055* the open context) or reset its contents to2056* empty.2057*/2058dbuf_fix_old_data(db, txg);2059}2060}2061/* clear the contents if its cached */2062if (db->db_state == DB_CACHED) {2063ASSERT(db->db.db_data != NULL);2064arc_release(db->db_buf, db);2065rw_enter(&db->db_rwlock, RW_WRITER);2066memset(db->db.db_data, 0, db->db.db_size);2067rw_exit(&db->db_rwlock);2068arc_buf_freeze(db->db_buf);2069}20702071mutex_exit(&db->db_mtx);2072}20732074mutex_exit(&dn->dn_dbufs_mtx);2075kmem_free(db_search, sizeof (dmu_buf_impl_t));2076}20772078void2079dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)2080{2081arc_buf_t *buf, *old_buf;2082dbuf_dirty_record_t *dr;2083int osize = db->db.db_size;2084arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);2085dnode_t *dn;20862087ASSERT(db->db_blkid != DMU_BONUS_BLKID);20882089DB_DNODE_ENTER(db);2090dn = DB_DNODE(db);20912092/*2093* XXX we should be doing a dbuf_read, checking the 
return2094* value and returning that up to our callers2095*/2096dmu_buf_will_dirty(&db->db, tx);20972098VERIFY3P(db->db_buf, !=, NULL);20992100/* create the data buffer for the new block */2101buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);21022103/* copy old block data to the new block */2104old_buf = db->db_buf;2105memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));2106/* zero the remainder */2107if (size > osize)2108memset((uint8_t *)buf->b_data + osize, 0, size - osize);21092110mutex_enter(&db->db_mtx);2111dbuf_set_data(db, buf);2112arc_buf_destroy(old_buf, db);2113db->db.db_size = size;21142115dr = list_head(&db->db_dirty_records);2116/* dirty record added by dmu_buf_will_dirty() */2117VERIFY(dr != NULL);2118if (db->db_level == 0)2119dr->dt.dl.dr_data = buf;2120ASSERT3U(dr->dr_txg, ==, tx->tx_txg);2121ASSERT3U(dr->dr_accounted, ==, osize);2122dr->dr_accounted = size;2123mutex_exit(&db->db_mtx);21242125dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);2126DB_DNODE_EXIT(db);2127}21282129void2130dbuf_release_bp(dmu_buf_impl_t *db)2131{2132objset_t *os __maybe_unused = db->db_objset;21332134ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));2135ASSERT(arc_released(os->os_phys_buf) ||2136list_link_active(&os->os_dsl_dataset->ds_synced_link));2137ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));21382139(void) arc_release(db->db_buf, db);2140}21412142/*2143* We already have a dirty record for this TXG, and we are being2144* dirtied again.2145*/2146static void2147dbuf_redirty(dbuf_dirty_record_t *dr)2148{2149dmu_buf_impl_t *db = dr->dr_dbuf;21502151ASSERT(MUTEX_HELD(&db->db_mtx));21522153if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {2154/*2155* If this buffer has already been written out,2156* we now need to reset its state.2157*/2158dbuf_unoverride(dr);2159if (db->db.db_object != DMU_META_DNODE_OBJECT &&2160db->db_state != DB_NOFILL) {2161/* Already released on initial dirty, so just thaw. 
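 *
 * ("Thaw" refers to the ARC's in-memory consistency checksum: a frozen
 * buffer is assumed stable, and modifying it would trip the
 * ZFS_DEBUG_MODIFY-style verification, so it is thawed before the
 * contents change again. This is only a summary; see arc_buf_thaw()
 * for the authoritative behavior.)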
*/2162ASSERT(arc_released(db->db_buf));2163arc_buf_thaw(db->db_buf);2164}21652166/*2167* Clear the rewrite flag since this is now a logical2168* modification.2169*/2170dr->dt.dl.dr_rewrite = B_FALSE;2171}2172}21732174dbuf_dirty_record_t *2175dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)2176{2177rw_enter(&dn->dn_struct_rwlock, RW_READER);2178IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);2179dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);2180ASSERT(dn->dn_maxblkid >= blkid);21812182dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);2183list_link_init(&dr->dr_dirty_node);2184list_link_init(&dr->dr_dbuf_node);2185dr->dr_dnode = dn;2186dr->dr_txg = tx->tx_txg;2187dr->dt.dll.dr_blkid = blkid;2188dr->dr_accounted = dn->dn_datablksz;21892190/*2191* There should not be any dbuf for the block that we're dirtying.2192* Otherwise the buffer contents could be inconsistent between the2193* dbuf and the lightweight dirty record.2194*/2195ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,2196NULL));21972198mutex_enter(&dn->dn_mtx);2199int txgoff = tx->tx_txg & TXG_MASK;2200if (dn->dn_free_ranges[txgoff] != NULL) {2201zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);2202}22032204if (dn->dn_nlevels == 1) {2205ASSERT3U(blkid, <, dn->dn_nblkptr);2206list_insert_tail(&dn->dn_dirty_records[txgoff], dr);2207mutex_exit(&dn->dn_mtx);2208rw_exit(&dn->dn_struct_rwlock);2209dnode_setdirty(dn, tx);2210} else {2211mutex_exit(&dn->dn_mtx);22122213int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;2214dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,22151, blkid >> epbs, FTAG);2216rw_exit(&dn->dn_struct_rwlock);2217if (parent_db == NULL) {2218kmem_free(dr, sizeof (*dr));2219return (NULL);2220}2221int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |2222DMU_READ_NO_PREFETCH);2223if (err != 0) {2224dbuf_rele(parent_db, FTAG);2225kmem_free(dr, sizeof (*dr));2226return (NULL);2227}22282229dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);2230dbuf_rele(parent_db, FTAG);2231mutex_enter(&parent_dr->dt.di.dr_mtx);2232ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);2233list_insert_tail(&parent_dr->dt.di.dr_children, dr);2234mutex_exit(&parent_dr->dt.di.dr_mtx);2235dr->dr_parent = parent_dr;2236}22372238dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);22392240return (dr);2241}22422243dbuf_dirty_record_t *2244dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)2245{2246dnode_t *dn;2247objset_t *os;2248dbuf_dirty_record_t *dr, *dr_next, *dr_head;2249int txgoff = tx->tx_txg & TXG_MASK;2250boolean_t drop_struct_rwlock = B_FALSE;22512252ASSERT(tx->tx_txg != 0);2253ASSERT(!zfs_refcount_is_zero(&db->db_holds));2254DMU_TX_DIRTY_BUF(tx, db);22552256DB_DNODE_ENTER(db);2257dn = DB_DNODE(db);2258/*2259* Shouldn't dirty a regular buffer in syncing context. Private2260* objects may be dirtied in syncing context, but only if they2261* were already pre-dirtied in open context.2262*/2263#ifdef ZFS_DEBUG2264if (dn->dn_objset->os_dsl_dataset != NULL) {2265rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,2266RW_READER, FTAG);2267}2268ASSERT(!dmu_tx_is_syncing(tx) ||2269BP_IS_HOLE(dn->dn_objset->os_rootbp) ||2270DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||2271dn->dn_objset->os_dsl_dataset == NULL);2272if (dn->dn_objset->os_dsl_dataset != NULL)2273rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);2274#endif22752276mutex_enter(&db->db_mtx);2277/*2278* XXX make this true for indirects too? 
The problem is that2279* transactions created with dmu_tx_create_assigned() from2280* syncing context don't bother holding ahead.2281*/2282ASSERT(db->db_level != 0 ||2283db->db_state == DB_CACHED || db->db_state == DB_FILL ||2284db->db_state == DB_NOFILL);22852286if (db->db_blkid == DMU_SPILL_BLKID)2287dn->dn_have_spill = B_TRUE;22882289/*2290* If this buffer is already dirty, we're done.2291*/2292dr_head = list_head(&db->db_dirty_records);2293ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||2294db->db.db_object == DMU_META_DNODE_OBJECT);2295dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);2296if (dr_next && dr_next->dr_txg == tx->tx_txg) {2297DB_DNODE_EXIT(db);22982299dbuf_redirty(dr_next);2300mutex_exit(&db->db_mtx);2301return (dr_next);2302}23032304ASSERT3U(dn->dn_nlevels, >, db->db_level);23052306/*2307* We should only be dirtying in syncing context if it's the2308* mos or we're initializing the os or it's a special object.2309* However, we are allowed to dirty in syncing context provided2310* we already dirtied it in open context. Hence we must make2311* this assertion only if we're not already dirty.2312*/2313os = dn->dn_objset;2314VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));2315#ifdef ZFS_DEBUG2316if (dn->dn_objset->os_dsl_dataset != NULL)2317rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);2318ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||2319os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));2320if (dn->dn_objset->os_dsl_dataset != NULL)2321rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);2322#endif2323ASSERT(db->db.db_size != 0);23242325dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);23262327if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {2328dmu_objset_willuse_space(os, db->db.db_size, tx);2329}23302331/*2332* If this buffer is dirty in an old transaction group we need2333* to make a copy of it so that the changes we make in this2334* transaction group won't leak out when we sync the older txg.2335*/2336dr = kmem_cache_alloc(dbuf_dirty_kmem_cache, KM_SLEEP);2337memset(dr, 0, sizeof (*dr));2338list_link_init(&dr->dr_dirty_node);2339list_link_init(&dr->dr_dbuf_node);2340dr->dr_dnode = dn;2341if (db->db_level == 0) {2342void *data_old = db->db_buf;23432344if (db->db_state != DB_NOFILL) {2345if (db->db_blkid == DMU_BONUS_BLKID) {2346dbuf_fix_old_data(db, tx->tx_txg);2347data_old = db->db.db_data;2348} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {2349/*2350* Release the data buffer from the cache so2351* that we can modify it without impacting2352* possible other users of this cached data2353* block. Note that indirect blocks and2354* private objects are not released until the2355* syncing state (since they are only modified2356* then).2357*/2358arc_release(db->db_buf, db);2359dbuf_fix_old_data(db, tx->tx_txg);2360data_old = db->db_buf;2361}2362ASSERT(data_old != NULL);2363}2364dr->dt.dl.dr_data = data_old;2365} else {2366mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);2367list_create(&dr->dt.di.dr_children,2368sizeof (dbuf_dirty_record_t),2369offsetof(dbuf_dirty_record_t, dr_dirty_node));2370}2371if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {2372dr->dr_accounted = db->db.db_size;2373}2374dr->dr_dbuf = db;2375dr->dr_txg = tx->tx_txg;2376list_insert_before(&db->db_dirty_records, dr_next, dr);23772378/*2379* We could have been freed_in_flight between the dbuf_noread2380* and dbuf_dirty. 
We win, as though the dbuf_noread() had2381* happened after the free.2382*/2383if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&2384db->db_blkid != DMU_SPILL_BLKID) {2385mutex_enter(&dn->dn_mtx);2386if (dn->dn_free_ranges[txgoff] != NULL) {2387zfs_range_tree_clear(dn->dn_free_ranges[txgoff],2388db->db_blkid, 1);2389}2390mutex_exit(&dn->dn_mtx);2391db->db_freed_in_flight = FALSE;2392}23932394/*2395* This buffer is now part of this txg2396*/2397dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);2398db->db_dirtycnt += 1;2399ASSERT3U(db->db_dirtycnt, <=, 3);24002401mutex_exit(&db->db_mtx);24022403if (db->db_blkid == DMU_BONUS_BLKID ||2404db->db_blkid == DMU_SPILL_BLKID) {2405mutex_enter(&dn->dn_mtx);2406ASSERT(!list_link_active(&dr->dr_dirty_node));2407list_insert_tail(&dn->dn_dirty_records[txgoff], dr);2408mutex_exit(&dn->dn_mtx);2409dnode_setdirty(dn, tx);2410DB_DNODE_EXIT(db);2411return (dr);2412}24132414if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {2415rw_enter(&dn->dn_struct_rwlock, RW_READER);2416drop_struct_rwlock = B_TRUE;2417}24182419/*2420* If we are overwriting a dedup BP, then unless it is snapshotted,2421* when we get to syncing context we will need to decrement its2422* refcount in the DDT. Prefetch the relevant DDT block so that2423* syncing context won't have to wait for the i/o.2424*/2425if (db->db_blkptr != NULL) {2426db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);2427ddt_prefetch(os->os_spa, db->db_blkptr);2428dmu_buf_unlock_parent(db, dblt, FTAG);2429}24302431/*2432* We need to hold the dn_struct_rwlock to make this assertion,2433* because it protects dn_phys / dn_next_nlevels from changing.2434*/2435ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||2436dn->dn_phys->dn_nlevels > db->db_level ||2437dn->dn_next_nlevels[txgoff] > db->db_level ||2438dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||2439dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);244024412442if (db->db_level == 0) {2443ASSERT(!db->db_objset->os_raw_receive ||2444dn->dn_maxblkid >= db->db_blkid);2445dnode_new_blkid(dn, db->db_blkid, tx,2446drop_struct_rwlock, B_FALSE);2447ASSERT(dn->dn_maxblkid >= db->db_blkid);2448}24492450if (db->db_level+1 < dn->dn_nlevels) {2451dmu_buf_impl_t *parent = db->db_parent;2452dbuf_dirty_record_t *di;2453int parent_held = FALSE;24542455if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {2456int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;2457parent = dbuf_hold_level(dn, db->db_level + 1,2458db->db_blkid >> epbs, FTAG);2459ASSERT(parent != NULL);2460parent_held = TRUE;2461}2462if (drop_struct_rwlock)2463rw_exit(&dn->dn_struct_rwlock);2464ASSERT3U(db->db_level + 1, ==, parent->db_level);2465di = dbuf_dirty(parent, tx);2466if (parent_held)2467dbuf_rele(parent, FTAG);24682469mutex_enter(&db->db_mtx);2470/*2471* Since we've dropped the mutex, it's possible that2472* dbuf_undirty() might have changed this out from under us.2473*/2474if (list_head(&db->db_dirty_records) == dr ||2475dn->dn_object == DMU_META_DNODE_OBJECT) {2476mutex_enter(&di->dt.di.dr_mtx);2477ASSERT3U(di->dr_txg, ==, tx->tx_txg);2478ASSERT(!list_link_active(&dr->dr_dirty_node));2479list_insert_tail(&di->dt.di.dr_children, dr);2480mutex_exit(&di->dt.di.dr_mtx);2481dr->dr_parent = di;2482}2483mutex_exit(&db->db_mtx);2484} else {2485ASSERT(db->db_level + 1 == dn->dn_nlevels);2486ASSERT(db->db_blkid < dn->dn_nblkptr);2487ASSERT(db->db_parent == NULL || db->db_parent == 
dn->dn_dbuf);2488mutex_enter(&dn->dn_mtx);2489ASSERT(!list_link_active(&dr->dr_dirty_node));2490list_insert_tail(&dn->dn_dirty_records[txgoff], dr);2491mutex_exit(&dn->dn_mtx);2492if (drop_struct_rwlock)2493rw_exit(&dn->dn_struct_rwlock);2494}24952496dnode_setdirty(dn, tx);2497DB_DNODE_EXIT(db);2498return (dr);2499}25002501static void2502dbuf_undirty_bonus(dbuf_dirty_record_t *dr)2503{2504dmu_buf_impl_t *db = dr->dr_dbuf;25052506ASSERT(MUTEX_HELD(&db->db_mtx));2507if (dr->dt.dl.dr_data != db->db.db_data) {2508struct dnode *dn = dr->dr_dnode;2509int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);25102511kmem_free(dr->dt.dl.dr_data, max_bonuslen);2512arc_space_return(max_bonuslen, ARC_SPACE_BONUS);2513}2514db->db_data_pending = NULL;2515ASSERT(list_next(&db->db_dirty_records, dr) == NULL);2516list_remove(&db->db_dirty_records, dr);2517if (dr->dr_dbuf->db_level != 0) {2518mutex_destroy(&dr->dt.di.dr_mtx);2519list_destroy(&dr->dt.di.dr_children);2520}2521kmem_cache_free(dbuf_dirty_kmem_cache, dr);2522ASSERT3U(db->db_dirtycnt, >, 0);2523db->db_dirtycnt -= 1;2524}25252526/*2527* Undirty a buffer in the transaction group referenced by the given2528* transaction. Return whether this evicted the dbuf.2529*/2530boolean_t2531dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)2532{2533uint64_t txg = tx->tx_txg;2534boolean_t brtwrite;2535boolean_t diowrite;25362537ASSERT(txg != 0);25382539/*2540* Due to our use of dn_nlevels below, this can only be called2541* in open context, unless we are operating on the MOS or it's2542* a special object. From syncing context, dn_nlevels may be2543* different from the dn_nlevels used when dbuf was dirtied.2544*/2545ASSERT(db->db_objset ==2546dmu_objset_pool(db->db_objset)->dp_meta_objset ||2547DMU_OBJECT_IS_SPECIAL(db->db.db_object) ||2548txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));2549ASSERT(db->db_blkid != DMU_BONUS_BLKID);2550ASSERT0(db->db_level);2551ASSERT(MUTEX_HELD(&db->db_mtx));25522553/*2554* If this buffer is not dirty, we're done.2555*/2556dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);2557if (dr == NULL)2558return (B_FALSE);2559ASSERT(dr->dr_dbuf == db);25602561brtwrite = dr->dt.dl.dr_brtwrite;2562diowrite = dr->dt.dl.dr_diowrite;2563if (brtwrite) {2564ASSERT3B(diowrite, ==, B_FALSE);2565/*2566* We are freeing a block that we cloned in the same2567* transaction group.2568*/2569blkptr_t *bp = &dr->dt.dl.dr_overridden_by;2570if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {2571brt_pending_remove(dmu_objset_spa(db->db_objset),2572bp, tx);2573}2574}25752576dnode_t *dn = dr->dr_dnode;25772578dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);25792580ASSERT(db->db.db_size != 0);25812582dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),2583dr->dr_accounted, txg);25842585list_remove(&db->db_dirty_records, dr);25862587/*2588* Note that there are three places in dbuf_dirty()2589* where this dirty record may be put on a list.2590* Make sure to do a list_remove corresponding to2591* every one of those list_insert calls.2592*/2593if (dr->dr_parent) {2594mutex_enter(&dr->dr_parent->dt.di.dr_mtx);2595list_remove(&dr->dr_parent->dt.di.dr_children, dr);2596mutex_exit(&dr->dr_parent->dt.di.dr_mtx);2597} else if (db->db_blkid == DMU_SPILL_BLKID ||2598db->db_level + 1 == dn->dn_nlevels) {2599ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);2600mutex_enter(&dn->dn_mtx);2601list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);2602mutex_exit(&dn->dn_mtx);2603}26042605if (db->db_state != DB_NOFILL && !brtwrite) 
{2606dbuf_unoverride(dr);26072608if (dr->dt.dl.dr_data != db->db_buf) {2609ASSERT(db->db_buf != NULL);2610ASSERT(dr->dt.dl.dr_data != NULL);2611arc_buf_destroy(dr->dt.dl.dr_data, db);2612}2613}26142615kmem_cache_free(dbuf_dirty_kmem_cache, dr);26162617ASSERT(db->db_dirtycnt > 0);2618db->db_dirtycnt -= 1;26192620if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {2621ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||2622arc_released(db->db_buf));2623dbuf_destroy(db);2624return (B_TRUE);2625}26262627return (B_FALSE);2628}26292630void2631dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)2632{2633dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;2634boolean_t undirty = B_FALSE;26352636ASSERT(tx->tx_txg != 0);2637ASSERT(!zfs_refcount_is_zero(&db->db_holds));26382639/*2640* Quick check for dirtiness to improve performance for some workloads2641* (e.g. file deletion with indirect blocks cached).2642*/2643mutex_enter(&db->db_mtx);2644if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {2645/*2646* It's possible that the dbuf is already dirty but not cached,2647* because there are some calls to dbuf_dirty() that don't2648* go through dmu_buf_will_dirty().2649*/2650dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);2651if (dr != NULL) {2652if (db->db_level == 0 &&2653dr->dt.dl.dr_brtwrite) {2654/*2655* Block cloning: If we are dirtying a cloned2656* level 0 block, we cannot simply redirty it,2657* because this dr has no associated data.2658* We will go through a full undirtying below,2659* before dirtying it again.2660*/2661undirty = B_TRUE;2662} else {2663/* This dbuf is already dirty and cached. */2664dbuf_redirty(dr);2665mutex_exit(&db->db_mtx);2666return;2667}2668}2669}2670mutex_exit(&db->db_mtx);26712672DB_DNODE_ENTER(db);2673if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))2674flags |= DB_RF_HAVESTRUCT;2675DB_DNODE_EXIT(db);26762677/*2678* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we2679* want to make sure dbuf_read() will read the pending cloned block and2680* not the uderlying block that is being replaced. 
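 *
 * The resulting order of operations (shown here only as a rough
 * restatement of the code that follows) is:
 *
 *	(void) dbuf_read(db, NULL, flags);	- sees the pending clone
 *	dbuf_undirty(db, tx);			- only if undirty was set above
 *	(void) dbuf_dirty(db, tx);		- re-dirty with the data just read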
dbuf_undirty() will2681* do brt_pending_remove() before removing the dirty record.2682*/2683(void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);2684if (undirty) {2685mutex_enter(&db->db_mtx);2686VERIFY(!dbuf_undirty(db, tx));2687mutex_exit(&db->db_mtx);2688}2689(void) dbuf_dirty(db, tx);2690}26912692void2693dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)2694{2695dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);2696}26972698void2699dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx)2700{2701dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;27022703ASSERT(tx->tx_txg != 0);2704ASSERT(!zfs_refcount_is_zero(&db->db_holds));27052706/*2707* If the dbuf is already dirty in this txg, it will be written2708* anyway, so there's nothing to do.2709*/2710mutex_enter(&db->db_mtx);2711if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {2712mutex_exit(&db->db_mtx);2713return;2714}2715mutex_exit(&db->db_mtx);27162717/*2718* The dbuf is not dirty, so we need to make it dirty and2719* mark it for rewrite (preserve logical birth time).2720*/2721dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);27222723mutex_enter(&db->db_mtx);2724dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);2725if (dr != NULL && db->db_level == 0)2726dr->dt.dl.dr_rewrite = B_TRUE;2727mutex_exit(&db->db_mtx);2728}27292730boolean_t2731dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)2732{2733dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;2734dbuf_dirty_record_t *dr;27352736mutex_enter(&db->db_mtx);2737dr = dbuf_find_dirty_eq(db, tx->tx_txg);2738mutex_exit(&db->db_mtx);2739return (dr != NULL);2740}27412742/*2743* Normally the db_blkptr points to the most recent on-disk content for the2744* dbuf (and anything newer will be cached in the dbuf). However, a pending2745* block clone or not yet synced Direct I/O write will have a dirty record BP2746* pointing to the most recent data.2747*/2748int2749dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)2750{2751ASSERT(MUTEX_HELD(&db->db_mtx));2752int error = 0;27532754if (db->db_level != 0) {2755*bp = db->db_blkptr;2756return (0);2757}27582759*bp = db->db_blkptr;2760dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);2761if (dr && db->db_state == DB_NOFILL) {2762/* Block clone */2763if (!dr->dt.dl.dr_brtwrite)2764error = EIO;2765else2766*bp = &dr->dt.dl.dr_overridden_by;2767} else if (dr && db->db_state == DB_UNCACHED) {2768/* Direct I/O write */2769if (dr->dt.dl.dr_diowrite)2770*bp = &dr->dt.dl.dr_overridden_by;2771}27722773return (error);2774}27752776/*2777* Direct I/O reads can read directly from the ARC, but the data has2778* to be untransformed in order to copy it over into user pages.2779*/2780int2781dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)2782{2783int err = 0;2784DB_DNODE_ENTER(db);2785dnode_t *dn = DB_DNODE(db);27862787ASSERT3S(db->db_state, ==, DB_CACHED);2788ASSERT(MUTEX_HELD(&db->db_mtx));27892790/*2791* Ensure that this block's dnode has been decrypted if2792* the caller has requested decrypted data.2793*/2794err = dbuf_read_verify_dnode_crypt(db, dn, 0);27952796/*2797* If the arc buf is compressed or encrypted and the caller2798* requested uncompressed data, we need to untransform it2799* before returning. 
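 *
 * (Untransforming means arc_untransform() decrypts and/or decompresses
 * db->db_buf, so that db->db.db_data ends up pointing at plain,
 * uncompressed bytes that can be copied into the user's pages.)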
	 * We also call arc_untransform() on any
	 * unauthenticated blocks, which will verify their MAC if
	 * the key is now available.
	 */
	if (err == 0 && db->db_buf != NULL &&
	    (arc_is_encrypted(db->db_buf) ||
	    arc_is_unauthenticated(db->db_buf) ||
	    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
		    db->db.db_object, db->db_level, db->db_blkid);
		dbuf_fix_old_data(db, spa_syncing_txg(spa));
		err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
		dbuf_set_data(db, db->db_buf);
	}
	DB_DNODE_EXIT(db);
	DBUF_STAT_BUMP(hash_hits);

	return (err);
}

void
dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	/*
	 * Block clones and Direct I/O writes always happen in open-context.
	 */
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT0(db->db_level);
	ASSERT(!dmu_tx_is_syncing(tx));
	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	/*
	 * We are going to clone or issue a Direct I/O write on this block, so
	 * undirty modifications done to this block so far in this txg. This
	 * includes writes and clones into this block.
	 *
	 * If there is a dirty record associated with this txg from a previous
	 * Direct I/O write, then space accounting cleanup takes place. It is
	 * important to go ahead and free up the space accounting through
	 * dbuf_undirty() -> dbuf_unoverride() -> zio_free(). Space accounting
	 * for determining if a write can occur in zfs_write() happens through
	 * dmu_tx_assign(). This can cause an issue with Direct I/O writes in
	 * the case of overwriting the same block, because all DVA allocations
	 * are being done in open-context. Constantly allowing Direct I/O
	 * overwrites to the same block can exhaust the pool's available space,
	 * leading to ENOSPC errors at the DVA allocation part of the ZIO
	 * pipeline, which will eventually suspend the pool.
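	 *
	 * As a hypothetical illustration: a workload rewriting the same
	 * 128K block through O_DIRECT many times within one txg would,
	 * without this cleanup, accumulate a separate open-context DVA
	 * allocation for every overwrite, and allocations could start
	 * failing with ENOSPC long before the pool is actually full.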
By cleaning up sapce acccounting2853* now, the ENOSPC error can be avoided.2854*2855* Since we are undirtying the record in open-context, we must have a2856* hold on the db, so it should never be evicted after calling2857* dbuf_undirty().2858*/2859VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);2860ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));28612862if (db->db_buf != NULL) {2863/*2864* If there is an associated ARC buffer with this dbuf we can2865* only destroy it if the previous dirty record does not2866* reference it.2867*/2868dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);2869if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)2870arc_buf_destroy(db->db_buf, db);28712872/*2873* Setting the dbuf's data pointers to NULL will force all2874* future reads down to the devices to get the most up to date2875* version of the data after a Direct I/O write has completed.2876*/2877db->db_buf = NULL;2878dbuf_clear_data(db);2879}28802881ASSERT0P(db->db_buf);2882ASSERT0P(db->db.db_data);28832884db->db_state = DB_NOFILL;2885DTRACE_SET_STATE(db,2886"allocating NOFILL buffer for clone or direct I/O write");28872888DBUF_VERIFY(db);2889mutex_exit(&db->db_mtx);28902891dbuf_noread(db, DMU_KEEP_CACHING);2892(void) dbuf_dirty(db, tx);2893}28942895void2896dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)2897{2898dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;28992900mutex_enter(&db->db_mtx);2901db->db_state = DB_NOFILL;2902DTRACE_SET_STATE(db, "allocating NOFILL buffer");2903mutex_exit(&db->db_mtx);29042905dbuf_noread(db, DMU_KEEP_CACHING);2906(void) dbuf_dirty(db, tx);2907}29082909void2910dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,2911dmu_flags_t flags)2912{2913dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;29142915ASSERT(db->db_blkid != DMU_BONUS_BLKID);2916ASSERT(tx->tx_txg != 0);2917ASSERT0(db->db_level);2918ASSERT(!zfs_refcount_is_zero(&db->db_holds));29192920ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||2921dmu_tx_private_ok(tx));29222923mutex_enter(&db->db_mtx);2924dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);2925if (db->db_state == DB_NOFILL ||2926(db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {2927/*2928* If the fill can fail we should have a way to return back to2929* the cloned or Direct I/O write data.2930*/2931if (canfail && dr) {2932mutex_exit(&db->db_mtx);2933dmu_buf_will_dirty_flags(db_fake, tx, flags);2934return;2935}2936/*2937* Block cloning: We will be completely overwriting a block2938* cloned in this transaction group, so let's undirty the2939* pending clone and mark the block as uncached. This will be2940* as if the clone was never done.2941*/2942if (db->db_state == DB_NOFILL) {2943VERIFY(!dbuf_undirty(db, tx));2944db->db_state = DB_UNCACHED;2945}2946}2947mutex_exit(&db->db_mtx);29482949dbuf_noread(db, flags);2950(void) dbuf_dirty(db, tx);2951}29522953void2954dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)2955{2956dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);2957}29582959/*2960* This function is effectively the same as dmu_buf_will_dirty(), but2961* indicates the caller expects raw encrypted data in the db, and provides2962* the crypt params (byteorder, salt, iv, mac) which should be stored in the2963* blkptr_t when this dbuf is written. 
This is only used for blocks of2964* dnodes, during raw receive.2965*/2966void2967dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,2968const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)2969{2970dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;2971dbuf_dirty_record_t *dr;29722973/*2974* dr_has_raw_params is only processed for blocks of dnodes2975* (see dbuf_sync_dnode_leaf_crypt()).2976*/2977ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);2978ASSERT0(db->db_level);2979ASSERT(db->db_objset->os_raw_receive);29802981dmu_buf_will_dirty_flags(db_fake, tx,2982DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);29832984dr = dbuf_find_dirty_eq(db, tx->tx_txg);29852986ASSERT3P(dr, !=, NULL);2987ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);29882989dr->dt.dl.dr_has_raw_params = B_TRUE;2990dr->dt.dl.dr_byteorder = byteorder;2991memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);2992memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);2993memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);2994}29952996static void2997dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)2998{2999struct dirty_leaf *dl;3000dbuf_dirty_record_t *dr;30013002ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);3003ASSERT0(db->db_level);30043005dr = list_head(&db->db_dirty_records);3006ASSERT3P(dr, !=, NULL);3007ASSERT3U(dr->dr_txg, ==, tx->tx_txg);3008dl = &dr->dt.dl;3009ASSERT0(dl->dr_has_raw_params);3010dl->dr_overridden_by = *bp;3011dl->dr_override_state = DR_OVERRIDDEN;3012BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);3013}30143015boolean_t3016dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)3017{3018(void) tx;3019dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;3020mutex_enter(&db->db_mtx);3021DBUF_VERIFY(db);30223023if (db->db_state == DB_FILL) {3024if (db->db_level == 0 && db->db_freed_in_flight) {3025ASSERT(db->db_blkid != DMU_BONUS_BLKID);3026/* we were freed while filling */3027/* XXX dbuf_undirty? 
*/3028memset(db->db.db_data, 0, db->db.db_size);3029db->db_freed_in_flight = FALSE;3030db->db_state = DB_CACHED;3031DTRACE_SET_STATE(db,3032"fill done handling freed in flight");3033failed = B_FALSE;3034} else if (failed) {3035VERIFY(!dbuf_undirty(db, tx));3036arc_buf_destroy(db->db_buf, db);3037db->db_buf = NULL;3038dbuf_clear_data(db);3039DTRACE_SET_STATE(db, "fill failed");3040} else {3041db->db_state = DB_CACHED;3042DTRACE_SET_STATE(db, "fill done");3043}3044cv_broadcast(&db->db_changed);3045} else {3046db->db_state = DB_CACHED;3047failed = B_FALSE;3048}3049mutex_exit(&db->db_mtx);3050return (failed);3051}30523053void3054dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,3055bp_embedded_type_t etype, enum zio_compress comp,3056int uncompressed_size, int compressed_size, int byteorder,3057dmu_tx_t *tx)3058{3059dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;3060struct dirty_leaf *dl;3061dmu_object_type_t type;3062dbuf_dirty_record_t *dr;30633064if (etype == BP_EMBEDDED_TYPE_DATA) {3065ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),3066SPA_FEATURE_EMBEDDED_DATA));3067}30683069DB_DNODE_ENTER(db);3070type = DB_DNODE(db)->dn_type;3071DB_DNODE_EXIT(db);30723073ASSERT0(db->db_level);3074ASSERT(db->db_blkid != DMU_BONUS_BLKID);30753076dmu_buf_will_not_fill(dbuf, tx);30773078dr = list_head(&db->db_dirty_records);3079ASSERT3P(dr, !=, NULL);3080ASSERT3U(dr->dr_txg, ==, tx->tx_txg);3081dl = &dr->dt.dl;3082ASSERT0(dl->dr_has_raw_params);3083encode_embedded_bp_compressed(&dl->dr_overridden_by,3084data, comp, uncompressed_size, compressed_size);3085BPE_SET_ETYPE(&dl->dr_overridden_by, etype);3086BP_SET_TYPE(&dl->dr_overridden_by, type);3087BP_SET_LEVEL(&dl->dr_overridden_by, 0);3088BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);30893090dl->dr_override_state = DR_OVERRIDDEN;3091BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);3092}30933094void3095dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)3096{3097dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;3098dmu_object_type_t type;3099ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,3100SPA_FEATURE_REDACTED_DATASETS));31013102DB_DNODE_ENTER(db);3103type = DB_DNODE(db)->dn_type;3104DB_DNODE_EXIT(db);31053106ASSERT0(db->db_level);3107dmu_buf_will_not_fill(dbuf, tx);31083109blkptr_t bp = { { { {0} } } };3110BP_SET_TYPE(&bp, type);3111BP_SET_LEVEL(&bp, 0);3112BP_SET_BIRTH(&bp, tx->tx_txg, 0);3113BP_SET_REDACTED(&bp);3114BPE_SET_LSIZE(&bp, dbuf->db_size);31153116dbuf_override_impl(db, &bp, tx);3117}31183119/*3120* Directly assign a provided arc buf to a given dbuf if it's not referenced3121* by anybody except our caller. 
Otherwise copy arcbuf's contents to dbuf.3122*/3123void3124dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,3125dmu_flags_t flags)3126{3127ASSERT(!zfs_refcount_is_zero(&db->db_holds));3128ASSERT(db->db_blkid != DMU_BONUS_BLKID);3129ASSERT0(db->db_level);3130ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));3131ASSERT(buf != NULL);3132ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);3133ASSERT(tx->tx_txg != 0);31343135arc_return_buf(buf, db);3136ASSERT(arc_released(buf));31373138mutex_enter(&db->db_mtx);3139if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))3140db->db_pending_evict = B_FALSE;3141db->db_partial_read = B_FALSE;31423143while (db->db_state == DB_READ || db->db_state == DB_FILL)3144cv_wait(&db->db_changed, &db->db_mtx);31453146ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||3147db->db_state == DB_NOFILL);31483149if (db->db_state == DB_CACHED &&3150zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {3151/*3152* In practice, we will never have a case where we have an3153* encrypted arc buffer while additional holds exist on the3154* dbuf. We don't handle this here so we simply assert that3155* fact instead.3156*/3157ASSERT(!arc_is_encrypted(buf));3158mutex_exit(&db->db_mtx);3159(void) dbuf_dirty(db, tx);3160memcpy(db->db.db_data, buf->b_data, db->db.db_size);3161arc_buf_destroy(buf, db);3162return;3163}31643165if (db->db_state == DB_CACHED) {3166dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);31673168ASSERT(db->db_buf != NULL);3169if (dr != NULL && dr->dr_txg == tx->tx_txg) {3170ASSERT(dr->dt.dl.dr_data == db->db_buf);31713172if (!arc_released(db->db_buf)) {3173ASSERT(dr->dt.dl.dr_override_state ==3174DR_OVERRIDDEN);3175arc_release(db->db_buf, db);3176}3177dr->dt.dl.dr_data = buf;3178arc_buf_destroy(db->db_buf, db);3179} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {3180arc_release(db->db_buf, db);3181arc_buf_destroy(db->db_buf, db);3182}3183db->db_buf = NULL;3184} else if (db->db_state == DB_NOFILL) {3185/*3186* We will be completely replacing the cloned block. In case3187* it was cloned in this transaction group, let's undirty the3188* pending clone and mark the block as uncached. 
This will be3189* as if the clone was never done.3190*/3191VERIFY(!dbuf_undirty(db, tx));3192db->db_state = DB_UNCACHED;3193}3194ASSERT0P(db->db_buf);3195dbuf_set_data(db, buf);3196db->db_state = DB_FILL;3197DTRACE_SET_STATE(db, "filling assigned arcbuf");3198mutex_exit(&db->db_mtx);3199(void) dbuf_dirty(db, tx);3200dmu_buf_fill_done(&db->db, tx, B_FALSE);3201}32023203void3204dbuf_destroy(dmu_buf_impl_t *db)3205{3206dnode_t *dn;3207dmu_buf_impl_t *parent = db->db_parent;3208dmu_buf_impl_t *dndb;32093210ASSERT(MUTEX_HELD(&db->db_mtx));3211ASSERT(zfs_refcount_is_zero(&db->db_holds));32123213if (db->db_buf != NULL) {3214arc_buf_destroy(db->db_buf, db);3215db->db_buf = NULL;3216}32173218if (db->db_blkid == DMU_BONUS_BLKID) {3219int slots = DB_DNODE(db)->dn_num_slots;3220int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);3221if (db->db.db_data != NULL) {3222kmem_free(db->db.db_data, bonuslen);3223arc_space_return(bonuslen, ARC_SPACE_BONUS);3224db->db_state = DB_UNCACHED;3225DTRACE_SET_STATE(db, "buffer cleared");3226}3227}32283229dbuf_clear_data(db);32303231if (multilist_link_active(&db->db_cache_link)) {3232ASSERT(db->db_caching_status == DB_DBUF_CACHE ||3233db->db_caching_status == DB_DBUF_METADATA_CACHE);32343235multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);32363237ASSERT0(dmu_buf_user_size(&db->db));3238(void) zfs_refcount_remove_many(3239&dbuf_caches[db->db_caching_status].size,3240db->db.db_size, db);32413242if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {3243DBUF_STAT_BUMPDOWN(metadata_cache_count);3244} else {3245DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);3246DBUF_STAT_BUMPDOWN(cache_count);3247DBUF_STAT_DECR(cache_levels_bytes[db->db_level],3248db->db.db_size);3249}3250db->db_caching_status = DB_NO_CACHE;3251}32523253ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);3254ASSERT0P(db->db_data_pending);3255ASSERT(list_is_empty(&db->db_dirty_records));32563257db->db_state = DB_EVICTING;3258DTRACE_SET_STATE(db, "buffer eviction started");3259db->db_blkptr = NULL;32603261/*3262* Now that db_state is DB_EVICTING, nobody else can find this via3263* the hash table. 
We can now drop db_mtx, which allows us to3264* acquire the dn_dbufs_mtx.3265*/3266mutex_exit(&db->db_mtx);32673268DB_DNODE_ENTER(db);3269dn = DB_DNODE(db);3270dndb = dn->dn_dbuf;3271if (db->db_blkid != DMU_BONUS_BLKID) {3272boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);3273if (needlock)3274mutex_enter_nested(&dn->dn_dbufs_mtx,3275NESTED_SINGLE);3276avl_remove(&dn->dn_dbufs, db);3277membar_producer();3278DB_DNODE_EXIT(db);3279if (needlock)3280mutex_exit(&dn->dn_dbufs_mtx);3281/*3282* Decrementing the dbuf count means that the hold corresponding3283* to the removed dbuf is no longer discounted in dnode_move(),3284* so the dnode cannot be moved until after we release the hold.3285* The membar_producer() ensures visibility of the decremented3286* value in dnode_move(), since DB_DNODE_EXIT doesn't actually3287* release any lock.3288*/3289mutex_enter(&dn->dn_mtx);3290dnode_rele_and_unlock(dn, db, B_TRUE);3291#ifdef USE_DNODE_HANDLE3292db->db_dnode_handle = NULL;3293#else3294db->db_dnode = NULL;3295#endif32963297dbuf_hash_remove(db);3298} else {3299DB_DNODE_EXIT(db);3300}33013302ASSERT(zfs_refcount_is_zero(&db->db_holds));33033304db->db_parent = NULL;33053306ASSERT0P(db->db_buf);3307ASSERT0P(db->db.db_data);3308ASSERT0P(db->db_hash_next);3309ASSERT0P(db->db_blkptr);3310ASSERT0P(db->db_data_pending);3311ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);3312ASSERT(!multilist_link_active(&db->db_cache_link));33133314/*3315* If this dbuf is referenced from an indirect dbuf,3316* decrement the ref count on the indirect dbuf.3317*/3318if (parent && parent != dndb) {3319mutex_enter(&parent->db_mtx);3320dbuf_rele_and_unlock(parent, db, B_TRUE);3321}33223323kmem_cache_free(dbuf_kmem_cache, db);3324arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);3325}33263327/*3328* Note: While bpp will always be updated if the function returns success,3329* parentp will not be updated if the dnode does not have dn_dbuf filled in;3330* this happens when the dnode is the meta-dnode, or {user|group|project}used3331* object.3332*/3333__attribute__((always_inline))3334static inline int3335dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,3336dmu_buf_impl_t **parentp, blkptr_t **bpp)3337{3338*parentp = NULL;3339*bpp = NULL;33403341ASSERT(blkid != DMU_BONUS_BLKID);33423343if (blkid == DMU_SPILL_BLKID) {3344mutex_enter(&dn->dn_mtx);3345if (dn->dn_have_spill &&3346(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))3347*bpp = DN_SPILL_BLKPTR(dn->dn_phys);3348else3349*bpp = NULL;3350dbuf_add_ref(dn->dn_dbuf, NULL);3351*parentp = dn->dn_dbuf;3352mutex_exit(&dn->dn_mtx);3353return (0);3354}33553356int nlevels =3357(dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;3358int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;33593360ASSERT3U(level * epbs, <, 64);3361ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));3362/*3363* This assertion shouldn't trip as long as the max indirect block size3364* is less than 1M. The reason for this is that up to that point,3365* the number of levels required to address an entire object with blocks3366* of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In3367* other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 553368* (i.e. we can address the entire object), objects will all use at most3369* N-1 levels and the assertion won't overflow. However, once epbs is3370* 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. 
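 * (For comparison, assuming the current limits of DN_MAX_INDBLKSHIFT = 17
 * and SPA_BLKPTRSHIFT = 7: epbs tops out at 17 - 7 = 10, and
 * 6 * 10 + 1 = 61 <= 64, so the shipping limits stay inside the
 * assertion; the hypothetical epbs = 13 case continues below.)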
Then, 4 levels will not be3371* enough to address an entire object, so objects will have 5 levels,3372* but then this assertion will overflow.3373*3374* All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we3375* need to redo this logic to handle overflows.3376*/3377ASSERT(level >= nlevels ||3378((nlevels - level - 1) * epbs) +3379highbit64(dn->dn_phys->dn_nblkptr) <= 64);3380if (level >= nlevels ||3381blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<3382((nlevels - level - 1) * epbs)) ||3383(fail_sparse &&3384blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {3385/* the buffer has no parent yet */3386return (SET_ERROR(ENOENT));3387} else if (level < nlevels-1) {3388/* this block is referenced from an indirect block */3389int err;33903391err = dbuf_hold_impl(dn, level + 1,3392blkid >> epbs, fail_sparse, FALSE, NULL, parentp);33933394if (err)3395return (err);3396err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |3397DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);3398if (err) {3399dbuf_rele(*parentp, NULL);3400*parentp = NULL;3401return (err);3402}3403*bpp = ((blkptr_t *)(*parentp)->db.db_data) +3404(blkid & ((1ULL << epbs) - 1));3405return (0);3406} else {3407/* the block is referenced from the dnode */3408ASSERT3U(level, ==, nlevels-1);3409ASSERT(dn->dn_phys->dn_nblkptr == 0 ||3410blkid < dn->dn_phys->dn_nblkptr);3411if (dn->dn_dbuf) {3412dbuf_add_ref(dn->dn_dbuf, NULL);3413*parentp = dn->dn_dbuf;3414}3415*bpp = &dn->dn_phys->dn_blkptr[blkid];3416return (0);3417}3418}34193420static dmu_buf_impl_t *3421dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,3422dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)3423{3424objset_t *os = dn->dn_objset;3425dmu_buf_impl_t *db, *odb;34263427ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));3428ASSERT(dn->dn_type != DMU_OT_NONE);34293430db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);34313432list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),3433offsetof(dbuf_dirty_record_t, dr_dbuf_node));34343435db->db_objset = os;3436db->db.db_object = dn->dn_object;3437db->db_level = level;3438db->db_blkid = blkid;3439db->db_dirtycnt = 0;3440#ifdef USE_DNODE_HANDLE3441db->db_dnode_handle = dn->dn_handle;3442#else3443db->db_dnode = dn;3444#endif3445db->db_parent = parent;3446db->db_blkptr = blkptr;3447db->db_hash = hash;34483449db->db_user = NULL;3450db->db_user_immediate_evict = FALSE;3451db->db_freed_in_flight = FALSE;3452db->db_pending_evict = TRUE;3453db->db_partial_read = FALSE;34543455if (blkid == DMU_BONUS_BLKID) {3456ASSERT3P(parent, ==, dn->dn_dbuf);3457db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -3458(dn->dn_nblkptr-1) * sizeof (blkptr_t);3459ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);3460db->db.db_offset = DMU_BONUS_BLKID;3461db->db_state = DB_UNCACHED;3462DTRACE_SET_STATE(db, "bonus buffer created");3463db->db_caching_status = DB_NO_CACHE;3464/* the bonus dbuf is not placed in the hash table */3465arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);3466return (db);3467} else if (blkid == DMU_SPILL_BLKID) {3468db->db.db_size = (blkptr != NULL) ?3469BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;3470db->db.db_offset = 0;3471} else {3472int blocksize =3473db->db_level ? 
1 << dn->dn_indblkshift : dn->dn_datablksz;3474db->db.db_size = blocksize;3475db->db.db_offset = db->db_blkid * blocksize;3476}34773478/*3479* Hold the dn_dbufs_mtx while we get the new dbuf3480* in the hash table *and* added to the dbufs list.3481* This prevents a possible deadlock with someone3482* trying to look up this dbuf before it's added to the3483* dn_dbufs list.3484*/3485mutex_enter(&dn->dn_dbufs_mtx);3486db->db_state = DB_EVICTING; /* not worth logging this state change */3487if ((odb = dbuf_hash_insert(db)) != NULL) {3488/* someone else inserted it first */3489mutex_exit(&dn->dn_dbufs_mtx);3490kmem_cache_free(dbuf_kmem_cache, db);3491DBUF_STAT_BUMP(hash_insert_race);3492return (odb);3493}3494avl_add(&dn->dn_dbufs, db);34953496db->db_state = DB_UNCACHED;3497DTRACE_SET_STATE(db, "regular buffer created");3498db->db_caching_status = DB_NO_CACHE;3499mutex_exit(&dn->dn_dbufs_mtx);3500arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);35013502if (parent && parent != dn->dn_dbuf)3503dbuf_add_ref(parent, db);35043505ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||3506zfs_refcount_count(&dn->dn_holds) > 0);3507(void) zfs_refcount_add(&dn->dn_holds, db);35083509dprintf_dbuf(db, "db=%p\n", db);35103511return (db);3512}35133514/*3515* This function returns a block pointer and information about the object,3516* given a dnode and a block. This is a publicly accessible version of3517* dbuf_findbp that only returns some information, rather than the3518* dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock3519* should be locked as (at least) a reader.3520*/3521int3522dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,3523blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)3524{3525dmu_buf_impl_t *dbp = NULL;3526blkptr_t *bp2;3527int err = 0;3528ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));35293530err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);3531if (err == 0) {3532ASSERT3P(bp2, !=, NULL);3533*bp = *bp2;3534if (dbp != NULL)3535dbuf_rele(dbp, NULL);3536if (datablkszsec != NULL)3537*datablkszsec = dn->dn_phys->dn_datablkszsec;3538if (indblkshift != NULL)3539*indblkshift = dn->dn_phys->dn_indblkshift;3540}35413542return (err);3543}35443545typedef struct dbuf_prefetch_arg {3546spa_t *dpa_spa; /* The spa to issue the prefetch in. */3547zbookmark_phys_t dpa_zb; /* The target block to prefetch. */3548int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */3549int dpa_curlevel; /* The current level that we're reading */3550dnode_t *dpa_dnode; /* The dnode associated with the prefetch */3551zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */3552zio_t *dpa_zio; /* The parent zio_t for all prefetches. */3553arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. 
*/3554dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */3555void *dpa_arg; /* prefetch completion arg */3556} dbuf_prefetch_arg_t;35573558static void3559dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)3560{3561if (dpa->dpa_cb != NULL) {3562dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,3563dpa->dpa_zb.zb_blkid, io_done);3564}3565kmem_free(dpa, sizeof (*dpa));3566}35673568static void3569dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,3570const blkptr_t *iobp, arc_buf_t *abuf, void *private)3571{3572(void) zio, (void) zb, (void) iobp;3573dbuf_prefetch_arg_t *dpa = private;35743575if (abuf != NULL)3576arc_buf_destroy(abuf, private);35773578dbuf_prefetch_fini(dpa, B_TRUE);3579}35803581/*3582* Actually issue the prefetch read for the block given.3583*/3584static void3585dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)3586{3587ASSERT(!BP_IS_HOLE(bp));3588ASSERT(!BP_IS_REDACTED(bp));3589if (BP_IS_EMBEDDED(bp))3590return (dbuf_prefetch_fini(dpa, B_FALSE));35913592int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;3593arc_flags_t aflags =3594dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |3595ARC_FLAG_NO_BUF;35963597/* dnodes are always read as raw and then converted later */3598if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&3599dpa->dpa_curlevel == 0)3600zio_flags |= ZIO_FLAG_RAW;36013602ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));3603ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);3604ASSERT(dpa->dpa_zio != NULL);3605(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,3606dbuf_issue_final_prefetch_done, dpa,3607dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);3608}36093610/*3611* Called when an indirect block above our prefetch target is read in. This3612* will either read in the next indirect block down the tree or issue the actual3613* prefetch if the next block down is our target.3614*/3615static void3616dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,3617const blkptr_t *iobp, arc_buf_t *abuf, void *private)3618{3619(void) zb, (void) iobp;3620dbuf_prefetch_arg_t *dpa = private;36213622ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);3623ASSERT3S(dpa->dpa_curlevel, >, 0);36243625if (abuf == NULL) {3626ASSERT(zio == NULL || zio->io_error != 0);3627dbuf_prefetch_fini(dpa, B_TRUE);3628return;3629}3630ASSERT(zio == NULL || zio->io_error == 0);36313632/*3633* The dpa_dnode is only valid if we are called with a NULL3634* zio. This indicates that the arc_read() returned without3635* first calling zio_read() to issue a physical read. Once3636* a physical read is made the dpa_dnode must be invalidated3637* as the locks guarding it may have been dropped. If the3638* dpa_dnode is still valid, then we want to add it to the dbuf3639* cache. 
To do so, we must hold the dbuf associated with the block3640* we just prefetched, read its contents so that we associate it3641* with an arc_buf_t, and then release it.3642*/3643if (zio != NULL) {3644ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);3645if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {3646ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);3647} else {3648ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);3649}3650ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);36513652dpa->dpa_dnode = NULL;3653} else if (dpa->dpa_dnode != NULL) {3654uint64_t curblkid = dpa->dpa_zb.zb_blkid >>3655(dpa->dpa_epbs * (dpa->dpa_curlevel -3656dpa->dpa_zb.zb_level));3657dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,3658dpa->dpa_curlevel, curblkid, FTAG);3659if (db == NULL) {3660arc_buf_destroy(abuf, private);3661dbuf_prefetch_fini(dpa, B_TRUE);3662return;3663}3664(void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |3665DMU_READ_NO_PREFETCH);3666dbuf_rele(db, FTAG);3667}36683669dpa->dpa_curlevel--;3670uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>3671(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));3672blkptr_t *bp = ((blkptr_t *)abuf->b_data) +3673P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);36743675ASSERT(!BP_IS_REDACTED(bp) || dpa->dpa_dnode == NULL ||3676dsl_dataset_feature_is_active(3677dpa->dpa_dnode->dn_objset->os_dsl_dataset,3678SPA_FEATURE_REDACTED_DATASETS));3679if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {3680arc_buf_destroy(abuf, private);3681dbuf_prefetch_fini(dpa, B_TRUE);3682return;3683} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {3684ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);3685dbuf_issue_final_prefetch(dpa, bp);3686} else {3687arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;3688zbookmark_phys_t zb;36893690/* flag if L2ARC eligible, l2arc_noprefetch then decides */3691if (dpa->dpa_dnode) {3692if (dnode_level_is_l2cacheable(bp, dpa->dpa_dnode,3693dpa->dpa_curlevel))3694iter_aflags |= ARC_FLAG_L2CACHE;3695} else {3696if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)3697iter_aflags |= ARC_FLAG_L2CACHE;3698}36993700ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));37013702SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,3703dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);37043705(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,3706bp, dbuf_prefetch_indirect_done, dpa,3707ZIO_PRIORITY_SYNC_READ,3708ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,3709&iter_aflags, &zb);3710}37113712arc_buf_destroy(abuf, private);3713}37143715/*3716* Issue prefetch reads for the given block on the given level. If the indirect3717* blocks above that block are not in memory, we will read them in3718* asynchronously. As a result, this call never blocks waiting for a read to3719* complete. 
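 *
 * A sketch of a typical call site (illustrative only; the locking
 * requirement matches the ASSERT below):
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	(void) dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ, 0);
 *	rw_exit(&dn->dn_struct_rwlock);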
Note that the prefetch might fail if the dataset is encrypted and3720* the encryption key is unmapped before the IO completes.3721*/3722int3723dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,3724zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,3725void *arg)3726{3727blkptr_t bp;3728int epbs, nlevels, curlevel;3729uint64_t curblkid;37303731ASSERT(blkid != DMU_BONUS_BLKID);3732ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));37333734if (blkid > dn->dn_maxblkid)3735goto no_issue;37363737if (level == 0 && dnode_block_freed(dn, blkid))3738goto no_issue;37393740/*3741* This dnode hasn't been written to disk yet, so there's nothing to3742* prefetch.3743*/3744nlevels = dn->dn_phys->dn_nlevels;3745if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)3746goto no_issue;37473748epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;3749if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))3750goto no_issue;37513752dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,3753level, blkid, NULL);3754if (db != NULL) {3755mutex_exit(&db->db_mtx);3756/*3757* This dbuf already exists. It is either CACHED, or3758* (we assume) about to be read or filled.3759*/3760goto no_issue;3761}37623763/*3764* Find the closest ancestor (indirect block) of the target block3765* that is present in the cache. In this indirect block, we will3766* find the bp that is at curlevel, curblkid.3767*/3768curlevel = level;3769curblkid = blkid;3770while (curlevel < nlevels - 1) {3771int parent_level = curlevel + 1;3772uint64_t parent_blkid = curblkid >> epbs;3773dmu_buf_impl_t *db;37743775if (dbuf_hold_impl(dn, parent_level, parent_blkid,3776FALSE, TRUE, FTAG, &db) == 0) {3777blkptr_t *bpp = db->db_buf->b_data;3778bp = bpp[P2PHASE(curblkid, 1 << epbs)];3779dbuf_rele(db, FTAG);3780break;3781}37823783curlevel = parent_level;3784curblkid = parent_blkid;3785}37863787if (curlevel == nlevels - 1) {3788/* No cached indirect blocks found. */3789ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);3790bp = dn->dn_phys->dn_blkptr[curblkid];3791}3792ASSERT(!BP_IS_REDACTED(&bp) ||3793dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,3794SPA_FEATURE_REDACTED_DATASETS));3795if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))3796goto no_issue;37973798ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));37993800zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,3801ZIO_FLAG_CANFAIL);38023803dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);3804dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;3805SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,3806dn->dn_object, level, blkid);3807dpa->dpa_curlevel = curlevel;3808dpa->dpa_prio = prio;3809dpa->dpa_aflags = aflags;3810dpa->dpa_spa = dn->dn_objset->os_spa;3811dpa->dpa_dnode = dn;3812dpa->dpa_epbs = epbs;3813dpa->dpa_zio = pio;3814dpa->dpa_cb = cb;3815dpa->dpa_arg = arg;38163817if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))3818dpa->dpa_aflags |= ARC_FLAG_UNCACHED;3819else if (dnode_level_is_l2cacheable(&bp, dn, level))3820dpa->dpa_aflags |= ARC_FLAG_L2CACHE;38213822/*3823* If we have the indirect just above us, no need to do the asynchronous3824* prefetch chain; we'll just run the last step ourselves. 
If we're at3825* a higher level, though, we want to issue the prefetches for all the3826* indirect blocks asynchronously, so we can go on with whatever we were3827* doing.3828*/3829if (curlevel == level) {3830ASSERT3U(curblkid, ==, blkid);3831dbuf_issue_final_prefetch(dpa, &bp);3832} else {3833arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;3834zbookmark_phys_t zb;38353836/* flag if L2ARC eligible, l2arc_noprefetch then decides */3837if (dnode_level_is_l2cacheable(&bp, dn, curlevel))3838iter_aflags |= ARC_FLAG_L2CACHE;38393840SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,3841dn->dn_object, curlevel, curblkid);3842(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,3843&bp, dbuf_prefetch_indirect_done, dpa,3844ZIO_PRIORITY_SYNC_READ,3845ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,3846&iter_aflags, &zb);3847}3848/*3849* We use pio here instead of dpa_zio since it's possible that3850* dpa may have already been freed.3851*/3852zio_nowait(pio);3853return (1);3854no_issue:3855if (cb != NULL)3856cb(arg, level, blkid, B_FALSE);3857return (0);3858}38593860int3861dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,3862arc_flags_t aflags)3863{38643865return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));3866}38673868/*3869* Helper function for dbuf_hold_impl() to copy a buffer. Handles3870* the case of encrypted, compressed and uncompressed buffers by3871* allocating the new buffer, respectively, with arc_alloc_raw_buf(),3872* arc_alloc_compressed_buf() or arc_alloc_buf().*3873*3874* NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().3875*/3876noinline static void3877dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)3878{3879dbuf_dirty_record_t *dr = db->db_data_pending;3880arc_buf_t *data = dr->dt.dl.dr_data;3881arc_buf_t *db_data;3882enum zio_compress compress_type = arc_get_compression(data);3883uint8_t complevel = arc_get_complevel(data);38843885if (arc_is_encrypted(data)) {3886boolean_t byteorder;3887uint8_t salt[ZIO_DATA_SALT_LEN];3888uint8_t iv[ZIO_DATA_IV_LEN];3889uint8_t mac[ZIO_DATA_MAC_LEN];38903891arc_get_raw_params(data, &byteorder, salt, iv, mac);3892db_data = arc_alloc_raw_buf(dn->dn_objset->os_spa, db,3893dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,3894dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),3895compress_type, complevel);3896} else if (compress_type != ZIO_COMPRESS_OFF) {3897db_data = arc_alloc_compressed_buf(3898dn->dn_objset->os_spa, db, arc_buf_size(data),3899arc_buf_lsize(data), compress_type, complevel);3900} else {3901db_data = arc_alloc_buf(dn->dn_objset->os_spa, db,3902DBUF_GET_BUFC_TYPE(db), db->db.db_size);3903}3904memcpy(db_data->b_data, data->b_data, arc_buf_size(data));39053906dbuf_set_data(db, db_data);3907}39083909/*3910* Returns with db_holds incremented, and db_mtx not held.3911* Note: dn_struct_rwlock must be held.3912*/3913int3914dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,3915boolean_t fail_sparse, boolean_t fail_uncached,3916const void *tag, dmu_buf_impl_t **dbp)3917{3918dmu_buf_impl_t *db, *parent = NULL;3919uint64_t hv;39203921/* If the pool has been created, verify the tx_sync_lock is not held */3922spa_t *spa = dn->dn_objset->os_spa;3923dsl_pool_t *dp = spa->spa_dsl_pool;3924if (dp != NULL) {3925ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));3926}39273928ASSERT(blkid != DMU_BONUS_BLKID);3929ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));3930if (!fail_sparse)3931ASSERT3U(dn->dn_nlevels, >, level);39323933*dbp = NULL;39343935/* dbuf_find() returns with db_mtx held */3936db = 
dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);39373938if (db == NULL) {3939blkptr_t *bp = NULL;3940int err;39413942if (fail_uncached)3943return (SET_ERROR(ENOENT));39443945ASSERT0P(parent);3946err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);3947if (fail_sparse) {3948if (err == 0 && bp && BP_IS_HOLE(bp))3949err = SET_ERROR(ENOENT);3950if (err) {3951if (parent)3952dbuf_rele(parent, NULL);3953return (err);3954}3955}3956if (err && err != ENOENT)3957return (err);3958db = dbuf_create(dn, level, blkid, parent, bp, hv);3959}39603961if (fail_uncached && db->db_state != DB_CACHED) {3962mutex_exit(&db->db_mtx);3963return (SET_ERROR(ENOENT));3964}39653966if (db->db_buf != NULL) {3967arc_buf_access(db->db_buf);3968ASSERT(MUTEX_HELD(&db->db_mtx));3969ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);3970}39713972ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));39733974/*3975* If this buffer is currently syncing out, and we are3976* still referencing it from db_data, we need to make a copy3977* of it in case we decide we want to dirty it again in this txg.3978*/3979if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&3980dn->dn_object != DMU_META_DNODE_OBJECT &&3981db->db_state == DB_CACHED && db->db_data_pending) {3982dbuf_dirty_record_t *dr = db->db_data_pending;3983if (dr->dt.dl.dr_data == db->db_buf) {3984ASSERT3P(db->db_buf, !=, NULL);3985dbuf_hold_copy(dn, db);3986}3987}39883989if (multilist_link_active(&db->db_cache_link)) {3990ASSERT(zfs_refcount_is_zero(&db->db_holds));3991ASSERT(db->db_caching_status == DB_DBUF_CACHE ||3992db->db_caching_status == DB_DBUF_METADATA_CACHE);39933994multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);39953996uint64_t size = db->db.db_size;3997uint64_t usize = dmu_buf_user_size(&db->db);3998(void) zfs_refcount_remove_many(3999&dbuf_caches[db->db_caching_status].size, size, db);4000(void) zfs_refcount_remove_many(4001&dbuf_caches[db->db_caching_status].size, usize,4002db->db_user);40034004if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {4005DBUF_STAT_BUMPDOWN(metadata_cache_count);4006} else {4007DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);4008DBUF_STAT_BUMPDOWN(cache_count);4009DBUF_STAT_DECR(cache_levels_bytes[db->db_level],4010size + usize);4011}4012db->db_caching_status = DB_NO_CACHE;4013}4014(void) zfs_refcount_add(&db->db_holds, tag);4015DBUF_VERIFY(db);4016mutex_exit(&db->db_mtx);40174018/* NOTE: we can't rele the parent until after we drop the db_mtx */4019if (parent)4020dbuf_rele(parent, NULL);40214022ASSERT3P(DB_DNODE(db), ==, dn);4023ASSERT3U(db->db_blkid, ==, blkid);4024ASSERT3U(db->db_level, ==, level);4025*dbp = db;40264027return (0);4028}40294030dmu_buf_impl_t *4031dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)4032{4033return (dbuf_hold_level(dn, 0, blkid, tag));4034}40354036dmu_buf_impl_t *4037dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)4038{4039dmu_buf_impl_t *db;4040int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);4041return (err ? 
NULL : db);4042}40434044void4045dbuf_create_bonus(dnode_t *dn)4046{4047ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));40484049ASSERT0P(dn->dn_bonus);4050dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,4051dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));4052dn->dn_bonus->db_pending_evict = FALSE;4053}40544055int4056dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)4057{4058dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;40594060if (db->db_blkid != DMU_SPILL_BLKID)4061return (SET_ERROR(ENOTSUP));4062if (blksz == 0)4063blksz = SPA_MINBLOCKSIZE;4064ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));4065blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);40664067dbuf_new_size(db, blksz, tx);40684069return (0);4070}40714072void4073dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)4074{4075dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);4076}40774078#pragma weak dmu_buf_add_ref = dbuf_add_ref4079void4080dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)4081{4082int64_t holds = zfs_refcount_add(&db->db_holds, tag);4083VERIFY3S(holds, >, 1);4084}40854086#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref4087boolean_t4088dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,4089const void *tag)4090{4091dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;4092dmu_buf_impl_t *found_db;4093boolean_t result = B_FALSE;40944095if (blkid == DMU_BONUS_BLKID)4096found_db = dbuf_find_bonus(os, obj);4097else4098found_db = dbuf_find(os, obj, 0, blkid, NULL);40994100if (found_db != NULL) {4101if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {4102(void) zfs_refcount_add(&db->db_holds, tag);4103result = B_TRUE;4104}4105mutex_exit(&found_db->db_mtx);4106}4107return (result);4108}41094110/*4111* If you call dbuf_rele() you had better not be referencing the dnode handle4112* unless you have some other direct or indirect hold on the dnode. (An indirect4113* hold is a hold on one of the dnode's dbufs, including the bonus buffer.)4114* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the4115* dnode's parent dbuf evicting its dnode handles.4116*/4117void4118dbuf_rele(dmu_buf_impl_t *db, const void *tag)4119{4120mutex_enter(&db->db_mtx);4121dbuf_rele_and_unlock(db, tag, B_FALSE);4122}41234124void4125dmu_buf_rele(dmu_buf_t *db, const void *tag)4126{4127dbuf_rele((dmu_buf_impl_t *)db, tag);4128}41294130/*4131* dbuf_rele() for an already-locked dbuf. This is necessary to allow4132* db_dirtycnt and db_holds to be updated atomically. The 'evicting'4133* argument should be set if we are already in the dbuf-evicting code4134* path, in which case we don't want to recursively evict. 
This allows us to4135* avoid deeply nested stacks that would have a call flow similar to this:4136*4137* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()4138* ^ |4139* | |4140* +-----dbuf_destroy()<--dbuf_evict_one()<--------+4141*4142*/4143void4144dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)4145{4146int64_t holds;4147uint64_t size;41484149ASSERT(MUTEX_HELD(&db->db_mtx));4150DBUF_VERIFY(db);41514152/*4153* Remove the reference to the dbuf before removing its hold on the4154* dnode so we can guarantee in dnode_move() that a referenced bonus4155* buffer has a corresponding dnode hold.4156*/4157holds = zfs_refcount_remove(&db->db_holds, tag);4158ASSERT(holds >= 0);41594160/*4161* We can't freeze indirects if there is a possibility that they4162* may be modified in the current syncing context.4163*/4164if (db->db_buf != NULL &&4165holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {4166arc_buf_freeze(db->db_buf);4167}41684169if (holds == db->db_dirtycnt &&4170db->db_level == 0 && db->db_user_immediate_evict)4171dbuf_evict_user(db);41724173if (holds == 0) {4174if (db->db_blkid == DMU_BONUS_BLKID) {4175dnode_t *dn;4176boolean_t evict_dbuf = db->db_pending_evict;41774178/*4179* If the dnode moves here, we cannot cross this4180* barrier until the move completes.4181*/4182DB_DNODE_ENTER(db);41834184dn = DB_DNODE(db);4185atomic_dec_32(&dn->dn_dbufs_count);41864187/*4188* Decrementing the dbuf count means that the bonus4189* buffer's dnode hold is no longer discounted in4190* dnode_move(). The dnode cannot move until after4191* the dnode_rele() below.4192*/4193DB_DNODE_EXIT(db);41944195/*4196* Do not reference db after its lock is dropped.4197* Another thread may evict it.4198*/4199mutex_exit(&db->db_mtx);42004201if (evict_dbuf)4202dnode_evict_bonus(dn);42034204dnode_rele(dn, db);4205} else if (db->db_buf == NULL) {4206/*4207* This is a special case: we never associated this4208* dbuf with any data allocated from the ARC.4209*/4210ASSERT(db->db_state == DB_UNCACHED ||4211db->db_state == DB_NOFILL);4212dbuf_destroy(db);4213} else if (arc_released(db->db_buf)) {4214/*4215* This dbuf has anonymous data associated with it.4216*/4217dbuf_destroy(db);4218} else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {4219/*4220* We don't expect more accesses to the dbuf, and it4221* is either not cacheable or was marked for eviction.4222*/4223dbuf_destroy(db);4224} else if (!multilist_link_active(&db->db_cache_link)) {4225ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);42264227dbuf_cached_state_t dcs =4228dbuf_include_in_metadata_cache(db) ?4229DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;4230db->db_caching_status = dcs;42314232multilist_insert(&dbuf_caches[dcs].cache, db);4233uint64_t db_size = db->db.db_size;4234uint64_t dbu_size = dmu_buf_user_size(&db->db);4235(void) zfs_refcount_add_many(4236&dbuf_caches[dcs].size, db_size, db);4237size = zfs_refcount_add_many(4238&dbuf_caches[dcs].size, dbu_size, db->db_user);4239uint8_t db_level = db->db_level;4240mutex_exit(&db->db_mtx);42414242if (dcs == DB_DBUF_METADATA_CACHE) {4243DBUF_STAT_BUMP(metadata_cache_count);4244DBUF_STAT_MAX(metadata_cache_size_bytes_max,4245size);4246} else {4247DBUF_STAT_BUMP(cache_count);4248DBUF_STAT_MAX(cache_size_bytes_max, size);4249DBUF_STAT_BUMP(cache_levels[db_level]);4250DBUF_STAT_INCR(cache_levels_bytes[db_level],4251db_size + dbu_size);4252}42534254if (dcs == DB_DBUF_CACHE && !evicting)4255dbuf_evict_notify(size);4256}4257} else {4258mutex_exit(&db->db_mtx);4259}4260}42614262#pragma weak 
dmu_buf_refcount = dbuf_refcount4263uint64_t4264dbuf_refcount(dmu_buf_impl_t *db)4265{4266return (zfs_refcount_count(&db->db_holds));4267}42684269uint64_t4270dmu_buf_user_refcount(dmu_buf_t *db_fake)4271{4272uint64_t holds;4273dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;42744275mutex_enter(&db->db_mtx);4276ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);4277holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;4278mutex_exit(&db->db_mtx);42794280return (holds);4281}42824283void *4284dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,4285dmu_buf_user_t *new_user)4286{4287dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;42884289mutex_enter(&db->db_mtx);4290dbuf_verify_user(db, DBVU_NOT_EVICTING);4291if (db->db_user == old_user)4292db->db_user = new_user;4293else4294old_user = db->db_user;4295dbuf_verify_user(db, DBVU_NOT_EVICTING);4296mutex_exit(&db->db_mtx);42974298return (old_user);4299}43004301void *4302dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)4303{4304return (dmu_buf_replace_user(db_fake, NULL, user));4305}43064307void *4308dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)4309{4310dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;43114312db->db_user_immediate_evict = TRUE;4313return (dmu_buf_set_user(db_fake, user));4314}43154316void *4317dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)4318{4319return (dmu_buf_replace_user(db_fake, user, NULL));4320}43214322void *4323dmu_buf_get_user(dmu_buf_t *db_fake)4324{4325dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;43264327dbuf_verify_user(db, DBVU_NOT_EVICTING);4328return (db->db_user);4329}43304331uint64_t4332dmu_buf_user_size(dmu_buf_t *db_fake)4333{4334dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;4335if (db->db_user == NULL)4336return (0);4337return (atomic_load_64(&db->db_user->dbu_size));4338}43394340void4341dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)4342{4343dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;4344ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);4345ASSERT3P(db->db_user, !=, NULL);4346ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);4347atomic_add_64(&db->db_user->dbu_size, nadd);4348}43494350void4351dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)4352{4353dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;4354ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);4355ASSERT3P(db->db_user, !=, NULL);4356ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);4357atomic_sub_64(&db->db_user->dbu_size, nsub);4358}43594360void4361dmu_buf_user_evict_wait(void)4362{4363taskq_wait(dbu_evict_taskq);4364}43654366blkptr_t *4367dmu_buf_get_blkptr(dmu_buf_t *db)4368{4369dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;4370return (dbi->db_blkptr);4371}43724373objset_t *4374dmu_buf_get_objset(dmu_buf_t *db)4375{4376dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;4377return (dbi->db_objset);4378}43794380static void4381dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)4382{4383/* ASSERT(dmu_tx_is_syncing(tx) */4384ASSERT(MUTEX_HELD(&db->db_mtx));43854386if (db->db_blkptr != NULL)4387return;43884389if (db->db_blkid == DMU_SPILL_BLKID) {4390db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);4391BP_ZERO(db->db_blkptr);4392return;4393}4394if (db->db_level == dn->dn_phys->dn_nlevels-1) {4395/*4396* This buffer was allocated at a time when there was4397* no available blkptrs from the dnode, or it was4398* inappropriate to hook it in (i.e., nlevels mismatch).4399*/4400ASSERT(db->db_blkid < 
dn->dn_phys->dn_nblkptr);4401ASSERT0P(db->db_parent);4402db->db_parent = dn->dn_dbuf;4403db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];4404DBUF_VERIFY(db);4405} else {4406dmu_buf_impl_t *parent = db->db_parent;4407int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;44084409ASSERT(dn->dn_phys->dn_nlevels > 1);4410if (parent == NULL) {4411mutex_exit(&db->db_mtx);4412rw_enter(&dn->dn_struct_rwlock, RW_READER);4413parent = dbuf_hold_level(dn, db->db_level + 1,4414db->db_blkid >> epbs, db);4415rw_exit(&dn->dn_struct_rwlock);4416mutex_enter(&db->db_mtx);4417db->db_parent = parent;4418}4419db->db_blkptr = (blkptr_t *)parent->db.db_data +4420(db->db_blkid & ((1ULL << epbs) - 1));4421DBUF_VERIFY(db);4422}4423}44244425static void4426dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)4427{4428dmu_buf_impl_t *db = dr->dr_dbuf;4429void *data = dr->dt.dl.dr_data;44304431ASSERT0(db->db_level);4432ASSERT(MUTEX_HELD(&db->db_mtx));4433ASSERT(db->db_blkid == DMU_BONUS_BLKID);4434ASSERT(data != NULL);44354436dnode_t *dn = dr->dr_dnode;4437ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,4438DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));4439memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));44404441dbuf_sync_leaf_verify_bonus_dnode(dr);44424443dbuf_undirty_bonus(dr);4444dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);4445}44464447/*4448* When syncing out a blocks of dnodes, adjust the block to deal with4449* encryption. Normally, we make sure the block is decrypted before writing4450* it. If we have crypt params, then we are writing a raw (encrypted) block,4451* from a raw receive. In this case, set the ARC buf's crypt params so4452* that the BP will be filled with the correct byteorder, salt, iv, and mac.4453*/4454static void4455dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)4456{4457int err;4458dmu_buf_impl_t *db = dr->dr_dbuf;44594460ASSERT(MUTEX_HELD(&db->db_mtx));4461ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);4462ASSERT0(db->db_level);44634464if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {4465zbookmark_phys_t zb;44664467/*4468* Unfortunately, there is currently no mechanism for4469* syncing context to handle decryption errors. An error4470* here is only possible if an attacker maliciously4471* changed a dnode block and updated the associated4472* checksums going up the block tree.4473*/4474SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),4475db->db.db_object, db->db_level, db->db_blkid);4476err = arc_untransform(db->db_buf, db->db_objset->os_spa,4477&zb, B_TRUE);4478if (err)4479panic("Invalid dnode block MAC");4480} else if (dr->dt.dl.dr_has_raw_params) {4481(void) arc_release(dr->dt.dl.dr_data, db);4482arc_convert_to_raw(dr->dt.dl.dr_data,4483dmu_objset_id(db->db_objset),4484dr->dt.dl.dr_byteorder, DMU_OT_DNODE,4485dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);4486}4487}44884489/*4490* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it4491* is critical the we not allow the compiler to inline this function in to4492* dbuf_sync_list() thereby drastically bloating the stack usage.4493*/4494noinline static void4495dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)4496{4497dmu_buf_impl_t *db = dr->dr_dbuf;4498dnode_t *dn = dr->dr_dnode;44994500ASSERT(dmu_tx_is_syncing(tx));45014502dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);45034504mutex_enter(&db->db_mtx);45054506ASSERT(db->db_level > 0);4507DBUF_VERIFY(db);45084509/* Read the block if it hasn't been read yet. 
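* The block pointers embedded in this indirect will be updated in place
* as its children sync (see dbuf_write_ready()), so its current
* contents must be resident before dbuf_write() is issued for it.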
*/4510if (db->db_buf == NULL) {4511mutex_exit(&db->db_mtx);4512(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);4513mutex_enter(&db->db_mtx);4514}4515ASSERT3U(db->db_state, ==, DB_CACHED);4516ASSERT(db->db_buf != NULL);45174518/* Indirect block size must match what the dnode thinks it is. */4519ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);4520dbuf_check_blkptr(dn, db);45214522/* Provide the pending dirty record to child dbufs */4523db->db_data_pending = dr;45244525mutex_exit(&db->db_mtx);45264527dbuf_write(dr, db->db_buf, tx);45284529zio_t *zio = dr->dr_zio;4530mutex_enter(&dr->dt.di.dr_mtx);4531dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);4532ASSERT(list_head(&dr->dt.di.dr_children) == NULL);4533mutex_exit(&dr->dt.di.dr_mtx);4534zio_nowait(zio);4535}45364537/*4538* Verify that the size of the data in our bonus buffer does not exceed4539* its recorded size.4540*4541* The purpose of this verification is to catch any cases in development4542* where the size of a phys structure (i.e space_map_phys_t) grows and,4543* due to incorrect feature management, older pools expect to read more4544* data even though they didn't actually write it to begin with.4545*4546* For a example, this would catch an error in the feature logic where we4547* open an older pool and we expect to write the space map histogram of4548* a space map with size SPACE_MAP_SIZE_V0.4549*/4550static void4551dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)4552{4553#ifdef ZFS_DEBUG4554dnode_t *dn = dr->dr_dnode;45554556/*4557* Encrypted bonus buffers can have data past their bonuslen.4558* Skip the verification of these blocks.4559*/4560if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))4561return;45624563uint16_t bonuslen = dn->dn_phys->dn_bonuslen;4564uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);4565ASSERT3U(bonuslen, <=, maxbonuslen);45664567arc_buf_t *datap = dr->dt.dl.dr_data;4568char *datap_end = ((char *)datap) + bonuslen;4569char *datap_max = ((char *)datap) + maxbonuslen;45704571/* ensure that everything is zero after our data */4572for (; datap_end < datap_max; datap_end++)4573ASSERT0(*datap_end);4574#endif4575}45764577static blkptr_t *4578dbuf_lightweight_bp(dbuf_dirty_record_t *dr)4579{4580/* This must be a lightweight dirty record. 
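* Lightweight dirty records are created without an associated dbuf
* (dr_dbuf is NULL) and carry their payload in dr->dt.dll.dr_abd, so
* the on-disk block pointer has to be located through the dnode phys
* or the parent indirect dbuf, which is what this helper does.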
*/4581ASSERT0P(dr->dr_dbuf);4582dnode_t *dn = dr->dr_dnode;45834584if (dn->dn_phys->dn_nlevels == 1) {4585VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);4586return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);4587} else {4588dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;4589int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;4590VERIFY3U(parent_db->db_level, ==, 1);4591VERIFY3P(DB_DNODE(parent_db), ==, dn);4592VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);4593blkptr_t *bp = parent_db->db.db_data;4594return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);4595}4596}45974598static void4599dbuf_lightweight_ready(zio_t *zio)4600{4601dbuf_dirty_record_t *dr = zio->io_private;4602blkptr_t *bp = zio->io_bp;46034604if (zio->io_error != 0)4605return;46064607dnode_t *dn = dr->dr_dnode;46084609blkptr_t *bp_orig = dbuf_lightweight_bp(dr);4610spa_t *spa = dmu_objset_spa(dn->dn_objset);4611int64_t delta = bp_get_dsize_sync(spa, bp) -4612bp_get_dsize_sync(spa, bp_orig);4613dnode_diduse_space(dn, delta);46144615uint64_t blkid = dr->dt.dll.dr_blkid;4616mutex_enter(&dn->dn_mtx);4617if (blkid > dn->dn_phys->dn_maxblkid) {4618ASSERT0(dn->dn_objset->os_raw_receive);4619dn->dn_phys->dn_maxblkid = blkid;4620}4621mutex_exit(&dn->dn_mtx);46224623if (!BP_IS_EMBEDDED(bp)) {4624uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;4625BP_SET_FILL(bp, fill);4626}46274628dmu_buf_impl_t *parent_db;4629EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);4630if (dr->dr_parent == NULL) {4631parent_db = dn->dn_dbuf;4632} else {4633parent_db = dr->dr_parent->dr_dbuf;4634}4635rw_enter(&parent_db->db_rwlock, RW_WRITER);4636*bp_orig = *bp;4637rw_exit(&parent_db->db_rwlock);4638}46394640static void4641dbuf_lightweight_done(zio_t *zio)4642{4643dbuf_dirty_record_t *dr = zio->io_private;46444645VERIFY0(zio->io_error);46464647objset_t *os = dr->dr_dnode->dn_objset;4648dmu_tx_t *tx = os->os_synctx;46494650if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {4651ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));4652} else {4653dsl_dataset_t *ds = os->os_dsl_dataset;4654(void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);4655dsl_dataset_block_born(ds, zio->io_bp, tx);4656}46574658dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,4659zio->io_txg);46604661abd_free(dr->dt.dll.dr_abd);4662kmem_free(dr, sizeof (*dr));4663}46644665noinline static void4666dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)4667{4668dnode_t *dn = dr->dr_dnode;4669zio_t *pio;4670if (dn->dn_phys->dn_nlevels == 1) {4671pio = dn->dn_zio;4672} else {4673pio = dr->dr_parent->dr_zio;4674}46754676zbookmark_phys_t zb = {4677.zb_objset = dmu_objset_id(dn->dn_objset),4678.zb_object = dn->dn_object,4679.zb_level = 0,4680.zb_blkid = dr->dt.dll.dr_blkid,4681};46824683/*4684* See comment in dbuf_write(). 
This is so that zio->io_bp_orig
	 * will have the old BP in dbuf_lightweight_done().
	 */
	dr->dr_bp_copy = *dbuf_lightweight_bp(dr);

	dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
	    dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
	    dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
	    &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
	    dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
	    ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);

	zio_nowait(dr->dr_zio);
}

/*
 * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
 * critical that we not allow the compiler to inline this function into
 * dbuf_sync_list() thereby drastically bloating the stack usage.
 */
noinline static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = dr->dr_dnode;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we might have been freed
	 * after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT0P(db->db.db_data);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else if (db->db_state == DB_READ) {
		/*
		 * This buffer was either cloned or had a Direct I/O write
		 * occur and has an in-flight read on the BP. It is safe to
		 * issue the write here, because the read has already been
		 * issued and the contents won't change.
		 *
		 * We can verify the case of both the clone and Direct I/O
		 * write by making sure the first dirty record for the dbuf
		 * has no ARC buffer associated with it.
		 */
		dbuf_dirty_record_t *dr_head =
		    list_head(&db->db_dirty_records);
		ASSERT0P(db->db_buf);
		ASSERT0P(db->db.db_data);
		ASSERT0P(dr_head->dt.dl.dr_data);
		ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
			/*
			 * In the previous transaction group, the bonus buffer
			 * was entirely used to store the attributes for the
			 * dnode which overrode the dn_spill field. However,
			 * when adding more attributes to the file a spill
			 * block was required to hold the extra attributes.
			 *
			 * Make sure to clear the garbage left in the dn_spill
			 * field from the previous attributes in the bonus
			 * buffer. Otherwise, after writing out the spill
			 * block to the newly allocated dva, it will free
			 * the old block pointed to by the invalid dn_spill.
			 */
			db->db_blkptr = NULL;
		}
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dr->dr_dbuf == db);
		dbuf_sync_bonus(dr, tx);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write, wait for the
	 * synchronous IO to complete.
	 *
	 * This is also valid even with Direct I/O writes setting a dirty
	 * record's override state to DR_IN_DMU_SYNC, because all
	 * Direct I/O writes happen in open-context.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
	}

	/*
	 * If this is a dnode block, ensure it is appropriately encrypted
	 * or decrypted, depending on what we are writing to it this txg.
	 */
	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
		dbuf_prepare_encrypted_dnode_leaf(dr);

	if (*datap != NULL && *datap == db->db_buf &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    zfs_refcount_count(&db->db_holds) > 1) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNODE blocks).
		 */
		int psize = arc_buf_size(*datap);
		int lsize = arc_buf_lsize(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		enum zio_compress compress_type = arc_get_compression(*datap);
		uint8_t complevel = arc_get_complevel(*datap);

		if (arc_is_encrypted(*datap)) {
			boolean_t byteorder;
			uint8_t salt[ZIO_DATA_SALT_LEN];
			uint8_t iv[ZIO_DATA_IV_LEN];
			uint8_t mac[ZIO_DATA_MAC_LEN];

			arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
			*datap = arc_alloc_raw_buf(os->os_spa, db,
			    dmu_objset_id(os), byteorder, salt, iv, mac,
			    dn->dn_type, psize, lsize, compress_type,
			    complevel);
		} else if (compress_type != ZIO_COMPRESS_OFF) {
			ASSERT3U(type, ==, ARC_BUFC_DATA);
			*datap = arc_alloc_compressed_buf(os->os_spa, db,
			    psize, lsize, compress_type, complevel);
		} else {
			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
		}
		memcpy((*datap)->b_data, db->db.db_data, psize);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
	} else {
		zio_nowait(dr->dr_zio);
	}
}

/*
 * Syncs out a range of dirty records for indirect or leaf dbufs.
May be4870* called recursively from dbuf_sync_indirect().4871*/4872void4873dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)4874{4875dbuf_dirty_record_t *dr;48764877while ((dr = list_head(list))) {4878if (dr->dr_zio != NULL) {4879/*4880* If we find an already initialized zio then we4881* are processing the meta-dnode, and we have finished.4882* The dbufs for all dnodes are put back on the list4883* during processing, so that we can zio_wait()4884* these IOs after initiating all child IOs.4885*/4886ASSERT3U(dr->dr_dbuf->db.db_object, ==,4887DMU_META_DNODE_OBJECT);4888break;4889}4890list_remove(list, dr);4891if (dr->dr_dbuf == NULL) {4892dbuf_sync_lightweight(dr, tx);4893} else {4894if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&4895dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {4896VERIFY3U(dr->dr_dbuf->db_level, ==, level);4897}4898if (dr->dr_dbuf->db_level > 0)4899dbuf_sync_indirect(dr, tx);4900else4901dbuf_sync_leaf(dr, tx);4902}4903}4904}49054906static void4907dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)4908{4909(void) buf;4910dmu_buf_impl_t *db = vdb;4911dnode_t *dn;4912blkptr_t *bp = zio->io_bp;4913blkptr_t *bp_orig = &zio->io_bp_orig;4914spa_t *spa = zio->io_spa;4915int64_t delta;4916uint64_t fill = 0;4917int i;49184919ASSERT3P(db->db_blkptr, !=, NULL);4920ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);49214922DB_DNODE_ENTER(db);4923dn = DB_DNODE(db);4924delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);4925dnode_diduse_space(dn, delta - zio->io_prev_space_delta);4926zio->io_prev_space_delta = delta;49274928if (BP_GET_BIRTH(bp) != 0) {4929ASSERT((db->db_blkid != DMU_SPILL_BLKID &&4930BP_GET_TYPE(bp) == dn->dn_type) ||4931(db->db_blkid == DMU_SPILL_BLKID &&4932BP_GET_TYPE(bp) == dn->dn_bonustype) ||4933BP_IS_EMBEDDED(bp));4934ASSERT(BP_GET_LEVEL(bp) == db->db_level);4935}49364937mutex_enter(&db->db_mtx);49384939#ifdef ZFS_DEBUG4940if (db->db_blkid == DMU_SPILL_BLKID) {4941ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);4942ASSERT(!(BP_IS_HOLE(bp)) &&4943db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));4944}4945#endif49464947if (db->db_level == 0) {4948mutex_enter(&dn->dn_mtx);4949if (db->db_blkid > dn->dn_phys->dn_maxblkid &&4950db->db_blkid != DMU_SPILL_BLKID) {4951ASSERT0(db->db_objset->os_raw_receive);4952dn->dn_phys->dn_maxblkid = db->db_blkid;4953}4954mutex_exit(&dn->dn_mtx);49554956if (dn->dn_type == DMU_OT_DNODE) {4957i = 0;4958while (i < db->db.db_size) {4959dnode_phys_t *dnp =4960(void *)(((char *)db->db.db_data) + i);49614962i += DNODE_MIN_SIZE;4963if (dnp->dn_type != DMU_OT_NONE) {4964fill++;4965for (int j = 0; j < dnp->dn_nblkptr;4966j++) {4967(void) zfs_blkptr_verify(spa,4968&dnp->dn_blkptr[j],4969BLK_CONFIG_SKIP,4970BLK_VERIFY_HALT);4971}4972if (dnp->dn_flags &4973DNODE_FLAG_SPILL_BLKPTR) {4974(void) zfs_blkptr_verify(spa,4975DN_SPILL_BLKPTR(dnp),4976BLK_CONFIG_SKIP,4977BLK_VERIFY_HALT);4978}4979i += dnp->dn_extra_slots *4980DNODE_MIN_SIZE;4981}4982}4983} else {4984if (BP_IS_HOLE(bp)) {4985fill = 0;4986} else {4987fill = 1;4988}4989}4990} else {4991blkptr_t *ibp = db->db.db_data;4992ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);4993for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {4994if (BP_IS_HOLE(ibp))4995continue;4996(void) zfs_blkptr_verify(spa, ibp,4997BLK_CONFIG_SKIP, BLK_VERIFY_HALT);4998fill += BP_GET_FILL(ibp);4999}5000}5001DB_DNODE_EXIT(db);50025003if (!BP_IS_EMBEDDED(bp))5004BP_SET_FILL(bp, fill);50055006mutex_exit(&db->db_mtx);50075008db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, 
FTAG);5009*db->db_blkptr = *bp;5010dmu_buf_unlock_parent(db, dblt, FTAG);5011}50125013/*5014* This function gets called just prior to running through the compression5015* stage of the zio pipeline. If we're an indirect block comprised of only5016* holes, then we want this indirect to be compressed away to a hole. In5017* order to do that we must zero out any information about the holes that5018* this indirect points to prior to before we try to compress it.5019*/5020static void5021dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)5022{5023(void) zio, (void) buf;5024dmu_buf_impl_t *db = vdb;5025blkptr_t *bp;5026unsigned int epbs, i;50275028ASSERT3U(db->db_level, >, 0);5029DB_DNODE_ENTER(db);5030epbs = DB_DNODE(db)->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;5031DB_DNODE_EXIT(db);5032ASSERT3U(epbs, <, 31);50335034/* Determine if all our children are holes */5035for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {5036if (!BP_IS_HOLE(bp))5037break;5038}50395040/*5041* If all the children are holes, then zero them all out so that5042* we may get compressed away.5043*/5044if (i == 1ULL << epbs) {5045/*5046* We only found holes. Grab the rwlock to prevent5047* anybody from reading the blocks we're about to5048* zero out.5049*/5050rw_enter(&db->db_rwlock, RW_WRITER);5051memset(db->db.db_data, 0, db->db.db_size);5052rw_exit(&db->db_rwlock);5053}5054}50555056static void5057dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)5058{5059(void) buf;5060dmu_buf_impl_t *db = vdb;5061blkptr_t *bp_orig = &zio->io_bp_orig;5062blkptr_t *bp = db->db_blkptr;5063objset_t *os = db->db_objset;5064dmu_tx_t *tx = os->os_synctx;50655066ASSERT0(zio->io_error);5067ASSERT(db->db_blkptr == bp);50685069/*5070* For nopwrites and rewrites we ensure that the bp matches our5071* original and bypass all the accounting.5072*/5073if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {5074ASSERT(BP_EQUAL(bp, bp_orig));5075} else {5076dsl_dataset_t *ds = os->os_dsl_dataset;5077(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);5078dsl_dataset_block_born(ds, bp, tx);5079}50805081mutex_enter(&db->db_mtx);50825083DBUF_VERIFY(db);50845085dbuf_dirty_record_t *dr = db->db_data_pending;5086dnode_t *dn = dr->dr_dnode;5087ASSERT(!list_link_active(&dr->dr_dirty_node));5088ASSERT(dr->dr_dbuf == db);5089ASSERT(list_next(&db->db_dirty_records, dr) == NULL);5090list_remove(&db->db_dirty_records, dr);50915092#ifdef ZFS_DEBUG5093if (db->db_blkid == DMU_SPILL_BLKID) {5094ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);5095ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&5096db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));5097}5098#endif50995100if (db->db_level == 0) {5101ASSERT(db->db_blkid != DMU_BONUS_BLKID);5102ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);51035104/* no dr_data if this is a NO_FILL or Direct I/O */5105if (dr->dt.dl.dr_data != NULL &&5106dr->dt.dl.dr_data != db->db_buf) {5107ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);5108ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);5109arc_buf_destroy(dr->dt.dl.dr_data, db);5110}5111} else {5112ASSERT(list_head(&dr->dt.di.dr_children) == NULL);5113ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);5114if (!BP_IS_HOLE(db->db_blkptr)) {5115int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -5116SPA_BLKPTRSHIFT;5117ASSERT3U(db->db_blkid, <=,5118dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));5119ASSERT3U(BP_GET_LSIZE(db->db_blkptr), 
==,5120db->db.db_size);5121}5122mutex_destroy(&dr->dt.di.dr_mtx);5123list_destroy(&dr->dt.di.dr_children);5124}51255126cv_broadcast(&db->db_changed);5127ASSERT(db->db_dirtycnt > 0);5128db->db_dirtycnt -= 1;5129db->db_data_pending = NULL;5130dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);51315132dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,5133zio->io_txg);51345135kmem_cache_free(dbuf_dirty_kmem_cache, dr);5136}51375138static void5139dbuf_write_nofill_ready(zio_t *zio)5140{5141dbuf_write_ready(zio, NULL, zio->io_private);5142}51435144static void5145dbuf_write_nofill_done(zio_t *zio)5146{5147dbuf_write_done(zio, NULL, zio->io_private);5148}51495150static void5151dbuf_write_override_ready(zio_t *zio)5152{5153dbuf_dirty_record_t *dr = zio->io_private;5154dmu_buf_impl_t *db = dr->dr_dbuf;51555156dbuf_write_ready(zio, NULL, db);5157}51585159static void5160dbuf_write_override_done(zio_t *zio)5161{5162dbuf_dirty_record_t *dr = zio->io_private;5163dmu_buf_impl_t *db = dr->dr_dbuf;5164blkptr_t *obp = &dr->dt.dl.dr_overridden_by;51655166mutex_enter(&db->db_mtx);5167if (!BP_EQUAL(zio->io_bp, obp)) {5168if (!BP_IS_HOLE(obp))5169dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);5170arc_release(dr->dt.dl.dr_data, db);5171}5172mutex_exit(&db->db_mtx);51735174dbuf_write_done(zio, NULL, db);51755176if (zio->io_abd != NULL)5177abd_free(zio->io_abd);5178}51795180typedef struct dbuf_remap_impl_callback_arg {5181objset_t *drica_os;5182uint64_t drica_blk_birth;5183dmu_tx_t *drica_tx;5184} dbuf_remap_impl_callback_arg_t;51855186static void5187dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,5188void *arg)5189{5190dbuf_remap_impl_callback_arg_t *drica = arg;5191objset_t *os = drica->drica_os;5192spa_t *spa = dmu_objset_spa(os);5193dmu_tx_t *tx = drica->drica_tx;51945195ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));51965197if (os == spa_meta_objset(spa)) {5198spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);5199} else {5200dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,5201size, drica->drica_blk_birth, tx);5202}5203}52045205static void5206dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)5207{5208blkptr_t bp_copy = *bp;5209spa_t *spa = dmu_objset_spa(dn->dn_objset);5210dbuf_remap_impl_callback_arg_t drica;52115212ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));52135214drica.drica_os = dn->dn_objset;5215drica.drica_blk_birth = BP_GET_BIRTH(bp);5216drica.drica_tx = tx;5217if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,5218&drica)) {5219/*5220* If the blkptr being remapped is tracked by a livelist,5221* then we need to make sure the livelist reflects the update.5222* First, cancel out the old blkptr by appending a 'FREE'5223* entry. Next, add an 'ALLOC' to track the new version. This5224* way we avoid trying to free an inaccurate blkptr at delete.5225* Note that embedded blkptrs are not tracked in livelists.5226*/5227if (dn->dn_objset != spa_meta_objset(spa)) {5228dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);5229if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&5230BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {5231ASSERT(!BP_IS_EMBEDDED(bp));5232ASSERT(dsl_dir_is_clone(ds->ds_dir));5233ASSERT(spa_feature_is_enabled(spa,5234SPA_FEATURE_LIVELIST));5235bplist_append(&ds->ds_dir->dd_pending_frees,5236bp);5237bplist_append(&ds->ds_dir->dd_pending_allocs,5238&bp_copy);5239}5240}52415242/*5243* The db_rwlock prevents dbuf_read_impl() from5244* dereferencing the BP while we are changing it. 
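* A reader that takes db_rwlock as RW_READER therefore sees either the
* old or the new BP, never a partially updated one.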
To5245* avoid lock contention, only grab it when we are actually5246* changing the BP.5247*/5248if (rw != NULL)5249rw_enter(rw, RW_WRITER);5250*bp = bp_copy;5251if (rw != NULL)5252rw_exit(rw);5253}5254}52555256/*5257* Remap any existing BP's to concrete vdevs, if possible.5258*/5259static void5260dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)5261{5262spa_t *spa = dmu_objset_spa(db->db_objset);5263ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));52645265if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))5266return;52675268if (db->db_level > 0) {5269blkptr_t *bp = db->db.db_data;5270for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {5271dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);5272}5273} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {5274dnode_phys_t *dnp = db->db.db_data;5275ASSERT3U(dn->dn_type, ==, DMU_OT_DNODE);5276for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;5277i += dnp[i].dn_extra_slots + 1) {5278for (int j = 0; j < dnp[i].dn_nblkptr; j++) {5279krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :5280&dn->dn_dbuf->db_rwlock);5281dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,5282tx);5283}5284}5285}5286}528752885289/*5290* Populate dr->dr_zio with a zio to commit a dirty buffer to disk.5291* Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).5292*/5293static void5294dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)5295{5296dmu_buf_impl_t *db = dr->dr_dbuf;5297dnode_t *dn = dr->dr_dnode;5298objset_t *os;5299dmu_buf_impl_t *parent = db->db_parent;5300uint64_t txg = tx->tx_txg;5301zbookmark_phys_t zb;5302zio_prop_t zp;5303zio_t *pio; /* parent I/O */5304int wp_flag = 0;53055306ASSERT(dmu_tx_is_syncing(tx));53075308os = dn->dn_objset;53095310if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {5311/*5312* Private object buffers are released here rather than in5313* dbuf_dirty() since they are only modified in the syncing5314* context and we don't want the overhead of making multiple5315* copies of the data.5316*/5317if (BP_IS_HOLE(db->db_blkptr))5318arc_buf_thaw(data);5319else5320dbuf_release_bp(db);5321dbuf_remap(dn, db, tx);5322}53235324if (parent != dn->dn_dbuf) {5325/* Our parent is an indirect block. */5326/* We have a dirty parent that has been scheduled for write. */5327ASSERT(parent && parent->db_data_pending);5328/* Our parent's buffer is one level closer to the dnode. */5329ASSERT(db->db_level == parent->db_level-1);5330/*5331* We're about to modify our parent's db_data by modifying5332* our block pointer, so the parent must be released.5333*/5334ASSERT(arc_released(parent->db_buf));5335pio = parent->db_data_pending->dr_zio;5336} else {5337/* Our parent is the dnode itself. */5338ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&5339db->db_blkid != DMU_SPILL_BLKID) ||5340(db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));5341if (db->db_blkid != DMU_SPILL_BLKID)5342ASSERT3P(db->db_blkptr, ==,5343&dn->dn_phys->dn_blkptr[db->db_blkid]);5344pio = dn->dn_zio;5345}53465347ASSERT(db->db_level == 0 || data == db->db_buf);5348ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg);5349ASSERT(pio);53505351SET_BOOKMARK(&zb, os->os_dsl_dataset ?5352os->os_dsl_dataset->ds_object : DMU_META_OBJSET,5353db->db.db_object, db->db_level, db->db_blkid);53545355if (db->db_blkid == DMU_SPILL_BLKID)5356wp_flag = WP_SPILL;5357wp_flag |= (data == NULL) ? 
WP_NOFILL : 0;53585359dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);53605361/*5362* Set rewrite properties for zfs_rewrite() operations.5363*/5364if (db->db_level == 0 && dr->dt.dl.dr_rewrite) {5365zp.zp_rewrite = B_TRUE;53665367/*5368* Mark physical rewrite feature for activation.5369* This will be activated automatically during dataset sync.5370*/5371dsl_dataset_t *ds = os->os_dsl_dataset;5372if (!dsl_dataset_feature_is_active(ds,5373SPA_FEATURE_PHYSICAL_REWRITE)) {5374ds->ds_feature_activation[5375SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE;5376}5377}53785379/*5380* We copy the blkptr now (rather than when we instantiate the dirty5381* record), because its value can change between open context and5382* syncing context. We do not need to hold dn_struct_rwlock to read5383* db_blkptr because we are in syncing context.5384*/5385dr->dr_bp_copy = *db->db_blkptr;53865387if (db->db_level == 0 &&5388dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {5389/*5390* The BP for this block has been provided by open context5391* (by dmu_sync(), dmu_write_direct(),5392* or dmu_buf_write_embedded()).5393*/5394abd_t *contents = (data != NULL) ?5395abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;53965397dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,5398contents, db->db.db_size, db->db.db_size, &zp,5399dbuf_write_override_ready, NULL,5400dbuf_write_override_done,5401dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);5402mutex_enter(&db->db_mtx);5403dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;5404zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,5405dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,5406dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);5407mutex_exit(&db->db_mtx);5408} else if (data == NULL) {5409ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||5410zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);5411dr->dr_zio = zio_write(pio, os->os_spa, txg,5412&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,5413dbuf_write_nofill_ready, NULL,5414dbuf_write_nofill_done, db,5415ZIO_PRIORITY_ASYNC_WRITE,5416ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);5417} else {5418ASSERT(arc_released(data));54195420/*5421* For indirect blocks, we want to setup the children5422* ready callback so that we can properly handle an indirect5423* block that only contains holes.5424*/5425arc_write_done_func_t *children_ready_cb = NULL;5426if (db->db_level != 0)5427children_ready_cb = dbuf_write_children_ready;54285429dr->dr_zio = arc_write(pio, os->os_spa, txg,5430&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),5431dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,5432children_ready_cb, dbuf_write_done, db,5433ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, 
&zb);5434}5435}54365437EXPORT_SYMBOL(dbuf_find);5438EXPORT_SYMBOL(dbuf_is_metadata);5439EXPORT_SYMBOL(dbuf_destroy);5440EXPORT_SYMBOL(dbuf_whichblock);5441EXPORT_SYMBOL(dbuf_read);5442EXPORT_SYMBOL(dbuf_unoverride);5443EXPORT_SYMBOL(dbuf_free_range);5444EXPORT_SYMBOL(dbuf_new_size);5445EXPORT_SYMBOL(dbuf_release_bp);5446EXPORT_SYMBOL(dbuf_dirty);5447EXPORT_SYMBOL(dmu_buf_set_crypt_params);5448EXPORT_SYMBOL(dmu_buf_will_dirty);5449EXPORT_SYMBOL(dmu_buf_will_rewrite);5450EXPORT_SYMBOL(dmu_buf_is_dirty);5451EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);5452EXPORT_SYMBOL(dmu_buf_will_not_fill);5453EXPORT_SYMBOL(dmu_buf_will_fill);5454EXPORT_SYMBOL(dmu_buf_fill_done);5455EXPORT_SYMBOL(dmu_buf_rele);5456EXPORT_SYMBOL(dbuf_assign_arcbuf);5457EXPORT_SYMBOL(dbuf_prefetch);5458EXPORT_SYMBOL(dbuf_hold_impl);5459EXPORT_SYMBOL(dbuf_hold);5460EXPORT_SYMBOL(dbuf_hold_level);5461EXPORT_SYMBOL(dbuf_create_bonus);5462EXPORT_SYMBOL(dbuf_spill_set_blksz);5463EXPORT_SYMBOL(dbuf_rm_spill);5464EXPORT_SYMBOL(dbuf_add_ref);5465EXPORT_SYMBOL(dbuf_rele);5466EXPORT_SYMBOL(dbuf_rele_and_unlock);5467EXPORT_SYMBOL(dbuf_refcount);5468EXPORT_SYMBOL(dbuf_sync_list);5469EXPORT_SYMBOL(dmu_buf_set_user);5470EXPORT_SYMBOL(dmu_buf_set_user_ie);5471EXPORT_SYMBOL(dmu_buf_get_user);5472EXPORT_SYMBOL(dmu_buf_get_blkptr);54735474ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,5475"Maximum size in bytes of the dbuf cache.");54765477ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,5478"Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");54795480ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,5481"Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");54825483ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,5484"Maximum size in bytes of dbuf metadata cache.");54855486ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,5487"Set size of dbuf cache to log2 fraction of arc size.");54885489ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,5490"Set size of dbuf metadata cache to log2 fraction of arc size.");54915492ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,5493"Set size of dbuf cache mutex array as log2 shift.");549454955496
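/*
 * Observability note (illustrative): the dbuf_stats kstat defined at the
 * top of this file is typically exposed on Linux as
 * /proc/spl/kstat/zfs/dbufstats, and the module parameters declared above
 * appear under /sys/module/zfs/parameters/, e.g.:
 *
 *	cat /proc/spl/kstat/zfs/dbufstats
 *	echo 134217728 > /sys/module/zfs/parameters/dbuf_cache_max_bytes
 *
 * Exact paths and parameter names can vary by platform and packaging.
 */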