Path: blob/main/sys/contrib/openzfs/module/zfs/dbuf.c
48383 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.23* Copyright 2011 Nexenta Systems, Inc. All rights reserved.24* Copyright (c) 2012, 2020 by Delphix. All rights reserved.25* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.26* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.27* Copyright (c) 2019, Klara Inc.28* Copyright (c) 2019, Allan Jude29* Copyright (c) 2021, 2022 by Pawel Jakub Dawidek30*/3132#include <sys/zfs_context.h>33#include <sys/arc.h>34#include <sys/dmu.h>35#include <sys/dmu_send.h>36#include <sys/dmu_impl.h>37#include <sys/dbuf.h>38#include <sys/dmu_objset.h>39#include <sys/dsl_dataset.h>40#include <sys/dsl_dir.h>41#include <sys/dmu_tx.h>42#include <sys/spa.h>43#include <sys/zio.h>44#include <sys/dmu_zfetch.h>45#include <sys/sa.h>46#include <sys/sa_impl.h>47#include <sys/zfeature.h>48#include <sys/blkptr.h>49#include <sys/range_tree.h>50#include <sys/trace_zfs.h>51#include <sys/callb.h>52#include <sys/abd.h>53#include <sys/brt.h>54#include <sys/vdev.h>55#include <cityhash.h>56#include <sys/spa_impl.h>57#include <sys/wmsum.h>58#include <sys/vdev_impl.h>5960static kstat_t *dbuf_ksp;6162typedef struct dbuf_stats {63/*64* Various statistics about the size of the dbuf cache.65*/66kstat_named_t cache_count;67kstat_named_t cache_size_bytes;68kstat_named_t cache_size_bytes_max;69/*70* Statistics regarding the bounds on the dbuf cache size.71*/72kstat_named_t cache_target_bytes;73kstat_named_t cache_lowater_bytes;74kstat_named_t cache_hiwater_bytes;75/*76* Total number of dbuf cache evictions that have occurred.77*/78kstat_named_t cache_total_evicts;79/*80* The distribution of dbuf levels in the dbuf cache and81* the total size of all dbufs at each level.82*/83kstat_named_t cache_levels[DN_MAX_LEVELS];84kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];85/*86* Statistics about the dbuf hash table.87*/88kstat_named_t hash_hits;89kstat_named_t hash_misses;90kstat_named_t hash_collisions;91kstat_named_t hash_elements;92/*93* Number of sublists containing more than one dbuf in the dbuf94* hash table. 
Keep track of the longest hash chain.95*/96kstat_named_t hash_chains;97kstat_named_t hash_chain_max;98/*99* Number of times a dbuf_create() discovers that a dbuf was100* already created and in the dbuf hash table.101*/102kstat_named_t hash_insert_race;103/*104* Number of entries in the hash table dbuf and mutex arrays.105*/106kstat_named_t hash_table_count;107kstat_named_t hash_mutex_count;108/*109* Statistics about the size of the metadata dbuf cache.110*/111kstat_named_t metadata_cache_count;112kstat_named_t metadata_cache_size_bytes;113kstat_named_t metadata_cache_size_bytes_max;114/*115* For diagnostic purposes, this is incremented whenever we can't add116* something to the metadata cache because it's full, and instead put117* the data in the regular dbuf cache.118*/119kstat_named_t metadata_cache_overflow;120} dbuf_stats_t;121122dbuf_stats_t dbuf_stats = {123{ "cache_count", KSTAT_DATA_UINT64 },124{ "cache_size_bytes", KSTAT_DATA_UINT64 },125{ "cache_size_bytes_max", KSTAT_DATA_UINT64 },126{ "cache_target_bytes", KSTAT_DATA_UINT64 },127{ "cache_lowater_bytes", KSTAT_DATA_UINT64 },128{ "cache_hiwater_bytes", KSTAT_DATA_UINT64 },129{ "cache_total_evicts", KSTAT_DATA_UINT64 },130{ { "cache_levels_N", KSTAT_DATA_UINT64 } },131{ { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },132{ "hash_hits", KSTAT_DATA_UINT64 },133{ "hash_misses", KSTAT_DATA_UINT64 },134{ "hash_collisions", KSTAT_DATA_UINT64 },135{ "hash_elements", KSTAT_DATA_UINT64 },136{ "hash_chains", KSTAT_DATA_UINT64 },137{ "hash_chain_max", KSTAT_DATA_UINT64 },138{ "hash_insert_race", KSTAT_DATA_UINT64 },139{ "hash_table_count", KSTAT_DATA_UINT64 },140{ "hash_mutex_count", KSTAT_DATA_UINT64 },141{ "metadata_cache_count", KSTAT_DATA_UINT64 },142{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },143{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },144{ "metadata_cache_overflow", KSTAT_DATA_UINT64 }145};146147struct {148wmsum_t cache_count;149wmsum_t cache_total_evicts;150wmsum_t cache_levels[DN_MAX_LEVELS];151wmsum_t cache_levels_bytes[DN_MAX_LEVELS];152wmsum_t hash_hits;153wmsum_t hash_misses;154wmsum_t hash_collisions;155wmsum_t hash_elements;156wmsum_t hash_chains;157wmsum_t hash_insert_race;158wmsum_t metadata_cache_count;159wmsum_t metadata_cache_overflow;160} dbuf_sums;161162#define DBUF_STAT_INCR(stat, val) \163wmsum_add(&dbuf_sums.stat, val)164#define DBUF_STAT_DECR(stat, val) \165DBUF_STAT_INCR(stat, -(val))166#define DBUF_STAT_BUMP(stat) \167DBUF_STAT_INCR(stat, 1)168#define DBUF_STAT_BUMPDOWN(stat) \169DBUF_STAT_INCR(stat, -1)170#define DBUF_STAT_MAX(stat, v) { \171uint64_t _m; \172while ((v) > (_m = dbuf_stats.stat.value.ui64) && \173(_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\174continue; \175}176177static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);178static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);179180/*181* Global data structures and functions for the dbuf cache.182*/183static kmem_cache_t *dbuf_kmem_cache;184kmem_cache_t *dbuf_dirty_kmem_cache;185static taskq_t *dbu_evict_taskq;186187static kthread_t *dbuf_cache_evict_thread;188static kmutex_t dbuf_evict_lock;189static kcondvar_t dbuf_evict_cv;190static boolean_t dbuf_evict_thread_exit;191192/*193* There are two dbuf caches; each dbuf can only be in one of them at a time.194*195* 1. Cache of metadata dbufs, to help make read-heavy administrative commands196* from /sbin/zfs run faster. 
The "metadata cache" specifically stores dbufs197* that represent the metadata that describes filesystems/snapshots/198* bookmarks/properties/etc. We only evict from this cache when we export a199* pool, to short-circuit as much I/O as possible for all administrative200* commands that need the metadata. There is no eviction policy for this201* cache, because we try to only include types in it which would occupy a202* very small amount of space per object but create a large impact on the203* performance of these commands. Instead, after it reaches a maximum size204* (which should only happen on very small memory systems with a very large205* number of filesystem objects), we stop taking new dbufs into the206* metadata cache, instead putting them in the normal dbuf cache.207*208* 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that209* are not currently held but have been recently released. These dbufs210* are not eligible for arc eviction until they are aged out of the cache.211* Dbufs that are aged out of the cache will be immediately destroyed and212* become eligible for arc eviction.213*214* Dbufs are added to these caches once the last hold is released. If a dbuf is215* later accessed and still exists in the dbuf cache, then it will be removed216* from the cache and later re-added to the head of the cache.217*218* If a given dbuf meets the requirements for the metadata cache, it will go219* there, otherwise it will be considered for the generic LRU dbuf cache. The220* caches and the refcounts tracking their sizes are stored in an array indexed221* by those caches' matching enum values (from dbuf_cached_state_t).222*/223typedef struct dbuf_cache {224multilist_t cache;225zfs_refcount_t size ____cacheline_aligned;226} dbuf_cache_t;227dbuf_cache_t dbuf_caches[DB_CACHE_MAX];228229/* Size limits for the caches */230static uint64_t dbuf_cache_max_bytes = UINT64_MAX;231static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;232233/* Set the default sizes of the caches to log2 fraction of arc size */234static uint_t dbuf_cache_shift = 5;235static uint_t dbuf_metadata_cache_shift = 6;236237/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */238static uint_t dbuf_mutex_cache_shift = 0;239240static unsigned long dbuf_cache_target_bytes(void);241static unsigned long dbuf_metadata_cache_target_bytes(void);242243/*244* The LRU dbuf cache uses a three-stage eviction policy:245* - A low water marker designates when the dbuf eviction thread246* should stop evicting from the dbuf cache.247* - When we reach the maximum size (aka mid water mark), we248* signal the eviction thread to run.249* - The high water mark indicates when the eviction thread250* is unable to keep up with the incoming load and eviction must251* happen in the context of the calling thread.252*253* The dbuf cache:254* (max size)255* low water mid water hi water256* +----------------------------------------+----------+----------+257* | | | |258* | | | |259* | | | |260* | | | |261* +----------------------------------------+----------+----------+262* stop signal evict263* evicting eviction directly264* thread265*266* The high and low water marks indicate the operating range for the eviction267* thread. The low water mark is, by default, 90% of the total size of the268* cache and the high water mark is at 110% (both of these percentages can be269* changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,270* respectively). 
The eviction thread will try to ensure that the cache remains271* within this range by waking up every second and checking if the cache is272* above the low water mark. The thread can also be woken up by callers adding273* elements into the cache if the cache is larger than the mid water (i.e max274* cache size). Once the eviction thread is woken up and eviction is required,275* it will continue evicting buffers until it's able to reduce the cache size276* to the low water mark. If the cache size continues to grow and hits the high277* water mark, then callers adding elements to the cache will begin to evict278* directly from the cache until the cache is no longer above the high water279* mark.280*/281282/*283* The percentage above and below the maximum cache size.284*/285static uint_t dbuf_cache_hiwater_pct = 10;286static uint_t dbuf_cache_lowater_pct = 10;287288static int289dbuf_cons(void *vdb, void *unused, int kmflag)290{291(void) unused, (void) kmflag;292dmu_buf_impl_t *db = vdb;293memset(db, 0, sizeof (dmu_buf_impl_t));294295mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL);296rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL);297cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);298multilist_link_init(&db->db_cache_link);299zfs_refcount_create(&db->db_holds);300301return (0);302}303304static void305dbuf_dest(void *vdb, void *unused)306{307(void) unused;308dmu_buf_impl_t *db = vdb;309mutex_destroy(&db->db_mtx);310rw_destroy(&db->db_rwlock);311cv_destroy(&db->db_changed);312ASSERT(!multilist_link_active(&db->db_cache_link));313zfs_refcount_destroy(&db->db_holds);314}315316/*317* dbuf hash table routines318*/319static dbuf_hash_table_t dbuf_hash_table;320321/*322* We use Cityhash for this. It's fast, and has good hash properties without323* requiring any large static buffers.324*/325static uint64_t326dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)327{328return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));329}330331#define DTRACE_SET_STATE(db, why) \332DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \333const char *, why)334335#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \336((dbuf)->db.db_object == (obj) && \337(dbuf)->db_objset == (os) && \338(dbuf)->db_level == (level) && \339(dbuf)->db_blkid == (blkid))340341dmu_buf_impl_t *342dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,343uint64_t *hash_out)344{345dbuf_hash_table_t *h = &dbuf_hash_table;346uint64_t hv;347uint64_t idx;348dmu_buf_impl_t *db;349350hv = dbuf_hash(os, obj, level, blkid);351idx = hv & h->hash_table_mask;352353mutex_enter(DBUF_HASH_MUTEX(h, idx));354for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {355if (DBUF_EQUAL(db, os, obj, level, blkid)) {356mutex_enter(&db->db_mtx);357if (db->db_state != DB_EVICTING) {358mutex_exit(DBUF_HASH_MUTEX(h, idx));359return (db);360}361mutex_exit(&db->db_mtx);362}363}364mutex_exit(DBUF_HASH_MUTEX(h, idx));365if (hash_out != NULL)366*hash_out = hv;367return (NULL);368}369370static dmu_buf_impl_t *371dbuf_find_bonus(objset_t *os, uint64_t object)372{373dnode_t *dn;374dmu_buf_impl_t *db = NULL;375376if (dnode_hold(os, object, FTAG, &dn) == 0) {377rw_enter(&dn->dn_struct_rwlock, RW_READER);378if (dn->dn_bonus != NULL) {379db = dn->dn_bonus;380mutex_enter(&db->db_mtx);381}382rw_exit(&dn->dn_struct_rwlock);383dnode_rele(dn, FTAG);384}385return (db);386}387388/*389* Insert an entry into the hash table. 
If there is already an element390* equal to elem in the hash table, then the already existing element391* will be returned and the new element will not be inserted.392* Otherwise returns NULL.393*/394static dmu_buf_impl_t *395dbuf_hash_insert(dmu_buf_impl_t *db)396{397dbuf_hash_table_t *h = &dbuf_hash_table;398objset_t *os = db->db_objset;399uint64_t obj = db->db.db_object;400int level = db->db_level;401uint64_t blkid, idx;402dmu_buf_impl_t *dbf;403uint32_t i;404405blkid = db->db_blkid;406ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);407idx = db->db_hash & h->hash_table_mask;408409mutex_enter(DBUF_HASH_MUTEX(h, idx));410for (dbf = h->hash_table[idx], i = 0; dbf != NULL;411dbf = dbf->db_hash_next, i++) {412if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {413mutex_enter(&dbf->db_mtx);414if (dbf->db_state != DB_EVICTING) {415mutex_exit(DBUF_HASH_MUTEX(h, idx));416return (dbf);417}418mutex_exit(&dbf->db_mtx);419}420}421422if (i > 0) {423DBUF_STAT_BUMP(hash_collisions);424if (i == 1)425DBUF_STAT_BUMP(hash_chains);426427DBUF_STAT_MAX(hash_chain_max, i);428}429430mutex_enter(&db->db_mtx);431db->db_hash_next = h->hash_table[idx];432h->hash_table[idx] = db;433mutex_exit(DBUF_HASH_MUTEX(h, idx));434DBUF_STAT_BUMP(hash_elements);435436return (NULL);437}438439/*440* This returns whether this dbuf should be stored in the metadata cache, which441* is based on whether it's from one of the dnode types that store data related442* to traversing dataset hierarchies.443*/444static boolean_t445dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)446{447DB_DNODE_ENTER(db);448dmu_object_type_t type = DB_DNODE(db)->dn_type;449DB_DNODE_EXIT(db);450451/* Check if this dbuf is one of the types we care about */452if (DMU_OT_IS_METADATA_CACHED(type)) {453/* If we hit this, then we set something up wrong in dmu_ot */454ASSERT(DMU_OT_IS_METADATA(type));455456/*457* Sanity check for small-memory systems: don't allocate too458* much memory for this purpose.459*/460if (zfs_refcount_count(461&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >462dbuf_metadata_cache_target_bytes()) {463DBUF_STAT_BUMP(metadata_cache_overflow);464return (B_FALSE);465}466467return (B_TRUE);468}469470return (B_FALSE);471}472473/*474* Remove an entry from the hash table. It must be in the EVICTING state.475*/476static void477dbuf_hash_remove(dmu_buf_impl_t *db)478{479dbuf_hash_table_t *h = &dbuf_hash_table;480uint64_t idx;481dmu_buf_impl_t *dbf, **dbp;482483ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,484db->db_blkid), ==, db->db_hash);485idx = db->db_hash & h->hash_table_mask;486487/*488* We mustn't hold db_mtx to maintain lock ordering:489* DBUF_HASH_MUTEX > db_mtx.490*/491ASSERT(zfs_refcount_is_zero(&db->db_holds));492ASSERT(db->db_state == DB_EVICTING);493ASSERT(!MUTEX_HELD(&db->db_mtx));494495mutex_enter(DBUF_HASH_MUTEX(h, idx));496dbp = &h->hash_table[idx];497while ((dbf = *dbp) != db) {498dbp = &dbf->db_hash_next;499ASSERT(dbf != NULL);500}501*dbp = db->db_hash_next;502db->db_hash_next = NULL;503if (h->hash_table[idx] &&504h->hash_table[idx]->db_hash_next == NULL)505DBUF_STAT_BUMPDOWN(hash_chains);506mutex_exit(DBUF_HASH_MUTEX(h, idx));507DBUF_STAT_BUMPDOWN(hash_elements);508}509510typedef enum {511DBVU_EVICTING,512DBVU_NOT_EVICTING513} dbvu_verify_type_t;514515static void516dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)517{518#ifdef ZFS_DEBUG519int64_t holds;520521if (db->db_user == NULL)522return;523524/* Only data blocks support the attachment of user data. 
*/525ASSERT0(db->db_level);526527/* Clients must resolve a dbuf before attaching user data. */528ASSERT(db->db.db_data != NULL);529ASSERT3U(db->db_state, ==, DB_CACHED);530531holds = zfs_refcount_count(&db->db_holds);532if (verify_type == DBVU_EVICTING) {533/*534* Immediate eviction occurs when holds == dirtycnt.535* For normal eviction buffers, holds is zero on536* eviction, except when dbuf_fix_old_data() calls537* dbuf_clear_data(). However, the hold count can grow538* during eviction even though db_mtx is held (see539* dmu_bonus_hold() for an example), so we can only540* test the generic invariant that holds >= dirtycnt.541*/542ASSERT3U(holds, >=, db->db_dirtycnt);543} else {544if (db->db_user_immediate_evict == TRUE)545ASSERT3U(holds, >=, db->db_dirtycnt);546else547ASSERT3U(holds, >, 0);548}549#endif550}551552static void553dbuf_evict_user(dmu_buf_impl_t *db)554{555dmu_buf_user_t *dbu = db->db_user;556557ASSERT(MUTEX_HELD(&db->db_mtx));558559if (dbu == NULL)560return;561562dbuf_verify_user(db, DBVU_EVICTING);563db->db_user = NULL;564565#ifdef ZFS_DEBUG566if (dbu->dbu_clear_on_evict_dbufp != NULL)567*dbu->dbu_clear_on_evict_dbufp = NULL;568#endif569570if (db->db_caching_status != DB_NO_CACHE) {571/*572* This is a cached dbuf, so the size of the user data is573* included in its cached amount. We adjust it here because the574* user data has already been detached from the dbuf, and the575* sync functions are not supposed to touch it (the dbuf might576* not exist anymore by the time the sync functions run.577*/578uint64_t size = dbu->dbu_size;579(void) zfs_refcount_remove_many(580&dbuf_caches[db->db_caching_status].size, size, dbu);581if (db->db_caching_status == DB_DBUF_CACHE)582DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);583}584585/*586* There are two eviction callbacks - one that we call synchronously587* and one that we invoke via a taskq. The async one is useful for588* avoiding lock order reversals and limiting stack depth.589*590* Note that if we have a sync callback but no async callback,591* it's likely that the sync callback will free the structure592* containing the dbu. In that case we need to take care to not593* dereference dbu after calling the sync evict func.594*/595boolean_t has_async = (dbu->dbu_evict_func_async != NULL);596597if (dbu->dbu_evict_func_sync != NULL)598dbu->dbu_evict_func_sync(dbu);599600if (has_async) {601taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,602dbu, 0, &dbu->dbu_tqent);603}604}605606boolean_t607dbuf_is_metadata(dmu_buf_impl_t *db)608{609/*610* Consider indirect blocks and spill blocks to be meta data.611*/612if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {613return (B_TRUE);614} else {615boolean_t is_metadata;616617DB_DNODE_ENTER(db);618is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);619DB_DNODE_EXIT(db);620621return (is_metadata);622}623}624625/*626* We want to exclude buffers that are on a special allocation class from627* L2ARC.628*/629boolean_t630dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)631{632if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||633(db->db_objset->os_secondary_cache ==634ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {635if (l2arc_exclude_special == 0)636return (B_TRUE);637638/*639* bp must be checked in the event it was passed from640* dbuf_read_impl() as the result of a the BP being set from641* a Direct I/O write in dbuf_read(). See comments in642* dbuf_read().643*/644blkptr_t *db_bp = bp == NULL ? 
db->db_blkptr : bp;645646if (db_bp == NULL || BP_IS_HOLE(db_bp))647return (B_FALSE);648uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);649vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;650vdev_t *vd = NULL;651652if (vdev < rvd->vdev_children)653vd = rvd->vdev_child[vdev];654655if (vd == NULL)656return (B_TRUE);657658if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&659vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)660return (B_TRUE);661}662return (B_FALSE);663}664665static inline boolean_t666dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)667{668if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||669(dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&670(level > 0 ||671DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {672if (l2arc_exclude_special == 0)673return (B_TRUE);674675if (bp == NULL || BP_IS_HOLE(bp))676return (B_FALSE);677uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);678vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;679vdev_t *vd = NULL;680681if (vdev < rvd->vdev_children)682vd = rvd->vdev_child[vdev];683684if (vd == NULL)685return (B_TRUE);686687if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&688vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)689return (B_TRUE);690}691return (B_FALSE);692}693694695/*696* This function *must* return indices evenly distributed between all697* sublists of the multilist. This is needed due to how the dbuf eviction698* code is laid out; dbuf_evict_thread() assumes dbufs are evenly699* distributed between all sublists and uses this assumption when700* deciding which sublist to evict from and how much to evict from it.701*/702static unsigned int703dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)704{705dmu_buf_impl_t *db = obj;706707/*708* The assumption here, is the hash value for a given709* dmu_buf_impl_t will remain constant throughout it's lifetime710* (i.e. it's objset, object, level and blkid fields don't change).711* Thus, we don't need to store the dbuf's sublist index712* on insertion, as this index can be recalculated on removal.713*714* Also, the low order bits of the hash value are thought to be715* distributed evenly. Otherwise, in the case that the multilist716* has a power of two number of sublists, each sublists' usage717* would not be evenly distributed. 
In this context full 64bit718* division would be a waste of time, so limit it to 32 bits.719*/720return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,721db->db_level, db->db_blkid) %722multilist_get_num_sublists(ml));723}724725/*726* The target size of the dbuf cache can grow with the ARC target,727* unless limited by the tunable dbuf_cache_max_bytes.728*/729static inline unsigned long730dbuf_cache_target_bytes(void)731{732return (MIN(dbuf_cache_max_bytes,733arc_target_bytes() >> dbuf_cache_shift));734}735736/*737* The target size of the dbuf metadata cache can grow with the ARC target,738* unless limited by the tunable dbuf_metadata_cache_max_bytes.739*/740static inline unsigned long741dbuf_metadata_cache_target_bytes(void)742{743return (MIN(dbuf_metadata_cache_max_bytes,744arc_target_bytes() >> dbuf_metadata_cache_shift));745}746747static inline uint64_t748dbuf_cache_hiwater_bytes(void)749{750uint64_t dbuf_cache_target = dbuf_cache_target_bytes();751return (dbuf_cache_target +752(dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);753}754755static inline uint64_t756dbuf_cache_lowater_bytes(void)757{758uint64_t dbuf_cache_target = dbuf_cache_target_bytes();759return (dbuf_cache_target -760(dbuf_cache_target * dbuf_cache_lowater_pct) / 100);761}762763static inline boolean_t764dbuf_cache_above_lowater(void)765{766return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >767dbuf_cache_lowater_bytes());768}769770/*771* Evict the oldest eligible dbuf from the dbuf cache.772*/773static void774dbuf_evict_one(void)775{776int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);777multilist_sublist_t *mls = multilist_sublist_lock_idx(778&dbuf_caches[DB_DBUF_CACHE].cache, idx);779780ASSERT(!MUTEX_HELD(&dbuf_evict_lock));781782dmu_buf_impl_t *db = multilist_sublist_tail(mls);783while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {784db = multilist_sublist_prev(mls, db);785}786787DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,788multilist_sublist_t *, mls);789790if (db != NULL) {791multilist_sublist_remove(mls, db);792multilist_sublist_unlock(mls);793uint64_t size = db->db.db_size;794uint64_t usize = dmu_buf_user_size(&db->db);795(void) zfs_refcount_remove_many(796&dbuf_caches[DB_DBUF_CACHE].size, size, db);797(void) zfs_refcount_remove_many(798&dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user);799DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);800DBUF_STAT_BUMPDOWN(cache_count);801DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize);802ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);803db->db_caching_status = DB_NO_CACHE;804dbuf_destroy(db);805DBUF_STAT_BUMP(cache_total_evicts);806} else {807multilist_sublist_unlock(mls);808}809}810811/*812* The dbuf evict thread is responsible for aging out dbufs from the813* cache. Once the cache has reached it's maximum size, dbufs are removed814* and destroyed. The eviction thread will continue running until the size815* of the dbuf cache is at or below the maximum size. 
Once the dbuf is aged816* out of the cache it is destroyed and becomes eligible for arc eviction.817*/818static __attribute__((noreturn)) void819dbuf_evict_thread(void *unused)820{821(void) unused;822callb_cpr_t cpr;823824CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);825826mutex_enter(&dbuf_evict_lock);827while (!dbuf_evict_thread_exit) {828while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {829CALLB_CPR_SAFE_BEGIN(&cpr);830(void) cv_timedwait_idle_hires(&dbuf_evict_cv,831&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);832CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);833}834mutex_exit(&dbuf_evict_lock);835836/*837* Keep evicting as long as we're above the low water mark838* for the cache. We do this without holding the locks to839* minimize lock contention.840*/841while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {842dbuf_evict_one();843}844845mutex_enter(&dbuf_evict_lock);846}847848dbuf_evict_thread_exit = B_FALSE;849cv_broadcast(&dbuf_evict_cv);850CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */851thread_exit();852}853854/*855* Wake up the dbuf eviction thread if the dbuf cache is at its max size.856* If the dbuf cache is at its high water mark, then evict a dbuf from the857* dbuf cache using the caller's context.858*/859static void860dbuf_evict_notify(uint64_t size)861{862/*863* We check if we should evict without holding the dbuf_evict_lock,864* because it's OK to occasionally make the wrong decision here,865* and grabbing the lock results in massive lock contention.866*/867if (size > dbuf_cache_target_bytes()) {868/*869* Avoid calling dbuf_evict_one() from memory reclaim context870* (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.871* Memory reclaim threads can get stuck waiting for the dbuf872* hash lock.873*/874if (size > dbuf_cache_hiwater_bytes() &&875!current_is_reclaim_thread()) {876dbuf_evict_one();877}878cv_signal(&dbuf_evict_cv);879}880}881882/*883* Since dbuf cache size is a fraction of target ARC size, ARC calls this when884* its target size is reduced due to memory pressure.885*/886void887dbuf_cache_reduce_target_size(void)888{889uint64_t size = zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);890891if (size > dbuf_cache_target_bytes())892cv_signal(&dbuf_evict_cv);893}894895static int896dbuf_kstat_update(kstat_t *ksp, int rw)897{898dbuf_stats_t *ds = ksp->ks_data;899dbuf_hash_table_t *h = &dbuf_hash_table;900901if (rw == KSTAT_WRITE)902return (SET_ERROR(EACCES));903904ds->cache_count.value.ui64 =905wmsum_value(&dbuf_sums.cache_count);906ds->cache_size_bytes.value.ui64 =907zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);908ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();909ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();910ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();911ds->cache_total_evicts.value.ui64 =912wmsum_value(&dbuf_sums.cache_total_evicts);913for (int i = 0; i < DN_MAX_LEVELS; i++) {914ds->cache_levels[i].value.ui64 =915wmsum_value(&dbuf_sums.cache_levels[i]);916ds->cache_levels_bytes[i].value.ui64 =917wmsum_value(&dbuf_sums.cache_levels_bytes[i]);918}919ds->hash_hits.value.ui64 =920wmsum_value(&dbuf_sums.hash_hits);921ds->hash_misses.value.ui64 =922wmsum_value(&dbuf_sums.hash_misses);923ds->hash_collisions.value.ui64 =924wmsum_value(&dbuf_sums.hash_collisions);925ds->hash_elements.value.ui64 =926wmsum_value(&dbuf_sums.hash_elements);927ds->hash_chains.value.ui64 =928wmsum_value(&dbuf_sums.hash_chains);929ds->hash_insert_race.value.ui64 
=930wmsum_value(&dbuf_sums.hash_insert_race);931ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;932ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;933ds->metadata_cache_count.value.ui64 =934wmsum_value(&dbuf_sums.metadata_cache_count);935ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(936&dbuf_caches[DB_DBUF_METADATA_CACHE].size);937ds->metadata_cache_overflow.value.ui64 =938wmsum_value(&dbuf_sums.metadata_cache_overflow);939return (0);940}941942void943dbuf_init(void)944{945uint64_t hmsize, hsize = 1ULL << 16;946dbuf_hash_table_t *h = &dbuf_hash_table;947948/*949* The hash table is big enough to fill one eighth of physical memory950* with an average block size of zfs_arc_average_blocksize (default 8K).951* By default, the table will take up952* totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).953*/954while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)955hsize <<= 1;956957h->hash_table = NULL;958while (h->hash_table == NULL) {959h->hash_table_mask = hsize - 1;960961h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);962if (h->hash_table == NULL)963hsize >>= 1;964965ASSERT3U(hsize, >=, 1ULL << 10);966}967968/*969* The hash table buckets are protected by an array of mutexes where970* each mutex is reponsible for protecting 128 buckets. A minimum971* array size of 8192 is targeted to avoid contention.972*/973if (dbuf_mutex_cache_shift == 0)974hmsize = MAX(hsize >> 7, 1ULL << 13);975else976hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);977978h->hash_mutexes = NULL;979while (h->hash_mutexes == NULL) {980h->hash_mutex_mask = hmsize - 1;981982h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),983KM_SLEEP);984if (h->hash_mutexes == NULL)985hmsize >>= 1;986}987988dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",989sizeof (dmu_buf_impl_t),9900, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);991dbuf_dirty_kmem_cache = kmem_cache_create("dbuf_dirty_record_t",992sizeof (dbuf_dirty_record_t), 0, NULL, NULL, NULL, NULL, NULL, 0);993994for (int i = 0; i < hmsize; i++)995mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL);996997dbuf_stats_init(h);998999/*1000* All entries are queued via taskq_dispatch_ent(), so min/maxalloc1001* configuration is not required.1002*/1003dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);10041005for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {1006multilist_create(&dbuf_caches[dcs].cache,1007sizeof (dmu_buf_impl_t),1008offsetof(dmu_buf_impl_t, db_cache_link),1009dbuf_cache_multilist_index_func);1010zfs_refcount_create(&dbuf_caches[dcs].size);1011}10121013dbuf_evict_thread_exit = B_FALSE;1014mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);1015cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);1016dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,1017NULL, 0, &p0, TS_RUN, minclsyspri);10181019wmsum_init(&dbuf_sums.cache_count, 0);1020wmsum_init(&dbuf_sums.cache_total_evicts, 0);1021for (int i = 0; i < DN_MAX_LEVELS; i++) {1022wmsum_init(&dbuf_sums.cache_levels[i], 0);1023wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);1024}1025wmsum_init(&dbuf_sums.hash_hits, 0);1026wmsum_init(&dbuf_sums.hash_misses, 0);1027wmsum_init(&dbuf_sums.hash_collisions, 0);1028wmsum_init(&dbuf_sums.hash_elements, 0);1029wmsum_init(&dbuf_sums.hash_chains, 0);1030wmsum_init(&dbuf_sums.hash_insert_race, 0);1031wmsum_init(&dbuf_sums.metadata_cache_count, 0);1032wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);10331034dbuf_ksp = kstat_create("zfs", 0, "dbufstats", 
"misc",1035KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),1036KSTAT_FLAG_VIRTUAL);1037if (dbuf_ksp != NULL) {1038for (int i = 0; i < DN_MAX_LEVELS; i++) {1039snprintf(dbuf_stats.cache_levels[i].name,1040KSTAT_STRLEN, "cache_level_%d", i);1041dbuf_stats.cache_levels[i].data_type =1042KSTAT_DATA_UINT64;1043snprintf(dbuf_stats.cache_levels_bytes[i].name,1044KSTAT_STRLEN, "cache_level_%d_bytes", i);1045dbuf_stats.cache_levels_bytes[i].data_type =1046KSTAT_DATA_UINT64;1047}1048dbuf_ksp->ks_data = &dbuf_stats;1049dbuf_ksp->ks_update = dbuf_kstat_update;1050kstat_install(dbuf_ksp);1051}1052}10531054void1055dbuf_fini(void)1056{1057dbuf_hash_table_t *h = &dbuf_hash_table;10581059dbuf_stats_destroy();10601061for (int i = 0; i < (h->hash_mutex_mask + 1); i++)1062mutex_destroy(&h->hash_mutexes[i]);10631064vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));1065vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *1066sizeof (kmutex_t));10671068kmem_cache_destroy(dbuf_kmem_cache);1069kmem_cache_destroy(dbuf_dirty_kmem_cache);1070taskq_destroy(dbu_evict_taskq);10711072mutex_enter(&dbuf_evict_lock);1073dbuf_evict_thread_exit = B_TRUE;1074while (dbuf_evict_thread_exit) {1075cv_signal(&dbuf_evict_cv);1076cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);1077}1078mutex_exit(&dbuf_evict_lock);10791080mutex_destroy(&dbuf_evict_lock);1081cv_destroy(&dbuf_evict_cv);10821083for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {1084zfs_refcount_destroy(&dbuf_caches[dcs].size);1085multilist_destroy(&dbuf_caches[dcs].cache);1086}10871088if (dbuf_ksp != NULL) {1089kstat_delete(dbuf_ksp);1090dbuf_ksp = NULL;1091}10921093wmsum_fini(&dbuf_sums.cache_count);1094wmsum_fini(&dbuf_sums.cache_total_evicts);1095for (int i = 0; i < DN_MAX_LEVELS; i++) {1096wmsum_fini(&dbuf_sums.cache_levels[i]);1097wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);1098}1099wmsum_fini(&dbuf_sums.hash_hits);1100wmsum_fini(&dbuf_sums.hash_misses);1101wmsum_fini(&dbuf_sums.hash_collisions);1102wmsum_fini(&dbuf_sums.hash_elements);1103wmsum_fini(&dbuf_sums.hash_chains);1104wmsum_fini(&dbuf_sums.hash_insert_race);1105wmsum_fini(&dbuf_sums.metadata_cache_count);1106wmsum_fini(&dbuf_sums.metadata_cache_overflow);1107}11081109/*1110* Other stuff.1111*/11121113#ifdef ZFS_DEBUG1114static void1115dbuf_verify(dmu_buf_impl_t *db)1116{1117dnode_t *dn;1118dbuf_dirty_record_t *dr;1119uint32_t txg_prev;11201121ASSERT(MUTEX_HELD(&db->db_mtx));11221123if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))1124return;11251126ASSERT(db->db_objset != NULL);1127DB_DNODE_ENTER(db);1128dn = DB_DNODE(db);1129if (dn == NULL) {1130ASSERT0P(db->db_parent);1131ASSERT0P(db->db_blkptr);1132} else {1133ASSERT3U(db->db.db_object, ==, dn->dn_object);1134ASSERT3P(db->db_objset, ==, dn->dn_objset);1135ASSERT3U(db->db_level, <, dn->dn_nlevels);1136ASSERT(db->db_blkid == DMU_BONUS_BLKID ||1137db->db_blkid == DMU_SPILL_BLKID ||1138!avl_is_empty(&dn->dn_dbufs));1139}1140if (db->db_blkid == DMU_BONUS_BLKID) {1141ASSERT(dn != NULL);1142ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);1143ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);1144} else if (db->db_blkid == DMU_SPILL_BLKID) {1145ASSERT(dn != NULL);1146ASSERT0(db->db.db_offset);1147} else {1148ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);1149}11501151if ((dr = list_head(&db->db_dirty_records)) != NULL) {1152ASSERT(dr->dr_dbuf == db);1153txg_prev = dr->dr_txg;1154for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;1155dr = list_next(&db->db_dirty_records, dr)) {1156ASSERT(dr->dr_dbuf == 
db);1157ASSERT(txg_prev > dr->dr_txg);1158txg_prev = dr->dr_txg;1159}1160}11611162/*1163* We can't assert that db_size matches dn_datablksz because it1164* can be momentarily different when another thread is doing1165* dnode_set_blksz().1166*/1167if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {1168dr = db->db_data_pending;1169/*1170* It should only be modified in syncing context, so1171* make sure we only have one copy of the data.1172*/1173ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);1174}11751176/* verify db->db_blkptr */1177if (db->db_blkptr) {1178if (db->db_parent == dn->dn_dbuf) {1179/* db is pointed to by the dnode */1180/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */1181if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))1182ASSERT0P(db->db_parent);1183else1184ASSERT(db->db_parent != NULL);1185if (db->db_blkid != DMU_SPILL_BLKID)1186ASSERT3P(db->db_blkptr, ==,1187&dn->dn_phys->dn_blkptr[db->db_blkid]);1188} else {1189/* db is pointed to by an indirect block */1190int epb __maybe_unused = db->db_parent->db.db_size >>1191SPA_BLKPTRSHIFT;1192ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);1193ASSERT3U(db->db_parent->db.db_object, ==,1194db->db.db_object);1195ASSERT3P(db->db_blkptr, ==,1196((blkptr_t *)db->db_parent->db.db_data +1197db->db_blkid % epb));1198}1199}1200if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&1201(db->db_buf == NULL || db->db_buf->b_data) &&1202db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&1203db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {1204/*1205* If the blkptr isn't set but they have nonzero data,1206* it had better be dirty, otherwise we'll lose that1207* data when we evict this buffer.1208*1209* There is an exception to this rule for indirect blocks; in1210* this case, if the indirect block is a hole, we fill in a few1211* fields on each of the child blocks (importantly, birth time)1212* to prevent hole birth times from being lost when you1213* partially fill in a hole.1214*/1215if (db->db_dirtycnt == 0) {1216if (db->db_level == 0) {1217uint64_t *buf = db->db.db_data;1218int i;12191220for (i = 0; i < db->db.db_size >> 3; i++) {1221ASSERT0(buf[i]);1222}1223} else {1224blkptr_t *bps = db->db.db_data;1225ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,1226db->db.db_size);1227/*1228* We want to verify that all the blkptrs in the1229* indirect block are holes, but we may have1230* automatically set up a few fields for them.1231* We iterate through each blkptr and verify1232* they only have those fields set.1233*/1234for (int i = 0;1235i < db->db.db_size / sizeof (blkptr_t);1236i++) {1237blkptr_t *bp = &bps[i];1238ASSERT(ZIO_CHECKSUM_IS_ZERO(1239&bp->blk_cksum));1240ASSERT(1241DVA_IS_EMPTY(&bp->blk_dva[0]) &&1242DVA_IS_EMPTY(&bp->blk_dva[1]) &&1243DVA_IS_EMPTY(&bp->blk_dva[2]));1244ASSERT0(bp->blk_fill);1245ASSERT(!BP_IS_EMBEDDED(bp));1246ASSERT(BP_IS_HOLE(bp));1247ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));1248}1249}1250}1251}1252DB_DNODE_EXIT(db);1253}1254#endif12551256static void1257dbuf_clear_data(dmu_buf_impl_t *db)1258{1259ASSERT(MUTEX_HELD(&db->db_mtx));1260dbuf_evict_user(db);1261ASSERT0P(db->db_buf);1262db->db.db_data = NULL;1263if (db->db_state != DB_NOFILL) {1264db->db_state = DB_UNCACHED;1265DTRACE_SET_STATE(db, "clear data");1266}1267}12681269static void1270dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)1271{1272ASSERT(MUTEX_HELD(&db->db_mtx));1273ASSERT(buf != NULL);12741275db->db_buf = buf;1276ASSERT(buf->b_data != NULL);1277db->db.db_data = buf->b_data;1278}12791280static arc_buf_t 
*1281dbuf_alloc_arcbuf(dmu_buf_impl_t *db)1282{1283spa_t *spa = db->db_objset->os_spa;12841285return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));1286}12871288/*1289* Calculate which level n block references the data at the level 0 offset1290* provided.1291*/1292uint64_t1293dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)1294{1295if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {1296/*1297* The level n blkid is equal to the level 0 blkid divided by1298* the number of level 0s in a level n block.1299*1300* The level 0 blkid is offset >> datablkshift =1301* offset / 2^datablkshift.1302*1303* The number of level 0s in a level n is the number of block1304* pointers in an indirect block, raised to the power of level.1305* This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =1306* 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).1307*1308* Thus, the level n blkid is: offset /1309* ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))1310* = offset / 2^(datablkshift + level *1311* (indblkshift - SPA_BLKPTRSHIFT))1312* = offset >> (datablkshift + level *1313* (indblkshift - SPA_BLKPTRSHIFT))1314*/13151316const unsigned exp = dn->dn_datablkshift +1317level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);13181319if (exp >= 8 * sizeof (offset)) {1320/* This only happens on the highest indirection level */1321ASSERT3U(level, ==, dn->dn_nlevels - 1);1322return (0);1323}13241325ASSERT3U(exp, <, 8 * sizeof (offset));13261327return (offset >> exp);1328} else {1329ASSERT3U(offset, <, dn->dn_datablksz);1330return (0);1331}1332}13331334/*1335* This function is used to lock the parent of the provided dbuf. This should be1336* used when modifying or reading db_blkptr.1337*/1338db_lock_type_t1339dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)1340{1341enum db_lock_type ret = DLT_NONE;1342if (db->db_parent != NULL) {1343rw_enter(&db->db_parent->db_rwlock, rw);1344ret = DLT_PARENT;1345} else if (dmu_objset_ds(db->db_objset) != NULL) {1346rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,1347tag);1348ret = DLT_OBJSET;1349}1350/*1351* We only return a DLT_NONE lock when it's the top-most indirect block1352* of the meta-dnode of the MOS.1353*/1354return (ret);1355}13561357/*1358* We need to pass the lock type in because it's possible that the block will1359* move from being the topmost indirect block in a dnode (and thus, have no1360* parent) to not the top-most via an indirection increase. 
This would cause a1361* panic if we didn't pass the lock type in.1362*/1363void1364dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)1365{1366if (type == DLT_PARENT)1367rw_exit(&db->db_parent->db_rwlock);1368else if (type == DLT_OBJSET)1369rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);1370}13711372static void1373dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,1374arc_buf_t *buf, void *vdb)1375{1376(void) zb, (void) bp;1377dmu_buf_impl_t *db = vdb;13781379mutex_enter(&db->db_mtx);1380ASSERT3U(db->db_state, ==, DB_READ);13811382/*1383* All reads are synchronous, so we must have a hold on the dbuf1384*/1385ASSERT(zfs_refcount_count(&db->db_holds) > 0);1386ASSERT0P(db->db_buf);1387ASSERT0P(db->db.db_data);1388if (buf == NULL) {1389/* i/o error */1390ASSERT(zio == NULL || zio->io_error != 0);1391ASSERT(db->db_blkid != DMU_BONUS_BLKID);1392ASSERT0P(db->db_buf);1393db->db_state = DB_UNCACHED;1394DTRACE_SET_STATE(db, "i/o error");1395} else if (db->db_level == 0 && db->db_freed_in_flight) {1396/* freed in flight */1397ASSERT(zio == NULL || zio->io_error == 0);1398arc_release(buf, db);1399memset(buf->b_data, 0, db->db.db_size);1400arc_buf_freeze(buf);1401db->db_freed_in_flight = FALSE;1402dbuf_set_data(db, buf);1403db->db_state = DB_CACHED;1404DTRACE_SET_STATE(db, "freed in flight");1405} else {1406/* success */1407ASSERT(zio == NULL || zio->io_error == 0);1408dbuf_set_data(db, buf);1409db->db_state = DB_CACHED;1410DTRACE_SET_STATE(db, "successful read");1411}1412cv_broadcast(&db->db_changed);1413dbuf_rele_and_unlock(db, NULL, B_FALSE);1414}14151416/*1417* Shortcut for performing reads on bonus dbufs. Returns1418* an error if we fail to verify the dnode associated with1419* a decrypted block. Otherwise success.1420*/1421static int1422dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)1423{1424void* db_data;1425int bonuslen, max_bonuslen;14261427bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);1428max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);1429ASSERT(MUTEX_HELD(&db->db_mtx));1430ASSERT(DB_DNODE_HELD(db));1431ASSERT3U(bonuslen, <=, db->db.db_size);1432db_data = kmem_alloc(max_bonuslen, KM_SLEEP);1433arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);1434if (bonuslen < max_bonuslen)1435memset(db_data, 0, max_bonuslen);1436if (bonuslen)1437memcpy(db_data, DN_BONUS(dn->dn_phys), bonuslen);1438db->db.db_data = db_data;1439db->db_state = DB_CACHED;1440DTRACE_SET_STATE(db, "bonus buffer filled");1441return (0);1442}14431444static void1445dbuf_handle_indirect_hole(void *data, dnode_t *dn, blkptr_t *dbbp)1446{1447blkptr_t *bps = data;1448uint32_t indbs = 1ULL << dn->dn_indblkshift;1449int n_bps = indbs >> SPA_BLKPTRSHIFT;14501451for (int i = 0; i < n_bps; i++) {1452blkptr_t *bp = &bps[i];14531454ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);1455BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?1456dn->dn_datablksz : BP_GET_LSIZE(dbbp));1457BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));1458BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);1459BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);1460}1461}14621463/*1464* Handle reads on dbufs that are holes, if necessary. This function1465* requires that the dbuf's mutex is held. 
Returns success (0) if action1466* was taken, ENOENT if no action was taken.1467*/1468static int1469dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)1470{1471ASSERT(MUTEX_HELD(&db->db_mtx));1472arc_buf_t *db_data;14731474int is_hole = bp == NULL || BP_IS_HOLE(bp);1475/*1476* For level 0 blocks only, if the above check fails:1477* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()1478* processes the delete record and clears the bp while we are waiting1479* for the dn_mtx (resulting in a "no" from block_freed).1480*/1481if (!is_hole && db->db_level == 0)1482is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);14831484if (is_hole) {1485db_data = dbuf_alloc_arcbuf(db);1486memset(db_data->b_data, 0, db->db.db_size);14871488if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&1489BP_GET_LOGICAL_BIRTH(bp) != 0) {1490dbuf_handle_indirect_hole(db_data->b_data, dn, bp);1491}1492dbuf_set_data(db, db_data);1493db->db_state = DB_CACHED;1494DTRACE_SET_STATE(db, "hole read satisfied");1495return (0);1496}1497return (ENOENT);1498}14991500/*1501* This function ensures that, when doing a decrypting read of a block,1502* we make sure we have decrypted the dnode associated with it. We must do1503* this so that we ensure we are fully authenticating the checksum-of-MACs1504* tree from the root of the objset down to this block. Indirect blocks are1505* always verified against their secure checksum-of-MACs assuming that the1506* dnode containing them is correct. Now that we are doing a decrypting read,1507* we can be sure that the key is loaded and verify that assumption. This is1508* especially important considering that we always read encrypted dnode1509* blocks as raw data (without verifying their MACs) to start, and1510* decrypt / authenticate them when we need to read an encrypted bonus buffer.1511*/1512static int1513dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,1514dmu_flags_t flags)1515{1516objset_t *os = db->db_objset;1517dmu_buf_impl_t *dndb;1518arc_buf_t *dnbuf;1519zbookmark_phys_t zb;1520int err;15211522if ((flags & DMU_READ_NO_DECRYPT) != 0 ||1523!os->os_encrypted || os->os_raw_receive ||1524(dndb = dn->dn_dbuf) == NULL)1525return (0);15261527dnbuf = dndb->db_buf;1528if (!arc_is_encrypted(dnbuf))1529return (0);15301531mutex_enter(&dndb->db_mtx);15321533/*1534* Since dnode buffer is modified by sync process, there can be only1535* one copy of it. It means we can not modify (decrypt) it while it1536* is being written. I don't see how this may happen now, since1537* encrypted dnode writes by receive should be completed before any1538* plain-text reads due to txg wait, but better be safe than sorry.1539*/1540while (1) {1541if (!arc_is_encrypted(dnbuf)) {1542mutex_exit(&dndb->db_mtx);1543return (0);1544}1545dbuf_dirty_record_t *dr = dndb->db_data_pending;1546if (dr == NULL || dr->dt.dl.dr_data != dnbuf)1547break;1548cv_wait(&dndb->db_changed, &dndb->db_mtx);1549};15501551SET_BOOKMARK(&zb, dmu_objset_id(os),1552DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);1553err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);15541555/*1556* An error code of EACCES tells us that the key is still not1557* available. 
This is ok if we are only reading authenticated1558* (and therefore non-encrypted) blocks.1559*/1560if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&1561!DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||1562(db->db_blkid == DMU_BONUS_BLKID &&1563!DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))1564err = 0;15651566mutex_exit(&dndb->db_mtx);15671568return (err);1569}15701571/*1572* Drops db_mtx and the parent lock specified by dblt and tag before1573* returning.1574*/1575static int1576dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,1577db_lock_type_t dblt, blkptr_t *bp, const void *tag)1578{1579zbookmark_phys_t zb;1580uint32_t aflags = ARC_FLAG_NOWAIT;1581int err, zio_flags;15821583ASSERT(!zfs_refcount_is_zero(&db->db_holds));1584ASSERT(MUTEX_HELD(&db->db_mtx));1585ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);1586ASSERT0P(db->db_buf);1587ASSERT(db->db_parent == NULL ||1588RW_LOCK_HELD(&db->db_parent->db_rwlock));15891590if (db->db_blkid == DMU_BONUS_BLKID) {1591err = dbuf_read_bonus(db, dn);1592goto early_unlock;1593}15941595err = dbuf_read_hole(db, dn, bp);1596if (err == 0)1597goto early_unlock;15981599ASSERT(bp != NULL);16001601/*1602* Any attempt to read a redacted block should result in an error. This1603* will never happen under normal conditions, but can be useful for1604* debugging purposes.1605*/1606if (BP_IS_REDACTED(bp)) {1607ASSERT(dsl_dataset_feature_is_active(1608db->db_objset->os_dsl_dataset,1609SPA_FEATURE_REDACTED_DATASETS));1610err = SET_ERROR(EIO);1611goto early_unlock;1612}16131614SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),1615db->db.db_object, db->db_level, db->db_blkid);16161617/*1618* All bps of an encrypted os should have the encryption bit set.1619* If this is not true it indicates tampering and we report an error.1620*/1621if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {1622spa_log_error(db->db_objset->os_spa, &zb,1623BP_GET_PHYSICAL_BIRTH(bp));1624err = SET_ERROR(EIO);1625goto early_unlock;1626}16271628db->db_state = DB_READ;1629DTRACE_SET_STATE(db, "read issued");1630mutex_exit(&db->db_mtx);16311632if (!DBUF_IS_CACHEABLE(db))1633aflags |= ARC_FLAG_UNCACHED;1634else if (dbuf_is_l2cacheable(db, bp))1635aflags |= ARC_FLAG_L2CACHE;16361637dbuf_add_ref(db, NULL);16381639zio_flags = (flags & DB_RF_CANFAIL) ?1640ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;16411642if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))1643zio_flags |= ZIO_FLAG_RAW;16441645/*1646* The zio layer will copy the provided blkptr later, but we need to1647* do this now so that we can release the parent's rwlock. We have to1648* do that now so that if dbuf_read_done is called synchronously (on1649* an l1 cache hit) we don't acquire the db_mtx while holding the1650* parent's rwlock, which would be a lock ordering violation.1651*/1652blkptr_t copy = *bp;1653dmu_buf_unlock_parent(db, dblt, tag);1654return (arc_read(zio, db->db_objset->os_spa, ©,1655dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,1656&aflags, &zb));16571658early_unlock:1659mutex_exit(&db->db_mtx);1660dmu_buf_unlock_parent(db, dblt, tag);1661return (err);1662}16631664/*1665* This is our just-in-time copy function. 
It makes a copy of buffers that1666* have been modified in a previous transaction group before we access them in1667* the current active group.1668*1669* This function is used in three places: when we are dirtying a buffer for the1670* first time in a txg, when we are freeing a range in a dnode that includes1671* this buffer, and when we are accessing a buffer which was received compressed1672* and later referenced in a WRITE_BYREF record.1673*1674* Note that when we are called from dbuf_free_range() we do not put a hold on1675* the buffer, we just traverse the active dbuf list for the dnode.1676*/1677static void1678dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)1679{1680dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);16811682ASSERT(MUTEX_HELD(&db->db_mtx));1683ASSERT(db->db.db_data != NULL);1684ASSERT0(db->db_level);1685ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);16861687if (dr == NULL ||1688(dr->dt.dl.dr_data !=1689((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))1690return;16911692/*1693* If the last dirty record for this dbuf has not yet synced1694* and its referencing the dbuf data, either:1695* reset the reference to point to a new copy,1696* or (if there a no active holders)1697* just null out the current db_data pointer.1698*/1699ASSERT3U(dr->dr_txg, >=, txg - 2);1700if (db->db_blkid == DMU_BONUS_BLKID) {1701dnode_t *dn = DB_DNODE(db);1702int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);1703dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);1704arc_space_consume(bonuslen, ARC_SPACE_BONUS);1705memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);1706} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {1707dnode_t *dn = DB_DNODE(db);1708int size = arc_buf_size(db->db_buf);1709arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);1710spa_t *spa = db->db_objset->os_spa;1711enum zio_compress compress_type =1712arc_get_compression(db->db_buf);1713uint8_t complevel = arc_get_complevel(db->db_buf);17141715if (arc_is_encrypted(db->db_buf)) {1716boolean_t byteorder;1717uint8_t salt[ZIO_DATA_SALT_LEN];1718uint8_t iv[ZIO_DATA_IV_LEN];1719uint8_t mac[ZIO_DATA_MAC_LEN];17201721arc_get_raw_params(db->db_buf, &byteorder, salt,1722iv, mac);1723dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,1724dmu_objset_id(dn->dn_objset), byteorder, salt, iv,1725mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),1726compress_type, complevel);1727} else if (compress_type != ZIO_COMPRESS_OFF) {1728ASSERT3U(type, ==, ARC_BUFC_DATA);1729dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,1730size, arc_buf_lsize(db->db_buf), compress_type,1731complevel);1732} else {1733dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);1734}1735memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);1736} else {1737db->db_buf = NULL;1738dbuf_clear_data(db);1739}1740}17411742int1743dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)1744{1745dnode_t *dn;1746boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;1747int err;17481749ASSERT(!zfs_refcount_is_zero(&db->db_holds));17501751DB_DNODE_ENTER(db);1752dn = DB_DNODE(db);17531754/*1755* Ensure that this block's dnode has been decrypted if the caller1756* has requested decrypted data.1757*/1758err = dbuf_read_verify_dnode_crypt(db, dn, flags);1759if (err != 0)1760goto done;17611762prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&1763(flags & DMU_READ_NO_PREFETCH) == 0;17641765mutex_enter(&db->db_mtx);1766if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))1767db->db_pending_evict = B_FALSE;1768if (flags & 
DMU_PARTIAL_FIRST)1769db->db_partial_read = B_TRUE;1770else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))1771db->db_partial_read = B_FALSE;1772miss = (db->db_state != DB_CACHED);17731774if (db->db_state == DB_READ || db->db_state == DB_FILL) {1775/*1776* Another reader came in while the dbuf was in flight between1777* UNCACHED and CACHED. Either a writer will finish filling1778* the buffer, sending the dbuf to CACHED, or the first reader's1779* request will reach the read_done callback and send the dbuf1780* to CACHED. Otherwise, a failure occurred and the dbuf will1781* be sent to UNCACHED.1782*/1783if (flags & DB_RF_NEVERWAIT) {1784mutex_exit(&db->db_mtx);1785DB_DNODE_EXIT(db);1786goto done;1787}1788do {1789ASSERT(db->db_state == DB_READ ||1790(flags & DB_RF_HAVESTRUCT) == 0);1791DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,1792zio_t *, pio);1793cv_wait(&db->db_changed, &db->db_mtx);1794} while (db->db_state == DB_READ || db->db_state == DB_FILL);1795if (db->db_state == DB_UNCACHED) {1796err = SET_ERROR(EIO);1797mutex_exit(&db->db_mtx);1798DB_DNODE_EXIT(db);1799goto done;1800}1801}18021803if (db->db_state == DB_CACHED) {1804/*1805* If the arc buf is compressed or encrypted and the caller1806* requested uncompressed data, we need to untransform it1807* before returning. We also call arc_untransform() on any1808* unauthenticated blocks, which will verify their MAC if1809* the key is now available.1810*/1811if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&1812(arc_is_encrypted(db->db_buf) ||1813arc_is_unauthenticated(db->db_buf) ||1814arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {1815spa_t *spa = dn->dn_objset->os_spa;1816zbookmark_phys_t zb;18171818SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),1819db->db.db_object, db->db_level, db->db_blkid);1820dbuf_fix_old_data(db, spa_syncing_txg(spa));1821err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);1822dbuf_set_data(db, db->db_buf);1823}1824mutex_exit(&db->db_mtx);1825} else {1826ASSERT(db->db_state == DB_UNCACHED ||1827db->db_state == DB_NOFILL);1828db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);1829blkptr_t *bp;18301831/*1832* If a block clone or Direct I/O write has occurred we will1833* get the dirty records overridden BP so we get the most1834* recent data.1835*/1836err = dmu_buf_get_bp_from_dbuf(db, &bp);18371838if (!err) {1839if (pio == NULL && (db->db_state == DB_NOFILL ||1840(bp != NULL && !BP_IS_HOLE(bp)))) {1841spa_t *spa = dn->dn_objset->os_spa;1842pio =1843zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);1844need_wait = B_TRUE;1845}18461847err =1848dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);1849} else {1850mutex_exit(&db->db_mtx);1851dmu_buf_unlock_parent(db, dblt, FTAG);1852}1853/* dbuf_read_impl drops db_mtx and parent's rwlock. 
*/1854miss = (db->db_state != DB_CACHED);1855}18561857if (err == 0 && prefetch) {1858dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,1859flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||1860db->db_pending_evict);1861}1862DB_DNODE_EXIT(db);18631864/*1865* If we created a zio we must execute it to avoid leaking it, even if1866* it isn't attached to any work due to an error in dbuf_read_impl().1867*/1868if (need_wait) {1869if (err == 0)1870err = zio_wait(pio);1871else1872(void) zio_wait(pio);1873pio = NULL;1874}18751876done:1877if (miss)1878DBUF_STAT_BUMP(hash_misses);1879else1880DBUF_STAT_BUMP(hash_hits);1881if (pio && err != 0) {1882zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,1883ZIO_FLAG_CANFAIL);1884zio->io_error = err;1885zio_nowait(zio);1886}18871888return (err);1889}18901891static void1892dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)1893{1894ASSERT(!zfs_refcount_is_zero(&db->db_holds));1895ASSERT(db->db_blkid != DMU_BONUS_BLKID);1896mutex_enter(&db->db_mtx);1897if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))1898db->db_pending_evict = B_FALSE;1899db->db_partial_read = B_FALSE;1900while (db->db_state == DB_READ || db->db_state == DB_FILL)1901cv_wait(&db->db_changed, &db->db_mtx);1902if (db->db_state == DB_UNCACHED) {1903ASSERT0P(db->db_buf);1904ASSERT0P(db->db.db_data);1905dbuf_set_data(db, dbuf_alloc_arcbuf(db));1906db->db_state = DB_FILL;1907DTRACE_SET_STATE(db, "assigning filled buffer");1908} else if (db->db_state == DB_NOFILL) {1909dbuf_clear_data(db);1910} else {1911ASSERT3U(db->db_state, ==, DB_CACHED);1912}1913mutex_exit(&db->db_mtx);1914}19151916void1917dbuf_unoverride(dbuf_dirty_record_t *dr)1918{1919dmu_buf_impl_t *db = dr->dr_dbuf;1920blkptr_t *bp = &dr->dt.dl.dr_overridden_by;1921uint64_t txg = dr->dr_txg;19221923ASSERT(MUTEX_HELD(&db->db_mtx));19241925/*1926* This assert is valid because dmu_sync() expects to be called by1927* a zilog's get_data while holding a range lock. This call only1928* comes from dbuf_dirty() callers who must also hold a range lock.1929*/1930ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);1931ASSERT0(db->db_level);19321933if (db->db_blkid == DMU_BONUS_BLKID ||1934dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)1935return;19361937ASSERT(db->db_data_pending != dr);19381939/* free this block */1940if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)1941zio_free(db->db_objset->os_spa, txg, bp);19421943if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {1944ASSERT0P(dr->dt.dl.dr_data);1945dr->dt.dl.dr_data = db->db_buf;1946}1947dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;1948dr->dt.dl.dr_nopwrite = B_FALSE;1949dr->dt.dl.dr_brtwrite = B_FALSE;1950dr->dt.dl.dr_diowrite = B_FALSE;1951dr->dt.dl.dr_has_raw_params = B_FALSE;19521953/*1954* In the event that Direct I/O was used, we do not1955* need to release the buffer from the ARC.1956*1957* Release the already-written buffer, so we leave it in1958* a consistent dirty state. Note that all callers are1959* modifying the buffer, so they will immediately do1960* another (redundant) arc_release(). 
Therefore, leave1961* the buf thawed to save the effort of freezing &1962* immediately re-thawing it.1963*/1964if (dr->dt.dl.dr_data)1965arc_release(dr->dt.dl.dr_data, db);1966}19671968/*1969* Evict (if its unreferenced) or clear (if its referenced) any level-01970* data blocks in the free range, so that any future readers will find1971* empty blocks.1972*/1973void1974dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,1975dmu_tx_t *tx)1976{1977dmu_buf_impl_t *db_search;1978dmu_buf_impl_t *db, *db_next;1979uint64_t txg = tx->tx_txg;1980avl_index_t where;1981dbuf_dirty_record_t *dr;19821983if (end_blkid > dn->dn_maxblkid &&1984!(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))1985end_blkid = dn->dn_maxblkid;1986dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,1987(u_longlong_t)end_blkid);19881989db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);1990db_search->db_level = 0;1991db_search->db_blkid = start_blkid;1992db_search->db_state = DB_SEARCH;19931994mutex_enter(&dn->dn_dbufs_mtx);1995db = avl_find(&dn->dn_dbufs, db_search, &where);1996ASSERT0P(db);19971998db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);19992000for (; db != NULL; db = db_next) {2001db_next = AVL_NEXT(&dn->dn_dbufs, db);2002ASSERT(db->db_blkid != DMU_BONUS_BLKID);20032004if (db->db_level != 0 || db->db_blkid > end_blkid) {2005break;2006}2007ASSERT3U(db->db_blkid, >=, start_blkid);20082009/* found a level 0 buffer in the range */2010mutex_enter(&db->db_mtx);2011if (dbuf_undirty(db, tx)) {2012/* mutex has been dropped and dbuf destroyed */2013continue;2014}20152016if (db->db_state == DB_UNCACHED ||2017db->db_state == DB_NOFILL ||2018db->db_state == DB_EVICTING) {2019ASSERT0P(db->db.db_data);2020mutex_exit(&db->db_mtx);2021continue;2022}2023if (db->db_state == DB_READ || db->db_state == DB_FILL) {2024/* will be handled in dbuf_read_done or dbuf_rele */2025db->db_freed_in_flight = TRUE;2026mutex_exit(&db->db_mtx);2027continue;2028}2029if (zfs_refcount_count(&db->db_holds) == 0) {2030ASSERT(db->db_buf);2031dbuf_destroy(db);2032continue;2033}2034/* The dbuf is referenced */20352036dr = list_head(&db->db_dirty_records);2037if (dr != NULL) {2038if (dr->dr_txg == txg) {2039/*2040* This buffer is "in-use", re-adjust the file2041* size to reflect that this buffer may2042* contain new data when we sync.2043*/2044if (db->db_blkid != DMU_SPILL_BLKID &&2045db->db_blkid > dn->dn_maxblkid)2046dn->dn_maxblkid = db->db_blkid;2047dbuf_unoverride(dr);2048} else {2049/*2050* This dbuf is not dirty in the open context.2051* Either uncache it (if its not referenced in2052* the open context) or reset its contents to2053* empty.2054*/2055dbuf_fix_old_data(db, txg);2056}2057}2058/* clear the contents if its cached */2059if (db->db_state == DB_CACHED) {2060ASSERT(db->db.db_data != NULL);2061arc_release(db->db_buf, db);2062rw_enter(&db->db_rwlock, RW_WRITER);2063memset(db->db.db_data, 0, db->db.db_size);2064rw_exit(&db->db_rwlock);2065arc_buf_freeze(db->db_buf);2066}20672068mutex_exit(&db->db_mtx);2069}20702071mutex_exit(&dn->dn_dbufs_mtx);2072kmem_free(db_search, sizeof (dmu_buf_impl_t));2073}20742075void2076dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)2077{2078arc_buf_t *buf, *old_buf;2079dbuf_dirty_record_t *dr;2080int osize = db->db.db_size;2081arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);2082dnode_t *dn;20832084ASSERT(db->db_blkid != DMU_BONUS_BLKID);20852086DB_DNODE_ENTER(db);2087dn = DB_DNODE(db);20882089/*2090* XXX we should be doing a dbuf_read, checking the 
return2091* value and returning that up to our callers2092*/2093dmu_buf_will_dirty(&db->db, tx);20942095VERIFY3P(db->db_buf, !=, NULL);20962097/* create the data buffer for the new block */2098buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);20992100/* copy old block data to the new block */2101old_buf = db->db_buf;2102memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));2103/* zero the remainder */2104if (size > osize)2105memset((uint8_t *)buf->b_data + osize, 0, size - osize);21062107mutex_enter(&db->db_mtx);2108dbuf_set_data(db, buf);2109arc_buf_destroy(old_buf, db);2110db->db.db_size = size;21112112dr = list_head(&db->db_dirty_records);2113/* dirty record added by dmu_buf_will_dirty() */2114VERIFY(dr != NULL);2115if (db->db_level == 0)2116dr->dt.dl.dr_data = buf;2117ASSERT3U(dr->dr_txg, ==, tx->tx_txg);2118ASSERT3U(dr->dr_accounted, ==, osize);2119dr->dr_accounted = size;2120mutex_exit(&db->db_mtx);21212122dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);2123DB_DNODE_EXIT(db);2124}21252126void2127dbuf_release_bp(dmu_buf_impl_t *db)2128{2129objset_t *os __maybe_unused = db->db_objset;21302131ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));2132ASSERT(arc_released(os->os_phys_buf) ||2133list_link_active(&os->os_dsl_dataset->ds_synced_link));2134ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));21352136(void) arc_release(db->db_buf, db);2137}21382139/*2140* We already have a dirty record for this TXG, and we are being2141* dirtied again.2142*/2143static void2144dbuf_redirty(dbuf_dirty_record_t *dr)2145{2146dmu_buf_impl_t *db = dr->dr_dbuf;21472148ASSERT(MUTEX_HELD(&db->db_mtx));21492150if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {2151/*2152* If this buffer has already been written out,2153* we now need to reset its state.2154*/2155dbuf_unoverride(dr);2156if (db->db.db_object != DMU_META_DNODE_OBJECT &&2157db->db_state != DB_NOFILL) {2158/* Already released on initial dirty, so just thaw. 
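* (Freezing is the ARC's debug checksum mechanism: arc_buf_freeze()
* records a checksum of a buffer that is not expected to change, and
* arc_buf_thaw() clears it so the buffer may legitimately be
* modified again.)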
*/2159ASSERT(arc_released(db->db_buf));2160arc_buf_thaw(db->db_buf);2161}21622163/*2164* Clear the rewrite flag since this is now a logical2165* modification.2166*/2167dr->dt.dl.dr_rewrite = B_FALSE;2168}2169}21702171dbuf_dirty_record_t *2172dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)2173{2174rw_enter(&dn->dn_struct_rwlock, RW_READER);2175IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);2176dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);2177ASSERT(dn->dn_maxblkid >= blkid);21782179dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);2180list_link_init(&dr->dr_dirty_node);2181list_link_init(&dr->dr_dbuf_node);2182dr->dr_dnode = dn;2183dr->dr_txg = tx->tx_txg;2184dr->dt.dll.dr_blkid = blkid;2185dr->dr_accounted = dn->dn_datablksz;21862187/*2188* There should not be any dbuf for the block that we're dirtying.2189* Otherwise the buffer contents could be inconsistent between the2190* dbuf and the lightweight dirty record.2191*/2192ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,2193NULL));21942195mutex_enter(&dn->dn_mtx);2196int txgoff = tx->tx_txg & TXG_MASK;2197if (dn->dn_free_ranges[txgoff] != NULL) {2198zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);2199}22002201if (dn->dn_nlevels == 1) {2202ASSERT3U(blkid, <, dn->dn_nblkptr);2203list_insert_tail(&dn->dn_dirty_records[txgoff], dr);2204mutex_exit(&dn->dn_mtx);2205rw_exit(&dn->dn_struct_rwlock);2206dnode_setdirty(dn, tx);2207} else {2208mutex_exit(&dn->dn_mtx);22092210int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;2211dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,22121, blkid >> epbs, FTAG);2213rw_exit(&dn->dn_struct_rwlock);2214if (parent_db == NULL) {2215kmem_free(dr, sizeof (*dr));2216return (NULL);2217}2218int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |2219DMU_READ_NO_PREFETCH);2220if (err != 0) {2221dbuf_rele(parent_db, FTAG);2222kmem_free(dr, sizeof (*dr));2223return (NULL);2224}22252226dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);2227dbuf_rele(parent_db, FTAG);2228mutex_enter(&parent_dr->dt.di.dr_mtx);2229ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);2230list_insert_tail(&parent_dr->dt.di.dr_children, dr);2231mutex_exit(&parent_dr->dt.di.dr_mtx);2232dr->dr_parent = parent_dr;2233}22342235dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);22362237return (dr);2238}22392240dbuf_dirty_record_t *2241dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)2242{2243dnode_t *dn;2244objset_t *os;2245dbuf_dirty_record_t *dr, *dr_next, *dr_head;2246int txgoff = tx->tx_txg & TXG_MASK;2247boolean_t drop_struct_rwlock = B_FALSE;22482249ASSERT(tx->tx_txg != 0);2250ASSERT(!zfs_refcount_is_zero(&db->db_holds));2251DMU_TX_DIRTY_BUF(tx, db);22522253DB_DNODE_ENTER(db);2254dn = DB_DNODE(db);2255/*2256* Shouldn't dirty a regular buffer in syncing context. Private2257* objects may be dirtied in syncing context, but only if they2258* were already pre-dirtied in open context.2259*/2260#ifdef ZFS_DEBUG2261if (dn->dn_objset->os_dsl_dataset != NULL) {2262rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,2263RW_READER, FTAG);2264}2265ASSERT(!dmu_tx_is_syncing(tx) ||2266BP_IS_HOLE(dn->dn_objset->os_rootbp) ||2267DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||2268dn->dn_objset->os_dsl_dataset == NULL);2269if (dn->dn_objset->os_dsl_dataset != NULL)2270rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);2271#endif22722273mutex_enter(&db->db_mtx);2274/*2275* XXX make this true for indirects too? 
The problem is that2276* transactions created with dmu_tx_create_assigned() from2277* syncing context don't bother holding ahead.2278*/2279ASSERT(db->db_level != 0 ||2280db->db_state == DB_CACHED || db->db_state == DB_FILL ||2281db->db_state == DB_NOFILL);22822283if (db->db_blkid == DMU_SPILL_BLKID)2284dn->dn_have_spill = B_TRUE;22852286/*2287* If this buffer is already dirty, we're done.2288*/2289dr_head = list_head(&db->db_dirty_records);2290ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||2291db->db.db_object == DMU_META_DNODE_OBJECT);2292dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);2293if (dr_next && dr_next->dr_txg == tx->tx_txg) {2294DB_DNODE_EXIT(db);22952296dbuf_redirty(dr_next);2297mutex_exit(&db->db_mtx);2298return (dr_next);2299}23002301ASSERT3U(dn->dn_nlevels, >, db->db_level);23022303/*2304* We should only be dirtying in syncing context if it's the2305* mos or we're initializing the os or it's a special object.2306* However, we are allowed to dirty in syncing context provided2307* we already dirtied it in open context. Hence we must make2308* this assertion only if we're not already dirty.2309*/2310os = dn->dn_objset;2311VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));2312#ifdef ZFS_DEBUG2313if (dn->dn_objset->os_dsl_dataset != NULL)2314rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);2315ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||2316os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));2317if (dn->dn_objset->os_dsl_dataset != NULL)2318rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);2319#endif2320ASSERT(db->db.db_size != 0);23212322dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);23232324if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {2325dmu_objset_willuse_space(os, db->db.db_size, tx);2326}23272328/*2329* If this buffer is dirty in an old transaction group we need2330* to make a copy of it so that the changes we make in this2331* transaction group won't leak out when we sync the older txg.2332*/2333dr = kmem_cache_alloc(dbuf_dirty_kmem_cache, KM_SLEEP);2334memset(dr, 0, sizeof (*dr));2335list_link_init(&dr->dr_dirty_node);2336list_link_init(&dr->dr_dbuf_node);2337dr->dr_dnode = dn;2338if (db->db_level == 0) {2339void *data_old = db->db_buf;23402341if (db->db_state != DB_NOFILL) {2342if (db->db_blkid == DMU_BONUS_BLKID) {2343dbuf_fix_old_data(db, tx->tx_txg);2344data_old = db->db.db_data;2345} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {2346/*2347* Release the data buffer from the cache so2348* that we can modify it without impacting2349* possible other users of this cached data2350* block. Note that indirect blocks and2351* private objects are not released until the2352* syncing state (since they are only modified2353* then).2354*/2355arc_release(db->db_buf, db);2356dbuf_fix_old_data(db, tx->tx_txg);2357data_old = db->db_buf;2358}2359ASSERT(data_old != NULL);2360}2361dr->dt.dl.dr_data = data_old;2362} else {2363mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);2364list_create(&dr->dt.di.dr_children,2365sizeof (dbuf_dirty_record_t),2366offsetof(dbuf_dirty_record_t, dr_dirty_node));2367}2368if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {2369dr->dr_accounted = db->db.db_size;2370}2371dr->dr_dbuf = db;2372dr->dr_txg = tx->tx_txg;2373list_insert_before(&db->db_dirty_records, dr_next, dr);23742375/*2376* We could have been freed_in_flight between the dbuf_noread2377* and dbuf_dirty. 
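* (dbuf_free_range() sets db_freed_in_flight instead of freeing the
* buffer when it finds it in DB_READ or DB_FILL state.)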
We win, as though the dbuf_noread() had2378* happened after the free.2379*/2380if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&2381db->db_blkid != DMU_SPILL_BLKID) {2382mutex_enter(&dn->dn_mtx);2383if (dn->dn_free_ranges[txgoff] != NULL) {2384zfs_range_tree_clear(dn->dn_free_ranges[txgoff],2385db->db_blkid, 1);2386}2387mutex_exit(&dn->dn_mtx);2388db->db_freed_in_flight = FALSE;2389}23902391/*2392* This buffer is now part of this txg2393*/2394dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);2395db->db_dirtycnt += 1;2396ASSERT3U(db->db_dirtycnt, <=, 3);23972398mutex_exit(&db->db_mtx);23992400if (db->db_blkid == DMU_BONUS_BLKID ||2401db->db_blkid == DMU_SPILL_BLKID) {2402mutex_enter(&dn->dn_mtx);2403ASSERT(!list_link_active(&dr->dr_dirty_node));2404list_insert_tail(&dn->dn_dirty_records[txgoff], dr);2405mutex_exit(&dn->dn_mtx);2406dnode_setdirty(dn, tx);2407DB_DNODE_EXIT(db);2408return (dr);2409}24102411if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {2412rw_enter(&dn->dn_struct_rwlock, RW_READER);2413drop_struct_rwlock = B_TRUE;2414}24152416/*2417* If we are overwriting a dedup BP, then unless it is snapshotted,2418* when we get to syncing context we will need to decrement its2419* refcount in the DDT. Prefetch the relevant DDT block so that2420* syncing context won't have to wait for the i/o.2421*/2422if (db->db_blkptr != NULL) {2423db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);2424ddt_prefetch(os->os_spa, db->db_blkptr);2425dmu_buf_unlock_parent(db, dblt, FTAG);2426}24272428/*2429* We need to hold the dn_struct_rwlock to make this assertion,2430* because it protects dn_phys / dn_next_nlevels from changing.2431*/2432ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||2433dn->dn_phys->dn_nlevels > db->db_level ||2434dn->dn_next_nlevels[txgoff] > db->db_level ||2435dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||2436dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);243724382439if (db->db_level == 0) {2440ASSERT(!db->db_objset->os_raw_receive ||2441dn->dn_maxblkid >= db->db_blkid);2442dnode_new_blkid(dn, db->db_blkid, tx,2443drop_struct_rwlock, B_FALSE);2444ASSERT(dn->dn_maxblkid >= db->db_blkid);2445}24462447if (db->db_level+1 < dn->dn_nlevels) {2448dmu_buf_impl_t *parent = db->db_parent;2449dbuf_dirty_record_t *di;2450int parent_held = FALSE;24512452if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {2453int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;2454parent = dbuf_hold_level(dn, db->db_level + 1,2455db->db_blkid >> epbs, FTAG);2456ASSERT(parent != NULL);2457parent_held = TRUE;2458}2459if (drop_struct_rwlock)2460rw_exit(&dn->dn_struct_rwlock);2461ASSERT3U(db->db_level + 1, ==, parent->db_level);2462di = dbuf_dirty(parent, tx);2463if (parent_held)2464dbuf_rele(parent, FTAG);24652466mutex_enter(&db->db_mtx);2467/*2468* Since we've dropped the mutex, it's possible that2469* dbuf_undirty() might have changed this out from under us.2470*/2471if (list_head(&db->db_dirty_records) == dr ||2472dn->dn_object == DMU_META_DNODE_OBJECT) {2473mutex_enter(&di->dt.di.dr_mtx);2474ASSERT3U(di->dr_txg, ==, tx->tx_txg);2475ASSERT(!list_link_active(&dr->dr_dirty_node));2476list_insert_tail(&di->dt.di.dr_children, dr);2477mutex_exit(&di->dt.di.dr_mtx);2478dr->dr_parent = di;2479}2480mutex_exit(&db->db_mtx);2481} else {2482ASSERT(db->db_level + 1 == dn->dn_nlevels);2483ASSERT(db->db_blkid < dn->dn_nblkptr);2484ASSERT(db->db_parent == NULL || db->db_parent == 
dn->dn_dbuf);2485mutex_enter(&dn->dn_mtx);2486ASSERT(!list_link_active(&dr->dr_dirty_node));2487list_insert_tail(&dn->dn_dirty_records[txgoff], dr);2488mutex_exit(&dn->dn_mtx);2489if (drop_struct_rwlock)2490rw_exit(&dn->dn_struct_rwlock);2491}24922493dnode_setdirty(dn, tx);2494DB_DNODE_EXIT(db);2495return (dr);2496}24972498static void2499dbuf_undirty_bonus(dbuf_dirty_record_t *dr)2500{2501dmu_buf_impl_t *db = dr->dr_dbuf;25022503ASSERT(MUTEX_HELD(&db->db_mtx));2504if (dr->dt.dl.dr_data != db->db.db_data) {2505struct dnode *dn = dr->dr_dnode;2506int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);25072508kmem_free(dr->dt.dl.dr_data, max_bonuslen);2509arc_space_return(max_bonuslen, ARC_SPACE_BONUS);2510}2511db->db_data_pending = NULL;2512ASSERT(list_next(&db->db_dirty_records, dr) == NULL);2513list_remove(&db->db_dirty_records, dr);2514if (dr->dr_dbuf->db_level != 0) {2515mutex_destroy(&dr->dt.di.dr_mtx);2516list_destroy(&dr->dt.di.dr_children);2517}2518kmem_cache_free(dbuf_dirty_kmem_cache, dr);2519ASSERT3U(db->db_dirtycnt, >, 0);2520db->db_dirtycnt -= 1;2521}25222523/*2524* Undirty a buffer in the transaction group referenced by the given2525* transaction. Return whether this evicted the dbuf.2526*/2527boolean_t2528dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)2529{2530uint64_t txg = tx->tx_txg;2531boolean_t brtwrite;2532boolean_t diowrite;25332534ASSERT(txg != 0);25352536/*2537* Due to our use of dn_nlevels below, this can only be called2538* in open context, unless we are operating on the MOS or it's2539* a special object. From syncing context, dn_nlevels may be2540* different from the dn_nlevels used when dbuf was dirtied.2541*/2542ASSERT(db->db_objset ==2543dmu_objset_pool(db->db_objset)->dp_meta_objset ||2544DMU_OBJECT_IS_SPECIAL(db->db.db_object) ||2545txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));2546ASSERT(db->db_blkid != DMU_BONUS_BLKID);2547ASSERT0(db->db_level);2548ASSERT(MUTEX_HELD(&db->db_mtx));25492550/*2551* If this buffer is not dirty, we're done.2552*/2553dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);2554if (dr == NULL)2555return (B_FALSE);2556ASSERT(dr->dr_dbuf == db);25572558brtwrite = dr->dt.dl.dr_brtwrite;2559diowrite = dr->dt.dl.dr_diowrite;2560if (brtwrite) {2561ASSERT3B(diowrite, ==, B_FALSE);2562/*2563* We are freeing a block that we cloned in the same2564* transaction group.2565*/2566blkptr_t *bp = &dr->dt.dl.dr_overridden_by;2567if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {2568brt_pending_remove(dmu_objset_spa(db->db_objset),2569bp, tx);2570}2571}25722573dnode_t *dn = dr->dr_dnode;25742575dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);25762577ASSERT(db->db.db_size != 0);25782579dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),2580dr->dr_accounted, txg);25812582list_remove(&db->db_dirty_records, dr);25832584/*2585* Note that there are three places in dbuf_dirty()2586* where this dirty record may be put on a list.2587* Make sure to do a list_remove corresponding to2588* every one of those list_insert calls.2589*/2590if (dr->dr_parent) {2591mutex_enter(&dr->dr_parent->dt.di.dr_mtx);2592list_remove(&dr->dr_parent->dt.di.dr_children, dr);2593mutex_exit(&dr->dr_parent->dt.di.dr_mtx);2594} else if (db->db_blkid == DMU_SPILL_BLKID ||2595db->db_level + 1 == dn->dn_nlevels) {2596ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);2597mutex_enter(&dn->dn_mtx);2598list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);2599mutex_exit(&dn->dn_mtx);2600}26012602if (db->db_state != DB_NOFILL && !brtwrite) 
{2603dbuf_unoverride(dr);26042605if (dr->dt.dl.dr_data != db->db_buf) {2606ASSERT(db->db_buf != NULL);2607ASSERT(dr->dt.dl.dr_data != NULL);2608arc_buf_destroy(dr->dt.dl.dr_data, db);2609}2610}26112612kmem_cache_free(dbuf_dirty_kmem_cache, dr);26132614ASSERT(db->db_dirtycnt > 0);2615db->db_dirtycnt -= 1;26162617if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {2618ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||2619arc_released(db->db_buf));2620dbuf_destroy(db);2621return (B_TRUE);2622}26232624return (B_FALSE);2625}26262627void2628dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)2629{2630dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;2631boolean_t undirty = B_FALSE;26322633ASSERT(tx->tx_txg != 0);2634ASSERT(!zfs_refcount_is_zero(&db->db_holds));26352636/*2637* Quick check for dirtiness to improve performance for some workloads2638* (e.g. file deletion with indirect blocks cached).2639*/2640mutex_enter(&db->db_mtx);2641if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {2642/*2643* It's possible that the dbuf is already dirty but not cached,2644* because there are some calls to dbuf_dirty() that don't2645* go through dmu_buf_will_dirty().2646*/2647dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);2648if (dr != NULL) {2649if (db->db_level == 0 &&2650dr->dt.dl.dr_brtwrite) {2651/*2652* Block cloning: If we are dirtying a cloned2653* level 0 block, we cannot simply redirty it,2654* because this dr has no associated data.2655* We will go through a full undirtying below,2656* before dirtying it again.2657*/2658undirty = B_TRUE;2659} else {2660/* This dbuf is already dirty and cached. */2661dbuf_redirty(dr);2662mutex_exit(&db->db_mtx);2663return;2664}2665}2666}2667mutex_exit(&db->db_mtx);26682669DB_DNODE_ENTER(db);2670if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))2671flags |= DB_RF_HAVESTRUCT;2672DB_DNODE_EXIT(db);26732674/*2675* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we2676* want to make sure dbuf_read() will read the pending cloned block and2677* not the uderlying block that is being replaced. 
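* (The read path sees the pending clone because
* dmu_buf_get_bp_from_dbuf() returns the dirty record's
* dr_overridden_by BP for a NOFILL dbuf with a brtwrite dirty
* record.)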
dbuf_undirty() will2678* do brt_pending_remove() before removing the dirty record.2679*/2680(void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);2681if (undirty) {2682mutex_enter(&db->db_mtx);2683VERIFY(!dbuf_undirty(db, tx));2684mutex_exit(&db->db_mtx);2685}2686(void) dbuf_dirty(db, tx);2687}26882689void2690dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)2691{2692dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);2693}26942695void2696dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx)2697{2698dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;26992700ASSERT(tx->tx_txg != 0);2701ASSERT(!zfs_refcount_is_zero(&db->db_holds));27022703/*2704* If the dbuf is already dirty in this txg, it will be written2705* anyway, so there's nothing to do.2706*/2707mutex_enter(&db->db_mtx);2708if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {2709mutex_exit(&db->db_mtx);2710return;2711}2712mutex_exit(&db->db_mtx);27132714/*2715* The dbuf is not dirty, so we need to make it dirty and2716* mark it for rewrite (preserve logical birth time).2717*/2718dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);27192720mutex_enter(&db->db_mtx);2721dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);2722if (dr != NULL && db->db_level == 0)2723dr->dt.dl.dr_rewrite = B_TRUE;2724mutex_exit(&db->db_mtx);2725}27262727boolean_t2728dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)2729{2730dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;2731dbuf_dirty_record_t *dr;27322733mutex_enter(&db->db_mtx);2734dr = dbuf_find_dirty_eq(db, tx->tx_txg);2735mutex_exit(&db->db_mtx);2736return (dr != NULL);2737}27382739/*2740* Normally the db_blkptr points to the most recent on-disk content for the2741* dbuf (and anything newer will be cached in the dbuf). However, a pending2742* block clone or not yet synced Direct I/O write will have a dirty record BP2743* pointing to the most recent data.2744*/2745int2746dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)2747{2748ASSERT(MUTEX_HELD(&db->db_mtx));2749int error = 0;27502751if (db->db_level != 0) {2752*bp = db->db_blkptr;2753return (0);2754}27552756*bp = db->db_blkptr;2757dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);2758if (dr && db->db_state == DB_NOFILL) {2759/* Block clone */2760if (!dr->dt.dl.dr_brtwrite)2761error = EIO;2762else2763*bp = &dr->dt.dl.dr_overridden_by;2764} else if (dr && db->db_state == DB_UNCACHED) {2765/* Direct I/O write */2766if (dr->dt.dl.dr_diowrite)2767*bp = &dr->dt.dl.dr_overridden_by;2768}27692770return (error);2771}27722773/*2774* Direct I/O reads can read directly from the ARC, but the data has2775* to be untransformed in order to copy it over into user pages.2776*/2777int2778dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)2779{2780int err = 0;2781DB_DNODE_ENTER(db);2782dnode_t *dn = DB_DNODE(db);27832784ASSERT3S(db->db_state, ==, DB_CACHED);2785ASSERT(MUTEX_HELD(&db->db_mtx));27862787/*2788* Ensure that this block's dnode has been decrypted if2789* the caller has requested decrypted data.2790*/2791err = dbuf_read_verify_dnode_crypt(db, dn, 0);27922793/*2794* If the arc buf is compressed or encrypted and the caller2795* requested uncompressed data, we need to untransform it2796* before returning. 
We also call arc_untransform() on any2797* unauthenticated blocks, which will verify their MAC if2798* the key is now available.2799*/2800if (err == 0 && db->db_buf != NULL &&2801(arc_is_encrypted(db->db_buf) ||2802arc_is_unauthenticated(db->db_buf) ||2803arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {2804zbookmark_phys_t zb;28052806SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),2807db->db.db_object, db->db_level, db->db_blkid);2808dbuf_fix_old_data(db, spa_syncing_txg(spa));2809err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);2810dbuf_set_data(db, db->db_buf);2811}2812DB_DNODE_EXIT(db);2813DBUF_STAT_BUMP(hash_hits);28142815return (err);2816}28172818void2819dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)2820{2821/*2822* Block clones and Direct I/O writes always happen in open-context.2823*/2824dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;2825ASSERT0(db->db_level);2826ASSERT(!dmu_tx_is_syncing(tx));2827ASSERT0(db->db_level);2828ASSERT(db->db_blkid != DMU_BONUS_BLKID);2829ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);28302831mutex_enter(&db->db_mtx);2832DBUF_VERIFY(db);28332834/*2835* We are going to clone or issue a Direct I/O write on this block, so2836* undirty modifications done to this block so far in this txg. This2837* includes writes and clones into this block.2838*2839* If there dirty record associated with this txg from a previous Direct2840* I/O write then space accounting cleanup takes place. It is important2841* to go ahead free up the space accounting through dbuf_undirty() ->2842* dbuf_unoverride() -> zio_free(). Space accountiung for determining2843* if a write can occur in zfs_write() happens through dmu_tx_assign().2844* This can cause an issue with Direct I/O writes in the case of2845* overwriting the same block, because all DVA allocations are being2846* done in open-context. Constantly allowing Direct I/O overwrites to2847* the same block can exhaust the pools available space leading to2848* ENOSPC errors at the DVA allocation part of the ZIO pipeline, which2849* will eventually suspend the pool. 
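* (Concretely, dbuf_undirty() gives back the dr_accounted bytes via
* dsl_pool_undirty_space(), and for a prior Direct I/O write
* dbuf_unoverride() frees the already-written override BP with
* zio_free().)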
By cleaning up sapce acccounting2850* now, the ENOSPC error can be avoided.2851*2852* Since we are undirtying the record in open-context, we must have a2853* hold on the db, so it should never be evicted after calling2854* dbuf_undirty().2855*/2856VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);2857ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));28582859if (db->db_buf != NULL) {2860/*2861* If there is an associated ARC buffer with this dbuf we can2862* only destroy it if the previous dirty record does not2863* reference it.2864*/2865dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);2866if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)2867arc_buf_destroy(db->db_buf, db);28682869/*2870* Setting the dbuf's data pointers to NULL will force all2871* future reads down to the devices to get the most up to date2872* version of the data after a Direct I/O write has completed.2873*/2874db->db_buf = NULL;2875dbuf_clear_data(db);2876}28772878ASSERT0P(db->db_buf);2879ASSERT0P(db->db.db_data);28802881db->db_state = DB_NOFILL;2882DTRACE_SET_STATE(db,2883"allocating NOFILL buffer for clone or direct I/O write");28842885DBUF_VERIFY(db);2886mutex_exit(&db->db_mtx);28872888dbuf_noread(db, DMU_KEEP_CACHING);2889(void) dbuf_dirty(db, tx);2890}28912892void2893dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)2894{2895dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;28962897mutex_enter(&db->db_mtx);2898db->db_state = DB_NOFILL;2899DTRACE_SET_STATE(db, "allocating NOFILL buffer");2900mutex_exit(&db->db_mtx);29012902dbuf_noread(db, DMU_KEEP_CACHING);2903(void) dbuf_dirty(db, tx);2904}29052906void2907dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,2908dmu_flags_t flags)2909{2910dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;29112912ASSERT(db->db_blkid != DMU_BONUS_BLKID);2913ASSERT(tx->tx_txg != 0);2914ASSERT0(db->db_level);2915ASSERT(!zfs_refcount_is_zero(&db->db_holds));29162917ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||2918dmu_tx_private_ok(tx));29192920mutex_enter(&db->db_mtx);2921dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);2922if (db->db_state == DB_NOFILL ||2923(db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {2924/*2925* If the fill can fail we should have a way to return back to2926* the cloned or Direct I/O write data.2927*/2928if (canfail && dr) {2929mutex_exit(&db->db_mtx);2930dmu_buf_will_dirty_flags(db_fake, tx, flags);2931return;2932}2933/*2934* Block cloning: We will be completely overwriting a block2935* cloned in this transaction group, so let's undirty the2936* pending clone and mark the block as uncached. This will be2937* as if the clone was never done.2938*/2939if (db->db_state == DB_NOFILL) {2940VERIFY(!dbuf_undirty(db, tx));2941db->db_state = DB_UNCACHED;2942}2943}2944mutex_exit(&db->db_mtx);29452946dbuf_noread(db, flags);2947(void) dbuf_dirty(db, tx);2948}29492950void2951dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)2952{2953dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);2954}29552956/*2957* This function is effectively the same as dmu_buf_will_dirty(), but2958* indicates the caller expects raw encrypted data in the db, and provides2959* the crypt params (byteorder, salt, iv, mac) which should be stored in the2960* blkptr_t when this dbuf is written. 
This is only used for blocks of2961* dnodes, during raw receive.2962*/2963void2964dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,2965const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)2966{2967dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;2968dbuf_dirty_record_t *dr;29692970/*2971* dr_has_raw_params is only processed for blocks of dnodes2972* (see dbuf_sync_dnode_leaf_crypt()).2973*/2974ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);2975ASSERT0(db->db_level);2976ASSERT(db->db_objset->os_raw_receive);29772978dmu_buf_will_dirty_flags(db_fake, tx,2979DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);29802981dr = dbuf_find_dirty_eq(db, tx->tx_txg);29822983ASSERT3P(dr, !=, NULL);2984ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);29852986dr->dt.dl.dr_has_raw_params = B_TRUE;2987dr->dt.dl.dr_byteorder = byteorder;2988memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);2989memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);2990memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);2991}29922993static void2994dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)2995{2996struct dirty_leaf *dl;2997dbuf_dirty_record_t *dr;29982999ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);3000ASSERT0(db->db_level);30013002dr = list_head(&db->db_dirty_records);3003ASSERT3P(dr, !=, NULL);3004ASSERT3U(dr->dr_txg, ==, tx->tx_txg);3005dl = &dr->dt.dl;3006ASSERT0(dl->dr_has_raw_params);3007dl->dr_overridden_by = *bp;3008dl->dr_override_state = DR_OVERRIDDEN;3009BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);3010}30113012boolean_t3013dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)3014{3015(void) tx;3016dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;3017mutex_enter(&db->db_mtx);3018DBUF_VERIFY(db);30193020if (db->db_state == DB_FILL) {3021if (db->db_level == 0 && db->db_freed_in_flight) {3022ASSERT(db->db_blkid != DMU_BONUS_BLKID);3023/* we were freed while filling */3024/* XXX dbuf_undirty? 
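* For now the free wins: the freshly filled data is discarded and
* the block is presented as zero-filled, matching what
* dbuf_free_range() does to cached buffers in the freed range.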
*/3025memset(db->db.db_data, 0, db->db.db_size);3026db->db_freed_in_flight = FALSE;3027db->db_state = DB_CACHED;3028DTRACE_SET_STATE(db,3029"fill done handling freed in flight");3030failed = B_FALSE;3031} else if (failed) {3032VERIFY(!dbuf_undirty(db, tx));3033arc_buf_destroy(db->db_buf, db);3034db->db_buf = NULL;3035dbuf_clear_data(db);3036DTRACE_SET_STATE(db, "fill failed");3037} else {3038db->db_state = DB_CACHED;3039DTRACE_SET_STATE(db, "fill done");3040}3041cv_broadcast(&db->db_changed);3042} else {3043db->db_state = DB_CACHED;3044failed = B_FALSE;3045}3046mutex_exit(&db->db_mtx);3047return (failed);3048}30493050void3051dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,3052bp_embedded_type_t etype, enum zio_compress comp,3053int uncompressed_size, int compressed_size, int byteorder,3054dmu_tx_t *tx)3055{3056dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;3057struct dirty_leaf *dl;3058dmu_object_type_t type;3059dbuf_dirty_record_t *dr;30603061if (etype == BP_EMBEDDED_TYPE_DATA) {3062ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),3063SPA_FEATURE_EMBEDDED_DATA));3064}30653066DB_DNODE_ENTER(db);3067type = DB_DNODE(db)->dn_type;3068DB_DNODE_EXIT(db);30693070ASSERT0(db->db_level);3071ASSERT(db->db_blkid != DMU_BONUS_BLKID);30723073dmu_buf_will_not_fill(dbuf, tx);30743075dr = list_head(&db->db_dirty_records);3076ASSERT3P(dr, !=, NULL);3077ASSERT3U(dr->dr_txg, ==, tx->tx_txg);3078dl = &dr->dt.dl;3079ASSERT0(dl->dr_has_raw_params);3080encode_embedded_bp_compressed(&dl->dr_overridden_by,3081data, comp, uncompressed_size, compressed_size);3082BPE_SET_ETYPE(&dl->dr_overridden_by, etype);3083BP_SET_TYPE(&dl->dr_overridden_by, type);3084BP_SET_LEVEL(&dl->dr_overridden_by, 0);3085BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);30863087dl->dr_override_state = DR_OVERRIDDEN;3088BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);3089}30903091void3092dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)3093{3094dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;3095dmu_object_type_t type;3096ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,3097SPA_FEATURE_REDACTED_DATASETS));30983099DB_DNODE_ENTER(db);3100type = DB_DNODE(db)->dn_type;3101DB_DNODE_EXIT(db);31023103ASSERT0(db->db_level);3104dmu_buf_will_not_fill(dbuf, tx);31053106blkptr_t bp = { { { {0} } } };3107BP_SET_TYPE(&bp, type);3108BP_SET_LEVEL(&bp, 0);3109BP_SET_BIRTH(&bp, tx->tx_txg, 0);3110BP_SET_REDACTED(&bp);3111BPE_SET_LSIZE(&bp, dbuf->db_size);31123113dbuf_override_impl(db, &bp, tx);3114}31153116/*3117* Directly assign a provided arc buf to a given dbuf if it's not referenced3118* by anybody except our caller. 
Otherwise copy arcbuf's contents to dbuf.3119*/3120void3121dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,3122dmu_flags_t flags)3123{3124ASSERT(!zfs_refcount_is_zero(&db->db_holds));3125ASSERT(db->db_blkid != DMU_BONUS_BLKID);3126ASSERT0(db->db_level);3127ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));3128ASSERT(buf != NULL);3129ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);3130ASSERT(tx->tx_txg != 0);31313132arc_return_buf(buf, db);3133ASSERT(arc_released(buf));31343135mutex_enter(&db->db_mtx);3136if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))3137db->db_pending_evict = B_FALSE;3138db->db_partial_read = B_FALSE;31393140while (db->db_state == DB_READ || db->db_state == DB_FILL)3141cv_wait(&db->db_changed, &db->db_mtx);31423143ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||3144db->db_state == DB_NOFILL);31453146if (db->db_state == DB_CACHED &&3147zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {3148/*3149* In practice, we will never have a case where we have an3150* encrypted arc buffer while additional holds exist on the3151* dbuf. We don't handle this here so we simply assert that3152* fact instead.3153*/3154ASSERT(!arc_is_encrypted(buf));3155mutex_exit(&db->db_mtx);3156(void) dbuf_dirty(db, tx);3157memcpy(db->db.db_data, buf->b_data, db->db.db_size);3158arc_buf_destroy(buf, db);3159return;3160}31613162if (db->db_state == DB_CACHED) {3163dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);31643165ASSERT(db->db_buf != NULL);3166if (dr != NULL && dr->dr_txg == tx->tx_txg) {3167ASSERT(dr->dt.dl.dr_data == db->db_buf);31683169if (!arc_released(db->db_buf)) {3170ASSERT(dr->dt.dl.dr_override_state ==3171DR_OVERRIDDEN);3172arc_release(db->db_buf, db);3173}3174dr->dt.dl.dr_data = buf;3175arc_buf_destroy(db->db_buf, db);3176} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {3177arc_release(db->db_buf, db);3178arc_buf_destroy(db->db_buf, db);3179}3180db->db_buf = NULL;3181} else if (db->db_state == DB_NOFILL) {3182/*3183* We will be completely replacing the cloned block. In case3184* it was cloned in this transaction group, let's undirty the3185* pending clone and mark the block as uncached. 
This will be3186* as if the clone was never done.3187*/3188VERIFY(!dbuf_undirty(db, tx));3189db->db_state = DB_UNCACHED;3190}3191ASSERT0P(db->db_buf);3192dbuf_set_data(db, buf);3193db->db_state = DB_FILL;3194DTRACE_SET_STATE(db, "filling assigned arcbuf");3195mutex_exit(&db->db_mtx);3196(void) dbuf_dirty(db, tx);3197dmu_buf_fill_done(&db->db, tx, B_FALSE);3198}31993200void3201dbuf_destroy(dmu_buf_impl_t *db)3202{3203dnode_t *dn;3204dmu_buf_impl_t *parent = db->db_parent;3205dmu_buf_impl_t *dndb;32063207ASSERT(MUTEX_HELD(&db->db_mtx));3208ASSERT(zfs_refcount_is_zero(&db->db_holds));32093210if (db->db_buf != NULL) {3211arc_buf_destroy(db->db_buf, db);3212db->db_buf = NULL;3213}32143215if (db->db_blkid == DMU_BONUS_BLKID) {3216int slots = DB_DNODE(db)->dn_num_slots;3217int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);3218if (db->db.db_data != NULL) {3219kmem_free(db->db.db_data, bonuslen);3220arc_space_return(bonuslen, ARC_SPACE_BONUS);3221db->db_state = DB_UNCACHED;3222DTRACE_SET_STATE(db, "buffer cleared");3223}3224}32253226dbuf_clear_data(db);32273228if (multilist_link_active(&db->db_cache_link)) {3229ASSERT(db->db_caching_status == DB_DBUF_CACHE ||3230db->db_caching_status == DB_DBUF_METADATA_CACHE);32313232multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);32333234ASSERT0(dmu_buf_user_size(&db->db));3235(void) zfs_refcount_remove_many(3236&dbuf_caches[db->db_caching_status].size,3237db->db.db_size, db);32383239if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {3240DBUF_STAT_BUMPDOWN(metadata_cache_count);3241} else {3242DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);3243DBUF_STAT_BUMPDOWN(cache_count);3244DBUF_STAT_DECR(cache_levels_bytes[db->db_level],3245db->db.db_size);3246}3247db->db_caching_status = DB_NO_CACHE;3248}32493250ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);3251ASSERT0P(db->db_data_pending);3252ASSERT(list_is_empty(&db->db_dirty_records));32533254db->db_state = DB_EVICTING;3255DTRACE_SET_STATE(db, "buffer eviction started");3256db->db_blkptr = NULL;32573258/*3259* Now that db_state is DB_EVICTING, nobody else can find this via3260* the hash table. 
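* (The hash lookup in dbuf_find() ignores dbufs whose state is
* DB_EVICTING, so concurrent lookups already treat this dbuf as
* absent even though dbuf_hash_remove() has not run yet.)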
We can now drop db_mtx, which allows us to3261* acquire the dn_dbufs_mtx.3262*/3263mutex_exit(&db->db_mtx);32643265DB_DNODE_ENTER(db);3266dn = DB_DNODE(db);3267dndb = dn->dn_dbuf;3268if (db->db_blkid != DMU_BONUS_BLKID) {3269boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);3270if (needlock)3271mutex_enter_nested(&dn->dn_dbufs_mtx,3272NESTED_SINGLE);3273avl_remove(&dn->dn_dbufs, db);3274membar_producer();3275DB_DNODE_EXIT(db);3276if (needlock)3277mutex_exit(&dn->dn_dbufs_mtx);3278/*3279* Decrementing the dbuf count means that the hold corresponding3280* to the removed dbuf is no longer discounted in dnode_move(),3281* so the dnode cannot be moved until after we release the hold.3282* The membar_producer() ensures visibility of the decremented3283* value in dnode_move(), since DB_DNODE_EXIT doesn't actually3284* release any lock.3285*/3286mutex_enter(&dn->dn_mtx);3287dnode_rele_and_unlock(dn, db, B_TRUE);3288#ifdef USE_DNODE_HANDLE3289db->db_dnode_handle = NULL;3290#else3291db->db_dnode = NULL;3292#endif32933294dbuf_hash_remove(db);3295} else {3296DB_DNODE_EXIT(db);3297}32983299ASSERT(zfs_refcount_is_zero(&db->db_holds));33003301db->db_parent = NULL;33023303ASSERT0P(db->db_buf);3304ASSERT0P(db->db.db_data);3305ASSERT0P(db->db_hash_next);3306ASSERT0P(db->db_blkptr);3307ASSERT0P(db->db_data_pending);3308ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);3309ASSERT(!multilist_link_active(&db->db_cache_link));33103311/*3312* If this dbuf is referenced from an indirect dbuf,3313* decrement the ref count on the indirect dbuf.3314*/3315if (parent && parent != dndb) {3316mutex_enter(&parent->db_mtx);3317dbuf_rele_and_unlock(parent, db, B_TRUE);3318}33193320kmem_cache_free(dbuf_kmem_cache, db);3321arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);3322}33233324/*3325* Note: While bpp will always be updated if the function returns success,3326* parentp will not be updated if the dnode does not have dn_dbuf filled in;3327* this happens when the dnode is the meta-dnode, or {user|group|project}used3328* object.3329*/3330__attribute__((always_inline))3331static inline int3332dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,3333dmu_buf_impl_t **parentp, blkptr_t **bpp)3334{3335*parentp = NULL;3336*bpp = NULL;33373338ASSERT(blkid != DMU_BONUS_BLKID);33393340if (blkid == DMU_SPILL_BLKID) {3341mutex_enter(&dn->dn_mtx);3342if (dn->dn_have_spill &&3343(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))3344*bpp = DN_SPILL_BLKPTR(dn->dn_phys);3345else3346*bpp = NULL;3347dbuf_add_ref(dn->dn_dbuf, NULL);3348*parentp = dn->dn_dbuf;3349mutex_exit(&dn->dn_mtx);3350return (0);3351}33523353int nlevels =3354(dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;3355int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;33563357ASSERT3U(level * epbs, <, 64);3358ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));3359/*3360* This assertion shouldn't trip as long as the max indirect block size3361* is less than 1M. The reason for this is that up to that point,3362* the number of levels required to address an entire object with blocks3363* of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In3364* other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 553365* (i.e. we can address the entire object), objects will all use at most3366* N-1 levels and the assertion won't overflow. However, once epbs is3367* 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. 
Then, 4 levels will not be3368* enough to address an entire object, so objects will have 5 levels,3369* but then this assertion will overflow.3370*3371* All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we3372* need to redo this logic to handle overflows.3373*/3374ASSERT(level >= nlevels ||3375((nlevels - level - 1) * epbs) +3376highbit64(dn->dn_phys->dn_nblkptr) <= 64);3377if (level >= nlevels ||3378blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<3379((nlevels - level - 1) * epbs)) ||3380(fail_sparse &&3381blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {3382/* the buffer has no parent yet */3383return (SET_ERROR(ENOENT));3384} else if (level < nlevels-1) {3385/* this block is referenced from an indirect block */3386int err;33873388err = dbuf_hold_impl(dn, level + 1,3389blkid >> epbs, fail_sparse, FALSE, NULL, parentp);33903391if (err)3392return (err);3393err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |3394DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);3395if (err) {3396dbuf_rele(*parentp, NULL);3397*parentp = NULL;3398return (err);3399}3400*bpp = ((blkptr_t *)(*parentp)->db.db_data) +3401(blkid & ((1ULL << epbs) - 1));3402return (0);3403} else {3404/* the block is referenced from the dnode */3405ASSERT3U(level, ==, nlevels-1);3406ASSERT(dn->dn_phys->dn_nblkptr == 0 ||3407blkid < dn->dn_phys->dn_nblkptr);3408if (dn->dn_dbuf) {3409dbuf_add_ref(dn->dn_dbuf, NULL);3410*parentp = dn->dn_dbuf;3411}3412*bpp = &dn->dn_phys->dn_blkptr[blkid];3413return (0);3414}3415}34163417static dmu_buf_impl_t *3418dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,3419dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)3420{3421objset_t *os = dn->dn_objset;3422dmu_buf_impl_t *db, *odb;34233424ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));3425ASSERT(dn->dn_type != DMU_OT_NONE);34263427db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);34283429list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),3430offsetof(dbuf_dirty_record_t, dr_dbuf_node));34313432db->db_objset = os;3433db->db.db_object = dn->dn_object;3434db->db_level = level;3435db->db_blkid = blkid;3436db->db_dirtycnt = 0;3437#ifdef USE_DNODE_HANDLE3438db->db_dnode_handle = dn->dn_handle;3439#else3440db->db_dnode = dn;3441#endif3442db->db_parent = parent;3443db->db_blkptr = blkptr;3444db->db_hash = hash;34453446db->db_user = NULL;3447db->db_user_immediate_evict = FALSE;3448db->db_freed_in_flight = FALSE;3449db->db_pending_evict = TRUE;3450db->db_partial_read = FALSE;34513452if (blkid == DMU_BONUS_BLKID) {3453ASSERT3P(parent, ==, dn->dn_dbuf);3454db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -3455(dn->dn_nblkptr-1) * sizeof (blkptr_t);3456ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);3457db->db.db_offset = DMU_BONUS_BLKID;3458db->db_state = DB_UNCACHED;3459DTRACE_SET_STATE(db, "bonus buffer created");3460db->db_caching_status = DB_NO_CACHE;3461/* the bonus dbuf is not placed in the hash table */3462arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);3463return (db);3464} else if (blkid == DMU_SPILL_BLKID) {3465db->db.db_size = (blkptr != NULL) ?3466BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;3467db->db.db_offset = 0;3468} else {3469int blocksize =3470db->db_level ? 
1 << dn->dn_indblkshift : dn->dn_datablksz;3471db->db.db_size = blocksize;3472db->db.db_offset = db->db_blkid * blocksize;3473}34743475/*3476* Hold the dn_dbufs_mtx while we get the new dbuf3477* in the hash table *and* added to the dbufs list.3478* This prevents a possible deadlock with someone3479* trying to look up this dbuf before it's added to the3480* dn_dbufs list.3481*/3482mutex_enter(&dn->dn_dbufs_mtx);3483db->db_state = DB_EVICTING; /* not worth logging this state change */3484if ((odb = dbuf_hash_insert(db)) != NULL) {3485/* someone else inserted it first */3486mutex_exit(&dn->dn_dbufs_mtx);3487kmem_cache_free(dbuf_kmem_cache, db);3488DBUF_STAT_BUMP(hash_insert_race);3489return (odb);3490}3491avl_add(&dn->dn_dbufs, db);34923493db->db_state = DB_UNCACHED;3494DTRACE_SET_STATE(db, "regular buffer created");3495db->db_caching_status = DB_NO_CACHE;3496mutex_exit(&dn->dn_dbufs_mtx);3497arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);34983499if (parent && parent != dn->dn_dbuf)3500dbuf_add_ref(parent, db);35013502ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||3503zfs_refcount_count(&dn->dn_holds) > 0);3504(void) zfs_refcount_add(&dn->dn_holds, db);35053506dprintf_dbuf(db, "db=%p\n", db);35073508return (db);3509}35103511/*3512* This function returns a block pointer and information about the object,3513* given a dnode and a block. This is a publicly accessible version of3514* dbuf_findbp that only returns some information, rather than the3515* dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock3516* should be locked as (at least) a reader.3517*/3518int3519dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,3520blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)3521{3522dmu_buf_impl_t *dbp = NULL;3523blkptr_t *bp2;3524int err = 0;3525ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));35263527err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);3528if (err == 0) {3529ASSERT3P(bp2, !=, NULL);3530*bp = *bp2;3531if (dbp != NULL)3532dbuf_rele(dbp, NULL);3533if (datablkszsec != NULL)3534*datablkszsec = dn->dn_phys->dn_datablkszsec;3535if (indblkshift != NULL)3536*indblkshift = dn->dn_phys->dn_indblkshift;3537}35383539return (err);3540}35413542typedef struct dbuf_prefetch_arg {3543spa_t *dpa_spa; /* The spa to issue the prefetch in. */3544zbookmark_phys_t dpa_zb; /* The target block to prefetch. */3545int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */3546int dpa_curlevel; /* The current level that we're reading */3547dnode_t *dpa_dnode; /* The dnode associated with the prefetch */3548zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */3549zio_t *dpa_zio; /* The parent zio_t for all prefetches. */3550arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. 
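* ARC_FLAG_UNCACHED or ARC_FLAG_L2CACHE may also be
* OR-ed in by dbuf_prefetch_impl() based on the
* level's cacheability.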
*/3551dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */3552void *dpa_arg; /* prefetch completion arg */3553} dbuf_prefetch_arg_t;35543555static void3556dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)3557{3558if (dpa->dpa_cb != NULL) {3559dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,3560dpa->dpa_zb.zb_blkid, io_done);3561}3562kmem_free(dpa, sizeof (*dpa));3563}35643565static void3566dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,3567const blkptr_t *iobp, arc_buf_t *abuf, void *private)3568{3569(void) zio, (void) zb, (void) iobp;3570dbuf_prefetch_arg_t *dpa = private;35713572if (abuf != NULL)3573arc_buf_destroy(abuf, private);35743575dbuf_prefetch_fini(dpa, B_TRUE);3576}35773578/*3579* Actually issue the prefetch read for the block given.3580*/3581static void3582dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)3583{3584ASSERT(!BP_IS_HOLE(bp));3585ASSERT(!BP_IS_REDACTED(bp));3586if (BP_IS_EMBEDDED(bp))3587return (dbuf_prefetch_fini(dpa, B_FALSE));35883589int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;3590arc_flags_t aflags =3591dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |3592ARC_FLAG_NO_BUF;35933594/* dnodes are always read as raw and then converted later */3595if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&3596dpa->dpa_curlevel == 0)3597zio_flags |= ZIO_FLAG_RAW;35983599ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));3600ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);3601ASSERT(dpa->dpa_zio != NULL);3602(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,3603dbuf_issue_final_prefetch_done, dpa,3604dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);3605}36063607/*3608* Called when an indirect block above our prefetch target is read in. This3609* will either read in the next indirect block down the tree or issue the actual3610* prefetch if the next block down is our target.3611*/3612static void3613dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,3614const blkptr_t *iobp, arc_buf_t *abuf, void *private)3615{3616(void) zb, (void) iobp;3617dbuf_prefetch_arg_t *dpa = private;36183619ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);3620ASSERT3S(dpa->dpa_curlevel, >, 0);36213622if (abuf == NULL) {3623ASSERT(zio == NULL || zio->io_error != 0);3624dbuf_prefetch_fini(dpa, B_TRUE);3625return;3626}3627ASSERT(zio == NULL || zio->io_error == 0);36283629/*3630* The dpa_dnode is only valid if we are called with a NULL3631* zio. This indicates that the arc_read() returned without3632* first calling zio_read() to issue a physical read. Once3633* a physical read is made the dpa_dnode must be invalidated3634* as the locks guarding it may have been dropped. If the3635* dpa_dnode is still valid, then we want to add it to the dbuf3636* cache. 
To do so, we must hold the dbuf associated with the block3637* we just prefetched, read its contents so that we associate it3638* with an arc_buf_t, and then release it.3639*/3640if (zio != NULL) {3641ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);3642if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {3643ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);3644} else {3645ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);3646}3647ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);36483649dpa->dpa_dnode = NULL;3650} else if (dpa->dpa_dnode != NULL) {3651uint64_t curblkid = dpa->dpa_zb.zb_blkid >>3652(dpa->dpa_epbs * (dpa->dpa_curlevel -3653dpa->dpa_zb.zb_level));3654dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,3655dpa->dpa_curlevel, curblkid, FTAG);3656if (db == NULL) {3657arc_buf_destroy(abuf, private);3658dbuf_prefetch_fini(dpa, B_TRUE);3659return;3660}3661(void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |3662DMU_READ_NO_PREFETCH);3663dbuf_rele(db, FTAG);3664}36653666dpa->dpa_curlevel--;3667uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>3668(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));3669blkptr_t *bp = ((blkptr_t *)abuf->b_data) +3670P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);36713672ASSERT(!BP_IS_REDACTED(bp) || dpa->dpa_dnode == NULL ||3673dsl_dataset_feature_is_active(3674dpa->dpa_dnode->dn_objset->os_dsl_dataset,3675SPA_FEATURE_REDACTED_DATASETS));3676if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {3677arc_buf_destroy(abuf, private);3678dbuf_prefetch_fini(dpa, B_TRUE);3679return;3680} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {3681ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);3682dbuf_issue_final_prefetch(dpa, bp);3683} else {3684arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;3685zbookmark_phys_t zb;36863687/* flag if L2ARC eligible, l2arc_noprefetch then decides */3688if (dpa->dpa_dnode) {3689if (dnode_level_is_l2cacheable(bp, dpa->dpa_dnode,3690dpa->dpa_curlevel))3691iter_aflags |= ARC_FLAG_L2CACHE;3692} else {3693if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)3694iter_aflags |= ARC_FLAG_L2CACHE;3695}36963697ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));36983699SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,3700dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);37013702(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,3703bp, dbuf_prefetch_indirect_done, dpa,3704ZIO_PRIORITY_SYNC_READ,3705ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,3706&iter_aflags, &zb);3707}37083709arc_buf_destroy(abuf, private);3710}37113712/*3713* Issue prefetch reads for the given block on the given level. If the indirect3714* blocks above that block are not in memory, we will read them in3715* asynchronously. As a result, this call never blocks waiting for a read to3716* complete. 
Note that the prefetch might fail if the dataset is encrypted and3717* the encryption key is unmapped before the IO completes.3718*/3719int3720dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,3721zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,3722void *arg)3723{3724blkptr_t bp;3725int epbs, nlevels, curlevel;3726uint64_t curblkid;37273728ASSERT(blkid != DMU_BONUS_BLKID);3729ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));37303731if (blkid > dn->dn_maxblkid)3732goto no_issue;37333734if (level == 0 && dnode_block_freed(dn, blkid))3735goto no_issue;37363737/*3738* This dnode hasn't been written to disk yet, so there's nothing to3739* prefetch.3740*/3741nlevels = dn->dn_phys->dn_nlevels;3742if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)3743goto no_issue;37443745epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;3746if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))3747goto no_issue;37483749dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,3750level, blkid, NULL);3751if (db != NULL) {3752mutex_exit(&db->db_mtx);3753/*3754* This dbuf already exists. It is either CACHED, or3755* (we assume) about to be read or filled.3756*/3757goto no_issue;3758}37593760/*3761* Find the closest ancestor (indirect block) of the target block3762* that is present in the cache. In this indirect block, we will3763* find the bp that is at curlevel, curblkid.3764*/3765curlevel = level;3766curblkid = blkid;3767while (curlevel < nlevels - 1) {3768int parent_level = curlevel + 1;3769uint64_t parent_blkid = curblkid >> epbs;3770dmu_buf_impl_t *db;37713772if (dbuf_hold_impl(dn, parent_level, parent_blkid,3773FALSE, TRUE, FTAG, &db) == 0) {3774blkptr_t *bpp = db->db_buf->b_data;3775bp = bpp[P2PHASE(curblkid, 1 << epbs)];3776dbuf_rele(db, FTAG);3777break;3778}37793780curlevel = parent_level;3781curblkid = parent_blkid;3782}37833784if (curlevel == nlevels - 1) {3785/* No cached indirect blocks found. */3786ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);3787bp = dn->dn_phys->dn_blkptr[curblkid];3788}3789ASSERT(!BP_IS_REDACTED(&bp) ||3790dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,3791SPA_FEATURE_REDACTED_DATASETS));3792if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))3793goto no_issue;37943795ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));37963797zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,3798ZIO_FLAG_CANFAIL);37993800dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);3801dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;3802SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,3803dn->dn_object, level, blkid);3804dpa->dpa_curlevel = curlevel;3805dpa->dpa_prio = prio;3806dpa->dpa_aflags = aflags;3807dpa->dpa_spa = dn->dn_objset->os_spa;3808dpa->dpa_dnode = dn;3809dpa->dpa_epbs = epbs;3810dpa->dpa_zio = pio;3811dpa->dpa_cb = cb;3812dpa->dpa_arg = arg;38133814if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))3815dpa->dpa_aflags |= ARC_FLAG_UNCACHED;3816else if (dnode_level_is_l2cacheable(&bp, dn, level))3817dpa->dpa_aflags |= ARC_FLAG_L2CACHE;38183819/*3820* If we have the indirect just above us, no need to do the asynchronous3821* prefetch chain; we'll just run the last step ourselves. 
If we're at3822* a higher level, though, we want to issue the prefetches for all the3823* indirect blocks asynchronously, so we can go on with whatever we were3824* doing.3825*/3826if (curlevel == level) {3827ASSERT3U(curblkid, ==, blkid);3828dbuf_issue_final_prefetch(dpa, &bp);3829} else {3830arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;3831zbookmark_phys_t zb;38323833/* flag if L2ARC eligible, l2arc_noprefetch then decides */3834if (dnode_level_is_l2cacheable(&bp, dn, curlevel))3835iter_aflags |= ARC_FLAG_L2CACHE;38363837SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,3838dn->dn_object, curlevel, curblkid);3839(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,3840&bp, dbuf_prefetch_indirect_done, dpa,3841ZIO_PRIORITY_SYNC_READ,3842ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,3843&iter_aflags, &zb);3844}3845/*3846* We use pio here instead of dpa_zio since it's possible that3847* dpa may have already been freed.3848*/3849zio_nowait(pio);3850return (1);3851no_issue:3852if (cb != NULL)3853cb(arg, level, blkid, B_FALSE);3854return (0);3855}38563857int3858dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,3859arc_flags_t aflags)3860{38613862return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));3863}38643865/*3866* Helper function for dbuf_hold_impl() to copy a buffer. Handles3867* the case of encrypted, compressed and uncompressed buffers by3868* allocating the new buffer, respectively, with arc_alloc_raw_buf(),3869* arc_alloc_compressed_buf() or arc_alloc_buf().*3870*3871* NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().3872*/3873noinline static void3874dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)3875{3876dbuf_dirty_record_t *dr = db->db_data_pending;3877arc_buf_t *data = dr->dt.dl.dr_data;3878arc_buf_t *db_data;3879enum zio_compress compress_type = arc_get_compression(data);3880uint8_t complevel = arc_get_complevel(data);38813882if (arc_is_encrypted(data)) {3883boolean_t byteorder;3884uint8_t salt[ZIO_DATA_SALT_LEN];3885uint8_t iv[ZIO_DATA_IV_LEN];3886uint8_t mac[ZIO_DATA_MAC_LEN];38873888arc_get_raw_params(data, &byteorder, salt, iv, mac);3889db_data = arc_alloc_raw_buf(dn->dn_objset->os_spa, db,3890dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,3891dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),3892compress_type, complevel);3893} else if (compress_type != ZIO_COMPRESS_OFF) {3894db_data = arc_alloc_compressed_buf(3895dn->dn_objset->os_spa, db, arc_buf_size(data),3896arc_buf_lsize(data), compress_type, complevel);3897} else {3898db_data = arc_alloc_buf(dn->dn_objset->os_spa, db,3899DBUF_GET_BUFC_TYPE(db), db->db.db_size);3900}3901memcpy(db_data->b_data, data->b_data, arc_buf_size(data));39023903dbuf_set_data(db, db_data);3904}39053906/*3907* Returns with db_holds incremented, and db_mtx not held.3908* Note: dn_struct_rwlock must be held.3909*/3910int3911dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,3912boolean_t fail_sparse, boolean_t fail_uncached,3913const void *tag, dmu_buf_impl_t **dbp)3914{3915dmu_buf_impl_t *db, *parent = NULL;3916uint64_t hv;39173918/* If the pool has been created, verify the tx_sync_lock is not held */3919spa_t *spa = dn->dn_objset->os_spa;3920dsl_pool_t *dp = spa->spa_dsl_pool;3921if (dp != NULL) {3922ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));3923}39243925ASSERT(blkid != DMU_BONUS_BLKID);3926ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));3927if (!fail_sparse)3928ASSERT3U(dn->dn_nlevels, >, level);39293930*dbp = NULL;39313932/* dbuf_find() returns with db_mtx held */3933db = 
dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);39343935if (db == NULL) {3936blkptr_t *bp = NULL;3937int err;39383939if (fail_uncached)3940return (SET_ERROR(ENOENT));39413942ASSERT0P(parent);3943err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);3944if (fail_sparse) {3945if (err == 0 && bp && BP_IS_HOLE(bp))3946err = SET_ERROR(ENOENT);3947if (err) {3948if (parent)3949dbuf_rele(parent, NULL);3950return (err);3951}3952}3953if (err && err != ENOENT)3954return (err);3955db = dbuf_create(dn, level, blkid, parent, bp, hv);3956}39573958if (fail_uncached && db->db_state != DB_CACHED) {3959mutex_exit(&db->db_mtx);3960return (SET_ERROR(ENOENT));3961}39623963if (db->db_buf != NULL) {3964arc_buf_access(db->db_buf);3965ASSERT(MUTEX_HELD(&db->db_mtx));3966ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);3967}39683969ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));39703971/*3972* If this buffer is currently syncing out, and we are3973* still referencing it from db_data, we need to make a copy3974* of it in case we decide we want to dirty it again in this txg.3975*/3976if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&3977dn->dn_object != DMU_META_DNODE_OBJECT &&3978db->db_state == DB_CACHED && db->db_data_pending) {3979dbuf_dirty_record_t *dr = db->db_data_pending;3980if (dr->dt.dl.dr_data == db->db_buf) {3981ASSERT3P(db->db_buf, !=, NULL);3982dbuf_hold_copy(dn, db);3983}3984}39853986if (multilist_link_active(&db->db_cache_link)) {3987ASSERT(zfs_refcount_is_zero(&db->db_holds));3988ASSERT(db->db_caching_status == DB_DBUF_CACHE ||3989db->db_caching_status == DB_DBUF_METADATA_CACHE);39903991multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);39923993uint64_t size = db->db.db_size;3994uint64_t usize = dmu_buf_user_size(&db->db);3995(void) zfs_refcount_remove_many(3996&dbuf_caches[db->db_caching_status].size, size, db);3997(void) zfs_refcount_remove_many(3998&dbuf_caches[db->db_caching_status].size, usize,3999db->db_user);40004001if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {4002DBUF_STAT_BUMPDOWN(metadata_cache_count);4003} else {4004DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);4005DBUF_STAT_BUMPDOWN(cache_count);4006DBUF_STAT_DECR(cache_levels_bytes[db->db_level],4007size + usize);4008}4009db->db_caching_status = DB_NO_CACHE;4010}4011(void) zfs_refcount_add(&db->db_holds, tag);4012DBUF_VERIFY(db);4013mutex_exit(&db->db_mtx);40144015/* NOTE: we can't rele the parent until after we drop the db_mtx */4016if (parent)4017dbuf_rele(parent, NULL);40184019ASSERT3P(DB_DNODE(db), ==, dn);4020ASSERT3U(db->db_blkid, ==, blkid);4021ASSERT3U(db->db_level, ==, level);4022*dbp = db;40234024return (0);4025}40264027dmu_buf_impl_t *4028dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)4029{4030return (dbuf_hold_level(dn, 0, blkid, tag));4031}40324033dmu_buf_impl_t *4034dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)4035{4036dmu_buf_impl_t *db;4037int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);4038return (err ? 
NULL : db);4039}40404041void4042dbuf_create_bonus(dnode_t *dn)4043{4044ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));40454046ASSERT0P(dn->dn_bonus);4047dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,4048dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));4049dn->dn_bonus->db_pending_evict = FALSE;4050}40514052int4053dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)4054{4055dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;40564057if (db->db_blkid != DMU_SPILL_BLKID)4058return (SET_ERROR(ENOTSUP));4059if (blksz == 0)4060blksz = SPA_MINBLOCKSIZE;4061ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));4062blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);40634064dbuf_new_size(db, blksz, tx);40654066return (0);4067}40684069void4070dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)4071{4072dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);4073}40744075#pragma weak dmu_buf_add_ref = dbuf_add_ref4076void4077dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)4078{4079int64_t holds = zfs_refcount_add(&db->db_holds, tag);4080VERIFY3S(holds, >, 1);4081}40824083#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref4084boolean_t4085dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,4086const void *tag)4087{4088dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;4089dmu_buf_impl_t *found_db;4090boolean_t result = B_FALSE;40914092if (blkid == DMU_BONUS_BLKID)4093found_db = dbuf_find_bonus(os, obj);4094else4095found_db = dbuf_find(os, obj, 0, blkid, NULL);40964097if (found_db != NULL) {4098if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {4099(void) zfs_refcount_add(&db->db_holds, tag);4100result = B_TRUE;4101}4102mutex_exit(&found_db->db_mtx);4103}4104return (result);4105}41064107/*4108* If you call dbuf_rele() you had better not be referencing the dnode handle4109* unless you have some other direct or indirect hold on the dnode. (An indirect4110* hold is a hold on one of the dnode's dbufs, including the bonus buffer.)4111* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the4112* dnode's parent dbuf evicting its dnode handles.4113*/4114void4115dbuf_rele(dmu_buf_impl_t *db, const void *tag)4116{4117mutex_enter(&db->db_mtx);4118dbuf_rele_and_unlock(db, tag, B_FALSE);4119}41204121void4122dmu_buf_rele(dmu_buf_t *db, const void *tag)4123{4124dbuf_rele((dmu_buf_impl_t *)db, tag);4125}41264127/*4128* dbuf_rele() for an already-locked dbuf. This is necessary to allow4129* db_dirtycnt and db_holds to be updated atomically. The 'evicting'4130* argument should be set if we are already in the dbuf-evicting code4131* path, in which case we don't want to recursively evict. 
This allows us to4132* avoid deeply nested stacks that would have a call flow similar to this:4133*4134* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()4135* ^ |4136* | |4137* +-----dbuf_destroy()<--dbuf_evict_one()<--------+4138*4139*/4140void4141dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)4142{4143int64_t holds;4144uint64_t size;41454146ASSERT(MUTEX_HELD(&db->db_mtx));4147DBUF_VERIFY(db);41484149/*4150* Remove the reference to the dbuf before removing its hold on the4151* dnode so we can guarantee in dnode_move() that a referenced bonus4152* buffer has a corresponding dnode hold.4153*/4154holds = zfs_refcount_remove(&db->db_holds, tag);4155ASSERT(holds >= 0);41564157/*4158* We can't freeze indirects if there is a possibility that they4159* may be modified in the current syncing context.4160*/4161if (db->db_buf != NULL &&4162holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {4163arc_buf_freeze(db->db_buf);4164}41654166if (holds == db->db_dirtycnt &&4167db->db_level == 0 && db->db_user_immediate_evict)4168dbuf_evict_user(db);41694170if (holds == 0) {4171if (db->db_blkid == DMU_BONUS_BLKID) {4172dnode_t *dn;4173boolean_t evict_dbuf = db->db_pending_evict;41744175/*4176* If the dnode moves here, we cannot cross this4177* barrier until the move completes.4178*/4179DB_DNODE_ENTER(db);41804181dn = DB_DNODE(db);4182atomic_dec_32(&dn->dn_dbufs_count);41834184/*4185* Decrementing the dbuf count means that the bonus4186* buffer's dnode hold is no longer discounted in4187* dnode_move(). The dnode cannot move until after4188* the dnode_rele() below.4189*/4190DB_DNODE_EXIT(db);41914192/*4193* Do not reference db after its lock is dropped.4194* Another thread may evict it.4195*/4196mutex_exit(&db->db_mtx);41974198if (evict_dbuf)4199dnode_evict_bonus(dn);42004201dnode_rele(dn, db);4202} else if (db->db_buf == NULL) {4203/*4204* This is a special case: we never associated this4205* dbuf with any data allocated from the ARC.4206*/4207ASSERT(db->db_state == DB_UNCACHED ||4208db->db_state == DB_NOFILL);4209dbuf_destroy(db);4210} else if (arc_released(db->db_buf)) {4211/*4212* This dbuf has anonymous data associated with it.4213*/4214dbuf_destroy(db);4215} else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {4216/*4217* We don't expect more accesses to the dbuf, and it4218* is either not cacheable or was marked for eviction.4219*/4220dbuf_destroy(db);4221} else if (!multilist_link_active(&db->db_cache_link)) {4222ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);42234224dbuf_cached_state_t dcs =4225dbuf_include_in_metadata_cache(db) ?4226DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;4227db->db_caching_status = dcs;42284229multilist_insert(&dbuf_caches[dcs].cache, db);4230uint64_t db_size = db->db.db_size;4231uint64_t dbu_size = dmu_buf_user_size(&db->db);4232(void) zfs_refcount_add_many(4233&dbuf_caches[dcs].size, db_size, db);4234size = zfs_refcount_add_many(4235&dbuf_caches[dcs].size, dbu_size, db->db_user);4236uint8_t db_level = db->db_level;4237mutex_exit(&db->db_mtx);42384239if (dcs == DB_DBUF_METADATA_CACHE) {4240DBUF_STAT_BUMP(metadata_cache_count);4241DBUF_STAT_MAX(metadata_cache_size_bytes_max,4242size);4243} else {4244DBUF_STAT_BUMP(cache_count);4245DBUF_STAT_MAX(cache_size_bytes_max, size);4246DBUF_STAT_BUMP(cache_levels[db_level]);4247DBUF_STAT_INCR(cache_levels_bytes[db_level],4248db_size + dbu_size);4249}42504251if (dcs == DB_DBUF_CACHE && !evicting)4252dbuf_evict_notify(size);4253}4254} else {4255mutex_exit(&db->db_mtx);4256}4257}42584259#pragma weak 
dmu_buf_refcount = dbuf_refcount4260uint64_t4261dbuf_refcount(dmu_buf_impl_t *db)4262{4263return (zfs_refcount_count(&db->db_holds));4264}42654266uint64_t4267dmu_buf_user_refcount(dmu_buf_t *db_fake)4268{4269uint64_t holds;4270dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;42714272mutex_enter(&db->db_mtx);4273ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);4274holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;4275mutex_exit(&db->db_mtx);42764277return (holds);4278}42794280void *4281dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,4282dmu_buf_user_t *new_user)4283{4284dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;42854286mutex_enter(&db->db_mtx);4287dbuf_verify_user(db, DBVU_NOT_EVICTING);4288if (db->db_user == old_user)4289db->db_user = new_user;4290else4291old_user = db->db_user;4292dbuf_verify_user(db, DBVU_NOT_EVICTING);4293mutex_exit(&db->db_mtx);42944295return (old_user);4296}42974298void *4299dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)4300{4301return (dmu_buf_replace_user(db_fake, NULL, user));4302}43034304void *4305dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)4306{4307dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;43084309db->db_user_immediate_evict = TRUE;4310return (dmu_buf_set_user(db_fake, user));4311}43124313void *4314dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)4315{4316return (dmu_buf_replace_user(db_fake, user, NULL));4317}43184319void *4320dmu_buf_get_user(dmu_buf_t *db_fake)4321{4322dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;43234324dbuf_verify_user(db, DBVU_NOT_EVICTING);4325return (db->db_user);4326}43274328uint64_t4329dmu_buf_user_size(dmu_buf_t *db_fake)4330{4331dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;4332if (db->db_user == NULL)4333return (0);4334return (atomic_load_64(&db->db_user->dbu_size));4335}43364337void4338dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)4339{4340dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;4341ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);4342ASSERT3P(db->db_user, !=, NULL);4343ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);4344atomic_add_64(&db->db_user->dbu_size, nadd);4345}43464347void4348dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)4349{4350dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;4351ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);4352ASSERT3P(db->db_user, !=, NULL);4353ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);4354atomic_sub_64(&db->db_user->dbu_size, nsub);4355}43564357void4358dmu_buf_user_evict_wait(void)4359{4360taskq_wait(dbu_evict_taskq);4361}43624363blkptr_t *4364dmu_buf_get_blkptr(dmu_buf_t *db)4365{4366dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;4367return (dbi->db_blkptr);4368}43694370objset_t *4371dmu_buf_get_objset(dmu_buf_t *db)4372{4373dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;4374return (dbi->db_objset);4375}43764377static void4378dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)4379{4380/* ASSERT(dmu_tx_is_syncing(tx) */4381ASSERT(MUTEX_HELD(&db->db_mtx));43824383if (db->db_blkptr != NULL)4384return;43854386if (db->db_blkid == DMU_SPILL_BLKID) {4387db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);4388BP_ZERO(db->db_blkptr);4389return;4390}4391if (db->db_level == dn->dn_phys->dn_nlevels-1) {4392/*4393* This buffer was allocated at a time when there was4394* no available blkptrs from the dnode, or it was4395* inappropriate to hook it in (i.e., nlevels mismatch).4396*/4397ASSERT(db->db_blkid < 
    dn->dn_phys->dn_nblkptr);
		ASSERT0P(db->db_parent);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			parent = dbuf_hold_level(dn, db->db_level + 1,
			    db->db_blkid >> epbs, db);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

static void
dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	void *data = dr->dt.dl.dr_data;

	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_blkid == DMU_BONUS_BLKID);
	ASSERT(data != NULL);

	dnode_t *dn = dr->dr_dnode;
	ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
	    DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
	memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));

	dbuf_sync_leaf_verify_bonus_dnode(dr);

	dbuf_undirty_bonus(dr);
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
}

/*
 * When syncing out a block of dnodes, adjust the block to deal with
 * encryption. Normally, we make sure the block is decrypted before writing
 * it. If we have crypt params, then we are writing a raw (encrypted) block
 * from a raw receive. In this case, set the ARC buf's crypt params so
 * that the BP will be filled with the correct byteorder, salt, iv, and mac.
 */
static void
dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
{
	int err;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
	ASSERT0(db->db_level);

	if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
		zbookmark_phys_t zb;

		/*
		 * Unfortunately, there is currently no mechanism for
		 * syncing context to handle decryption errors. An error
		 * here is only possible if an attacker maliciously
		 * changed a dnode block and updated the associated
		 * checksums going up the block tree.
		 */
		SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
		    db->db.db_object, db->db_level, db->db_blkid);
		err = arc_untransform(db->db_buf, db->db_objset->os_spa,
		    &zb, B_TRUE);
		if (err)
			panic("Invalid dnode block MAC");
	} else if (dr->dt.dl.dr_has_raw_params) {
		(void) arc_release(dr->dt.dl.dr_data, db);
		arc_convert_to_raw(dr->dt.dl.dr_data,
		    dmu_objset_id(db->db_objset),
		    dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
		    dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
	}
}

/*
 * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
 * is critical that we not allow the compiler to inline this function into
 * dbuf_sync_list(), which would drastically bloat the stack usage.
 */
noinline static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = dr->dr_dnode;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, db->db_buf, tx);

	zio_t *zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

/*
 * Verify that the size of the data in our bonus buffer does not exceed
 * its recorded size.
 *
 * The purpose of this verification is to catch any cases in development
 * where the size of a phys structure (i.e. space_map_phys_t) grows and,
 * due to incorrect feature management, older pools expect to read more
 * data even though they didn't actually write it to begin with.
 *
 * For example, this would catch an error in the feature logic where we
 * open an older pool and we expect to write the space map histogram of
 * a space map with size SPACE_MAP_SIZE_V0.
 */
static void
dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
{
#ifdef ZFS_DEBUG
	dnode_t *dn = dr->dr_dnode;

	/*
	 * Encrypted bonus buffers can have data past their bonuslen.
	 * Skip the verification of these blocks.
	 */
	if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
		return;

	uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
	uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
	ASSERT3U(bonuslen, <=, maxbonuslen);

	arc_buf_t *datap = dr->dt.dl.dr_data;
	char *datap_end = ((char *)datap) + bonuslen;
	char *datap_max = ((char *)datap) + maxbonuslen;

	/* ensure that everything is zero after our data */
	for (; datap_end < datap_max; datap_end++)
		ASSERT0(*datap_end);
#endif
}

static blkptr_t *
dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
{
	/* This must be a lightweight dirty record. 
*/4578ASSERT0P(dr->dr_dbuf);4579dnode_t *dn = dr->dr_dnode;45804581if (dn->dn_phys->dn_nlevels == 1) {4582VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);4583return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);4584} else {4585dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;4586int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;4587VERIFY3U(parent_db->db_level, ==, 1);4588VERIFY3P(DB_DNODE(parent_db), ==, dn);4589VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);4590blkptr_t *bp = parent_db->db.db_data;4591return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);4592}4593}45944595static void4596dbuf_lightweight_ready(zio_t *zio)4597{4598dbuf_dirty_record_t *dr = zio->io_private;4599blkptr_t *bp = zio->io_bp;46004601if (zio->io_error != 0)4602return;46034604dnode_t *dn = dr->dr_dnode;46054606blkptr_t *bp_orig = dbuf_lightweight_bp(dr);4607spa_t *spa = dmu_objset_spa(dn->dn_objset);4608int64_t delta = bp_get_dsize_sync(spa, bp) -4609bp_get_dsize_sync(spa, bp_orig);4610dnode_diduse_space(dn, delta);46114612uint64_t blkid = dr->dt.dll.dr_blkid;4613mutex_enter(&dn->dn_mtx);4614if (blkid > dn->dn_phys->dn_maxblkid) {4615ASSERT0(dn->dn_objset->os_raw_receive);4616dn->dn_phys->dn_maxblkid = blkid;4617}4618mutex_exit(&dn->dn_mtx);46194620if (!BP_IS_EMBEDDED(bp)) {4621uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;4622BP_SET_FILL(bp, fill);4623}46244625dmu_buf_impl_t *parent_db;4626EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);4627if (dr->dr_parent == NULL) {4628parent_db = dn->dn_dbuf;4629} else {4630parent_db = dr->dr_parent->dr_dbuf;4631}4632rw_enter(&parent_db->db_rwlock, RW_WRITER);4633*bp_orig = *bp;4634rw_exit(&parent_db->db_rwlock);4635}46364637static void4638dbuf_lightweight_done(zio_t *zio)4639{4640dbuf_dirty_record_t *dr = zio->io_private;46414642VERIFY0(zio->io_error);46434644objset_t *os = dr->dr_dnode->dn_objset;4645dmu_tx_t *tx = os->os_synctx;46464647if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {4648ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));4649} else {4650dsl_dataset_t *ds = os->os_dsl_dataset;4651(void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);4652dsl_dataset_block_born(ds, zio->io_bp, tx);4653}46544655dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,4656zio->io_txg);46574658abd_free(dr->dt.dll.dr_abd);4659kmem_free(dr, sizeof (*dr));4660}46614662noinline static void4663dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)4664{4665dnode_t *dn = dr->dr_dnode;4666zio_t *pio;4667if (dn->dn_phys->dn_nlevels == 1) {4668pio = dn->dn_zio;4669} else {4670pio = dr->dr_parent->dr_zio;4671}46724673zbookmark_phys_t zb = {4674.zb_objset = dmu_objset_id(dn->dn_objset),4675.zb_object = dn->dn_object,4676.zb_level = 0,4677.zb_blkid = dr->dt.dll.dr_blkid,4678};46794680/*4681* See comment in dbuf_write(). 
This is so that zio->io_bp_orig
	 * will have the old BP in dbuf_lightweight_done().
	 */
	dr->dr_bp_copy = *dbuf_lightweight_bp(dr);

	dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
	    dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
	    dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
	    &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
	    dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
	    ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);

	zio_nowait(dr->dr_zio);
}

/*
 * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
 * critical that we not allow the compiler to inline this function into
 * dbuf_sync_list(), which would drastically bloat the stack usage.
 */
noinline static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = dr->dr_dnode;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we might have been freed
	 * after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT0P(db->db.db_data);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else if (db->db_state == DB_READ) {
		/*
		 * This buffer was either cloned or had a Direct I/O write
		 * occur and has an in-flight read on the BP. It is safe to
		 * issue the write here, because the read has already been
		 * issued and the contents won't change.
		 *
		 * We can verify the case of both the clone and Direct I/O
		 * write by making sure the first dirty record for the dbuf
		 * has no ARC buffer associated with it.
		 */
		dbuf_dirty_record_t *dr_head =
		    list_head(&db->db_dirty_records);
		ASSERT0P(db->db_buf);
		ASSERT0P(db->db.db_data);
		ASSERT0P(dr_head->dt.dl.dr_data);
		ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
			/*
			 * In the previous transaction group, the bonus buffer
			 * was entirely used to store the attributes for the
			 * dnode which overrode the dn_spill field. However,
			 * when adding more attributes to the file a spill
			 * block was required to hold the extra attributes.
			 *
			 * Make sure to clear the garbage left in the dn_spill
			 * field from the previous attributes in the bonus
			 * buffer. Otherwise, after writing out the spill
			 * block to the newly allocated dva, it will free
			 * the old block pointed to by the invalid dn_spill.
			 */
			db->db_blkptr = NULL;
		}
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. 
It will be written out when the dnode is synced (and it4772* will be synced, since it must have been dirty for dbuf_sync to4773* be called).4774*/4775if (db->db_blkid == DMU_BONUS_BLKID) {4776ASSERT(dr->dr_dbuf == db);4777dbuf_sync_bonus(dr, tx);4778return;4779}47804781os = dn->dn_objset;47824783/*4784* This function may have dropped the db_mtx lock allowing a dmu_sync4785* operation to sneak in. As a result, we need to ensure that we4786* don't check the dr_override_state until we have returned from4787* dbuf_check_blkptr.4788*/4789dbuf_check_blkptr(dn, db);47904791/*4792* If this buffer is in the middle of an immediate write, wait for the4793* synchronous IO to complete.4794*4795* This is also valid even with Direct I/O writes setting a dirty4796* records override state into DR_IN_DMU_SYNC, because all4797* Direct I/O writes happen in open-context.4798*/4799while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {4800ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);4801cv_wait(&db->db_changed, &db->db_mtx);4802}48034804/*4805* If this is a dnode block, ensure it is appropriately encrypted4806* or decrypted, depending on what we are writing to it this txg.4807*/4808if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)4809dbuf_prepare_encrypted_dnode_leaf(dr);48104811if (*datap != NULL && *datap == db->db_buf &&4812dn->dn_object != DMU_META_DNODE_OBJECT &&4813zfs_refcount_count(&db->db_holds) > 1) {4814/*4815* If this buffer is currently "in use" (i.e., there4816* are active holds and db_data still references it),4817* then make a copy before we start the write so that4818* any modifications from the open txg will not leak4819* into this write.4820*4821* NOTE: this copy does not need to be made for4822* objects only modified in the syncing context (e.g.4823* DNONE_DNODE blocks).4824*/4825int psize = arc_buf_size(*datap);4826int lsize = arc_buf_lsize(*datap);4827arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);4828enum zio_compress compress_type = arc_get_compression(*datap);4829uint8_t complevel = arc_get_complevel(*datap);48304831if (arc_is_encrypted(*datap)) {4832boolean_t byteorder;4833uint8_t salt[ZIO_DATA_SALT_LEN];4834uint8_t iv[ZIO_DATA_IV_LEN];4835uint8_t mac[ZIO_DATA_MAC_LEN];48364837arc_get_raw_params(*datap, &byteorder, salt, iv, mac);4838*datap = arc_alloc_raw_buf(os->os_spa, db,4839dmu_objset_id(os), byteorder, salt, iv, mac,4840dn->dn_type, psize, lsize, compress_type,4841complevel);4842} else if (compress_type != ZIO_COMPRESS_OFF) {4843ASSERT3U(type, ==, ARC_BUFC_DATA);4844*datap = arc_alloc_compressed_buf(os->os_spa, db,4845psize, lsize, compress_type, complevel);4846} else {4847*datap = arc_alloc_buf(os->os_spa, db, type, psize);4848}4849memcpy((*datap)->b_data, db->db.db_data, psize);4850}4851db->db_data_pending = dr;48524853mutex_exit(&db->db_mtx);48544855dbuf_write(dr, *datap, tx);48564857ASSERT(!list_link_active(&dr->dr_dirty_node));4858if (dn->dn_object == DMU_META_DNODE_OBJECT) {4859list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);4860} else {4861zio_nowait(dr->dr_zio);4862}4863}48644865/*4866* Syncs out a range of dirty records for indirect or leaf dbufs. 
May be4867* called recursively from dbuf_sync_indirect().4868*/4869void4870dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)4871{4872dbuf_dirty_record_t *dr;48734874while ((dr = list_head(list))) {4875if (dr->dr_zio != NULL) {4876/*4877* If we find an already initialized zio then we4878* are processing the meta-dnode, and we have finished.4879* The dbufs for all dnodes are put back on the list4880* during processing, so that we can zio_wait()4881* these IOs after initiating all child IOs.4882*/4883ASSERT3U(dr->dr_dbuf->db.db_object, ==,4884DMU_META_DNODE_OBJECT);4885break;4886}4887list_remove(list, dr);4888if (dr->dr_dbuf == NULL) {4889dbuf_sync_lightweight(dr, tx);4890} else {4891if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&4892dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {4893VERIFY3U(dr->dr_dbuf->db_level, ==, level);4894}4895if (dr->dr_dbuf->db_level > 0)4896dbuf_sync_indirect(dr, tx);4897else4898dbuf_sync_leaf(dr, tx);4899}4900}4901}49024903static void4904dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)4905{4906(void) buf;4907dmu_buf_impl_t *db = vdb;4908dnode_t *dn;4909blkptr_t *bp = zio->io_bp;4910blkptr_t *bp_orig = &zio->io_bp_orig;4911spa_t *spa = zio->io_spa;4912int64_t delta;4913uint64_t fill = 0;4914int i;49154916ASSERT3P(db->db_blkptr, !=, NULL);4917ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);49184919DB_DNODE_ENTER(db);4920dn = DB_DNODE(db);4921delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);4922dnode_diduse_space(dn, delta - zio->io_prev_space_delta);4923zio->io_prev_space_delta = delta;49244925if (BP_GET_BIRTH(bp) != 0) {4926ASSERT((db->db_blkid != DMU_SPILL_BLKID &&4927BP_GET_TYPE(bp) == dn->dn_type) ||4928(db->db_blkid == DMU_SPILL_BLKID &&4929BP_GET_TYPE(bp) == dn->dn_bonustype) ||4930BP_IS_EMBEDDED(bp));4931ASSERT(BP_GET_LEVEL(bp) == db->db_level);4932}49334934mutex_enter(&db->db_mtx);49354936#ifdef ZFS_DEBUG4937if (db->db_blkid == DMU_SPILL_BLKID) {4938ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);4939ASSERT(!(BP_IS_HOLE(bp)) &&4940db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));4941}4942#endif49434944if (db->db_level == 0) {4945mutex_enter(&dn->dn_mtx);4946if (db->db_blkid > dn->dn_phys->dn_maxblkid &&4947db->db_blkid != DMU_SPILL_BLKID) {4948ASSERT0(db->db_objset->os_raw_receive);4949dn->dn_phys->dn_maxblkid = db->db_blkid;4950}4951mutex_exit(&dn->dn_mtx);49524953if (dn->dn_type == DMU_OT_DNODE) {4954i = 0;4955while (i < db->db.db_size) {4956dnode_phys_t *dnp =4957(void *)(((char *)db->db.db_data) + i);49584959i += DNODE_MIN_SIZE;4960if (dnp->dn_type != DMU_OT_NONE) {4961fill++;4962for (int j = 0; j < dnp->dn_nblkptr;4963j++) {4964(void) zfs_blkptr_verify(spa,4965&dnp->dn_blkptr[j],4966BLK_CONFIG_SKIP,4967BLK_VERIFY_HALT);4968}4969if (dnp->dn_flags &4970DNODE_FLAG_SPILL_BLKPTR) {4971(void) zfs_blkptr_verify(spa,4972DN_SPILL_BLKPTR(dnp),4973BLK_CONFIG_SKIP,4974BLK_VERIFY_HALT);4975}4976i += dnp->dn_extra_slots *4977DNODE_MIN_SIZE;4978}4979}4980} else {4981if (BP_IS_HOLE(bp)) {4982fill = 0;4983} else {4984fill = 1;4985}4986}4987} else {4988blkptr_t *ibp = db->db.db_data;4989ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);4990for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {4991if (BP_IS_HOLE(ibp))4992continue;4993(void) zfs_blkptr_verify(spa, ibp,4994BLK_CONFIG_SKIP, BLK_VERIFY_HALT);4995fill += BP_GET_FILL(ibp);4996}4997}4998DB_DNODE_EXIT(db);49995000if (!BP_IS_EMBEDDED(bp))5001BP_SET_FILL(bp, fill);50025003mutex_exit(&db->db_mtx);50045005db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, 
FTAG);5006*db->db_blkptr = *bp;5007dmu_buf_unlock_parent(db, dblt, FTAG);5008}50095010/*5011* This function gets called just prior to running through the compression5012* stage of the zio pipeline. If we're an indirect block comprised of only5013* holes, then we want this indirect to be compressed away to a hole. In5014* order to do that we must zero out any information about the holes that5015* this indirect points to prior to before we try to compress it.5016*/5017static void5018dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)5019{5020(void) zio, (void) buf;5021dmu_buf_impl_t *db = vdb;5022blkptr_t *bp;5023unsigned int epbs, i;50245025ASSERT3U(db->db_level, >, 0);5026DB_DNODE_ENTER(db);5027epbs = DB_DNODE(db)->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;5028DB_DNODE_EXIT(db);5029ASSERT3U(epbs, <, 31);50305031/* Determine if all our children are holes */5032for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {5033if (!BP_IS_HOLE(bp))5034break;5035}50365037/*5038* If all the children are holes, then zero them all out so that5039* we may get compressed away.5040*/5041if (i == 1ULL << epbs) {5042/*5043* We only found holes. Grab the rwlock to prevent5044* anybody from reading the blocks we're about to5045* zero out.5046*/5047rw_enter(&db->db_rwlock, RW_WRITER);5048memset(db->db.db_data, 0, db->db.db_size);5049rw_exit(&db->db_rwlock);5050}5051}50525053static void5054dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)5055{5056(void) buf;5057dmu_buf_impl_t *db = vdb;5058blkptr_t *bp_orig = &zio->io_bp_orig;5059blkptr_t *bp = db->db_blkptr;5060objset_t *os = db->db_objset;5061dmu_tx_t *tx = os->os_synctx;50625063ASSERT0(zio->io_error);5064ASSERT(db->db_blkptr == bp);50655066/*5067* For nopwrites and rewrites we ensure that the bp matches our5068* original and bypass all the accounting.5069*/5070if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {5071ASSERT(BP_EQUAL(bp, bp_orig));5072} else {5073dsl_dataset_t *ds = os->os_dsl_dataset;5074(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);5075dsl_dataset_block_born(ds, bp, tx);5076}50775078mutex_enter(&db->db_mtx);50795080DBUF_VERIFY(db);50815082dbuf_dirty_record_t *dr = db->db_data_pending;5083dnode_t *dn = dr->dr_dnode;5084ASSERT(!list_link_active(&dr->dr_dirty_node));5085ASSERT(dr->dr_dbuf == db);5086ASSERT(list_next(&db->db_dirty_records, dr) == NULL);5087list_remove(&db->db_dirty_records, dr);50885089#ifdef ZFS_DEBUG5090if (db->db_blkid == DMU_SPILL_BLKID) {5091ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);5092ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&5093db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));5094}5095#endif50965097if (db->db_level == 0) {5098ASSERT(db->db_blkid != DMU_BONUS_BLKID);5099ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);51005101/* no dr_data if this is a NO_FILL or Direct I/O */5102if (dr->dt.dl.dr_data != NULL &&5103dr->dt.dl.dr_data != db->db_buf) {5104ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);5105ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);5106arc_buf_destroy(dr->dt.dl.dr_data, db);5107}5108} else {5109ASSERT(list_head(&dr->dt.di.dr_children) == NULL);5110ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);5111if (!BP_IS_HOLE(db->db_blkptr)) {5112int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -5113SPA_BLKPTRSHIFT;5114ASSERT3U(db->db_blkid, <=,5115dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));5116ASSERT3U(BP_GET_LSIZE(db->db_blkptr), 
==,5117db->db.db_size);5118}5119mutex_destroy(&dr->dt.di.dr_mtx);5120list_destroy(&dr->dt.di.dr_children);5121}51225123cv_broadcast(&db->db_changed);5124ASSERT(db->db_dirtycnt > 0);5125db->db_dirtycnt -= 1;5126db->db_data_pending = NULL;5127dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);51285129dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,5130zio->io_txg);51315132kmem_cache_free(dbuf_dirty_kmem_cache, dr);5133}51345135static void5136dbuf_write_nofill_ready(zio_t *zio)5137{5138dbuf_write_ready(zio, NULL, zio->io_private);5139}51405141static void5142dbuf_write_nofill_done(zio_t *zio)5143{5144dbuf_write_done(zio, NULL, zio->io_private);5145}51465147static void5148dbuf_write_override_ready(zio_t *zio)5149{5150dbuf_dirty_record_t *dr = zio->io_private;5151dmu_buf_impl_t *db = dr->dr_dbuf;51525153dbuf_write_ready(zio, NULL, db);5154}51555156static void5157dbuf_write_override_done(zio_t *zio)5158{5159dbuf_dirty_record_t *dr = zio->io_private;5160dmu_buf_impl_t *db = dr->dr_dbuf;5161blkptr_t *obp = &dr->dt.dl.dr_overridden_by;51625163mutex_enter(&db->db_mtx);5164if (!BP_EQUAL(zio->io_bp, obp)) {5165if (!BP_IS_HOLE(obp))5166dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);5167arc_release(dr->dt.dl.dr_data, db);5168}5169mutex_exit(&db->db_mtx);51705171dbuf_write_done(zio, NULL, db);51725173if (zio->io_abd != NULL)5174abd_free(zio->io_abd);5175}51765177typedef struct dbuf_remap_impl_callback_arg {5178objset_t *drica_os;5179uint64_t drica_blk_birth;5180dmu_tx_t *drica_tx;5181} dbuf_remap_impl_callback_arg_t;51825183static void5184dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,5185void *arg)5186{5187dbuf_remap_impl_callback_arg_t *drica = arg;5188objset_t *os = drica->drica_os;5189spa_t *spa = dmu_objset_spa(os);5190dmu_tx_t *tx = drica->drica_tx;51915192ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));51935194if (os == spa_meta_objset(spa)) {5195spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);5196} else {5197dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,5198size, drica->drica_blk_birth, tx);5199}5200}52015202static void5203dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)5204{5205blkptr_t bp_copy = *bp;5206spa_t *spa = dmu_objset_spa(dn->dn_objset);5207dbuf_remap_impl_callback_arg_t drica;52085209ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));52105211drica.drica_os = dn->dn_objset;5212drica.drica_blk_birth = BP_GET_BIRTH(bp);5213drica.drica_tx = tx;5214if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,5215&drica)) {5216/*5217* If the blkptr being remapped is tracked by a livelist,5218* then we need to make sure the livelist reflects the update.5219* First, cancel out the old blkptr by appending a 'FREE'5220* entry. Next, add an 'ALLOC' to track the new version. This5221* way we avoid trying to free an inaccurate blkptr at delete.5222* Note that embedded blkptrs are not tracked in livelists.5223*/5224if (dn->dn_objset != spa_meta_objset(spa)) {5225dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);5226if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&5227BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {5228ASSERT(!BP_IS_EMBEDDED(bp));5229ASSERT(dsl_dir_is_clone(ds->ds_dir));5230ASSERT(spa_feature_is_enabled(spa,5231SPA_FEATURE_LIVELIST));5232bplist_append(&ds->ds_dir->dd_pending_frees,5233bp);5234bplist_append(&ds->ds_dir->dd_pending_allocs,5235&bp_copy);5236}5237}52385239/*5240* The db_rwlock prevents dbuf_read_impl() from5241* dereferencing the BP while we are changing it. 
To5242* avoid lock contention, only grab it when we are actually5243* changing the BP.5244*/5245if (rw != NULL)5246rw_enter(rw, RW_WRITER);5247*bp = bp_copy;5248if (rw != NULL)5249rw_exit(rw);5250}5251}52525253/*5254* Remap any existing BP's to concrete vdevs, if possible.5255*/5256static void5257dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)5258{5259spa_t *spa = dmu_objset_spa(db->db_objset);5260ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));52615262if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))5263return;52645265if (db->db_level > 0) {5266blkptr_t *bp = db->db.db_data;5267for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {5268dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);5269}5270} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {5271dnode_phys_t *dnp = db->db.db_data;5272ASSERT3U(dn->dn_type, ==, DMU_OT_DNODE);5273for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;5274i += dnp[i].dn_extra_slots + 1) {5275for (int j = 0; j < dnp[i].dn_nblkptr; j++) {5276krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :5277&dn->dn_dbuf->db_rwlock);5278dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,5279tx);5280}5281}5282}5283}528452855286/*5287* Populate dr->dr_zio with a zio to commit a dirty buffer to disk.5288* Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).5289*/5290static void5291dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)5292{5293dmu_buf_impl_t *db = dr->dr_dbuf;5294dnode_t *dn = dr->dr_dnode;5295objset_t *os;5296dmu_buf_impl_t *parent = db->db_parent;5297uint64_t txg = tx->tx_txg;5298zbookmark_phys_t zb;5299zio_prop_t zp;5300zio_t *pio; /* parent I/O */5301int wp_flag = 0;53025303ASSERT(dmu_tx_is_syncing(tx));53045305os = dn->dn_objset;53065307if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {5308/*5309* Private object buffers are released here rather than in5310* dbuf_dirty() since they are only modified in the syncing5311* context and we don't want the overhead of making multiple5312* copies of the data.5313*/5314if (BP_IS_HOLE(db->db_blkptr))5315arc_buf_thaw(data);5316else5317dbuf_release_bp(db);5318dbuf_remap(dn, db, tx);5319}53205321if (parent != dn->dn_dbuf) {5322/* Our parent is an indirect block. */5323/* We have a dirty parent that has been scheduled for write. */5324ASSERT(parent && parent->db_data_pending);5325/* Our parent's buffer is one level closer to the dnode. */5326ASSERT(db->db_level == parent->db_level-1);5327/*5328* We're about to modify our parent's db_data by modifying5329* our block pointer, so the parent must be released.5330*/5331ASSERT(arc_released(parent->db_buf));5332pio = parent->db_data_pending->dr_zio;5333} else {5334/* Our parent is the dnode itself. */5335ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&5336db->db_blkid != DMU_SPILL_BLKID) ||5337(db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));5338if (db->db_blkid != DMU_SPILL_BLKID)5339ASSERT3P(db->db_blkptr, ==,5340&dn->dn_phys->dn_blkptr[db->db_blkid]);5341pio = dn->dn_zio;5342}53435344ASSERT(db->db_level == 0 || data == db->db_buf);5345ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg);5346ASSERT(pio);53475348SET_BOOKMARK(&zb, os->os_dsl_dataset ?5349os->os_dsl_dataset->ds_object : DMU_META_OBJSET,5350db->db.db_object, db->db_level, db->db_blkid);53515352if (db->db_blkid == DMU_SPILL_BLKID)5353wp_flag = WP_SPILL;5354wp_flag |= (data == NULL) ? 
WP_NOFILL : 0;53555356dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);53575358/*5359* Set rewrite properties for zfs_rewrite() operations.5360*/5361if (db->db_level == 0 && dr->dt.dl.dr_rewrite) {5362zp.zp_rewrite = B_TRUE;53635364/*5365* Mark physical rewrite feature for activation.5366* This will be activated automatically during dataset sync.5367*/5368dsl_dataset_t *ds = os->os_dsl_dataset;5369if (!dsl_dataset_feature_is_active(ds,5370SPA_FEATURE_PHYSICAL_REWRITE)) {5371ds->ds_feature_activation[5372SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE;5373}5374}53755376/*5377* We copy the blkptr now (rather than when we instantiate the dirty5378* record), because its value can change between open context and5379* syncing context. We do not need to hold dn_struct_rwlock to read5380* db_blkptr because we are in syncing context.5381*/5382dr->dr_bp_copy = *db->db_blkptr;53835384if (db->db_level == 0 &&5385dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {5386/*5387* The BP for this block has been provided by open context5388* (by dmu_sync(), dmu_write_direct(),5389* or dmu_buf_write_embedded()).5390*/5391abd_t *contents = (data != NULL) ?5392abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;53935394dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,5395contents, db->db.db_size, db->db.db_size, &zp,5396dbuf_write_override_ready, NULL,5397dbuf_write_override_done,5398dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);5399mutex_enter(&db->db_mtx);5400dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;5401zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,5402dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,5403dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);5404mutex_exit(&db->db_mtx);5405} else if (data == NULL) {5406ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||5407zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);5408dr->dr_zio = zio_write(pio, os->os_spa, txg,5409&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,5410dbuf_write_nofill_ready, NULL,5411dbuf_write_nofill_done, db,5412ZIO_PRIORITY_ASYNC_WRITE,5413ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);5414} else {5415ASSERT(arc_released(data));54165417/*5418* For indirect blocks, we want to setup the children5419* ready callback so that we can properly handle an indirect5420* block that only contains holes.5421*/5422arc_write_done_func_t *children_ready_cb = NULL;5423if (db->db_level != 0)5424children_ready_cb = dbuf_write_children_ready;54255426dr->dr_zio = arc_write(pio, os->os_spa, txg,5427&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),5428dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,5429children_ready_cb, dbuf_write_done, db,5430ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, 
	    &zb);
	}
}

EXPORT_SYMBOL(dbuf_find);
EXPORT_SYMBOL(dbuf_is_metadata);
EXPORT_SYMBOL(dbuf_destroy);
EXPORT_SYMBOL(dbuf_whichblock);
EXPORT_SYMBOL(dbuf_read);
EXPORT_SYMBOL(dbuf_unoverride);
EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_rewrite);
EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
EXPORT_SYMBOL(dmu_buf_rele);
EXPORT_SYMBOL(dbuf_assign_arcbuf);
EXPORT_SYMBOL(dbuf_prefetch);
EXPORT_SYMBOL(dbuf_hold_impl);
EXPORT_SYMBOL(dbuf_hold);
EXPORT_SYMBOL(dbuf_hold_level);
EXPORT_SYMBOL(dbuf_create_bonus);
EXPORT_SYMBOL(dbuf_spill_set_blksz);
EXPORT_SYMBOL(dbuf_rm_spill);
EXPORT_SYMBOL(dbuf_add_ref);
EXPORT_SYMBOL(dbuf_rele);
EXPORT_SYMBOL(dbuf_rele_and_unlock);
EXPORT_SYMBOL(dbuf_refcount);
EXPORT_SYMBOL(dbuf_sync_list);
EXPORT_SYMBOL(dmu_buf_set_user);
EXPORT_SYMBOL(dmu_buf_set_user_ie);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_get_blkptr);

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
	"Maximum size in bytes of the dbuf cache.");

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
	"Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
	"Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
	"Maximum size in bytes of dbuf metadata cache.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
	"Set size of dbuf cache to log2 fraction of arc size.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
	"Set size of dbuf metadata cache to log2 fraction of arc size.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
	"Set size of dbuf cache mutex array as log2 shift.");