Path: blob/main/sys/contrib/openzfs/module/zfs/brt.c
107264 views
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/brt.h>
#include <sys/brt_impl.h>
#include <sys/ddt.h>
#include <sys/bitmap.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * Block Cloning design.
 *
 * Block Cloning allows manually cloning a file (or a subset of its blocks)
 * into another (or the same) file by just creating additional references to
 * the data blocks without copying the data itself. Those references are kept
 * in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use a
 *   dedicated system call(s) to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once.
 *   Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned or the second to last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall, so a hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data, nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. Block Cloning must be consulted on every
 *   free, because we cannot modify the source BP (e.g. by setting something
 *   similar to the D bit), thus we have no hint if the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of BRT
 *   lookups which is described below in the "Minimizing free penalty" section.
 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
 *   64bit offset and 64bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
 *   other on disk are most likely in totally different parts of the DDT.
 *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
 *   from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced multiple
 *   times in the DDT. Unfortunately it is not currently (if at all) possible
 *   with Block Cloning and a block referenced multiple times will be scrubbed
 *   multiple times.
 *   The new, sorted scrub should be able to eliminate
 *   duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as there
 * is a small number of top-level VDEVs and a large number of blocks stored in
 * each VDEV. We take advantage of that to reduce the BRT entry size further by
 * maintaining one BRT for each top-level VDEV, so we can then have only offset
 * and counter as the BRT entry.
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing block.
 * When we free a block there is no hint in the block pointer whether the block
 * was cloned or not, so on each free we have to check if there is a
 * corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing BRT lookup on every free can potentially be
 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
 * This is the main problem with deduplication, so we've learned our lesson and
 * try not to repeat the same mistake here. How do we do that? We divide each
 * top-level VDEV into 16MB regions. For each region we maintain a counter that
 * is a sum of all the BRT entries that have offsets within the region. This
 * creates the entries count array of 16bit numbers for each top-level VDEV.
 * The entries count array is always kept in memory and updated on disk in the
 * same transaction group as the BRT updates to keep everything in-sync. We can
 * keep the array in memory, because it is very small.
 * With 16MB regions and
 * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
 * the region size even further in the future). Now, when we want to free
 * a block, we first consult the array. If the counter for the whole region is
 * zero, there is no need to look for the BRT entry, as there isn't one for
 * sure. If the counter for the region is greater than zero, only then we will
 * do a BRT lookup and if an entry is found we will decrease the reference
 * counter in the BRT entry and in the entry counters array.
 *
 * The entry counters array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We then divide the array into 32kB blocks and
 * keep a bitmap of dirty blocks within a transaction group. When we sync the
 * transaction group we can only update the parts of the entry counters array
 * that were modified. Note: Keeping track of the dirty parts of the entry
 * counters array is implemented, but updating only parts of the array on disk
 * is not yet implemented - for now we will update the entire array if there
 * was any change.
 *
 * The implementation tries to be economic: if BRT is not used, or no longer
 * used, there will be no entries in the MOS and no additional memory used (e.g.
 * the entry counters array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both DDT and BRT. When we free one of the
 * references we couldn't tell where it belongs, so we would have to decide
 * what table takes the precedence: do we first clear DDT references or BRT
 * references?
 * To avoid this dilemma BRT cooperates with DDT - if a given block
 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
 * look up the DDT entry instead and increase the counter there. No BRT entry
 * will be created for a block which has the D (dedup) bit set.
 * BRT may be more efficient for manual deduplication, but if the block is
 * already in the DDT, then creating an additional BRT entry would be less
 * efficient. This clever idea was proposed by Allan Jude.
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different datasets.
 * One use case is recovering files from snapshots. By cloning the files into
 * the dataset we need no additional storage. Without Block Cloning we would
 * need additional space for those files.
 * Another interesting use case is moving the files between datasets
 * (copying the file content to the new dataset and removing the source file).
 * In that case Block Cloning will only be used briefly, because the BRT entries
 * will be removed when the source is removed.
 * Block Cloning across encrypted datasets is supported as long as both
 * datasets share the same master key (e.g. snapshots and clones).
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
 * blocks.
 * As of this writing no interface is implemented that allows for block
 * cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will
 * use it for block cloning.
 *
 *	ssize_t
 *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *	                size_t len, unsigned int flags);
 *
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return an error so the upper layer can
 * fall back to the generic mechanism that will just copy the data.
 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
 * This function was implemented based on zfs_write(), but instead of writing
 * the given data we first read block pointers using the new dmu_read_l0_bps()
 * function from the source file. Once we have BPs from the source file we call
 * the dmu_brt_clone() function on the destination file. This function
 * allocates BPs for us. We iterate over all source BPs. If the given BP is
 * a hole or an embedded block, we just copy BP as-is. If it points to a real
 * data we place this BP on a BRT pending list using the brt_pending_add()
 * function.
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group that we are trying to clone. Such block has no BP
 *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
 * - The block we want to clone may have been modified within the same
 *   transaction group.
 *   We return EAGAIN.
 * - A block may be cloned multiple times during one transaction group (that's
 *   why pending list is actually a tree and not an append-only list - this
 *   way we can figure out faster if this block is cloned for the first time
 *   in this txg or consecutive time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file descriptor
 *   open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where we
 * call the zio_brt_free() function. We then call the brt_entry_decref()
 * that loads the corresponding BRT entry (if one exists) and decreases
 * reference counter. If this is not the last reference we will stop ZIO
 * pipeline here. If this is the last reference or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
 *
 * At the beginning of spa_sync() where there can be no more block cloning,
 * but before issuing frees we call brt_pending_apply(). This function applies
 * all the new clones to the BRT table - we load BRT entries and update
 * reference counters. To sync new BRT entries to disk, we use brt_sync()
 * function. This function will sync all dirty per-top-level-vdev BRTs,
 * the entry counters arrays, etc.
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction.
The chunk size is determined by226* how many BPs we can fit into a single ZIL entry.227* Replaying clone operation is different from the regular clone operation,228* as when we log clone operations we cannot use the source object - it may229* reside on a different dataset, so we log BPs we want to clone.230* The ZIL is replayed when we mount the given dataset, not when the pool is231* imported. Taking this into account it is possible that the pool is imported232* without mounting datasets and the source dataset is destroyed before the233* destination dataset is mounted and its ZIL replayed.234* To address this situation we leverage zil_claim() mechanism where ZFS will235* parse all the ZILs on pool import. When we come across TX_CLONE_RANGE236* entries, we will bump reference counters for their BPs in the BRT. Then237* on mount and ZIL replay we bump the reference counters once more, while the238* first references are dropped during ZIL destroy by zil_free_clone_range().239* It is possible that after zil_claim() we never mount the destination, so240* we never replay its ZIL and just destroy it. In this case the only taken241* references will be dropped by zil_free_clone_range(), since the cloning is242* not going to ever take place.243*/244245static kmem_cache_t *brt_entry_cache;246247/*248* Enable/disable prefetching of BRT entries that we are going to modify.249*/250static int brt_zap_prefetch = 1;251252#ifdef ZFS_DEBUG253#define BRT_DEBUG(...) do { \254if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \255__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \256} \257} while (0)258#else259#define BRT_DEBUG(...) 
do { } while (0)260#endif261262static int brt_zap_default_bs = 13;263static int brt_zap_default_ibs = 13;264265static kstat_t *brt_ksp;266267typedef struct brt_stats {268kstat_named_t brt_addref_entry_not_on_disk;269kstat_named_t brt_addref_entry_on_disk;270kstat_named_t brt_decref_entry_in_memory;271kstat_named_t brt_decref_entry_loaded_from_disk;272kstat_named_t brt_decref_entry_not_in_memory;273kstat_named_t brt_decref_entry_read_lost_race;274kstat_named_t brt_decref_entry_still_referenced;275kstat_named_t brt_decref_free_data_later;276kstat_named_t brt_decref_free_data_now;277kstat_named_t brt_decref_no_entry;278} brt_stats_t;279280static brt_stats_t brt_stats = {281{ "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },282{ "addref_entry_on_disk", KSTAT_DATA_UINT64 },283{ "decref_entry_in_memory", KSTAT_DATA_UINT64 },284{ "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },285{ "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },286{ "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },287{ "decref_entry_still_referenced", KSTAT_DATA_UINT64 },288{ "decref_free_data_later", KSTAT_DATA_UINT64 },289{ "decref_free_data_now", KSTAT_DATA_UINT64 },290{ "decref_no_entry", KSTAT_DATA_UINT64 }291};292293struct {294wmsum_t brt_addref_entry_not_on_disk;295wmsum_t brt_addref_entry_on_disk;296wmsum_t brt_decref_entry_in_memory;297wmsum_t brt_decref_entry_loaded_from_disk;298wmsum_t brt_decref_entry_not_in_memory;299wmsum_t brt_decref_entry_read_lost_race;300wmsum_t brt_decref_entry_still_referenced;301wmsum_t brt_decref_free_data_later;302wmsum_t brt_decref_free_data_now;303wmsum_t brt_decref_no_entry;304} brt_sums;305306#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)307308static int brt_entry_compare(const void *x1, const void *x2);309static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);310311static void312brt_rlock(spa_t *spa)313{314rw_enter(&spa->spa_brt_lock, RW_READER);315}316317static void318brt_wlock(spa_t *spa)319{320rw_enter(&spa->spa_brt_lock, 
RW_WRITER);321}322323static void324brt_unlock(spa_t *spa)325{326rw_exit(&spa->spa_brt_lock);327}328329static uint16_t330brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)331{332333ASSERT3U(idx, <, brtvd->bv_size);334335if (unlikely(brtvd->bv_need_byteswap)) {336return (BSWAP_16(brtvd->bv_entcount[idx]));337} else {338return (brtvd->bv_entcount[idx]);339}340}341342static void343brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)344{345346ASSERT3U(idx, <, brtvd->bv_size);347348if (unlikely(brtvd->bv_need_byteswap)) {349brtvd->bv_entcount[idx] = BSWAP_16(entcnt);350} else {351brtvd->bv_entcount[idx] = entcnt;352}353}354355static void356brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)357{358uint16_t entcnt;359360ASSERT3U(idx, <, brtvd->bv_size);361362entcnt = brt_vdev_entcount_get(brtvd, idx);363ASSERT(entcnt < UINT16_MAX);364365brt_vdev_entcount_set(brtvd, idx, entcnt + 1);366}367368static void369brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)370{371uint16_t entcnt;372373ASSERT3U(idx, <, brtvd->bv_size);374375entcnt = brt_vdev_entcount_get(brtvd, idx);376ASSERT(entcnt > 0);377378brt_vdev_entcount_set(brtvd, idx, entcnt - 1);379}380381#ifdef ZFS_DEBUG382static void383brt_vdev_dump(brt_vdev_t *brtvd)384{385uint64_t idx;386387uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);388zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "389"size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",390(u_longlong_t)brtvd->bv_vdevid,391brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,392(u_longlong_t)brtvd->bv_size,393(u_longlong_t)brtvd->bv_totalcount,394(u_longlong_t)nblocks,395(size_t)BT_SIZEOFMAP(nblocks));396if (brtvd->bv_totalcount > 0) {397zfs_dbgmsg(" entcounts:");398for (idx = 0; idx < brtvd->bv_size; idx++) {399uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);400if (entcnt > 0) {401zfs_dbgmsg(" [%04llu] %hu",402(u_longlong_t)idx, entcnt);403}404}405}406if (brtvd->bv_entcount_dirty) {407char *bitmap;408409bitmap 
= kmem_alloc(nblocks + 1, KM_SLEEP);410for (idx = 0; idx < nblocks; idx++) {411bitmap[idx] =412BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';413}414bitmap[idx] = '\0';415zfs_dbgmsg(" dirty: %s", bitmap);416kmem_free(bitmap, nblocks + 1);417}418}419#endif420421static brt_vdev_t *422brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)423{424brt_vdev_t *brtvd = NULL;425426brt_rlock(spa);427if (vdevid < spa->spa_brt_nvdevs) {428brtvd = spa->spa_brt_vdevs[vdevid];429} else if (alloc) {430/* New VDEV was added. */431brt_unlock(spa);432brt_wlock(spa);433if (vdevid >= spa->spa_brt_nvdevs)434brt_vdevs_expand(spa, vdevid + 1);435brtvd = spa->spa_brt_vdevs[vdevid];436}437brt_unlock(spa);438return (brtvd);439}440441static void442brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)443{444char name[64];445446ASSERT(brtvd->bv_initiated);447ASSERT0(brtvd->bv_mos_brtvdev);448ASSERT0(brtvd->bv_mos_entries);449450uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,451ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,452brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);453VERIFY(mos_entries != 0);454VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,455&brtvd->bv_mos_entries_dnode));456dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP);457rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);458brtvd->bv_mos_entries = mos_entries;459rw_exit(&brtvd->bv_mos_entries_lock);460BRT_DEBUG("MOS entries created, object=%llu",461(u_longlong_t)brtvd->bv_mos_entries);462463/*464* We allocate DMU buffer to store the bv_entcount[] array.465* We will keep array size (bv_size) and cummulative count for all466* bv_entcount[]s (bv_totalcount) in the bonus buffer.467*/468brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,469DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,470DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);471VERIFY(brtvd->bv_mos_brtvdev != 0);472BRT_DEBUG("MOS BRT VDEV created, 
object=%llu",473(u_longlong_t)brtvd->bv_mos_brtvdev);474475snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,476(u_longlong_t)brtvd->bv_vdevid);477VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,478sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));479BRT_DEBUG("Pool directory object created, object=%s", name);480481/*482* Activate the endian-fixed feature if this is the first BRT ZAP483* (i.e., BLOCK_CLONING is not yet active) and the feature is enabled.484*/485if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) &&486!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {487spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);488} else if (spa_feature_is_active(spa,489SPA_FEATURE_BLOCK_CLONING_ENDIAN)) {490spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);491}492493spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);494}495496static void497brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)498{499vdev_t *vd;500uint16_t *entcount;501ulong_t *bitmap;502uint64_t nblocks, onblocks, size;503504ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));505506spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);507vd = vdev_lookup_top(spa, brtvd->bv_vdevid);508size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;509spa_config_exit(spa, SCL_VDEV, FTAG);510511nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);512entcount = vmem_zalloc(nblocks * BRT_BLOCKSIZE, KM_SLEEP);513bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);514515if (!brtvd->bv_initiated) {516ASSERT0(brtvd->bv_size);517ASSERT0P(brtvd->bv_entcount);518ASSERT0P(brtvd->bv_bitmap);519} else {520ASSERT(brtvd->bv_size > 0);521ASSERT(brtvd->bv_entcount != NULL);522ASSERT(brtvd->bv_bitmap != NULL);523/*524* TODO: Allow vdev shrinking. 
We only need to implement525* shrinking the on-disk BRT VDEV object.526* dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,527* offset, size, tx);528*/529ASSERT3U(brtvd->bv_size, <=, size);530531memcpy(entcount, brtvd->bv_entcount,532sizeof (entcount[0]) * MIN(size, brtvd->bv_size));533onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);534vmem_free(brtvd->bv_entcount, onblocks * BRT_BLOCKSIZE);535memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),536BT_SIZEOFMAP(onblocks)));537kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));538}539540brtvd->bv_size = size;541brtvd->bv_entcount = entcount;542brtvd->bv_bitmap = bitmap;543if (!brtvd->bv_initiated) {544brtvd->bv_need_byteswap = FALSE;545brtvd->bv_initiated = TRUE;546BRT_DEBUG("BRT VDEV %llu initiated.",547(u_longlong_t)brtvd->bv_vdevid);548}549}550551static int552brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)553{554dmu_buf_t *db;555brt_vdev_phys_t *bvphys;556int error;557558ASSERT(!brtvd->bv_initiated);559ASSERT(brtvd->bv_mos_brtvdev != 0);560561error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,562FTAG, &db);563if (error != 0)564return (error);565566bvphys = db->db_data;567if (spa->spa_brt_rangesize == 0) {568spa->spa_brt_rangesize = bvphys->bvp_rangesize;569} else {570ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);571}572573brt_vdev_realloc(spa, brtvd);574575/* TODO: We don't support VDEV shrinking. 
*/576ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);577578/*579* If VDEV grew, we will leave new bv_entcount[] entries zeroed out.580*/581error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,582MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),583brtvd->bv_entcount, DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO);584if (error != 0)585return (error);586587ASSERT(bvphys->bvp_mos_entries != 0);588VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,589&brtvd->bv_mos_entries_dnode));590dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP);591rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);592brtvd->bv_mos_entries = bvphys->bvp_mos_entries;593rw_exit(&brtvd->bv_mos_entries_lock);594brtvd->bv_need_byteswap =595(bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);596brtvd->bv_totalcount = bvphys->bvp_totalcount;597brtvd->bv_usedspace = bvphys->bvp_usedspace;598brtvd->bv_savedspace = bvphys->bvp_savedspace;599600dmu_buf_rele(db, FTAG);601602BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",603(u_longlong_t)brtvd->bv_vdevid,604(u_longlong_t)brtvd->bv_mos_brtvdev,605(u_longlong_t)brtvd->bv_mos_entries);606return (0);607}608609static void610brt_vdev_dealloc(brt_vdev_t *brtvd)611{612ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));613ASSERT(brtvd->bv_initiated);614ASSERT0(avl_numnodes(&brtvd->bv_tree));615616uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);617vmem_free(brtvd->bv_entcount, nblocks * BRT_BLOCKSIZE);618brtvd->bv_entcount = NULL;619kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));620brtvd->bv_bitmap = NULL;621622brtvd->bv_size = 0;623624brtvd->bv_initiated = FALSE;625BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);626}627628static void629brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)630{631char name[64];632uint64_t count;633634ASSERT(brtvd->bv_initiated);635ASSERT(brtvd->bv_mos_brtvdev != 0);636ASSERT(brtvd->bv_mos_entries != 
0);637ASSERT0(brtvd->bv_totalcount);638ASSERT0(brtvd->bv_usedspace);639ASSERT0(brtvd->bv_savedspace);640641uint64_t mos_entries = brtvd->bv_mos_entries;642rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);643brtvd->bv_mos_entries = 0;644rw_exit(&brtvd->bv_mos_entries_lock);645dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);646brtvd->bv_mos_entries_dnode = NULL;647ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));648ASSERT0(count);649VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));650BRT_DEBUG("MOS entries destroyed, object=%llu",651(u_longlong_t)mos_entries);652653VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,654tx));655BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",656(u_longlong_t)brtvd->bv_mos_brtvdev);657brtvd->bv_mos_brtvdev = 0;658brtvd->bv_entcount_dirty = FALSE;659660snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,661(u_longlong_t)brtvd->bv_vdevid);662VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,663name, tx));664BRT_DEBUG("Pool directory object removed, object=%s", name);665666brtvd->bv_meta_dirty = FALSE;667668rw_enter(&brtvd->bv_lock, RW_WRITER);669brt_vdev_dealloc(brtvd);670rw_exit(&brtvd->bv_lock);671672spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);673if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN))674spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);675}676677static void678brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)679{680brt_vdev_t **vdevs;681682ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));683ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);684685if (nvdevs == spa->spa_brt_nvdevs)686return;687688vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);689if (spa->spa_brt_nvdevs > 0) {690ASSERT(spa->spa_brt_vdevs != NULL);691692memcpy(vdevs, spa->spa_brt_vdevs,693sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);694kmem_free(spa->spa_brt_vdevs,695sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);696}697spa->spa_brt_vdevs = 
vdevs;698699for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {700brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);701rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);702brtvd->bv_vdevid = vdevid;703brtvd->bv_initiated = FALSE;704rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);705avl_create(&brtvd->bv_tree, brt_entry_compare,706sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));707for (int i = 0; i < TXG_SIZE; i++) {708avl_create(&brtvd->bv_pending_tree[i],709brt_entry_compare, sizeof (brt_entry_t),710offsetof(brt_entry_t, bre_node));711}712mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);713spa->spa_brt_vdevs[vdevid] = brtvd;714}715716BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",717(u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);718spa->spa_brt_nvdevs = nvdevs;719}720721static boolean_t722brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)723{724uint64_t idx = offset / spa->spa_brt_rangesize;725if (idx < brtvd->bv_size) {726/* VDEV wasn't expanded. */727return (brt_vdev_entcount_get(brtvd, idx) > 0);728}729return (FALSE);730}731732static void733brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,734uint64_t dsize, uint64_t count)735{736uint64_t idx;737738ASSERT(brtvd->bv_initiated);739740brtvd->bv_savedspace += dsize * count;741brtvd->bv_meta_dirty = TRUE;742743if (bre->bre_count > 0)744return;745746brtvd->bv_usedspace += dsize;747748idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;749if (idx >= brtvd->bv_size) {750/* VDEV has been expanded. 
*/751rw_enter(&brtvd->bv_lock, RW_WRITER);752brt_vdev_realloc(spa, brtvd);753rw_exit(&brtvd->bv_lock);754}755756ASSERT3U(idx, <, brtvd->bv_size);757758brtvd->bv_totalcount++;759brt_vdev_entcount_inc(brtvd, idx);760brtvd->bv_entcount_dirty = TRUE;761idx = idx / BRT_BLOCKSIZE / 8;762BT_SET(brtvd->bv_bitmap, idx);763}764765static void766brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,767uint64_t dsize)768{769uint64_t idx;770771ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));772ASSERT(brtvd->bv_initiated);773774brtvd->bv_savedspace -= dsize;775brtvd->bv_meta_dirty = TRUE;776777if (bre->bre_count > 0)778return;779780brtvd->bv_usedspace -= dsize;781782idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;783ASSERT3U(idx, <, brtvd->bv_size);784785ASSERT(brtvd->bv_totalcount > 0);786brtvd->bv_totalcount--;787brt_vdev_entcount_dec(brtvd, idx);788brtvd->bv_entcount_dirty = TRUE;789idx = idx / BRT_BLOCKSIZE / 8;790BT_SET(brtvd->bv_bitmap, idx);791}792793static void794brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)795{796dmu_buf_t *db;797brt_vdev_phys_t *bvphys;798799ASSERT(brtvd->bv_meta_dirty);800ASSERT(brtvd->bv_mos_brtvdev != 0);801ASSERT(dmu_tx_is_syncing(tx));802803VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,804FTAG, &db));805806if (brtvd->bv_entcount_dirty) {807/*808* TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.809*/810uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);811dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,812nblocks * BRT_BLOCKSIZE, brtvd->bv_entcount, tx,813DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO);814memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));815brtvd->bv_entcount_dirty = FALSE;816}817818dmu_buf_will_dirty(db, tx);819bvphys = db->db_data;820bvphys->bvp_mos_entries = brtvd->bv_mos_entries;821bvphys->bvp_size = brtvd->bv_size;822if (brtvd->bv_need_byteswap) {823bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;824} else {825bvphys->bvp_byteorder = 
BRT_NATIVE_BYTEORDER;826}827bvphys->bvp_totalcount = brtvd->bv_totalcount;828bvphys->bvp_rangesize = spa->spa_brt_rangesize;829bvphys->bvp_usedspace = brtvd->bv_usedspace;830bvphys->bvp_savedspace = brtvd->bv_savedspace;831dmu_buf_rele(db, FTAG);832833brtvd->bv_meta_dirty = FALSE;834}835836static void837brt_vdevs_free(spa_t *spa)838{839if (spa->spa_brt_vdevs == 0)840return;841for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {842brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];843rw_enter(&brtvd->bv_lock, RW_WRITER);844if (brtvd->bv_initiated)845brt_vdev_dealloc(brtvd);846rw_exit(&brtvd->bv_lock);847rw_destroy(&brtvd->bv_lock);848if (brtvd->bv_mos_entries != 0)849dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);850rw_destroy(&brtvd->bv_mos_entries_lock);851avl_destroy(&brtvd->bv_tree);852for (int i = 0; i < TXG_SIZE; i++)853avl_destroy(&brtvd->bv_pending_tree[i]);854mutex_destroy(&brtvd->bv_pending_lock);855kmem_free(brtvd, sizeof (*brtvd));856}857kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *858spa->spa_brt_nvdevs);859}860861static void862brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)863{864865bre->bre_bp = *bp;866bre->bre_count = 0;867bre->bre_pcount = 0;868869*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);870}871872static boolean_t873brt_has_endian_fixed(spa_t *spa)874{875return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN));876}877878static int879brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre)880{881uint64_t off = BRE_OFFSET(bre);882883if (brtvd->bv_mos_entries == 0)884return (SET_ERROR(ENOENT));885886if (brt_has_endian_fixed(spa)) {887return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,888&off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1,889&bre->bre_count));890} else {891return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,892&off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count),893&bre->bre_count));894}895}896897/*898* Return TRUE if we _can_ have BRT entry for this 
bp. It might be false899* positive, but gives us quick answer if we should look into BRT, which900* may require reads and thus will be more expensive.901*/902boolean_t903brt_maybe_exists(spa_t *spa, const blkptr_t *bp)904{905906if (spa->spa_brt_nvdevs == 0)907return (B_FALSE);908909uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);910brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);911if (brtvd == NULL || !brtvd->bv_initiated)912return (FALSE);913914/*915* We don't need locks here, since bv_entcount pointer must be916* stable at this point, and we don't care about false positive917* races here, while false negative should be impossible, since918* all brt_vdev_addref() have already completed by this point.919*/920uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);921return (brt_vdev_lookup(spa, brtvd, off));922}923924uint64_t925brt_get_dspace(spa_t *spa)926{927if (spa->spa_brt_nvdevs == 0)928return (0);929930brt_rlock(spa);931uint64_t s = 0;932for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)933s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;934brt_unlock(spa);935return (s);936}937938uint64_t939brt_get_used(spa_t *spa)940{941if (spa->spa_brt_nvdevs == 0)942return (0);943944brt_rlock(spa);945uint64_t s = 0;946for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)947s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;948brt_unlock(spa);949return (s);950}951952uint64_t953brt_get_saved(spa_t *spa)954{955return (brt_get_dspace(spa));956}957958uint64_t959brt_get_ratio(spa_t *spa)960{961uint64_t used = brt_get_used(spa);962if (used == 0)963return (100);964return ((used + brt_get_saved(spa)) * 100 / used);965}966967static int968brt_kstats_update(kstat_t *ksp, int rw)969{970brt_stats_t *bs = ksp->ks_data;971972if (rw == KSTAT_WRITE)973return (EACCES);974975bs->brt_addref_entry_not_on_disk.value.ui64 =976wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);977bs->brt_addref_entry_on_disk.value.ui64 
=978wmsum_value(&brt_sums.brt_addref_entry_on_disk);979bs->brt_decref_entry_in_memory.value.ui64 =980wmsum_value(&brt_sums.brt_decref_entry_in_memory);981bs->brt_decref_entry_loaded_from_disk.value.ui64 =982wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);983bs->brt_decref_entry_not_in_memory.value.ui64 =984wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);985bs->brt_decref_entry_read_lost_race.value.ui64 =986wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);987bs->brt_decref_entry_still_referenced.value.ui64 =988wmsum_value(&brt_sums.brt_decref_entry_still_referenced);989bs->brt_decref_free_data_later.value.ui64 =990wmsum_value(&brt_sums.brt_decref_free_data_later);991bs->brt_decref_free_data_now.value.ui64 =992wmsum_value(&brt_sums.brt_decref_free_data_now);993bs->brt_decref_no_entry.value.ui64 =994wmsum_value(&brt_sums.brt_decref_no_entry);995996return (0);997}998999static void1000brt_stat_init(void)1001{10021003wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);1004wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);1005wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);1006wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);1007wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);1008wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);1009wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);1010wmsum_init(&brt_sums.brt_decref_free_data_later, 0);1011wmsum_init(&brt_sums.brt_decref_free_data_now, 0);1012wmsum_init(&brt_sums.brt_decref_no_entry, 0);10131014brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,1015sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);1016if (brt_ksp != NULL) {1017brt_ksp->ks_data = &brt_stats;1018brt_ksp->ks_update = brt_kstats_update;1019kstat_install(brt_ksp);1020}1021}10221023static void1024brt_stat_fini(void)1025{1026if (brt_ksp != NULL) {1027kstat_delete(brt_ksp);1028brt_ksp = 
NULL;1029}10301031wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);1032wmsum_fini(&brt_sums.brt_addref_entry_on_disk);1033wmsum_fini(&brt_sums.brt_decref_entry_in_memory);1034wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);1035wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);1036wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);1037wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);1038wmsum_fini(&brt_sums.brt_decref_free_data_later);1039wmsum_fini(&brt_sums.brt_decref_free_data_now);1040wmsum_fini(&brt_sums.brt_decref_no_entry);1041}10421043void1044brt_init(void)1045{1046brt_entry_cache = kmem_cache_create("brt_entry_cache",1047sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);10481049brt_stat_init();1050}10511052void1053brt_fini(void)1054{1055brt_stat_fini();10561057kmem_cache_destroy(brt_entry_cache);1058}10591060/* Return TRUE if block should be freed immediately. */1061boolean_t1062brt_entry_decref(spa_t *spa, const blkptr_t *bp)1063{1064brt_entry_t *bre, *racebre;1065brt_entry_t bre_search;1066avl_index_t where;1067uint64_t vdevid;1068int error;10691070brt_entry_fill(bp, &bre_search, &vdevid);10711072brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);1073ASSERT(brtvd != NULL);10741075rw_enter(&brtvd->bv_lock, RW_WRITER);1076ASSERT(brtvd->bv_initiated);1077bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);1078if (bre != NULL) {1079BRTSTAT_BUMP(brt_decref_entry_in_memory);1080goto out;1081} else {1082BRTSTAT_BUMP(brt_decref_entry_not_in_memory);1083}1084rw_exit(&brtvd->bv_lock);10851086error = brt_entry_lookup(spa, brtvd, &bre_search);1087/* bre_search now contains correct bre_count */1088if (error == ENOENT) {1089BRTSTAT_BUMP(brt_decref_no_entry);1090return (B_TRUE);1091}1092ASSERT0(error);10931094rw_enter(&brtvd->bv_lock, RW_WRITER);1095racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);1096if (racebre != NULL) {1097/* The entry was added when the lock was dropped. 
*/1098BRTSTAT_BUMP(brt_decref_entry_read_lost_race);1099bre = racebre;1100goto out;1101}11021103BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);1104bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);1105bre->bre_bp = bre_search.bre_bp;1106bre->bre_count = bre_search.bre_count;1107bre->bre_pcount = 0;1108avl_insert(&brtvd->bv_tree, bre, where);11091110out:1111if (bre->bre_count == 0) {1112rw_exit(&brtvd->bv_lock);1113BRTSTAT_BUMP(brt_decref_free_data_now);1114return (B_TRUE);1115}11161117bre->bre_pcount--;1118ASSERT(bre->bre_count > 0);1119bre->bre_count--;1120if (bre->bre_count == 0)1121BRTSTAT_BUMP(brt_decref_free_data_later);1122else1123BRTSTAT_BUMP(brt_decref_entry_still_referenced);1124brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));11251126rw_exit(&brtvd->bv_lock);11271128return (B_FALSE);1129}11301131uint64_t1132brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)1133{1134brt_entry_t bre_search, *bre;1135uint64_t vdevid, refcnt;1136int error;11371138brt_entry_fill(bp, &bre_search, &vdevid);11391140brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);1141ASSERT(brtvd != NULL);11421143rw_enter(&brtvd->bv_lock, RW_READER);1144ASSERT(brtvd->bv_initiated);1145bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);1146if (bre == NULL) {1147rw_exit(&brtvd->bv_lock);1148error = brt_entry_lookup(spa, brtvd, &bre_search);1149if (error == ENOENT) {1150refcnt = 0;1151} else {1152ASSERT0(error);1153refcnt = bre_search.bre_count;1154}1155} else {1156refcnt = bre->bre_count;1157rw_exit(&brtvd->bv_lock);1158}11591160return (refcnt);1161}11621163static void1164brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)1165{1166if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)1167return;11681169uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);1170rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);1171if (brtvd->bv_mos_entries != 0) {1172(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,1173&off, 
BRT_KEY_WORDS);1174}1175rw_exit(&brtvd->bv_mos_entries_lock);1176}11771178static int1179brt_entry_compare(const void *x1, const void *x2)1180{1181const brt_entry_t *bre1 = x1, *bre2 = x2;1182const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;11831184return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),1185DVA_GET_OFFSET(&bp2->blk_dva[0])));1186}11871188void1189brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)1190{1191brt_entry_t *bre, *newbre;1192avl_index_t where;1193uint64_t txg;11941195txg = dmu_tx_get_txg(tx);1196ASSERT3U(txg, !=, 0);11971198uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);1199brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);1200avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];12011202newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);1203newbre->bre_bp = *bp;1204newbre->bre_count = 0;1205newbre->bre_pcount = 1;12061207mutex_enter(&brtvd->bv_pending_lock);1208bre = avl_find(pending_tree, newbre, &where);1209if (bre == NULL) {1210avl_insert(pending_tree, newbre, where);1211newbre = NULL;1212} else {1213bre->bre_pcount++;1214}1215mutex_exit(&brtvd->bv_pending_lock);12161217if (newbre != NULL) {1218ASSERT(bre != NULL);1219ASSERT(bre != newbre);1220kmem_cache_free(brt_entry_cache, newbre);1221} else {1222ASSERT0P(bre);12231224/* Prefetch BRT entry for the syncing context. 
*/1225brt_prefetch(brtvd, bp);1226}1227}12281229void1230brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)1231{1232brt_entry_t *bre, bre_search;1233uint64_t txg;12341235txg = dmu_tx_get_txg(tx);1236ASSERT3U(txg, !=, 0);12371238uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);1239brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);1240ASSERT(brtvd != NULL);1241avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];12421243bre_search.bre_bp = *bp;12441245mutex_enter(&brtvd->bv_pending_lock);1246bre = avl_find(pending_tree, &bre_search, NULL);1247ASSERT(bre != NULL);1248ASSERT(bre->bre_pcount > 0);1249bre->bre_pcount--;1250if (bre->bre_pcount == 0)1251avl_remove(pending_tree, bre);1252else1253bre = NULL;1254mutex_exit(&brtvd->bv_pending_lock);12551256if (bre)1257kmem_cache_free(brt_entry_cache, bre);1258}12591260static void1261brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)1262{1263brt_entry_t *bre, *nbre;12641265/*1266* We are in syncing context, so no other bv_pending_tree accesses1267* are possible for the TXG. 
So we don't need bv_pending_lock.1268*/1269ASSERT(avl_is_empty(&brtvd->bv_tree));1270avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);12711272for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {1273nbre = AVL_NEXT(&brtvd->bv_tree, bre);12741275/*1276* If the block has DEDUP bit set, it means that it1277* already exists in the DEDUP table, so we can just1278* use that instead of creating new entry in the BRT.1279*/1280if (BP_GET_DEDUP(&bre->bre_bp)) {1281while (bre->bre_pcount > 0) {1282if (!ddt_addref(spa, &bre->bre_bp))1283break;1284bre->bre_pcount--;1285}1286if (bre->bre_pcount == 0) {1287avl_remove(&brtvd->bv_tree, bre);1288kmem_cache_free(brt_entry_cache, bre);1289continue;1290}1291}12921293/*1294* Unless we know that the block is definitely not in ZAP,1295* try to get its reference count from there.1296*/1297uint64_t off = BRE_OFFSET(bre);1298if (brtvd->bv_mos_entries != 0 &&1299brt_vdev_lookup(spa, brtvd, off)) {1300int error;1301if (brt_has_endian_fixed(spa)) {1302error = zap_lookup_uint64_by_dnode(1303brtvd->bv_mos_entries_dnode, &off,1304BRT_KEY_WORDS, sizeof (bre->bre_count), 1,1305&bre->bre_count);1306} else {1307error = zap_lookup_uint64_by_dnode(1308brtvd->bv_mos_entries_dnode, &off,1309BRT_KEY_WORDS, 1, sizeof (bre->bre_count),1310&bre->bre_count);1311}1312if (error == 0) {1313BRTSTAT_BUMP(brt_addref_entry_on_disk);1314} else {1315ASSERT3U(error, ==, ENOENT);1316BRTSTAT_BUMP(brt_addref_entry_not_on_disk);1317}1318}1319}13201321/*1322* If all the cloned blocks we had were handled by DDT, we don't need1323* to initiate the vdev.1324*/1325if (avl_is_empty(&brtvd->bv_tree))1326return;13271328if (!brtvd->bv_initiated) {1329rw_enter(&brtvd->bv_lock, RW_WRITER);1330brt_vdev_realloc(spa, brtvd);1331rw_exit(&brtvd->bv_lock);1332}13331334/*1335* Convert pending references into proper ones. 
This has to be a1336* separate loop, since entcount modifications would cause false1337* positives for brt_vdev_lookup() on following iterations.1338*/1339for (bre = avl_first(&brtvd->bv_tree); bre;1340bre = AVL_NEXT(&brtvd->bv_tree, bre)) {1341brt_vdev_addref(spa, brtvd, bre,1342bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);1343bre->bre_count += bre->bre_pcount;1344}1345}13461347void1348brt_pending_apply(spa_t *spa, uint64_t txg)1349{13501351brt_rlock(spa);1352for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {1353brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];1354brt_unlock(spa);13551356brt_pending_apply_vdev(spa, brtvd, txg);13571358brt_rlock(spa);1359}1360brt_unlock(spa);1361}13621363static void1364brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)1365{1366uint64_t off = BRE_OFFSET(bre);13671368if (bre->bre_pcount == 0) {1369/* The net change is zero, nothing to do in ZAP. */1370} else if (bre->bre_count == 0) {1371int error = zap_remove_uint64_by_dnode(dn, &off,1372BRT_KEY_WORDS, tx);1373VERIFY(error == 0 || error == ENOENT);1374} else {1375if (brt_has_endian_fixed(spa)) {1376VERIFY0(zap_update_uint64_by_dnode(dn, &off,1377BRT_KEY_WORDS, sizeof (bre->bre_count), 1,1378&bre->bre_count, tx));1379} else {1380VERIFY0(zap_update_uint64_by_dnode(dn, &off,1381BRT_KEY_WORDS, 1, sizeof (bre->bre_count),1382&bre->bre_count, tx));1383}1384}1385}13861387static void1388brt_sync_table(spa_t *spa, dmu_tx_t *tx)1389{1390brt_entry_t *bre;13911392brt_rlock(spa);1393for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {1394brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];1395brt_unlock(spa);13961397if (!brtvd->bv_meta_dirty) {1398ASSERT(!brtvd->bv_entcount_dirty);1399ASSERT0(avl_numnodes(&brtvd->bv_tree));1400brt_rlock(spa);1401continue;1402}14031404ASSERT(!brtvd->bv_entcount_dirty ||1405avl_numnodes(&brtvd->bv_tree) != 0);14061407if (brtvd->bv_mos_brtvdev == 0)1408brt_vdev_create(spa, brtvd, tx);14091410void *c = 
NULL;1411while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {1412brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre,1413tx);1414kmem_cache_free(brt_entry_cache, bre);1415}14161417#ifdef ZFS_DEBUG1418if (zfs_flags & ZFS_DEBUG_BRT)1419brt_vdev_dump(brtvd);1420#endif1421if (brtvd->bv_totalcount == 0)1422brt_vdev_destroy(spa, brtvd, tx);1423else1424brt_vdev_sync(spa, brtvd, tx);1425brt_rlock(spa);1426}1427brt_unlock(spa);1428}14291430void1431brt_sync(spa_t *spa, uint64_t txg)1432{1433dmu_tx_t *tx;1434uint64_t vdevid;14351436ASSERT3U(spa_syncing_txg(spa), ==, txg);14371438brt_rlock(spa);1439for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {1440if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)1441break;1442}1443if (vdevid >= spa->spa_brt_nvdevs) {1444brt_unlock(spa);1445return;1446}1447brt_unlock(spa);14481449tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);1450brt_sync_table(spa, tx);1451dmu_tx_commit(tx);1452}14531454static void1455brt_alloc(spa_t *spa)1456{1457rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);1458spa->spa_brt_vdevs = NULL;1459spa->spa_brt_nvdevs = 0;1460spa->spa_brt_rangesize = 0;1461}14621463void1464brt_create(spa_t *spa)1465{1466brt_alloc(spa);1467spa->spa_brt_rangesize = BRT_RANGESIZE;1468}14691470int1471brt_load(spa_t *spa)1472{1473int error = 0;14741475brt_alloc(spa);1476brt_wlock(spa);1477for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;1478vdevid++) {1479char name[64];1480uint64_t mos_brtvdev;14811482/* Look if this vdev had active block cloning. */1483snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,1484(u_longlong_t)vdevid);1485error = zap_lookup(spa->spa_meta_objset,1486DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,1487&mos_brtvdev);1488if (error == ENOENT) {1489error = 0;1490continue;1491}1492if (error != 0)1493break;14941495/* If it did, then allocate them all and load this one. 
*/1496brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);1497brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];1498rw_enter(&brtvd->bv_lock, RW_WRITER);1499brtvd->bv_mos_brtvdev = mos_brtvdev;1500error = brt_vdev_load(spa, brtvd);1501rw_exit(&brtvd->bv_lock);1502if (error != 0)1503break;1504}15051506if (spa->spa_brt_rangesize == 0)1507spa->spa_brt_rangesize = BRT_RANGESIZE;1508brt_unlock(spa);1509return (error);1510}15111512void1513brt_prefetch_all(spa_t *spa)1514{1515/*1516* Load all BRT entries for each vdev. This is intended to perform1517* a prefetch on all such blocks. For the same reason that brt_prefetch1518* (called from brt_pending_add) isn't locked, this is also not locked.1519*/1520brt_rlock(spa);1521for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {1522brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];1523brt_unlock(spa);15241525rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);1526if (brtvd->bv_mos_entries != 0) {1527(void) zap_prefetch_object(spa->spa_meta_objset,1528brtvd->bv_mos_entries);1529}1530rw_exit(&brtvd->bv_mos_entries_lock);15311532brt_rlock(spa);1533}1534brt_unlock(spa);1535}15361537void1538brt_unload(spa_t *spa)1539{1540if (spa->spa_brt_rangesize == 0)1541return;1542brt_vdevs_free(spa);1543rw_destroy(&spa->spa_brt_lock);1544spa->spa_brt_rangesize = 0;1545}15461547ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,1548"Enable prefetching of BRT ZAP entries");1549ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,1550"BRT ZAP leaf blockshift");1551ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,1552"BRT ZAP indirect blockshift");155315541555