Path: blob/main/sys/contrib/openzfs/module/zfs/ddt_log.c
48383 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/2122/*23* Copyright (c) 2023, Klara Inc.24*/2526#include <sys/zfs_context.h>27#include <sys/spa.h>28#include <sys/ddt.h>29#include <sys/dmu_tx.h>30#include <sys/dmu.h>31#include <sys/ddt_impl.h>32#include <sys/dnode.h>33#include <sys/dbuf.h>34#include <sys/zap.h>35#include <sys/zio_checksum.h>3637/*38* No more than this many txgs before swapping logs.39*/40uint_t zfs_dedup_log_txg_max = 8;4142/*43* Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module44* load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.45*/46uint64_t zfs_dedup_log_mem_max = 0;47uint_t zfs_dedup_log_mem_max_percent = 1;484950static kmem_cache_t *ddt_log_entry_flat_cache;51static kmem_cache_t *ddt_log_entry_trad_cache;5253#define DDT_LOG_ENTRY_FLAT_SIZE \54(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)55#define DDT_LOG_ENTRY_TRAD_SIZE \56(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)5758#define DDT_LOG_ENTRY_SIZE(ddt) \59_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)6061void62ddt_log_init(void)63{64ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",65DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);66ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",67DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);6869/*70* Max memory for log AVL entries. At least 1M, because we need71* something (that's ~3800 entries per tree). They can say 100% if they72* want; it just means they're at the mercy of the the txg flush limit.73*/74if (zfs_dedup_log_mem_max == 0) {75zfs_dedup_log_mem_max_percent =76MIN(zfs_dedup_log_mem_max_percent, 100);77zfs_dedup_log_mem_max = (physmem * PAGESIZE) *78zfs_dedup_log_mem_max_percent / 100;79}80zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);81}8283void84ddt_log_fini(void)85{86kmem_cache_destroy(ddt_log_entry_trad_cache);87kmem_cache_destroy(ddt_log_entry_flat_cache);88}8990static void91ddt_log_name(ddt_t *ddt, char *name, uint_t n)92{93snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,94zio_checksum_table[ddt->ddt_checksum].ci_name, n);95}9697static void98ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)99{100dmu_buf_t *db;101VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));102dmu_buf_will_dirty(db, tx);103104ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;105DLH_SET_VERSION(hdr, 1);106DLH_SET_FLAGS(hdr, ddl->ddl_flags);107hdr->dlh_length = ddl->ddl_length;108hdr->dlh_first_txg = ddl->ddl_first_txg;109hdr->dlh_checkpoint = ddl->ddl_checkpoint;110111dmu_buf_rele(db, FTAG);112}113114static void115ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)116{117ASSERT3U(ddt->ddt_dir_object, >, 0);118ASSERT0(ddl->ddl_object);119120char name[DDT_NAMELEN];121ddt_log_name(ddt, name, n);122123ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,124DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,125DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);126VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,127sizeof (uint64_t), 1, &ddl->ddl_object, tx));128ddl->ddl_length = 0;129ddl->ddl_first_txg = tx->tx_txg;130ddt_log_update_header(ddt, ddl, tx);131}132133static void134ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)135{136ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);137ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);138}139140static void141ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)142{143ASSERT3U(ddt->ddt_dir_object, >, 0);144145if (ddl->ddl_object == 0)146return;147148ASSERT0(ddl->ddl_length);149150char name[DDT_NAMELEN];151ddt_log_name(ddt, name, n);152153VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));154VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));155156ddl->ddl_object = 0;157}158159void160ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)161{162ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);163ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);164}165166static void167ddt_log_update_stats(ddt_t *ddt)168{169/*170* Log object stats. We count the number of live entries in the log171* tree, even if there are more than on disk, and even if the same172* entry is on both append and flush trees, because that's more what173* the user expects to see. This does mean the on-disk size is not174* really correlated with the number of entries, but I don't think175* that's reasonable to expect anyway.176*/177dmu_object_info_t doi;178uint64_t nblocks = 0;179if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object,180&doi) == 0)181nblocks += doi.doi_physical_blocks_512;182if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object,183&doi) == 0)184nblocks += doi.doi_physical_blocks_512;185186ddt_object_t *ddo = &ddt->ddt_log_stats;187ddo->ddo_count =188avl_numnodes(&ddt->ddt_log_active->ddl_tree) +189avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);190ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);191ddo->ddo_dspace = nblocks << 9;192}193194void195ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)196{197ASSERT3U(nentries, >, 0);198ASSERT0P(dlu->dlu_dbp);199200if (ddt->ddt_log_active->ddl_object == 0)201ddt_log_create(ddt, tx);202203/*204* We want to store as many entries as we can in a block, but never205* split an entry across block boundaries.206*/207size_t reclen = P2ALIGN_TYPED(208sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +209DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);210ASSERT3U(reclen, <=, UINT16_MAX);211dlu->dlu_reclen = reclen;212213VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,214&dlu->dlu_dn));215dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);216217uint64_t nblocks = howmany(nentries,218dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);219uint64_t offset = ddt->ddt_log_active->ddl_length;220uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;221222VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,223B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,224DMU_READ_NO_PREFETCH));225226dlu->dlu_tx = tx;227dlu->dlu_block = dlu->dlu_offset = 0;228}229230static ddt_log_entry_t *231ddt_log_alloc_entry(ddt_t *ddt)232{233ddt_log_entry_t *ddle;234235if (ddt->ddt_flags & DDT_FLAG_FLAT) {236ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);237memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);238} else {239ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);240memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);241}242243return (ddle);244}245246static void247ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)248{249kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?250ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);251}252253static void254ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)255{256/* Create the log tree entry from a live or stored entry */257avl_index_t where;258ddt_log_entry_t *ddle =259avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);260if (ddle == NULL) {261ddle = ddt_log_alloc_entry(ddt);262ddle->ddle_key = ddlwe->ddlwe_key;263avl_insert(&ddl->ddl_tree, ddle, where);264}265ddle->ddle_type = ddlwe->ddlwe_type;266ddle->ddle_class = ddlwe->ddlwe_class;267memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));268}269270void271ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)272{273ASSERT3U(dlu->dlu_dbp, !=, NULL);274275ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);276ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);277278/* Get our block */279ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);280dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];281282/*283* If this would take us past the end of the block, finish it and284* move to the next one.285*/286if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {287ASSERT3U(dlu->dlu_offset, >, 0);288dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);289dlu->dlu_block++;290dlu->dlu_offset = 0;291ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);292db = dlu->dlu_dbp[dlu->dlu_block];293}294295/*296* If this is the first time touching the block, inform the DMU that297* we will fill it, and zero it out.298*/299if (dlu->dlu_offset == 0) {300dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);301memset(db->db_data, 0, db->db_size);302}303304/* Create the log record directly in the buffer */305ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);306DLR_SET_TYPE(dlr, DLR_ENTRY);307DLR_SET_RECLEN(dlr, dlu->dlu_reclen);308DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);309DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);310311ddt_log_record_entry_t *dlre =312(ddt_log_record_entry_t *)&dlr->dlr_payload;313dlre->dlre_key = ddlwe->ddlwe_key;314memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));315316/* Advance offset for next record. */317dlu->dlu_offset += dlu->dlu_reclen;318}319320void321ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)322{323ASSERT3U(dlu->dlu_dbp, !=, NULL);324ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);325ASSERT3U(dlu->dlu_offset, >, 0);326327/*328* Close out the last block. Whatever we haven't used will be zeroed,329* which matches DLR_INVALID, so we can detect this during load.330*/331dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);332333dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);334335ddt->ddt_log_active->ddl_length +=336dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;337dnode_rele(dlu->dlu_dn, FTAG);338339ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);340341memset(dlu, 0, sizeof (ddt_log_update_t));342343ddt_log_update_stats(ddt);344}345346boolean_t347ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)348{349ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);350if (ddle == NULL)351return (B_FALSE);352353DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);354355ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);356357avl_remove(&ddl->ddl_tree, ddle);358ddt_log_free_entry(ddt, ddle);359360return (B_TRUE);361}362363boolean_t364ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)365{366ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);367if (ddle == NULL)368return (B_FALSE);369370ddt_lightweight_entry_t ddlwe;371DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);372ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);373374avl_remove(&ddl->ddl_tree, ddle);375ddt_log_free_entry(ddt, ddle);376377return (B_TRUE);378}379380boolean_t381ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,382ddt_lightweight_entry_t *ddlwe)383{384ddt_log_entry_t *ddle =385avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);386if (!ddle)387ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);388if (!ddle)389return (B_FALSE);390if (ddlwe)391DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);392return (B_TRUE);393}394395void396ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)397{398ddt_log_t *ddl = ddt->ddt_log_flushing;399400ASSERT3U(ddl->ddl_object, !=, 0);401402#ifdef ZFS_DEBUG403/*404* There should not be any entries on the log tree before the given405* checkpoint. Assert that this is the case.406*/407ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);408if (ddle != NULL)409VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),410>, 0);411#endif412413ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;414ddl->ddl_checkpoint = ddlwe->ddlwe_key;415ddt_log_update_header(ddt, ddl, tx);416417ddt_log_update_stats(ddt);418}419420void421ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)422{423ddt_log_t *ddl = ddt->ddt_log_flushing;424425if (ddl->ddl_object == 0)426return;427428ASSERT(avl_is_empty(&ddl->ddl_tree));429430/* Eject the entire object */431dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);432433ddl->ddl_length = 0;434ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;435memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));436ddt_log_update_header(ddt, ddl, tx);437438ddt_log_update_stats(ddt);439}440441boolean_t442ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)443{444/* Swap the logs. The old flushing one must be empty */445VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));446447/*448* If there are still blocks on the flushing log, truncate it first.449* This can happen if there were entries on the flushing log that were450* removed in memory via ddt_lookup(); their vestigal remains are451* on disk.452*/453if (ddt->ddt_log_flushing->ddl_length > 0)454ddt_log_truncate(ddt, tx);455456/*457* Swap policy. We swap the logs (and so begin flushing) when the458* active tree grows too large, or when we haven't swapped it in459* some amount of time, or if something has requested the logs be460* flushed ASAP (see ddt_walk_init()).461*/462463/*464* The log tree is too large if the memory usage of its entries is over465* half of the memory limit. This effectively gives each log tree half466* the available memory.467*/468const boolean_t too_large =469(avl_numnodes(&ddt->ddt_log_active->ddl_tree) *470DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);471472const boolean_t too_old =473tx->tx_txg >=474(ddt->ddt_log_active->ddl_first_txg +475MAX(1, zfs_dedup_log_txg_max));476477const boolean_t force =478ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;479480if (!(too_large || too_old || force))481return (B_FALSE);482483ddt_log_t *swap = ddt->ddt_log_active;484ddt->ddt_log_active = ddt->ddt_log_flushing;485ddt->ddt_log_flushing = swap;486487ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);488ddt->ddt_log_active->ddl_flags &=489~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);490491ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));492ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;493494ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;495496ddt_log_update_header(ddt, ddt->ddt_log_active, tx);497ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);498499ddt_log_update_stats(ddt);500501return (B_TRUE);502}503504static inline void505ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,506const ddt_key_t *checkpoint)507{508ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);509510ddt_log_record_entry_t *dlre =511(ddt_log_record_entry_t *)dlr->dlr_payload;512if (checkpoint != NULL &&513ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {514/* Skip pre-checkpoint entries; they're already flushed. */515return;516}517518ddt_lightweight_entry_t ddlwe;519ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);520ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);521522ddlwe.ddlwe_key = dlre->dlre_key;523memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));524525ddt_log_update_entry(ddt, ddl, &ddlwe);526}527528static void529ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)530{531void *cookie = NULL;532ddt_log_entry_t *ddle;533IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));534while ((ddle =535avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {536ddt_log_free_entry(ddt, ddle);537}538ASSERT(avl_is_empty(&ddl->ddl_tree));539}540541static int542ddt_log_load_one(ddt_t *ddt, uint_t n)543{544ASSERT3U(n, <, 2);545546ddt_log_t *ddl = &ddt->ddt_log[n];547548char name[DDT_NAMELEN];549ddt_log_name(ddt, name, n);550551uint64_t obj;552int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,553sizeof (uint64_t), 1, &obj);554if (err == ENOENT)555return (0);556if (err != 0)557return (err);558559dnode_t *dn;560err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);561if (err != 0)562return (err);563564ddt_log_header_t hdr;565dmu_buf_t *db;566err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);567if (err != 0) {568dnode_rele(dn, FTAG);569return (err);570}571memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));572dmu_buf_rele(db, FTAG);573574if (DLH_GET_VERSION(&hdr) != 1) {575dnode_rele(dn, FTAG);576zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "577"unknown version=%llu", spa_name(ddt->ddt_spa), name,578(u_longlong_t)DLH_GET_VERSION(&hdr));579return (SET_ERROR(EINVAL));580}581582ddt_key_t *checkpoint = NULL;583if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {584/*585* If the log has a checkpoint, then we can ignore any entries586* that have already been flushed.587*/588ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);589checkpoint = &hdr.dlh_checkpoint;590}591592if (hdr.dlh_length > 0) {593dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,594ZIO_PRIORITY_SYNC_READ);595596for (uint64_t offset = 0; offset < hdr.dlh_length;597offset += dn->dn_datablksz) {598err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,599DMU_READ_PREFETCH);600if (err != 0) {601dnode_rele(dn, FTAG);602ddt_log_empty(ddt, ddl);603return (err);604}605606uint64_t boffset = 0;607while (boffset < db->db_size) {608ddt_log_record_t *dlr =609(ddt_log_record_t *)(db->db_data + boffset);610611/* Partially-filled block, skip the rest */612if (DLR_GET_TYPE(dlr) == DLR_INVALID)613break;614615switch (DLR_GET_TYPE(dlr)) {616case DLR_ENTRY:617ddt_log_load_entry(ddt, ddl, dlr,618checkpoint);619break;620621default:622dmu_buf_rele(db, FTAG);623dnode_rele(dn, FTAG);624ddt_log_empty(ddt, ddl);625return (SET_ERROR(EINVAL));626}627628boffset += DLR_GET_RECLEN(dlr);629}630631dmu_buf_rele(db, FTAG);632}633}634635dnode_rele(dn, FTAG);636637ddl->ddl_object = obj;638ddl->ddl_flags = DLH_GET_FLAGS(&hdr);639ddl->ddl_length = hdr.dlh_length;640ddl->ddl_first_txg = hdr.dlh_first_txg;641642if (ddl->ddl_flags & DDL_FLAG_FLUSHING)643ddt->ddt_log_flushing = ddl;644else645ddt->ddt_log_active = ddl;646647return (0);648}649650int651ddt_log_load(ddt_t *ddt)652{653int err;654655if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {656/*657* The DDT is going to be freed again in a moment, so there's658* no point loading the log; it'll just slow down import.659*/660return (0);661}662663ASSERT0(ddt->ddt_log[0].ddl_object);664ASSERT0(ddt->ddt_log[1].ddl_object);665if (ddt->ddt_dir_object == 0) {666/*667* If we're configured but the containing dir doesn't exist668* yet, then the log object can't possibly exist either.669*/670ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);671return (SET_ERROR(ENOENT));672}673674if ((err = ddt_log_load_one(ddt, 0)) != 0)675return (err);676if ((err = ddt_log_load_one(ddt, 1)) != 0)677return (err);678679VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);680VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));681VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));682VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);683684/*685* We have two finalisation tasks:686*687* - rebuild the histogram. We do this at the end rather than while688* we're loading so we don't need to uncount and recount entries that689* appear multiple times in the log.690*691* - remove entries from the flushing tree that are on both trees. This692* happens when ddt_lookup() rehydrates an entry from the flushing693* tree, as ddt_log_take_key() removes the entry from the in-memory694* tree but doesn't remove it from disk.695*/696697/*698* We don't technically need a config lock here, since there shouldn't699* be pool config changes during DDT load. dva_get_dsize_sync() via700* ddt_stat_generate() is expecting it though, and it won't hurt701* anything, so we take it.702*/703spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);704705avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;706avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;707ddt_log_entry_t *ae = avl_first(al);708ddt_log_entry_t *fe = avl_first(fl);709while (ae != NULL || fe != NULL) {710ddt_log_entry_t *ddle;711if (ae == NULL) {712/* active exhausted, take flushing */713ddle = fe;714fe = AVL_NEXT(fl, fe);715} else if (fe == NULL) {716/* flushing exuhausted, take active */717ddle = ae;718ae = AVL_NEXT(al, ae);719} else {720/* compare active and flushing */721int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);722if (c < 0) {723/* active behind, take and advance */724ddle = ae;725ae = AVL_NEXT(al, ae);726} else if (c > 0) {727/* flushing behind, take and advance */728ddle = fe;729fe = AVL_NEXT(fl, fe);730} else {731/* match. remove from flushing, take active */732ddle = fe;733fe = AVL_NEXT(fl, fe);734avl_remove(fl, ddle);735ddt_log_free_entry(ddt, ddle);736ddle = ae;737ae = AVL_NEXT(al, ae);738}739}740741ddt_lightweight_entry_t ddlwe;742DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);743ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);744}745746spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);747748ddt_log_update_stats(ddt);749750return (0);751}752753void754ddt_log_alloc(ddt_t *ddt)755{756ASSERT0P(ddt->ddt_log_active);757ASSERT0P(ddt->ddt_log_flushing);758759avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,760sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));761avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,762sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));763ddt->ddt_log_active = &ddt->ddt_log[0];764ddt->ddt_log_flushing = &ddt->ddt_log[1];765ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;766}767768void769ddt_log_free(ddt_t *ddt)770{771ddt_log_empty(ddt, &ddt->ddt_log[0]);772ddt_log_empty(ddt, &ddt->ddt_log[1]);773avl_destroy(&ddt->ddt_log[0].ddl_tree);774avl_destroy(&ddt->ddt_log[1].ddl_tree);775}776777ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,778"Max transactions before starting to flush dedup logs");779780ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,781"Max memory for dedup logs");782783ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,784"Max memory for dedup logs, as % of total memory");785786787