Path: blob/main/sys/contrib/openzfs/module/zfs/ddt_log.c
108011 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/2122/*23* Copyright (c) 2023, Klara Inc.24*/2526#include <sys/zfs_context.h>27#include <sys/spa.h>28#include <sys/ddt.h>29#include <sys/dmu_tx.h>30#include <sys/dmu.h>31#include <sys/ddt_impl.h>32#include <sys/dnode.h>33#include <sys/dbuf.h>34#include <sys/zap.h>35#include <sys/zio_checksum.h>3637/*38* No more than this many txgs before swapping logs.39*/40uint_t zfs_dedup_log_txg_max = 8;4142/*43* Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module44* load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.45*/46uint64_t zfs_dedup_log_mem_max = 0;47uint_t zfs_dedup_log_mem_max_percent = 1;484950static kmem_cache_t *ddt_log_entry_flat_cache;51static kmem_cache_t *ddt_log_entry_trad_cache;5253#define DDT_LOG_ENTRY_FLAT_SIZE \54(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)55#define DDT_LOG_ENTRY_TRAD_SIZE \56(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)5758#define DDT_LOG_ENTRY_SIZE(ddt) \59_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)6061void62ddt_log_init(void)63{64ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",65DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);66ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",67DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);6869/*70* Max memory for log AVL entries. At least 1M, because we need71* something (that's ~3800 entries per tree). They can say 100% if they72* want; it just means they're at the mercy of the the txg flush limit.73*/74if (zfs_dedup_log_mem_max == 0) {75zfs_dedup_log_mem_max_percent =76MIN(zfs_dedup_log_mem_max_percent, 100);77zfs_dedup_log_mem_max = (physmem * PAGESIZE) *78zfs_dedup_log_mem_max_percent / 100;79}80zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);81}8283void84ddt_log_fini(void)85{86kmem_cache_destroy(ddt_log_entry_trad_cache);87kmem_cache_destroy(ddt_log_entry_flat_cache);88}8990static void91ddt_log_name(ddt_t *ddt, char *name, uint_t n)92{93snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,94zio_checksum_table[ddt->ddt_checksum].ci_name, n);95}9697static void98ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)99{100dmu_buf_t *db;101VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));102dmu_buf_will_dirty(db, tx);103104ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;105DLH_SET_VERSION(hdr, 1);106DLH_SET_FLAGS(hdr, ddl->ddl_flags);107hdr->dlh_length = ddl->ddl_length;108hdr->dlh_first_txg = ddl->ddl_first_txg;109hdr->dlh_checkpoint = ddl->ddl_checkpoint;110111dmu_buf_rele(db, FTAG);112}113114static void115ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)116{117ASSERT3U(ddt->ddt_dir_object, >, 0);118ASSERT0(ddl->ddl_object);119120char name[DDT_NAMELEN];121ddt_log_name(ddt, name, n);122123ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,124DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,125DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);126VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,127sizeof (uint64_t), 1, &ddl->ddl_object, tx));128ddl->ddl_length = 0;129ddl->ddl_first_txg = tx->tx_txg;130ddt_log_update_header(ddt, ddl, tx);131}132133static void134ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)135{136ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);137ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);138}139140static void141ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)142{143ASSERT3U(ddt->ddt_dir_object, >, 0);144145if (ddl->ddl_object == 0)146return;147148ASSERT0(ddl->ddl_length);149150char name[DDT_NAMELEN];151ddt_log_name(ddt, name, n);152153VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));154VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));155156ddl->ddl_object = 0;157}158159void160ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)161{162ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);163ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);164}165166static void167ddt_log_update_stats(ddt_t *ddt)168{169/*170* Log object stats. We count the number of live entries in the log171* tree, even if there are more than on disk, and even if the same172* entry is on both append and flush trees, because that's more what173* the user expects to see. This does mean the on-disk size is not174* really correlated with the number of entries, but I don't think175* that's reasonable to expect anyway.176*/177dmu_object_info_t doi;178uint64_t nblocks = 0;179if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object,180&doi) == 0)181nblocks += doi.doi_physical_blocks_512;182if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object,183&doi) == 0)184nblocks += doi.doi_physical_blocks_512;185186ddt_object_t *ddo = &ddt->ddt_log_stats;187ddo->ddo_count =188avl_numnodes(&ddt->ddt_log_active->ddl_tree) +189avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);190ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);191ddo->ddo_dspace = nblocks << 9;192}193194void195ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)196{197ASSERT3U(nentries, >, 0);198ASSERT0P(dlu->dlu_dbp);199200if (ddt->ddt_log_active->ddl_object == 0)201ddt_log_create(ddt, tx);202203/*204* We want to store as many entries as we can in a block, but never205* split an entry across block boundaries.206*/207size_t reclen = P2ALIGN_TYPED(208sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +209DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);210ASSERT3U(reclen, <=, UINT16_MAX);211dlu->dlu_reclen = reclen;212213VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,214&dlu->dlu_dn));215dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);216217uint64_t nblocks = howmany(nentries,218dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);219uint64_t offset = ddt->ddt_log_active->ddl_length;220uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;221222VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,223B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,224DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO));225226dlu->dlu_tx = tx;227dlu->dlu_block = dlu->dlu_offset = 0;228}229230static ddt_log_entry_t *231ddt_log_alloc_entry(ddt_t *ddt)232{233ddt_log_entry_t *ddle;234235if (ddt->ddt_flags & DDT_FLAG_FLAT) {236ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);237memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);238} else {239ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);240memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);241}242243return (ddle);244}245246static void247ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)248{249kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?250ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);251}252253static void254ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe,255boolean_t hist)256{257/* Create the log tree entry from a live or stored entry */258avl_index_t where;259ddt_log_entry_t *ddle =260avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);261if (ddle == NULL) {262ddle = ddt_log_alloc_entry(ddt);263ddle->ddle_key = ddlwe->ddlwe_key;264avl_insert(&ddl->ddl_tree, ddle, where);265} else if (hist) {266ddt_lightweight_entry_t oddlwe;267DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &oddlwe);268ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &oddlwe);269}270if (hist)271ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);272ddle->ddle_type = ddlwe->ddlwe_type;273ddle->ddle_class = ddlwe->ddlwe_class;274memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));275}276277void278ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)279{280ASSERT3U(dlu->dlu_dbp, !=, NULL);281282ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe, B_TRUE);283284/* Get our block */285ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);286dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];287288/*289* If this would take us past the end of the block, finish it and290* move to the next one.291*/292if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {293ASSERT3U(dlu->dlu_offset, >, 0);294dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);295dlu->dlu_block++;296dlu->dlu_offset = 0;297ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);298db = dlu->dlu_dbp[dlu->dlu_block];299}300301/*302* If this is the first time touching the block, inform the DMU that303* we will fill it, and zero it out.304*/305if (dlu->dlu_offset == 0) {306dmu_buf_will_fill_flags(db, dlu->dlu_tx, B_FALSE,307DMU_UNCACHEDIO);308memset(db->db_data, 0, db->db_size);309}310311/* Create the log record directly in the buffer */312ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);313DLR_SET_TYPE(dlr, DLR_ENTRY);314DLR_SET_RECLEN(dlr, dlu->dlu_reclen);315DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);316DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);317318ddt_log_record_entry_t *dlre =319(ddt_log_record_entry_t *)&dlr->dlr_payload;320dlre->dlre_key = ddlwe->ddlwe_key;321memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));322323/* Advance offset for next record. */324dlu->dlu_offset += dlu->dlu_reclen;325}326327void328ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)329{330ASSERT3U(dlu->dlu_dbp, !=, NULL);331ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);332ASSERT3U(dlu->dlu_offset, >, 0);333334/*335* Close out the last block. Whatever we haven't used will be zeroed,336* which matches DLR_INVALID, so we can detect this during load.337*/338dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);339340dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);341342ddt->ddt_log_active->ddl_length +=343dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;344dnode_rele(dlu->dlu_dn, FTAG);345346ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);347348memset(dlu, 0, sizeof (ddt_log_update_t));349350ddt_log_update_stats(ddt);351}352353boolean_t354ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)355{356ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);357if (ddle == NULL)358return (B_FALSE);359360DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);361362ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);363364avl_remove(&ddl->ddl_tree, ddle);365ddt_log_free_entry(ddt, ddle);366367return (B_TRUE);368}369370boolean_t371ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)372{373ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);374if (ddle == NULL)375return (B_FALSE);376377ddt_lightweight_entry_t ddlwe;378DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);379ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);380381avl_remove(&ddl->ddl_tree, ddle);382ddt_log_free_entry(ddt, ddle);383384return (B_TRUE);385}386387boolean_t388ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,389ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing)390{391ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree,392ddk, NULL);393if (ddle) {394if (from_flushing)395*from_flushing = B_FALSE;396} else {397ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);398if (!ddle)399return (B_FALSE);400if (from_flushing)401*from_flushing = B_TRUE;402}403if (ddlwe)404DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);405return (B_TRUE);406}407408void409ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)410{411ddt_log_t *ddl = ddt->ddt_log_flushing;412413ASSERT3U(ddl->ddl_object, !=, 0);414415#ifdef ZFS_DEBUG416/*417* There should not be any entries on the log tree before the given418* checkpoint. Assert that this is the case.419*/420ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);421if (ddle != NULL)422VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),423>, 0);424#endif425426ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;427ddl->ddl_checkpoint = ddlwe->ddlwe_key;428ddt_log_update_header(ddt, ddl, tx);429430ddt_log_update_stats(ddt);431}432433void434ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)435{436ddt_log_t *ddl = ddt->ddt_log_flushing;437438if (ddl->ddl_object == 0)439return;440441ASSERT(avl_is_empty(&ddl->ddl_tree));442443/* Eject the entire object */444dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);445446ddl->ddl_length = 0;447ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;448memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));449ddt_log_update_header(ddt, ddl, tx);450451ddt_log_update_stats(ddt);452}453454boolean_t455ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)456{457/* Swap the logs. The old flushing one must be empty */458VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));459460/*461* If there are still blocks on the flushing log, truncate it first.462* This can happen if there were entries on the flushing log that were463* removed in memory via ddt_lookup(); their vestigal remains are464* on disk.465*/466if (ddt->ddt_log_flushing->ddl_length > 0)467ddt_log_truncate(ddt, tx);468469/*470* Swap policy. We swap the logs (and so begin flushing) when the471* active tree grows too large, or when we haven't swapped it in472* some amount of time, or if something has requested the logs be473* flushed ASAP (see ddt_walk_init()).474*/475476/*477* The log tree is too large if the memory usage of its entries is over478* half of the memory limit. This effectively gives each log tree half479* the available memory.480*/481const boolean_t too_large =482(avl_numnodes(&ddt->ddt_log_active->ddl_tree) *483DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);484485const boolean_t too_old =486tx->tx_txg >=487(ddt->ddt_log_active->ddl_first_txg +488MAX(1, zfs_dedup_log_txg_max));489490const boolean_t force =491ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;492493if (!(too_large || too_old || force))494return (B_FALSE);495496ddt_log_t *swap = ddt->ddt_log_active;497ddt->ddt_log_active = ddt->ddt_log_flushing;498ddt->ddt_log_flushing = swap;499500ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);501ddt->ddt_log_active->ddl_flags &=502~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);503504ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));505ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;506507ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;508509ddt_log_update_header(ddt, ddt->ddt_log_active, tx);510ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);511512ddt_log_update_stats(ddt);513514return (B_TRUE);515}516517static inline void518ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,519const ddt_key_t *checkpoint)520{521ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);522523ddt_log_record_entry_t *dlre =524(ddt_log_record_entry_t *)dlr->dlr_payload;525if (checkpoint != NULL &&526ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {527/* Skip pre-checkpoint entries; they're already flushed. */528return;529}530531ddt_lightweight_entry_t ddlwe;532ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);533ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);534535ddlwe.ddlwe_key = dlre->dlre_key;536memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));537538ddt_log_update_entry(ddt, ddl, &ddlwe, B_FALSE);539}540541static void542ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)543{544void *cookie = NULL;545ddt_log_entry_t *ddle;546IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));547while ((ddle =548avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {549ddt_log_free_entry(ddt, ddle);550}551ASSERT(avl_is_empty(&ddl->ddl_tree));552}553554static int555ddt_log_load_one(ddt_t *ddt, uint_t n)556{557ASSERT3U(n, <, 2);558559ddt_log_t *ddl = &ddt->ddt_log[n];560561char name[DDT_NAMELEN];562ddt_log_name(ddt, name, n);563564uint64_t obj;565int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,566sizeof (uint64_t), 1, &obj);567if (err == ENOENT)568return (0);569if (err != 0)570return (err);571572dnode_t *dn;573err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);574if (err != 0)575return (err);576577ddt_log_header_t hdr;578dmu_buf_t *db;579err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);580if (err != 0) {581dnode_rele(dn, FTAG);582return (err);583}584memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));585dmu_buf_rele(db, FTAG);586587if (DLH_GET_VERSION(&hdr) != 1) {588dnode_rele(dn, FTAG);589zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "590"unknown version=%llu", spa_name(ddt->ddt_spa), name,591(u_longlong_t)DLH_GET_VERSION(&hdr));592return (SET_ERROR(EINVAL));593}594595ddt_key_t *checkpoint = NULL;596if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {597/*598* If the log has a checkpoint, then we can ignore any entries599* that have already been flushed.600*/601ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);602checkpoint = &hdr.dlh_checkpoint;603}604605if (hdr.dlh_length > 0) {606dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,607ZIO_PRIORITY_SYNC_READ);608609for (uint64_t offset = 0; offset < hdr.dlh_length;610offset += dn->dn_datablksz) {611err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,612DMU_READ_PREFETCH | DMU_UNCACHEDIO);613if (err != 0) {614dnode_rele(dn, FTAG);615ddt_log_empty(ddt, ddl);616return (err);617}618619uint64_t boffset = 0;620while (boffset < db->db_size) {621ddt_log_record_t *dlr =622(ddt_log_record_t *)(db->db_data + boffset);623624/* Partially-filled block, skip the rest */625if (DLR_GET_TYPE(dlr) == DLR_INVALID)626break;627628switch (DLR_GET_TYPE(dlr)) {629case DLR_ENTRY:630ddt_log_load_entry(ddt, ddl, dlr,631checkpoint);632break;633634default:635dmu_buf_rele(db, FTAG);636dnode_rele(dn, FTAG);637ddt_log_empty(ddt, ddl);638return (SET_ERROR(EINVAL));639}640641boffset += DLR_GET_RECLEN(dlr);642}643644dmu_buf_rele(db, FTAG);645}646}647648dnode_rele(dn, FTAG);649650ddl->ddl_object = obj;651ddl->ddl_flags = DLH_GET_FLAGS(&hdr);652ddl->ddl_length = hdr.dlh_length;653ddl->ddl_first_txg = hdr.dlh_first_txg;654655if (ddl->ddl_flags & DDL_FLAG_FLUSHING)656ddt->ddt_log_flushing = ddl;657else658ddt->ddt_log_active = ddl;659660return (0);661}662663int664ddt_log_load(ddt_t *ddt)665{666int err;667668if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {669/*670* The DDT is going to be freed again in a moment, so there's671* no point loading the log; it'll just slow down import.672*/673return (0);674}675676ASSERT0(ddt->ddt_log[0].ddl_object);677ASSERT0(ddt->ddt_log[1].ddl_object);678if (ddt->ddt_dir_object == 0) {679/*680* If we're configured but the containing dir doesn't exist681* yet, then the log object can't possibly exist either.682*/683ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);684return (SET_ERROR(ENOENT));685}686687if ((err = ddt_log_load_one(ddt, 0)) != 0)688return (err);689if ((err = ddt_log_load_one(ddt, 1)) != 0)690return (err);691692VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);693VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));694VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));695VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);696697/*698* We have two finalisation tasks:699*700* - rebuild the histogram. We do this at the end rather than while701* we're loading so we don't need to uncount and recount entries that702* appear multiple times in the log.703*704* - remove entries from the flushing tree that are on both trees. This705* happens when ddt_lookup() rehydrates an entry from the flushing706* tree, as ddt_log_take_key() removes the entry from the in-memory707* tree but doesn't remove it from disk.708*/709710/*711* We don't technically need a config lock here, since there shouldn't712* be pool config changes during DDT load. dva_get_dsize_sync() via713* ddt_stat_generate() is expecting it though, and it won't hurt714* anything, so we take it.715*/716spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);717718avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;719avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;720ddt_log_entry_t *ae = avl_first(al);721ddt_log_entry_t *fe = avl_first(fl);722while (ae != NULL || fe != NULL) {723ddt_log_entry_t *ddle;724if (ae == NULL) {725/* active exhausted, take flushing */726ddle = fe;727fe = AVL_NEXT(fl, fe);728} else if (fe == NULL) {729/* flushing exuhausted, take active */730ddle = ae;731ae = AVL_NEXT(al, ae);732} else {733/* compare active and flushing */734int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);735if (c < 0) {736/* active behind, take and advance */737ddle = ae;738ae = AVL_NEXT(al, ae);739} else if (c > 0) {740/* flushing behind, take and advance */741ddle = fe;742fe = AVL_NEXT(fl, fe);743} else {744/* match. remove from flushing, take active */745ddle = fe;746fe = AVL_NEXT(fl, fe);747avl_remove(fl, ddle);748ddt_log_free_entry(ddt, ddle);749ddle = ae;750ae = AVL_NEXT(al, ae);751}752}753754ddt_lightweight_entry_t ddlwe;755DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);756ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);757}758759spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);760761ddt_log_update_stats(ddt);762763return (0);764}765766void767ddt_log_alloc(ddt_t *ddt)768{769ASSERT0P(ddt->ddt_log_active);770ASSERT0P(ddt->ddt_log_flushing);771772avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,773sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));774avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,775sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));776ddt->ddt_log_active = &ddt->ddt_log[0];777ddt->ddt_log_flushing = &ddt->ddt_log[1];778ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;779}780781void782ddt_log_free(ddt_t *ddt)783{784ddt_log_empty(ddt, &ddt->ddt_log[0]);785ddt_log_empty(ddt, &ddt->ddt_log[1]);786avl_destroy(&ddt->ddt_log[0].ddl_tree);787avl_destroy(&ddt->ddt_log[1].ddl_tree);788}789790ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,791"Max transactions before starting to flush dedup logs");792793ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,794"Max memory for dedup logs");795796ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,797"Max memory for dedup logs, as % of total memory");798799800