Path: blob/main/sys/contrib/openzfs/module/zfs/dsl_dataset.c
107275 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/2122/*23* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.24* Copyright (c) 2011, 2020 by Delphix. All rights reserved.25* Copyright (c) 2014, Joyent, Inc. All rights reserved.26* Copyright (c) 2014 RackTop Systems.27* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.28* Copyright (c) 2016 Actifio, Inc. All rights reserved.29* Copyright 2016, OmniTI Computer Consulting, Inc. 
All rights reserved.30* Copyright 2017 Nexenta Systems, Inc.31* Copyright (c) 2019, Klara Inc.32* Copyright (c) 2019, Allan Jude33* Copyright (c) 2020 The FreeBSD Foundation [1]34* Copyright (c) 2025, Rob Norris <[email protected]>35*36* [1] Portions of this software were developed by Allan Jude37* under sponsorship from the FreeBSD Foundation.38*/3940#include <sys/dmu_objset.h>41#include <sys/dsl_dataset.h>42#include <sys/dsl_dir.h>43#include <sys/dsl_prop.h>44#include <sys/dsl_synctask.h>45#include <sys/dmu_traverse.h>46#include <sys/dmu_impl.h>47#include <sys/dmu_tx.h>48#include <sys/arc.h>49#include <sys/zio.h>50#include <sys/zap.h>51#include <sys/zfeature.h>52#include <sys/unique.h>53#include <sys/zfs_context.h>54#include <sys/zfs_ioctl.h>55#include <sys/spa.h>56#include <sys/spa_impl.h>57#include <sys/vdev.h>58#include <sys/zfs_znode.h>59#include <sys/zfs_onexit.h>60#include <sys/zvol.h>61#include <sys/dsl_scan.h>62#include <sys/dsl_deadlist.h>63#include <sys/dsl_destroy.h>64#include <sys/dsl_userhold.h>65#include <sys/dsl_bookmark.h>66#include <sys/policy.h>67#include <sys/dmu_send.h>68#include <sys/dmu_recv.h>69#include <sys/zio_compress.h>70#include <zfs_fletcher.h>71#include <sys/zio_checksum.h>72#include <sys/brt.h>7374/*75* The SPA supports block sizes up to 16MB. However, very large blocks76* can have an impact on i/o latency (e.g. tying up a spinning disk for77* ~300ms), and also potentially on the memory allocator. Therefore,78* we did not allow the recordsize to be set larger than zfs_max_recordsize79* (former default: 1MB). 
Larger blocks could be created by changing this80* tunable, and pools with larger blocks could always be imported and used,81* regardless of this setting.82*83* We do, however, still limit it by default to 1M on x86_32, because Linux's84* 3/1 memory split doesn't leave much room for 16M chunks.85*/86#ifdef _ILP3287uint_t zfs_max_recordsize = 1 * 1024 * 1024;88#else89uint_t zfs_max_recordsize = 16 * 1024 * 1024;90#endif91static int zfs_allow_redacted_dataset_mount = 0;9293int zfs_snapshot_history_enabled = 1;9495#define SWITCH64(x, y) \96{ \97uint64_t __tmp = (x); \98(x) = (y); \99(y) = __tmp; \100}101102#define DS_REF_MAX (1ULL << 62)103104static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,105uint64_t obj, dmu_tx_t *tx);106static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,107dmu_tx_t *tx);108109static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f);110111extern uint_t spa_asize_inflation;112113static zil_header_t zero_zil;114115/*116* Figure out how much of this delta should be propagated to the dsl_dir117* layer. 
If there's a refreservation, that space has already been118* partially accounted for in our ancestors.119*/120static int64_t121parent_delta(dsl_dataset_t *ds, int64_t delta)122{123dsl_dataset_phys_t *ds_phys;124uint64_t old_bytes, new_bytes;125126if (ds->ds_reserved == 0)127return (delta);128129ds_phys = dsl_dataset_phys(ds);130old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);131new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);132133ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));134return (new_bytes - old_bytes);135}136137void138dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)139{140spa_t *spa = dmu_tx_pool(tx)->dp_spa;141int used = bp_get_dsize_sync(spa, bp);142int compressed = BP_GET_PSIZE(bp);143int uncompressed = BP_GET_UCSIZE(bp);144int64_t delta;145spa_feature_t f;146147dprintf_bp(bp, "ds=%p", ds);148149ASSERT(dmu_tx_is_syncing(tx));150/* It could have been compressed away to nothing */151if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))152return;153ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);154ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));155if (ds == NULL) {156dsl_pool_mos_diduse_space(tx->tx_pool,157used, compressed, uncompressed);158return;159}160161ASSERT3U(BP_GET_BIRTH(bp), >,162dsl_dataset_phys(ds)->ds_prev_snap_txg);163/* ds_dbuf is pre-dirtied in dsl_dataset_sync(). 
*/164ASSERT(dmu_buf_is_dirty(ds->ds_dbuf, tx));165mutex_enter(&ds->ds_lock);166delta = parent_delta(ds, used);167dsl_dataset_phys(ds)->ds_referenced_bytes += used;168dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;169dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;170dsl_dataset_phys(ds)->ds_unique_bytes += used;171172if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {173ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] =174(void *)B_TRUE;175}176177178f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));179if (f != SPA_FEATURE_NONE) {180ASSERT3S(spa_feature_table[f].fi_type, ==,181ZFEATURE_TYPE_BOOLEAN);182ds->ds_feature_activation[f] = (void *)B_TRUE;183}184185f = zio_compress_to_feature(BP_GET_COMPRESS(bp));186if (f != SPA_FEATURE_NONE) {187ASSERT3S(spa_feature_table[f].fi_type, ==,188ZFEATURE_TYPE_BOOLEAN);189ds->ds_feature_activation[f] = (void *)B_TRUE;190}191192/*193* Track block for livelist, but ignore embedded blocks because194* they do not need to be freed.195*/196if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&197BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&198!(BP_IS_EMBEDDED(bp))) {199ASSERT(dsl_dir_is_clone(ds->ds_dir));200ASSERT(spa_feature_is_enabled(spa,201SPA_FEATURE_LIVELIST));202bplist_append(&ds->ds_dir->dd_pending_allocs, bp);203}204205mutex_exit(&ds->ds_lock);206dsl_dir_diduse_transfer_space(ds->ds_dir, delta,207compressed, uncompressed, used,208DD_USED_REFRSRV, DD_USED_HEAD, tx);209}210211/*212* Called when the specified segment has been remapped, and is thus no213* longer referenced in the head dataset. 
The vdev must be indirect.214*215* If the segment is referenced by a snapshot, put it on the remap deadlist.216* Otherwise, add this segment to the obsolete spacemap.217*/218void219dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,220uint64_t size, uint64_t birth, dmu_tx_t *tx)221{222spa_t *spa = ds->ds_dir->dd_pool->dp_spa;223224ASSERT(dmu_tx_is_syncing(tx));225ASSERT(birth <= tx->tx_txg);226ASSERT(!ds->ds_is_snapshot);227228if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {229spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);230} else {231blkptr_t fakebp;232dva_t *dva = &fakebp.blk_dva[0];233234ASSERT(ds != NULL);235236mutex_enter(&ds->ds_remap_deadlist_lock);237if (!dsl_dataset_remap_deadlist_exists(ds)) {238dsl_dataset_create_remap_deadlist(ds, tx);239}240mutex_exit(&ds->ds_remap_deadlist_lock);241242BP_ZERO(&fakebp);243BP_SET_LOGICAL_BIRTH(&fakebp, birth);244DVA_SET_VDEV(dva, vdev);245DVA_SET_OFFSET(dva, offset);246DVA_SET_ASIZE(dva, size);247dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE,248tx);249}250}251252int253dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,254boolean_t async)255{256spa_t *spa = dmu_tx_pool(tx)->dp_spa;257258int used = bp_get_dsize_sync(spa, bp);259int compressed = BP_GET_PSIZE(bp);260int uncompressed = BP_GET_UCSIZE(bp);261262if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))263return (0);264265ASSERT(dmu_tx_is_syncing(tx));266ASSERT(BP_GET_BIRTH(bp) <= tx->tx_txg);267268if (ds == NULL) {269dsl_free(tx->tx_pool, tx->tx_txg, bp);270dsl_pool_mos_diduse_space(tx->tx_pool,271-used, -compressed, -uncompressed);272return (used);273}274ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);275276ASSERT(!ds->ds_is_snapshot);277/* ds_dbuf is pre-dirtied in dsl_dataset_sync(). 
*/278ASSERT(dmu_buf_is_dirty(ds->ds_dbuf, tx));279280/*281* Track block for livelist, but ignore embedded blocks because282* they do not need to be freed.283*/284if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&285BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&286!(BP_IS_EMBEDDED(bp))) {287ASSERT(dsl_dir_is_clone(ds->ds_dir));288ASSERT(spa_feature_is_enabled(spa,289SPA_FEATURE_LIVELIST));290bplist_append(&ds->ds_dir->dd_pending_frees, bp);291}292293if (BP_GET_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {294int64_t delta;295296/*297* Put blocks that would create IO on the pool's deadlist for298* dsl_process_async_destroys() to find. This is to prevent299* zio_free() from creating a ZIO_TYPE_FREE IO for them, which300* are very heavy and can lead to out-of-memory conditions if301* something tries to free millions of blocks on the same txg.302*/303boolean_t defer = spa_version(spa) >= SPA_VERSION_DEADLISTS &&304(BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||305brt_maybe_exists(spa, bp));306307if (defer) {308dprintf_bp(bp, "putting on free list: %s", "");309bpobj_enqueue(&ds->ds_dir->dd_pool->dp_free_bpobj,310bp, B_FALSE, tx);311} else {312dprintf_bp(bp, "freeing ds=%llu",313(u_longlong_t)ds->ds_object);314dsl_free(tx->tx_pool, tx->tx_txg, bp);315}316317mutex_enter(&ds->ds_lock);318ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||319!DS_UNIQUE_IS_ACCURATE(ds));320delta = parent_delta(ds, -used);321dsl_dataset_phys(ds)->ds_unique_bytes -= used;322mutex_exit(&ds->ds_lock);323324dsl_dir_diduse_transfer_space(ds->ds_dir,325delta, -compressed, -uncompressed, -used,326DD_USED_REFRSRV, DD_USED_HEAD, tx);327328if (defer)329dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,330DD_USED_HEAD, used, compressed, uncompressed, tx);331} else {332dprintf_bp(bp, "putting on dead list: %s", "");333if (async) {334/*335* We are here as part of zio's write done callback,336* which means we're a zio interrupt thread. 
We can't337* call dsl_deadlist_insert() now because it may block338* waiting for I/O. Instead, put bp on the deferred339* queue and let dsl_pool_sync() finish the job.340*/341bplist_append(&ds->ds_pending_deadlist, bp);342} else {343dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);344}345ASSERT3U(ds->ds_prev->ds_object, ==,346dsl_dataset_phys(ds)->ds_prev_snap_obj);347ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);348/* if (logical birth > prev prev snap txg) prev unique += bs */349if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==350ds->ds_object && BP_GET_BIRTH(bp) >351dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {352dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);353mutex_enter(&ds->ds_prev->ds_lock);354dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;355mutex_exit(&ds->ds_prev->ds_lock);356}357if (BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {358dsl_dir_transfer_space(ds->ds_dir, used,359DD_USED_HEAD, DD_USED_SNAP, tx);360}361}362363dsl_bookmark_block_killed(ds, bp, tx);364365mutex_enter(&ds->ds_lock);366ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);367dsl_dataset_phys(ds)->ds_referenced_bytes -= used;368ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);369dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;370ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);371dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;372mutex_exit(&ds->ds_lock);373374return (used);375}376377struct feature_type_uint64_array_arg {378uint64_t length;379uint64_t *array;380};381382static void383unload_zfeature(dsl_dataset_t *ds, spa_feature_t f)384{385switch (spa_feature_table[f].fi_type) {386case ZFEATURE_TYPE_BOOLEAN:387break;388case ZFEATURE_TYPE_UINT64_ARRAY:389{390struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];391kmem_free(ftuaa->array, ftuaa->length * sizeof (uint64_t));392kmem_free(ftuaa, sizeof (*ftuaa));393break;394}395default:396panic("Invalid zfeature type %d", 
spa_feature_table[f].fi_type);397}398}399400static int401load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f)402{403int err = 0;404switch (spa_feature_table[f].fi_type) {405case ZFEATURE_TYPE_BOOLEAN:406err = zap_contains(mos, ds->ds_object,407spa_feature_table[f].fi_guid);408if (err == 0) {409ds->ds_feature[f] = (void *)B_TRUE;410} else {411ASSERT3U(err, ==, ENOENT);412err = 0;413}414break;415case ZFEATURE_TYPE_UINT64_ARRAY:416{417uint64_t int_size, num_int;418uint64_t *data;419err = zap_length(mos, ds->ds_object,420spa_feature_table[f].fi_guid, &int_size, &num_int);421if (err != 0) {422ASSERT3U(err, ==, ENOENT);423err = 0;424break;425}426ASSERT3U(int_size, ==, sizeof (uint64_t));427data = kmem_alloc(int_size * num_int, KM_SLEEP);428VERIFY0(zap_lookup(mos, ds->ds_object,429spa_feature_table[f].fi_guid, int_size, num_int, data));430struct feature_type_uint64_array_arg *ftuaa =431kmem_alloc(sizeof (*ftuaa), KM_SLEEP);432ftuaa->length = num_int;433ftuaa->array = data;434ds->ds_feature[f] = ftuaa;435break;436}437default:438panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);439}440return (err);441}442443/*444* We have to release the fsid synchronously or we risk that a subsequent445* mount of the same dataset will fail to unique_insert the fsid. 
This446* failure would manifest itself as the fsid of this dataset changing447* between mounts which makes NFS clients quite unhappy.448*/449static void450dsl_dataset_evict_sync(void *dbu)451{452dsl_dataset_t *ds = dbu;453454ASSERT0P(ds->ds_owner);455456unique_remove(ds->ds_fsid_guid);457}458459static void460dsl_dataset_evict_async(void *dbu)461{462dsl_dataset_t *ds = dbu;463464ASSERT0P(ds->ds_owner);465466ds->ds_dbuf = NULL;467468if (ds->ds_objset != NULL)469dmu_objset_evict(ds->ds_objset);470471if (ds->ds_prev) {472dsl_dataset_rele(ds->ds_prev, ds);473ds->ds_prev = NULL;474}475476dsl_bookmark_fini_ds(ds);477478bplist_destroy(&ds->ds_pending_deadlist);479if (dsl_deadlist_is_open(&ds->ds_deadlist))480dsl_deadlist_close(&ds->ds_deadlist);481if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))482dsl_deadlist_close(&ds->ds_remap_deadlist);483if (ds->ds_dir)484dsl_dir_async_rele(ds->ds_dir, ds);485486ASSERT(!list_link_active(&ds->ds_synced_link));487488for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {489if (dsl_dataset_feature_is_active(ds, f))490unload_zfeature(ds, f);491}492493list_destroy(&ds->ds_prop_cbs);494mutex_destroy(&ds->ds_lock);495mutex_destroy(&ds->ds_opening_lock);496mutex_destroy(&ds->ds_sendstream_lock);497mutex_destroy(&ds->ds_remap_deadlist_lock);498zfs_refcount_destroy(&ds->ds_longholds);499rrw_destroy(&ds->ds_bp_rwlock);500501kmem_free(ds, sizeof (dsl_dataset_t));502}503504int505dsl_dataset_get_snapname(dsl_dataset_t *ds)506{507dsl_dataset_phys_t *headphys;508int err;509dmu_buf_t *headdbuf;510dsl_pool_t *dp = ds->ds_dir->dd_pool;511objset_t *mos = dp->dp_meta_objset;512513if (ds->ds_snapname[0])514return (0);515if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)516return (0);517518err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,519FTAG, &headdbuf);520if (err != 0)521return (err);522headphys = headdbuf->db_data;523err = zap_value_search(dp->dp_meta_objset,524headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname,525sizeof 
(ds->ds_snapname));526if (err != 0 && zfs_recover == B_TRUE) {527err = 0;528(void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname),529"SNAPOBJ=%llu-ERR=%d",530(unsigned long long)ds->ds_object, err);531}532dmu_buf_rele(headdbuf, FTAG);533return (err);534}535536int537dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)538{539objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;540uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;541matchtype_t mt = 0;542int err;543544if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)545mt = MT_NORMALIZE;546547err = zap_lookup_norm(mos, snapobj, name, 8, 1,548value, mt, NULL, 0, NULL);549if (err == ENOTSUP && (mt & MT_NORMALIZE))550err = zap_lookup(mos, snapobj, name, 8, 1, value);551return (err);552}553554int555dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,556boolean_t adj_cnt)557{558objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;559uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;560matchtype_t mt = 0;561int err;562563dsl_dir_snap_cmtime_update(ds->ds_dir, tx);564565if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)566mt = MT_NORMALIZE;567568err = zap_remove_norm(mos, snapobj, name, mt, tx);569if (err == ENOTSUP && (mt & MT_NORMALIZE))570err = zap_remove(mos, snapobj, name, tx);571572if (err == 0 && adj_cnt)573dsl_fs_ss_count_adjust(ds->ds_dir, -1,574DD_FIELD_SNAPSHOT_COUNT, tx);575576return (err);577}578579boolean_t580dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, const void *tag)581{582dmu_buf_t *dbuf = ds->ds_dbuf;583boolean_t result = B_FALSE;584585if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,586ds->ds_object, DMU_BONUS_BLKID, tag)) {587588if (ds == dmu_buf_get_user(dbuf))589result = B_TRUE;590else591dmu_buf_rele(dbuf, tag);592}593594return (result);595}596597int598dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag,599dsl_dataset_t **dsp)600{601objset_t *mos = 
dp->dp_meta_objset;602dmu_buf_t *dbuf;603dsl_dataset_t *ds;604int err;605dmu_object_info_t doi;606607ASSERT(dsl_pool_config_held(dp));608609err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);610if (err != 0)611return (err);612613/* Make sure dsobj has the correct object type. */614dmu_object_info_from_db(dbuf, &doi);615if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {616dmu_buf_rele(dbuf, tag);617return (SET_ERROR(EINVAL));618}619620ds = dmu_buf_get_user(dbuf);621if (ds == NULL) {622dsl_dataset_t *winner = NULL;623624ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);625ds->ds_dbuf = dbuf;626ds->ds_object = dsobj;627ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;628list_link_init(&ds->ds_synced_link);629630err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,631NULL, ds, &ds->ds_dir);632if (err != 0) {633kmem_free(ds, sizeof (dsl_dataset_t));634dmu_buf_rele(dbuf, tag);635return (err);636}637638mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);639mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);640mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);641mutex_init(&ds->ds_remap_deadlist_lock,642NULL, MUTEX_DEFAULT, NULL);643rrw_init(&ds->ds_bp_rwlock, B_FALSE);644zfs_refcount_create(&ds->ds_longholds);645646bplist_create(&ds->ds_pending_deadlist);647648list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t),649offsetof(dmu_sendstatus_t, dss_link));650651list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),652offsetof(dsl_prop_cb_record_t, cbr_ds_node));653654if (doi.doi_type == DMU_OTN_ZAP_METADATA) {655spa_feature_t f;656657for (f = 0; f < SPA_FEATURES; f++) {658if (!(spa_feature_table[f].fi_flags &659ZFEATURE_FLAG_PER_DATASET))660continue;661err = load_zfeature(mos, ds, f);662}663}664665if (!ds->ds_is_snapshot) {666ds->ds_snapname[0] = '\0';667if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {668err = dsl_dataset_hold_obj(dp,669dsl_dataset_phys(ds)->ds_prev_snap_obj,670ds, &ds->ds_prev);671}672if (err != 0)673goto 
after_dsl_bookmark_fini;674err = dsl_bookmark_init_ds(ds);675} else {676if (zfs_flags & ZFS_DEBUG_SNAPNAMES)677err = dsl_dataset_get_snapname(ds);678if (err == 0 &&679dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {680err = zap_count(681ds->ds_dir->dd_pool->dp_meta_objset,682dsl_dataset_phys(ds)->ds_userrefs_obj,683&ds->ds_userrefs);684}685}686687if (err == 0 && !ds->ds_is_snapshot) {688err = dsl_prop_get_int_ds(ds,689zfs_prop_to_name(ZFS_PROP_REFRESERVATION),690&ds->ds_reserved);691if (err == 0) {692err = dsl_prop_get_int_ds(ds,693zfs_prop_to_name(ZFS_PROP_REFQUOTA),694&ds->ds_quota);695}696} else {697ds->ds_reserved = ds->ds_quota = 0;698}699700if (err == 0 && ds->ds_dir->dd_crypto_obj != 0 &&701ds->ds_is_snapshot &&702zap_contains(mos, dsobj, DS_FIELD_IVSET_GUID) != 0) {703dp->dp_spa->spa_errata =704ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;705}706707if (err == 0) {708err = dsl_deadlist_open(&ds->ds_deadlist,709mos, dsl_dataset_phys(ds)->ds_deadlist_obj);710}711if (err == 0) {712uint64_t remap_deadlist_obj =713dsl_dataset_get_remap_deadlist_object(ds);714if (remap_deadlist_obj != 0) {715err = dsl_deadlist_open(&ds->ds_remap_deadlist,716mos, remap_deadlist_obj);717}718}719720dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,721dsl_dataset_evict_async, &ds->ds_dbuf);722if (err == 0)723winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);724725if (err != 0 || winner != NULL) {726if (dsl_deadlist_is_open(&ds->ds_deadlist))727dsl_deadlist_close(&ds->ds_deadlist);728if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))729dsl_deadlist_close(&ds->ds_remap_deadlist);730dsl_bookmark_fini_ds(ds);731after_dsl_bookmark_fini:732if (ds->ds_prev)733dsl_dataset_rele(ds->ds_prev, ds);734dsl_dir_rele(ds->ds_dir, ds);735for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {736if (dsl_dataset_feature_is_active(ds, f))737unload_zfeature(ds, 
f);738}739740list_destroy(&ds->ds_prop_cbs);741list_destroy(&ds->ds_sendstreams);742bplist_destroy(&ds->ds_pending_deadlist);743mutex_destroy(&ds->ds_lock);744mutex_destroy(&ds->ds_opening_lock);745mutex_destroy(&ds->ds_sendstream_lock);746mutex_destroy(&ds->ds_remap_deadlist_lock);747zfs_refcount_destroy(&ds->ds_longholds);748rrw_destroy(&ds->ds_bp_rwlock);749kmem_free(ds, sizeof (dsl_dataset_t));750if (err != 0) {751dmu_buf_rele(dbuf, tag);752return (err);753}754ds = winner;755} else {756ds->ds_fsid_guid =757unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);758if (ds->ds_fsid_guid !=759dsl_dataset_phys(ds)->ds_fsid_guid) {760zfs_dbgmsg("ds_fsid_guid changed from "761"%llx to %llx for pool %s dataset id %llu",762(long long)763dsl_dataset_phys(ds)->ds_fsid_guid,764(long long)ds->ds_fsid_guid,765spa_name(dp->dp_spa),766(u_longlong_t)dsobj);767}768}769}770771ASSERT3P(ds->ds_dbuf, ==, dbuf);772ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);773ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||774spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||775dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);776*dsp = ds;777778return (0);779}780781int782dsl_dataset_create_key_mapping(dsl_dataset_t *ds)783{784dsl_dir_t *dd = ds->ds_dir;785786if (dd->dd_crypto_obj == 0)787return (0);788789return (spa_keystore_create_mapping(dd->dd_pool->dp_spa,790ds, ds, &ds->ds_key_mapping));791}792793int794dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,795ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp)796{797int err;798799err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);800if (err != 0)801return (err);802803ASSERT3P(*dsp, !=, NULL);804805if (flags & DS_HOLD_FLAG_DECRYPT) {806err = dsl_dataset_create_key_mapping(*dsp);807if (err != 0)808dsl_dataset_rele(*dsp, tag);809}810811return (err);812}813814int815dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,816const void *tag, dsl_dataset_t **dsp)817{818dsl_dir_t *dd;819const char 
*snapname;820uint64_t obj;821int err = 0;822dsl_dataset_t *ds;823824err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);825if (err != 0)826return (err);827828ASSERT(dsl_pool_config_held(dp));829obj = dsl_dir_phys(dd)->dd_head_dataset_obj;830if (obj != 0)831err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds);832else833err = SET_ERROR(ENOENT);834835/* we may be looking for a snapshot */836if (err == 0 && snapname != NULL) {837dsl_dataset_t *snap_ds;838839if (*snapname++ != '@') {840dsl_dataset_rele_flags(ds, flags, tag);841dsl_dir_rele(dd, FTAG);842return (SET_ERROR(ENOENT));843}844845dprintf("looking for snapshot '%s'\n", snapname);846err = dsl_dataset_snap_lookup(ds, snapname, &obj);847if (err == 0) {848err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag,849&snap_ds);850}851dsl_dataset_rele_flags(ds, flags, tag);852853if (err == 0) {854mutex_enter(&snap_ds->ds_lock);855if (snap_ds->ds_snapname[0] == 0)856(void) strlcpy(snap_ds->ds_snapname, snapname,857sizeof (snap_ds->ds_snapname));858mutex_exit(&snap_ds->ds_lock);859ds = snap_ds;860}861}862if (err == 0)863*dsp = ds;864dsl_dir_rele(dd, FTAG);865return (err);866}867868int869dsl_dataset_hold(dsl_pool_t *dp, const char *name, const void *tag,870dsl_dataset_t **dsp)871{872return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));873}874875static int876dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,877const void *tag, boolean_t override, dsl_dataset_t **dsp)878{879int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);880if (err != 0)881return (err);882if (!dsl_dataset_tryown(*dsp, tag, override)) {883dsl_dataset_rele_flags(*dsp, flags, tag);884*dsp = NULL;885return (SET_ERROR(EBUSY));886}887return (0);888}889890891int892dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,893const void *tag, dsl_dataset_t **dsp)894{895return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp));896}897898int899dsl_dataset_own_obj_force(dsl_pool_t *dp, 
uint64_t dsobj,900ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp)901{902return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp));903}904905static int906dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,907const void *tag, boolean_t override, dsl_dataset_t **dsp)908{909int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);910if (err != 0)911return (err);912if (!dsl_dataset_tryown(*dsp, tag, override)) {913dsl_dataset_rele_flags(*dsp, flags, tag);914return (SET_ERROR(EBUSY));915}916return (0);917}918919int920dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,921const void *tag, dsl_dataset_t **dsp)922{923return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp));924}925926int927dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,928const void *tag, dsl_dataset_t **dsp)929{930return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp));931}932933/*934* See the comment above dsl_pool_hold() for details. In summary, a long935* hold is used to prevent destruction of a dataset while the pool hold936* is dropped, allowing other concurrent operations (e.g. spa_sync()).937*938* The dataset and pool must be held when this function is called. After it939* is called, the pool hold may be released while the dataset is still held940* and accessed.941*/942void943dsl_dataset_long_hold(dsl_dataset_t *ds, const void *tag)944{945ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));946(void) zfs_refcount_add(&ds->ds_longholds, tag);947}948949void950dsl_dataset_long_rele(dsl_dataset_t *ds, const void *tag)951{952(void) zfs_refcount_remove(&ds->ds_longholds, tag);953}954955/* Return B_TRUE if there are any long holds on this dataset. 
*/956boolean_t957dsl_dataset_long_held(dsl_dataset_t *ds)958{959return (!zfs_refcount_is_zero(&ds->ds_longholds));960}961962void963dsl_dataset_name(dsl_dataset_t *ds, char *name)964{965if (ds == NULL) {966(void) strlcpy(name, "mos", ZFS_MAX_DATASET_NAME_LEN);967} else {968dsl_dir_name(ds->ds_dir, name);969VERIFY0(dsl_dataset_get_snapname(ds));970if (ds->ds_snapname[0]) {971VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),972<, ZFS_MAX_DATASET_NAME_LEN);973/*974* We use a "recursive" mutex so that we975* can call dprintf_ds() with ds_lock held.976*/977if (!MUTEX_HELD(&ds->ds_lock)) {978mutex_enter(&ds->ds_lock);979VERIFY3U(strlcat(name, ds->ds_snapname,980ZFS_MAX_DATASET_NAME_LEN), <,981ZFS_MAX_DATASET_NAME_LEN);982mutex_exit(&ds->ds_lock);983} else {984VERIFY3U(strlcat(name, ds->ds_snapname,985ZFS_MAX_DATASET_NAME_LEN), <,986ZFS_MAX_DATASET_NAME_LEN);987}988}989}990}991992int993dsl_dataset_namelen(dsl_dataset_t *ds)994{995VERIFY0(dsl_dataset_get_snapname(ds));996mutex_enter(&ds->ds_lock);997int len = strlen(ds->ds_snapname);998mutex_exit(&ds->ds_lock);999/* add '@' if ds is a snap */1000if (len > 0)1001len++;1002len += dsl_dir_namelen(ds->ds_dir);1003return (len);1004}10051006void1007dsl_dataset_rele(dsl_dataset_t *ds, const void *tag)1008{1009dmu_buf_rele(ds->ds_dbuf, tag);1010}10111012void1013dsl_dataset_remove_key_mapping(dsl_dataset_t *ds)1014{1015dsl_dir_t *dd = ds->ds_dir;10161017if (dd == NULL || dd->dd_crypto_obj == 0)1018return;10191020(void) spa_keystore_remove_mapping(dd->dd_pool->dp_spa,1021ds->ds_object, ds);1022}10231024void1025dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags,1026const void *tag)1027{1028if (flags & DS_HOLD_FLAG_DECRYPT)1029dsl_dataset_remove_key_mapping(ds);10301031dsl_dataset_rele(ds, tag);1032}10331034void1035dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, const void *tag)1036{1037ASSERT3P(ds->ds_owner, ==, tag);1038ASSERT(ds->ds_dbuf != NULL);10391040mutex_enter(&ds->ds_lock);1041ds->ds_owner = 
NULL;1042mutex_exit(&ds->ds_lock);1043dsl_dataset_long_rele(ds, tag);1044dsl_dataset_rele_flags(ds, flags, tag);1045}10461047boolean_t1048dsl_dataset_tryown(dsl_dataset_t *ds, const void *tag, boolean_t override)1049{1050boolean_t gotit = FALSE;10511052ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));1053mutex_enter(&ds->ds_lock);1054if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) ||1055(dsl_dataset_feature_is_active(ds,1056SPA_FEATURE_REDACTED_DATASETS) &&1057!zfs_allow_redacted_dataset_mount)))) {1058ds->ds_owner = tag;1059dsl_dataset_long_hold(ds, tag);1060gotit = TRUE;1061}1062mutex_exit(&ds->ds_lock);1063return (gotit);1064}10651066boolean_t1067dsl_dataset_has_owner(dsl_dataset_t *ds)1068{1069boolean_t rv;1070mutex_enter(&ds->ds_lock);1071rv = (ds->ds_owner != NULL);1072mutex_exit(&ds->ds_lock);1073return (rv);1074}10751076static boolean_t1077zfeature_active(spa_feature_t f, void *arg)1078{1079switch (spa_feature_table[f].fi_type) {1080case ZFEATURE_TYPE_BOOLEAN: {1081boolean_t val = (boolean_t)(uintptr_t)arg;1082ASSERT(val == B_FALSE || val == B_TRUE);1083return (val);1084}1085case ZFEATURE_TYPE_UINT64_ARRAY:1086/*1087* In this case, arg is a uint64_t array. 
The feature is active1088* if the array is non-null.1089*/1090return (arg != NULL);1091default:1092panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);1093return (B_FALSE);1094}1095}10961097boolean_t1098dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f)1099{1100return (zfeature_active(f, ds->ds_feature[f]));1101}11021103/*1104* The buffers passed out by this function are references to internal buffers;1105* they should not be freed by callers of this function, and they should not be1106* used after the dataset has been released.1107*/1108boolean_t1109dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f,1110uint64_t *outlength, uint64_t **outp)1111{1112VERIFY(spa_feature_table[f].fi_type & ZFEATURE_TYPE_UINT64_ARRAY);1113if (!dsl_dataset_feature_is_active(ds, f)) {1114return (B_FALSE);1115}1116struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];1117*outp = ftuaa->array;1118*outlength = ftuaa->length;1119return (B_TRUE);1120}11211122void1123dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg,1124dmu_tx_t *tx)1125{1126spa_t *spa = dmu_tx_pool(tx)->dp_spa;1127objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;1128uint64_t zero = 0;11291130VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);11311132spa_feature_incr(spa, f, tx);1133dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);11341135switch (spa_feature_table[f].fi_type) {1136case ZFEATURE_TYPE_BOOLEAN:1137ASSERT3S((boolean_t)(uintptr_t)arg, ==, B_TRUE);1138VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,1139sizeof (zero), 1, &zero, tx));1140break;1141case ZFEATURE_TYPE_UINT64_ARRAY:1142{1143struct feature_type_uint64_array_arg *ftuaa = arg;1144VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,1145sizeof (uint64_t), ftuaa->length, ftuaa->array, tx));1146break;1147}1148default:1149panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);1150}1151}11521153static 
void1154dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, spa_feature_t f,1155dmu_tx_t *tx)1156{1157spa_t *spa = dmu_tx_pool(tx)->dp_spa;1158objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;1159uint64_t dsobj = ds->ds_object;11601161VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);11621163VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));1164spa_feature_decr(spa, f, tx);1165ds->ds_feature[f] = NULL;1166}11671168void1169dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx)1170{1171unload_zfeature(ds, f);1172dsl_dataset_deactivate_feature_impl(ds, f, tx);1173}11741175uint64_t1176dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,1177dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx)1178{1179dsl_pool_t *dp = dd->dd_pool;1180dmu_buf_t *dbuf;1181dsl_dataset_phys_t *dsphys;1182uint64_t dsobj;1183objset_t *mos = dp->dp_meta_objset;11841185if (origin == NULL)1186origin = dp->dp_origin_snap;11871188ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);1189ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);1190ASSERT(dmu_tx_is_syncing(tx));1191ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);11921193dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,1194DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);1195VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));1196dmu_buf_will_dirty(dbuf, tx);1197dsphys = dbuf->db_data;1198memset(dsphys, 0, sizeof (dsl_dataset_phys_t));1199dsphys->ds_dir_obj = dd->dd_object;1200dsphys->ds_flags = flags;1201dsphys->ds_fsid_guid = unique_create();1202(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,1203sizeof (dsphys->ds_guid));1204dsphys->ds_snapnames_zapobj =1205zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,1206DMU_OT_NONE, 0, tx);1207dsphys->ds_creation_time = gethrestime_sec();1208dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 
1 : tx->tx_txg;12091210if (origin == NULL) {1211dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);1212} else {1213dsl_dataset_t *ohds; /* head of the origin snapshot */12141215dsphys->ds_prev_snap_obj = origin->ds_object;1216dsphys->ds_prev_snap_txg =1217dsl_dataset_phys(origin)->ds_creation_txg;1218dsphys->ds_referenced_bytes =1219dsl_dataset_phys(origin)->ds_referenced_bytes;1220dsphys->ds_compressed_bytes =1221dsl_dataset_phys(origin)->ds_compressed_bytes;1222dsphys->ds_uncompressed_bytes =1223dsl_dataset_phys(origin)->ds_uncompressed_bytes;1224rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);1225dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;1226rrw_exit(&origin->ds_bp_rwlock, FTAG);12271228/*1229* Inherit flags that describe the dataset's contents1230* (INCONSISTENT) or properties (Case Insensitive).1231*/1232dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &1233(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);12341235for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {1236if (zfeature_active(f, origin->ds_feature[f])) {1237dsl_dataset_activate_feature(dsobj, f,1238origin->ds_feature[f], tx);1239}1240}12411242dmu_buf_will_dirty(origin->ds_dbuf, tx);1243dsl_dataset_phys(origin)->ds_num_children++;12441245VERIFY0(dsl_dataset_hold_obj(dp,1246dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,1247FTAG, &ohds));1248dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,1249dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);1250dsl_dataset_rele(ohds, FTAG);12511252if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {1253if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {1254dsl_dataset_phys(origin)->ds_next_clones_obj =1255zap_create(mos,1256DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);1257}1258VERIFY0(zap_add_int(mos,1259dsl_dataset_phys(origin)->ds_next_clones_obj,1260dsobj, tx));1261}12621263dmu_buf_will_dirty(dd->dd_dbuf, tx);1264dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;1265if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) 
{1266if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {1267dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);1268dsl_dir_phys(origin->ds_dir)->dd_clones =1269zap_create(mos,1270DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);1271}1272VERIFY0(zap_add_int(mos,1273dsl_dir_phys(origin->ds_dir)->dd_clones,1274dsobj, tx));1275}1276}12771278/* handle encryption */1279dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx);12801281if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)1282dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;12831284dmu_buf_rele(dbuf, FTAG);12851286dmu_buf_will_dirty(dd->dd_dbuf, tx);1287dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;12881289return (dsobj);1290}12911292static void1293dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)1294{1295objset_t *os;12961297VERIFY0(dmu_objset_from_ds(ds, &os));1298if (memcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {1299dsl_pool_t *dp = ds->ds_dir->dd_pool;1300zio_t *zio;13011302memset(&os->os_zil_header, 0, sizeof (os->os_zil_header));1303if (os->os_encrypted)1304os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;13051306zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);1307dsl_dataset_sync(ds, zio, tx);1308VERIFY0(zio_wait(zio));1309dsl_dataset_sync_done(ds, tx);1310}1311}13121313uint64_t1314dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,1315dsl_dataset_t *origin, uint64_t flags, cred_t *cr,1316dsl_crypto_params_t *dcp, dmu_tx_t *tx)1317{1318dsl_pool_t *dp = pdd->dd_pool;1319uint64_t dsobj, ddobj;1320dsl_dir_t *dd;13211322ASSERT(dmu_tx_is_syncing(tx));1323ASSERT(lastname[0] != '@');1324/*1325* Filesystems will eventually have their origin set to dp_origin_snap,1326* but that's taken care of in dsl_dataset_create_sync_dd. 
When1327* creating a filesystem, this function is called with origin equal to1328* NULL.1329*/1330if (origin != NULL)1331ASSERT3P(origin, !=, dp->dp_origin_snap);13321333ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);1334VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));13351336dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp,1337flags & ~DS_CREATE_FLAG_NODIRTY, tx);13381339dsl_deleg_set_create_perms(dd, tx, cr);13401341/*1342* If we are creating a clone and the livelist feature is enabled,1343* add the entry DD_FIELD_LIVELIST to ZAP.1344*/1345if (origin != NULL &&1346spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) {1347objset_t *mos = dd->dd_pool->dp_meta_objset;1348dsl_dir_zapify(dd, tx);1349uint64_t obj = dsl_deadlist_alloc(mos, tx);1350VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST,1351sizeof (uint64_t), 1, &obj, tx));1352spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx);1353}13541355/*1356* Since we're creating a new node we know it's a leaf, so we can1357* initialize the counts if the limit feature is active.1358*/1359if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {1360uint64_t cnt = 0;1361objset_t *os = dd->dd_pool->dp_meta_objset;13621363dsl_dir_zapify(dd, tx);1364VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,1365sizeof (cnt), 1, &cnt, tx));1366VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,1367sizeof (cnt), 1, &cnt, tx));1368}13691370dsl_dir_rele(dd, FTAG);13711372/*1373* If we are creating a clone, make sure we zero out any stale1374* data from the origin snapshots zil header.1375*/1376if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {1377dsl_dataset_t *ds;13781379VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));1380dsl_dataset_zero_zil(ds, tx);1381dsl_dataset_rele(ds, FTAG);1382}13831384return (dsobj);1385}13861387/*1388* The unique space in the head dataset can be calculated by subtracting1389* the space used in the most recent snapshot, that is still being 
used1390* in this file system, from the space currently in use. To figure out1391* the space in the most recent snapshot still in use, we need to take1392* the total space used in the snapshot and subtract out the space that1393* has been freed up since the snapshot was taken.1394*/1395void1396dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)1397{1398uint64_t mrs_used;1399uint64_t dlused, dlcomp, dluncomp;14001401ASSERT(!ds->ds_is_snapshot);14021403if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)1404mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;1405else1406mrs_used = 0;14071408dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);14091410ASSERT3U(dlused, <=, mrs_used);1411dsl_dataset_phys(ds)->ds_unique_bytes =1412dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);14131414if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=1415SPA_VERSION_UNIQUE_ACCURATE)1416dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;1417}14181419void1420dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,1421dmu_tx_t *tx)1422{1423objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;1424uint64_t count __maybe_unused;1425int err;14261427ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);1428err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,1429obj, tx);1430/*1431* The err should not be ENOENT, but a bug in a previous version1432* of the code could cause upgrade_clones_cb() to not set1433* ds_next_snap_obj when it should, leading to a missing entry.1434* If we knew that the pool was created after1435* SPA_VERSION_NEXT_CLONES, we could assert that it isn't1436* ENOENT. 
However, at least we can check that we don't have1437* too many entries in the next_clones_obj even after failing to1438* remove this one.1439*/1440if (err != ENOENT)1441VERIFY0(err);1442ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,1443&count));1444ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);1445}144614471448blkptr_t *1449dsl_dataset_get_blkptr(dsl_dataset_t *ds)1450{1451return (&dsl_dataset_phys(ds)->ds_bp);1452}14531454spa_t *1455dsl_dataset_get_spa(dsl_dataset_t *ds)1456{1457return (ds->ds_dir->dd_pool->dp_spa);1458}14591460void1461dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)1462{1463dsl_pool_t *dp;14641465if (ds == NULL) /* this is the meta-objset */1466return;14671468ASSERT(ds->ds_objset != NULL);14691470if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)1471panic("dirtying snapshot!");14721473/* Must not dirty a dataset in the same txg where it got snapshotted. */1474ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);14751476dp = ds->ds_dir->dd_pool;1477if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {1478objset_t *os = ds->ds_objset;14791480/* up the hold count until we can be written out */1481dmu_buf_add_ref(ds->ds_dbuf, ds);14821483/* if this dataset is encrypted, grab a reference to the DCK */1484if (ds->ds_dir->dd_crypto_obj != 0 &&1485!os->os_raw_receive &&1486!os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {1487ASSERT3P(ds->ds_key_mapping, !=, NULL);1488key_mapping_add_ref(ds->ds_key_mapping, ds);1489}1490}1491}14921493static int1494dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)1495{1496uint64_t asize;14971498if (!dmu_tx_is_syncing(tx))1499return (0);15001501/*1502* If there's an fs-only reservation, any blocks that might become1503* owned by the snapshot dataset must be accommodated by space1504* outside of the reservation.1505*/1506ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));1507asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);1508if 
(asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))1509return (SET_ERROR(ENOSPC));15101511/*1512* Propagate any reserved space for this snapshot to other1513* snapshot checks in this sync group.1514*/1515if (asize > 0)1516dsl_dir_willuse_space(ds->ds_dir, asize, tx);15171518return (0);1519}15201521int1522dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,1523dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)1524{1525int error;1526uint64_t value;15271528ds->ds_trysnap_txg = tx->tx_txg;15291530if (!dmu_tx_is_syncing(tx))1531return (0);15321533/*1534* We don't allow multiple snapshots of the same txg. If there1535* is already one, try again.1536*/1537if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)1538return (SET_ERROR(EAGAIN));15391540/*1541* Check for conflicting snapshot name.1542*/1543error = dsl_dataset_snap_lookup(ds, snapname, &value);1544if (error == 0)1545return (SET_ERROR(EEXIST));1546if (error != ENOENT)1547return (error);15481549/*1550* We don't allow taking snapshots of inconsistent datasets, such as1551* those into which we are currently receiving. However, if we are1552* creating this snapshot as part of a receive, this check will be1553* executed atomically with respect to the completion of the receive1554* itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this1555* case we ignore this, knowing it will be fixed up for us shortly in1556* dmu_recv_end_sync().1557*/1558if (!recv && DS_IS_INCONSISTENT(ds))1559return (SET_ERROR(EBUSY));15601561/*1562* Skip the check for temporary snapshots or if we have already checked1563* the counts in dsl_dataset_snapshot_check. 
This means we really only1564* check the count here when we're receiving a stream.1565*/1566if (cnt != 0 && cr != NULL) {1567error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,1568ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);1569if (error != 0)1570return (error);1571}15721573error = dsl_dataset_snapshot_reserve_space(ds, tx);1574if (error != 0)1575return (error);15761577return (0);1578}15791580int1581dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)1582{1583dsl_dataset_snapshot_arg_t *ddsa = arg;1584dsl_pool_t *dp = dmu_tx_pool(tx);1585nvpair_t *pair;1586int rv = 0;15871588/*1589* Pre-compute how many total new snapshots will be created for each1590* level in the tree and below. This is needed for validating the1591* snapshot limit when either taking a recursive snapshot or when1592* taking multiple snapshots.1593*1594* The problem is that the counts are not actually adjusted when1595* we are checking, only when we finally sync. For a single snapshot,1596* this is easy, the count will increase by 1 at each node up the tree,1597* but its more complicated for the recursive/multiple snapshot case.1598*1599* The dsl_fs_ss_limit_check function does recursively check the count1600* at each level up the tree but since it is validating each snapshot1601* independently we need to be sure that we are validating the complete1602* count for the entire set of snapshots. We do this by rolling up the1603* counts for each component of the name into an nvlist and then1604* checking each of those cases with the aggregated count.1605*1606* This approach properly handles not only the recursive snapshot1607* case (where we get all of those on the ddsa_snaps list) but also1608* the sibling case (e.g. 
snapshot a/b and a/c so that we will also1609* validate the limit on 'a' using a count of 2).1610*1611* We validate the snapshot names in the third loop and only report1612* name errors once.1613*/1614if (dmu_tx_is_syncing(tx)) {1615char *nm;1616nvlist_t *cnt_track = NULL;1617cnt_track = fnvlist_alloc();16181619nm = kmem_alloc(MAXPATHLEN, KM_SLEEP);16201621/* Rollup aggregated counts into the cnt_track list */1622for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);1623pair != NULL;1624pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {1625char *pdelim;1626uint64_t val;16271628(void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN);1629pdelim = strchr(nm, '@');1630if (pdelim == NULL)1631continue;1632*pdelim = '\0';16331634do {1635if (nvlist_lookup_uint64(cnt_track, nm,1636&val) == 0) {1637/* update existing entry */1638fnvlist_add_uint64(cnt_track, nm,1639val + 1);1640} else {1641/* add to list */1642fnvlist_add_uint64(cnt_track, nm, 1);1643}16441645pdelim = strrchr(nm, '/');1646if (pdelim != NULL)1647*pdelim = '\0';1648} while (pdelim != NULL);1649}16501651kmem_free(nm, MAXPATHLEN);16521653/* Check aggregated counts at each level */1654for (pair = nvlist_next_nvpair(cnt_track, NULL);1655pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {1656int error = 0;1657const char *name;1658uint64_t cnt = 0;1659dsl_dataset_t *ds;16601661name = nvpair_name(pair);1662cnt = fnvpair_value_uint64(pair);1663ASSERT(cnt > 0);16641665error = dsl_dataset_hold(dp, name, FTAG, &ds);1666if (error == 0) {1667error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,1668ZFS_PROP_SNAPSHOT_LIMIT, NULL,1669ddsa->ddsa_cr);1670dsl_dataset_rele(ds, FTAG);1671}16721673if (error != 0) {1674if (ddsa->ddsa_errors != NULL)1675fnvlist_add_int32(ddsa->ddsa_errors,1676name, error);1677rv = error;1678/* only report one error for this check */1679break;1680}1681}1682nvlist_free(cnt_track);1683}16841685for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);1686pair != NULL; pair = 
nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {1687int error = 0;1688dsl_dataset_t *ds;1689const char *name, *atp = NULL;1690char dsname[ZFS_MAX_DATASET_NAME_LEN];16911692name = nvpair_name(pair);1693if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)1694error = SET_ERROR(ENAMETOOLONG);1695if (error == 0) {1696atp = strchr(name, '@');1697if (atp == NULL)1698error = SET_ERROR(EINVAL);1699if (error == 0)1700(void) strlcpy(dsname, name, atp - name + 1);1701}1702if (error == 0)1703error = dsl_dataset_hold(dp, dsname, FTAG, &ds);1704if (error == 0) {1705/* passing 0/NULL skips dsl_fs_ss_limit_check */1706error = dsl_dataset_snapshot_check_impl(ds,1707atp + 1, tx, B_FALSE, 0, NULL);1708dsl_dataset_rele(ds, FTAG);1709}17101711if (error != 0) {1712if (ddsa->ddsa_errors != NULL) {1713fnvlist_add_int32(ddsa->ddsa_errors,1714name, error);1715}1716rv = error;1717}1718}17191720return (rv);1721}17221723void1724dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,1725dmu_tx_t *tx)1726{1727dsl_pool_t *dp = ds->ds_dir->dd_pool;1728dmu_buf_t *dbuf;1729dsl_dataset_phys_t *dsphys;1730uint64_t dsobj, crtxg;1731objset_t *mos = dp->dp_meta_objset;1732objset_t *os __maybe_unused;17331734ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));17351736/*1737* If we are on an old pool, the zil must not be active, in which1738* case it will be zeroed. Usually zil_suspend() accomplishes this.1739*/1740ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||1741dmu_objset_from_ds(ds, &os) != 0 ||1742memcmp(&os->os_phys->os_zil_header, &zero_zil,1743sizeof (zero_zil)) == 0);17441745/* Should not snapshot a dirty dataset. 
*/1746ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,1747ds, tx->tx_txg));17481749dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);17501751/*1752* The origin's ds_creation_txg has to be < TXG_INITIAL1753*/1754if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)1755crtxg = 1;1756else1757crtxg = tx->tx_txg;17581759dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,1760DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);1761VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));1762dmu_buf_will_dirty(dbuf, tx);1763dsphys = dbuf->db_data;1764memset(dsphys, 0, sizeof (dsl_dataset_phys_t));1765dsphys->ds_dir_obj = ds->ds_dir->dd_object;1766dsphys->ds_fsid_guid = unique_create();1767(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,1768sizeof (dsphys->ds_guid));1769dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;1770dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;1771dsphys->ds_next_snap_obj = ds->ds_object;1772dsphys->ds_num_children = 1;1773dsphys->ds_creation_time = gethrestime_sec();1774dsphys->ds_creation_txg = crtxg;1775dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;1776dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;1777dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;1778dsphys->ds_uncompressed_bytes =1779dsl_dataset_phys(ds)->ds_uncompressed_bytes;1780dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;1781rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);1782dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;1783rrw_exit(&ds->ds_bp_rwlock, FTAG);1784dmu_buf_rele(dbuf, FTAG);17851786for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {1787if (zfeature_active(f, ds->ds_feature[f])) {1788dsl_dataset_activate_feature(dsobj, f,1789ds->ds_feature[f], tx);1790}1791}17921793ASSERT3U(ds->ds_prev != 0, ==,1794dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);1795if (ds->ds_prev) {1796uint64_t next_clones_obj 
=1797dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;1798ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==1799ds->ds_object ||1800dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);1801if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==1802ds->ds_object) {1803dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);1804ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,1805dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);1806dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;1807} else if (next_clones_obj != 0) {1808dsl_dataset_remove_from_next_clones(ds->ds_prev,1809dsphys->ds_next_snap_obj, tx);1810VERIFY0(zap_add_int(mos,1811next_clones_obj, dsobj, tx));1812}1813}18141815/*1816* If we have a reference-reservation on this dataset, we will1817* need to increase the amount of refreservation being charged1818* since our unique space is going to zero.1819*/1820if (ds->ds_reserved) {1821int64_t delta;1822ASSERT(DS_UNIQUE_IS_ACCURATE(ds));1823delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,1824ds->ds_reserved);1825dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,1826delta, 0, 0, tx);1827}18281829dmu_buf_will_dirty(ds->ds_dbuf, tx);1830dsl_dataset_phys(ds)->ds_deadlist_obj =1831dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,1832dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);1833dsl_deadlist_close(&ds->ds_deadlist);1834VERIFY0(dsl_deadlist_open(&ds->ds_deadlist, mos,1835dsl_dataset_phys(ds)->ds_deadlist_obj));1836dsl_deadlist_add_key(&ds->ds_deadlist,1837dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);1838dsl_bookmark_snapshotted(ds, tx);18391840if (dsl_dataset_remap_deadlist_exists(ds)) {1841uint64_t remap_deadlist_obj =1842dsl_dataset_get_remap_deadlist_object(ds);1843/*1844* Move the remap_deadlist to the snapshot. 
The head1845* will create a new remap deadlist on demand, from1846* dsl_dataset_block_remapped().1847*/1848dsl_dataset_unset_remap_deadlist_object(ds, tx);1849dsl_deadlist_close(&ds->ds_remap_deadlist);18501851dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);1852VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,1853sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));1854}18551856/*1857* Create a ivset guid for this snapshot if the dataset is1858* encrypted. This may be overridden by a raw receive. A1859* previous implementation of this code did not have this1860* field as part of the on-disk format for ZFS encryption1861* (see errata #4). As part of the remediation for this1862* issue, we ask the user to enable the bookmark_v2 feature1863* which is now a dependency of the encryption feature. We1864* use this as a heuristic to determine when the user has1865* elected to correct any datasets created with the old code.1866* As a result, we only do this step if the bookmark_v21867* feature is enabled, which limits the number of states a1868* given pool / dataset can be in with regards to terms of1869* correcting the issue.1870*/1871if (ds->ds_dir->dd_crypto_obj != 0 &&1872spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2)) {1873uint64_t ivset_guid = unique_create();18741875dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);1876VERIFY0(zap_add(mos, dsobj, DS_FIELD_IVSET_GUID,1877sizeof (ivset_guid), 1, &ivset_guid, tx));1878}18791880ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);1881dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;1882dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;1883dsl_dataset_phys(ds)->ds_unique_bytes = 0;18841885if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)1886dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;18871888VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,1889snapname, 8, 1, &dsobj, tx));18901891if (ds->ds_prev)1892dsl_dataset_rele(ds->ds_prev, 
ds);1893VERIFY0(dsl_dataset_hold_obj(dp,1894dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));18951896dsl_scan_ds_snapshotted(ds, tx);18971898dsl_dir_snap_cmtime_update(ds->ds_dir, tx);18991900if (zfs_snapshot_history_enabled)1901spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " ");1902}19031904void1905dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)1906{1907dsl_dataset_snapshot_arg_t *ddsa = arg;1908dsl_pool_t *dp = dmu_tx_pool(tx);1909nvpair_t *pair;19101911for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);1912pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {1913dsl_dataset_t *ds;1914const char *name, *atp;1915char dsname[ZFS_MAX_DATASET_NAME_LEN];19161917name = nvpair_name(pair);1918atp = strchr(name, '@');1919(void) strlcpy(dsname, name, atp - name + 1);1920VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));19211922dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);1923if (ddsa->ddsa_props != NULL) {1924dsl_props_set_sync_impl(ds->ds_prev,1925ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);1926}1927dsl_dataset_rele(ds, FTAG);1928}1929}19301931/*1932* The snapshots must all be in the same pool.1933* All-or-nothing: if there are any failures, nothing will be modified.1934*/1935int1936dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)1937{1938dsl_dataset_snapshot_arg_t ddsa;1939nvpair_t *pair;1940boolean_t needsuspend;1941int error;1942spa_t *spa;1943const char *firstname;1944nvlist_t *suspended = NULL;19451946pair = nvlist_next_nvpair(snaps, NULL);1947if (pair == NULL)1948return (0);1949firstname = nvpair_name(pair);19501951error = spa_open(firstname, &spa, FTAG);1952if (error != 0)1953return (error);1954needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);1955spa_close(spa, FTAG);19561957if (needsuspend) {1958suspended = fnvlist_alloc();1959for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;1960pair = nvlist_next_nvpair(snaps, pair)) {1961char fsname[ZFS_MAX_DATASET_NAME_LEN];1962const char *snapname = 
nvpair_name(pair);1963const char *atp;1964void *cookie;19651966atp = strchr(snapname, '@');1967if (atp == NULL) {1968error = SET_ERROR(EINVAL);1969break;1970}1971(void) strlcpy(fsname, snapname, atp - snapname + 1);19721973error = zil_suspend(fsname, &cookie);1974if (error != 0)1975break;1976fnvlist_add_uint64(suspended, fsname,1977(uintptr_t)cookie);1978}1979}19801981cred_t *cr = CRED();1982crhold(cr);19831984ddsa.ddsa_snaps = snaps;1985ddsa.ddsa_props = props;1986ddsa.ddsa_errors = errors;1987ddsa.ddsa_cr = cr;19881989if (error == 0) {1990error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,1991dsl_dataset_snapshot_sync, &ddsa,1992fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);1993}19941995crfree(cr);19961997if (suspended != NULL) {1998for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;1999pair = nvlist_next_nvpair(suspended, pair)) {2000zil_resume((void *)(uintptr_t)2001fnvpair_value_uint64(pair));2002}2003fnvlist_free(suspended);2004}20052006if (error == 0) {2007for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;2008pair = nvlist_next_nvpair(snaps, pair)) {2009zvol_create_minors(nvpair_name(pair));2010}2011}20122013return (error);2014}20152016typedef struct dsl_dataset_snapshot_tmp_arg {2017const char *ddsta_fsname;2018const char *ddsta_snapname;2019minor_t ddsta_cleanup_minor;2020const char *ddsta_htag;2021} dsl_dataset_snapshot_tmp_arg_t;20222023static int2024dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)2025{2026dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;2027dsl_pool_t *dp = dmu_tx_pool(tx);2028dsl_dataset_t *ds;2029int error;20302031error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);2032if (error != 0)2033return (error);20342035/* NULL cred means no limit check for tmp snapshot */2036error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,2037tx, B_FALSE, 0, NULL);2038if (error != 0) {2039dsl_dataset_rele(ds, FTAG);2040return (error);2041}20422043if (spa_version(dp->dp_spa) < 
SPA_VERSION_USERREFS) {2044dsl_dataset_rele(ds, FTAG);2045return (SET_ERROR(ENOTSUP));2046}2047error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,2048B_TRUE, tx);2049if (error != 0) {2050dsl_dataset_rele(ds, FTAG);2051return (error);2052}20532054dsl_dataset_rele(ds, FTAG);2055return (0);2056}20572058static void2059dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)2060{2061dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;2062dsl_pool_t *dp = dmu_tx_pool(tx);2063dsl_dataset_t *ds = NULL;20642065VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));20662067dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);2068dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,2069ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);2070dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);20712072dsl_dataset_rele(ds, FTAG);2073}20742075int2076dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,2077minor_t cleanup_minor, const char *htag)2078{2079dsl_dataset_snapshot_tmp_arg_t ddsta;2080int error;2081spa_t *spa;2082boolean_t needsuspend;2083void *cookie;20842085ddsta.ddsta_fsname = fsname;2086ddsta.ddsta_snapname = snapname;2087ddsta.ddsta_cleanup_minor = cleanup_minor;2088ddsta.ddsta_htag = htag;20892090error = spa_open(fsname, &spa, FTAG);2091if (error != 0)2092return (error);2093needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);2094spa_close(spa, FTAG);20952096if (needsuspend) {2097error = zil_suspend(fsname, &cookie);2098if (error != 0)2099return (error);2100}21012102error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,2103dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);21042105if (needsuspend)2106zil_resume(cookie);2107return (error);2108}21092110/* Nonblocking dataset sync. 
Assumes dataset:objset is always 1:1 */2111void2112dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx)2113{2114ASSERT(dmu_tx_is_syncing(tx));2115ASSERT(ds->ds_objset != NULL);2116ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);21172118/*2119* in case we had to change ds_fsid_guid when we opened it,2120* sync it out now.2121*/2122dmu_buf_will_dirty(ds->ds_dbuf, tx);2123dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;21242125if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {2126VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,2127ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,2128&ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));2129VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,2130ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,2131&ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));2132VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,2133ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,2134&ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));2135ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;2136ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;2137ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;2138}21392140dmu_objset_sync(ds->ds_objset, rio, tx);2141}21422143/*2144* Check if the percentage of blocks shared between the clone and the2145* snapshot (as opposed to those that are clone only) is below a certain2146* threshold2147*/2148static boolean_t2149dsl_livelist_should_disable(dsl_dataset_t *ds)2150{2151uint64_t used, referenced;2152int percent_shared;21532154used = dsl_dir_get_usedds(ds->ds_dir);2155referenced = dsl_get_referenced(ds);2156if (referenced == 0)2157return (B_FALSE);2158percent_shared = (100 * (referenced - used)) / referenced;2159if (percent_shared <= zfs_livelist_min_percent_shared)2160return (B_TRUE);2161return (B_FALSE);2162}21632164/*2165* Check if it is possible to combine two livelist entries into one.2166* This is the case if the combined number of 'live' blkptrs (ALLOCs that2167* don't have a matching FREE) is under the maximum sublist 
size.2168* We check this by subtracting twice the total number of frees from the total2169* number of blkptrs. FREEs are counted twice because each FREE blkptr2170* will cancel out an ALLOC blkptr when the livelist is processed.2171*/2172static boolean_t2173dsl_livelist_should_condense(dsl_deadlist_entry_t *first,2174dsl_deadlist_entry_t *next)2175{2176uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +2177next->dle_bpobj.bpo_phys->bpo_num_freed;2178uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +2179next->dle_bpobj.bpo_phys->bpo_num_blkptrs;2180if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)2181return (B_TRUE);2182return (B_FALSE);2183}21842185typedef struct try_condense_arg {2186spa_t *spa;2187dsl_dataset_t *ds;2188} try_condense_arg_t;21892190/*2191* Iterate over the livelist entries, searching for a pair to condense.2192* A nonzero return value means stop, 0 means keep looking.2193*/2194static int2195dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)2196{2197try_condense_arg_t *tca = arg;2198spa_t *spa = tca->spa;2199dsl_dataset_t *ds = tca->ds;2200dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;2201dsl_deadlist_entry_t *next;22022203/* The condense thread has not yet been created at import */2204if (spa->spa_livelist_condense_zthr == NULL)2205return (1);22062207/* A condense is already in progress */2208if (spa->spa_to_condense.ds != NULL)2209return (1);22102211next = AVL_NEXT(&ll->dl_tree, &first->dle_node);2212/* The livelist has only one entry - don't condense it */2213if (next == NULL)2214return (1);22152216/* Next is the newest entry - don't condense it */2217if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)2218return (1);22192220/* This pair is not ready to condense but keep looking */2221if (!dsl_livelist_should_condense(first, next))2222return (0);22232224/*2225* Add a ref to prevent the dataset from being evicted while2226* the condense zthr or synctask are running. 
Ref will be2227* released at the end of the condense synctask2228*/2229dmu_buf_add_ref(ds->ds_dbuf, spa);22302231spa->spa_to_condense.ds = ds;2232spa->spa_to_condense.first = first;2233spa->spa_to_condense.next = next;2234spa->spa_to_condense.syncing = B_FALSE;2235spa->spa_to_condense.cancelled = B_FALSE;22362237zthr_wakeup(spa->spa_livelist_condense_zthr);2238return (1);2239}22402241static void2242dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)2243{2244dsl_dir_t *dd = ds->ds_dir;2245spa_t *spa = ds->ds_dir->dd_pool->dp_spa;2246dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);22472248/* Check if we need to add a new sub-livelist */2249if (last == NULL) {2250/* The livelist is empty */2251dsl_deadlist_add_key(&dd->dd_livelist,2252tx->tx_txg - 1, tx);2253} else if (spa_sync_pass(spa) == 1) {2254/*2255* Check if the newest entry is full. If it is, make a new one.2256* We only do this once per sync because we could overfill a2257* sublist in one sync pass and don't want to add another entry2258* for a txg that is already represented. 
This ensures that2259* blkptrs born in the same txg are stored in the same sublist.2260*/2261bpobj_t bpobj = last->dle_bpobj;2262uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;2263uint64_t free = bpobj.bpo_phys->bpo_num_freed;2264uint64_t alloc = all - free;2265if (alloc > zfs_livelist_max_entries) {2266dsl_deadlist_add_key(&dd->dd_livelist,2267tx->tx_txg - 1, tx);2268}2269}22702271/* Insert each entry into the on-disk livelist */2272bplist_iterate(&dd->dd_pending_allocs,2273dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);2274bplist_iterate(&dd->dd_pending_frees,2275dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);22762277/* Attempt to condense every pair of adjacent entries */2278try_condense_arg_t arg = {2279.spa = spa,2280.ds = ds2281};2282dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,2283&arg);2284}22852286void2287dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)2288{2289objset_t *os = ds->ds_objset;22902291bplist_iterate(&ds->ds_pending_deadlist,2292dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);22932294if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {2295dsl_flush_pending_livelist(ds, tx);2296if (dsl_livelist_should_disable(ds)) {2297dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);2298}2299}23002301dsl_bookmark_sync_done(ds, tx);23022303multilist_destroy(&os->os_synced_dnodes);23042305if (os->os_encrypted)2306os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE;2307else2308ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]);23092310for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {2311if (zfeature_active(f,2312ds->ds_feature_activation[f])) {2313if (zfeature_active(f, ds->ds_feature[f]))2314continue;2315dsl_dataset_activate_feature(ds->ds_object, f,2316ds->ds_feature_activation[f], tx);2317ds->ds_feature[f] = ds->ds_feature_activation[f];2318}2319}23202321ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));2322}23232324int2325get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)2326{2327uint64_t count = 
0;2328objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;2329zap_cursor_t zc;2330zap_attribute_t *za;23312332ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));23332334/*2335* There may be missing entries in ds_next_clones_obj2336* due to a bug in a previous version of the code.2337* Only trust it if it has the right number of entries.2338*/2339if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {2340VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,2341&count));2342}2343if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {2344return (SET_ERROR(ENOENT));2345}23462347za = zap_attribute_alloc();2348for (zap_cursor_init(&zc, mos,2349dsl_dataset_phys(ds)->ds_next_clones_obj);2350zap_cursor_retrieve(&zc, za) == 0;2351zap_cursor_advance(&zc)) {2352dsl_dataset_t *clone;2353char buf[ZFS_MAX_DATASET_NAME_LEN];2354VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,2355za->za_first_integer, FTAG, &clone));2356dsl_dir_name(clone->ds_dir, buf);2357fnvlist_add_boolean(val, buf);2358dsl_dataset_rele(clone, FTAG);2359}2360zap_cursor_fini(&zc);2361zap_attribute_free(za);2362return (0);2363}23642365void2366get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)2367{2368nvlist_t *propval = fnvlist_alloc();2369nvlist_t *val = fnvlist_alloc();23702371if (get_clones_stat_impl(ds, val) == 0) {2372fnvlist_add_nvlist(propval, ZPROP_VALUE, val);2373fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),2374propval);2375}23762377nvlist_free(val);2378nvlist_free(propval);2379}23802381static char *2382get_receive_resume_token_impl(dsl_dataset_t *ds)2383{2384if (!dsl_dataset_has_resume_receive_state(ds))2385return (NULL);23862387dsl_pool_t *dp = ds->ds_dir->dd_pool;2388char *str;2389void *packed;2390uint8_t *compressed;2391uint64_t val;2392nvlist_t *token_nv = fnvlist_alloc();2393size_t packed_size, compressed_size;23942395if (zap_lookup(dp->dp_meta_objset, ds->ds_object,2396DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {2397fnvlist_add_uint64(token_nv, "fromguid", 
val);2398}2399if (zap_lookup(dp->dp_meta_objset, ds->ds_object,2400DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {2401fnvlist_add_uint64(token_nv, "object", val);2402}2403if (zap_lookup(dp->dp_meta_objset, ds->ds_object,2404DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {2405fnvlist_add_uint64(token_nv, "offset", val);2406}2407if (zap_lookup(dp->dp_meta_objset, ds->ds_object,2408DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {2409fnvlist_add_uint64(token_nv, "bytes", val);2410}2411if (zap_lookup(dp->dp_meta_objset, ds->ds_object,2412DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {2413fnvlist_add_uint64(token_nv, "toguid", val);2414}2415char buf[MAXNAMELEN];2416if (zap_lookup(dp->dp_meta_objset, ds->ds_object,2417DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {2418fnvlist_add_string(token_nv, "toname", buf);2419}2420if (zap_contains(dp->dp_meta_objset, ds->ds_object,2421DS_FIELD_RESUME_LARGEBLOCK) == 0) {2422fnvlist_add_boolean(token_nv, "largeblockok");2423}2424if (zap_contains(dp->dp_meta_objset, ds->ds_object,2425DS_FIELD_RESUME_EMBEDOK) == 0) {2426fnvlist_add_boolean(token_nv, "embedok");2427}2428if (zap_contains(dp->dp_meta_objset, ds->ds_object,2429DS_FIELD_RESUME_COMPRESSOK) == 0) {2430fnvlist_add_boolean(token_nv, "compressok");2431}2432if (zap_contains(dp->dp_meta_objset, ds->ds_object,2433DS_FIELD_RESUME_RAWOK) == 0) {2434fnvlist_add_boolean(token_nv, "rawok");2435}2436if (dsl_dataset_feature_is_active(ds,2437SPA_FEATURE_REDACTED_DATASETS)) {2438uint64_t num_redact_snaps = 0;2439uint64_t *redact_snaps = NULL;2440VERIFY3B(dsl_dataset_get_uint64_array_feature(ds,2441SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps,2442&redact_snaps), ==, B_TRUE);2443fnvlist_add_uint64_array(token_nv, "redact_snaps",2444redact_snaps, num_redact_snaps);2445}2446if (zap_contains(dp->dp_meta_objset, ds->ds_object,2447DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) {2448uint64_t num_redact_snaps = 0, int_size = 0;2449uint64_t *redact_snaps = 
NULL;2450VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object,2451DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size,2452&num_redact_snaps));2453ASSERT3U(int_size, ==, sizeof (uint64_t));24542455redact_snaps = kmem_alloc(int_size * num_redact_snaps,2456KM_SLEEP);2457VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object,2458DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size,2459num_redact_snaps, redact_snaps));2460fnvlist_add_uint64_array(token_nv, "book_redact_snaps",2461redact_snaps, num_redact_snaps);2462kmem_free(redact_snaps, int_size * num_redact_snaps);2463}2464packed = fnvlist_pack(token_nv, &packed_size);2465fnvlist_free(token_nv);2466compressed = kmem_alloc(packed_size, KM_SLEEP);24672468/* Call compress function directly to avoid hole detection. */2469abd_t pabd, cabd;2470abd_get_from_buf_struct(&pabd, packed, packed_size);2471abd_get_from_buf_struct(&cabd, compressed, packed_size);2472compressed_size = zfs_gzip_compress(&pabd, &cabd,2473packed_size, packed_size, 6);2474abd_free(&cabd);2475abd_free(&pabd);24762477zio_cksum_t cksum;2478fletcher_4_native_varsize(compressed, compressed_size, &cksum);24792480size_t alloc_size = compressed_size * 2 + 1;2481str = kmem_alloc(alloc_size, KM_SLEEP);2482for (int i = 0; i < compressed_size; i++) {2483size_t offset = i * 2;2484(void) snprintf(str + offset, alloc_size - offset,2485"%02x", compressed[i]);2486}2487str[compressed_size * 2] = '\0';2488char *propval = kmem_asprintf("%u-%llx-%llx-%s",2489ZFS_SEND_RESUME_TOKEN_VERSION,2490(longlong_t)cksum.zc_word[0],2491(longlong_t)packed_size, str);2492kmem_free(packed, packed_size);2493kmem_free(str, alloc_size);2494kmem_free(compressed, packed_size);2495return (propval);2496}24972498/*2499* Returns a string that represents the receive resume state token. It should2500* be freed with kmem_strfree(). NULL is returned if no resume state is present.2501*/2502char *2503get_receive_resume_token(dsl_dataset_t *ds)2504{2505/*2506* A failed "newfs" (e.g. 
full) resumable receive leaves2507* the stats set on this dataset. Check here for the prop.2508*/2509char *token = get_receive_resume_token_impl(ds);2510if (token != NULL)2511return (token);2512/*2513* A failed incremental resumable receive leaves the2514* stats set on our child named "%recv". Check the child2515* for the prop.2516*/2517/* 6 extra bytes for /%recv */2518char name[ZFS_MAX_DATASET_NAME_LEN + 6];2519dsl_dataset_t *recv_ds;2520dsl_dataset_name(ds, name);2521if (strlcat(name, "/", sizeof (name)) < sizeof (name) &&2522strlcat(name, recv_clone_name, sizeof (name)) < sizeof (name) &&2523dsl_dataset_hold(ds->ds_dir->dd_pool, name, FTAG, &recv_ds) == 0) {2524token = get_receive_resume_token_impl(recv_ds);2525dsl_dataset_rele(recv_ds, FTAG);2526}2527return (token);2528}25292530uint64_t2531dsl_get_refratio(dsl_dataset_t *ds)2532{2533uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :2534(dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /2535dsl_dataset_phys(ds)->ds_compressed_bytes);2536return (ratio);2537}25382539uint64_t2540dsl_get_logicalreferenced(dsl_dataset_t *ds)2541{2542return (dsl_dataset_phys(ds)->ds_uncompressed_bytes);2543}25442545uint64_t2546dsl_get_compressratio(dsl_dataset_t *ds)2547{2548if (ds->ds_is_snapshot) {2549return (dsl_get_refratio(ds));2550} else {2551dsl_dir_t *dd = ds->ds_dir;2552mutex_enter(&dd->dd_lock);2553uint64_t val = dsl_dir_get_compressratio(dd);2554mutex_exit(&dd->dd_lock);2555return (val);2556}2557}25582559uint64_t2560dsl_get_used(dsl_dataset_t *ds)2561{2562if (ds->ds_is_snapshot) {2563return (dsl_dataset_phys(ds)->ds_unique_bytes);2564} else {2565dsl_dir_t *dd = ds->ds_dir;2566mutex_enter(&dd->dd_lock);2567uint64_t val = dsl_dir_get_used(dd);2568mutex_exit(&dd->dd_lock);2569return (val);2570}2571}25722573uint64_t2574dsl_get_creation(dsl_dataset_t *ds)2575{2576return (dsl_dataset_phys(ds)->ds_creation_time);2577}25782579uint64_t2580dsl_get_creationtxg(dsl_dataset_t *ds)2581{2582return 
(dsl_dataset_phys(ds)->ds_creation_txg);2583}25842585uint64_t2586dsl_get_refquota(dsl_dataset_t *ds)2587{2588return (ds->ds_quota);2589}25902591uint64_t2592dsl_get_refreservation(dsl_dataset_t *ds)2593{2594return (ds->ds_reserved);2595}25962597uint64_t2598dsl_get_guid(dsl_dataset_t *ds)2599{2600return (dsl_dataset_phys(ds)->ds_guid);2601}26022603uint64_t2604dsl_get_unique(dsl_dataset_t *ds)2605{2606return (dsl_dataset_phys(ds)->ds_unique_bytes);2607}26082609uint64_t2610dsl_get_objsetid(dsl_dataset_t *ds)2611{2612return (ds->ds_object);2613}26142615uint64_t2616dsl_get_userrefs(dsl_dataset_t *ds)2617{2618return (ds->ds_userrefs);2619}26202621uint64_t2622dsl_get_defer_destroy(dsl_dataset_t *ds)2623{2624return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0);2625}26262627uint64_t2628dsl_get_referenced(dsl_dataset_t *ds)2629{2630return (dsl_dataset_phys(ds)->ds_referenced_bytes);2631}26322633uint64_t2634dsl_get_numclones(dsl_dataset_t *ds)2635{2636ASSERT(ds->ds_is_snapshot);2637return (dsl_dataset_phys(ds)->ds_num_children - 1);2638}26392640uint64_t2641dsl_get_inconsistent(dsl_dataset_t *ds)2642{2643return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?26441 : 0);2645}26462647uint64_t2648dsl_get_redacted(dsl_dataset_t *ds)2649{2650return (dsl_dataset_feature_is_active(ds,2651SPA_FEATURE_REDACTED_DATASETS));2652}26532654uint64_t2655dsl_get_available(dsl_dataset_t *ds)2656{2657uint64_t refdbytes = dsl_get_referenced(ds);2658uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,2659NULL, 0, TRUE);2660if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {2661availbytes +=2662ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;2663}2664if (ds->ds_quota != 0) {2665/*2666* Adjust available bytes according to refquota2667*/2668if (refdbytes < ds->ds_quota) {2669availbytes = MIN(availbytes,2670ds->ds_quota - refdbytes);2671} else {2672availbytes = 0;2673}2674}2675return (availbytes);2676}26772678int2679dsl_get_written(dsl_dataset_t *ds, uint64_t 
*written)2680{2681dsl_pool_t *dp = ds->ds_dir->dd_pool;2682dsl_dataset_t *prev;2683int err = dsl_dataset_hold_obj(dp,2684dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);2685if (err == 0) {2686uint64_t comp, uncomp;2687err = dsl_dataset_space_written(prev, ds, written,2688&comp, &uncomp);2689dsl_dataset_rele(prev, FTAG);2690}2691return (err);2692}26932694/*2695* 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.2696*/2697int2698dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)2699{2700dsl_pool_t *dp = ds->ds_dir->dd_pool;2701if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {2702dsl_dataset_name(ds->ds_prev, snap);2703return (0);2704} else {2705return (SET_ERROR(ENOENT));2706}2707}27082709void2710dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval)2711{2712uint64_t nsnaps;2713uint64_t *snaps;2714if (dsl_dataset_get_uint64_array_feature(ds,2715SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) {2716fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps,2717nsnaps);2718}2719}27202721/*2722* Returns the mountpoint property and source for the given dataset in the value2723* and source buffers. 
The value buffer must be at least as large as MAXPATHLEN2724* and the source buffer at least as large as ZFS_MAX_DATASET_NAME_LEN.2725* Returns 0 on success and an error on failure.2726*/2727int2728dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,2729char *source)2730{2731int error;2732dsl_pool_t *dp = ds->ds_dir->dd_pool;27332734/* Retrieve the mountpoint value stored in the zap object */2735error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,2736ZAP_MAXVALUELEN, value, source);2737if (error != 0) {2738return (error);2739}27402741/*2742* Process the dsname and source to find the full mountpoint string.2743* Can be skipped for 'legacy' or 'none'.2744*/2745if (value[0] == '/') {2746char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);2747char *root = buf;2748const char *relpath;27492750/*2751* If we inherit the mountpoint, even from a dataset2752* with a received value, the source will be the path of2753* the dataset we inherit from. If source is2754* ZPROP_SOURCE_VAL_RECVD, the received value is not2755* inherited.2756*/2757if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {2758relpath = "";2759} else {2760ASSERT0(strncmp(dsname, source, strlen(source)));2761relpath = dsname + strlen(source);2762if (relpath[0] == '/')2763relpath++;2764}27652766spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);27672768/*2769* Special case an alternate root of '/'. 
This will2770* avoid having multiple leading slashes in the2771* mountpoint path.2772*/2773if (strcmp(root, "/") == 0)2774root++;27752776/*2777* If the mountpoint is '/' then skip over this2778* if we are obtaining either an alternate root or2779* an inherited mountpoint.2780*/2781char *mnt = value;2782if (value[1] == '\0' && (root[0] != '\0' ||2783relpath[0] != '\0'))2784mnt = value + 1;27852786mnt = kmem_strdup(mnt);27872788if (relpath[0] == '\0') {2789(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",2790root, mnt);2791} else {2792(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",2793root, mnt, relpath[0] == '@' ? "" : "/",2794relpath);2795}2796kmem_free(buf, ZAP_MAXVALUELEN);2797kmem_strfree(mnt);2798}27992800return (0);2801}28022803void2804dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)2805{2806dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;28072808ASSERT(dsl_pool_config_held(dp));28092810dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,2811dsl_get_refratio(ds));2812dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,2813dsl_get_logicalreferenced(ds));2814dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,2815dsl_get_compressratio(ds));2816dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,2817dsl_get_used(ds));28182819if (ds->ds_is_snapshot) {2820get_clones_stat(ds, nv);2821} else {2822char buf[ZFS_MAX_DATASET_NAME_LEN];2823if (dsl_get_prev_snap(ds, buf) == 0)2824dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,2825buf);2826dsl_dir_stats(ds->ds_dir, nv);2827}28282829nvlist_t *propval = fnvlist_alloc();2830dsl_get_redact_snaps(ds, propval);2831fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS),2832propval);2833nvlist_free(propval);28342835dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,2836dsl_get_available(ds));2837dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,2838dsl_get_referenced(ds));2839dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,2840dsl_get_creation(ds));2841dsl_prop_nvlist_add_uint64(nv, 
ZFS_PROP_CREATETXG,2842dsl_get_creationtxg(ds));2843dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,2844dsl_get_refquota(ds));2845dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,2846dsl_get_refreservation(ds));2847dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,2848dsl_get_guid(ds));2849dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,2850dsl_get_unique(ds));2851dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,2852dsl_get_objsetid(ds));2853dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,2854dsl_get_userrefs(ds));2855dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,2856dsl_get_defer_destroy(ds));2857dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOTS_CHANGED,2858dsl_dir_snap_cmtime(ds->ds_dir).tv_sec);2859dsl_dataset_crypt_stats(ds, nv);28602861if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {2862uint64_t written;2863if (dsl_get_written(ds, &written) == 0) {2864dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,2865written);2866}2867}28682869if (!dsl_dataset_is_snapshot(ds)) {2870char *token = get_receive_resume_token(ds);2871if (token != NULL) {2872dsl_prop_nvlist_add_string(nv,2873ZFS_PROP_RECEIVE_RESUME_TOKEN, token);2874kmem_strfree(token);2875}2876}2877}28782879void2880dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)2881{2882dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;2883ASSERT(dsl_pool_config_held(dp));28842885stat->dds_creation_txg = dsl_get_creationtxg(ds);2886stat->dds_inconsistent = dsl_get_inconsistent(ds);2887stat->dds_guid = dsl_get_guid(ds);2888stat->dds_redacted = dsl_get_redacted(ds);2889stat->dds_origin[0] = '\0';2890stat->dds_flags = DDS_FLAG_HAS_ENCRYPTED;2891if (ds->ds_dir->dd_crypto_obj != 0)2892stat->dds_flags |= DDS_FLAG_ENCRYPTED;2893if (ds->ds_is_snapshot) {2894stat->dds_is_snapshot = B_TRUE;2895stat->dds_num_clones = dsl_get_numclones(ds);2896} else {2897stat->dds_is_snapshot = B_FALSE;2898stat->dds_num_clones = 0;28992900if (dsl_dir_is_clone(ds->ds_dir)) {2901dsl_dir_get_origin(ds->ds_dir, 
stat->dds_origin);2902}2903}2904}29052906uint64_t2907dsl_dataset_fsid_guid(dsl_dataset_t *ds)2908{2909return (ds->ds_fsid_guid);2910}29112912void2913dsl_dataset_space(dsl_dataset_t *ds,2914uint64_t *refdbytesp, uint64_t *availbytesp,2915uint64_t *usedobjsp, uint64_t *availobjsp)2916{2917*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;2918*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);2919if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)2920*availbytesp +=2921ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;2922if (ds->ds_quota != 0) {2923/*2924* Adjust available bytes according to refquota2925*/2926if (*refdbytesp < ds->ds_quota)2927*availbytesp = MIN(*availbytesp,2928ds->ds_quota - *refdbytesp);2929else2930*availbytesp = 0;2931}2932rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);2933*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);2934rrw_exit(&ds->ds_bp_rwlock, FTAG);2935*availobjsp = DN_MAX_OBJECT - *usedobjsp;2936}29372938boolean_t2939dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)2940{2941dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;2942uint64_t birth;29432944ASSERT(dsl_pool_config_held(dp));2945if (snap == NULL)2946return (B_FALSE);2947rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);2948birth = BP_GET_BIRTH(dsl_dataset_get_blkptr(ds));2949rrw_exit(&ds->ds_bp_rwlock, FTAG);2950if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {2951objset_t *os, *os_snap;2952/*2953* It may be that only the ZIL differs, because it was2954* reset in the head. 
Don't count that as being2955* modified.2956*/2957if (dmu_objset_from_ds(ds, &os) != 0)2958return (B_TRUE);2959if (dmu_objset_from_ds(snap, &os_snap) != 0)2960return (B_TRUE);2961return (memcmp(&os->os_phys->os_meta_dnode,2962&os_snap->os_phys->os_meta_dnode,2963sizeof (os->os_phys->os_meta_dnode)) != 0);2964}2965return (B_FALSE);2966}29672968static int2969dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,2970dsl_dataset_t *hds, void *arg)2971{2972(void) dp;2973dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;2974int error;2975uint64_t val;29762977error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);2978if (error != 0) {2979/* ignore nonexistent snapshots */2980return (error == ENOENT ? 0 : error);2981}29822983/* new name should not exist */2984error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);2985if (error == 0)2986error = SET_ERROR(EEXIST);2987else if (error == ENOENT)2988error = 0;29892990/* dataset name + 1 for the "@" + the new snapshot name must fit */2991if (dsl_dir_namelen(hds->ds_dir) + 1 +2992strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)2993error = SET_ERROR(ENAMETOOLONG);29942995return (error);2996}29972998int2999dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)3000{3001dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;3002dsl_pool_t *dp = dmu_tx_pool(tx);3003dsl_dataset_t *hds;3004int error;30053006error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);3007if (error != 0)3008return (error);30093010if (ddrsa->ddrsa_recursive) {3011error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,3012dsl_dataset_rename_snapshot_check_impl, ddrsa,3013DS_FIND_CHILDREN);3014} else {3015error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);3016}3017dsl_dataset_rele(hds, FTAG);3018return (error);3019}30203021static int3022dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,3023dsl_dataset_t *hds, void *arg)3024{3025dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;3026dsl_dataset_t *ds;3027uint64_t 
val;3028dmu_tx_t *tx = ddrsa->ddrsa_tx;3029char *oldname, *newname;3030int error;30313032error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);3033ASSERT(error == 0 || error == ENOENT);3034if (error == ENOENT) {3035/* ignore nonexistent snapshots */3036return (0);3037}30383039VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));30403041/* log before we change the name */3042spa_history_log_internal_ds(ds, "rename", tx,3043"-> @%s", ddrsa->ddrsa_newsnapname);30443045VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,3046B_FALSE));3047mutex_enter(&ds->ds_lock);3048(void) strlcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname,3049sizeof (ds->ds_snapname));3050mutex_exit(&ds->ds_lock);3051VERIFY0(zap_add(dp->dp_meta_objset,3052dsl_dataset_phys(hds)->ds_snapnames_zapobj,3053ds->ds_snapname, 8, 1, &ds->ds_object, tx));30543055oldname = kmem_asprintf("%s@%s", ddrsa->ddrsa_fsname,3056ddrsa->ddrsa_oldsnapname);3057newname = kmem_asprintf("%s@%s", ddrsa->ddrsa_fsname,3058ddrsa->ddrsa_newsnapname);3059zvol_rename_minors(dp->dp_spa, oldname, newname, B_TRUE);3060kmem_strfree(oldname);3061kmem_strfree(newname);30623063dsl_dataset_rele(ds, FTAG);3064return (0);3065}30663067void3068dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)3069{3070dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;3071dsl_pool_t *dp = dmu_tx_pool(tx);3072dsl_dataset_t *hds = NULL;30733074VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));3075ddrsa->ddrsa_tx = tx;3076if (ddrsa->ddrsa_recursive) {3077VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,3078dsl_dataset_rename_snapshot_sync_impl, ddrsa,3079DS_FIND_CHILDREN));3080} else {3081VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));3082}3083dsl_dataset_rele(hds, FTAG);3084}30853086int3087dsl_dataset_rename_snapshot(const char *fsname,3088const char *oldsnapname, const char *newsnapname, boolean_t recursive)3089{3090dsl_dataset_rename_snapshot_arg_t ddrsa;30913092ddrsa.ddrsa_fsname = 
fsname;3093ddrsa.ddrsa_oldsnapname = oldsnapname;3094ddrsa.ddrsa_newsnapname = newsnapname;3095ddrsa.ddrsa_recursive = recursive;30963097return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,3098dsl_dataset_rename_snapshot_sync, &ddrsa,30991, ZFS_SPACE_CHECK_RESERVED));3100}31013102/*3103* If we're doing an ownership handoff, we need to make sure that there is3104* only one long hold on the dataset. We're not allowed to change anything here3105* so we don't permanently release the long hold or regular hold here. We want3106* to do this only when syncing to avoid the dataset unexpectedly going away3107* when we release the long hold.3108*/3109static int3110dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)3111{3112boolean_t held = B_FALSE;31133114if (!dmu_tx_is_syncing(tx))3115return (0);31163117dsl_dir_t *dd = ds->ds_dir;3118mutex_enter(&dd->dd_activity_lock);3119uint64_t holds = zfs_refcount_count(&ds->ds_longholds) -3120(owner != NULL ? 1 : 0);3121/*3122* The value of dd_activity_waiters can change as soon as we drop the3123* lock, but we're fine with that; new waiters coming in or old3124* waiters leaving doesn't cause problems, since we're going to cancel3125* waiters later anyway. 
The goal of this check is to verify that no3126* non-waiters have long-holds, and all new long-holds will be3127* prevented because we're holding the pool config as writer.3128*/3129if (holds != dd->dd_activity_waiters)3130held = B_TRUE;3131mutex_exit(&dd->dd_activity_lock);31323133if (held)3134return (SET_ERROR(EBUSY));31353136return (0);3137}31383139int3140dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)3141{3142dsl_dataset_rollback_arg_t *ddra = arg;3143dsl_pool_t *dp = dmu_tx_pool(tx);3144dsl_dataset_t *ds;3145int64_t unused_refres_delta;3146int error;31473148error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);3149if (error != 0)3150return (error);31513152/* must not be a snapshot */3153if (ds->ds_is_snapshot) {3154dsl_dataset_rele(ds, FTAG);3155return (SET_ERROR(EINVAL));3156}31573158/* must have a most recent snapshot */3159if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {3160dsl_dataset_rele(ds, FTAG);3161return (SET_ERROR(ESRCH));3162}31633164/*3165* No rollback to a snapshot created in the current txg, because3166* the rollback may dirty the dataset and create blocks that are3167* not reachable from the rootbp while having a birth txg that3168* falls into the snapshot's range.3169*/3170if (dmu_tx_is_syncing(tx) &&3171dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {3172dsl_dataset_rele(ds, FTAG);3173return (SET_ERROR(EAGAIN));3174}31753176/*3177* If the expected target snapshot is specified, then check that3178* the latest snapshot is it.3179*/3180if (ddra->ddra_tosnap != NULL) {3181dsl_dataset_t *snapds;31823183/* Check if the target snapshot exists at all. 
*/3184error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);3185if (error != 0) {3186/*3187* ESRCH is used to signal that the target snapshot does3188* not exist, while ENOENT is used to report that3189* the rolled back dataset does not exist.3190* ESRCH is also used to cover other cases where the3191* target snapshot is not related to the dataset being3192* rolled back such as being in a different pool.3193*/3194if (error == ENOENT || error == EXDEV)3195error = SET_ERROR(ESRCH);3196dsl_dataset_rele(ds, FTAG);3197return (error);3198}3199ASSERT(snapds->ds_is_snapshot);32003201/* Check if the snapshot is the latest snapshot indeed. */3202if (snapds != ds->ds_prev) {3203/*3204* Distinguish between the case where the only problem3205* is intervening snapshots (EEXIST) vs the snapshot3206* not being a valid target for rollback (ESRCH).3207*/3208if (snapds->ds_dir == ds->ds_dir ||3209(dsl_dir_is_clone(ds->ds_dir) &&3210dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==3211snapds->ds_object)) {3212error = SET_ERROR(EEXIST);3213} else {3214error = SET_ERROR(ESRCH);3215}3216dsl_dataset_rele(snapds, FTAG);3217dsl_dataset_rele(ds, FTAG);3218return (error);3219}3220dsl_dataset_rele(snapds, FTAG);3221}32223223/* must not have any bookmarks after the most recent snapshot */3224if (dsl_bookmark_latest_txg(ds) >3225dsl_dataset_phys(ds)->ds_prev_snap_txg) {3226dsl_dataset_rele(ds, FTAG);3227return (SET_ERROR(EEXIST));3228}32293230error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);3231if (error != 0) {3232dsl_dataset_rele(ds, FTAG);3233return (error);3234}32353236/*3237* Check if the snap we are rolling back to uses more than3238* the refquota.3239*/3240if (ds->ds_quota != 0 &&3241dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {3242dsl_dataset_rele(ds, FTAG);3243return (SET_ERROR(EDQUOT));3244}32453246/*3247* When we do the clone swap, we will temporarily use more space3248* due to the refreservation (the head will no longer have any3249* unique space, 
so the entire amount of the refreservation will need3250* to be free). We will immediately destroy the clone, freeing3251* this space, but the freeing happens over many txg's.3252*/3253unused_refres_delta = (int64_t)MIN(ds->ds_reserved,3254dsl_dataset_phys(ds)->ds_unique_bytes);32553256if (unused_refres_delta > 0 &&3257unused_refres_delta >3258dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {3259dsl_dataset_rele(ds, FTAG);3260return (SET_ERROR(ENOSPC));3261}32623263dsl_dataset_rele(ds, FTAG);3264return (0);3265}32663267void3268dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)3269{3270dsl_dataset_rollback_arg_t *ddra = arg;3271dsl_pool_t *dp = dmu_tx_pool(tx);3272dsl_dataset_t *ds, *clone;3273uint64_t cloneobj;3274char namebuf[ZFS_MAX_DATASET_NAME_LEN];32753276VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));32773278dsl_dataset_name(ds->ds_prev, namebuf);3279fnvlist_add_string(ddra->ddra_result, "target", namebuf);32803281cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",3282ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx);32833284VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));32853286dsl_dataset_clone_swap_sync_impl(clone, ds, tx);3287dsl_dataset_zero_zil(ds, tx);32883289dsl_destroy_head_sync_impl(clone, tx);32903291dsl_dataset_rele(clone, FTAG);3292dsl_dataset_rele(ds, FTAG);3293}32943295/*3296* Rolls back the given filesystem or volume to the most recent snapshot.3297* The name of the most recent snapshot will be returned under key "target"3298* in the result nvlist.3299*3300* If owner != NULL:3301* - The existing dataset MUST be owned by the specified owner at entry3302* - Upon return, dataset will still be held by the same owner, whether we3303* succeed or not.3304*3305* This mode is required any time the existing filesystem is mounted. 
See3306* notes above zfs_suspend_fs() for further details.3307*/3308int3309dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,3310nvlist_t *result)3311{3312dsl_dataset_rollback_arg_t ddra;33133314ddra.ddra_fsname = fsname;3315ddra.ddra_tosnap = tosnap;3316ddra.ddra_owner = owner;3317ddra.ddra_result = result;33183319return (dsl_sync_task(fsname, dsl_dataset_rollback_check,3320dsl_dataset_rollback_sync, &ddra,33211, ZFS_SPACE_CHECK_RESERVED));3322}33233324int3325dsl_dataset_clone_check(void *arg, dmu_tx_t *tx)3326{3327dsl_dataset_clone_arg_t *ddca = arg;3328dsl_dir_t *pdd;3329const char *tail;3330int error;3331dsl_dataset_t *origin;3332dsl_pool_t *dp = dmu_tx_pool(tx);33333334if (strchr(ddca->ddca_clone, '@') != NULL)3335return (SET_ERROR(EINVAL));33363337if (strlen(ddca->ddca_clone) >= ZFS_MAX_DATASET_NAME_LEN)3338return (SET_ERROR(ENAMETOOLONG));33393340error = dsl_dir_hold(dp, ddca->ddca_clone, FTAG, &pdd, &tail);3341if (error != 0)3342return (error);3343if (tail == NULL) {3344dsl_dir_rele(pdd, FTAG);3345return (SET_ERROR(EEXIST));3346}33473348error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,3349ddca->ddca_cred);3350if (error != 0) {3351dsl_dir_rele(pdd, FTAG);3352return (SET_ERROR(EDQUOT));3353}33543355error = dsl_dataset_hold(dp, ddca->ddca_origin, FTAG, &origin);3356if (error != 0) {3357dsl_dir_rele(pdd, FTAG);3358return (error);3359}33603361/* You can only clone snapshots, not the head datasets. 
*/3362if (!origin->ds_is_snapshot) {3363dsl_dataset_rele(origin, FTAG);3364dsl_dir_rele(pdd, FTAG);3365return (SET_ERROR(EINVAL));3366}33673368dsl_dataset_rele(origin, FTAG);3369dsl_dir_rele(pdd, FTAG);33703371return (0);3372}33733374void3375dsl_dataset_clone_sync(void *arg, dmu_tx_t *tx)3376{3377dsl_dataset_clone_arg_t *ddca = arg;3378dsl_pool_t *dp = dmu_tx_pool(tx);3379dsl_dir_t *pdd;3380const char *tail;3381dsl_dataset_t *origin, *ds;3382uint64_t obj;3383char namebuf[ZFS_MAX_DATASET_NAME_LEN];33843385VERIFY0(dsl_dir_hold(dp, ddca->ddca_clone, FTAG, &pdd, &tail));3386VERIFY0(dsl_dataset_hold(dp, ddca->ddca_origin, FTAG, &origin));33873388obj = dsl_dataset_create_sync(pdd, tail, origin, 0,3389ddca->ddca_cred, NULL, tx);33903391VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));3392dsl_dataset_name(origin, namebuf);3393spa_history_log_internal_ds(ds, "clone", tx,3394"origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object);3395dsl_dataset_rele(ds, FTAG);3396dsl_dataset_rele(origin, FTAG);3397dsl_dir_rele(pdd, FTAG);3398}33993400int3401dsl_dataset_clone(const char *clone, const char *origin)3402{3403dsl_dataset_clone_arg_t ddca;34043405cred_t *cr = CRED();3406crhold(cr);34073408ddca.ddca_clone = clone;3409ddca.ddca_origin = origin;3410ddca.ddca_cred = cr;34113412int rv = dsl_sync_task(clone,3413dsl_dataset_clone_check, dsl_dataset_clone_sync, &ddca,34146, ZFS_SPACE_CHECK_NORMAL);34153416if (rv == 0)3417zvol_create_minors(clone);34183419crfree(cr);34203421return (rv);3422}34233424struct promotenode {3425list_node_t link;3426dsl_dataset_t *ds;3427};34283429static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);3430static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,3431const void *tag);3432static void promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag);34333434int3435dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)3436{3437dsl_dataset_promote_arg_t *ddpa = arg;3438dsl_pool_t *dp = 
dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	int err;
	uint64_t unused;
	uint64_t ss_mv_cnt;
	size_t max_snap_len;
	boolean_t conflicting_snaps;

	/*
	 * Take all of the holds needed for the checks below.  Every return
	 * path after this point must drop them with promote_rele().
	 */
	err = promote_hold(ddpa, dp, FTAG);
	if (err != 0)
		return (err);

	hds = ddpa->ddpa_clone;
	/* Longest snapshot/bookmark name that still fits after the clone name. */
	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;

	/* Datasets flagged DS_FLAG_NOPROMOTE are refused with EXDEV. */
	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
		promote_rele(ddpa, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* The head of shared_snaps is the clone's origin snapshot. */
	snap = list_head(&ddpa->shared_snaps);
	if (snap == NULL) {
		err = SET_ERROR(ENOENT);
		goto out;
	}
	dsl_dataset_t *const origin_ds = snap->ds;

	/*
	 * Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
	 * When doing a promote we must make sure the encryption root for
	 * both the target and the target's origin does not change to avoid
	 * needing to rewrap encryption keys
	 */
	err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
	if (err != 0)
		goto out;

	/*
	 * Compute and check the amount of space to transfer.  Since this is
	 * so expensive, don't do the preliminary check.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		promote_rele(ddpa, FTAG);
		return (0);
	}

	/* compute origin's new unique space */
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT(snap != NULL);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
	    &ddpa->unique, &unused, &unused);

	/*
	 * Walk the snapshots that we are moving
	 *
	 * Compute space to transfer.  Consider the incremental changes
	 * to used by each snapshot:
	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
	 * So each snapshot gave birth to:
	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
	 * So a sequence would look like:
	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
	 * Which simplifies to:
	 * uN + kN + kN-1 + ... + k1 + k0
	 * Note however, if we stop before we reach the ORIGIN we get:
	 * uN + kN + kN-1 + ... + kM - uM-1
	 */
	conflicting_snaps = B_FALSE;
	ss_mv_cnt = 0;
	/* Start from uN (the origin's referenced space); the loop adds the k's. */
	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		uint64_t val, dlused, dlcomp, dluncomp;
		dsl_dataset_t *ds = snap->ds;

		/* Count of snapshots moved; passed to the limit check below. */
		ss_mv_cnt++;

		/*
		 * If there are long holds, we won't be able to evict
		 * the objset.
		 */
		if (dsl_dataset_long_held(ds)) {
			err = SET_ERROR(EBUSY);
			goto out;
		}

		/* Check that the snapshot name does not conflict */
		VERIFY0(dsl_dataset_get_snapname(ds));
		if (strlen(ds->ds_snapname) >= max_snap_len) {
			err = SET_ERROR(ENAMETOOLONG);
			goto out;
		}
		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
		if (err == 0) {
			/*
			 * A successful lookup means the name is already
			 * taken on the clone: record it and keep going so
			 * that every conflict is collected.
			 */
			fnvlist_add_boolean(ddpa->err_ds,
			    snap->ds->ds_snapname);
			conflicting_snaps = B_TRUE;
		} else if (err != ENOENT) {
			goto out;
		}

		/* The very first snapshot does not have a deadlist */
		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
			continue;

		dsl_deadlist_space(&ds->ds_deadlist,
		    &dlused, &dlcomp, &dluncomp);
		ddpa->used += dlused;
		ddpa->comp += dlcomp;
		ddpa->uncomp += dluncomp;
	}

	/*
	 * Check that bookmarks that are being transferred don't have
	 * name conflicts.
	 */
	for (dsl_bookmark_node_t *dbn = avl_first(&origin_ds->ds_bookmarks);
	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
	    dbn = AVL_NEXT(&origin_ds->ds_bookmarks, dbn)) {
		if (strlen(dbn->dbn_name) >= max_snap_len) {
			err = SET_ERROR(ENAMETOOLONG);
			goto out;
		}
		zfs_bookmark_phys_t bm;
		err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone,
		    dbn->dbn_name, &bm);

		/*
		 * Success is a conflict; ESRCH is treated as "no
		 * conflict"; anything else aborts the check.
		 */
		if (err == 0) {
			fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name);
			conflicting_snaps = B_TRUE;
		} else if (err == ESRCH) {
			err = 0;
		}
		if (err != 0) {
			goto out;
		}
	}

	/*
	 * In order to return the full list of conflicting snapshots, we check
	 * whether there was a conflict after traversing all of them.
	 */
	if (conflicting_snaps) {
		err = SET_ERROR(EEXIST);
		goto out;
	}

	/*
	 * If we are a clone of a clone then we never reached ORIGIN,
	 * so we need to subtract out the clone origin's used space.
	 */
	if (ddpa->origin_origin) {
		ddpa->used -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
		ddpa->comp -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
		ddpa->uncomp -=
		    dsl_dataset_phys(ddpa->origin_origin)->
		    ds_uncompressed_bytes;
	}

	/* Check that there is enough space and limit headroom here */
	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
	if (err != 0)
		goto out;

	/*
	 * Compute the amounts of space that will be used by snapshots
	 * after the promotion (for both origin and clone).  For each,
	 * it is the amount of space that will be on all of their
	 * deadlists (that was not born before their new origin).
	 */
	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		uint64_t space;

		/*
		 * Note, typically this will not be a clone of a clone,
		 * so dd_origin_txg will be < TXG_INITIAL, so
		 * these snaplist_space() -> dsl_deadlist_space_range()
		 * calls will be fast because they do not have to
		 * iterate over all bps.
		 */
		snap = list_head(&ddpa->origin_snaps);
		if (snap == NULL) {
			err = SET_ERROR(ENOENT);
			goto out;
		}
		err = snaplist_space(&ddpa->shared_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
		if (err != 0)
			goto out;

		err = snaplist_space(&ddpa->clone_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &space);
		if (err != 0)
			goto out;
		ddpa->cloneusedsnap += space;
	}
	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
	    DD_FLAG_USED_BREAKDOWN) {
		err = snaplist_space(&ddpa->origin_snaps,
		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
		    &ddpa->originusedsnap);
		if (err != 0)
			goto out;
	}

out:
	/* Common exit: drop everything promote_hold() acquired. */
	promote_rele(ddpa, FTAG);
	return (err);
}

void
dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	dsl_dataset_t *origin_ds;
	dsl_dataset_t *origin_head;
	dsl_dir_t *dd;
	dsl_dir_t *odd = NULL;
	uint64_t oldnext_obj;
	int64_t delta;

	ASSERT(nvlist_empty(ddpa->err_ds));

	VERIFY0(promote_hold(ddpa, dp, FTAG));
	hds = ddpa->ddpa_clone;

	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);

	snap = list_head(&ddpa->shared_snaps);
	origin_ds = snap->ds;
	dd = hds->ds_dir;

	snap = list_head(&ddpa->origin_snaps);
	origin_head = snap->ds;

	/*
	 * We need to explicitly open odd, since origin_ds's dd will be
changing.
	 */
	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
	    NULL, FTAG, &odd));

	dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx);

	/* change origin's next snap */
	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
	/* Remember the old value; it is needed for the fix-ups below. */
	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;

	/* change the origin's next clone */
	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
		dsl_dataset_remove_from_next_clones(origin_ds,
		    snap->ds->ds_object, tx);
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
		    oldnext_obj, tx));
	}

	/* change origin */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
	/* dd takes over odd's origin; odd then becomes a clone of origin_ds. */
	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
	dmu_buf_will_dirty(odd->dd_dbuf, tx);
	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
	origin_head->ds_dir->dd_origin_txg =
	    dsl_dataset_phys(origin_ds)->ds_creation_txg;

	/* change dd_clone entries */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
		VERIFY0(zap_remove_int(dp->dp_meta_objset,
		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
		    hds->ds_object, tx));

		VERIFY0(zap_remove_int(dp->dp_meta_objset,
		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
		    origin_head->ds_object, tx));
		/* Create dd's clones ZAP if it does not have one yet. */
		if (dsl_dir_phys(dd)->dd_clones == 0) {
			dsl_dir_phys(dd)->dd_clones =
			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
			    DMU_OT_NONE, 0, tx);
		}
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
	}

	/*
	 * Move bookmarks to this dir.
	 */
	dsl_bookmark_node_t *dbn_next;
	for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
	    dbn = dbn_next) {
		/* Fetch the successor first: dbn is removed from the tree. */
		dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn);

		avl_remove(&origin_head->ds_bookmarks, dbn);
		VERIFY0(zap_remove(dp->dp_meta_objset,
		    origin_head->ds_bookmarks_obj, dbn->dbn_name, tx));

		dsl_bookmark_node_add(hds, dbn, tx);
	}

	dsl_bookmark_next_changed(hds, origin_ds, tx);

	/* move snapshots to this dir */
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		dsl_dataset_t *ds = snap->ds;

		/*
		 * Property callbacks are registered to a particular
		 * dsl_dir.  Since ours is changing, evict the objset
		 * so that they will be unregistered from the old dsl_dir.
		 */
		if (ds->ds_objset) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}

		/* move snap name entry */
		VERIFY0(dsl_dataset_get_snapname(ds));
		VERIFY0(dsl_dataset_snap_remove(origin_head,
		    ds->ds_snapname, tx, B_TRUE));
		VERIFY0(zap_add(dp->dp_meta_objset,
		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &ds->ds_object, tx));
		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

		/* change containing dsl_dir */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
		ASSERT3P(ds->ds_dir, ==, odd);
		/* The dataset itself is the tag for its dsl_dir hold. */
		dsl_dir_rele(ds->ds_dir, ds);
		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
		    NULL, ds, &ds->ds_dir));

		/* move any clone references */
		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			zap_cursor_t zc;
			zap_attribute_t *za = zap_attribute_alloc();

			for (zap_cursor_init(&zc, dp->dp_meta_objset,
			    dsl_dataset_phys(ds)->ds_next_clones_obj);
			    zap_cursor_retrieve(&zc, za) == 0;
			    zap_cursor_advance(&zc)) {
				dsl_dataset_t *cnds;
				uint64_t o;

				if (za->za_first_integer == oldnext_obj) {
					/*
					 * We've already moved the
					 * origin's reference.
					 */
					continue;
				}

				VERIFY0(dsl_dataset_hold_obj(dp,
				    za->za_first_integer, FTAG, &cnds));
				o = dsl_dir_phys(cnds->ds_dir)->
				    dd_head_dataset_obj;

				VERIFY0(zap_remove_int(dp->dp_meta_objset,
				    dsl_dir_phys(odd)->dd_clones, o, tx));
				VERIFY0(zap_add_int(dp->dp_meta_objset,
				    dsl_dir_phys(dd)->dd_clones, o, tx));
				dsl_dataset_rele(cnds, FTAG);
			}
			zap_cursor_fini(&zc);
			zap_attribute_free(za);
		}

		ASSERT(!dsl_prop_hascb(ds));
	}

	/*
	 * Change space accounting.
	 * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
	 * both be valid, or both be 0 (resulting in delta == 0).  This
	 * is true for each of {clone,origin} independently.
	 */

	/* The promoted clone's dir gains the transferred space ... */
	delta = ddpa->cloneusedsnap -
	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, >=, 0);
	ASSERT3U(ddpa->used, >=, delta);
	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(dd, DD_USED_HEAD,
	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);

	/* ... and the old origin's dir gives the same amount up. */
	delta = ddpa->originusedsnap -
	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, <=, 0);
	ASSERT3U(ddpa->used, >=, -delta);
	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(odd, DD_USED_HEAD,
	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);

	/* Value computed earlier by dsl_dataset_promote_check(). */
	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;

	/*
	 * Since livelists are specific to a clone's origin txg, they
	 * are no longer accurate.  Destroy the livelist from the clone being
	 * promoted.
If the origin dataset is a clone, destroy its livelist3863* as well.3864*/3865dsl_dir_remove_livelist(dd, tx, B_TRUE);3866dsl_dir_remove_livelist(odd, tx, B_TRUE);38673868/* log history record */3869spa_history_log_internal_ds(hds, "promote", tx, " ");38703871dsl_dir_rele(odd, FTAG);38723873/*3874* Transfer common error blocks from old head to new head, before3875* calling promote_rele() on ddpa since we need to dereference3876* origin_head and hds.3877*/3878if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) {3879uint64_t old_head = origin_head->ds_object;3880uint64_t new_head = hds->ds_object;3881spa_swap_errlog(dp->dp_spa, new_head, old_head, tx);3882}38833884promote_rele(ddpa, FTAG);3885}38863887/*3888* Make a list of dsl_dataset_t's for the snapshots between first_obj3889* (exclusive) and last_obj (inclusive). The list will be in reverse3890* order (last_obj will be the list_head()). If first_obj == 0, do all3891* snapshots back to this dataset's origin.3892*/3893static int3894snaplist_make(dsl_pool_t *dp,3895uint64_t first_obj, uint64_t last_obj, list_t *l, const void *tag)3896{3897uint64_t obj = last_obj;38983899list_create(l, sizeof (struct promotenode),3900offsetof(struct promotenode, link));39013902while (obj != first_obj) {3903dsl_dataset_t *ds;3904struct promotenode *snap;3905int err;39063907err = dsl_dataset_hold_obj(dp, obj, tag, &ds);3908ASSERT(err != ENOENT);3909if (err != 0)3910return (err);39113912if (first_obj == 0)3913first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;39143915snap = kmem_alloc(sizeof (*snap), KM_SLEEP);3916snap->ds = ds;3917list_insert_tail(l, snap);3918obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;3919}39203921return (0);3922}39233924static int3925snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)3926{3927struct promotenode *snap;39283929*spacep = 0;3930for (snap = list_head(l); snap; snap = list_next(l, snap)) {3931uint64_t used, comp, 
uncomp;3932dsl_deadlist_space_range(&snap->ds->ds_deadlist,3933mintxg, UINT64_MAX, &used, &comp, &uncomp);3934*spacep += used;3935}3936return (0);3937}39383939static void3940snaplist_destroy(list_t *l, const void *tag)3941{3942struct promotenode *snap;39433944if (l == NULL || !list_link_active(&l->list_head))3945return;39463947while ((snap = list_remove_tail(l)) != NULL) {3948dsl_dataset_rele(snap->ds, tag);3949kmem_free(snap, sizeof (*snap));3950}3951list_destroy(l);3952}39533954static int3955promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, const void *tag)3956{3957int error;3958dsl_dir_t *dd;3959struct promotenode *snap;39603961error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,3962&ddpa->ddpa_clone);3963if (error != 0)3964return (error);3965dd = ddpa->ddpa_clone->ds_dir;39663967if (ddpa->ddpa_clone->ds_is_snapshot ||3968!dsl_dir_is_clone(dd)) {3969dsl_dataset_rele(ddpa->ddpa_clone, tag);3970return (SET_ERROR(EINVAL));3971}39723973error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,3974&ddpa->shared_snaps, tag);3975if (error != 0)3976goto out;39773978error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,3979&ddpa->clone_snaps, tag);3980if (error != 0)3981goto out;39823983snap = list_head(&ddpa->shared_snaps);3984ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);3985error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,3986dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,3987&ddpa->origin_snaps, tag);3988if (error != 0)3989goto out;39903991if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {3992error = dsl_dataset_hold_obj(dp,3993dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,3994tag, &ddpa->origin_origin);3995if (error != 0)3996goto out;3997}3998out:3999if (error != 0)4000promote_rele(ddpa, tag);4001return (error);4002}40034004static void4005promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag)4006{4007snaplist_destroy(&ddpa->shared_snaps, tag);4008snaplist_destroy(&ddpa->clone_snaps, 
tag);4009snaplist_destroy(&ddpa->origin_snaps, tag);4010if (ddpa->origin_origin != NULL)4011dsl_dataset_rele(ddpa->origin_origin, tag);4012dsl_dataset_rele(ddpa->ddpa_clone, tag);4013}40144015/*4016* Promote a clone.4017*4018* If it fails due to a conflicting snapshot name, "conflsnap" will be filled4019* in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)4020*/4021int4022dsl_dataset_promote(const char *name, char *conflsnap)4023{4024dsl_dataset_promote_arg_t ddpa = { 0 };4025uint64_t numsnaps;4026int error;4027nvpair_t *snap_pair;4028objset_t *os;40294030/*4031* We will modify space proportional to the number of4032* snapshots. Compute numsnaps.4033*/4034error = dmu_objset_hold(name, FTAG, &os);4035if (error != 0)4036return (error);4037error = zap_count(dmu_objset_pool(os)->dp_meta_objset,4038dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,4039&numsnaps);4040dmu_objset_rele(os, FTAG);4041if (error != 0)4042return (error);40434044cred_t *cr = CRED();4045crhold(cr);40464047ddpa.ddpa_clonename = name;4048ddpa.err_ds = fnvlist_alloc();4049ddpa.cr = cr;40504051error = dsl_sync_task(name, dsl_dataset_promote_check,4052dsl_dataset_promote_sync, &ddpa,40532 + numsnaps, ZFS_SPACE_CHECK_RESERVED);40544055crfree(cr);40564057/*4058* Return the first conflicting snapshot found.4059*/4060snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);4061if (snap_pair != NULL && conflsnap != NULL)4062(void) strlcpy(conflsnap, nvpair_name(snap_pair),4063ZFS_MAX_DATASET_NAME_LEN);40644065fnvlist_free(ddpa.err_ds);4066return (error);4067}40684069int4070dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,4071dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)4072{4073/*4074* "slack" factor for received datasets with refquota set on them.4075* See the bottom of this function for details on its use.4076*/4077uint64_t refquota_slack = (uint64_t)DMU_MAX_ACCESS *4078spa_asize_inflation;4079int64_t unused_refres_delta;40804081/* they should both 
be heads */
	if (clone->ds_is_snapshot ||
	    origin_head->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* if we are not forcing, the branch point should be just before them */
	if (!force && clone->ds_prev != origin_head->ds_prev)
		return (SET_ERROR(EINVAL));

	/* clone should be the clone (unless they are unrelated) */
	if (clone->ds_prev != NULL &&
	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
	    origin_head->ds_dir != clone->ds_prev->ds_dir)
		return (SET_ERROR(EINVAL));

	/* the clone should be a child of the origin */
	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
		return (SET_ERROR(EINVAL));

	/* origin_head shouldn't be modified unless 'force' */
	if (!force &&
	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
		return (SET_ERROR(ETXTBSY));

	/* origin_head should have no long holds (e.g. is not mounted) */
	if (dsl_dataset_handoff_check(origin_head, owner, tx))
		return (SET_ERROR(EBUSY));

	/*
	 * check amount of any unconsumed refreservation
	 *
	 * The delta compares origin_head's current unique bytes with the
	 * clone's, each capped at origin_head's ds_reserved.  Only a
	 * positive delta is checked against the available space.
	 */
	unused_refres_delta =
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(clone)->ds_unique_bytes);

	if (unused_refres_delta > 0 &&
	    unused_refres_delta >
	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
		return (SET_ERROR(ENOSPC));

	/*
	 * The clone can't be too much over the head's refquota.
	 *
	 * To ensure that the entire refquota can be used, we allow one
	 * transaction to exceed the refquota.  Therefore, this check
	 * needs to also allow for the space referenced to be more than the
	 * refquota.  The maximum amount of space that one transaction can use
	 * on disk is DMU_MAX_ACCESS * spa_asize_inflation.
Allowing this4129* overage ensures that we are able to receive a filesystem that4130* exceeds the refquota on the source system.4131*4132* So that overage is the refquota_slack we use below.4133*/4134if (origin_head->ds_quota != 0 &&4135dsl_dataset_phys(clone)->ds_referenced_bytes >4136origin_head->ds_quota + refquota_slack)4137return (SET_ERROR(EDQUOT));41384139return (0);4140}41414142static void4143dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,4144dsl_dataset_t *origin, dmu_tx_t *tx)4145{4146uint64_t clone_remap_dl_obj, origin_remap_dl_obj;4147dsl_pool_t *dp = dmu_tx_pool(tx);41484149ASSERT(dsl_pool_sync_context(dp));41504151clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);4152origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);41534154if (clone_remap_dl_obj != 0) {4155dsl_deadlist_close(&clone->ds_remap_deadlist);4156dsl_dataset_unset_remap_deadlist_object(clone, tx);4157}4158if (origin_remap_dl_obj != 0) {4159dsl_deadlist_close(&origin->ds_remap_deadlist);4160dsl_dataset_unset_remap_deadlist_object(origin, tx);4161}41624163if (clone_remap_dl_obj != 0) {4164dsl_dataset_set_remap_deadlist_object(origin,4165clone_remap_dl_obj, tx);4166VERIFY0(dsl_deadlist_open(&origin->ds_remap_deadlist,4167dp->dp_meta_objset, clone_remap_dl_obj));4168}4169if (origin_remap_dl_obj != 0) {4170dsl_dataset_set_remap_deadlist_object(clone,4171origin_remap_dl_obj, tx);4172VERIFY0(dsl_deadlist_open(&clone->ds_remap_deadlist,4173dp->dp_meta_objset, origin_remap_dl_obj));4174}4175}41764177void4178dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,4179dsl_dataset_t *origin_head, dmu_tx_t *tx)4180{4181dsl_pool_t *dp = dmu_tx_pool(tx);4182int64_t unused_refres_delta;41834184ASSERT0(clone->ds_reserved);4185/*4186* NOTE: On DEBUG kernels there could be a race between this and4187* the check function if spa_asize_inflation is adjusted...4188*/4189ASSERT(origin_head->ds_quota == 0 ||4190dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota 
+
	    DMU_MAX_ACCESS * spa_asize_inflation);
	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);

	dsl_dir_cancel_waiters(origin_head->ds_dir);

	/*
	 * Swap per-dataset feature flags.
	 */
	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
		if (!(spa_feature_table[f].fi_flags &
		    ZFEATURE_FLAG_PER_DATASET)) {
			ASSERT(!dsl_dataset_feature_is_active(clone, f));
			ASSERT(!dsl_dataset_feature_is_active(origin_head, f));
			continue;
		}

		/* Snapshot both sides' state before either is modified. */
		boolean_t clone_inuse = dsl_dataset_feature_is_active(clone, f);
		void *clone_feature = clone->ds_feature[f];
		boolean_t origin_head_inuse =
		    dsl_dataset_feature_is_active(origin_head, f);
		void *origin_head_feature = origin_head->ds_feature[f];

		if (clone_inuse)
			dsl_dataset_deactivate_feature_impl(clone, f, tx);
		if (origin_head_inuse)
			dsl_dataset_deactivate_feature_impl(origin_head, f, tx);

		/* Re-activate each saved feature on the opposite dataset. */
		if (clone_inuse) {
			dsl_dataset_activate_feature(origin_head->ds_object, f,
			    clone_feature, tx);
			origin_head->ds_feature[f] = clone_feature;
		}
		if (origin_head_inuse) {
			dsl_dataset_activate_feature(clone->ds_object, f,
			    origin_head_feature, tx);
			clone->ds_feature[f] = origin_head_feature;
		}
	}

	dmu_buf_will_dirty(clone->ds_dbuf, tx);
	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);

	if (clone->ds_objset != NULL) {
		dmu_objset_evict(clone->ds_objset);
		clone->ds_objset = NULL;
	}

	if (origin_head->ds_objset != NULL) {
		dmu_objset_evict(origin_head->ds_objset);
		origin_head->ds_objset = NULL;
	}

	/*
	 * Computed here, before the ds_unique_bytes values are swapped
	 * further down; applied to DD_USED_REFRSRV after that swap.
	 */
	unused_refres_delta =
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(clone)->ds_unique_bytes);

	/*
	 * Reset origin's unique bytes.
	 */
	{
		dsl_dataset_t *origin = clone->ds_prev;
		uint64_t comp, uncomp;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		dsl_deadlist_space_range(&clone->ds_deadlist,
		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
	}

	/*
	 * swap blkptrs
	 *
	 * Both ds_bp_rwlocks are taken as writer (clone first) and
	 * released in the reverse order.
	 */
	{
		rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
		rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
		blkptr_t tmp;
		tmp = dsl_dataset_phys(origin_head)->ds_bp;
		dsl_dataset_phys(origin_head)->ds_bp =
		    dsl_dataset_phys(clone)->ds_bp;
		dsl_dataset_phys(clone)->ds_bp = tmp;
		rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
		rrw_exit(&clone->ds_bp_rwlock, FTAG);
	}

	/* set dd_*_bytes */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
		    dd_used_breakdown[DD_USED_SNAP], ==, 0);

		dsl_deadlist_space(&clone->ds_deadlist,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space(&origin_head->ds_deadlist,
		    &odl_used, &odl_comp, &odl_uncomp);

		/*
		 * Each delta is (clone referenced + clone deadlist) minus
		 * (origin_head referenced + origin_head deadlist).
		 */
		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
		    cdl_used -
		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
		    odl_used);
		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
		    cdl_comp -
		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
		    odl_comp);
		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
		    odl_uncomp);

		/* Charge the delta to one dir and credit it to the other. */
		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
		    -dused, -dcomp, -duncomp, tx);

		/*
		 * The difference in the space used by snapshots is the
		 * difference in snapshot space due to the head's
		 * deadlist (since that's the only thing that's
		 * changing that affects the snapused).
		 */
		dsl_deadlist_space_range(&clone->ds_deadlist,
		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space_range(&origin_head->ds_deadlist,
		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
		    &odl_used, &odl_comp, &odl_uncomp);
		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
		    DD_USED_HEAD, DD_USED_SNAP, tx);
	}

	/* swap ds_*_bytes */
	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
	    dsl_dataset_phys(clone)->ds_referenced_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
	    dsl_dataset_phys(clone)->ds_compressed_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
	    dsl_dataset_phys(clone)->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
	    unused_refres_delta, 0, 0, tx);

	/*
	 * Swap deadlists.
	 *
	 * Both deadlists are closed, the on-disk object numbers are
	 * exchanged, and each deadlist is reopened on its new object.
	 */
	dsl_deadlist_close(&clone->ds_deadlist);
	dsl_deadlist_close(&origin_head->ds_deadlist);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
	    dsl_dataset_phys(clone)->ds_deadlist_obj);
	VERIFY0(dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
	    dsl_dataset_phys(clone)->ds_deadlist_obj));
	VERIFY0(dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
	    dsl_dataset_phys(origin_head)->ds_deadlist_obj));
	dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);

	/*
	 * If there is a bookmark at the origin, its "next dataset" is
	 * changing, so we need to reset its FBN.
	 */
	dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx);

	dsl_scan_ds_clone_swapped(origin_head, clone, tx);

	/*
	 * Destroy any livelists associated with the clone or the origin,
	 * since after the swap the corresponding livelists are no longer
	 * valid.
	 */
	dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
	dsl_dir_remove_livelist(origin_head->ds_dir, tx,
B_TRUE);43654366spa_history_log_internal_ds(clone, "clone swap", tx,4367"parent=%s", origin_head->ds_dir->dd_myname);4368}43694370/*4371* Given a pool name and a dataset object number in that pool,4372* return the name of that dataset.4373*/4374int4375dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)4376{4377dsl_pool_t *dp;4378dsl_dataset_t *ds;4379int error;43804381error = dsl_pool_hold(pname, FTAG, &dp);4382if (error != 0)4383return (error);43844385error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);4386if (error == 0) {4387dsl_dataset_name(ds, buf);4388dsl_dataset_rele(ds, FTAG);4389}4390dsl_pool_rele(dp, FTAG);43914392return (error);4393}43944395int4396dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,4397uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)4398{4399int error = 0;44004401ASSERT3S(asize, >, 0);44024403/*4404* *ref_rsrv is the portion of asize that will come from any4405* unconsumed refreservation space.4406*/4407*ref_rsrv = 0;44084409mutex_enter(&ds->ds_lock);4410/*4411* Make a space adjustment for reserved bytes.4412*/4413if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {4414ASSERT3U(*used, >=,4415ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);4416*used -=4417(ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);4418*ref_rsrv =4419asize - MIN(asize, parent_delta(ds, asize + inflight));4420}44214422if (!check_quota || ds->ds_quota == 0) {4423mutex_exit(&ds->ds_lock);4424return (0);4425}4426/*4427* If they are requesting more space, and our current estimate4428* is over quota, they get to try again unless the actual4429* on-disk is over quota and there are no pending changes (which4430* may free up space for us).4431*/4432if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=4433ds->ds_quota) {4434if (inflight > 0 ||4435dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)4436error = SET_ERROR(ERESTART);4437else4438error = 
SET_ERROR(EDQUOT);4439}4440mutex_exit(&ds->ds_lock);44414442return (error);4443}44444445typedef struct dsl_dataset_set_qr_arg {4446const char *ddsqra_name;4447zprop_source_t ddsqra_source;4448uint64_t ddsqra_value;4449} dsl_dataset_set_qr_arg_t;445044514452static int4453dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)4454{4455dsl_dataset_set_qr_arg_t *ddsqra = arg;4456dsl_pool_t *dp = dmu_tx_pool(tx);4457dsl_dataset_t *ds;4458int error;4459uint64_t newval;44604461if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)4462return (SET_ERROR(ENOTSUP));44634464error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);4465if (error != 0)4466return (error);44674468if (ds->ds_is_snapshot) {4469dsl_dataset_rele(ds, FTAG);4470return (SET_ERROR(EINVAL));4471}44724473error = dsl_prop_predict(ds->ds_dir,4474zfs_prop_to_name(ZFS_PROP_REFQUOTA),4475ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);4476if (error != 0) {4477dsl_dataset_rele(ds, FTAG);4478return (error);4479}44804481if (newval == 0) {4482dsl_dataset_rele(ds, FTAG);4483return (0);4484}44854486if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||4487newval < ds->ds_reserved) {4488dsl_dataset_rele(ds, FTAG);4489return (SET_ERROR(ENOSPC));4490}44914492dsl_dataset_rele(ds, FTAG);4493return (0);4494}44954496static void4497dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)4498{4499dsl_dataset_set_qr_arg_t *ddsqra = arg;4500dsl_pool_t *dp = dmu_tx_pool(tx);4501dsl_dataset_t *ds = NULL;4502uint64_t newval;45034504VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));45054506dsl_prop_set_sync_impl(ds,4507zfs_prop_to_name(ZFS_PROP_REFQUOTA),4508ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,4509&ddsqra->ddsqra_value, tx);45104511VERIFY0(dsl_prop_get_int_ds(ds,4512zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));45134514if (ds->ds_quota != newval) {4515dmu_buf_will_dirty(ds->ds_dbuf, tx);4516ds->ds_quota = newval;4517}4518dsl_dataset_rele(ds, 
FTAG);4519}45204521int4522dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,4523uint64_t refquota)4524{4525dsl_dataset_set_qr_arg_t ddsqra;45264527ddsqra.ddsqra_name = dsname;4528ddsqra.ddsqra_source = source;4529ddsqra.ddsqra_value = refquota;45304531return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,4532dsl_dataset_set_refquota_sync, &ddsqra, 0,4533ZFS_SPACE_CHECK_EXTRA_RESERVED));4534}45354536static int4537dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)4538{4539dsl_dataset_set_qr_arg_t *ddsqra = arg;4540dsl_pool_t *dp = dmu_tx_pool(tx);4541dsl_dataset_t *ds;4542int error;4543uint64_t newval, unique;45444545if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)4546return (SET_ERROR(ENOTSUP));45474548error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);4549if (error != 0)4550return (error);45514552if (ds->ds_is_snapshot) {4553dsl_dataset_rele(ds, FTAG);4554return (SET_ERROR(EINVAL));4555}45564557error = dsl_prop_predict(ds->ds_dir,4558zfs_prop_to_name(ZFS_PROP_REFRESERVATION),4559ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);4560if (error != 0) {4561dsl_dataset_rele(ds, FTAG);4562return (error);4563}45644565/*4566* If we are doing the preliminary check in open context, the4567* space estimates may be inaccurate.4568*/4569if (!dmu_tx_is_syncing(tx)) {4570dsl_dataset_rele(ds, FTAG);4571return (0);4572}45734574mutex_enter(&ds->ds_lock);4575if (!DS_UNIQUE_IS_ACCURATE(ds))4576dsl_dataset_recalc_head_uniq(ds);4577unique = dsl_dataset_phys(ds)->ds_unique_bytes;4578mutex_exit(&ds->ds_lock);45794580if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {4581uint64_t delta = MAX(unique, newval) -4582MAX(unique, ds->ds_reserved);45834584if (delta >4585dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||4586(ds->ds_quota > 0 && newval > ds->ds_quota)) {4587dsl_dataset_rele(ds, FTAG);4588return (SET_ERROR(ENOSPC));4589}4590}45914592dsl_dataset_rele(ds, FTAG);4593return 
(0);4594}45954596void4597dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,4598zprop_source_t source, uint64_t value, dmu_tx_t *tx)4599{4600uint64_t newval;4601uint64_t unique;4602int64_t delta;46034604dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),4605source, sizeof (value), 1, &value, tx);46064607VERIFY0(dsl_prop_get_int_ds(ds,4608zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));46094610dmu_buf_will_dirty(ds->ds_dbuf, tx);4611mutex_enter(&ds->ds_dir->dd_lock);4612mutex_enter(&ds->ds_lock);4613ASSERT(DS_UNIQUE_IS_ACCURATE(ds));4614unique = dsl_dataset_phys(ds)->ds_unique_bytes;4615delta = MAX(0, (int64_t)(newval - unique)) -4616MAX(0, (int64_t)(ds->ds_reserved - unique));4617ds->ds_reserved = newval;4618mutex_exit(&ds->ds_lock);46194620dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);4621mutex_exit(&ds->ds_dir->dd_lock);4622}46234624static void4625dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)4626{4627dsl_dataset_set_qr_arg_t *ddsqra = arg;4628dsl_pool_t *dp = dmu_tx_pool(tx);4629dsl_dataset_t *ds = NULL;46304631VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));4632dsl_dataset_set_refreservation_sync_impl(ds,4633ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);4634dsl_dataset_rele(ds, FTAG);4635}46364637int4638dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,4639uint64_t refreservation)4640{4641dsl_dataset_set_qr_arg_t ddsqra;46424643ddsqra.ddsqra_name = dsname;4644ddsqra.ddsqra_source = source;4645ddsqra.ddsqra_value = refreservation;46464647return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,4648dsl_dataset_set_refreservation_sync, &ddsqra, 0,4649ZFS_SPACE_CHECK_EXTRA_RESERVED));4650}46514652typedef struct dsl_dataset_set_compression_arg {4653const char *ddsca_name;4654zprop_source_t ddsca_source;4655uint64_t ddsca_value;4656} dsl_dataset_set_compression_arg_t;46574658static int4659dsl_dataset_set_compression_check(void *arg, dmu_tx_t 
*tx)4660{4661dsl_dataset_set_compression_arg_t *ddsca = arg;4662dsl_pool_t *dp = dmu_tx_pool(tx);46634664uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);4665spa_feature_t f = zio_compress_to_feature(compval);46664667if (f == SPA_FEATURE_NONE)4668return (SET_ERROR(EINVAL));46694670if (!spa_feature_is_enabled(dp->dp_spa, f))4671return (SET_ERROR(ENOTSUP));46724673return (0);4674}46754676static void4677dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx)4678{4679dsl_dataset_set_compression_arg_t *ddsca = arg;4680dsl_pool_t *dp = dmu_tx_pool(tx);4681dsl_dataset_t *ds = NULL;46824683uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);4684spa_feature_t f = zio_compress_to_feature(compval);4685ASSERT3S(f, !=, SPA_FEATURE_NONE);4686ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN);46874688VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds));4689if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) {4690ds->ds_feature_activation[f] = (void *)B_TRUE;4691dsl_dataset_activate_feature(ds->ds_object, f,4692ds->ds_feature_activation[f], tx);4693ds->ds_feature[f] = ds->ds_feature_activation[f];4694}4695dsl_dataset_rele(ds, FTAG);4696}46974698int4699dsl_dataset_set_compression(const char *dsname, zprop_source_t source,4700uint64_t compression)4701{4702dsl_dataset_set_compression_arg_t ddsca;47034704/*4705* The sync task is only required for zstd in order to activate4706* the feature flag when the property is first set.4707*/4708if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD)4709return (0);47104711ddsca.ddsca_name = dsname;4712ddsca.ddsca_source = source;4713ddsca.ddsca_value = compression;47144715return (dsl_sync_task(dsname, dsl_dataset_set_compression_check,4716dsl_dataset_set_compression_sync, &ddsca, 0,4717ZFS_SPACE_CHECK_EXTRA_RESERVED));4718}47194720/*4721* Return (in *usedp) the amount of space referenced by "new" that was not4722* referenced at the time the bookmark corresponds to. "New" may be a4723* snapshot or a head. 
The bookmark must be before new, in4724* new's filesystem (or its origin) -- caller verifies this.4725*4726* The written space is calculated by considering two components: First, we4727* ignore any freed space, and calculate the written as new's used space4728* minus old's used space. Next, we add in the amount of space that was freed4729* between the two time points, thus reducing new's used space relative to4730* old's. Specifically, this is the space that was born before4731* zbm_creation_txg, and freed before new (ie. on new's deadlist or a4732* previous deadlist).4733*4734* space freed [---------------------]4735* snapshots ---O-------O--------O-------O------4736* bookmark new4737*4738* Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN4739* flag is not set, we will calculate the freed_before_next based on the4740* next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap.4741*/4742static int4743dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp,4744dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)4745{4746int err = 0;4747dsl_pool_t *dp = new->ds_dir->dd_pool;47484749ASSERT(dsl_pool_config_held(dp));4750if (dsl_dataset_is_snapshot(new)) {4751ASSERT3U(bmp->zbm_creation_txg, <,4752dsl_dataset_phys(new)->ds_creation_txg);4753}47544755*usedp = 0;4756*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;4757*usedp -= bmp->zbm_referenced_bytes_refd;47584759*compp = 0;4760*compp += dsl_dataset_phys(new)->ds_compressed_bytes;4761*compp -= bmp->zbm_compressed_bytes_refd;47624763*uncompp = 0;4764*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;4765*uncompp -= bmp->zbm_uncompressed_bytes_refd;47664767dsl_dataset_t *snap = new;47684769while (dsl_dataset_phys(snap)->ds_prev_snap_txg >4770bmp->zbm_creation_txg) {4771uint64_t used, comp, uncomp;47724773dsl_deadlist_space_range(&snap->ds_deadlist,47740, bmp->zbm_creation_txg,4775&used, &comp, &uncomp);4776*usedp += used;4777*compp += comp;4778*uncompp += 
uncomp;47794780uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;4781if (snap != new)4782dsl_dataset_rele(snap, FTAG);4783err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);4784if (err != 0)4785break;4786}47874788/*4789* We might not have the FBN if we are calculating written from4790* a snapshot (because we didn't know the correct "next" snapshot4791* until now).4792*/4793if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) {4794*usedp += bmp->zbm_referenced_freed_before_next_snap;4795*compp += bmp->zbm_compressed_freed_before_next_snap;4796*uncompp += bmp->zbm_uncompressed_freed_before_next_snap;4797} else {4798ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==,4799bmp->zbm_creation_txg);4800uint64_t used, comp, uncomp;4801dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp);4802*usedp += used;4803*compp += comp;4804*uncompp += uncomp;4805}4806if (snap != new)4807dsl_dataset_rele(snap, FTAG);4808return (err);4809}48104811/*4812* Return (in *usedp) the amount of space written in new that was not4813* present at the time the bookmark corresponds to. New may be a4814* snapshot or the head. Old must be a bookmark before new, in4815* new's filesystem (or its origin) -- caller verifies this.4816*/4817int4818dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp,4819dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)4820{4821if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN))4822return (SET_ERROR(ENOTSUP));4823return (dsl_dataset_space_written_impl(bmp, new,4824usedp, compp, uncompp));4825}48264827/*4828* Return (in *usedp) the amount of space written in new that is not4829* present in oldsnap. New may be a snapshot or the head. Old must be4830* a snapshot before new, in new's filesystem (or its origin). 
If not then4831* fail and return EINVAL.4832*/4833int4834dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,4835uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)4836{4837if (!dsl_dataset_is_before(new, oldsnap, 0))4838return (SET_ERROR(EINVAL));48394840zfs_bookmark_phys_t zbm = { 0 };4841dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap);4842zbm.zbm_guid = dsp->ds_guid;4843zbm.zbm_creation_txg = dsp->ds_creation_txg;4844zbm.zbm_creation_time = dsp->ds_creation_time;4845zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;4846zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;4847zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;48484849/*4850* If oldsnap is the origin (or origin's origin, ...) of new,4851* we can't easily calculate the effective FBN. Therefore,4852* we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate4853* it relative to the correct "next": the next snapshot towards "new",4854* rather than the next snapshot in oldsnap's dsl_dir.4855*/4856return (dsl_dataset_space_written_impl(&zbm, new,4857usedp, compp, uncompp));4858}48594860/*4861* Return (in *usedp) the amount of space that will be reclaimed if firstsnap,4862* lastsnap, and all snapshots in between are deleted.4863*4864* blocks that would be freed [---------------------------]4865* snapshots ---O-------O--------O-------O--------O4866* firstsnap lastsnap4867*4868* This is the set of blocks that were born after the snap before firstsnap,4869* (birth > firstsnap->prev_snap_txg) and died before the snap after the4870* last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).4871* We calculate this by iterating over the relevant deadlists (from the snap4872* after lastsnap, backward to the snap after firstsnap), summing up the4873* space on the deadlist that was born after the snap before firstsnap.4874*/4875int4876dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,4877dsl_dataset_t *lastsnap,4878uint64_t *usedp, uint64_t 
*compp, uint64_t *uncompp)4879{4880int err = 0;4881uint64_t snapobj;4882dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;48834884ASSERT(firstsnap->ds_is_snapshot);4885ASSERT(lastsnap->ds_is_snapshot);48864887/*4888* Check that the snapshots are in the same dsl_dir, and firstsnap4889* is before lastsnap.4890*/4891if (firstsnap->ds_dir != lastsnap->ds_dir ||4892dsl_dataset_phys(firstsnap)->ds_creation_txg >4893dsl_dataset_phys(lastsnap)->ds_creation_txg)4894return (SET_ERROR(EINVAL));48954896*usedp = *compp = *uncompp = 0;48974898snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;4899while (snapobj != firstsnap->ds_object) {4900dsl_dataset_t *ds;4901uint64_t used, comp, uncomp;49024903err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);4904if (err != 0)4905break;49064907dsl_deadlist_space_range(&ds->ds_deadlist,4908dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,4909&used, &comp, &uncomp);4910*usedp += used;4911*compp += comp;4912*uncompp += uncomp;49134914snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;4915ASSERT3U(snapobj, !=, 0);4916dsl_dataset_rele(ds, FTAG);4917}4918return (err);4919}49204921/*4922* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.4923* For example, they could both be snapshots of the same filesystem, and4924* 'earlier' is before 'later'. Or 'earlier' could be the origin of4925* 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's4926* filesystem. 
Or 'earlier' could be the origin's origin.4927*4928* If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.4929*/4930boolean_t4931dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,4932uint64_t earlier_txg)4933{4934dsl_pool_t *dp = later->ds_dir->dd_pool;4935int error;4936boolean_t ret;49374938ASSERT(dsl_pool_config_held(dp));4939ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);49404941if (earlier_txg == 0)4942earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;49434944if (later->ds_is_snapshot &&4945earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)4946return (B_FALSE);49474948if (later->ds_dir == earlier->ds_dir)4949return (B_TRUE);49504951/*4952* We check dd_origin_obj explicitly here rather than using4953* dsl_dir_is_clone() so that we will return TRUE if "earlier"4954* is $ORIGIN@$ORIGIN. dsl_dataset_space_written() depends on4955* this behavior.4956*/4957if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0)4958return (B_FALSE);49594960dsl_dataset_t *origin;4961error = dsl_dataset_hold_obj(dp,4962dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);4963if (error != 0)4964return (B_FALSE);4965if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg &&4966origin->ds_dir == earlier->ds_dir) {4967dsl_dataset_rele(origin, FTAG);4968return (B_TRUE);4969}4970ret = dsl_dataset_is_before(origin, earlier, earlier_txg);4971dsl_dataset_rele(origin, FTAG);4972return (ret);4973}49744975void4976dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)4977{4978objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;4979dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);4980}49814982boolean_t4983dsl_dataset_is_zapified(dsl_dataset_t *ds)4984{4985dmu_object_info_t doi;49864987dmu_object_info_from_db(ds->ds_dbuf, &doi);4988return (doi.doi_type == DMU_OTN_ZAP_METADATA);4989}49904991boolean_t4992dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)4993{4994return (dsl_dataset_is_zapified(ds) 
&&4995zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,4996ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);4997}49984999uint64_t5000dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)5001{5002uint64_t remap_deadlist_obj;5003int err;50045005if (!dsl_dataset_is_zapified(ds))5006return (0);50075008err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,5009DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,5010&remap_deadlist_obj);50115012if (err != 0) {5013VERIFY3S(err, ==, ENOENT);5014return (0);5015}50165017ASSERT(remap_deadlist_obj != 0);5018return (remap_deadlist_obj);5019}50205021boolean_t5022dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)5023{5024EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),5025dsl_dataset_get_remap_deadlist_object(ds) != 0);5026return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));5027}50285029static void5030dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,5031dmu_tx_t *tx)5032{5033ASSERT(obj != 0);5034dsl_dataset_zapify(ds, tx);5035VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,5036DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));5037}50385039static void5040dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)5041{5042VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,5043ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));5044}50455046void5047dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)5048{5049uint64_t remap_deadlist_object;5050spa_t *spa = ds->ds_dir->dd_pool->dp_spa;50515052ASSERT(dmu_tx_is_syncing(tx));5053ASSERT(dsl_dataset_remap_deadlist_exists(ds));50545055remap_deadlist_object = ds->ds_remap_deadlist.dl_object;5056dsl_deadlist_close(&ds->ds_remap_deadlist);5057dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);5058dsl_dataset_unset_remap_deadlist_object(ds, tx);5059spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);5060}50615062void5063dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t 
*tx)5064{5065uint64_t remap_deadlist_obj;5066spa_t *spa = ds->ds_dir->dd_pool->dp_spa;50675068ASSERT(dmu_tx_is_syncing(tx));5069ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));5070/*5071* Currently we only create remap deadlists when there are indirect5072* vdevs with referenced mappings.5073*/5074ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));50755076remap_deadlist_obj = dsl_deadlist_clone(5077&ds->ds_deadlist, UINT64_MAX,5078dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);5079dsl_dataset_set_remap_deadlist_object(ds,5080remap_deadlist_obj, tx);5081VERIFY0(dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),5082remap_deadlist_obj));5083spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);5084}50855086void5087dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,5088uint64_t num_redact_snaps, dmu_tx_t *tx)5089{5090uint64_t dsobj = ds->ds_object;5091struct feature_type_uint64_array_arg *ftuaa =5092kmem_zalloc(sizeof (*ftuaa), KM_SLEEP);5093ftuaa->length = (int64_t)num_redact_snaps;5094if (num_redact_snaps > 0) {5095ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t),5096KM_SLEEP);5097memcpy(ftuaa->array, redact_snaps, num_redact_snaps *5098sizeof (uint64_t));5099}5100dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS,5101ftuaa, tx);5102ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa;5103}51045105/*5106* Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj5107* dataset whose birth time is >= min_txg.5108*/5109int5110dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg,5111uint64_t *oldest_dsobj)5112{5113dsl_dataset_t *ds;5114dsl_pool_t *dp = spa->spa_dsl_pool;51155116int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);5117if (error != 0)5118return (error);51195120uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;5121uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;51225123while (prev_obj != 0 && min_txg < prev_obj_txg) 
{5124dsl_dataset_rele(ds, FTAG);5125if ((error = dsl_dataset_hold_obj(dp, prev_obj,5126FTAG, &ds)) != 0)5127return (error);5128prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;5129prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;5130}5131*oldest_dsobj = ds->ds_object;5132dsl_dataset_rele(ds, FTAG);5133return (0);5134}51355136ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, UINT, ZMOD_RW,5137"Max allowed record size");51385139ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW,5140"Allow mounting of redacted datasets");51415142ZFS_MODULE_PARAM(zfs, zfs_, snapshot_history_enabled, INT, ZMOD_RW,5143"Include snapshot events in pool history/events");51445145EXPORT_SYMBOL(dsl_dataset_hold);5146EXPORT_SYMBOL(dsl_dataset_hold_flags);5147EXPORT_SYMBOL(dsl_dataset_hold_obj);5148EXPORT_SYMBOL(dsl_dataset_hold_obj_flags);5149EXPORT_SYMBOL(dsl_dataset_own);5150EXPORT_SYMBOL(dsl_dataset_own_obj);5151EXPORT_SYMBOL(dsl_dataset_name);5152EXPORT_SYMBOL(dsl_dataset_rele);5153EXPORT_SYMBOL(dsl_dataset_rele_flags);5154EXPORT_SYMBOL(dsl_dataset_disown);5155EXPORT_SYMBOL(dsl_dataset_tryown);5156EXPORT_SYMBOL(dsl_dataset_create_sync);5157EXPORT_SYMBOL(dsl_dataset_create_sync_dd);5158EXPORT_SYMBOL(dsl_dataset_snapshot_check);5159EXPORT_SYMBOL(dsl_dataset_snapshot_sync);5160EXPORT_SYMBOL(dsl_dataset_promote);5161EXPORT_SYMBOL(dsl_dataset_user_hold);5162EXPORT_SYMBOL(dsl_dataset_user_release);5163EXPORT_SYMBOL(dsl_dataset_get_holds);5164EXPORT_SYMBOL(dsl_dataset_get_blkptr);5165EXPORT_SYMBOL(dsl_dataset_get_spa);5166EXPORT_SYMBOL(dsl_dataset_modified_since_snap);5167EXPORT_SYMBOL(dsl_dataset_space_written);5168EXPORT_SYMBOL(dsl_dataset_space_wouldfree);5169EXPORT_SYMBOL(dsl_dataset_sync);5170EXPORT_SYMBOL(dsl_dataset_block_born);5171EXPORT_SYMBOL(dsl_dataset_block_kill);5172EXPORT_SYMBOL(dsl_dataset_dirty);5173EXPORT_SYMBOL(dsl_dataset_stats);5174EXPORT_SYMBOL(dsl_dataset_fast_stat);5175EXPORT_SYMBOL(dsl_dataset_space);5176EXPORT_SYMBOL(dsl_dataset_fsid_guid);5177EXPORT_SY
MBOL(dsl_dsobj_to_dsname);5178EXPORT_SYMBOL(dsl_dataset_check_quota);5179EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl);5180EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl);518151825183