Path: blob/main/sys/contrib/openzfs/module/zfs/dsl_pool.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>
#include <sys/trace_zfs.h>
#include <sys/mmp.h>

/*
 * ZFS Write Throttle
 * ------------------
 *
 * ZFS must limit the rate of incoming writes to the rate at which it is able
 * to sync data modifications to the backend storage. Throttling by too much
 * creates an artificial limit; throttling by too little can only be sustained
 * for short periods and would lead to highly lumpy performance. On a per-pool
 * basis, ZFS tracks the amount of modified (dirty) data. As operations change
 * data, the amount of dirty data increases; as ZFS syncs out data, the amount
 * of dirty data decreases. When the amount of dirty data exceeds a
 * predetermined threshold further modifications are blocked until the amount
 * of dirty data decreases (as data is synced out).
 *
 * The limit on dirty data is tunable, and should be adjusted according to
 * both the IO capacity and available memory of the system. The larger the
 * window, the more ZFS is able to aggregate and amortize metadata (and data)
 * changes. However, memory is a limited resource, and allowing for more dirty
 * data comes at the cost of keeping other useful data in memory (for example
 * ZFS data cached by the ARC).
 *
 * Implementation
 *
 * As buffers are modified dsl_pool_willuse_space() increments both the per-
 * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
 * dirty space used; dsl_pool_dirty_space() decrements those values as data
 * is synced out from dsl_pool_sync(). While only the poolwide value is
 * relevant, the per-txg value is useful for debugging. The tunable
 * zfs_dirty_data_max determines the dirty space limit. Once that value is
 * exceeded, new writes are halted until space frees up.
 *
 * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we
 * ensure that there is a txg syncing (see the comment in txg.c for a full
 * description of transaction group stages).
 *
 * The IO scheduler uses both the dirty space limit and current amount of
 * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
 * issues. See the comment in vdev_queue.c for details of the IO scheduler.
 *
 * The delay is also calculated based on the amount of dirty data. See the
 * comment above dmu_tx_delay() for details.
 */

/*
 * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
 * capped at zfs_dirty_data_max_max. It can also be overridden with a module
 * parameter.
 */
uint64_t zfs_dirty_data_max = 0;
uint64_t zfs_dirty_data_max_max = 0;
uint_t zfs_dirty_data_max_percent = 10;
uint_t zfs_dirty_data_max_max_percent = 25;

/*
 * The upper limit of TX_WRITE log data. Write operations are throttled
 * when approaching the limit until log data is cleared out after txg sync.
 * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
 */
uint64_t zfs_wrlog_data_max = 0;

/*
 * If there's at least this much dirty data (as a percentage of
 * zfs_dirty_data_max), push out a txg. This should be less than
 * zfs_vdev_async_write_active_min_dirty_percent.
 */
static uint_t zfs_dirty_data_sync_percent = 20;

/*
 * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
 * and delay each transaction.
 * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 */
uint_t zfs_delay_min_dirty_percent = 60;

/*
 * This controls how quickly the delay approaches infinity.
 * Larger values cause it to delay more for a given amount of dirty data.
 * Therefore larger values will cause there to be less dirty data for a
 * given throughput.
 *
 * For the smoothest delay, this value should be about 1 billion divided
 * by the maximum number of operations per second. This will smoothly
 * handle between 10x and 1/10th this number.
 *
 * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 * multiply in dmu_tx_delay().
 */
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
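
/*
 * Editor's note (not part of the upstream source): a minimal sketch of how
 * the percentage tunables above translate into byte thresholds, and an
 * approximation of the delay curve. The authoritative delay calculation
 * lives in dmu_tx_delay() in dmu_tx.c; the helper name and the formula here
 * are illustrative only.
 */
#if 0
static void
dirty_threshold_example(uint64_t dirty)
{
	/* Dirty bytes at which a txg sync is kicked off. */
	uint64_t sync_bytes =
	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
	/* Dirty bytes at which dmu_tx_delay() starts delaying writers. */
	uint64_t delay_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	if (dirty > delay_bytes && dirty < zfs_dirty_data_max) {
		/*
		 * Approximate shape of the delay: it grows without bound as
		 * "dirty" approaches zfs_dirty_data_max (see the comment
		 * above dmu_tx_delay() for the exact version).
		 */
		uint64_t delay_ns = zfs_delay_scale *
		    (dirty - delay_bytes) / (zfs_dirty_data_max - dirty);
		(void) delay_ns;
	}
	(void) sync_bytes;
}
#endif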

/*
 * These tunables determine the behavior of how zil_itxg_clean() is
 * called via zil_clean() in the context of spa_sync(). When an itxg
 * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
 * If the dispatch fails, the call to zil_itxg_clean() will occur
 * synchronously in the context of spa_sync(), which can negatively
 * impact the performance of spa_sync() (e.g. in the case of the itxg
 * list having a large number of itxs that needs to be cleaned).
 *
 * Thus, these tunables can be used to manipulate the behavior of the
 * taskq used by zil_clean(); they determine the number of taskq entries
 * that are pre-populated when the taskq is first created (via the
 * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
 * taskq entries that are cached after an on-demand allocation (via the
 * "zfs_zil_clean_taskq_maxalloc").
 *
 * The idea being, we want to try reasonably hard to ensure there will
 * already be a taskq entry pre-allocated by the time that it is needed
 * by zil_clean(). This way, we can avoid the possibility of an
 * on-demand allocation of a new taskq entry from failing, which would
 * result in zil_itxg_clean() being called synchronously from zil_clean()
 * (which can adversely affect performance of spa_sync()).
 *
 * Additionally, the number of threads used by the taskq can be
 * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
 */
static int zfs_zil_clean_taskq_nthr_pct = 100;
static int zfs_zil_clean_taskq_minalloc = 1024;
static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
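
/*
 * Editor's note (not part of the upstream source): the dispatch-with-
 * fallback behavior described in the comment above is roughly the
 * following. The actual code lives in zil_clean() in zil.c, so treat this
 * as an approximate sketch of that pattern, not a verbatim copy.
 */
#if 0
static void
zil_clean_dispatch_sketch(dsl_pool_t *dp, void *clean_me)
{
	/*
	 * Try to hand the cleanup off to dp_zil_clean_taskq without
	 * sleeping; if no pre-allocated taskq entry is available, fall
	 * back to cleaning synchronously in the spa_sync() context.
	 */
	if (taskq_dispatch(dp->dp_zil_clean_taskq,
	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) ==
	    TASKQID_INVALID)
		zil_itxg_clean(clean_me);
}
#endif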

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}

static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rrw_init(&dp->dp_config_rwlock, B_TRUE);
	txg_init(dp, txg);
	mmp_init(spa);

	txg_list_create(&dp->dp_dirty_datasets, spa,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs, spa,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs, spa,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks, spa,
	    offsetof(dsl_sync_task_t, dst_node));
	txg_list_create(&dp->dp_early_sync_tasks, spa,
	    offsetof(dsl_sync_task_t, dst_node));

	dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq");

	dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
	    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
	    zfs_zil_clean_taskq_minalloc,
	    zfs_zil_clean_taskq_maxalloc,
	    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

	aggsum_init(&dp->dp_wrlog_total, 0);
	for (int i = 0; i < TXG_SIZE; i++) {
		aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
	}

	dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
	    TASKQ_THREADS_CPU_PCT);
	dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
	    100, defclsyspri, boot_ncpus, INT_MAX,
	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

	return (dp);
}

int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	/*
	 * Initialize the caller's dsl_pool_t structure before we actually open
	 * the meta objset. This is done because a self-healing write zio may
	 * be issued as part of dmu_objset_open_impl() and the spa needs its
	 * dsl_pool_t initialized in order to handle the write.
	 */
	*dpp = dp;

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0) {
		dsl_pool_close(dp);
		*dpp = NULL;
	}

	return (err);
}

int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err == 0) {
			VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
			    dp->dp_meta_objset, obj));
		} else if (err == ENOENT) {
			/*
			 * We might not have created the remap bpobj yet.
			 */
		} else {
			goto out;
		}
	}

	/*
	 * Note: errors ignored, because these special dirs, used for
	 * space accounting, are only created on demand.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}

void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap != NULL)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir != NULL)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir != NULL)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir != NULL)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir != NULL)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);
	bpobj_close(&dp->dp_obsolete_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset != NULL)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_early_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	taskq_destroy(dp->dp_zil_clean_taskq);
	spa_sync_tq_destroy(dp->dp_spa);

	if (dp->dp_spa->spa_state == POOL_STATE_EXPORTED ||
	    dp->dp_spa->spa_state == POOL_STATE_DESTROYED) {
		/*
		 * On export/destroy perform the ARC flush asynchronously.
		 */
		arc_flush_async(dp->dp_spa);
	} else {
		/*
		 * We can't set retry to TRUE since we're explicitly specifying
		 * a spa to flush. This is good enough; any missed buffers for
		 * this spa won't cause trouble, and they'll eventually fall
		 * out of the ARC just like any other unused buffer.
		 */
		arc_flush(dp->dp_spa, FALSE);
	}

	mmp_fini(dp->dp_spa);
	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	cv_destroy(&dp->dp_spaceavail_cv);

	ASSERT0(aggsum_value(&dp->dp_wrlog_total));
	aggsum_fini(&dp->dp_wrlog_total);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
		aggsum_fini(&dp->dp_wrlog_pertxg[i]);
	}

	taskq_destroy(dp->dp_unlinked_drain_taskq);
	taskq_destroy(dp->dp_zrele_taskq);
	if (dp->dp_blkstats != NULL)
		vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

void
dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;
	/*
	 * Currently, we only create the obsolete_bpobj where there are
	 * indirect vdevs with referenced mappings.
	 */
	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
	/* create and open the obsolete_bpobj */
	obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
}

void
dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	VERIFY0(zap_remove(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ, tx));
	bpobj_free(dp->dp_meta_objset,
	    dp->dp_obsolete_bpobj.bpo_object, tx);
	bpobj_close(&dp->dp_obsolete_bpobj);
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)),
    dsl_crypto_params_t *dcp, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
#ifdef _KERNEL
	objset_t *os;
#else
	objset_t *os __attribute__((unused));
#endif
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
	spa->spa_meta_objset = dp->dp_meta_objset;

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/*
	 * Some features may be needed when creating the root dataset, so we
	 * create the feature objects here.
	 */
	if (spa_version(spa) >= SPA_VERSION_FEATURES)
		spa_feature_create_zap_objects(spa, tx);

	if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
	    dcp->cp_crypt != ZIO_CRYPT_INHERIT)
		spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj_flags(dp, obj,
	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
	VERIFY0(zio_wait(zio));
	dmu_objset_sync_done(dp->dp_meta_objset, tx);
	taskq_wait(dp->dp_sync_taskq);
	multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes);

	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}

static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	if (delta < 0)
		ASSERT3U(-delta, <=, dp->dp_dirty_total);

	dp->dp_dirty_total += delta;

	/*
	 * Note: we signal even when increasing dp_dirty_total.
	 * This ensures forward progress -- each thread wakes the next waiter.
	 */
	if (dp->dp_dirty_total < zfs_dirty_data_max)
		cv_signal(&dp->dp_spaceavail_cv);
}
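
/*
 * Editor's note (not part of the upstream source): the consumer of
 * dp_spaceavail_cv is the transaction-assignment path in dmu_tx.c. A
 * simplified sketch of that waiting side is shown here only to make the
 * signalling in dsl_pool_dirty_delta() easier to follow; the helper name
 * is hypothetical and the real loop in dmu_tx.c differs in detail.
 */
#if 0
static void
dirty_space_wait_sketch(dsl_pool_t *dp)
{
	mutex_enter(&dp->dp_lock);
	/*
	 * Block while the pool-wide dirty total is over the limit; each
	 * cv_signal() from dsl_pool_dirty_delta() wakes one waiter, which
	 * re-checks the condition and, once under the limit, wakes the next.
	 */
	while (dp->dp_dirty_total >= zfs_dirty_data_max)
		cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
	mutex_exit(&dp->dp_lock);
}
#endif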

void
dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
{
	ASSERT3S(size, >=, 0);

	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size);
	aggsum_add(&dp->dp_wrlog_total, size);

	/* Choose a value slightly bigger than min dirty sync bytes */
	uint64_t sync_min =
	    zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200;
	if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
		txg_kick(dp, txg);
}

boolean_t
dsl_pool_need_wrlog_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;

	return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0);
}

static void
dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
{
	int64_t delta;
	delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
	aggsum_add(&dp->dp_wrlog_total, delta);
	/* Compact per-CPU sums after the big change. */
	(void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
	(void) aggsum_value(&dp->dp_wrlog_total);
}

#ifdef ZFS_DEBUG
static boolean_t
dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
{
	spa_t *spa = dp->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;

	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];
		txg_list_t *tl = &vd->vdev_ms_list;
		metaslab_t *ms;

		for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
		    ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
			VERIFY(zfs_range_tree_is_empty(ms->ms_freeing));
			VERIFY(zfs_range_tree_is_empty(ms->ms_checkpointing));
		}
	}

	return (B_TRUE);
}
#else
#define	dsl_early_sync_task_verify(dp, txg) \
	((void) sizeof (dp), (void) sizeof (txg), B_TRUE)
#endif

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *rio;	/* root zio for all dirty dataset syncs */
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Run all early sync tasks before writing out any dirty blocks.
	 * For more info on early sync tasks see block comment in
	 * dsl_early_sync_task().
	 */
	if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
		dsl_sync_task_t *dst;

		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst =
		    txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
			ASSERT(dsl_early_sync_task_verify(dp, txg));
			dsl_sync_task_sync(dst, tx);
		}
		ASSERT(dsl_early_sync_task_verify(dp, txg));
	}

	/*
	 * Write out all dirty blocks of dirty datasets. Note, this could
	 * create a very large (+10k) zio tree.
	 */
	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them. However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, rio, tx);
	}
	VERIFY0(zio_wait(rio));

	/*
	 * Update the long range free counter after
	 * we're done syncing user data
	 */
	mutex_enter(&dp->dp_lock);
	ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
	    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
	dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
	mutex_exit(&dp->dp_lock);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group/project space accounting. This happens
	 * in tasks dispatched to dp_sync_taskq, so wait for them before
	 * continuing.
	 */
	for (ds = list_head(&synced_datasets); ds != NULL;
	    ds = list_next(&synced_datasets, ds)) {
		dmu_objset_sync_done(ds->ds_objset, tx);
	}
	taskq_wait(dp->dp_sync_taskq);

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates. This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		objset_t *os = ds->ds_objset;

		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, rio, tx);

		/*
		 * Release any key mappings created by calls to
		 * dsl_dataset_dirty() from the userquota accounting
		 * code paths.
		 */
		if (os->os_encrypted && !os->os_raw_receive &&
		    !os->os_next_write_raw[txg & TXG_MASK]) {
			ASSERT3P(ds->ds_key_mapping, !=, NULL);
			key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
		}
	}
	VERIFY0(zio_wait(rio));

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist and livelists
	 *    to the on-disk versions
	 *  - release hold from dsl_dataset_dirty()
	 *  - release key mapping hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
		objset_t *os = ds->ds_objset;

		if (os->os_encrypted && !os->os_raw_receive &&
		    !os->os_next_write_raw[txg & TXG_MASK]) {
			ASSERT3P(ds->ds_key_mapping, !=, NULL);
			key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
		}

		dsl_dataset_sync_done(ds, tx);
		dmu_buf_rele(ds->ds_dbuf, ds);
	}

	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
		dsl_dir_sync(dd, tx);
	}

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir). We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	if (dmu_objset_is_dirty(mos, txg)) {
		dsl_pool_sync_mos(dp, tx);
	}

	/*
	 * We have written all of the accounted dirty data, so our
	 * dp_space_towrite should now be zero. However, some seldom-used
	 * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up
	 * the accounting of any dirtied space now.
	 *
	 * Note that, besides any dirty data from datasets, the amount of
	 * dirty data in the MOS is also accounted by the pool. Therefore,
	 * we want to do this cleanup after dsl_pool_sync_mos() so we don't
	 * attempt to update the accounting for the same dirty data twice.
	 * (i.e. at this point we only update the accounting for the space
	 * that we know that we "leaked").
	 */
	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_t *dst;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
			dsl_sync_task_sync(dst, tx);
	}

	dmu_tx_commit(tx);

	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}

void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;

	while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) {
		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
		/*
		 * We don't remove the zilog from the dp_dirty_zilogs
		 * list until after we've cleaned it. This ensures that
		 * callers of zilog_is_dirty() receive an accurate
		 * answer when they are racing with the spa sync thread.
		 */
		zil_clean(zilog, txg);
		(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}

	dsl_pool_wrlog_clear(dp, txg);

	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa) ||
	    taskq_member(dp->dp_sync_taskq, curthread));
}

/*
 * This function returns the amount of allocatable space in the pool
 * minus whatever space is currently reserved by ZFS for specific
 * purposes. Specifically:
 *
 * 1] Any reserved SLOP space
 * 2] Any space used by the checkpoint
 * 3] Any space used for deferred frees
 *
 * The latter 2 are especially important because they are needed to
 * rectify the SPA's and DMU's different understanding of how much space
 * is used. Now the DMU is aware of that extra space tracked by the SPA
 * without having to maintain a separate special dir (e.g. similar to
 * $MOS, $FREEING, and $LEAKED).
 *
 * Note: By deferred frees here, we mean the frees that were deferred
 * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
 * segments placed in ms_defer trees during metaslab_sync_done().
 */
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
	spa_t *spa = dp->dp_spa;
	uint64_t space, resv, adjustedsize;
	uint64_t spa_deferred_frees =
	    spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;

	space = spa_get_dspace(spa)
	    - spa_get_checkpoint_space(spa) - spa_deferred_frees;
	resv = spa_get_slop_space(spa);

	switch (slop_policy) {
	case ZFS_SPACE_CHECK_NORMAL:
		break;
	case ZFS_SPACE_CHECK_RESERVED:
		resv >>= 1;
		break;
	case ZFS_SPACE_CHECK_EXTRA_RESERVED:
		resv >>= 2;
		break;
	case ZFS_SPACE_CHECK_NONE:
		resv = 0;
		break;
	default:
		panic("invalid slop policy value: %d", slop_policy);
		break;
	}
	adjustedsize = (space >= resv) ? (space - resv) : 0;

	return (adjustedsize);
}

uint64_t
dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
	uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
	uint64_t deferred =
	    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
	uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
	return (quota);
}
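
/*
 * Editor's note (not part of the upstream source): a worked example of the
 * slop policies above, with made-up numbers. Suppose spa_get_dspace() less
 * the checkpoint and deferred frees is 1000 GiB and the full slop
 * reservation is 32 GiB; dsl_pool_adjustedsize() then returns:
 *
 *	ZFS_SPACE_CHECK_NORMAL		-> 1000 - 32   =  968 GiB
 *	ZFS_SPACE_CHECK_RESERVED	-> 1000 - 32/2 =  984 GiB
 *	ZFS_SPACE_CHECK_EXTRA_RESERVED	-> 1000 - 32/4 =  992 GiB
 *	ZFS_SPACE_CHECK_NONE		-> 1000 - 0    = 1000 GiB
 *
 * dsl_pool_unreserved_space() further subtracts the space still deferred
 * in the normal metaslab class to arrive at the quota used by the DSL
 * layer.
 */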

uint64_t
dsl_pool_deferred_space(dsl_pool_t *dp)
{
	return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)));
}

boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	/*
	 * We are not taking the dp_lock here and few other places, since torn
	 * reads are unlikely: on 64-bit systems due to register size and on
	 * 32-bit due to memory constraints. Pool-wide locks in hot path may
	 * be too expensive, while we do not need a precise result here.
	 */
	return (dp->dp_dirty_total > delay_min_bytes);
}

static boolean_t
dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
{
	uint64_t dirty_min_bytes =
	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
	uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];

	return (dirty > dirty_min_bytes);
}

void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
		dsl_pool_dirty_delta(dp, space);
		boolean_t needsync = !dmu_tx_is_syncing(tx) &&
		    dsl_pool_need_dirty_sync(dp, tx->tx_txg);
		mutex_exit(&dp->dp_lock);

		if (needsync)
			txg_kick(dp, tx->tx_txg);
	}
}

void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
	ASSERT3S(space, >=, 0);
	if (space == 0)
		return;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
		/* XXX writing something we didn't dirty? */
		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
	}
	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
	ASSERT3U(dp->dp_dirty_total, >=, space);
	dsl_pool_dirty_delta(dp, -space);
	mutex_exit(&dp->dp_lock);
}

static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT0(BP_GET_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
		dsl_dataset_phys(ds)->ds_prev_snap_txg =
		    dsl_dataset_phys(prev)->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_num_children++;

		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
			ASSERT0P(ds->ds_prev);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds, &ds->ds_prev));
		}
	}

	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dmu_tx_t *tx = arg;
	objset_t *mos = dp->dp_meta_objset;

	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
		dsl_dataset_t *origin;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(origin->ds_dir)->dd_clones =
			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
			    0, tx);
		}

		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(origin->ds_dir)->dd_clones,
		    ds->ds_object, tx));

		dsl_dataset_rele(origin, FTAG);
	}
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT(dmu_tx_is_syncing(tx));

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support. So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT0P(dp->dp_origin_snap);
	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

	/* create the origin dir, ds, & snap-ds */
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, NULL, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
}

taskq_t *
dsl_pool_zrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_zrele_taskq);
}

taskq_t *
dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
{
	return (dp->dp_unlinked_drain_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
 */
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
	zap_attribute_t *za;
	zap_cursor_t zc;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	nvlist_t *holds;

	if (zapobj == 0)
		return;
	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

	holds = fnvlist_alloc();

	za = zap_attribute_alloc();
	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, za) == 0;
	    zap_cursor_advance(&zc)) {
		char *htag;
		nvlist_t *tags;

		htag = strchr(za->za_name, '-');
		*htag = '\0';
		++htag;
		if (nvlist_lookup_nvlist(holds, za->za_name, &tags) != 0) {
			tags = fnvlist_alloc();
			fnvlist_add_boolean(tags, htag);
			fnvlist_add_nvlist(holds, za->za_name, tags);
			fnvlist_free(tags);
		} else {
			fnvlist_add_boolean(tags, htag);
		}
	}
	dsl_dataset_user_release_tmp(dp, holds);
	fnvlist_free(holds);
	zap_cursor_fini(&zc);
	zap_attribute_free(za);
}

/*
 * Create the pool-wide zap object for storing temporary snapshot holds.
 */
static void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;

	ASSERT0(dp->dp_tmp_userrefs_obj);
	ASSERT(dmu_tx_is_syncing(tx));

	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
}

static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	char *name;
	int error;

	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * If the pool was created prior to SPA_VERSION_USERREFS, the
	 * zap object for temporary holds might not exist yet.
	 */
	if (zapobj == 0) {
		if (holding) {
			dsl_pool_user_hold_create_obj(dp, tx);
			zapobj = dp->dp_tmp_userrefs_obj;
		} else {
			return (SET_ERROR(ENOENT));
		}
	}

	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
	if (holding)
		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
	else
		error = zap_remove(mos, zapobj, name, tx);
	kmem_strfree(name);

	return (error);
}

/*
 * Add a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
	    tx, B_FALSE));
}

/*
 * DSL Pool Configuration Lock
 *
 * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 * creation / destruction / rename / property setting). It must be held for
 * read to hold a dataset or dsl_dir. I.e. you must call
 * dsl_pool_config_enter() or dsl_pool_hold() before calling
 * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
 * must be held continuously until all datasets and dsl_dirs are released.
 *
 * The only exception to this rule is that if a "long hold" is placed on
 * a dataset, then the dp_config_rwlock may be dropped while the dataset
 * is still held. The long hold will prevent the dataset from being
 * destroyed -- the destroy will fail with EBUSY. A long hold can be
 * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 *
 * Legitimate long-holders (including owners) should be long-running, cancelable
 * tasks that should cause "zfs destroy" to fail. This includes DMU
 * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 * "zfs send", and "zfs diff". There are several other long-holders whose
 * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 *
 * The usual formula for long-holding would be:
 *	dsl_pool_hold()
 *	dsl_dataset_hold()
 *	... perform checks ...
 *	dsl_dataset_long_hold()
 *	dsl_pool_rele()
 *	... perform long-running task ...
 *	dsl_dataset_long_rele()
 *	dsl_dataset_rele()
 *
 * Note that when the long hold is released, the dataset is still held but
 * the pool is not held. The dataset may change arbitrarily during this time
 * (e.g. it could be destroyed). Therefore you shouldn't do anything to the
 * dataset except release it.
 *
 * Operations generally fall somewhere into the following taxonomy:
 *
 *				Read-Only		Modifying
 *
 *	Dataset Layer / MOS	zfs get			zfs destroy
 *
 *	Individual Dataset	read()			write()
 *
 *
 * Dataset Layer Operations
 *
 * Modifying operations should generally use dsl_sync_task(). The synctask
 * infrastructure enforces proper locking strategy with respect to the
 * dp_config_rwlock. See the comment above dsl_sync_task() for details.
 *
 * Read-only operations will manually hold the pool, then the dataset, obtain
 * information from the dataset, then release the pool and dataset.
 * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 * hold/rele.
 *
 *
 * Operations On Individual Datasets
 *
 * Objects _within_ an objset should only be modified by the current 'owner'
 * of the objset to prevent incorrect concurrent modification. Thus, use
 * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner,
 * and fail with EBUSY if there is already an owner. The owner can then
 * implement its own locking strategy, independent of the dataset layer's
 * locking infrastructure.
 * (E.g., the ZPL has its own set of locks to control concurrency. A regular
 * vnop will not reach into the dataset layer).
 *
 * Ideally, objects would also only be read by the objset's owner, so that we
 * don't observe state mid-modification.
 * (E.g. the ZPL is creating a new object and linking it into a directory; if
 * you don't coordinate with the ZPL to hold ZPL-level locks, you could see an
 * intermediate state. The ioctl level violates this but in pretty benign
 * ways, e.g. reading the zpl props object.)
 */
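
/*
 * Editor's note (not part of the upstream source): a minimal sketch of the
 * long-hold formula described in the comment above. The helper name, the
 * dataset-name argument, and the error handling are illustrative only; the
 * exact signatures of the dsl_* calls live in dsl_pool.h and dsl_dataset.h.
 */
#if 0
static int
long_hold_example(const char *dsname, const void *tag)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	error = dsl_pool_hold(dsname, tag, &dp);
	if (error != 0)
		return (error);
	error = dsl_dataset_hold(dp, dsname, tag, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, tag);
		return (error);
	}
	/* ... perform checks while both the pool and dataset are held ... */
	dsl_dataset_long_hold(ds, tag);
	dsl_pool_rele(dp, tag);

	/* ... long-running task; "zfs destroy" now fails with EBUSY ... */

	dsl_dataset_long_rele(ds, tag);
	dsl_dataset_rele(ds, tag);
	return (0);
}
#endif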

int
dsl_pool_hold(const char *name, const void *tag, dsl_pool_t **dp)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, tag);
	if (error == 0) {
		*dp = spa_get_dsl(spa);
		dsl_pool_config_enter(*dp, tag);
	}
	return (error);
}

void
dsl_pool_rele(dsl_pool_t *dp, const void *tag)
{
	dsl_pool_config_exit(dp, tag);
	spa_close(dp->dp_spa, tag);
}

void
dsl_pool_config_enter(dsl_pool_t *dp, const void *tag)
{
	/*
	 * We use a "reentrant" reader-writer lock, but not reentrantly.
	 *
	 * The rrwlock can (with the track_all flag) track all reading threads,
	 * which is very useful for debugging which code path failed to release
	 * the lock, and for verifying that the *current* thread does hold
	 * the lock.
	 *
	 * (Unlike a rwlock, which knows that N threads hold it for
	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
	 * if any thread holds it for read, even if this thread doesn't).
	 */
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
}

void
dsl_pool_config_enter_prio(dsl_pool_t *dp, const void *tag)
{
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
}

void
dsl_pool_config_exit(dsl_pool_t *dp, const void *tag)
{
	rrw_exit(&dp->dp_config_rwlock, tag);
}

boolean_t
dsl_pool_config_held(dsl_pool_t *dp)
{
	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}

boolean_t
dsl_pool_config_held_writer(dsl_pool_t *dp)
{
	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
}

EXPORT_SYMBOL(dsl_pool_config_enter);
EXPORT_SYMBOL(dsl_pool_config_exit);

/* zfs_dirty_data_max_percent only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, UINT, ZMOD_RD,
	"Max percent of RAM allowed to be dirty");

/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD,
	"zfs_dirty_data_max upper bound as % of RAM");

ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW,
	"Transaction delay threshold");

ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW,
	"Determines the dirty space limit");

ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW,
	"The size limit of write-transaction zil log data");

/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD,
	"zfs_dirty_data_max upper bound in bytes");

ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW,
	"Dirty data txg sync threshold as a percentage of zfs_dirty_data_max");

ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW,
	"How quickly delay approaches infinity");

ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
	"Max percent of CPUs that are used per dp_sync_taskq");

ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW,
	"Number of taskq entries that are pre-populated");

ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW,
	"Max number of taskq entries that are cached");