Path: sys/contrib/openzfs/module/zfs/dmu_tx.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2024, 2025, Klara, Inc.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h>
#include <sys/spa.h>
#include <sys/brt_impl.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/trace_zfs.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

dmu_tx_stats_t dmu_tx_stats = {
	{ "dmu_tx_assigned",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_delay",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_error",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_suspended",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_group",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reserve",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reclaim",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_throttle",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_delay",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_over_max",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_frees_delay",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_wrlog_delay",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_quota",		KSTAT_DATA_UINT64 },
};

static kstat_t *dmu_tx_ksp;

dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd != NULL)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
	    offsetof(dmu_tx_callback_t, dcb_node));
	tx->tx_start = gethrtime();
	return (tx);
}

dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	return (tx);
}

dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	TXG_VERIFY(dp->dp_spa, txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

static dmu_tx_hold_t *
dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
    uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;

	if (dn != NULL) {
		(void) zfs_refcount_add(&dn->dn_holds, tx);
		if (tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT0(dn->dn_assigned_txg);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) zfs_refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
	zfs_refcount_create(&txh->txh_space_towrite);
	zfs_refcount_create(&txh->txh_memory_tohold);
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dnode_t *dn = NULL;
	dmu_tx_hold_t *txh;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os, object, FTAG, &dn);
		if (err != 0) {
			tx->tx_err = err;
			return (NULL);
		}
	}
	txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
	if (dn != NULL)
		dnode_rele(dn, FTAG);
	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx))
		(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
}

/*
 * This function reads specified data from disk.  The specified data will
 * be needed to perform the transaction -- i.e., it will be read after
 * we do dmu_tx_assign().  There are two reasons that we read the data now
 * (before dmu_tx_assign()):
 *
 * 1. Reading it now has potentially better performance.  The transaction
 * has not yet been assigned, so the TXG is not held open, and also the
 * caller typically has fewer locks held when calling dmu_tx_hold_*() than
 * after the transaction has been assigned.  This reduces the lock (and txg)
 * hold times, thus reducing lock contention.
 *
 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
 * that are detected before they start making changes to the DMU state
 * (i.e. now).  Once the transaction has been assigned, and some DMU
 * state has been changed, it can be difficult to recover from an i/o
 * error (e.g. to undo the changes already made in memory at the DMU
 * layer).  Typically code to do so does not exist in the caller -- it
 * assumes that the data has already been cached and thus i/o errors are
 * not possible.
 *
 * It has been observed that the i/o initiated here can be a performance
 * problem, and it appears to be optional, because we don't look at the
 * data which is read.  However, removing this read would only serve to
 * move the work elsewhere (after the dmu_tx_assign()), where it may
 * have a greater impact on performance (in addition to the impact on
 * fault tolerance noted above).
 */
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db);
	rw_exit(&dn->dn_struct_rwlock);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);
	/*
	 * PARTIAL_FIRST allows caching for uncacheable blocks.  It will
	 * be cleared after dmu_buf_will_dirty() calls dbuf_read() again.
	 */
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DMU_READ_NO_PREFETCH |
	    (level == 0 ? (DMU_UNCACHEDIO | DMU_PARTIAL_FIRST) : 0));
	dbuf_rele(db, FTAG);
	return (err);
}

static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the write: the first and last level-0 blocks (if
	 * they are not aligned, i.e. if they are partial-block writes),
	 * and all the level-1 blocks.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* last level-0 block */
		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
		if (end != start && end <= dn->dn_maxblkid &&
		    P2PHASE(off + len, dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(zio, dn, 0, end);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* level-1 blocks */
		if (dn->dn_nlevels > 1) {
			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			for (uint64_t i = (start >> shft) + 1;
			    i < end >> shft; i++) {
				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}
}

static void
dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the append: the first level-0 block (if the write is
	 * not block-aligned, i.e. a partial-block write).  No additional
	 * blocks are read.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}
}

static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
	    DNODE_MIN_SIZE, FTAG);
}

void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

void
dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

/*
 * Should be used when appending to an object and the exact offset is unknown.
 * The write must occur at or beyond the specified offset.  Only the L0 block
 * at the provided offset will be prefetched.
 */
void
dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_APPEND, off, DMU_OBJECT_END);
	if (txh != NULL) {
		dmu_tx_count_append(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

void
dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END);
	if (txh != NULL) {
		dmu_tx_count_append(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}
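
/*
 * Illustrative note on the append holds above (an assumed usage pattern,
 * not taken from any specific caller): a writer that only ever appends to
 * a growing object, e.g. a log, can hold with
 *
 *	dmu_tx_hold_append_by_dnode(tx, dn, eof_estimate, len);
 *
 * where "eof_estimate" and "len" stand for the caller's current end-of-file
 * estimate and write size.  After assignment the write may land at any
 * offset at or beyond eof_estimate, which is why only the L0 block at that
 * offset is checked for i/o errors above.
 */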

/*
 * This function marks the transaction as being a "net free".  The end
 * result is that refquotas will be disabled for this transaction, and
 * this transaction will be able to use half of the pool space overhead
 * (see dsl_pool_adjustedsize()).  Therefore this function should only
 * be called for transactions that we expect will not cause a net increase
 * in the amount of space used (but it's OK if that is occasionally not true).
 */
void
dmu_tx_mark_netfree(dmu_tx_t *tx)
{
	tx->tx_netfree = B_TRUE;
}

static void
dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT0(tx->tx_txg);

	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, we read the first and last level-0
	 * blocks if they are not aligned, and all the level-1 blocks.
	 *
	 * Note:  dbuf_free_range() assumes that we have not instantiated
	 * any level-0 dbufs that will be completely freed.  Therefore we must
	 * exercise care to not read or count the first and last blocks
	 * if they are blocksize-aligned.
	 */
	if (dn->dn_datablkshift == 0) {
		if (off != 0 || len < dn->dn_datablksz)
			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
	} else {
		/* first block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off, 1);
		/* last block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off + len, 1);
	}

	/*
	 * Check level-1 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		uint64_t start = off >> shift;
		uint64_t end = (off + len) >> shift;

		ASSERT(dn->dn_indblkshift != 0);

		/*
		 * dnode_reallocate() can result in an object with indirect
		 * blocks having an odd data block size.  In this case,
		 * just check the single block.
		 */
		if (dn->dn_datablkshift == 0)
			start = end = 0;

		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (uint64_t i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH || i > end)
				break;
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}

			(void) zfs_refcount_add_many(&txh->txh_memory_tohold,
			    1 << dn->dn_indblkshift, FTAG);

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}
		}
		err = zio_wait(zio);
		if (err != 0) {
			tx->tx_err = err;
			return;
		}
	}
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh != NULL) {
		dmu_tx_count_dnode(txh);
		dmu_tx_count_free(txh, off, len);
	}
}

void
dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
	if (txh != NULL) {
		dmu_tx_count_dnode(txh);
		dmu_tx_count_free(txh, off, len);
	}
}

static void
dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len,
    uint_t blksz)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT0(tx->tx_txg);
	ASSERT(dn->dn_indblkshift != 0);
	ASSERT(blksz != 0);
	ASSERT0(off % blksz);

	(void) zfs_refcount_add_many(&txh->txh_memory_tohold,
	    len / blksz * sizeof (brt_entry_t), FTAG);

	int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t start = off / blksz >> shift;
	uint64_t end = (off + len) / blksz >> shift;

	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
	    (end - start + 1) << dn->dn_indblkshift, FTAG);

	zio_t *zio = zio_root(tx->tx_pool->dp_spa,
	    NULL, NULL, ZIO_FLAG_CANFAIL);
	for (uint64_t i = start; i <= end; i++) {
		err = dmu_tx_check_ioerr(zio, dn, 1, i);
		if (err != 0) {
			tx->tx_err = err;
			break;
		}
	}
	err = zio_wait(zio);
	if (err != 0)
		tx->tx_err = err;
}

void
dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
    uint64_t len, uint_t blksz)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len);
	if (txh != NULL) {
		dmu_tx_count_dnode(txh);
		dmu_tx_count_clone(txh, off, len, blksz);
	}
}

static void
dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT0(tx->tx_txg);

	dmu_tx_count_dnode(txh);

	/*
	 * Modifying an almost-full microzap is around the worst case (128KB).
	 *
	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * - 4 new blocks written if adding:
	 *    - 2 blocks for possibly split leaves,
	 *    - 2 grown ptrtbl blocks
	 */
	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
	    zap_get_micro_max_size(tx->tx_pool->dp_spa), FTAG);

	if (dn == NULL)
		return;

	ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);

	if (dn->dn_maxblkid == 0 || name == NULL) {
		/*
		 * This is a microzap (only one block), or we don't know
		 * the name.  Check the first block for i/o errors.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err != 0) {
			tx->tx_err = err;
		}
	} else {
		/*
		 * Access the name so that we'll check for i/o errors to
		 * the leaf blocks, etc.  We ignore ENOENT, as this name
		 * may not yet exist.
		 */
		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
		if (err == EIO || err == ECKSUM || err == ENXIO) {
			tx->tx_err = err;
		}
	}
}

void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_ZAP, add, (uintptr_t)name);
	if (txh != NULL)
		dmu_tx_hold_zap_impl(txh, name);
}

void
dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT(dn != NULL);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
	if (txh != NULL)
		dmu_tx_hold_zap_impl(txh, name);
}

void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
	if (txh) {
		(void) zfs_refcount_add_many(
		    &txh->txh_space_towrite, space, FTAG);
	}
}

#ifdef ZFS_DEBUG
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, (u_longlong_t)beginblk,
			    (u_longlong_t)endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_APPEND:
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;

				/*
				 * THT_WRITE is used for bonus and spill
				 * blocks.
				 */
				ASSERT(blkid != DMU_BONUS_BLKID &&
				    blkid != DMU_SPILL_BLKID);

				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			case THT_CLONE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			default:
				cmn_err(CE_PANIC, "bad txh_type %d",
				    txh->txh_type);
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

/*
 * If we can't do 10 iops, something is wrong.  Let us go ahead
 * and hit zfs_dirty_data_max.
 */
static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */

/*
 * We delay transactions when we've determined that the backend storage
 * isn't able to accommodate the rate of incoming writes.
 *
 * If there is already a transaction waiting, we delay relative to when
 * that transaction finishes waiting.  This way the calculated min_time
 * is independent of the number of threads concurrently executing
 * transactions.
 *
 * If we are the only waiter, wait relative to when the transaction
 * started, rather than the current time.  This credits the transaction for
 * "time already served", e.g. reading indirect blocks.
 *
 * The minimum time for a transaction to take is calculated as:
 *     min_time = scale * (dirty - min) / (max - dirty)
 *     min_time is then capped at zfs_delay_max_ns.
 *
 * The delay has two degrees of freedom that can be adjusted via tunables.
 * The percentage of dirty data at which we start to delay is defined by
 * zfs_delay_min_dirty_percent.  This should typically be at or above
 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
 * delay after writing at full speed has failed to keep up with the incoming
 * write rate.  The scale of the curve is defined by zfs_delay_scale.  Roughly
 * speaking, this variable determines the amount of delay at the midpoint of
 * the curve.
 *
 * delay
 *  10ms +-------------------------------------------------------------*+
 *       |                                                             *|
 *   9ms +                                                             *+
 *       |                                                             *|
 *   8ms +                                                             *+
 *       |                                                            * |
 *   7ms +                                                            * +
 *       |                                                            * |
 *   6ms +                                                            * +
 *       |                                                            * |
 *   5ms +                                                           *  +
 *       |                                                           *  |
 *   4ms +                                                           *  +
 *       |                                                           *  |
 *   3ms +                                                          *   +
 *       |                                                          *   |
 *   2ms +                                              (midpoint) *    +
 *       |                                                  |    **     |
 *   1ms +                                                  v ***       +
 *       |             zfs_delay_scale ---------->     ********         |
 *     0 +-------------------------------------*********----------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note that since the delay is added to the outstanding time remaining on the
 * most recent transaction, the delay is effectively the inverse of IOPS.
 * Here the midpoint of 500us translates to 2000 IOPS.  The shape of the curve
 * was chosen such that small changes in the amount of accumulated dirty data
 * in the first 3/4 of the curve yield relatively small differences in the
 * amount of delay.
 *
 * The effects can be easier to understand when the amount of delay is
 * represented on a log scale:
 *
 * delay
 * 100ms +-------------------------------------------------------------++
 *       +                                                              +
 *       |                                                              |
 *       +                                                             *+
 *  10ms +                                                             *+
 *       +                                                           ** +
 *       |                                              (midpoint)  **  |
 *       +                                                   |     **   +
 *   1ms +                                                   v ****     +
 *       +             zfs_delay_scale ---------->       *****          +
 *       |                                             ****             |
 *       +                                          ****                +
 * 100us +                                        **                    +
 *       +                                       *                      +
 *       |                                      *                       |
 *       +                                     *                        +
 *  10us +                                     *                        +
 *       +                                                              +
 *       |                                                              |
 *       +                                                              +
 *       +--------------------------------------------------------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note here that only as the amount of dirty data approaches its limit does
 * the delay start to increase rapidly.  The goal of a properly tuned system
 * should be to keep the amount of dirty data out of that range by first
 * ensuring that the appropriate limits are set for the I/O scheduler to reach
 * optimal throughput on the backend storage, and then by changing the value
 * of zfs_delay_scale to increase the steepness of the curve.
 */
static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
	dsl_pool_t *dp = tx->tx_pool;
	uint64_t delay_min_bytes, wrlog;
	hrtime_t wakeup, tx_time = 0, now;

	/* Calculate minimum transaction time for the dirty data amount. */
	delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	if (dirty > delay_min_bytes) {
		/*
		 * The caller has already waited until we are under the max.
		 * We make them pass us the amount of dirty data so we don't
		 * have to handle the case of it being >= the max, which
		 * could cause a divide-by-zero if it's == the max.
		 */
		ASSERT3U(dirty, <, zfs_dirty_data_max);

		tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
		    (zfs_dirty_data_max - dirty);
	}

	/* Calculate minimum transaction time for the TX_WRITE log size. */
	wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
	delay_min_bytes =
	    zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
	if (wrlog >= zfs_wrlog_data_max) {
		tx_time = zfs_delay_max_ns;
	} else if (wrlog > delay_min_bytes) {
		tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
		    (zfs_wrlog_data_max - wrlog), tx_time);
	}

	if (tx_time == 0)
		return;

	tx_time = MIN(tx_time, zfs_delay_max_ns);
	now = gethrtime();
	if (now > tx->tx_start + tx_time)
		return;

	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
	    uint64_t, tx_time);

	mutex_enter(&dp->dp_lock);
	wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
	dp->dp_last_wakeup = wakeup;
	mutex_exit(&dp->dp_lock);

	zfs_sleep_until(wakeup);
}
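
/*
 * Worked example of the min_time formula described above dmu_tx_delay()
 * (illustrative only; it assumes the common defaults of
 * zfs_delay_scale = 500,000ns and zfs_delay_min_dirty_percent = 60,
 * and a zfs_dirty_data_max of 4GB):
 *
 *	min      = 60% of 4GB = 2.4GB
 *	dirty    = 95% of 4GB = 3.8GB
 *	min_time = 500,000ns * (3.8GB - 2.4GB) / (4GB - 3.8GB)
 *	         = 500,000ns * 7 = 3,500,000ns (3.5ms)
 *
 * which is well under the zfs_delay_max_ns cap of 100ms.  Below 60% dirty
 * no delay is applied at all; as dirty data approaches zfs_dirty_data_max
 * the computed delay climbs steeply toward the cap.
 */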

/*
 * This routine attempts to assign the transaction to a transaction group.
 * To do so, we must determine if there is sufficient free space on disk.
 *
 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
 * on it), then it is assumed that there is sufficient free space,
 * unless there's insufficient slop space in the pool (see the comment
 * above spa_slop_shift in spa_misc.c).
 *
 * If it is not a "netfree" transaction, then if the data already on disk
 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
 * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
 * plus the rough estimate of this transaction's changes, may exceed the
 * allowed usage, then this will fail with ERESTART, which will cause the
 * caller to wait for the pending changes to be written to disk (by waiting
 * for the next TXG to open), and then check the space usage again.
 *
 * The rough estimate of pending changes is comprised of the sum of:
 *
 *  - this transaction's holds' txh_space_towrite
 *
 *  - dd_tempreserved[], which is the sum of in-flight transactions'
 *    holds' txh_space_towrite (i.e. those transactions that have called
 *    dmu_tx_assign() but not yet called dmu_tx_commit()).
 *
 *  - dd_space_towrite[], which is the amount of dirtied dbufs.
 *
 * Note that all of these values are inflated by spa_get_worst_case_asize(),
 * which means that we may get ERESTART well before we are actually in danger
 * of running out of space, but this also mitigates any small inaccuracies
 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 * to the MOS).
 *
 * Note that due to this algorithm, it is possible to exceed the allowed
 * usage by one transaction.  Also, as we approach the allowed usage,
 * we will allow a very limited amount of changes into each TXG, thus
 * decreasing performance.
 */
static int
dmu_tx_try_assign(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	if (tx->tx_err) {
		DMU_TX_STAT_BUMP(dmu_tx_error);
		return (SET_ERROR(EIO));
	}

	if (spa_suspended(spa)) {
		DMU_TX_STAT_BUMP(dmu_tx_suspended);

		/*
		 * Let dmu_tx_assign() know specifically what happened, so
		 * it can make the right choice based on the caller flags.
		 */
		return (SET_ERROR(ESHUTDOWN));
	}

	if (!tx->tx_dirty_delayed &&
	    dsl_pool_need_wrlog_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
		return (SET_ERROR(ERESTART));
	}

	if (!tx->tx_dirty_delayed &&
	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
		return (SET_ERROR(ERESTART));
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			/*
			 * This thread can't hold the dn_struct_rwlock
			 * while assigning the tx, because this can lead to
			 * deadlock. Specifically, if this dnode is already
			 * assigned to an earlier txg, this thread may need
			 * to wait for that txg to sync (the ERESTART case
			 * below).  The other thread that has assigned this
			 * dnode to an earlier txg prevents this txg from
			 * syncing until its tx can complete (calling
			 * dmu_tx_commit()), but it may need to acquire the
			 * dn_struct_rwlock to do so (e.g. via
			 * dmu_buf_hold*()).
			 *
			 * Note that this thread can't hold the lock for
			 * read either, but the rwlock doesn't record
			 * enough information to make that assertion.
			 */
			ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock));

			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				DMU_TX_STAT_BUMP(dmu_tx_group);
				return (SET_ERROR(ERESTART));
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) zfs_refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += zfs_refcount_count(&txh->txh_space_towrite);
		tohold += zfs_refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	if (tx->tx_dir != NULL && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
		if (err != 0)
			return (err);
	}

	DMU_TX_STAT_BUMP(dmu_tx_assigned);

	return (0);
}

static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	/*
	 * Walk the transaction's hold list, removing the hold on the
	 * associated dnode, and notifying waiters if the refcount drops to 0.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
	    txh && txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group; `flags` is a bitmask:
 *
 * If DMU_TX_WAIT is set and the currently open txg is full, this function
 * will wait until there's a new txg.  This should be used when no locks
 * are being held.  With this bit set, this function will only fail if
 * we're truly out of space (ENOSPC), over quota (EDQUOT), or required
 * data for the transaction could not be read from disk (EIO).
 *
 * If DMU_TX_WAIT is *not* set and we can't assign into the currently open
 * txg without blocking, this function will return immediately with
 * ERESTART.  This should be used whenever locks are being held.  On an
 * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
 * and try again.
 *
 * If DMU_TX_NOTHROTTLE is set, this indicates that this tx should not be
 * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
 * details on the throttle).  This is used by the VFS operations, after
 * they have already called dmu_tx_wait() (though most likely on a
 * different tx).
 *
 * If DMU_TX_SUSPEND is set, this indicates that this tx should ignore
 * the pool being or becoming suspended while it is in progress.  This will
 * cause dmu_tx_assign() (and dmu_tx_wait()) to block until the pool resumes.
 * If this flag is not set and the pool suspends, the return will be either
 * ERESTART or EIO, depending on the value of the pool's failmode= property.
 *
 * It is guaranteed that subsequent successful calls to dmu_tx_assign()
 * will assign the tx to monotonically increasing txgs.  Of course this is
 * not strong monotonicity, because the same txg can be returned multiple
 * times in a row.  This guarantee holds both for subsequent calls from
 * one thread and for multiple threads.  For example, it is impossible to
 * observe the following sequence of events:
 *
 *          Thread 1                            Thread 2
 *
 *     dmu_tx_assign(T1, ...)
 *     1 <- dmu_tx_get_txg(T1)
 *                                       dmu_tx_assign(T2, ...)
 *                                       2 <- dmu_tx_get_txg(T2)
 *     dmu_tx_assign(T3, ...)
 *     1 <- dmu_tx_get_txg(T3)
 */
int
dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags)
{
	int err;

	ASSERT0(tx->tx_txg);
	ASSERT0(flags & ~(DMU_TX_WAIT | DMU_TX_NOTHROTTLE | DMU_TX_SUSPEND));
	IMPLY(flags & DMU_TX_SUSPEND, flags & DMU_TX_WAIT);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	/* If we might wait, we must not hold the config lock. */
	IMPLY((flags & DMU_TX_WAIT), !dsl_pool_config_held(tx->tx_pool));

	if ((flags & DMU_TX_NOTHROTTLE))
		tx->tx_dirty_delayed = B_TRUE;

	if (!(flags & DMU_TX_SUSPEND))
		tx->tx_break_on_suspend = B_TRUE;

	while ((err = dmu_tx_try_assign(tx)) != 0) {
		dmu_tx_unassign(tx);

		boolean_t suspended = (err == ESHUTDOWN);
		if (suspended) {
			/*
			 * Pool suspended. We need to decide whether to block
			 * and retry, or return error, depending on the
			 * caller's flags and the pool config.
			 */
			if (flags & DMU_TX_SUSPEND)
				/*
				 * The caller expressly does not care about
				 * suspend, so treat it as a normal retry.
				 */
				err = SET_ERROR(ERESTART);
			else if ((flags & DMU_TX_WAIT) &&
			    spa_get_failmode(tx->tx_pool->dp_spa) ==
			    ZIO_FAILURE_MODE_CONTINUE)
				/*
				 * Caller wants to wait, but pool config is
				 * overriding that, so return EIO to be
				 * propagated back to userspace.
				 */
				err = SET_ERROR(EIO);
			else
				/* Anything else, we should just block. */
				err = SET_ERROR(ERESTART);
		}

		/*
		 * Return unless we decided to retry, or the caller does not
		 * want to block.
		 */
		if (err != ERESTART || !(flags & DMU_TX_WAIT)) {
			ASSERT(err == EDQUOT || err == ENOSPC ||
			    err == ERESTART || err == EIO);
			return (err);
		}

		/*
		 * Wait until there's room in this txg, or until it's been
		 * synced out and a new one is available.
		 *
		 * If we're here because the pool suspended above, then we
		 * unset tx_break_on_suspend to make sure that if dmu_tx_wait()
		 * has to fall back to a txg_wait_synced_flags(), it doesn't
		 * immediately return because the pool is suspended. That would
		 * then immediately return here, and we'd end up in a busy loop
		 * until the pool resumes.
		 *
		 * On the other hand, if the pool hasn't suspended yet, then it
		 * should be allowed to break a txg wait if the pool does
		 * suspend, so we can loop and reassess it in
		 * dmu_tx_try_assign().
		 */
		if (suspended)
			tx->tx_break_on_suspend = B_FALSE;

		dmu_tx_wait(tx);

		/*
		 * Reset tx_break_on_suspend for DMU_TX_SUSPEND.  We do this
		 * here so that it's available if we return for some other
		 * reason, and then the caller calls dmu_tx_wait().
		 */
		if (!(flags & DMU_TX_SUSPEND))
			tx->tx_break_on_suspend = B_TRUE;
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}
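
/*
 * For illustration, a minimal caller of the interfaces above might look
 * like the following sketch (hypothetical code; "os", "object", "off",
 * "len" and "buf" stand in for the caller's own context):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	error = dmu_tx_assign(tx, DMU_TX_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);
 *
 * A caller holding locks would instead assign without DMU_TX_WAIT and,
 * on ERESTART, drop its locks, call dmu_tx_wait() and dmu_tx_abort(),
 * and retry from dmu_tx_create(), as described above dmu_tx_assign().
 */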

void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;
	hrtime_t before;

	ASSERT0(tx->tx_txg);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	/*
	 * Break on suspend according to whether or not DMU_TX_SUSPEND was
	 * supplied to the previous dmu_tx_assign() call. For clients, this
	 * ensures that after dmu_tx_assign() fails, the followup dmu_tx_wait()
	 * gets the same behaviour wrt suspend. See also the comments in
	 * dmu_tx_assign().
	 */
	txg_wait_flag_t flags =
	    (tx->tx_break_on_suspend ? TXG_WAIT_SUSPEND : TXG_WAIT_NONE);

	before = gethrtime();

	if (tx->tx_wait_dirty) {
		uint64_t dirty;

		/*
		 * dmu_tx_try_assign() has determined that we need to wait
		 * because we've consumed much or all of the dirty buffer
		 * space.
		 */
		mutex_enter(&dp->dp_lock);
		if (dp->dp_dirty_total >= zfs_dirty_data_max)
			DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
		while (dp->dp_dirty_total >= zfs_dirty_data_max)
			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
		dirty = dp->dp_dirty_total;
		mutex_exit(&dp->dp_lock);

		dmu_tx_delay(tx, dirty);

		tx->tx_wait_dirty = B_FALSE;

		/*
		 * Note: setting tx_dirty_delayed only has effect if the
		 * caller used DMU_TX_WAIT.  Otherwise they are going to
		 * destroy this tx and try again.  The common case,
		 * zfs_write(), uses DMU_TX_WAIT.
		 */
		tx->tx_dirty_delayed = B_TRUE;
	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		/*
		 * If the pool is suspended we need to wait until it
		 * is resumed.  Note that it's possible that the pool
		 * has become active after this thread has tried to
		 * obtain a tx.  If that's the case then tx_lasttried_txg
		 * would not have been set.
		 */
		txg_wait_synced_flags(dp, spa_last_synced_txg(spa) + 1, flags);
	} else if (tx->tx_needassign_txh) {
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		/*
		 * If we have a lot of dirty data just wait until we sync
		 * out a TXG at which point we'll hopefully have synced
		 * a portion of the changes.
		 */
		txg_wait_synced_flags(dp, spa_last_synced_txg(spa) + 1, flags);
	}

	spa_tx_assign_add_nsecs(spa, gethrtime() - before);
}

static void
dmu_tx_destroy(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		zfs_refcount_destroy_many(&txh->txh_space_towrite,
		    zfs_refcount_count(&txh->txh_space_towrite));
		zfs_refcount_destroy_many(&txh->txh_memory_tohold,
		    zfs_refcount_count(&txh->txh_memory_tohold));
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
	kmem_free(tx, sizeof (dmu_tx_t));
}

void
dmu_tx_commit(dmu_tx_t *tx)
{
	/* This function should only be used on assigned transactions. */
	ASSERT(tx->tx_txg != 0);

	/*
	 * Go through the transaction's hold list and remove holds on
	 * associated dnodes, notifying waiters if no holds remain.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;

		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	dmu_tx_destroy(tx);
}

void
dmu_tx_abort(dmu_tx_t *tx)
{
	/* This function should not be used on assigned transactions. */
	ASSERT0(tx->tx_txg);

	/* Should not be needed, but better be safe than sorry. */
	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	/*
	 * Call any registered callbacks with an error code.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED));

	/* Should not be needed, but better be safe than sorry. */
	dmu_tx_unassign(tx);

	dmu_tx_destroy(tx);
}

uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}

dsl_pool_t *
dmu_tx_pool(dmu_tx_t *tx)
{
	ASSERT(tx->tx_pool != NULL);
	return (tx->tx_pool);
}

/*
 * Register a callback to be executed at the end of a TXG.
 *
 * Note: This currently exists for outside consumers, specifically the ZFS OSD
 * for Lustre. Please do not remove before checking that project. For examples
 * on how to use this see `ztest_commit_callback`.
 */
void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
	dmu_tx_callback_t *dcb;

	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

	dcb->dcb_func = func;
	dcb->dcb_data = data;

	list_insert_tail(&tx->tx_callbacks, dcb);
}

/*
 * Call all the commit callbacks on a list, with a given error code.
 */
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
	dmu_tx_callback_t *dcb;

	while ((dcb = list_remove_tail(cb_list)) != NULL) {
		dcb->dcb_func(dcb->dcb_data, error);
		kmem_free(dcb, sizeof (dmu_tx_callback_t));
	}
}
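
/*
 * Illustrative sketch of commit callback usage (hypothetical caller code;
 * "my_commit_cb" and "arg" are made-up names):
 *
 *	static void
 *	my_commit_cb(void *arg, int error)
 *	{
 *		...
 *	}
 *
 *	dmu_tx_callback_register(tx, my_commit_cb, arg);
 *
 * The callback runs once the transaction's txg has synced, with error == 0,
 * or with an error such as ECANCELED if the tx is aborted (see
 * dmu_tx_abort() above).  See ztest_commit_callback for a complete
 * in-tree example.
 */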

/*
 * Interface to hold a bunch of attributes.
 * Used for creating new files.
 * attrsize is the total size of all attributes
 * to be added during object creation
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */

/*
 * Hold necessary attribute name for attribute registration.
 * It should be a very rare case where this is needed.  If it does
 * happen it would only happen on the first write to the file system.
 */
static void
dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
{
	if (!sa->sa_need_attr_registration)
		return;

	for (int i = 0; i != sa->sa_num_attrs; i++) {
		if (!sa->sa_attr_table[i].sa_registered) {
			if (sa->sa_reg_attr_obj)
				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
			else
				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
		}
	}
}

void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
	    THT_SPILL, 0, 0);
	if (txh != NULL)
		(void) zfs_refcount_add_many(&txh->txh_space_towrite,
		    SPA_OLD_MAXBLOCKSIZE, FTAG);
}

void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
	sa_os_t *sa = tx->tx_objset->os_sa;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
	} else {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
		return;

	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPILL, 0, 0);
}

/*
 * Hold SA attribute
 *
 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
 *
 * variable_size is the total size of all variable sized attributes
 * passed to this function.  It is not the total size of all
 * variable size attributes that *may* exist on this object.
 */
void
dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
{
	uint64_t object;
	sa_os_t *sa = tx->tx_objset->os_sa;

	ASSERT(hdl != NULL);

	object = sa_handle_object(hdl);

	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
	DB_DNODE_ENTER(db);
	dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db));
	DB_DNODE_EXIT(db);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);

	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
		ASSERT0(tx->tx_txg);
		dmu_tx_hold_spill(tx, object);
	} else {
		DB_DNODE_ENTER(db);
		if (DB_DNODE(db)->dn_have_spill) {
			ASSERT0(tx->tx_txg);
			dmu_tx_hold_spill(tx, object);
		}
		DB_DNODE_EXIT(db);
	}
}

void
dmu_tx_init(void)
{
	dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
	    KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (dmu_tx_ksp != NULL) {
		dmu_tx_ksp->ks_data = &dmu_tx_stats;
		kstat_install(dmu_tx_ksp);
	}
}

void
dmu_tx_fini(void)
{
	if (dmu_tx_ksp != NULL) {
		kstat_delete(dmu_tx_ksp);
		dmu_tx_ksp = NULL;
	}
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_append);
EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_bonus);
EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
EXPORT_SYMBOL(dmu_tx_abort);
EXPORT_SYMBOL(dmu_tx_assign);
EXPORT_SYMBOL(dmu_tx_wait);
EXPORT_SYMBOL(dmu_tx_commit);
EXPORT_SYMBOL(dmu_tx_mark_netfree);
EXPORT_SYMBOL(dmu_tx_get_txg);
EXPORT_SYMBOL(dmu_tx_callback_register);
EXPORT_SYMBOL(dmu_tx_do_callbacks);
EXPORT_SYMBOL(dmu_tx_hold_spill);
EXPORT_SYMBOL(dmu_tx_hold_sa_create);
EXPORT_SYMBOL(dmu_tx_hold_sa);
#endif