Path: blob/main/sys/contrib/openzfs/module/zfs/dmu_object.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
 * grab 128 slots, which is 4 blocks worth. This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
uint_t dmu_object_alloc_chunk_shift = 7;
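
/*
 * A quick sanity check of the figures above, shown only as an illustration
 * and assuming the usual layout of 512-byte dnodes in 16 KiB dnode blocks
 * (DNODES_PER_BLOCK == 32):
 *
 *	slots per chunk  = 1 << dmu_object_alloc_chunk_shift = 1 << 7 = 128
 *	blocks per chunk = 128 / DNODES_PER_BLOCK = 128 / 32 = 4
 *
 * which matches the "128 slots, which is 4 blocks worth" figure above.
 */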

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
	    os->os_obj_next_percpu_len];

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf. It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it. Since the caller is responsible
	 * for releasing the hold, they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT0P(tag);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full). Look
			 * from the beginning at most once per txg. If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty. This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts. In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects. Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us. The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller. Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error. This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}
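
/*
 * Illustrative sketch (not part of the API defined here) of how the
 * allocation entry points below are typically called. It assumes the caller
 * has already created and assigned a transaction via the dmu_tx interfaces,
 * and it uses DMU_OT_UINT64_OTHER purely as an example object type:
 *
 *	uint64_t obj;
 *
 *	obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 *	    DMU_OT_NONE, 0, tx);
 *
 * Passing 0 for blocksize conventionally leaves the data block size at its
 * default, and a bonustype of DMU_OT_NONE with bonuslen 0 requests no bonus
 * buffer.
 */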

uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx);
}

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx);
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}

/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument. The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}
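
/*
 * Illustrative sketch of the hold contract described above: the caller
 * supplies its own tag and later drops the hold itself with dnode_rele(),
 * instead of paying for an immediate release/re-hold pair. The transaction
 * setup and the DMU_OT_UINT64_OTHER object type are assumptions made only
 * for the example:
 *
 *	dnode_t *dn;
 *	uint64_t obj;
 *
 *	obj = dmu_object_alloc_hold(os, DMU_OT_UINT64_OTHER, 0, 0,
 *	    DMU_OT_NONE, 0, 0, &dn, FTAG, tx);
 *	... use dn directly ...
 *	dnode_rele(dn, FTAG);
 */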

int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block. The contents
		 * of each slot in the block are known so it can be quickly
		 * checked. If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			if (i == 0)
				return (SET_ERROR(ESRCH));
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
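
/*
 * Illustrative sketch of the usual iteration pattern built on
 * dmu_object_next(): start from object 0, pass hole == B_FALSE to visit
 * allocated objects, and pass txg 0 so no txg filtering is applied. The
 * loop ends when dmu_object_next() returns ESRCH (no further objects):
 *
 *	uint64_t obj = 0;
 *
 *	while (dmu_object_next(os, &obj, B_FALSE, 0) == 0) {
 *		... obj is the number of an allocated object ...
 *	}
 */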

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");
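
/*
 * Tuning note, with the export paths stated as assumptions about the
 * packaging rather than anything defined in this file: ZFS_MODULE_PARAM()
 * is expected to surface the tunable above as
 * /sys/module/zfs/parameters/dmu_object_alloc_chunk_shift on Linux and as a
 * vfs.zfs sysctl on FreeBSD. As a worked example, raising the shift from 7
 * to 9 would make each per-CPU allocator claim 1 << 9 = 512 slots (16 dnode
 * blocks of 32 dnodes each) per trip to the global allocator.
 */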