/* Source: sys/contrib/openzfs/module/zfs/dmu_objset.c */
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/2122/*23* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.24* Copyright (c) 2012, 2020 by Delphix. All rights reserved.25* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.26* Copyright (c) 2013, Joyent, Inc. All rights reserved.27* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.28* Copyright (c) 2015, STRATO AG, Inc. All rights reserved.29* Copyright (c) 2016 Actifio, Inc. All rights reserved.30* Copyright 2017 Nexenta Systems, Inc.31* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.32* Copyright (c) 2018, loli10K <[email protected]>. 
All rights reserved.33* Copyright (c) 2019, Klara Inc.34* Copyright (c) 2019, Allan Jude35* Copyright (c) 2022 Hewlett Packard Enterprise Development LP.36* Copyright (c) 2025, Rob Norris <[email protected]>37*/3839/* Portions Copyright 2010 Robert Milkowski */4041#include <sys/cred.h>42#include <sys/zfs_context.h>43#include <sys/dmu_objset.h>44#include <sys/dsl_dir.h>45#include <sys/dsl_dataset.h>46#include <sys/dsl_prop.h>47#include <sys/dsl_pool.h>48#include <sys/dsl_synctask.h>49#include <sys/dsl_deleg.h>50#include <sys/dnode.h>51#include <sys/dbuf.h>52#include <sys/zvol.h>53#include <sys/dmu_tx.h>54#include <sys/zap.h>55#include <sys/zil.h>56#include <sys/dmu_impl.h>57#include <sys/zfs_ioctl.h>58#include <sys/sa.h>59#include <sys/zfs_onexit.h>60#include <sys/dsl_destroy.h>61#include <sys/vdev.h>62#include <sys/zfeature.h>63#include <sys/policy.h>64#include <sys/spa_impl.h>65#include <sys/dmu_recv.h>66#include <sys/zfs_project.h>67#include "zfs_namecheck.h"68#include <sys/vdev_impl.h>69#include <sys/arc.h>70#include <cityhash.h>71#include <sys/cred.h>7273/*74* Needed to close a window in dnode_move() that allows the objset to be freed75* before it can be safely accessed.76*/77krwlock_t os_lock;7879/*80* Tunable to overwrite the maximum number of threads for the parallelization81* of dmu_objset_find_dp, needed to speed up the import of pools with many82* datasets.83* Default is 4 times the number of leaf vdevs.84*/85static const int dmu_find_threads = 0;8687/*88* Backfill lower metadnode objects after this many have been freed.89* Backfilling negatively impacts object creation rates, so only do it90* if there are enough holes to fill.91*/92static const int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;9394static const char *upgrade_tag = "upgrade_tag";9596static void dmu_objset_find_dp_cb(void *arg);9798static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb);99static void dmu_objset_upgrade_stop(objset_t 
*os);100101void102dmu_objset_init(void)103{104rw_init(&os_lock, NULL, RW_DEFAULT, NULL);105}106107void108dmu_objset_fini(void)109{110rw_destroy(&os_lock);111}112113spa_t *114dmu_objset_spa(objset_t *os)115{116return (os->os_spa);117}118119zilog_t *120dmu_objset_zil(objset_t *os)121{122return (os->os_zil);123}124125dsl_pool_t *126dmu_objset_pool(objset_t *os)127{128dsl_dataset_t *ds;129130if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)131return (ds->ds_dir->dd_pool);132else133return (spa_get_dsl(os->os_spa));134}135136dsl_dataset_t *137dmu_objset_ds(objset_t *os)138{139return (os->os_dsl_dataset);140}141142dmu_objset_type_t143dmu_objset_type(objset_t *os)144{145return (os->os_phys->os_type);146}147148void149dmu_objset_name(objset_t *os, char *buf)150{151dsl_dataset_name(os->os_dsl_dataset, buf);152}153154uint64_t155dmu_objset_id(objset_t *os)156{157dsl_dataset_t *ds = os->os_dsl_dataset;158159return (ds ? ds->ds_object : 0);160}161162uint64_t163dmu_objset_dnodesize(objset_t *os)164{165return (os->os_dnodesize);166}167168zfs_sync_type_t169dmu_objset_syncprop(objset_t *os)170{171return (os->os_sync);172}173174zfs_logbias_op_t175dmu_objset_logbias(objset_t *os)176{177return (os->os_logbias);178}179180static void181checksum_changed_cb(void *arg, uint64_t newval)182{183objset_t *os = arg;184185/*186* Inheritance should have been done by now.187*/188ASSERT(newval != ZIO_CHECKSUM_INHERIT);189190os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);191}192193static void194compression_changed_cb(void *arg, uint64_t newval)195{196objset_t *os = arg;197198/*199* Inheritance and range checking should have been done by now.200*/201ASSERT(newval != ZIO_COMPRESS_INHERIT);202203os->os_compress = zio_compress_select(os->os_spa,204ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON);205os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress,206ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT);207}208209static void210copies_changed_cb(void *arg, uint64_t 
newval)211{212objset_t *os = arg;213214/*215* Inheritance and range checking should have been done by now.216*/217ASSERT(newval > 0);218ASSERT(newval <= spa_max_replication(os->os_spa));219220os->os_copies = newval;221}222223static void224dedup_changed_cb(void *arg, uint64_t newval)225{226objset_t *os = arg;227spa_t *spa = os->os_spa;228enum zio_checksum checksum;229230/*231* Inheritance should have been done by now.232*/233ASSERT(newval != ZIO_CHECKSUM_INHERIT);234235checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);236237os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;238os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);239}240241static void242primary_cache_changed_cb(void *arg, uint64_t newval)243{244objset_t *os = arg;245246/*247* Inheritance and range checking should have been done by now.248*/249ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||250newval == ZFS_CACHE_METADATA);251252os->os_primary_cache = newval;253}254255static void256secondary_cache_changed_cb(void *arg, uint64_t newval)257{258objset_t *os = arg;259260/*261* Inheritance and range checking should have been done by now.262*/263ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||264newval == ZFS_CACHE_METADATA);265266os->os_secondary_cache = newval;267}268269static void270prefetch_changed_cb(void *arg, uint64_t newval)271{272objset_t *os = arg;273274/*275* Inheritance should have been done by now.276*/277ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE ||278newval == ZFS_PREFETCH_METADATA);279os->os_prefetch = newval;280}281282static void283sync_changed_cb(void *arg, uint64_t newval)284{285objset_t *os = arg;286287/*288* Inheritance and range checking should have been done by now.289*/290ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||291newval == ZFS_SYNC_DISABLED);292293os->os_sync = newval;294if (os->os_zil)295zil_set_sync(os->os_zil, newval);296}297298static void299redundant_metadata_changed_cb(void *arg, 
uint64_t newval)300{301objset_t *os = arg;302303/*304* Inheritance and range checking should have been done by now.305*/306ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||307newval == ZFS_REDUNDANT_METADATA_MOST ||308newval == ZFS_REDUNDANT_METADATA_SOME ||309newval == ZFS_REDUNDANT_METADATA_NONE);310311os->os_redundant_metadata = newval;312}313314static void315dnodesize_changed_cb(void *arg, uint64_t newval)316{317objset_t *os = arg;318319switch (newval) {320case ZFS_DNSIZE_LEGACY:321os->os_dnodesize = DNODE_MIN_SIZE;322break;323case ZFS_DNSIZE_AUTO:324/*325* Choose a dnode size that will work well for most326* workloads if the user specified "auto". Future code327* improvements could dynamically select a dnode size328* based on observed workload patterns.329*/330os->os_dnodesize = DNODE_MIN_SIZE * 2;331break;332case ZFS_DNSIZE_1K:333case ZFS_DNSIZE_2K:334case ZFS_DNSIZE_4K:335case ZFS_DNSIZE_8K:336case ZFS_DNSIZE_16K:337os->os_dnodesize = newval;338break;339}340}341342static void343smallblk_changed_cb(void *arg, uint64_t newval)344{345objset_t *os = arg;346347os->os_zpl_special_smallblock = newval;348}349350static void351direct_changed_cb(void *arg, uint64_t newval)352{353objset_t *os = arg;354355/*356* Inheritance and range checking should have been done by now.357*/358ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD ||359newval == ZFS_DIRECT_ALWAYS);360361os->os_direct = newval;362}363364static void365logbias_changed_cb(void *arg, uint64_t newval)366{367objset_t *os = arg;368369ASSERT(newval == ZFS_LOGBIAS_LATENCY ||370newval == ZFS_LOGBIAS_THROUGHPUT);371os->os_logbias = newval;372if (os->os_zil)373zil_set_logbias(os->os_zil, newval);374}375376static void377recordsize_changed_cb(void *arg, uint64_t newval)378{379objset_t *os = arg;380381os->os_recordsize = newval;382}383384void385dmu_objset_byteswap(void *buf, size_t size)386{387objset_phys_t *osp = buf;388389ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 ||390size == 
sizeof (objset_phys_t));391dnode_byteswap(&osp->os_meta_dnode);392byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));393osp->os_type = BSWAP_64(osp->os_type);394osp->os_flags = BSWAP_64(osp->os_flags);395if (size >= OBJSET_PHYS_SIZE_V2) {396dnode_byteswap(&osp->os_userused_dnode);397dnode_byteswap(&osp->os_groupused_dnode);398if (size >= sizeof (objset_phys_t))399dnode_byteswap(&osp->os_projectused_dnode);400}401}402403/*404* Runs cityhash on the objset_t pointer and the object number.405*/406static uint64_t407dnode_hash(const objset_t *os, uint64_t obj)408{409uintptr_t osv = (uintptr_t)os;410return (cityhash2((uint64_t)osv, obj));411}412413static unsigned int414dnode_multilist_index_func(multilist_t *ml, void *obj)415{416dnode_t *dn = obj;417418/*419* The low order bits of the hash value are thought to be420* distributed evenly. Otherwise, in the case that the multilist421* has a power of two number of sublists, each sublists' usage422* would not be evenly distributed. In this context full 64bit423* division would be a waste of time, so limit it to 32 bits.424*/425return ((unsigned int)dnode_hash(dn->dn_objset, dn->dn_object) %426multilist_get_num_sublists(ml));427}428429static inline boolean_t430dmu_os_is_l2cacheable(objset_t *os)431{432if (os->os_secondary_cache == ZFS_CACHE_ALL ||433os->os_secondary_cache == ZFS_CACHE_METADATA) {434if (l2arc_exclude_special == 0)435return (B_TRUE);436437blkptr_t *bp = os->os_rootbp;438if (bp == NULL || BP_IS_HOLE(bp))439return (B_FALSE);440uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);441vdev_t *rvd = os->os_spa->spa_root_vdev;442vdev_t *vd = NULL;443444if (vdev < rvd->vdev_children)445vd = rvd->vdev_child[vdev];446447if (vd == NULL)448return (B_TRUE);449450if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&451vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)452return (B_TRUE);453}454return (B_FALSE);455}456457/*458* Instantiates the objset_t in-memory structure corresponding to the459* objset_phys_t that's pointed to by the 
specified blkptr_t.460*/461int462dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,463objset_t **osp)464{465objset_t *os;466int i, err;467468ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));469ASSERT(!BP_IS_REDACTED(bp));470471/*472* We need the pool config lock to get properties.473*/474ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool));475476/*477* The $ORIGIN dataset (if it exists) doesn't have an associated478* objset, so there's no reason to open it. The $ORIGIN dataset479* will not exist on pools older than SPA_VERSION_ORIGIN.480*/481if (ds != NULL && spa_get_dsl(spa) != NULL &&482spa_get_dsl(spa)->dp_origin_snap != NULL) {483ASSERT3P(ds->ds_dir, !=,484spa_get_dsl(spa)->dp_origin_snap->ds_dir);485}486487os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);488os->os_dsl_dataset = ds;489os->os_spa = spa;490os->os_rootbp = bp;491if (!BP_IS_HOLE(os->os_rootbp)) {492arc_flags_t aflags = ARC_FLAG_WAIT;493zbookmark_phys_t zb;494int size;495zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;496SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,497ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);498499if (dmu_os_is_l2cacheable(os))500aflags |= ARC_FLAG_L2CACHE;501502if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {503ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);504ASSERT(BP_IS_AUTHENTICATED(bp));505zio_flags |= ZIO_FLAG_RAW;506}507508dprintf_bp(os->os_rootbp, "reading %s", "");509err = arc_read(NULL, spa, os->os_rootbp,510arc_getbuf_func, &os->os_phys_buf,511ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);512if (err != 0) {513kmem_free(os, sizeof (objset_t));514/* convert checksum errors into IO errors */515if (err == ECKSUM)516err = SET_ERROR(EIO);517return (err);518}519520if (spa_version(spa) < SPA_VERSION_USERSPACE)521size = OBJSET_PHYS_SIZE_V1;522else if (!spa_feature_is_enabled(spa,523SPA_FEATURE_PROJECT_QUOTA))524size = OBJSET_PHYS_SIZE_V2;525else526size = sizeof (objset_phys_t);527528/* Increase the blocksize if we are permitted. 
*/529if (arc_buf_size(os->os_phys_buf) < size) {530arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,531ARC_BUFC_METADATA, size);532memset(buf->b_data, 0, size);533memcpy(buf->b_data, os->os_phys_buf->b_data,534arc_buf_size(os->os_phys_buf));535arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);536os->os_phys_buf = buf;537}538539os->os_phys = os->os_phys_buf->b_data;540os->os_flags = os->os_phys->os_flags;541} else {542int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?543sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1;544os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,545ARC_BUFC_METADATA, size);546os->os_phys = os->os_phys_buf->b_data;547memset(os->os_phys, 0, size);548}549/*550* These properties will be filled in by the logic in zfs_get_zplprop()551* when they are queried for the first time.552*/553os->os_version = OBJSET_PROP_UNINITIALIZED;554os->os_normalization = OBJSET_PROP_UNINITIALIZED;555os->os_utf8only = OBJSET_PROP_UNINITIALIZED;556os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;557558/*559* Note: the changed_cb will be called once before the register560* func returns, thus changing the checksum/compression from the561* default (fletcher2/off). 
Snapshots don't need to know about562* checksum/compression/copies.563*/564if (ds != NULL) {565os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);566567err = dsl_prop_register(ds,568zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),569primary_cache_changed_cb, os);570if (err == 0) {571err = dsl_prop_register(ds,572zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),573secondary_cache_changed_cb, os);574}575if (err == 0) {576err = dsl_prop_register(ds,577zfs_prop_to_name(ZFS_PROP_PREFETCH),578prefetch_changed_cb, os);579}580if (!ds->ds_is_snapshot) {581if (err == 0) {582err = dsl_prop_register(ds,583zfs_prop_to_name(ZFS_PROP_CHECKSUM),584checksum_changed_cb, os);585}586if (err == 0) {587err = dsl_prop_register(ds,588zfs_prop_to_name(ZFS_PROP_COMPRESSION),589compression_changed_cb, os);590}591if (err == 0) {592err = dsl_prop_register(ds,593zfs_prop_to_name(ZFS_PROP_COPIES),594copies_changed_cb, os);595}596if (err == 0) {597err = dsl_prop_register(ds,598zfs_prop_to_name(ZFS_PROP_DEDUP),599dedup_changed_cb, os);600}601if (err == 0) {602err = dsl_prop_register(ds,603zfs_prop_to_name(ZFS_PROP_LOGBIAS),604logbias_changed_cb, os);605}606if (err == 0) {607err = dsl_prop_register(ds,608zfs_prop_to_name(ZFS_PROP_SYNC),609sync_changed_cb, os);610}611if (err == 0) {612err = dsl_prop_register(ds,613zfs_prop_to_name(614ZFS_PROP_REDUNDANT_METADATA),615redundant_metadata_changed_cb, os);616}617if (err == 0) {618err = dsl_prop_register(ds,619zfs_prop_to_name(ZFS_PROP_RECORDSIZE),620recordsize_changed_cb, os);621}622if (err == 0) {623err = dsl_prop_register(ds,624zfs_prop_to_name(ZFS_PROP_DNODESIZE),625dnodesize_changed_cb, os);626}627if (err == 0) {628err = dsl_prop_register(ds,629zfs_prop_to_name(630ZFS_PROP_SPECIAL_SMALL_BLOCKS),631smallblk_changed_cb, os);632}633if (err == 0) {634err = dsl_prop_register(ds,635zfs_prop_to_name(ZFS_PROP_DIRECT),636direct_changed_cb, os);637}638}639if (err != 0) {640arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);641kmem_free(os, sizeof (objset_t));642return 
(err);643}644} else {645/* It's the meta-objset. */646os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;647os->os_compress = ZIO_COMPRESS_ON;648os->os_complevel = ZIO_COMPLEVEL_DEFAULT;649os->os_encrypted = B_FALSE;650os->os_copies = spa_max_replication(spa);651os->os_dedup_checksum = ZIO_CHECKSUM_OFF;652os->os_dedup_verify = B_FALSE;653os->os_logbias = ZFS_LOGBIAS_LATENCY;654os->os_sync = ZFS_SYNC_STANDARD;655os->os_primary_cache = ZFS_CACHE_ALL;656os->os_secondary_cache = ZFS_CACHE_ALL;657os->os_dnodesize = DNODE_MIN_SIZE;658os->os_prefetch = ZFS_PREFETCH_ALL;659}660661if (ds == NULL || !ds->ds_is_snapshot)662os->os_zil_header = os->os_phys->os_zil_header;663os->os_zil = zil_alloc(os, &os->os_zil_header);664665for (i = 0; i < TXG_SIZE; i++) {666multilist_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),667offsetof(dnode_t, dn_dirty_link[i]),668dnode_multilist_index_func);669}670list_create(&os->os_dnodes, sizeof (dnode_t),671offsetof(dnode_t, dn_link));672list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),673offsetof(dmu_buf_impl_t, db_link));674675list_link_init(&os->os_evicting_node);676677mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);678mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);679mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);680mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);681os->os_obj_next_percpu_len = boot_ncpus;682os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *683sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);684685dnode_special_open(os, &os->os_phys->os_meta_dnode,686DMU_META_DNODE_OBJECT, &os->os_meta_dnode);687if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) {688dnode_special_open(os, &os->os_phys->os_userused_dnode,689DMU_USERUSED_OBJECT, &os->os_userused_dnode);690dnode_special_open(os, &os->os_phys->os_groupused_dnode,691DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);692if 
(OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf))693dnode_special_open(os,694&os->os_phys->os_projectused_dnode,695DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode);696}697698mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL);699700*osp = os;701return (0);702}703704int705dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)706{707int err = 0;708709/*710* We need the pool_config lock to manipulate the dsl_dataset_t.711* Even if the dataset is long-held, we need the pool_config lock712* to open the objset, as it needs to get properties.713*/714ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));715716mutex_enter(&ds->ds_opening_lock);717if (ds->ds_objset == NULL) {718objset_t *os;719rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);720err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),721ds, dsl_dataset_get_blkptr(ds), &os);722rrw_exit(&ds->ds_bp_rwlock, FTAG);723724if (err == 0) {725mutex_enter(&ds->ds_lock);726ASSERT0P(ds->ds_objset);727ds->ds_objset = os;728mutex_exit(&ds->ds_lock);729}730}731*osp = ds->ds_objset;732mutex_exit(&ds->ds_opening_lock);733return (err);734}735736/*737* Holds the pool while the objset is held. Therefore only one objset738* can be held at a time.739*/740int741dmu_objset_hold_flags(const char *name, boolean_t decrypt, const void *tag,742objset_t **osp)743{744dsl_pool_t *dp;745dsl_dataset_t *ds;746int err;747ds_hold_flags_t flags;748749flags = (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;750err = dsl_pool_hold(name, tag, &dp);751if (err != 0)752return (err);753err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds);754if (err != 0) {755dsl_pool_rele(dp, tag);756return (err);757}758759err = dmu_objset_from_ds(ds, osp);760if (err != 0) {761dsl_dataset_rele_flags(ds, flags, tag);762dsl_pool_rele(dp, tag);763}764765return (err);766}767768int769dmu_objset_hold(const char *name, const void *tag, objset_t **osp)770{771return (dmu_objset_hold_flags(name, B_FALSE, tag, osp));772}773774static int775dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,776boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)777{778(void) tag;779780int err = dmu_objset_from_ds(ds, osp);781if (err != 0) {782return (err);783} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {784return (SET_ERROR(EINVAL));785} else if (!readonly && dsl_dataset_is_snapshot(ds)) {786return (SET_ERROR(EROFS));787} else if (!readonly && decrypt &&788dsl_dir_incompatible_encryption_version(ds->ds_dir)) {789return (SET_ERROR(EROFS));790}791792/* if we are decrypting, we can now check MACs in os->os_phys_buf */793if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) {794zbookmark_phys_t zb;795796SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,797ZB_ROOT_LEVEL, ZB_ROOT_BLKID);798err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa,799&zb, B_FALSE);800if (err != 0)801return (err);802803ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf));804}805806return (0);807}808809/*810* dsl_pool must not be held when this is called.811* Upon successful return, there will be a longhold on the dataset,812* and the dsl_pool will not be held.813*/814int815dmu_objset_own(const char *name, dmu_objset_type_t type,816boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)817{818dsl_pool_t *dp;819dsl_dataset_t *ds;820int err;821ds_hold_flags_t flags;822823flags = (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;824err = dsl_pool_hold(name, FTAG, &dp);825if (err != 0)826return (err);827err = dsl_dataset_own(dp, name, flags, tag, &ds);828if (err != 0) {829dsl_pool_rele(dp, FTAG);830return (err);831}832err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);833if (err != 0) {834dsl_dataset_disown(ds, flags, tag);835dsl_pool_rele(dp, FTAG);836return (err);837}838839/*840* User accounting requires the dataset to be decrypted and rw.841* We also don't begin user accounting during claiming to help842* speed up pool import times and to keep this txg reserved843* completely for recovery work.844*/845if (!readonly && !dp->dp_spa->spa_claiming &&846(ds->ds_dir->dd_crypto_obj == 0 || decrypt)) {847if (dmu_objset_userobjspace_upgradable(*osp) ||848dmu_objset_projectquota_upgradable(*osp)) {849dmu_objset_id_quota_upgrade(*osp);850} else if (dmu_objset_userused_enabled(*osp)) {851dmu_objset_userspace_upgrade(*osp);852}853}854855dsl_pool_rele(dp, FTAG);856return (0);857}858859int860dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,861boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)862{863dsl_dataset_t *ds;864int err;865ds_hold_flags_t flags;866867flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;868err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);869if (err != 0)870return (err);871872err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);873if (err != 0) {874dsl_dataset_disown(ds, flags, tag);875return (err);876}877878return (0);879}880881void882dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, const void *tag)883{884ds_hold_flags_t flags;885dsl_pool_t *dp = dmu_objset_pool(os);886887flags = (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;888dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);889dsl_pool_rele(dp, tag);890}891892void893dmu_objset_rele(objset_t *os, const void *tag)894{895dmu_objset_rele_flags(os, B_FALSE, tag);896}897898/*899* When we are called, os MUST refer to an objset associated with a dataset900* that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner901* == tag. We will then release and reacquire ownership of the dataset while902* holding the pool config_rwlock to avoid intervening namespace or ownership903* changes may occur.904*905* This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to906* release the hold on its dataset and acquire a new one on the dataset of the907* same name so that it can be partially torn down and reconstructed.908*/909void910dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,911boolean_t decrypt, const void *tag)912{913dsl_pool_t *dp;914char name[ZFS_MAX_DATASET_NAME_LEN];915ds_hold_flags_t flags;916917flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;918VERIFY3P(ds, !=, NULL);919VERIFY3P(ds->ds_owner, ==, tag);920VERIFY(dsl_dataset_long_held(ds));921922dsl_dataset_name(ds, name);923dp = ds->ds_dir->dd_pool;924dsl_pool_config_enter(dp, FTAG);925dsl_dataset_disown(ds, flags, tag);926VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds));927dsl_pool_config_exit(dp, FTAG);928}929930void931dmu_objset_disown(objset_t *os, boolean_t decrypt, const void *tag)932{933ds_hold_flags_t flags;934935flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;936/*937* Stop upgrading thread938*/939dmu_objset_upgrade_stop(os);940dsl_dataset_disown(os->os_dsl_dataset, flags, tag);941}942943void944dmu_objset_evict_dbufs(objset_t *os)945{946dnode_t *dn_marker;947dnode_t *dn;948949dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);950951mutex_enter(&os->os_lock);952dn = list_head(&os->os_dnodes);953while (dn != NULL) {954/*955* Skip dnodes without holds. 
We have to do this dance956* because dnode_add_ref() only works if there is already a957* hold. If the dnode has no holds, then it has no dbufs.958*/959if (dnode_add_ref(dn, FTAG)) {960list_insert_after(&os->os_dnodes, dn, dn_marker);961mutex_exit(&os->os_lock);962963dnode_evict_dbufs(dn);964dnode_rele(dn, FTAG);965966mutex_enter(&os->os_lock);967dn = list_next(&os->os_dnodes, dn_marker);968list_remove(&os->os_dnodes, dn_marker);969} else {970dn = list_next(&os->os_dnodes, dn);971}972}973mutex_exit(&os->os_lock);974975kmem_free(dn_marker, sizeof (dnode_t));976977if (DMU_USERUSED_DNODE(os) != NULL) {978if (DMU_PROJECTUSED_DNODE(os) != NULL)979dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os));980dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));981dnode_evict_dbufs(DMU_USERUSED_DNODE(os));982}983dnode_evict_dbufs(DMU_META_DNODE(os));984}985986/*987* Objset eviction processing is split into into two pieces.988* The first marks the objset as evicting, evicts any dbufs that989* have a refcount of zero, and then queues up the objset for the990* second phase of eviction. 
Once os->os_dnodes has been cleared by991* dnode_buf_pageout()->dnode_destroy(), the second phase is executed.992* The second phase closes the special dnodes, dequeues the objset from993* the list of those undergoing eviction, and finally frees the objset.994*995* NOTE: Due to asynchronous eviction processing (invocation of996* dnode_buf_pageout()), it is possible for the meta dnode for the997* objset to have no holds even though os->os_dnodes is not empty.998*/999void1000dmu_objset_evict(objset_t *os)1001{1002dsl_dataset_t *ds = os->os_dsl_dataset;10031004for (int t = 0; t < TXG_SIZE; t++)1005ASSERT(!dmu_objset_is_dirty(os, t));10061007if (ds)1008dsl_prop_unregister_all(ds, os);10091010if (os->os_sa)1011sa_tear_down(os);10121013dmu_objset_evict_dbufs(os);10141015mutex_enter(&os->os_lock);1016spa_evicting_os_register(os->os_spa, os);1017if (list_is_empty(&os->os_dnodes)) {1018mutex_exit(&os->os_lock);1019dmu_objset_evict_done(os);1020} else {1021mutex_exit(&os->os_lock);1022}102310241025}10261027void1028dmu_objset_evict_done(objset_t *os)1029{1030ASSERT3P(list_head(&os->os_dnodes), ==, NULL);10311032dnode_special_close(&os->os_meta_dnode);1033if (DMU_USERUSED_DNODE(os)) {1034if (DMU_PROJECTUSED_DNODE(os))1035dnode_special_close(&os->os_projectused_dnode);1036dnode_special_close(&os->os_userused_dnode);1037dnode_special_close(&os->os_groupused_dnode);1038}1039zil_free(os->os_zil);10401041arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);10421043/*1044* This is a barrier to prevent the objset from going away in1045* dnode_move() until we can safely ensure that the objset is still in1046* use. 
 * We consider the objset valid before the barrier and invalid
 * after the barrier.
 */
    rw_enter(&os_lock, RW_READER);
    rw_exit(&os_lock);

    kmem_free(os->os_obj_next_percpu,
        os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));

    mutex_destroy(&os->os_lock);
    mutex_destroy(&os->os_userused_lock);
    mutex_destroy(&os->os_obj_lock);
    mutex_destroy(&os->os_user_ptr_lock);
    mutex_destroy(&os->os_upgrade_lock);
    for (int i = 0; i < TXG_SIZE; i++)
        multilist_destroy(&os->os_dirty_dnodes[i]);
    spa_evicting_os_deregister(os->os_spa, os);
    kmem_free(os, sizeof (objset_t));
}

/*
 * Return the snapshot creation/modification time tracked on this
 * objset's containing dsl_dir.
 */
inode_timespec_t
dmu_objset_snap_cmtime(objset_t *os)
{
    return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}

/*
 * Create (in syncing context) the objset for dataset ds, allocating its
 * meta-dnode with the given block size, indirect block shift and number
 * of levels (0 means "use the defaults").  ds == NULL means the MOS is
 * being created.  Returns the new objset.
 */
objset_t *
dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
{
    objset_t *os;
    dnode_t *mdn;

    ASSERT(dmu_tx_is_syncing(tx));

    if (blksz == 0)
        blksz = DNODE_BLOCK_SIZE;
    if (ibs == 0)
        ibs = DN_MAX_INDBLKSHIFT;

    if (ds != NULL)
        VERIFY0(dmu_objset_from_ds(ds, &os));
    else
        VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));

    mdn = DMU_META_DNODE(os);

    dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
        DNODE_MIN_SLOTS, tx);

    /*
     * We don't want to have to increase the meta-dnode's nlevels
     * later, because then we could do it in quiescing context while
     * we are also accessing it in open context.
     *
     * This precaution is not necessary for the MOS (ds == NULL),
     * because the MOS is only updated in syncing context.
     * This is most fortunate: the MOS is the only objset that
     * needs to be synced multiple times as spa_sync() iterates
     * to convergence, so minimizing its dn_nlevels matters.
     */
    if (ds != NULL) {
        if (levels == 0) {
            levels = 1;

            /*
             * Determine the number of levels necessary for the
             * meta-dnode to contain DN_MAX_OBJECT dnodes.  Note
             * that in order to ensure that we do not overflow
             * 64 bits, there has to be a nlevels that gives us a
             * number of blocks > DN_MAX_OBJECT but < 2^64.
             * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
             * (10) must be less than (64 - log2(DN_MAX_OBJECT))
             * (16).
             */
            while ((uint64_t)mdn->dn_nblkptr <<
                (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
                (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
                DN_MAX_OBJECT)
                levels++;
        }

        mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
            mdn->dn_nlevels = levels;
    }

    ASSERT(type != DMU_OST_NONE);
    ASSERT(type != DMU_OST_ANY);
    ASSERT(type < DMU_OST_NUMTYPES);
    os->os_phys->os_type = type;

    /*
     * Enable user accounting if it is enabled and this is not an
     * encrypted receive.
     */
    if (dmu_objset_userused_enabled(os) &&
        (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
        os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
        if (dmu_objset_userobjused_enabled(os)) {
            ASSERT3P(ds, !=, NULL);
            ds->ds_feature_activation[
                SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
            os->os_phys->os_flags |=
                OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
        }
        if (dmu_objset_projectquota_enabled(os)) {
            ASSERT3P(ds, !=, NULL);
            ds->ds_feature_activation[
                SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
            os->os_phys->os_flags |=
                OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
        }
        os->os_flags = os->os_phys->os_flags;
    }

    dsl_dataset_dirty(ds, tx);

    return (os);
}

/* called from dsl for meta-objset */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, dmu_tx_t *tx)
{
    return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx));
}

/* Arguments for the dmu_objset_create() check/sync sync-task pair. */
typedef struct dmu_objset_create_arg {
    const char *doca_name;
    cred_t *doca_cred;
    void (*doca_userfunc)(objset_t *os, void *arg,
        cred_t *cr, dmu_tx_t *tx);
    void *doca_userarg;
    dmu_objset_type_t doca_type;
    uint64_t doca_flags;
    dsl_crypto_params_t *doca_dcp;
} dmu_objset_create_arg_t;

/*
 * Sync-task check function for dmu_objset_create(): validate the name,
 * nesting depth, crypto params and filesystem limits before the sync
 * function is allowed to run.
 */
static int
dmu_objset_create_check(void *arg, dmu_tx_t *tx)
{
    dmu_objset_create_arg_t *doca = arg;
    dsl_pool_t *dp = dmu_tx_pool(tx);
    dsl_dir_t *pdd;
    dsl_dataset_t *parentds;
    objset_t *parentos;
    const char *tail;
    int error;

    if (strchr(doca->doca_name, '@') != NULL)
        return (SET_ERROR(EINVAL));

    if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
        return (SET_ERROR(ENAMETOOLONG));

    if (dataset_nestcheck(doca->doca_name) != 0)
        return (SET_ERROR(ENAMETOOLONG));

    error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
    if (error != 0)
        return (error);
    if (tail == NULL) {
        dsl_dir_rele(pdd, FTAG);
        return (SET_ERROR(EEXIST));
    }

    error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL);
    if (error != 0) {
        dsl_dir_rele(pdd, FTAG);
        return (error);
    }

    error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
        doca->doca_cred);
    if (error != 0) {
        dsl_dir_rele(pdd, FTAG);
        return (error);
    }

    /* can't create below anything but filesystems (eg.
no ZVOLs) */
    error = dsl_dataset_hold_obj(pdd->dd_pool,
        dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
    if (error != 0) {
        dsl_dir_rele(pdd, FTAG);
        return (error);
    }
    error = dmu_objset_from_ds(parentds, &parentos);
    if (error != 0) {
        dsl_dataset_rele(parentds, FTAG);
        dsl_dir_rele(pdd, FTAG);
        return (error);
    }
    if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
        dsl_dataset_rele(parentds, FTAG);
        dsl_dir_rele(pdd, FTAG);
        return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
    }
    dsl_dataset_rele(parentds, FTAG);
    dsl_dir_rele(pdd, FTAG);

    return (error);
}

/*
 * Sync-task function for dmu_objset_create(): create the dataset and its
 * objset, run the caller's setup function, and for encrypted datasets
 * force the setup I/O out before the key mapping is released.
 */
static void
dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
{
    dmu_objset_create_arg_t *doca = arg;
    dsl_pool_t *dp = dmu_tx_pool(tx);
    spa_t *spa = dp->dp_spa;
    dsl_dir_t *pdd;
    const char *tail;
    dsl_dataset_t *ds;
    uint64_t obj;
    blkptr_t *bp;
    objset_t *os;
    zio_t *rzio;

    VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));

    obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
        doca->doca_cred, doca->doca_dcp, tx);

    VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
        DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
    rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
    bp = dsl_dataset_get_blkptr(ds);
    os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx);
    rrw_exit(&ds->ds_bp_rwlock, FTAG);

    if (doca->doca_userfunc != NULL) {
        doca->doca_userfunc(os, doca->doca_userarg,
            doca->doca_cred, tx);
    }

    /*
     * The doca_userfunc() may write out some data that needs to be
     * encrypted if the dataset is encrypted (specifically the root
     * directory). This data must be written out before the encryption
     * key mapping is removed by dsl_dataset_rele_flags(). Force the
     * I/O to occur immediately by invoking the relevant sections of
     * dsl_pool_sync().
     */
    if (os->os_encrypted) {
        dsl_dataset_t *tmpds = NULL;
        boolean_t need_sync_done = B_FALSE;

        mutex_enter(&ds->ds_lock);
        ds->ds_owner = FTAG;
        mutex_exit(&ds->ds_lock);

        rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
        tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
            tx->tx_txg);
        if (tmpds != NULL) {
            dsl_dataset_sync(ds, rzio, tx);
            need_sync_done = B_TRUE;
        }
        VERIFY0(zio_wait(rzio));

        dmu_objset_sync_done(os, tx);
        taskq_wait(dp->dp_sync_taskq);
        if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
            ASSERT3P(ds->ds_key_mapping, !=, NULL);
            key_mapping_rele(spa, ds->ds_key_mapping, ds);
        }

        rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
        tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
            tx->tx_txg);
        if (tmpds != NULL) {
            dmu_buf_rele(ds->ds_dbuf, ds);
            dsl_dataset_sync(ds, rzio, tx);
        }
        VERIFY0(zio_wait(rzio));

        if (need_sync_done) {
            ASSERT3P(ds->ds_key_mapping, !=, NULL);
            key_mapping_rele(spa, ds->ds_key_mapping, ds);
            dsl_dataset_sync_done(ds, tx);
            dmu_buf_rele(ds->ds_dbuf, ds);
        }

        mutex_enter(&ds->ds_lock);
        ds->ds_owner = NULL;
        mutex_exit(&ds->ds_lock);
    }

    spa_history_log_internal_ds(ds, "create", tx, " ");

    dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
    dsl_dir_rele(pdd, FTAG);
}

/*
 * Create a new objset of the given type at "name", running func(arg) on
 * the new objset inside the sync task.  Creates zvol minors on success.
 */
int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
{
    dmu_objset_create_arg_t doca;
    dsl_crypto_params_t tmp_dcp = { 0 };

    cred_t *cr = CRED();
    crhold(cr);

    doca.doca_name = name;
    doca.doca_cred = cr;
    doca.doca_flags = flags;
    doca.doca_userfunc = func;
    doca.doca_userarg = arg;
    doca.doca_type = type;

    /*
     * Some callers (mostly for testing) do not
provide a dcp on their
     * own but various code inside the sync task will require it to be
     * allocated. Rather than adding NULL checks throughout this code
     * or adding dummy dcp's to all of the callers we simply create a
     * dummy one here and use that. This zero dcp will have the same
     * effect as asking for inheritance of all encryption params.
     */
    doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;

    int rv = dsl_sync_task(name,
        dmu_objset_create_check, dmu_objset_create_sync, &doca,
        6, ZFS_SPACE_CHECK_NORMAL);

    if (rv == 0)
        zvol_create_minors(name);

    crfree(cr);

    return (rv);
}

/*
 * Snapshot a single filesystem by building a one-entry nvlist
 * ("fsname@snapname") and calling dsl_dataset_snapshot().
 */
int
dmu_objset_snapshot_one(const char *fsname, const char *snapname)
{
    int err;
    char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
    nvlist_t *snaps = fnvlist_alloc();

    fnvlist_add_boolean(snaps, longsnap);
    kmem_strfree(longsnap);
    err = dsl_dataset_snapshot(snaps, NULL, NULL);
    fnvlist_free(snaps);
    return (err);
}

/*
 * Taskq callback that runs a queued objset upgrade.  os_upgrade_status
 * is preset to EINTR so a cancellation that raced in before the callback
 * got to run reports interruption; otherwise the callback's result is
 * recorded.  Drops the long hold taken by dmu_objset_upgrade().
 */
static void
dmu_objset_upgrade_task_cb(void *data)
{
    objset_t *os = data;

    mutex_enter(&os->os_upgrade_lock);
    os->os_upgrade_status = EINTR;
    if (!os->os_upgrade_exit) {
        int status;

        mutex_exit(&os->os_upgrade_lock);

        status = os->os_upgrade_cb(os);

        mutex_enter(&os->os_upgrade_lock);

        os->os_upgrade_status = status;
    }
    os->os_upgrade_exit = B_TRUE;
    os->os_upgrade_id = 0;
    mutex_exit(&os->os_upgrade_lock);
    dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
}

/*
 * Queue an upgrade callback on the spa's upgrade taskq, taking a long
 * hold on the dataset for the task's lifetime.  No-op if an upgrade is
 * already queued or a previous one recorded a nonzero status.
 */
static void
dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
{
    if (os->os_upgrade_id != 0)
        return;

    ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
    dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);

    mutex_enter(&os->os_upgrade_lock);
    if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
        os->os_upgrade_exit = B_FALSE;
        os->os_upgrade_cb = cb;
        os->os_upgrade_id = taskq_dispatch(
            os->os_spa->spa_upgrade_taskq,
            dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
        if (os->os_upgrade_id == TASKQID_INVALID) {
            dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
            os->os_upgrade_status = ENOMEM;
        }
    } else {
        dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
    }
    mutex_exit(&os->os_upgrade_lock);
}

/*
 * Stop any in-flight upgrade.  If the task could be cancelled before it
 * ran, drop the long hold here; in either case wait for a txg sync so
 * the task's effects have settled.
 */
static void
dmu_objset_upgrade_stop(objset_t *os)
{
    mutex_enter(&os->os_upgrade_lock);
    os->os_upgrade_exit = B_TRUE;
    if (os->os_upgrade_id != 0) {
        taskqid_t id = os->os_upgrade_id;

        os->os_upgrade_id = 0;
        mutex_exit(&os->os_upgrade_lock);

        if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
            dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
        }
        txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
    } else {
        mutex_exit(&os->os_upgrade_lock);
    }
}

/*
 * Sync every dnode on one dirty-dnode sublist, moving each onto
 * os_synced_dnodes with an extra hold (see dnode_rele_task()).
 */
static void
dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
{
    dnode_t *dn;

    while ((dn = multilist_sublist_head(list)) != NULL) {
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
        ASSERT(dn->dn_dbuf->db_data_pending);
        /*
         * Initialize dn_zio outside dnode_sync() because the
         * meta-dnode needs to set it outside dnode_sync().
         */
        dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
        ASSERT(dn->dn_zio);

        ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
        multilist_sublist_remove(list, dn);

        /*
         * See the comment above dnode_rele_task() for an explanation
         * of why this dnode hold is always needed (even when not
         * doing user accounting).
         */
        multilist_t *newlist = &dn->dn_objset->os_synced_dnodes;
        (void) dnode_add_ref(dn, newlist);
        multilist_insert(newlist, dn);

        dnode_sync(dn, tx);
    }
}

/*
 * arc_write() "ready" callback for the objset root block: recompute the
 * root bp's fill count from the meta-dnode's block pointers and copy the
 * bp into os_rootbp.
 */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
    (void) abuf;
    blkptr_t *bp = zio->io_bp;
    objset_t *os = arg;
    dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
    uint64_t fill =
0;

    ASSERT(!BP_IS_EMBEDDED(bp));
    ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
    ASSERT0(BP_GET_LEVEL(bp));

    /*
     * Update rootbp fill count: it should be the number of objects
     * allocated in the object set (not counting the "special"
     * objects that are stored in the objset_phys_t -- the meta
     * dnode and user/group/project accounting objects).
     */
    for (int i = 0; i < dnp->dn_nblkptr; i++)
        fill += BP_GET_FILL(&dnp->dn_blkptr[i]);

    BP_SET_FILL(bp, fill);

    if (os->os_dsl_dataset != NULL)
        rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
    *os->os_rootbp = *bp;
    if (os->os_dsl_dataset != NULL)
        rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
}

/*
 * arc_write() "done" callback for the objset root block: account the
 * block death/birth against the dataset (unless this was a rewrite of
 * the same bp) and free the bp copy allocated in dmu_objset_sync().
 */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
    (void) abuf;
    blkptr_t *bp = zio->io_bp;
    blkptr_t *bp_orig = &zio->io_bp_orig;
    objset_t *os = arg;

    if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
        ASSERT(BP_EQUAL(bp, bp_orig));
    } else {
        dsl_dataset_t *ds = os->os_dsl_dataset;
        dmu_tx_t *tx = os->os_synctx;

        (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
        dsl_dataset_block_born(ds, bp, tx);
    }
    kmem_free(bp, sizeof (*bp));
}

/* Per-objset state shared by the sublist sync tasks of one txg. */
typedef struct sync_objset_arg {
    zio_t *soa_zio;         /* root block arc_write zio */
    objset_t *soa_os;
    dmu_tx_t *soa_tx;
    kmutex_t soa_mutex;     /* protects soa_count */
    int soa_count;          /* outstanding sublist sync tasks */
    taskq_ent_t soa_tq_ent;
} sync_objset_arg_t;

/* Argument for one sync_dnodes_task(): which sublist to sync. */
typedef struct sync_dnodes_arg {
    multilist_t *sda_list;
    int sda_sublist_idx;
    multilist_t *sda_newlist;
    sync_objset_arg_t *sda_soa;
} sync_dnodes_arg_t;

static void sync_meta_dnode_task(void *arg);

/*
 * Sync one dirty-dnode sublist.  The last task to finish (soa_count
 * hitting zero) dispatches sync_meta_dnode_task for the objset.
 */
static void
sync_dnodes_task(void *arg)
{
    sync_dnodes_arg_t *sda = arg;
    sync_objset_arg_t *soa = sda->sda_soa;
    objset_t *os = soa->soa_os;

    uint_t allocator = spa_acq_allocator(os->os_spa);
    multilist_sublist_t *ms =
        multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);

    dmu_objset_sync_dnodes(ms, soa->soa_tx);

    multilist_sublist_unlock(ms);
    spa_rel_allocator(os->os_spa, allocator);

    kmem_free(sda, sizeof (*sda));

    mutex_enter(&soa->soa_mutex);
    ASSERT(soa->soa_count != 0);
    if (--soa->soa_count != 0) {
        mutex_exit(&soa->soa_mutex);
        return;
    }
    mutex_exit(&soa->soa_mutex);

    taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
        sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
}

/*
 * Issue the zio_nowait() for all dirty record zios on the meta dnode,
 * then trigger the callback for the zil_sync. This runs once for each
 * objset, only after any/all sublists in the objset have been synced.
 */
static void
sync_meta_dnode_task(void *arg)
{
    sync_objset_arg_t *soa = arg;
    objset_t *os = soa->soa_os;
    dmu_tx_t *tx = soa->soa_tx;
    int txgoff = tx->tx_txg & TXG_MASK;
    dbuf_dirty_record_t *dr;

    ASSERT0(soa->soa_count);

    list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
    while ((dr = list_remove_head(list)) != NULL) {
        ASSERT0(dr->dr_dbuf->db_level);
        zio_nowait(dr->dr_zio);
    }

    /* Enable dnode backfill if enough objects have been freed.
 */
    if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
        os->os_rescan_dnodes = B_TRUE;
        os->os_freed_dnodes = 0;
    }

    /*
     * Free intent log blocks up to this tx.
     */
    zil_sync(os->os_zil, tx);
    os->os_phys->os_zil_header = os->os_zil_header;
    zio_nowait(soa->soa_zio);

    mutex_destroy(&soa->soa_mutex);
    kmem_free(soa, sizeof (*soa));
}

/* called from dsl */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
    int txgoff;
    zbookmark_phys_t zb;
    zio_prop_t zp;
    zio_t *zio;
    int num_sublists;
    multilist_t *ml;
    blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
    *blkptr_copy = *os->os_rootbp;

    dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", (u_longlong_t)tx->tx_txg);

    ASSERT(dmu_tx_is_syncing(tx));
    /* XXX the write_done callback should really give us the tx... */
    os->os_synctx = tx;

    if (os->os_dsl_dataset == NULL) {
        /*
         * This is the MOS.  If we have upgraded,
         * spa_max_replication() could change, so reset
         * os_copies here.
         */
        os->os_copies = spa_max_replication(os->os_spa);
    }

    /*
     * Create the root block IO
     */
    SET_BOOKMARK(&zb, os->os_dsl_dataset ?
        os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
        ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    arc_release(os->os_phys_buf, &os->os_phys_buf);

    dmu_write_policy(os, NULL, 0, 0, &zp);

    /*
     * If we are either claiming the ZIL or doing a raw receive, write
     * out the os_phys_buf raw. Neither of these actions will effect the
     * MAC at this point.
     */
    if (os->os_raw_receive ||
        os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
        ASSERT(os->os_encrypted);
        arc_convert_to_raw(os->os_phys_buf,
            os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
            DMU_OT_OBJSET, NULL, NULL, NULL);
    }

    zio = arc_write(pio, os->os_spa, tx->tx_txg,
        blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
        &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
        os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

    /*
     * Sync special dnodes - the parent IO for the sync is the root block
     */
    DMU_META_DNODE(os)->dn_zio = zio;
    dnode_sync(DMU_META_DNODE(os), tx);

    os->os_phys->os_flags = os->os_flags;

    if (DMU_USERUSED_DNODE(os) &&
        DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
        DMU_USERUSED_DNODE(os)->dn_zio = zio;
        dnode_sync(DMU_USERUSED_DNODE(os), tx);
        DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
        dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
    }

    if (DMU_PROJECTUSED_DNODE(os) &&
        DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
        DMU_PROJECTUSED_DNODE(os)->dn_zio = zio;
        dnode_sync(DMU_PROJECTUSED_DNODE(os), tx);
    }

    txgoff = tx->tx_txg & TXG_MASK;

    /*
     * We must create the list here because it uses the
     * dn_dirty_link[] of this txg.  But it may already
     * exist because we call dsl_dataset_sync() twice per txg.
     */
    if (os->os_synced_dnodes.ml_sublists == NULL) {
        multilist_create(&os->os_synced_dnodes, sizeof (dnode_t),
            offsetof(dnode_t, dn_dirty_link[txgoff]),
            dnode_multilist_index_func);
    } else {
        ASSERT3U(os->os_synced_dnodes.ml_offset, ==,
            offsetof(dnode_t, dn_dirty_link[txgoff]));
    }

    /*
     * zio_nowait(zio) is done after any/all sublist and meta dnode
     * zios have been nowaited, and the zil_sync() has been performed.
     * The soa is freed at the end of sync_meta_dnode_task.
     */
    sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
    soa->soa_zio = zio;
    soa->soa_os = os;
    soa->soa_tx = tx;
    taskq_init_ent(&soa->soa_tq_ent);
    mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL);

    ml = &os->os_dirty_dnodes[txgoff];
    soa->soa_count = num_sublists = multilist_get_num_sublists(ml);

    for (int i = 0; i < num_sublists; i++) {
        if (multilist_sublist_is_empty_idx(ml, i))
            soa->soa_count--;
    }

    if (soa->soa_count == 0) {
        taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
            sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
    } else {
        /*
         * Sync sublists in parallel.  The last to finish
         * (i.e., when soa->soa_count reaches zero) must
         * dispatch sync_meta_dnode_task.
         */
        for (int i = 0; i < num_sublists; i++) {
            if (multilist_sublist_is_empty_idx(ml, i))
                continue;
            sync_dnodes_arg_t *sda =
                kmem_alloc(sizeof (*sda), KM_SLEEP);
            sda->sda_list = ml;
            sda->sda_sublist_idx = i;
            sda->sda_soa = soa;
            (void) taskq_dispatch(
                dmu_objset_pool(os)->dp_sync_taskq,
                sync_dnodes_task, sda, 0);
            /* sync_dnodes_task frees sda */
        }
    }
}

/* B_TRUE if the objset has dirty dnodes in the given txg. */
boolean_t
dmu_objset_is_dirty(objset_t *os, uint64_t txg)
{
    return (!multilist_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]));
}

/* Per-objset-type callbacks for extracting file info from bonus data. */
static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES];

void
dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb)
{
    file_cbs[ost] = cb;
}

/*
 * Extract file info (via the callback registered for this objset's
 * type) from a bonus buffer; EINVAL if no callback is registered.
 */
int
dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data,
    zfs_file_info_t *zfi)
{
    file_info_cb_t *cb = file_cbs[os->os_phys->os_type];
    if (cb == NULL)
        return (EINVAL);
    return (cb(bonustype, data, zfi));
}

boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
    return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
        file_cbs[os->os_phys->os_type] != NULL &&
        DMU_USERUSED_DNODE(os) != NULL);
}

boolean_t
dmu_objset_userobjused_enabled(objset_t *os)
{
    return (dmu_objset_userused_enabled(os) &&
        spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
}

boolean_t
dmu_objset_projectquota_enabled(objset_t *os)
{
    return (file_cbs[os->os_phys->os_type] != NULL &&
        DMU_PROJECTUSED_DNODE(os) != NULL &&
        spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA));
}

/* One pending quota delta, keyed by the textual id in uqn_id. */
typedef struct userquota_node {
    /* must be the first field, see userquota_update_cache() */
    char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
    int64_t uqn_delta;
    avl_node_t uqn_node;
} userquota_node_t;

typedef struct userquota_cache
{
    avl_tree_t uqc_user_deltas;
    avl_tree_t uqc_group_deltas;
    avl_tree_t uqc_project_deltas;
} userquota_cache_t;

/* AVL comparator keyed on uqn_id. */
static int
userquota_compare(const void *l, const void *r)
{
    const userquota_node_t *luqn = l;
    const userquota_node_t *ruqn = r;
    int rv;

    /*
     * NB: can only access uqn_id because userquota_update_cache() doesn't
     * pass in an entire userquota_node_t.
     */
    rv = strcmp(luqn->uqn_id, ruqn->uqn_id);

    return (TREE_ISIGN(rv));
}

/*
 * Apply all accumulated deltas to the user/group/project used ZAP
 * objects and tear down the cache's AVL trees.
 */
static void
do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
{
    void *cookie;
    userquota_node_t *uqn;

    ASSERT(dmu_tx_is_syncing(tx));

    cookie = NULL;
    while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
        &cookie)) != NULL) {
        /*
         * os_userused_lock protects against concurrent calls to
         * zap_increment_int(). It's needed because zap_increment_int()
         * is not thread-safe (i.e. not atomic).
         */
        mutex_enter(&os->os_userused_lock);
        VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
            uqn->uqn_id, uqn->uqn_delta, tx));
        mutex_exit(&os->os_userused_lock);
        kmem_free(uqn, sizeof (*uqn));
    }
    avl_destroy(&cache->uqc_user_deltas);

    cookie = NULL;
    while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
        &cookie)) != NULL) {
        mutex_enter(&os->os_userused_lock);
        VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
            uqn->uqn_id, uqn->uqn_delta, tx));
        mutex_exit(&os->os_userused_lock);
        kmem_free(uqn, sizeof (*uqn));
    }
    avl_destroy(&cache->uqc_group_deltas);

    if (dmu_objset_projectquota_enabled(os)) {
        cookie = NULL;
        while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas,
            &cookie)) != NULL) {
            mutex_enter(&os->os_userused_lock);
            VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT,
                uqn->uqn_id, uqn->uqn_delta, tx));
            mutex_exit(&os->os_userused_lock);
            kmem_free(uqn, sizeof (*uqn));
        }
        avl_destroy(&cache->uqc_project_deltas);
    }
}

/*
 * Add delta to the cached entry for id in the given tree, creating the
 * entry on first use.
 */
static void
userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
{
    userquota_node_t *uqn;
    avl_index_t idx;

    ASSERT(strlen(id) < sizeof (uqn->uqn_id));
    /*
     * Use id directly for searching because uqn_id is the first field of
     * userquota_node_t and fields after uqn_id won't be accessed in
     * avl_find().
     */
    uqn = avl_find(avl, (const void *)id, &idx);
    if (uqn == NULL) {
        uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
        strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id));
        avl_insert(avl, uqn, idx);
    }
    uqn->uqn_delta += delta;
}

/*
 * Accumulate a space-used delta (DNODE_MIN_SIZE + used bytes) against
 * the user/group/project ids, if the dnode is space-accounted.
 */
static void
do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used,
    uint64_t flags, uint64_t user, uint64_t group, uint64_t project,
    boolean_t subtract)
{
    if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) {
        int64_t delta = DNODE_MIN_SIZE + used;
        char name[20];

        if (subtract)
            delta = -delta;

        (void) snprintf(name, sizeof (name), "%llx", (longlong_t)user);
        userquota_update_cache(&cache->uqc_user_deltas, name, delta);

        (void) snprintf(name, sizeof (name), "%llx", (longlong_t)group);
        userquota_update_cache(&cache->uqc_group_deltas, name, delta);

        if (dmu_objset_projectquota_enabled(os)) {
            (void) snprintf(name, sizeof (name), "%llx",
                (longlong_t)project);
            userquota_update_cache(&cache->uqc_project_deltas,
                name, delta);
        }
    }
}

/*
 * Accumulate an object-count delta (+/-1, keyed with DMU_OBJACCT_PREFIX)
 * against the user/group/project ids, if the dnode is object-accounted.
 */
static void
do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags,
    uint64_t user, uint64_t group, uint64_t project, boolean_t subtract)
{
    if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
        char name[20 + DMU_OBJACCT_PREFIX_LEN];
        int delta = subtract ? -1 : 1;

        (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
            (longlong_t)user);
        userquota_update_cache(&cache->uqc_user_deltas, name, delta);

        (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
            (longlong_t)group);
        userquota_update_cache(&cache->uqc_group_deltas, name, delta);

        if (dmu_objset_projectquota_enabled(os)) {
            (void) snprintf(name, sizeof (name),
                DMU_OBJACCT_PREFIX "%llx", (longlong_t)project);
            userquota_update_cache(&cache->uqc_project_deltas,
                name, delta);
        }
    }
}

/* Argument for userquota_updates_task() / dnode_rele_task(). */
typedef struct userquota_updates_arg {
    objset_t *uua_os;
    int uua_sublist_idx;
    dmu_tx_t *uua_tx;
} userquota_updates_arg_t;

/*
 * Fold the old/new id accounting of every dnode on one synced-dnodes
 * sublist into a delta cache, flush the cache to the accounting ZAPs,
 * and release the holds taken in dmu_objset_sync_dnodes().
 */
static void
userquota_updates_task(void *arg)
{
    userquota_updates_arg_t *uua = arg;
    objset_t *os = uua->uua_os;
    dmu_tx_t *tx = uua->uua_tx;
    dnode_t *dn;
    userquota_cache_t cache = { { 0 } };

    multilist_sublist_t *list = multilist_sublist_lock_idx(
        &os->os_synced_dnodes, uua->uua_sublist_idx);

    ASSERT(multilist_sublist_head(list) == NULL ||
        dmu_objset_userused_enabled(os));
    avl_create(&cache.uqc_user_deltas, userquota_compare,
        sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
    avl_create(&cache.uqc_group_deltas, userquota_compare,
        sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
    if (dmu_objset_projectquota_enabled(os))
        avl_create(&cache.uqc_project_deltas, userquota_compare,
            sizeof (userquota_node_t), offsetof(userquota_node_t,
            uqn_node));

    while ((dn = multilist_sublist_head(list)) != NULL) {
        int flags;
        ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
        ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
            dn->dn_phys->dn_flags &
            DNODE_FLAG_USERUSED_ACCOUNTED);

        flags = dn->dn_id_flags;
        ASSERT(flags);
        if (flags & DN_ID_OLD_EXIST) {
            do_userquota_update(os, &cache, dn->dn_oldused,
                dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
                dn->dn_oldprojid, B_TRUE);
            do_userobjquota_update(os, &cache, dn->dn_oldflags,
                dn->dn_olduid, dn->dn_oldgid,
                dn->dn_oldprojid, B_TRUE);
        }
        if (flags & DN_ID_NEW_EXIST) {
            do_userquota_update(os, &cache,
                DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
                dn->dn_newuid, dn->dn_newgid,
                dn->dn_newprojid, B_FALSE);
            do_userobjquota_update(os, &cache,
                dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid,
                dn->dn_newprojid, B_FALSE);
        }

        mutex_enter(&dn->dn_mtx);
        dn->dn_oldused = 0;
        dn->dn_oldflags = 0;
        if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
            dn->dn_olduid = dn->dn_newuid;
            dn->dn_oldgid = dn->dn_newgid;
            dn->dn_oldprojid = dn->dn_newprojid;
            dn->dn_id_flags |= DN_ID_OLD_EXIST;
            if (dn->dn_bonuslen == 0)
                dn->dn_id_flags |= DN_ID_CHKED_SPILL;
            else
                dn->dn_id_flags |= DN_ID_CHKED_BONUS;
        }
        dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
        ASSERT3U(dn->dn_dirtycnt, >, 0);
        dn->dn_dirtycnt--;
        mutex_exit(&dn->dn_mtx);

        multilist_sublist_remove(list, dn);
        dnode_rele(dn, &os->os_synced_dnodes);
    }
    do_userquota_cacheflush(os, &cache, tx);
    multilist_sublist_unlock(list);
    kmem_free(uua, sizeof (*uua));
}

/*
 * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being
 * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be
 * evicted because the block containing the dnode can't be evicted until it is
 * written out. However, this hold is necessary to prevent the dnode_t from
 * being moved (via dnode_move()) while it's still referenced by
 * dbuf_dirty_record_t:dr_dnode.
And dr_dnode is needed for
 * dirty_lightweight_leaf-type dirty records.
 *
 * If we are doing user-object accounting, the dnode_rele() happens from
 * userquota_updates_task() instead.
 */
static void
dnode_rele_task(void *arg)
{
    userquota_updates_arg_t *uua = arg;
    objset_t *os = uua->uua_os;

    multilist_sublist_t *list = multilist_sublist_lock_idx(
        &os->os_synced_dnodes, uua->uua_sublist_idx);

    dnode_t *dn;
    while ((dn = multilist_sublist_head(list)) != NULL) {
        mutex_enter(&dn->dn_mtx);
        ASSERT3U(dn->dn_dirtycnt, >, 0);
        dn->dn_dirtycnt--;
        mutex_exit(&dn->dn_mtx);
        multilist_sublist_remove(list, dn);
        dnode_rele(dn, &os->os_synced_dnodes);
    }
    multilist_sublist_unlock(list);
    kmem_free(uua, sizeof (*uua));
}

/*
 * Return TRUE if userquota updates are needed.
 * As a side effect, allocates the accounting ZAP objects if needed.
 */
static boolean_t
dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
{
    if (!dmu_objset_userused_enabled(os))
        return (B_FALSE);

    /*
     * If this is a raw receive just return and handle accounting
     * later when we have the keys loaded. We also don't do user
     * accounting during claiming since the datasets are not owned
     * for the duration of claiming and this txg should only be
     * used for recovery.
     */
    if (os->os_encrypted && dmu_objset_is_receiving(os))
        return (B_FALSE);

    if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
        return (B_FALSE);

    /* Allocate the user/group/project used objects if necessary. */
    if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
        VERIFY0(zap_create_claim(os,
            DMU_USERUSED_OBJECT,
            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
        VERIFY0(zap_create_claim(os,
            DMU_GROUPUSED_OBJECT,
            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
    }

    if (dmu_objset_projectquota_enabled(os) &&
        DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
        VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
    }
    return (B_TRUE);
}

/*
 * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and
 * also release the holds on the dnodes from dmu_objset_sync_dnodes().
 * The caller must taskq_wait(dp_sync_taskq).
 */
void
dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
{
    boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx);

    int num_sublists = multilist_get_num_sublists(&os->os_synced_dnodes);
    for (int i = 0; i < num_sublists; i++) {
        userquota_updates_arg_t *uua =
            kmem_alloc(sizeof (*uua), KM_SLEEP);
        uua->uua_os = os;
        uua->uua_sublist_idx = i;
        uua->uua_tx = tx;

        /*
         * If we don't need to update userquotas, use
         * dnode_rele_task() to call dnode_rele()
         */
        (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
            need_userquota ? userquota_updates_task : dnode_rele_task,
            uua, 0);
        /* callback frees uua */
    }
}

/*
 * Returns a pointer to data to find uid/gid from
 *
 * If a dirty record for transaction group that is syncing can't
 * be found then NULL is returned. In the NULL case it is assumed
 * the uid/gid aren't changing.
 */
static void *
dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
    dbuf_dirty_record_t *dr;
    void *data;

    if (db->db_dirtycnt == 0) {
        ASSERT(MUTEX_HELD(&db->db_mtx));
        return (db->db.db_data); /* Nothing is changing */
    }

    dr = dbuf_find_dirty_eq(db, tx->tx_txg);

    if (dr == NULL) {
        data = NULL;
    } else {
        if (dr->dr_dnode->dn_bonuslen == 0 &&
            dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
            data = dr->dt.dl.dr_data->b_data;
        else
            data = dr->dt.dl.dr_data;
    }

    return (data);
}

/*
 * Record the uid/gid/project ids of a dnode, either the "before" (old)
 * or "after" (new) values, for later accounting by
 * userquota_updates_task().  The ids come from the bonus or spill
 * buffer via the registered per-type file-info callback.
 */
void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
    objset_t *os = dn->dn_objset;
    void *data = NULL;
    dmu_buf_impl_t *db = NULL;
    int flags = dn->dn_id_flags;
    int error;
    boolean_t have_spill = B_FALSE;

    if (!dmu_objset_userused_enabled(dn->dn_objset))
        return;

    /*
     * Raw receives introduce a problem with user accounting. Raw
     * receives cannot update the user accounting info because the
     * user ids and the sizes are encrypted. To guarantee that we
     * never end up with bad user accounting, we simply disable it
     * during raw receives.
We also disable this for normal receives
     * so that an incremental raw receive may be done on top of an
     * existing non-raw receive.
     */
    if (os->os_encrypted && dmu_objset_is_receiving(os))
        return;

    if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
        DN_ID_CHKED_SPILL)))
        return;

    if (before && dn->dn_bonuslen != 0)
        data = DN_BONUS(dn->dn_phys);
    else if (!before && dn->dn_bonuslen != 0) {
        if (dn->dn_bonus) {
            db = dn->dn_bonus;
            mutex_enter(&db->db_mtx);
            data = dmu_objset_userquota_find_data(db, tx);
        } else {
            data = DN_BONUS(dn->dn_phys);
        }
    } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
        dmu_flags_t rf = DB_RF_MUST_SUCCEED;

        if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
            rf |= DB_RF_HAVESTRUCT;
        error = dmu_spill_hold_by_dnode(dn, rf,
            FTAG, (dmu_buf_t **)&db);
        ASSERT0(error);
        mutex_enter(&db->db_mtx);
        data = (before) ? db->db.db_data :
            dmu_objset_userquota_find_data(db, tx);
        have_spill = B_TRUE;
    } else {
        mutex_enter(&dn->dn_mtx);
        dn->dn_id_flags |= DN_ID_CHKED_BONUS;
        mutex_exit(&dn->dn_mtx);
        return;
    }

    /*
     * Must always call the callback in case the object
     * type has changed and that type isn't an object type to track
     */
    zfs_file_info_t zfi;
    error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);

    if (before) {
        ASSERT(data);
        dn->dn_olduid = zfi.zfi_user;
        dn->dn_oldgid = zfi.zfi_group;
        dn->dn_oldprojid = zfi.zfi_project;
    } else if (data) {
        dn->dn_newuid = zfi.zfi_user;
        dn->dn_newgid = zfi.zfi_group;
        dn->dn_newprojid = zfi.zfi_project;
    }

    /*
     * Preserve existing uid/gid when the callback can't determine
     * what the new uid/gid are and the callback returned EEXIST.
     * The EEXIST error tells us to just use the existing uid/gid.
     * If we don't know what the old values are then just assign
     * them to 0, since that is a new file being created.
     */
    if (!before && data == NULL && error == EEXIST) {
        if (flags & DN_ID_OLD_EXIST) {
            dn->dn_newuid = dn->dn_olduid;
            dn->dn_newgid = dn->dn_oldgid;
            dn->dn_newprojid = dn->dn_oldprojid;
        } else {
            dn->dn_newuid = 0;
            dn->dn_newgid = 0;
            dn->dn_newprojid = ZFS_DEFAULT_PROJID;
        }
        error = 0;
    }

    if (db)
        mutex_exit(&db->db_mtx);

    mutex_enter(&dn->dn_mtx);
    if (error == 0 && before)
        dn->dn_id_flags |= DN_ID_OLD_EXIST;
    if (error == 0 && !before)
        dn->dn_id_flags |= DN_ID_NEW_EXIST;

    if (have_spill) {
        dn->dn_id_flags |= DN_ID_CHKED_SPILL;
    } else {
        dn->dn_id_flags |= DN_ID_CHKED_BONUS;
    }
    mutex_exit(&dn->dn_mtx);
    if (have_spill)
        dmu_buf_rele((dmu_buf_t *)db, FTAG);
}

boolean_t
dmu_objset_userspace_present(objset_t *os)
{
    return (os->os_phys->os_flags &
        OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}

boolean_t
dmu_objset_userobjspace_present(objset_t *os)
{
    return (os->os_phys->os_flags &
        OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
}

boolean_t
dmu_objset_projectquota_present(objset_t *os)
{
    return (os->os_phys->os_flags &
        OBJSET_FLAG_PROJECTQUOTA_COMPLETE);
}

/*
 * Dirty every object in the objset so its accounting is regenerated at
 * sync time.  Returns EINTR if the upgrade was cancelled or a signal is
 * pending; per-object hold/assign failures just skip that object.
 */
static int
dmu_objset_space_upgrade(objset_t *os)
{
    uint64_t obj;
    int err = 0;

    /*
     * We simply need to mark every object dirty, so that it will be
     * synced out and now accounted. If this is called
     * concurrently, or if we already did some work before crashing,
     * that's fine, since we track each object's accounted state
     * independently.
     */

    for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
        dmu_tx_t *tx;
        dmu_buf_t *db;
        int objerr;

        mutex_enter(&os->os_upgrade_lock);
        if (os->os_upgrade_exit)
            err = SET_ERROR(EINTR);
        mutex_exit(&os->os_upgrade_lock);
        if (err != 0)
            return (err);

        if (issig())
            return (SET_ERROR(EINTR));

        objerr = dmu_bonus_hold(os, obj, FTAG, &db);
        if (objerr != 0)
            continue;
        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, obj);
        objerr = dmu_tx_assign(tx, DMU_TX_WAIT);
        if (objerr != 0) {
            dmu_buf_rele(db, FTAG);
            dmu_tx_abort(tx);
            continue;
        }
        dmu_buf_will_dirty(db, tx);
        dmu_buf_rele(db, FTAG);
        dmu_tx_commit(tx);
    }
    return (0);
}

/*
 * Upgrade callback: regenerate user-space accounting and mark it
 * complete once every object has been dirtied and synced.
 */
static int
dmu_objset_userspace_upgrade_cb(objset_t *os)
{
    int err = 0;

    if (dmu_objset_userspace_present(os))
        return (0);
    if (dmu_objset_is_snapshot(os))
        return (SET_ERROR(EINVAL));
    if (!dmu_objset_userused_enabled(os))
        return (SET_ERROR(ENOTSUP));

    err = dmu_objset_space_upgrade(os);
    if (err)
        return (err);

    os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
    txg_wait_synced(dmu_objset_pool(os), 0);
    return (0);
}

void
dmu_objset_userspace_upgrade(objset_t *os)
{
    dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb);
}

/*
 * Upgrade callback: regenerate user-object and project accounting,
 * activating the corresponding dataset features first.
 */
static int
dmu_objset_id_quota_upgrade_cb(objset_t *os)
{
    int err = 0;

    if (dmu_objset_userobjspace_present(os) &&
        dmu_objset_projectquota_present(os))
        return (0);
    if (dmu_objset_is_snapshot(os))
        return (SET_ERROR(EINVAL));
    if (!dmu_objset_userused_enabled(os))
        return (SET_ERROR(ENOTSUP));
    if (!dmu_objset_projectquota_enabled(os) &&
        dmu_objset_userobjspace_present(os))
        return (SET_ERROR(ENOTSUP));

    if
(dmu_objset_userobjused_enabled(os))
		/*
		 * Request activation of the feature flag; the value is
		 * presumably consumed when the dataset next syncs — see
		 * ds_feature_activation (NOTE(review): confirm in
		 * dsl_dataset sync code).
		 */
		dmu_objset_ds(os)->ds_feature_activation[
		    SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
	if (dmu_objset_projectquota_enabled(os))
		dmu_objset_ds(os)->ds_feature_activation[
		    SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;

	err = dmu_objset_space_upgrade(os);
	if (err)
		return (err);

	/* Mark whichever accounting flavors are enabled as complete. */
	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	if (dmu_objset_userobjused_enabled(os))
		os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
	if (dmu_objset_projectquota_enabled(os))
		os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

/* Kick off the userobj/project-quota accounting upgrade asynchronously. */
void
dmu_objset_id_quota_upgrade(objset_t *os)
{
	dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb);
}

/*
 * Can this objset be upgraded to per-user object-count accounting?
 * Only writable ZFS filesystems (not snapshots) that have the feature
 * enabled but not yet computed.
 */
boolean_t
dmu_objset_userobjspace_upgradable(objset_t *os)
{
	return (dmu_objset_type(os) == DMU_OST_ZFS &&
	    !dmu_objset_is_snapshot(os) &&
	    dmu_objset_userobjused_enabled(os) &&
	    !dmu_objset_userobjspace_present(os) &&
	    spa_writeable(dmu_objset_spa(os)));
}

/* Same test as above, for project quota accounting. */
boolean_t
dmu_objset_projectquota_upgradable(objset_t *os)
{
	return (dmu_objset_type(os) == DMU_OST_ZFS &&
	    !dmu_objset_is_snapshot(os) &&
	    dmu_objset_projectquota_enabled(os) &&
	    !dmu_objset_projectquota_present(os) &&
	    spa_writeable(dmu_objset_spa(os)));
}

/* Report space usage for the objset's dataset (delegates to DSL layer). */
void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
	    usedobjsp, availobjsp);
}

uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}

/* Fill in the cheap-to-compute stats; no I/O, no locks taken here. */
void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
	stat->dds_type = os->os_phys->os_type;
	if (os->os_dsl_dataset)
		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}

/* Add the objset's properties (type, useraccounting, ...) to nvlist *nv. */
void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
	/* Only the MOS may lack a dataset. */
	ASSERT(os->os_dsl_dataset ||
	    os->os_phys->os_type == DMU_OST_META);

	if (os->os_dsl_dataset != NULL)
		dsl_dataset_stats(os->os_dsl_dataset, nv);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
	    os->os_phys->os_type);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
	    dmu_objset_userspace_present(os));
}

int
dmu_objset_is_snapshot(objset_t *os)
{
	if (os->os_dsl_dataset != NULL)
		return (os->os_dsl_dataset->ds_is_snapshot);
	else
		return (B_FALSE);
}

/*
 * Look up the on-disk ("real") name of snapshot 'name' using the
 * normalization-aware ZAP lookup; *conflict reports case collisions.
 */
int
dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen,
    boolean_t *conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	uint64_t ignored;

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
	    MT_NORMALIZE, real, maxlen, conflict));
}

/*
 * Iterate the snapshot-name ZAP: return the snapshot at cursor *offp and
 * advance the cursor.  ENOENT when exhausted; ENAMETOOLONG if 'name'
 * (namelen bytes) cannot hold the result.
 */
int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	zap_cursor_t cursor;
	zap_attribute_t *attr;

	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	attr = zap_attribute_alloc();
	zap_cursor_init_serialized(&cursor,
	    ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, attr) != 0) {
		zap_cursor_fini(&cursor);
		zap_attribute_free(attr);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr->za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		zap_attribute_free(attr);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strlcpy(name, attr->za_name, namelen);
	if (idp)
		*idp = attr->za_first_integer;
	if (case_conflict)
		*case_conflict = attr->za_normalization_conflict;
	/* Hand the serialized cursor position back to the caller. */
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);
	zap_attribute_free(attr);

	return (0);
}

int
dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
{
	return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
}

/*
 * Iterate the child-directory ZAP of the objset's dsl_dir, same cursor
 * protocol as dmu_snapshot_list_next() above.
 */
int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp)
{
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
	zap_cursor_t cursor;
	zap_attribute_t *attr;

	/* there is no next dir on a snapshot! */
	if (os->os_dsl_dataset->ds_object !=
	    dsl_dir_phys(dd)->dd_head_dataset_obj)
		return (SET_ERROR(ENOENT));

	attr = zap_attribute_alloc();
	zap_cursor_init_serialized(&cursor,
	    dd->dd_pool->dp_meta_objset,
	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, attr) != 0) {
		zap_cursor_fini(&cursor);
		zap_attribute_free(attr);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr->za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		zap_attribute_free(attr);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strlcpy(name, attr->za_name, namelen);
	if (idp)
		*idp = attr->za_first_integer;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);
	zap_attribute_free(attr);

	return (0);
}

/*
 * Per-task context for the parallel dataset walk (dmu_objset_find_dp).
 * Each child directory gets its own heap-allocated copy, freed by the
 * task that processes it.
 */
typedef struct dmu_objset_find_ctx {
	taskq_t		*dc_tq;
	dsl_pool_t	*dc_dp;
	uint64_t	dc_ddobj;
	char		*dc_ddname; /* last component of ddobj's name */
	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
	void		*dc_arg;
	int		dc_flags;
	kmutex_t	*dc_error_lock;
	int		*dc_error;
} dmu_objset_find_ctx_t;

static void
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
	dsl_pool_t *dp = dcp->dc_dp;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	uint64_t thisobj;
	int err =
0;

	/* don't process if there already was an error */
	if (*dcp->dc_error != 0)
		goto out;

	/*
	 * Note: passing the name (dc_ddname) here is optional, but it
	 * improves performance because we don't need to call
	 * zap_value_search() to determine the name.
	 */
	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
	if (err != 0)
		goto out;

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_rele(dd, FTAG);
		goto out;
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	attr = zap_attribute_alloc();

	/*
	 * Iterate over all children.
	 */
	if (dcp->dc_flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT3U(attr->za_integer_length, ==,
			    sizeof (uint64_t));
			ASSERT3U(attr->za_num_integers, ==, 1);

			/*
			 * Each child context is released by the recursive
			 * call (or the dispatched task) itself — see the
			 * kmem_free at the end of this function.
			 */
			dmu_objset_find_ctx_t *child_dcp =
			    kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
			*child_dcp = *dcp;
			child_dcp->dc_ddobj = attr->za_first_integer;
			child_dcp->dc_ddname = spa_strdup(attr->za_name);
			if (dcp->dc_tq != NULL)
				(void) taskq_dispatch(dcp->dc_tq,
				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
			else
				dmu_objset_find_dp_impl(child_dcp);
		}
		zap_cursor_fini(&zc);
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
		/* NOTE(review): this 'ds' shadows the function-scope 'ds'. */
		dsl_dataset_t *ds;
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

		if (err == 0) {
			uint64_t snapobj;

			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				err = dsl_dataset_hold_obj(dp,
				    attr->za_first_integer, FTAG, &ds);
				if (err != 0)
					break;
				err = dcp->dc_func(dp, ds, dcp->dc_arg);
				dsl_dataset_rele(ds, FTAG);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	zap_attribute_free(attr);

	if (err != 0) {
		dsl_dir_rele(dd, FTAG);
		goto out;
	}

	/*
	 * Apply to self.
	 */
	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

	/*
	 * Note: we hold the dir while calling dsl_dataset_hold_obj() so
	 * that the dir will remain cached, and we won't have to re-instantiate
	 * it (which could be expensive due to finding its name via
	 * zap_value_search()).
	 */
	dsl_dir_rele(dd, FTAG);
	if (err != 0)
		goto out;
	err = dcp->dc_func(dp, ds, dcp->dc_arg);
	dsl_dataset_rele(ds, FTAG);

out:
	if (err != 0) {
		mutex_enter(dcp->dc_error_lock);
		/* only keep first error */
		if (*dcp->dc_error == 0)
			*dcp->dc_error = err;
		mutex_exit(dcp->dc_error_lock);
	}

	/* Context (and its strdup'd name) is owned here; release it. */
	if (dcp->dc_ddname != NULL)
		spa_strfree(dcp->dc_ddname);
	kmem_free(dcp, sizeof (*dcp));
}

/* Taskq entry point: take the pool config lock, then do the real work. */
static void
dmu_objset_find_dp_cb(void *arg)
{
	dmu_objset_find_ctx_t *dcp = arg;
	dsl_pool_t *dp = dcp->dc_dp;

	/*
	 * We need to get a pool_config_lock here, as there are several
	 * assert(pool_config_held) down the stack. Getting a lock via
	 * dsl_pool_config_enter is risky, as it might be stalled by a
	 * pending writer. This would deadlock, as the write lock can
	 * only be granted when our parent thread gives up the lock.
	 * The _prio interface gives us priority over a pending writer.
	 */
	dsl_pool_config_enter_prio(dp, FTAG);

	dmu_objset_find_dp_impl(dcp);

	dsl_pool_config_exit(dp, FTAG);
}

/*
 * Find objsets under and including ddobj, call func(ds) on each.
 * The order for the enumeration is completely undefined.
 * func is called with dsl_pool_config held.
 */
int
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
{
	int error = 0;
	taskq_t *tq = NULL;
	int ntasks;
	dmu_objset_find_ctx_t *dcp;
	kmutex_t err_lock;

	/* 'error' and 'err_lock' are shared with every worker via dcp. */
	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
	dcp->dc_tq = NULL;
	dcp->dc_dp = dp;
	dcp->dc_ddobj = ddobj;
	dcp->dc_ddname = NULL;
	dcp->dc_func = func;
	dcp->dc_arg = arg;
	dcp->dc_flags = flags;
	dcp->dc_error_lock = &err_lock;
	dcp->dc_error = &error;

	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
		/*
		 * In case a write lock is held we can't make use of
		 * parallelism, as down the stack of the worker threads
		 * the lock is asserted via dsl_pool_config_held.
		 * In case of a read lock this is solved by getting a read
		 * lock in each worker thread, which isn't possible in case
		 * of a writer lock. So we fall back to the synchronous path
		 * here.
		 * In the future it might be possible to get some magic into
		 * dsl_pool_config_held in a way that it returns true for
		 * the worker threads so that a single lock held from this
		 * thread suffices.
 */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_rele(dd, FTAG);
		dsl_pool_config_exit(dp, FTAG);
		return (0);
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	attr = zap_attribute_alloc();

	/*
	 * Iterate over all children.
	 */
	if (flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT3U(attr->za_integer_length, ==,
			    sizeof (uint64_t));
			ASSERT3U(attr->za_num_integers, ==, 1);

			/*
			 * Drop the config lock around the recursion/callback;
			 * see the function comment about possible races.
			 */
			child = kmem_asprintf("%s/%s", name, attr->za_name);
			dsl_pool_config_exit(dp, FTAG);
			err = dmu_objset_find_impl(spa, child,
			    func, arg, flags);
			dsl_pool_config_enter(dp, FTAG);
			kmem_strfree(child);
			if (err != 0)
				break;
		}
		zap_cursor_fini(&zc);

		if (err != 0) {
			dsl_dir_rele(dd, FTAG);
			dsl_pool_config_exit(dp, FTAG);
			zap_attribute_free(attr);
			return (err);
		}
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (flags & DS_FIND_SNAPSHOTS) {
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

		if (err == 0) {
			uint64_t snapobj;

			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				child = kmem_asprintf("%s@%s",
				    name, attr->za_name);
				dsl_pool_config_exit(dp, FTAG);
				err = func(child, arg);
				dsl_pool_config_enter(dp, FTAG);
				kmem_strfree(child);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	dsl_dir_rele(dd, FTAG);
	zap_attribute_free(attr);
	dsl_pool_config_exit(dp, FTAG);

	if (err != 0)
		return (err);

	/* Apply to self.
	 */
	return (func(name, arg));
}

/*
 * See comment above dmu_objset_find_impl().
 */
int
dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
    int flags)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, FTAG);
	if (error != 0)
		return (error);
	error = dmu_objset_find_impl(spa, name, func, arg, flags);
	spa_close(spa, FTAG);
	return (error);
}

boolean_t
dmu_objset_incompatible_encryption_version(objset_t *os)
{
	return (dsl_dir_incompatible_encryption_version(
	    os->os_dsl_dataset->ds_dir));
}

/* Attach/read the opaque per-consumer pointer; caller holds the lock. */
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	os->os_user_ptr = user_ptr;
}

void *
dmu_objset_get_user(objset_t *os)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	return (os->os_user_ptr);
}

/*
 * Determine name of filesystem, given name of snapshot.
 * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
 */
int
dmu_fsname(const char *snapname, char *buf)
{
	char *atp = strchr(snapname, '@');
	if (atp == NULL)
		return (SET_ERROR(EINVAL));
	if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));
	/* Size of (atp - snapname + 1) copies the fs portion and NUL. */
	(void) strlcpy(buf, snapname, atp - snapname + 1);
	return (0);
}

/*
 * Call when we think we're going to write/free space in open context
 * to track the amount of dirty data in the open txg, which is also the
 * amount of memory that can not be evicted until this txg syncs.
 *
 * Note that there are two conditions where this can be called from
 * syncing context:
 *
 * [1] When we just created the dataset, in which case we go on with
 *     updating any accounting of dirty data as usual.
 * [2] When we are dirtying MOS data, in which case we only update the
 *     pool's accounting of dirty data.
 */
void
dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	/* Charge the worst-case allocated size against the dir's quota. */
	int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);

	if (ds != NULL) {
		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
	}

	dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_objset_zil);
EXPORT_SYMBOL(dmu_objset_pool);
EXPORT_SYMBOL(dmu_objset_ds);
EXPORT_SYMBOL(dmu_objset_type);
EXPORT_SYMBOL(dmu_objset_name);
EXPORT_SYMBOL(dmu_objset_hold);
EXPORT_SYMBOL(dmu_objset_hold_flags);
EXPORT_SYMBOL(dmu_objset_own);
EXPORT_SYMBOL(dmu_objset_rele);
EXPORT_SYMBOL(dmu_objset_rele_flags);
EXPORT_SYMBOL(dmu_objset_disown);
EXPORT_SYMBOL(dmu_objset_from_ds);
EXPORT_SYMBOL(dmu_objset_create);
EXPORT_SYMBOL(dmu_objset_stats);
EXPORT_SYMBOL(dmu_objset_fast_stat);
EXPORT_SYMBOL(dmu_objset_spa);
EXPORT_SYMBOL(dmu_objset_space);
EXPORT_SYMBOL(dmu_objset_fsid_guid);
EXPORT_SYMBOL(dmu_objset_find);
EXPORT_SYMBOL(dmu_objset_byteswap);
EXPORT_SYMBOL(dmu_objset_evict_dbufs);
EXPORT_SYMBOL(dmu_objset_snap_cmtime);
EXPORT_SYMBOL(dmu_objset_dnodesize);

EXPORT_SYMBOL(dmu_objset_sync);
EXPORT_SYMBOL(dmu_objset_is_dirty);
EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
EXPORT_SYMBOL(dmu_objset_create_impl);
EXPORT_SYMBOL(dmu_objset_open_impl);
EXPORT_SYMBOL(dmu_objset_evict);
EXPORT_SYMBOL(dmu_objset_register_type);
EXPORT_SYMBOL(dmu_objset_sync_done);
EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
EXPORT_SYMBOL(dmu_objset_userused_enabled);
EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
EXPORT_SYMBOL(dmu_objset_userspace_present);
EXPORT_SYMBOL(dmu_objset_userobjused_enabled);
EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable);
EXPORT_SYMBOL(dmu_objset_userobjspace_present);
EXPORT_SYMBOL(dmu_objset_projectquota_enabled);
EXPORT_SYMBOL(dmu_objset_projectquota_present);
EXPORT_SYMBOL(dmu_objset_projectquota_upgradable);
EXPORT_SYMBOL(dmu_objset_id_quota_upgrade);
#endif