Path: blob/main/sys/contrib/openzfs/module/zfs/dmu_objset.c
106842 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/2122/*23* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.24* Copyright (c) 2012, 2020 by Delphix. All rights reserved.25* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.26* Copyright (c) 2013, Joyent, Inc. All rights reserved.27* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.28* Copyright (c) 2015, STRATO AG, Inc. All rights reserved.29* Copyright (c) 2016 Actifio, Inc. All rights reserved.30* Copyright 2017 Nexenta Systems, Inc.31* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.32* Copyright (c) 2018, loli10K <[email protected]>. 
 * All rights reserved.
 * Copyright (c) 2019, Klara Inc.
 * Copyright (c) 2019, Allan Jude
 * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
 * Copyright (c) 2025, Rob Norris <[email protected]>
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zvol.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/sa.h>
#include <sys/zfs_onexit.h>
#include <sys/dsl_destroy.h>
#include <sys/vdev.h>
#include <sys/zfeature.h>
#include <sys/policy.h>
#include <sys/spa_impl.h>
#include <sys/dmu_recv.h>
#include <sys/zfs_project.h>
#include "zfs_namecheck.h"
#include <sys/vdev_impl.h>
#include <sys/arc.h>
#include <cityhash.h>
/* NOTE(review): <sys/cred.h> is also included at the top of this list; the
 * duplicate is harmless (header is guarded) but redundant. */
#include <sys/cred.h>

/*
 * Needed to close a window in dnode_move() that allows the objset to be freed
 * before it can be safely accessed.
 */
krwlock_t os_lock;

/*
 * Tunable to overwrite the maximum number of threads for the parallelization
 * of dmu_objset_find_dp, needed to speed up the import of pools with many
 * datasets.
 * Default is 4 times the number of leaf vdevs.
 */
static const int dmu_find_threads = 0;

/*
 * Backfill lower metadnode objects after this many have been freed.
 * Backfilling negatively impacts object creation rates, so only do it
 * if there are enough holes to fill.
 */
static const int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;

/* Hold tag used by the background user-accounting upgrade task. */
static const char *upgrade_tag = "upgrade_tag";

static void dmu_objset_find_dp_cb(void *arg);

static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb);
static void dmu_objset_upgrade_stop(objset_t
*os);

/* Initialize the global os_lock used to stall dnode_move() during teardown. */
void
dmu_objset_init(void)
{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}

void
dmu_objset_fini(void)
{
	rw_destroy(&os_lock);
}

/* Trivial accessors for commonly used objset fields. */
spa_t *
dmu_objset_spa(objset_t *os)
{
	return (os->os_spa);
}

zilog_t *
dmu_objset_zil(objset_t *os)
{
	return (os->os_zil);
}

dsl_pool_t *
dmu_objset_pool(objset_t *os)
{
	dsl_dataset_t *ds;

	/* The MOS has no dataset; fall back to the spa's DSL pool. */
	if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
		return (ds->ds_dir->dd_pool);
	else
		return (spa_get_dsl(os->os_spa));
}

dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
	return (os->os_dsl_dataset);
}

dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
	return (os->os_phys->os_type);
}

void
dmu_objset_name(objset_t *os, char *buf)
{
	dsl_dataset_name(os->os_dsl_dataset, buf);
}

uint64_t
dmu_objset_id(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	/* 0 is returned for the MOS, which has no dataset object. */
	return (ds ? ds->ds_object : 0);
}

uint64_t
dmu_objset_dnodesize(objset_t *os)
{
	return (os->os_dnodesize);
}

zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
	return (os->os_sync);
}

zfs_logbias_op_t
dmu_objset_logbias(objset_t *os)
{
	return (os->os_logbias);
}

/*
 * dsl_prop change callbacks: each one caches the new property value in the
 * objset_t.  They are registered in dmu_objset_open_impl() and are called
 * once at registration time and again whenever the property changes.
 */
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}

static void
compression_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval != ZIO_COMPRESS_INHERIT);

	os->os_compress = zio_compress_select(os->os_spa,
	    ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON);
	os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress,
	    ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT);
}

static void
copies_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
}

static void
dedup_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;
	spa_t *spa = os->os_spa;
	enum zio_checksum checksum;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);

	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}

static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_primary_cache = newval;
}

static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_secondary_cache = newval;
}

static void
prefetch_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE ||
	    newval == ZFS_PREFETCH_METADATA);
	os->os_prefetch = newval;
}

static void
sync_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
	    newval == ZFS_SYNC_DISABLED);

	os->os_sync = newval;
	/* Propagate to the ZIL if it has already been allocated. */
	if (os->os_zil)
		zil_set_sync(os->os_zil, newval);
}

static void
redundant_metadata_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
	    newval == ZFS_REDUNDANT_METADATA_MOST ||
	    newval == ZFS_REDUNDANT_METADATA_SOME ||
	    newval == ZFS_REDUNDANT_METADATA_NONE);

	os->os_redundant_metadata = newval;
}

static void
dnodesize_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	switch (newval) {
	case ZFS_DNSIZE_LEGACY:
		os->os_dnodesize = DNODE_MIN_SIZE;
		break;
	case ZFS_DNSIZE_AUTO:
		/*
		 * Choose a dnode size that will work well for most
		 * workloads if the user specified "auto". Future code
		 * improvements could dynamically select a dnode size
		 * based on observed workload patterns.
		 */
		os->os_dnodesize = DNODE_MIN_SIZE * 2;
		break;
	case ZFS_DNSIZE_1K:
	case ZFS_DNSIZE_2K:
	case ZFS_DNSIZE_4K:
	case ZFS_DNSIZE_8K:
	case ZFS_DNSIZE_16K:
		/* These enum values are the dnode size in bytes. */
		os->os_dnodesize = newval;
		break;
	}
}

static void
smallblk_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	os->os_zpl_special_smallblock = newval;
}

static void
direct_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD ||
	    newval == ZFS_DIRECT_ALWAYS);

	os->os_direct = newval;
}

static void
logbias_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
	    newval == ZFS_LOGBIAS_THROUGHPUT);
	os->os_logbias = newval;
	if (os->os_zil)
		zil_set_logbias(os->os_zil, newval);
}

static void
recordsize_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	os->os_recordsize = newval;
}

/*
 * Byteswap an on-disk objset_phys_t.  'size' selects which on-disk variant
 * is present (V1, V2, or the full struct including the project-used dnode).
 */
void
dmu_objset_byteswap(void *buf, size_t size)
{
	objset_phys_t *osp = buf;

	ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 ||
	    size == sizeof (objset_phys_t));
	dnode_byteswap(&osp->os_meta_dnode);
	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
	osp->os_type = BSWAP_64(osp->os_type);
	osp->os_flags = BSWAP_64(osp->os_flags);
	if (size >= OBJSET_PHYS_SIZE_V2) {
		dnode_byteswap(&osp->os_userused_dnode);
		dnode_byteswap(&osp->os_groupused_dnode);
		if (size >= sizeof (objset_phys_t))
			dnode_byteswap(&osp->os_projectused_dnode);
	}
}

/*
 * Runs cityhash on the objset_t pointer and the object number.
 */
static uint64_t
dnode_hash(const objset_t *os, uint64_t obj)
{
	uintptr_t osv = (uintptr_t)os;
	return (cityhash2((uint64_t)osv, obj));
}

static unsigned int
dnode_multilist_index_func(multilist_t *ml, void *obj)
{
	dnode_t *dn = obj;

	/*
	 * The low order bits of the hash value are thought to be
	 * distributed evenly. Otherwise, in the case that the multilist
	 * has a power of two number of sublists, each sublists' usage
	 * would not be evenly distributed. In this context full 64bit
	 * division would be a waste of time, so limit it to 32 bits.
	 */
	return ((unsigned int)dnode_hash(dn->dn_objset, dn->dn_object) %
	    multilist_get_num_sublists(ml));
}

/*
 * Decide whether this objset's blocks are eligible for the L2ARC, based on
 * the secondarycache property and (when l2arc_exclude_special is set)
 * whether the root block lives on a special/dedup allocation-class vdev.
 */
static inline boolean_t
dmu_os_is_l2cacheable(objset_t *os)
{
	if (os->os_secondary_cache == ZFS_CACHE_ALL ||
	    os->os_secondary_cache == ZFS_CACHE_METADATA) {
		if (l2arc_exclude_special == 0)
			return (B_TRUE);

		blkptr_t *bp = os->os_rootbp;
		if (bp == NULL || BP_IS_HOLE(bp))
			return (B_FALSE);
		uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
		vdev_t *rvd = os->os_spa->spa_root_vdev;
		vdev_t *vd = NULL;

		if (vdev < rvd->vdev_children)
			vd = rvd->vdev_child[vdev];

		if (vd == NULL)
			return (B_TRUE);

		if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
		    vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Instantiates the objset_t in-memory structure corresponding to the
 * objset_phys_t that's pointed to by the
specified blkptr_t.
 */
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    objset_t **osp)
{
	objset_t *os;
	int i, err;

	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
	ASSERT(!BP_IS_REDACTED(bp));

	/*
	 * We need the pool config lock to get properties.
	 */
	ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool));

	/*
	 * The $ORIGIN dataset (if it exists) doesn't have an associated
	 * objset, so there's no reason to open it. The $ORIGIN dataset
	 * will not exist on pools older than SPA_VERSION_ORIGIN.
	 */
	if (ds != NULL && spa_get_dsl(spa) != NULL &&
	    spa_get_dsl(spa)->dp_origin_snap != NULL) {
		ASSERT3P(ds->ds_dir, !=,
		    spa_get_dsl(spa)->dp_origin_snap->ds_dir);
	}

	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
	os->os_dsl_dataset = ds;
	os->os_spa = spa;
	os->os_rootbp = bp;
	if (!BP_IS_HOLE(os->os_rootbp)) {
		/* Existing objset: read the objset_phys_t through the ARC. */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		zbookmark_phys_t zb;
		int size;
		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

		if (dmu_os_is_l2cacheable(os))
			aflags |= ARC_FLAG_L2CACHE;

		if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {
			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
			ASSERT(BP_IS_AUTHENTICATED(bp));
			zio_flags |= ZIO_FLAG_RAW;
		}

		dprintf_bp(os->os_rootbp, "reading %s", "");
		err = arc_read(NULL, spa, os->os_rootbp,
		    arc_getbuf_func, &os->os_phys_buf,
		    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
		if (err != 0) {
			kmem_free(os, sizeof (objset_t));
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}

		/* Pick the in-memory phys size matching the pool's features. */
		if (spa_version(spa) < SPA_VERSION_USERSPACE)
			size = OBJSET_PHYS_SIZE_V1;
		else if (!spa_feature_is_enabled(spa,
		    SPA_FEATURE_PROJECT_QUOTA))
			size = OBJSET_PHYS_SIZE_V2;
		else
			size = sizeof (objset_phys_t);

		/* Increase the blocksize if we are permitted. */
		if (arc_buf_size(os->os_phys_buf) < size) {
			arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
			    ARC_BUFC_METADATA, size);
			memset(buf->b_data, 0, size);
			memcpy(buf->b_data, os->os_phys_buf->b_data,
			    arc_buf_size(os->os_phys_buf));
			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
			os->os_phys_buf = buf;
		}

		os->os_phys = os->os_phys_buf->b_data;
		os->os_flags = os->os_phys->os_flags;
	} else {
		/* New objset: allocate a zeroed phys buffer. */
		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
		    sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1;
		os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
		    ARC_BUFC_METADATA, size);
		os->os_phys = os->os_phys_buf->b_data;
		memset(os->os_phys, 0, size);
	}
	/*
	 * These properties will be filled in by the logic in zfs_get_zplprop()
	 * when they are queried for the first time.
	 */
	os->os_version = OBJSET_PROP_UNINITIALIZED;
	os->os_normalization = OBJSET_PROP_UNINITIALIZED;
	os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
	os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;

	/*
	 * Note: the changed_cb will be called once before the register
	 * func returns, thus changing the checksum/compression from the
	 * default (fletcher2/off).  Snapshots don't need to know about
	 * checksum/compression/copies.
	 */
	if (ds != NULL) {
		os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);

		err = dsl_prop_register(ds,
		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
		    primary_cache_changed_cb, os);
		if (err == 0) {
			err = dsl_prop_register(ds,
			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
			    secondary_cache_changed_cb, os);
		}
		if (err == 0) {
			err = dsl_prop_register(ds,
			    zfs_prop_to_name(ZFS_PROP_PREFETCH),
			    prefetch_changed_cb, os);
		}
		if (!ds->ds_is_snapshot) {
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
				    checksum_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    compression_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COPIES),
				    copies_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DEDUP),
				    dedup_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
				    logbias_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_SYNC),
				    sync_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(
				    ZFS_PROP_REDUNDANT_METADATA),
				    redundant_metadata_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    recordsize_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DNODESIZE),
				    dnodesize_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(
				    ZFS_PROP_SPECIAL_SMALL_BLOCKS),
				    smallblk_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DIRECT),
				    direct_changed_cb, os);
			}
		}
		if (err != 0) {
			/* Unwind: free the phys buffer and the objset. */
			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
			kmem_free(os, sizeof (objset_t));
			return (err);
		}
	} else {
		/* It's the meta-objset. */
		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
		os->os_compress = ZIO_COMPRESS_ON;
		os->os_complevel = ZIO_COMPLEVEL_DEFAULT;
		os->os_encrypted = B_FALSE;
		os->os_copies = spa_max_replication(spa);
		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
		os->os_dedup_verify = B_FALSE;
		os->os_logbias = ZFS_LOGBIAS_LATENCY;
		os->os_sync = ZFS_SYNC_STANDARD;
		os->os_primary_cache = ZFS_CACHE_ALL;
		os->os_secondary_cache = ZFS_CACHE_ALL;
		os->os_dnodesize = DNODE_MIN_SIZE;
		os->os_prefetch = ZFS_PREFETCH_ALL;
	}

	if (ds == NULL || !ds->ds_is_snapshot)
		os->os_zil_header = os->os_phys->os_zil_header;
	os->os_zil = zil_alloc(os, &os->os_zil_header);

	for (i = 0; i < TXG_SIZE; i++) {
		multilist_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]),
		    dnode_multilist_index_func);
	}
	list_create(&os->os_dnodes, sizeof (dnode_t),
	    offsetof(dnode_t, dn_link));
	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	list_link_init(&os->os_evicting_node);

	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
	os->os_obj_next_percpu_len = boot_ncpus;
	os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
	    sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);

	dnode_special_open(os, &os->os_phys->os_meta_dnode,
	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
	if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) {
		dnode_special_open(os, &os->os_phys->os_userused_dnode,
		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
		if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf))
			dnode_special_open(os,
			    &os->os_phys->os_projectused_dnode,
			    DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode);
	}

	mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL);

	*osp = os;
	return (0);
}

/*
 * Return (creating if necessary) the objset associated with the given
 * dataset.  The ds_objset pointer is cached under ds_opening_lock.
 */
int
dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
	int err = 0;

	/*
	 * We need the pool_config lock to manipulate the dsl_dataset_t.
	 * Even if the dataset is long-held, we need the pool_config lock
	 * to open the objset, as it needs to get properties.
	 */
	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	mutex_enter(&ds->ds_opening_lock);
	if (ds->ds_objset == NULL) {
		objset_t *os;
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
		    ds, dsl_dataset_get_blkptr(ds), &os);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		if (err == 0) {
			mutex_enter(&ds->ds_lock);
			ASSERT0P(ds->ds_objset);
			ds->ds_objset = os;
			mutex_exit(&ds->ds_lock);
		}
	}
	*osp = ds->ds_objset;
	mutex_exit(&ds->ds_opening_lock);
	return (err);
}

/*
 * Holds the pool while the objset is held.  Therefore only one objset
 * can be held at a time.
 */
int
dmu_objset_hold_flags(const char *name, boolean_t decrypt, const void *tag,
    objset_t **osp)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t flags;

	flags = (decrypt) ?
DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	err = dsl_pool_hold(name, tag, &dp);
	if (err != 0)
		return (err);
	err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	err = dmu_objset_from_ds(ds, osp);
	if (err != 0) {
		/* Unwind both holds on failure. */
		dsl_dataset_rele_flags(ds, flags, tag);
		dsl_pool_rele(dp, tag);
	}

	return (err);
}

int
dmu_objset_hold(const char *name, const void *tag, objset_t **osp)
{
	return (dmu_objset_hold_flags(name, B_FALSE, tag, osp));
}

/*
 * Common validation for owning an objset: type check, read-only/snapshot
 * check, encryption-version check, and (if decrypting) MAC verification of
 * the objset phys buffer.
 */
static int
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
	(void) tag;

	int err = dmu_objset_from_ds(ds, osp);
	if (err != 0) {
		return (err);
	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
		return (SET_ERROR(EINVAL));
	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
		return (SET_ERROR(EROFS));
	} else if (!readonly && decrypt &&
	    dsl_dir_incompatible_encryption_version(ds->ds_dir)) {
		return (SET_ERROR(EROFS));
	}

	/* if we are decrypting, we can now check MACs in os->os_phys_buf */
	if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa,
		    &zb, B_FALSE);
		if (err != 0)
			return (err);

		ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf));
	}

	return (0);
}

/*
 * dsl_pool must not be held when this is called.
 * Upon successful return, there will be a longhold on the dataset,
 * and the dsl_pool will not be held.
 */
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t flags;

	flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	err = dsl_dataset_own(dp, name, flags, tag, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}
	err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
	if (err != 0) {
		dsl_dataset_disown(ds, flags, tag);
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	/*
	 * User accounting requires the dataset to be decrypted and rw.
	 * We also don't begin user accounting during claiming to help
	 * speed up pool import times and to keep this txg reserved
	 * completely for recovery work.
	 */
	if (!readonly && !dp->dp_spa->spa_claiming &&
	    (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) {
		if (dmu_objset_userobjspace_upgradable(*osp) ||
		    dmu_objset_projectquota_upgradable(*osp)) {
			dmu_objset_id_quota_upgrade(*osp);
		} else if (dmu_objset_userused_enabled(*osp)) {
			dmu_objset_userspace_upgrade(*osp);
		}
	}

	dsl_pool_rele(dp, FTAG);
	return (0);
}

/* Like dmu_objset_own(), but by dataset object number with the pool held. */
int
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t flags;

	flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
	if (err != 0)
		return (err);

	err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
	if (err != 0) {
		dsl_dataset_disown(ds, flags, tag);
		return (err);
	}

	return (0);
}

/* Release the dataset and pool holds taken by dmu_objset_hold_flags(). */
void
dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, const void *tag)
{
	ds_hold_flags_t flags;
	dsl_pool_t *dp = dmu_objset_pool(os);

	flags = (decrypt) ?
DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
	dsl_pool_rele(dp, tag);
}

void
dmu_objset_rele(objset_t *os, const void *tag)
{
	dmu_objset_rele_flags(os, B_FALSE, tag);
}

/*
 * When we are called, os MUST refer to an objset associated with a dataset
 * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
 * == tag. We will then release and reacquire ownership of the dataset while
 * holding the pool config_rwlock to avoid intervening namespace or ownership
 * changes may occur.
 *
 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
 * release the hold on its dataset and acquire a new one on the dataset of the
 * same name so that it can be partially torn down and reconstructed.
 */
void
dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
    boolean_t decrypt, const void *tag)
{
	dsl_pool_t *dp;
	char name[ZFS_MAX_DATASET_NAME_LEN];
	ds_hold_flags_t flags;

	flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	VERIFY3P(ds, !=, NULL);
	VERIFY3P(ds->ds_owner, ==, tag);
	VERIFY(dsl_dataset_long_held(ds));

	dsl_dataset_name(ds, name);
	dp = ds->ds_dir->dd_pool;
	dsl_pool_config_enter(dp, FTAG);
	dsl_dataset_disown(ds, flags, tag);
	VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds));
	dsl_pool_config_exit(dp, FTAG);
}

void
dmu_objset_disown(objset_t *os, boolean_t decrypt, const void *tag)
{
	ds_hold_flags_t flags;

	flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	/*
	 * Stop upgrading thread
	 */
	dmu_objset_upgrade_stop(os);
	dsl_dataset_disown(os->os_dsl_dataset, flags, tag);
}

/*
 * Evict all dbufs belonging to this objset's dnodes, including the special
 * (meta/userused/groupused/projectused) dnodes.  A marker dnode is inserted
 * into os_dnodes so that os_lock can be dropped while each dnode's dbufs
 * are evicted.
 */
void
dmu_objset_evict_dbufs(objset_t *os)
{
	dnode_t *dn_marker;
	dnode_t *dn;

	dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);

	mutex_enter(&os->os_lock);
	dn = list_head(&os->os_dnodes);
	while (dn != NULL) {
		/*
		 * Skip dnodes without holds.  We have to do this dance
		 * because dnode_add_ref() only works if there is already a
		 * hold.  If the dnode has no holds, then it has no dbufs.
		 */
		if (dnode_add_ref(dn, FTAG)) {
			list_insert_after(&os->os_dnodes, dn, dn_marker);
			mutex_exit(&os->os_lock);

			dnode_evict_dbufs(dn);
			dnode_rele(dn, FTAG);

			mutex_enter(&os->os_lock);
			dn = list_next(&os->os_dnodes, dn_marker);
			list_remove(&os->os_dnodes, dn_marker);
		} else {
			dn = list_next(&os->os_dnodes, dn);
		}
	}
	mutex_exit(&os->os_lock);

	kmem_free(dn_marker, sizeof (dnode_t));

	if (DMU_USERUSED_DNODE(os) != NULL) {
		if (DMU_PROJECTUSED_DNODE(os) != NULL)
			dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os));
		dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
		dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
	}
	dnode_evict_dbufs(DMU_META_DNODE(os));
}

/*
 * Objset eviction processing is split into into two pieces.
 * The first marks the objset as evicting, evicts any dbufs that
 * have a refcount of zero, and then queues up the objset for the
 * second phase of eviction.  Once os->os_dnodes has been cleared by
 * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
 * The second phase closes the special dnodes, dequeues the objset from
 * the list of those undergoing eviction, and finally frees the objset.
 *
 * NOTE: Due to asynchronous eviction processing (invocation of
 * dnode_buf_pageout()), it is possible for the meta dnode for the
 * objset to have no holds even though os->os_dnodes is not empty.
 */
void
dmu_objset_evict(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	for (int t = 0; t < TXG_SIZE; t++)
		ASSERT(!dmu_objset_is_dirty(os, t));

	if (ds)
		dsl_prop_unregister_all(ds, os);

	if (os->os_sa)
		sa_tear_down(os);

	dmu_objset_evict_dbufs(os);

	mutex_enter(&os->os_lock);
	spa_evicting_os_register(os->os_spa, os);
	if (list_is_empty(&os->os_dnodes)) {
		/*
		 * All dnodes are already gone; run the second phase
		 * synchronously.
		 */
		mutex_exit(&os->os_lock);
		dmu_objset_evict_done(os);
	} else {
		mutex_exit(&os->os_lock);
	}
}

void
dmu_objset_evict_done(objset_t *os)
{
	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	dnode_special_close(&os->os_meta_dnode);
	if (DMU_USERUSED_DNODE(os)) {
		if (DMU_PROJECTUSED_DNODE(os))
			dnode_special_close(&os->os_projectused_dnode);
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use.  We consider the objset valid before the barrier and invalid
	 * after the barrier.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	kmem_free(os->os_obj_next_percpu,
	    os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_userused_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	mutex_destroy(&os->os_upgrade_lock);
	for (int i = 0; i < TXG_SIZE; i++)
		multilist_destroy(&os->os_dirty_dnodes[i]);
	spa_evicting_os_deregister(os->os_spa, os);
	kmem_free(os, sizeof (objset_t));
}

inode_timespec_t
dmu_objset_snap_cmtime(objset_t *os)
{
	return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}

/*
 * Create the in-memory and on-disk state for a new objset, allocating its
 * meta-dnode with the given (or default) block size, indirect block shift,
 * and number of levels.  Runs in syncing context.
 */
objset_t *
dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
{
	objset_t *os;
	dnode_t *mdn;

	ASSERT(dmu_tx_is_syncing(tx));

	if (blksz == 0)
		blksz = DNODE_BLOCK_SIZE;
	if (ibs == 0)
		ibs = DN_MAX_INDBLKSHIFT;

	if (ds != NULL)
		VERIFY0(dmu_objset_from_ds(ds, &os));
	else
		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));

	mdn = DMU_META_DNODE(os);

	dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
	    DNODE_MIN_SLOTS, tx);

	/*
	 * We don't want to have to increase the meta-dnode's nlevels
	 * later, because then we could do it in quiescing context while
	 * we are also accessing it in open context.
	 *
	 * This precaution is not necessary for the MOS (ds == NULL),
	 * because the MOS is only updated in syncing context.
	 * This is most fortunate: the MOS is the only objset that
	 * needs to be synced multiple times as spa_sync() iterates
	 * to convergence, so minimizing its dn_nlevels matters.
	 */
	if (ds != NULL) {
		if (levels == 0) {
			levels = 1;

			/*
			 * Determine the number of levels necessary for the
			 * meta-dnode to
contain DN_MAX_OBJECT dnodes.  Note
			 * that in order to ensure that we do not overflow
			 * 64 bits, there has to be a nlevels that gives us a
			 * number of blocks > DN_MAX_OBJECT but < 2^64.
			 * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
			 * (10) must be less than (64 - log2(DN_MAX_OBJECT))
			 * (16).
			 */
			while ((uint64_t)mdn->dn_nblkptr <<
			    (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
			    (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
			    DN_MAX_OBJECT)
				levels++;
		}

		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
		    mdn->dn_nlevels = levels;
	}

	ASSERT(type != DMU_OST_NONE);
	ASSERT(type != DMU_OST_ANY);
	ASSERT(type < DMU_OST_NUMTYPES);
	os->os_phys->os_type = type;

	/*
	 * Enable user accounting if it is enabled and this is not an
	 * encrypted receive.
	 */
	if (dmu_objset_userused_enabled(os) &&
	    (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
		if (dmu_objset_userobjused_enabled(os)) {
			ASSERT3P(ds, !=, NULL);
			ds->ds_feature_activation[
			    SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
			os->os_phys->os_flags |=
			    OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
		}
		if (dmu_objset_projectquota_enabled(os)) {
			ASSERT3P(ds, !=, NULL);
			ds->ds_feature_activation[
			    SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
			os->os_phys->os_flags |=
			    OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
		}
		os->os_flags = os->os_phys->os_flags;
	}

	dsl_dataset_dirty(ds, tx);

	return (os);
}

/* called from dsl for meta-objset */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, dmu_tx_t *tx)
{
	return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx));
}

/* Arguments passed through the dsl_sync_task for objset creation. */
typedef struct dmu_objset_create_arg {
	const char *doca_name;
	cred_t *doca_cred;
	void (*doca_userfunc)(objset_t *os, void *arg,
	    cred_t *cr, dmu_tx_t *tx);
	void *doca_userarg;
	dmu_objset_type_t doca_type;
	uint64_t doca_flags;
	dsl_crypto_params_t *doca_dcp;
} dmu_objset_create_arg_t;

/*
 * Check phase for objset creation: validates the name, nesting depth,
 * crypto parameters, filesystem limits, and that the parent is a ZFS
 * filesystem.
 */
static int
dmu_objset_create_check(void *arg, dmu_tx_t *tx)
{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	dsl_dataset_t *parentds;
	objset_t *parentos;
	const char *tail;
	int error;

	if (strchr(doca->doca_name, '@') != NULL)
		return (SET_ERROR(EINVAL));

	if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));

	if (dataset_nestcheck(doca->doca_name) != 0)
		return (SET_ERROR(ENAMETOOLONG));

	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
	if (error != 0)
		return (error);
	if (tail == NULL) {
		/* The full name resolved to an existing dataset. */
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EEXIST));
	}

	error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}

	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	    doca->doca_cred);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}

	/* can't create below anything but filesystems (eg.
	    no ZVOLs) */
	error = dsl_dataset_hold_obj(pdd->dd_pool,
	    dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}
	error = dmu_objset_from_ds(parentds, &parentos);
	if (error != 0) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}
	if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
	}
	dsl_dataset_rele(parentds, FTAG);
	dsl_dir_rele(pdd, FTAG);

	return (error);
}

/*
 * Sync phase of the dmu_objset_create() sync task: create the dataset,
 * initialize its objset, run the caller's callback, and (for encrypted
 * datasets) force the new data out before the key mapping is released.
 */
static void
dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	spa_t *spa = dp->dp_spa;
	dsl_dir_t *pdd;
	const char *tail;
	dsl_dataset_t *ds;
	uint64_t obj;
	blkptr_t *bp;
	objset_t *os;
	zio_t *rzio;

	/* The check phase already validated this name resolves. */
	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));

	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
	    doca->doca_cred, doca->doca_dcp, tx);

	VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	bp = dsl_dataset_get_blkptr(ds);
	os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);

	if (doca->doca_userfunc != NULL) {
		doca->doca_userfunc(os, doca->doca_userarg,
		    doca->doca_cred, tx);
	}

	/*
	 * The doca_userfunc() may write out some data that needs to be
	 * encrypted if the dataset is encrypted (specifically the root
	 * directory).  This data must be written out before the encryption
	 * key mapping is removed by dsl_dataset_rele_flags().  Force the
	 * I/O to occur immediately by invoking the relevant sections of
	 * dsl_pool_sync().
	 */
	if (os->os_encrypted) {
		dsl_dataset_t *tmpds = NULL;
		boolean_t need_sync_done = B_FALSE;

		mutex_enter(&ds->ds_lock);
		ds->ds_owner = FTAG;
		mutex_exit(&ds->ds_lock);

		/* First pass: sync the dataset if it was dirtied. */
		rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
		    tx->tx_txg);
		if (tmpds != NULL) {
			dsl_dataset_sync(ds, rzio, tx);
			need_sync_done = B_TRUE;
		}
		VERIFY0(zio_wait(rzio));

		dmu_objset_sync_done(os, tx);
		taskq_wait(dp->dp_sync_taskq);
		if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
			ASSERT3P(ds->ds_key_mapping, !=, NULL);
			key_mapping_rele(spa, ds->ds_key_mapping, ds);
		}

		/* Second pass: the sync above may have re-dirtied it. */
		rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
		    tx->tx_txg);
		if (tmpds != NULL) {
			dmu_buf_rele(ds->ds_dbuf, ds);
			dsl_dataset_sync(ds, rzio, tx);
		}
		VERIFY0(zio_wait(rzio));

		if (need_sync_done) {
			ASSERT3P(ds->ds_key_mapping, !=, NULL);
			key_mapping_rele(spa, ds->ds_key_mapping, ds);
			dsl_dataset_sync_done(ds, tx);
			dmu_buf_rele(ds->ds_dbuf, ds);
		}

		mutex_enter(&ds->ds_lock);
		ds->ds_owner = NULL;
		mutex_exit(&ds->ds_lock);
	}

	spa_history_log_internal_ds(ds, "create", tx, " ");

	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
	dsl_dir_rele(pdd, FTAG);
}

/*
 * Create a new dataset of the given type, running `func(os, arg, cr, tx)`
 * inside the creation sync task, then create any ZVOL minors.  Returns 0 or
 * an errno from the check phase.
 */
int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
{
	dmu_objset_create_arg_t doca;
	dsl_crypto_params_t tmp_dcp = { 0 };

	cred_t *cr = CRED();
	crhold(cr);

	doca.doca_name = name;
	doca.doca_cred = cr;
	doca.doca_flags = flags;
	doca.doca_userfunc = func;
	doca.doca_userarg = arg;
	doca.doca_type = type;

	/*
	 * Some callers (mostly for testing) do not provide a dcp on their
	 * own but various code inside the sync task will require it to be
	 * allocated.  Rather than adding NULL checks throughout this code
	 * or adding dummy dcp's to all of the callers we simply create a
	 * dummy one here and use that.  This zero dcp will have the same
	 * effect as asking for inheritance of all encryption params.
	 */
	doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;

	int rv = dsl_sync_task(name,
	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
	    6, ZFS_SPACE_CHECK_NORMAL);

	if (rv == 0)
		zvol_create_minors(name);

	crfree(cr);

	return (rv);
}

/* Snapshot a single filesystem as "fsname@snapname". */
int
dmu_objset_snapshot_one(const char *fsname, const char *snapname)
{
	int err;
	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
	nvlist_t *snaps = fnvlist_alloc();

	fnvlist_add_boolean(snaps, longsnap);
	kmem_strfree(longsnap);
	err = dsl_dataset_snapshot(snaps, NULL, NULL);
	fnvlist_free(snaps);
	return (err);
}

/*
 * Taskq callback that runs the objset's upgrade callback (os_upgrade_cb)
 * unless an early exit was requested, records its status, and drops the
 * long hold taken by dmu_objset_upgrade().
 */
static void
dmu_objset_upgrade_task_cb(void *data)
{
	objset_t *os = data;

	mutex_enter(&os->os_upgrade_lock);
	/* Default to EINTR in case we exit before the callback runs. */
	os->os_upgrade_status = EINTR;
	if (!os->os_upgrade_exit) {
		int status;

		mutex_exit(&os->os_upgrade_lock);

		status = os->os_upgrade_cb(os);

		mutex_enter(&os->os_upgrade_lock);

		os->os_upgrade_status = status;
	}
	os->os_upgrade_exit = B_TRUE;
	os->os_upgrade_id = 0;
	mutex_exit(&os->os_upgrade_lock);
	dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
}

/*
 * Dispatch `cb` to the spa's upgrade taskq, taking a long hold on the
 * dataset for the duration of the upgrade.  No-op if an upgrade is already
 * in flight or a prior upgrade recorded a non-zero status.
 */
static void
dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
{
	if (os->os_upgrade_id != 0)
		return;

	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
	dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);

	mutex_enter(&os->os_upgrade_lock);
	if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
		os->os_upgrade_exit = B_FALSE;
		os->os_upgrade_cb = cb;
		os->os_upgrade_id =
		    taskq_dispatch(
		    os->os_spa->spa_upgrade_taskq,
		    dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
		if (os->os_upgrade_id == TASKQID_INVALID) {
			/* Dispatch failed: drop the hold and record ENOMEM. */
			dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
			os->os_upgrade_status = ENOMEM;
		}
	} else {
		dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
	}
	mutex_exit(&os->os_upgrade_lock);
}

/*
 * Request that any in-flight upgrade task stop, cancelling it if it has not
 * started yet, and wait for the current txg to sync out.
 */
static void
dmu_objset_upgrade_stop(objset_t *os)
{
	mutex_enter(&os->os_upgrade_lock);
	os->os_upgrade_exit = B_TRUE;
	if (os->os_upgrade_id != 0) {
		taskqid_t id = os->os_upgrade_id;

		os->os_upgrade_id = 0;
		mutex_exit(&os->os_upgrade_lock);

		/* If the task was cancelled before running, release here. */
		if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id,
		    B_TRUE)) == 0) {
			dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
		}
		txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
	} else {
		mutex_exit(&os->os_upgrade_lock);
	}
}

/*
 * Sync every dirty dnode on `list`, moving each one onto the objset's
 * os_synced_dnodes list (with an extra hold) as it goes.
 */
static void
dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
{
	dnode_t *dn;

	while ((dn = multilist_sublist_head(list)) != NULL) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		ASSERT(dn->dn_dbuf->db_data_pending);
		/*
		 * Initialize dn_zio outside dnode_sync() because the
		 * meta-dnode needs to set it outside dnode_sync().
		 */
		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
		ASSERT(dn->dn_zio);

		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
		multilist_sublist_remove(list, dn);

		/*
		 * See the comment above dnode_rele_task() for an explanation
		 * of why this dnode hold is always needed (even when not
		 * doing user accounting).
		 */
		multilist_t *newlist = &dn->dn_objset->os_synced_dnodes;
		(void) dnode_add_ref(dn, newlist);
		multilist_insert(newlist, dn);

		dnode_sync(dn, tx);
	}
}

/*
 * arc_write() ready callback for the objset root block: recompute the root
 * block pointer's fill count and copy the bp into os_rootbp.
 */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	(void) abuf;
	blkptr_t *bp = zio->io_bp;
	objset_t *os = arg;
	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
	uint64_t fill = 0;

	ASSERT(!BP_IS_EMBEDDED(bp));
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
	ASSERT0(BP_GET_LEVEL(bp));

	/*
	 * Update rootbp fill count: it should be the number of objects
	 * allocated in the object set (not counting the "special"
	 * objects that are stored in the objset_phys_t -- the meta
	 * dnode and user/group/project accounting objects).
	 */
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		fill += BP_GET_FILL(&dnp->dn_blkptr[i]);

	BP_SET_FILL(bp, fill);

	if (os->os_dsl_dataset != NULL)
		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
	*os->os_rootbp = *bp;
	if (os->os_dsl_dataset != NULL)
		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
}

/*
 * arc_write() done callback for the objset root block: account the old and
 * new block pointers against the dataset (unless this was a rewrite), then
 * free the blkptr copy allocated in dmu_objset_sync().
 */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	(void) abuf;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	objset_t *os = arg;

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}
	kmem_free(bp, sizeof (*bp));
}

/* Per-objset state shared by the parallel dnode-sync tasks. */
typedef struct sync_objset_arg {
	zio_t *soa_zio;		/* root-block zio, nowaited last */
	objset_t *soa_os;	/* objset being synced */
	dmu_tx_t *soa_tx;	/* syncing transaction */
	kmutex_t soa_mutex;	/* protects soa_count */
	int soa_count;		/* outstanding sublist tasks */
	taskq_ent_t soa_tq_ent;	/* entry for sync_meta_dnode_task dispatch */
} sync_objset_arg_t;

/* Per-sublist argument for sync_dnodes_task(). */
typedef struct sync_dnodes_arg {
	multilist_t *sda_list;
	int sda_sublist_idx;
	multilist_t *sda_newlist;
	sync_objset_arg_t *sda_soa;
} sync_dnodes_arg_t;

static void sync_meta_dnode_task(void *arg);

/*
 * Sync one sublist of dirty dnodes.  The last task to finish dispatches
 * sync_meta_dnode_task() for the objset.
 */
static void
sync_dnodes_task(void *arg)
{
	sync_dnodes_arg_t *sda = arg;
	sync_objset_arg_t *soa = sda->sda_soa;
	objset_t *os = soa->soa_os;

	uint_t allocator = spa_acq_allocator(os->os_spa);
	multilist_sublist_t *ms =
	    multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);

	dmu_objset_sync_dnodes(ms, soa->soa_tx);

	multilist_sublist_unlock(ms);
	spa_rel_allocator(os->os_spa, allocator);

	kmem_free(sda, sizeof (*sda));

	mutex_enter(&soa->soa_mutex);
	ASSERT(soa->soa_count != 0);
	if (--soa->soa_count != 0) {
		mutex_exit(&soa->soa_mutex);
		return;
	}
	mutex_exit(&soa->soa_mutex);

	/* We were the last sublist; hand off to the meta-dnode stage. */
	taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
	    sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
}

/*
 * Issue the zio_nowait() for all dirty record zios on the meta dnode,
 * then trigger the callback for the zil_sync.  This runs once for each
 * objset, only after any/all sublists in the objset have been synced.
 */
static void
sync_meta_dnode_task(void *arg)
{
	sync_objset_arg_t *soa = arg;
	objset_t *os = soa->soa_os;
	dmu_tx_t *tx = soa->soa_tx;
	int txgoff = tx->tx_txg & TXG_MASK;
	dbuf_dirty_record_t *dr;

	ASSERT0(soa->soa_count);

	list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
	while ((dr = list_remove_head(list)) != NULL) {
		ASSERT0(dr->dr_dbuf->db_level);
		zio_nowait(dr->dr_zio);
	}

	/* Enable dnode backfill if enough objects have been freed.
	 */
	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
		os->os_rescan_dnodes = B_TRUE;
		os->os_freed_dnodes = 0;
	}

	/*
	 * Free intent log blocks up to this tx.
	 */
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	zio_nowait(soa->soa_zio);

	mutex_destroy(&soa->soa_mutex);
	kmem_free(soa, sizeof (*soa));
}

/*
 * called from dsl
 *
 * Write out this objset for the given txg: issue the root-block arc_write,
 * sync the special dnodes, and fan the dirty dnode sublists out to
 * dp_sync_taskq (sync_dnodes_task / sync_meta_dnode_task finish the job
 * asynchronously; `pio` is the parent of the root-block write).
 */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int num_sublists;
	multilist_t *ml;
	/* Freed by dmu_objset_write_done() once the root block is written. */
	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
	*blkptr_copy = *os->os_rootbp;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", (u_longlong_t)tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	arc_release(os->os_phys_buf, &os->os_phys_buf);

	dmu_write_policy(os, NULL, 0, 0, &zp);

	/*
	 * If we are either claiming the ZIL or doing a raw receive, write
	 * out the os_phys_buf raw.  Neither of these actions will effect the
	 * MAC at this point.
	 */
	if (os->os_raw_receive ||
	    os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
		ASSERT(os->os_encrypted);
		arc_convert_to_raw(os->os_phys_buf,
		    os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
		    DMU_OT_OBJSET, NULL, NULL, NULL);
	}

	zio = arc_write(pio, os->os_spa, tx->tx_txg,
	    blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
	    &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

	/*
	 * Sync special dnodes - the parent IO for the sync is the root block
	 */
	DMU_META_DNODE(os)->dn_zio = zio;
	dnode_sync(DMU_META_DNODE(os), tx);

	os->os_phys->os_flags = os->os_flags;

	if (DMU_USERUSED_DNODE(os) &&
	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_USERUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_USERUSED_DNODE(os), tx);
		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
	}

	if (DMU_PROJECTUSED_DNODE(os) &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_PROJECTUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_PROJECTUSED_DNODE(os), tx);
	}

	txgoff = tx->tx_txg & TXG_MASK;

	/*
	 * We must create the list here because it uses the
	 * dn_dirty_link[] of this txg.  But it may already
	 * exist because we call dsl_dataset_sync() twice per txg.
	 */
	if (os->os_synced_dnodes.ml_sublists == NULL) {
		multilist_create(&os->os_synced_dnodes, sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[txgoff]),
		    dnode_multilist_index_func);
	} else {
		ASSERT3U(os->os_synced_dnodes.ml_offset, ==,
		    offsetof(dnode_t, dn_dirty_link[txgoff]));
	}

	/*
	 * zio_nowait(zio) is done after any/all sublist and meta dnode
	 * zios have been nowaited, and the zil_sync() has been performed.
	 * The soa is freed at the end of sync_meta_dnode_task.
	 */
	sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
	soa->soa_zio = zio;
	soa->soa_os = os;
	soa->soa_tx = tx;
	taskq_init_ent(&soa->soa_tq_ent);
	mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL);

	ml = &os->os_dirty_dnodes[txgoff];
	soa->soa_count = num_sublists = multilist_get_num_sublists(ml);

	/* Only count sublists that actually have work. */
	for (int i = 0; i < num_sublists; i++) {
		if (multilist_sublist_is_empty_idx(ml, i))
			soa->soa_count--;
	}

	if (soa->soa_count == 0) {
		taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
		    sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
	} else {
		/*
		 * Sync sublists in parallel.  The last to finish
		 * (i.e., when soa->soa_count reaches zero) must
		 * dispatch sync_meta_dnode_task.
		 */
		for (int i = 0; i < num_sublists; i++) {
			if (multilist_sublist_is_empty_idx(ml, i))
				continue;
			sync_dnodes_arg_t *sda =
			    kmem_alloc(sizeof (*sda), KM_SLEEP);
			sda->sda_list = ml;
			sda->sda_sublist_idx = i;
			sda->sda_soa = soa;
			(void) taskq_dispatch(
			    dmu_objset_pool(os)->dp_sync_taskq,
			    sync_dnodes_task, sda, 0);
			/* sync_dnodes_task frees sda */
		}
	}
}

/* Return B_TRUE if the objset has dirty dnodes in the given txg. */
boolean_t
dmu_objset_is_dirty(objset_t *os, uint64_t txg)
{
	return (!multilist_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]));
}

/* Per-objset-type callbacks for extracting file info from a bonus buffer. */
static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES];

void
dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb)
{
	file_cbs[ost] = cb;
}

/*
 * Extract uid/gid/project info from a bonus buffer via the registered
 * per-type callback; EINVAL if no callback is registered for this type.
 */
int
dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data,
    zfs_file_info_t *zfi)
{
	file_info_cb_t *cb = file_cbs[os->os_phys->os_type];
	if (cb == NULL)
		return (EINVAL);
	return (cb(bonustype, data, zfi));
}

/* Is user/group space accounting possible for this objset? */
boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
	    file_cbs[os->os_phys->os_type] != NULL &&
	    DMU_USERUSED_DNODE(os) != NULL);
}

/* Is per-user/group object-count accounting possible for this objset? */
boolean_t
dmu_objset_userobjused_enabled(objset_t *os)
{
	return (dmu_objset_userused_enabled(os) &&
	    spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
}

/* Is project quota accounting possible for this objset? */
boolean_t
dmu_objset_projectquota_enabled(objset_t *os)
{
	return (file_cbs[os->os_phys->os_type] != NULL &&
	    DMU_PROJECTUSED_DNODE(os) != NULL &&
	    spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA));
}

typedef struct userquota_node {
	/* must be in the first filed, see userquota_update_cache() */
	char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
	int64_t uqn_delta;
	avl_node_t uqn_node;
} userquota_node_t;

typedef struct userquota_cache
	{
	avl_tree_t uqc_user_deltas;	/* per-user space/object deltas */
	avl_tree_t uqc_group_deltas;	/* per-group space/object deltas */
	avl_tree_t uqc_project_deltas;	/* per-project space/object deltas */
} userquota_cache_t;

/* AVL comparator for userquota_node_t, ordering by the uqn_id string. */
static int
userquota_compare(const void *l, const void *r)
{
	const userquota_node_t *luqn = l;
	const userquota_node_t *ruqn = r;
	int rv;

	/*
	 * NB: can only access uqn_id because userquota_update_cache() doesn't
	 * pass in an entire userquota_node_t.
	 */
	rv = strcmp(luqn->uqn_id, ruqn->uqn_id);

	return (TREE_ISIGN(rv));
}

/*
 * Apply all accumulated deltas in `cache` to the on-disk accounting ZAPs,
 * freeing each node and destroying the trees as it goes.
 */
static void
do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
{
	void *cookie;
	userquota_node_t *uqn;

	ASSERT(dmu_tx_is_syncing(tx));

	cookie = NULL;
	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
	    &cookie)) != NULL) {
		/*
		 * os_userused_lock protects against concurrent calls to
		 * zap_increment_int().  It's needed because zap_increment_int()
		 * is not thread-safe (i.e. not atomic).
		 */
		mutex_enter(&os->os_userused_lock);
		VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
		    uqn->uqn_id, uqn->uqn_delta, tx));
		mutex_exit(&os->os_userused_lock);
		kmem_free(uqn, sizeof (*uqn));
	}
	avl_destroy(&cache->uqc_user_deltas);

	cookie = NULL;
	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
	    &cookie)) != NULL) {
		mutex_enter(&os->os_userused_lock);
		VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
		    uqn->uqn_id, uqn->uqn_delta, tx));
		mutex_exit(&os->os_userused_lock);
		kmem_free(uqn, sizeof (*uqn));
	}
	avl_destroy(&cache->uqc_group_deltas);

	if (dmu_objset_projectquota_enabled(os)) {
		cookie = NULL;
		while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas,
		    &cookie)) != NULL) {
			mutex_enter(&os->os_userused_lock);
			VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT,
			    uqn->uqn_id, uqn->uqn_delta, tx));
			mutex_exit(&os->os_userused_lock);
			kmem_free(uqn, sizeof (*uqn));
		}
		avl_destroy(&cache->uqc_project_deltas);
	}
}

/* Add `delta` to the cached entry for `id`, creating the entry on demand. */
static void
userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
{
	userquota_node_t *uqn;
	avl_index_t idx;

	ASSERT(strlen(id) < sizeof (uqn->uqn_id));
	/*
	 * Use id directly for searching because uqn_id is the first field of
	 * userquota_node_t and fields after uqn_id won't be accessed in
	 * avl_find().
	 */
	uqn = avl_find(avl, (const void *)id, &idx);
	if (uqn == NULL) {
		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
		strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id));
		avl_insert(avl, uqn, idx);
	}
	uqn->uqn_delta += delta;
}

/*
 * Accumulate a space-usage delta (dnode overhead + `used` bytes, negated if
 * `subtract`) for the dnode's user, group and (if enabled) project.
 */
static void
do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used,
    uint64_t flags, uint64_t user, uint64_t group, uint64_t project,
    boolean_t subtract)
{
	if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) {
		int64_t delta = DNODE_MIN_SIZE + used;
		char name[20];

		if (subtract)
			delta = -delta;

		(void) snprintf(name, sizeof (name), "%llx", (longlong_t)user);
		userquota_update_cache(&cache->uqc_user_deltas, name, delta);

		(void) snprintf(name, sizeof (name), "%llx", (longlong_t)group);
		userquota_update_cache(&cache->uqc_group_deltas, name, delta);

		if (dmu_objset_projectquota_enabled(os)) {
			(void) snprintf(name, sizeof (name), "%llx",
			    (longlong_t)project);
			userquota_update_cache(&cache->uqc_project_deltas,
			    name, delta);
		}
	}
}

/*
 * Accumulate an object-count delta (+/-1) for the dnode's user, group and
 * (if enabled) project, keyed with the DMU_OBJACCT_PREFIX.
 */
static void
do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags,
    uint64_t user, uint64_t group, uint64_t project, boolean_t subtract)
{
	if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
		char name[20 + DMU_OBJACCT_PREFIX_LEN];
		int delta = subtract ? -1 : 1;

		(void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
		    (longlong_t)user);
		userquota_update_cache(&cache->uqc_user_deltas, name, delta);

		(void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
		    (longlong_t)group);
		userquota_update_cache(&cache->uqc_group_deltas, name, delta);

		if (dmu_objset_projectquota_enabled(os)) {
			(void) snprintf(name, sizeof (name),
			    DMU_OBJACCT_PREFIX "%llx", (longlong_t)project);
			userquota_update_cache(&cache->uqc_project_deltas,
			    name, delta);
		}
	}
}

/* Argument for userquota_updates_task() / dnode_rele_task(). */
typedef struct userquota_updates_arg {
	objset_t *uua_os;
	int uua_sublist_idx;
	dmu_tx_t *uua_tx;
} userquota_updates_arg_t;

/*
 * For one sublist of os_synced_dnodes: fold each dnode's old/new ownership
 * into a local delta cache, roll new ids over to old, drop the dirty-count
 * and the hold from dmu_objset_sync_dnodes(), then flush the cache to the
 * accounting ZAPs.
 */
static void
userquota_updates_task(void *arg)
{
	userquota_updates_arg_t *uua = arg;
	objset_t *os = uua->uua_os;
	dmu_tx_t *tx = uua->uua_tx;
	dnode_t *dn;
	userquota_cache_t cache = { { 0 } };

	multilist_sublist_t *list = multilist_sublist_lock_idx(
	    &os->os_synced_dnodes, uua->uua_sublist_idx);

	ASSERT(multilist_sublist_head(list) == NULL ||
	    dmu_objset_userused_enabled(os));
	avl_create(&cache.uqc_user_deltas, userquota_compare,
	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
	avl_create(&cache.uqc_group_deltas, userquota_compare,
	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
	if (dmu_objset_projectquota_enabled(os))
		avl_create(&cache.uqc_project_deltas, userquota_compare,
		    sizeof (userquota_node_t), offsetof(userquota_node_t,
		    uqn_node));

	while ((dn = multilist_sublist_head(list)) != NULL) {
		int flags;
		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
		    dn->dn_phys->dn_flags &
		    DNODE_FLAG_USERUSED_ACCOUNTED);

		flags = dn->dn_id_flags;
		ASSERT(flags);
		if (flags & DN_ID_OLD_EXIST) {
			do_userquota_update(os, &cache, dn->dn_oldused,
			    dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
			    dn->dn_oldprojid,
			    B_TRUE);
			do_userobjquota_update(os, &cache, dn->dn_oldflags,
			    dn->dn_olduid, dn->dn_oldgid,
			    dn->dn_oldprojid, B_TRUE);
		}
		if (flags & DN_ID_NEW_EXIST) {
			do_userquota_update(os, &cache,
			    DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
			    dn->dn_newuid, dn->dn_newgid,
			    dn->dn_newprojid, B_FALSE);
			do_userobjquota_update(os, &cache,
			    dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid,
			    dn->dn_newprojid, B_FALSE);
		}

		/* Roll the "new" ids over to become the "old" ids. */
		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = 0;
		dn->dn_oldflags = 0;
		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
			dn->dn_olduid = dn->dn_newuid;
			dn->dn_oldgid = dn->dn_newgid;
			dn->dn_oldprojid = dn->dn_newprojid;
			dn->dn_id_flags |= DN_ID_OLD_EXIST;
			if (dn->dn_bonuslen == 0)
				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
			else
				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		}
		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
		ASSERT3U(dn->dn_dirtycnt, >, 0);
		dn->dn_dirtycnt--;
		mutex_exit(&dn->dn_mtx);

		multilist_sublist_remove(list, dn);
		dnode_rele(dn, &os->os_synced_dnodes);
	}
	do_userquota_cacheflush(os, &cache, tx);
	multilist_sublist_unlock(list);
	kmem_free(uua, sizeof (*uua));
}

/*
 * Release dnode holds from dmu_objset_sync_dnodes().  When the dnode is being
 * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be
 * evicted because the block containing the dnode can't be evicted until it is
 * written out.  However, this hold is necessary to prevent the dnode_t from
 * being moved (via dnode_move()) while it's still referenced by
 * dbuf_dirty_record_t:dr_dnode.  And dr_dnode is needed for
 * dirty_lightweight_leaf-type dirty records.
 *
 * If we are doing user-object accounting, the dnode_rele() happens from
 * userquota_updates_task() instead.
 */
static void
dnode_rele_task(void *arg)
{
	userquota_updates_arg_t *uua = arg;
	objset_t *os = uua->uua_os;

	multilist_sublist_t *list = multilist_sublist_lock_idx(
	    &os->os_synced_dnodes, uua->uua_sublist_idx);

	dnode_t *dn;
	while ((dn = multilist_sublist_head(list)) != NULL) {
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_dirtycnt, >, 0);
		dn->dn_dirtycnt--;
		mutex_exit(&dn->dn_mtx);
		multilist_sublist_remove(list, dn);
		dnode_rele(dn, &os->os_synced_dnodes);
	}
	multilist_sublist_unlock(list);
	kmem_free(uua, sizeof (*uua));
}

/*
 * Return TRUE if userquota updates are needed.
 */
static boolean_t
dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
{
	if (!dmu_objset_userused_enabled(os))
		return (B_FALSE);

	/*
	 * If this is a raw receive just return and handle accounting
	 * later when we have the keys loaded.  We also don't do user
	 * accounting during claiming since the datasets are not owned
	 * for the duration of claiming and this txg should only be
	 * used for recovery.
	 */
	if (os->os_encrypted && dmu_objset_is_receiving(os))
		return (B_FALSE);

	if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
		return (B_FALSE);

	/* Allocate the user/group/project used objects if necessary. */
	if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
		VERIFY0(zap_create_claim(os,
		    DMU_USERUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
		VERIFY0(zap_create_claim(os,
		    DMU_GROUPUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
	}

	if (dmu_objset_projectquota_enabled(os) &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
		VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
	}
	return (B_TRUE);
}

/*
 * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and
 * also release the holds on the dnodes from dmu_objset_sync_dnodes().
 * The caller must taskq_wait(dp_sync_taskq).
 */
void
dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
{
	boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx);

	int num_sublists = multilist_get_num_sublists(&os->os_synced_dnodes);
	for (int i = 0; i < num_sublists; i++) {
		userquota_updates_arg_t *uua =
		    kmem_alloc(sizeof (*uua), KM_SLEEP);
		uua->uua_os = os;
		uua->uua_sublist_idx = i;
		uua->uua_tx = tx;

		/*
		 * If we don't need to update userquotas, use
		 * dnode_rele_task() to call dnode_rele()
		 */
		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
		    need_userquota ? userquota_updates_task : dnode_rele_task,
		    uua, 0);
		/* callback frees uua */
	}
}


/*
 * Returns a pointer to data to find uid/gid from
 *
 * If a dirty record for transaction group that is syncing can't
 * be found then NULL is returned.
In the NULL case it is assumed2161* the uid/gid aren't changing.2162*/2163static void *2164dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)2165{2166dbuf_dirty_record_t *dr;2167void *data;21682169if (db->db_dirtycnt == 0) {2170ASSERT(MUTEX_HELD(&db->db_mtx));2171return (db->db.db_data); /* Nothing is changing */2172}21732174dr = dbuf_find_dirty_eq(db, tx->tx_txg);21752176if (dr == NULL) {2177data = NULL;2178} else {2179if (dr->dr_dnode->dn_bonuslen == 0 &&2180dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)2181data = dr->dt.dl.dr_data->b_data;2182else2183data = dr->dt.dl.dr_data;2184}21852186return (data);2187}21882189void2190dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)2191{2192objset_t *os = dn->dn_objset;2193void *data = NULL;2194dmu_buf_impl_t *db = NULL;2195int flags = dn->dn_id_flags;2196int error;2197boolean_t have_spill = B_FALSE;21982199if (!dmu_objset_userused_enabled(dn->dn_objset))2200return;22012202/*2203* Raw receives introduce a problem with user accounting. Raw2204* receives cannot update the user accounting info because the2205* user ids and the sizes are encrypted. To guarantee that we2206* never end up with bad user accounting, we simply disable it2207* during raw receives. 
We also disable this for normal receives2208* so that an incremental raw receive may be done on top of an2209* existing non-raw receive.2210*/2211if (os->os_encrypted && dmu_objset_is_receiving(os))2212return;22132214if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|2215DN_ID_CHKED_SPILL)))2216return;22172218if (before && dn->dn_bonuslen != 0)2219data = DN_BONUS(dn->dn_phys);2220else if (!before && dn->dn_bonuslen != 0) {2221if (dn->dn_bonus) {2222db = dn->dn_bonus;2223mutex_enter(&db->db_mtx);2224data = dmu_objset_userquota_find_data(db, tx);2225} else {2226data = DN_BONUS(dn->dn_phys);2227}2228} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {2229dmu_flags_t rf = DB_RF_MUST_SUCCEED;22302231if (RW_WRITE_HELD(&dn->dn_struct_rwlock))2232rf |= DB_RF_HAVESTRUCT;2233error = dmu_spill_hold_by_dnode(dn, rf,2234FTAG, (dmu_buf_t **)&db);2235ASSERT0(error);2236mutex_enter(&db->db_mtx);2237data = (before) ? db->db.db_data :2238dmu_objset_userquota_find_data(db, tx);2239have_spill = B_TRUE;2240} else {2241mutex_enter(&dn->dn_mtx);2242dn->dn_id_flags |= DN_ID_CHKED_BONUS;2243mutex_exit(&dn->dn_mtx);2244return;2245}22462247/*2248* Must always call the callback in case the object2249* type has changed and that type isn't an object type to track2250*/2251zfs_file_info_t zfi;2252error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);22532254if (before) {2255ASSERT(data);2256dn->dn_olduid = zfi.zfi_user;2257dn->dn_oldgid = zfi.zfi_group;2258dn->dn_oldprojid = zfi.zfi_project;2259} else if (data) {2260dn->dn_newuid = zfi.zfi_user;2261dn->dn_newgid = zfi.zfi_group;2262dn->dn_newprojid = zfi.zfi_project;2263}22642265/*2266* Preserve existing uid/gid when the callback can't determine2267* what the new uid/gid are and the callback returned EEXIST.2268* The EEXIST error tells us to just use the existing uid/gid.2269* If we don't know what the old values are then just assign2270* them to 0, since that is a new file being created.2271*/2272if (!before 
&& data == NULL && error == EEXIST) {2273if (flags & DN_ID_OLD_EXIST) {2274dn->dn_newuid = dn->dn_olduid;2275dn->dn_newgid = dn->dn_oldgid;2276dn->dn_newprojid = dn->dn_oldprojid;2277} else {2278dn->dn_newuid = 0;2279dn->dn_newgid = 0;2280dn->dn_newprojid = ZFS_DEFAULT_PROJID;2281}2282error = 0;2283}22842285if (db)2286mutex_exit(&db->db_mtx);22872288mutex_enter(&dn->dn_mtx);2289if (error == 0 && before)2290dn->dn_id_flags |= DN_ID_OLD_EXIST;2291if (error == 0 && !before)2292dn->dn_id_flags |= DN_ID_NEW_EXIST;22932294if (have_spill) {2295dn->dn_id_flags |= DN_ID_CHKED_SPILL;2296} else {2297dn->dn_id_flags |= DN_ID_CHKED_BONUS;2298}2299mutex_exit(&dn->dn_mtx);2300if (have_spill)2301dmu_buf_rele((dmu_buf_t *)db, FTAG);2302}23032304boolean_t2305dmu_objset_userspace_present(objset_t *os)2306{2307return (os->os_phys->os_flags &2308OBJSET_FLAG_USERACCOUNTING_COMPLETE);2309}23102311boolean_t2312dmu_objset_userobjspace_present(objset_t *os)2313{2314return (os->os_phys->os_flags &2315OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);2316}23172318boolean_t2319dmu_objset_projectquota_present(objset_t *os)2320{2321return (os->os_phys->os_flags &2322OBJSET_FLAG_PROJECTQUOTA_COMPLETE);2323}23242325static int2326dmu_objset_space_upgrade(objset_t *os)2327{2328uint64_t obj;2329int err = 0;23302331/*2332* We simply need to mark every object dirty, so that it will be2333* synced out and now accounted. 
If this is called
	 * concurrently, or if we already did some work before crashing,
	 * that's fine, since we track each object's accounted state
	 * independently.
	 */

	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
		dmu_tx_t *tx;
		dmu_buf_t *db;
		int objerr;

		/* Bail out promptly if an upgrade cancel was requested. */
		mutex_enter(&os->os_upgrade_lock);
		if (os->os_upgrade_exit)
			err = SET_ERROR(EINTR);
		mutex_exit(&os->os_upgrade_lock);
		if (err != 0)
			return (err);

		if (issig())
			return (SET_ERROR(EINTR));

		/* Skip objects we can't hold; best-effort per object. */
		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
		if (objerr != 0)
			continue;
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, obj);
		objerr = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (objerr != 0) {
			dmu_buf_rele(db, FTAG);
			dmu_tx_abort(tx);
			continue;
		}
		dmu_buf_will_dirty(db, tx);
		dmu_buf_rele(db, FTAG);
		dmu_tx_commit(tx);
	}
	/* Loop exits when dmu_object_next() reports no more objects. */
	return (0);
}

/*
 * Upgrade callback: regenerate per-user/group space accounting.
 * Only valid on writable (non-snapshot) objsets with userused enabled.
 */
static int
dmu_objset_userspace_upgrade_cb(objset_t *os)
{
	int err = 0;

	if (dmu_objset_userspace_present(os))
		return (0);
	if (dmu_objset_is_snapshot(os))
		return (SET_ERROR(EINVAL));
	if (!dmu_objset_userused_enabled(os))
		return (SET_ERROR(ENOTSUP));

	err = dmu_objset_space_upgrade(os);
	if (err)
		return (err);

	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

/* Kick off an asynchronous userspace-accounting upgrade for this objset. */
void
dmu_objset_userspace_upgrade(objset_t *os)
{
	dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb);
}

/*
 * Upgrade callback: regenerate user-object and project quota accounting.
 */
static int
dmu_objset_id_quota_upgrade_cb(objset_t *os)
{
	int err = 0;

	if (dmu_objset_userobjspace_present(os) &&
	    dmu_objset_projectquota_present(os))
		return (0);
	if (dmu_objset_is_snapshot(os))
		return (SET_ERROR(EINVAL));
	if (!dmu_objset_userused_enabled(os))
		return (SET_ERROR(ENOTSUP));
	if (!dmu_objset_projectquota_enabled(os) &&
	    dmu_objset_userobjspace_present(os))
		return (SET_ERROR(ENOTSUP));

	if
 (dmu_objset_userobjused_enabled(os))
		dmu_objset_ds(os)->ds_feature_activation[
		    SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
	if (dmu_objset_projectquota_enabled(os))
		dmu_objset_ds(os)->ds_feature_activation[
		    SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;

	err = dmu_objset_space_upgrade(os);
	if (err)
		return (err);

	/* Mark each enabled accounting type complete, then wait for sync. */
	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	if (dmu_objset_userobjused_enabled(os))
		os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
	if (dmu_objset_projectquota_enabled(os))
		os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

/* Kick off an asynchronous userobj/project quota upgrade for this objset. */
void
dmu_objset_id_quota_upgrade(objset_t *os)
{
	dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb);
}

/* Can this objset's user-object accounting be upgraded right now? */
boolean_t
dmu_objset_userobjspace_upgradable(objset_t *os)
{
	return (dmu_objset_type(os) == DMU_OST_ZFS &&
	    !dmu_objset_is_snapshot(os) &&
	    dmu_objset_userobjused_enabled(os) &&
	    !dmu_objset_userobjspace_present(os) &&
	    spa_writeable(dmu_objset_spa(os)));
}

/* Can this objset's project quota accounting be upgraded right now? */
boolean_t
dmu_objset_projectquota_upgradable(objset_t *os)
{
	return (dmu_objset_type(os) == DMU_OST_ZFS &&
	    !dmu_objset_is_snapshot(os) &&
	    dmu_objset_projectquota_enabled(os) &&
	    !dmu_objset_projectquota_present(os) &&
	    spa_writeable(dmu_objset_spa(os)));
}

/* Report space usage/availability for the objset's dataset. */
void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
	    usedobjsp, availobjsp);
}

/* Stable filesystem identifier for the objset's dataset. */
uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}

/* Fill in cheap-to-compute stats (no nvlist allocation). */
void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
	stat->dds_type = os->os_phys->os_type;
	if (os->os_dsl_dataset)
		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}

/* Add objset statistics and properties to the caller-supplied nvlist. */
void
dmu_objset_stats(objset_t *os, nvlist_t
 *nv)
{
	/* Only the MOS may legitimately lack a dsl_dataset. */
	ASSERT(os->os_dsl_dataset ||
	    os->os_phys->os_type == DMU_OST_META);

	if (os->os_dsl_dataset != NULL)
		dsl_dataset_stats(os->os_dsl_dataset, nv);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
	    os->os_phys->os_type);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
	    dmu_objset_userspace_present(os));
}

/* Nonzero if the objset is a snapshot (false for the MOS). */
int
dmu_objset_is_snapshot(objset_t *os)
{
	if (os->os_dsl_dataset != NULL)
		return (os->os_dsl_dataset->ds_is_snapshot);
	else
		return (B_FALSE);
}

/*
 * Look up a snapshot name with normalization, returning the on-disk
 * ("real") spelling in 'real' and any case conflict in 'conflict'.
 */
int
dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen,
    boolean_t *conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	uint64_t ignored;

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
	    MT_NORMALIZE, real, maxlen, conflict));
}

/*
 * Iterate the snapshot name ZAP: return the next snapshot name (and
 * optionally its object id / case-conflict flag) after cursor *offp,
 * advancing *offp.  Returns ENOENT when exhausted.
 */
int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	zap_cursor_t cursor;
	zap_attribute_t *attr;

	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	attr = zap_attribute_alloc();
	zap_cursor_init_serialized(&cursor,
	    ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, attr) != 0) {
		zap_cursor_fini(&cursor);
		zap_attribute_free(attr);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr->za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		zap_attribute_free(attr);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strlcpy(name, attr->za_name, namelen);
	if (idp)
		*idp = attr->za_first_integer;
	if (case_conflict)
		*case_conflict =
 attr->za_normalization_conflict;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);
	zap_attribute_free(attr);

	return (0);
}

/* Resolve a snapshot name to its dataset object number. */
int
dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
{
	return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
}

/*
 * Iterate the child-directory ZAP: return the next child name (and
 * optionally its object id) after cursor *offp, advancing *offp.
 * Returns ENOENT when exhausted or when called on a snapshot.
 */
int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp)
{
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
	zap_cursor_t cursor;
	zap_attribute_t *attr;

	/* there is no next dir on a snapshot! */
	if (os->os_dsl_dataset->ds_object !=
	    dsl_dir_phys(dd)->dd_head_dataset_obj)
		return (SET_ERROR(ENOENT));

	attr = zap_attribute_alloc();
	zap_cursor_init_serialized(&cursor,
	    dd->dd_pool->dp_meta_objset,
	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, attr) != 0) {
		zap_cursor_fini(&cursor);
		zap_attribute_free(attr);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr->za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		zap_attribute_free(attr);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strlcpy(name, attr->za_name, namelen);
	if (idp)
		*idp = attr->za_first_integer;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);
	zap_attribute_free(attr);

	return (0);
}

/*
 * Per-task context for the (possibly parallel) dmu_objset_find_dp()
 * traversal.  One instance is allocated per visited dsl_dir; the task
 * that processes it frees it.
 */
typedef struct dmu_objset_find_ctx {
	taskq_t *dc_tq;		/* NULL => run synchronously */
	dsl_pool_t *dc_dp;
	uint64_t dc_ddobj;
	char *dc_ddname; /* last component of ddobj's name */
	int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
	void *dc_arg;
	int dc_flags;
	kmutex_t *dc_error_lock;	/* protects *dc_error */
	int *dc_error;			/* first error encountered */
} dmu_objset_find_ctx_t;

/*
 * Visit one dsl_dir: recurse into children (dispatching to the taskq
 * when available), visit snapshots, then apply dc_func to the dir's
 * head dataset.  Consumes and frees dcp.
 */
static void
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
	dsl_pool_t *dp = dcp->dc_dp;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	uint64_t thisobj;
	int err =
 0;

	/* don't process if there already was an error */
	if (*dcp->dc_error != 0)
		goto out;

	/*
	 * Note: passing the name (dc_ddname) here is optional, but it
	 * improves performance because we don't need to call
	 * zap_value_search() to determine the name.
	 */
	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
	if (err != 0)
		goto out;

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_rele(dd, FTAG);
		goto out;
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	attr = zap_attribute_alloc();

	/*
	 * Iterate over all children.
	 */
	if (dcp->dc_flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT3U(attr->za_integer_length, ==,
			    sizeof (uint64_t));
			ASSERT3U(attr->za_num_integers, ==, 1);

			/* Child context is freed by whoever processes it. */
			dmu_objset_find_ctx_t *child_dcp =
			    kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
			*child_dcp = *dcp;
			child_dcp->dc_ddobj = attr->za_first_integer;
			child_dcp->dc_ddname = spa_strdup(attr->za_name);
			if (dcp->dc_tq != NULL)
				(void) taskq_dispatch(dcp->dc_tq,
				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
			else
				dmu_objset_find_dp_impl(child_dcp);
		}
		zap_cursor_fini(&zc);
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
		dsl_dataset_t *ds;
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

		if (err == 0) {
			uint64_t snapobj;

			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				err = dsl_dataset_hold_obj(dp,
				    attr->za_first_integer, FTAG, &ds);
				if (err != 0)
					break;
				err = dcp->dc_func(dp, ds, dcp->dc_arg);
				dsl_dataset_rele(ds, FTAG);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	zap_attribute_free(attr);

	if (err != 0) {
		dsl_dir_rele(dd, FTAG);
		goto out;
	}

	/*
	 * Apply to self.
	 */
	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

	/*
	 * Note: we hold the dir while calling dsl_dataset_hold_obj() so
	 * that the dir will remain cached, and we won't have to re-instantiate
	 * it (which could be expensive due to finding its name via
	 * zap_value_search()).
	 */
	dsl_dir_rele(dd, FTAG);
	if (err != 0)
		goto out;
	err = dcp->dc_func(dp, ds, dcp->dc_arg);
	dsl_dataset_rele(ds, FTAG);

out:
	if (err != 0) {
		mutex_enter(dcp->dc_error_lock);
		/* only keep first error */
		if (*dcp->dc_error == 0)
			*dcp->dc_error = err;
		mutex_exit(dcp->dc_error_lock);
	}

	if (dcp->dc_ddname != NULL)
		spa_strfree(dcp->dc_ddname);
	kmem_free(dcp, sizeof (*dcp));
}

/* Taskq wrapper: take the pool config lock, then run the traversal. */
static void
dmu_objset_find_dp_cb(void *arg)
{
	dmu_objset_find_ctx_t *dcp = arg;
	dsl_pool_t *dp = dcp->dc_dp;

	/*
	 * We need to get a pool_config_lock here, as there are several
	 * assert(pool_config_held) down the stack. Getting a lock via
	 * dsl_pool_config_enter is risky, as it might be stalled by a
	 * pending writer.
This would deadlock, as the write lock can
	 * only be granted when our parent thread gives up the lock.
	 * The _prio interface gives us priority over a pending writer.
	 */
	dsl_pool_config_enter_prio(dp, FTAG);

	dmu_objset_find_dp_impl(dcp);

	dsl_pool_config_exit(dp, FTAG);
}

/*
 * Find objsets under and including ddobj, call func(ds) on each.
 * The order for the enumeration is completely undefined.
 * func is called with dsl_pool_config held.
 */
int
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
{
	int error = 0;
	taskq_t *tq = NULL;
	int ntasks;
	dmu_objset_find_ctx_t *dcp;
	kmutex_t err_lock;

	/* Root context; shares the error slot/lock with all child tasks. */
	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
	dcp->dc_tq = NULL;
	dcp->dc_dp = dp;
	dcp->dc_ddobj = ddobj;
	dcp->dc_ddname = NULL;
	dcp->dc_func = func;
	dcp->dc_arg = arg;
	dcp->dc_flags = flags;
	dcp->dc_error_lock = &err_lock;
	dcp->dc_error = &error;

	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
		/*
		 * In case a write lock is held we can't make use of
		 * parallelism, as down the stack of the worker threads
		 * the lock is asserted via dsl_pool_config_held.
		 * In case of a read lock this is solved by getting a read
		 * lock in each worker thread, which isn't possible in case
		 * of a writer lock. So we fall back to the synchronous path
		 * here.
		 * In the future it might be possible to get some magic into
		 * dsl_pool_config_held in a way that it returns true for
		 * the worker threads so that a single lock held from this
		 * thread suffices.
For now, stay single threaded.
		 */
		dmu_objset_find_dp_impl(dcp);
		mutex_destroy(&err_lock);

		return (error);
	}

	ntasks = dmu_find_threads;
	if (ntasks == 0)
		ntasks = vdev_count_leaves(dp->dp_spa) * 4;
	tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks,
	    INT_MAX, 0);
	if (tq == NULL) {
		kmem_free(dcp, sizeof (*dcp));
		mutex_destroy(&err_lock);

		return (SET_ERROR(ENOMEM));
	}
	dcp->dc_tq = tq;

	/* dcp will be freed by task */
	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);

	/*
	 * PORTING: this code relies on the property of taskq_wait to wait
	 * until no more tasks are queued and no more tasks are active. As
	 * we always queue new tasks from within other tasks, task_wait
	 * reliably waits for the full recursion to finish, even though we
	 * enqueue new tasks after taskq_wait has been called.
	 * On platforms other than illumos, taskq_wait may not have this
	 * property.
	 */
	taskq_wait(tq);
	taskq_destroy(tq);
	mutex_destroy(&err_lock);

	return (error);
}

/*
 * Find all objsets under name, and for each, call 'func(child_name, arg)'.
 * The dp_config_rwlock must not be held when this is called, and it
 * will not be held when the callback is called.
 * Therefore this function should only be used when the pool is not changing
 * (e.g. in syncing context), or the callback can deal with the possible races.
 */
static int
dmu_objset_find_impl(spa_t *spa, const char *name,
    int func(const char *, void *), void *arg, int flags)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp = spa_get_dsl(spa);
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	char *child;
	uint64_t thisobj;
	int err;

	dsl_pool_config_enter(dp, FTAG);

	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
	if (err != 0) {
		dsl_pool_config_exit(dp, FTAG);
		return (err);
	}

	/* Don't visit hidden ($MOS & $ORIGIN) objsets.
 */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_rele(dd, FTAG);
		dsl_pool_config_exit(dp, FTAG);
		return (0);
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	attr = zap_attribute_alloc();

	/*
	 * Iterate over all children.
	 */
	if (flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT3U(attr->za_integer_length, ==,
			    sizeof (uint64_t));
			ASSERT3U(attr->za_num_integers, ==, 1);

			/* Drop the config lock across the recursive call. */
			child = kmem_asprintf("%s/%s", name, attr->za_name);
			dsl_pool_config_exit(dp, FTAG);
			err = dmu_objset_find_impl(spa, child,
			    func, arg, flags);
			dsl_pool_config_enter(dp, FTAG);
			kmem_strfree(child);
			if (err != 0)
				break;
		}
		zap_cursor_fini(&zc);

		if (err != 0) {
			dsl_dir_rele(dd, FTAG);
			dsl_pool_config_exit(dp, FTAG);
			zap_attribute_free(attr);
			return (err);
		}
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (flags & DS_FIND_SNAPSHOTS) {
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

		if (err == 0) {
			uint64_t snapobj;

			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				/* Callback runs without the config lock. */
				child = kmem_asprintf("%s@%s",
				    name, attr->za_name);
				dsl_pool_config_exit(dp, FTAG);
				err = func(child, arg);
				dsl_pool_config_enter(dp, FTAG);
				kmem_strfree(child);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	dsl_dir_rele(dd, FTAG);
	zap_attribute_free(attr);
	dsl_pool_config_exit(dp, FTAG);

	if (err != 0)
		return (err);

	/* Apply to self.
 */
	return (func(name, arg));
}

/*
 * See comment above dmu_objset_find_impl().
 */
int
dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
    int flags)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, FTAG);
	if (error != 0)
		return (error);
	error = dmu_objset_find_impl(spa, name, func, arg, flags);
	spa_close(spa, FTAG);
	return (error);
}

/* True if the dataset uses an encryption version we cannot mount. */
boolean_t
dmu_objset_incompatible_encryption_version(objset_t *os)
{
	return (dsl_dir_incompatible_encryption_version(
	    os->os_dsl_dataset->ds_dir));
}

/* Set the opaque per-consumer pointer; os_user_ptr_lock must be held. */
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	os->os_user_ptr = user_ptr;
}

/* Get the opaque per-consumer pointer; os_user_ptr_lock must be held. */
void *
dmu_objset_get_user(objset_t *os)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	return (os->os_user_ptr);
}

/*
 * Determine name of filesystem, given name of snapshot.
 * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
 */
int
dmu_fsname(const char *snapname, char *buf)
{
	char *atp = strchr(snapname, '@');
	if (atp == NULL)
		return (SET_ERROR(EINVAL));
	if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));
	/* Copy only the portion before the '@' (strlcpy NUL-terminates). */
	(void) strlcpy(buf, snapname, atp - snapname + 1);
	return (0);
}

/*
 * Call when we think we're going to write/free space in open context
 * to track the amount of dirty data in the open txg, which is also the
 * amount of memory that can not be evicted until this txg syncs.
 *
 * Note that there are two conditions where this can be called from
 * syncing context:
 *
 * [1] When we just created the dataset, in which case we go on with
 *     updating any accounting of dirty data as usual.
 * [2] When we are dirtying MOS data, in which case we only update the
 *     pool's accounting of dirty data.
 */
void
dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
{
	dsl_dataset_t
 *ds = os->os_dsl_dataset;
	/* Charge the worst-case allocated size, not the logical size. */
	int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);

	if (ds != NULL) {
		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
	}

	/* Pool-wide dirty accounting happens even for MOS (ds == NULL). */
	dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_objset_zil);
EXPORT_SYMBOL(dmu_objset_pool);
EXPORT_SYMBOL(dmu_objset_ds);
EXPORT_SYMBOL(dmu_objset_type);
EXPORT_SYMBOL(dmu_objset_name);
EXPORT_SYMBOL(dmu_objset_hold);
EXPORT_SYMBOL(dmu_objset_hold_flags);
EXPORT_SYMBOL(dmu_objset_own);
EXPORT_SYMBOL(dmu_objset_rele);
EXPORT_SYMBOL(dmu_objset_rele_flags);
EXPORT_SYMBOL(dmu_objset_disown);
EXPORT_SYMBOL(dmu_objset_from_ds);
EXPORT_SYMBOL(dmu_objset_create);
EXPORT_SYMBOL(dmu_objset_stats);
EXPORT_SYMBOL(dmu_objset_fast_stat);
EXPORT_SYMBOL(dmu_objset_spa);
EXPORT_SYMBOL(dmu_objset_space);
EXPORT_SYMBOL(dmu_objset_fsid_guid);
EXPORT_SYMBOL(dmu_objset_find);
EXPORT_SYMBOL(dmu_objset_byteswap);
EXPORT_SYMBOL(dmu_objset_evict_dbufs);
EXPORT_SYMBOL(dmu_objset_snap_cmtime);
EXPORT_SYMBOL(dmu_objset_dnodesize);

EXPORT_SYMBOL(dmu_objset_sync);
EXPORT_SYMBOL(dmu_objset_is_dirty);
EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
EXPORT_SYMBOL(dmu_objset_create_impl);
EXPORT_SYMBOL(dmu_objset_open_impl);
EXPORT_SYMBOL(dmu_objset_evict);
EXPORT_SYMBOL(dmu_objset_register_type);
EXPORT_SYMBOL(dmu_objset_sync_done);
EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
EXPORT_SYMBOL(dmu_objset_userused_enabled);
EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
EXPORT_SYMBOL(dmu_objset_userspace_present);
EXPORT_SYMBOL(dmu_objset_userobjused_enabled);
EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable);
EXPORT_SYMBOL(dmu_objset_userobjspace_present);
EXPORT_SYMBOL(dmu_objset_projectquota_enabled);
EXPORT_SYMBOL(dmu_objset_projectquota_present);
EXPORT_SYMBOL(dmu_objset_projectquota_upgradable);
EXPORT_SYMBOL(dmu_objset_id_quota_upgrade);
#endif