Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
108059 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/2122/*23* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.24* Copyright (c) 2013, 2016 by Delphix. All rights reserved.25* Copyright 2017 Nexenta Systems, Inc.26*/2728#include <sys/types.h>29#include <sys/param.h>30#include <sys/time.h>31#include <sys/sysmacros.h>32#include <sys/vfs.h>33#include <sys/vnode.h>34#include <sys/file.h>35#include <sys/kmem.h>36#include <sys/uio.h>37#include <sys/pathname.h>38#include <sys/cmn_err.h>39#include <sys/errno.h>40#include <sys/stat.h>41#include <sys/sunddi.h>42#include <sys/random.h>43#include <sys/policy.h>44#include <sys/zfs_dir.h>45#include <sys/zfs_acl.h>46#include <sys/zfs_vnops.h>47#include <sys/fs/zfs.h>48#include <sys/zap.h>49#include <sys/dmu.h>50#include <sys/atomic.h>51#include <sys/zfs_ctldir.h>52#include <sys/zfs_fuid.h>53#include <sys/sa.h>54#include <sys/zfs_sa.h>55#include <sys/dmu_objset.h>56#include <sys/dsl_dir.h>5758/*59* zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups60* of names after deciding which is the appropriate lookup interface.61*/62static int63zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,64matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp,65uint64_t *zoid)66{67boolean_t conflict = B_FALSE;68int error;6970if (zfsvfs->z_norm) {71size_t bufsz = 0;72char *buf = NULL;7374if (rpnp) {75buf = rpnp->pn_buf;76bufsz = rpnp->pn_bufsize;77}7879/*80* In the non-mixed case we only expect there would ever81* be one match, but we need to use the normalizing lookup.82*/83error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,84zoid, mt, buf, bufsz, &conflict);85} else {86error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);87}8889/*90* Allow multiple entries provided the first entry is91* the object id. Non-zpl consumers may safely make92* use of the additional space.93*94* XXX: This should be a feature flag for compatibility95*/96if (error == EOVERFLOW)97error = 0;9899if (zfsvfs->z_norm && !error && deflags)100*deflags = conflict ? ED_CASE_CONFLICT : 0;101102*zoid = ZFS_DIRENT_OBJ(*zoid);103104return (error);105}106107/*108* Lock a directory entry. A dirlock on <dzp, name> protects that name109* in dzp's directory zap object. As long as you hold a dirlock, you can110* assume two things: (1) dzp cannot be reaped, and (2) no other thread111* can change the zap entry for (i.e. link or unlink) this name.112*113* Input arguments:114* dzp - znode for directory115* name - name of entry to lock116* flag - ZNEW: if the entry already exists, fail with EEXIST.117* ZEXISTS: if the entry does not exist, fail with ENOENT.118* ZSHARED: allow concurrent access with other ZSHARED callers.119* ZXATTR: we want dzp's xattr directory120* ZCILOOK: On a mixed sensitivity file system,121* this lookup should be case-insensitive.122* ZCIEXACT: On a purely case-insensitive file system,123* this lookup should be case-sensitive.124* ZRENAMING: we are locking for renaming, force narrow locks125* ZHAVELOCK: Don't grab the z_name_lock for this call. The126* current thread already holds it.127*128* Output arguments:129* zpp - pointer to the znode for the entry (NULL if there isn't one)130* dlpp - pointer to the dirlock for this entry (NULL on error)131* direntflags - (case-insensitive lookup only)132* flags if multiple case-sensitive matches exist in directory133* realpnp - (case-insensitive lookup only)134* actual name matched within the directory135*136* Return value: 0 on success or errno on failure.137*138* NOTE: Always checks for, and rejects, '.' and '..'.139* NOTE: For case-insensitive file systems we take wide locks (see below),140* but return znode pointers to a single match.141*/142int143zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name,144znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp)145{146zfsvfs_t *zfsvfs = ZTOZSB(dzp);147zfs_dirlock_t *dl;148boolean_t update;149matchtype_t mt = 0;150uint64_t zoid;151int error = 0;152int cmpflags;153154*zpp = NULL;155*dlpp = NULL;156157/*158* Verify that we are not trying to lock '.', '..', or '.zfs'159*/160if ((name[0] == '.' &&161(name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||162(zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))163return (SET_ERROR(EEXIST));164165/*166* Case sensitivity and normalization preferences are set when167* the file system is created. These are stored in the168* zfsvfs->z_case and zfsvfs->z_norm fields. These choices169* affect what vnodes can be cached in the DNLC, how we170* perform zap lookups, and the "width" of our dirlocks.171*172* A normal dirlock locks a single name. Note that with173* normalization a name can be composed multiple ways, but174* when normalized, these names all compare equal. A wide175* dirlock locks multiple names. We need these when the file176* system is supporting mixed-mode access. It is sometimes177* necessary to lock all case permutations of file name at178* once so that simultaneous case-insensitive/case-sensitive179* behaves as rationally as possible.180*/181182/*183* When matching we may need to normalize & change case according to184* FS settings.185*186* Note that a normalized match is necessary for a case insensitive187* filesystem when the lookup request is not exact because normalization188* can fold case independent of normalizing code point sequences.189*190* See the table above zfs_dropname().191*/192if (zfsvfs->z_norm != 0) {193mt = MT_NORMALIZE;194195/*196* Determine if the match needs to honor the case specified in197* lookup, and if so keep track of that so that during198* normalization we don't fold case.199*/200if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&201(flag & ZCIEXACT)) ||202(zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {203mt |= MT_MATCH_CASE;204}205}206207/*208* Only look in or update the DNLC if we are looking for the209* name on a file system that does not require normalization210* or case folding. We can also look there if we happen to be211* on a non-normalizing, mixed sensitivity file system IF we212* are looking for the exact name.213*214* Maybe can add TO-UPPERed version of name to dnlc in ci-only215* case for performance improvement?216*/217update = !zfsvfs->z_norm ||218(zfsvfs->z_case == ZFS_CASE_MIXED &&219!(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));220221/*222* ZRENAMING indicates we are in a situation where we should223* take narrow locks regardless of the file system's224* preferences for normalizing and case folding. This will225* prevent us deadlocking trying to grab the same wide lock226* twice if the two names happen to be case-insensitive227* matches.228*/229if (flag & ZRENAMING)230cmpflags = 0;231else232cmpflags = zfsvfs->z_norm;233234/*235* Wait until there are no locks on this name.236*237* Don't grab the lock if it is already held. However, cannot238* have both ZSHARED and ZHAVELOCK together.239*/240ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));241if (!(flag & ZHAVELOCK))242rw_enter(&dzp->z_name_lock, RW_READER);243244mutex_enter(&dzp->z_lock);245for (;;) {246if (dzp->z_unlinked && !(flag & ZXATTR)) {247mutex_exit(&dzp->z_lock);248if (!(flag & ZHAVELOCK))249rw_exit(&dzp->z_name_lock);250return (SET_ERROR(ENOENT));251}252for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {253if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,254U8_UNICODE_LATEST, &error) == 0) || error != 0)255break;256}257if (error != 0) {258mutex_exit(&dzp->z_lock);259if (!(flag & ZHAVELOCK))260rw_exit(&dzp->z_name_lock);261return (SET_ERROR(ENOENT));262}263if (dl == NULL) {264/*265* Allocate a new dirlock and add it to the list.266*/267dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);268cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);269dl->dl_name = name;270dl->dl_sharecnt = 0;271dl->dl_namelock = 0;272dl->dl_namesize = 0;273dl->dl_dzp = dzp;274dl->dl_next = dzp->z_dirlocks;275dzp->z_dirlocks = dl;276break;277}278if ((flag & ZSHARED) && dl->dl_sharecnt != 0)279break;280cv_wait(&dl->dl_cv, &dzp->z_lock);281}282283/*284* If the z_name_lock was NOT held for this dirlock record it.285*/286if (flag & ZHAVELOCK)287dl->dl_namelock = 1;288289if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {290/*291* We're the second shared reference to dl. Make a copy of292* dl_name in case the first thread goes away before we do.293* Note that we initialize the new name before storing its294* pointer into dl_name, because the first thread may load295* dl->dl_name at any time. It'll either see the old value,296* which belongs to it, or the new shared copy; either is OK.297*/298dl->dl_namesize = strlen(dl->dl_name) + 1;299name = kmem_alloc(dl->dl_namesize, KM_SLEEP);300memcpy(name, dl->dl_name, dl->dl_namesize);301dl->dl_name = name;302}303304mutex_exit(&dzp->z_lock);305306/*307* We have a dirlock on the name. (Note that it is the dirlock,308* not the dzp's z_lock, that protects the name in the zap object.)309* See if there's an object by this name; if so, put a hold on it.310*/311if (flag & ZXATTR) {312error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,313sizeof (zoid));314if (error == 0)315error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);316} else {317error = zfs_match_find(zfsvfs, dzp, name, mt,318update, direntflags, realpnp, &zoid);319}320if (error) {321if (error != ENOENT || (flag & ZEXISTS)) {322zfs_dirent_unlock(dl);323return (error);324}325} else {326if (flag & ZNEW) {327zfs_dirent_unlock(dl);328return (SET_ERROR(EEXIST));329}330error = zfs_zget(zfsvfs, zoid, zpp);331if (error) {332zfs_dirent_unlock(dl);333return (error);334}335}336337*dlpp = dl;338339return (0);340}341342/*343* Unlock this directory entry and wake anyone who was waiting for it.344*/345void346zfs_dirent_unlock(zfs_dirlock_t *dl)347{348znode_t *dzp = dl->dl_dzp;349zfs_dirlock_t **prev_dl, *cur_dl;350351mutex_enter(&dzp->z_lock);352353if (!dl->dl_namelock)354rw_exit(&dzp->z_name_lock);355356if (dl->dl_sharecnt > 1) {357dl->dl_sharecnt--;358mutex_exit(&dzp->z_lock);359return;360}361prev_dl = &dzp->z_dirlocks;362while ((cur_dl = *prev_dl) != dl)363prev_dl = &cur_dl->dl_next;364*prev_dl = dl->dl_next;365cv_broadcast(&dl->dl_cv);366mutex_exit(&dzp->z_lock);367368if (dl->dl_namesize != 0)369kmem_free(dl->dl_name, dl->dl_namesize);370cv_destroy(&dl->dl_cv);371kmem_free(dl, sizeof (*dl));372}373374/*375* Look up an entry in a directory.376*377* NOTE: '.' and '..' are handled as special cases because378* no directory entries are actually stored for them. If this is379* the root of a filesystem, then '.zfs' is also treated as a380* special pseudo-directory.381*/382int383zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags,384int *deflg, pathname_t *rpnp)385{386zfs_dirlock_t *dl;387znode_t *zp;388struct inode *ip;389int error = 0;390uint64_t parent;391392if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {393*zpp = dzp;394zhold(*zpp);395} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {396zfsvfs_t *zfsvfs = ZTOZSB(dzp);397398/*399* If we are a snapshot mounted under .zfs, return400* the inode pointer for the snapshot directory.401*/402if ((error = sa_lookup(dzp->z_sa_hdl,403SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)404return (error);405406if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {407error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,408"snapshot", &ip, 0, kcred, NULL, NULL);409*zpp = ITOZ(ip);410return (error);411}412rw_enter(&dzp->z_parent_lock, RW_READER);413error = zfs_zget(zfsvfs, parent, &zp);414if (error == 0)415*zpp = zp;416rw_exit(&dzp->z_parent_lock);417} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {418if (ZTOZSB(dzp)->z_show_ctldir == ZFS_SNAPDIR_DISABLED) {419return (SET_ERROR(ENOENT));420}421ip = zfsctl_root(dzp);422*zpp = ITOZ(ip);423} else {424int zf;425426zf = ZEXISTS | ZSHARED;427if (flags & FIGNORECASE)428zf |= ZCILOOK;429430error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);431if (error == 0) {432*zpp = zp;433zfs_dirent_unlock(dl);434dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */435}436rpnp = NULL;437}438439if ((flags & FIGNORECASE) && rpnp && !error)440(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);441442return (error);443}444445/*446* unlinked Set (formerly known as the "delete queue") Error Handling447*448* When dealing with the unlinked set, we dmu_tx_hold_zap(), but we449* don't specify the name of the entry that we will be manipulating. We450* also fib and say that we won't be adding any new entries to the451* unlinked set, even though we might (this is to lower the minimum file452* size that can be deleted in a full filesystem). So on the small453* chance that the nlink list is using a fat zap (ie. has more than454* 2000 entries), we *may* not pre-read a block that's needed.455* Therefore it is remotely possible for some of the assertions456* regarding the unlinked set below to fail due to i/o error. On a457* nondebug system, this will result in the space being leaked.458*/459void460zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)461{462zfsvfs_t *zfsvfs = ZTOZSB(zp);463464ASSERT(zp->z_unlinked);465ASSERT0(ZTOI(zp)->i_nlink);466467VERIFY3U(0, ==,468zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));469470dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);471}472473/*474* Clean up any znodes that had no links when we either crashed or475* (force) umounted the file system.476*/477static void478zfs_unlinked_drain_task(void *arg)479{480zfsvfs_t *zfsvfs = arg;481zap_cursor_t zc;482zap_attribute_t *zap = zap_attribute_alloc();483dmu_object_info_t doi;484znode_t *zp;485int error;486487ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);488489/*490* Iterate over the contents of the unlinked set.491*/492for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);493zap_cursor_retrieve(&zc, zap) == 0 && !zfsvfs->z_drain_cancel;494zap_cursor_advance(&zc)) {495496/*497* See what kind of object we have in list498*/499500error = dmu_object_info(zfsvfs->z_os,501zap->za_first_integer, &doi);502if (error != 0)503continue;504505ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||506(doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));507/*508* We need to re-mark these list entries for deletion,509* so we pull them back into core and set zp->z_unlinked.510*/511error = zfs_zget(zfsvfs, zap->za_first_integer, &zp);512513/*514* We may pick up znodes that are already marked for deletion.515* This could happen during the purge of an extended attribute516* directory. All we need to do is skip over them, since they517* are already in the system marked z_unlinked.518*/519if (error != 0)520continue;521522zp->z_unlinked = B_TRUE;523524/*525* zrele() decrements the znode's ref count and may cause526* it to be synchronously freed. We interrupt freeing527* of this znode by checking the return value of528* dmu_objset_zfs_unmounting() in dmu_free_long_range()529* when an unmount is requested.530*/531zrele(zp);532ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);533}534zap_cursor_fini(&zc);535536zfsvfs->z_draining = B_FALSE;537zfsvfs->z_drain_task = TASKQID_INVALID;538zap_attribute_free(zap);539}540541/*542* Sets z_draining then tries to dispatch async unlinked drain.543* If that fails executes synchronous unlinked drain.544*/545void546zfs_unlinked_drain(zfsvfs_t *zfsvfs)547{548ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);549ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);550551zfsvfs->z_draining = B_TRUE;552zfsvfs->z_drain_cancel = B_FALSE;553554zfsvfs->z_drain_task = taskq_dispatch(555dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),556zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);557if (zfsvfs->z_drain_task == TASKQID_INVALID) {558zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");559zfs_unlinked_drain_task(zfsvfs);560}561}562563/*564* Wait for the unlinked drain taskq task to stop. This will interrupt the565* unlinked set processing if it is in progress.566*/567void568zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)569{570ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);571572if (zfsvfs->z_draining) {573zfsvfs->z_drain_cancel = B_TRUE;574taskq_cancel_id(dsl_pool_unlinked_drain_taskq(575dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task,576B_TRUE);577zfsvfs->z_drain_task = TASKQID_INVALID;578zfsvfs->z_draining = B_FALSE;579}580}581582/*583* Delete the entire contents of a directory. Return a count584* of the number of entries that could not be deleted. If we encounter585* an error, return a count of at least one so that the directory stays586* in the unlinked set.587*588* NOTE: this function assumes that the directory is inactive,589* so there is no need to lock its entries before deletion.590* Also, it assumes the directory contents is *only* regular591* files.592*/593static int594zfs_purgedir(znode_t *dzp)595{596zap_cursor_t zc;597zap_attribute_t *zap = zap_attribute_alloc();598znode_t *xzp;599dmu_tx_t *tx;600zfsvfs_t *zfsvfs = ZTOZSB(dzp);601zfs_dirlock_t dl;602int skipped = 0;603int error;604605for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);606(error = zap_cursor_retrieve(&zc, zap)) == 0;607zap_cursor_advance(&zc)) {608error = zfs_zget(zfsvfs,609ZFS_DIRENT_OBJ(zap->za_first_integer), &xzp);610if (error) {611skipped += 1;612continue;613}614615ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||616S_ISLNK(ZTOI(xzp)->i_mode));617618tx = dmu_tx_create(zfsvfs->z_os);619dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);620dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap->za_name);621dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);622dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);623/* Is this really needed ? */624zfs_sa_upgrade_txholds(tx, xzp);625dmu_tx_mark_netfree(tx);626error = dmu_tx_assign(tx, DMU_TX_WAIT);627if (error) {628dmu_tx_abort(tx);629zfs_zrele_async(xzp);630skipped += 1;631continue;632}633memset(&dl, 0, sizeof (dl));634dl.dl_dzp = dzp;635dl.dl_name = zap->za_name;636637error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);638if (error)639skipped += 1;640dmu_tx_commit(tx);641642zfs_zrele_async(xzp);643}644zap_cursor_fini(&zc);645zap_attribute_free(zap);646if (error != ENOENT)647skipped += 1;648return (skipped);649}650651void652zfs_rmnode(znode_t *zp)653{654zfsvfs_t *zfsvfs = ZTOZSB(zp);655objset_t *os = zfsvfs->z_os;656znode_t *xzp = NULL;657dmu_tx_t *tx;658znode_hold_t *zh;659uint64_t z_id = zp->z_id;660uint64_t acl_obj;661uint64_t xattr_obj;662uint64_t links;663int error;664665ASSERT0(ZTOI(zp)->i_nlink);666ASSERT0(atomic_read(&ZTOI(zp)->i_count));667668/*669* If this is an attribute directory, purge its contents.670*/671if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {672if (zfs_purgedir(zp) != 0) {673/*674* Not enough space to delete some xattrs.675* Leave it in the unlinked set.676*/677zh = zfs_znode_hold_enter(zfsvfs, z_id);678zfs_znode_dmu_fini(zp);679zfs_znode_hold_exit(zfsvfs, zh);680return;681}682}683684/*685* Free up all the data in the file. We don't do this for directories686* because we need truncate and remove to be in the same tx, like in687* zfs_znode_delete(). Otherwise, if we crash here we'll end up with688* an inconsistent truncated zap object in the delete queue. Note a689* truncated file is harmless since it only contains user data.690*/691if (S_ISREG(ZTOI(zp)->i_mode)) {692error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);693if (error) {694/*695* Not enough space or we were interrupted by unmount.696* Leave the file in the unlinked set.697*/698zh = zfs_znode_hold_enter(zfsvfs, z_id);699zfs_znode_dmu_fini(zp);700zfs_znode_hold_exit(zfsvfs, zh);701return;702}703}704705/*706* If the file has extended attributes, we're going to unlink707* the xattr dir.708*/709error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),710&xattr_obj, sizeof (xattr_obj));711if (error == 0 && xattr_obj) {712error = zfs_zget(zfsvfs, xattr_obj, &xzp);713ASSERT0(error);714}715716acl_obj = zfs_external_acl(zp);717718/*719* Set up the final transaction.720*/721tx = dmu_tx_create(os);722dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);723dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);724if (xzp) {725dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);726dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);727}728if (acl_obj)729dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);730731zfs_sa_upgrade_txholds(tx, zp);732error = dmu_tx_assign(tx, DMU_TX_WAIT);733if (error) {734/*735* Not enough space to delete the file. Leave it in the736* unlinked set, leaking it until the fs is remounted (at737* which point we'll call zfs_unlinked_drain() to process it).738*/739dmu_tx_abort(tx);740zh = zfs_znode_hold_enter(zfsvfs, z_id);741zfs_znode_dmu_fini(zp);742zfs_znode_hold_exit(zfsvfs, zh);743goto out;744}745746if (xzp) {747ASSERT0(error);748mutex_enter(&xzp->z_lock);749xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */750clear_nlink(ZTOI(xzp)); /* no more links to it */751links = 0;752VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),753&links, sizeof (links), tx));754mutex_exit(&xzp->z_lock);755zfs_unlinked_add(xzp, tx);756}757758mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);759760/*761* Remove this znode from the unlinked set. If a has rollback has762* occurred while a file is open and unlinked. Then when the file763* is closed post rollback it will not exist in the rolled back764* version of the unlinked object.765*/766error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,767zp->z_id, tx);768VERIFY(error == 0 || error == ENOENT);769770uint64_t count;771if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {772cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);773}774775mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);776777dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);778779zfs_znode_delete(zp, tx);780781dmu_tx_commit(tx);782out:783if (xzp)784zfs_zrele_async(xzp);785}786787static uint64_t788zfs_dirent(znode_t *zp, uint64_t mode)789{790uint64_t de = zp->z_id;791792if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)793de |= IFTODT(mode) << 60;794return (de);795}796797/*798* Link zp into dl. Can fail in the following cases :799* - if zp has been unlinked.800* - if the number of entries with the same hash (aka. colliding entries)801* exceed the capacity of a leaf-block of fatzap and splitting of the802* leaf-block does not help.803*/804int805zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)806{807znode_t *dzp = dl->dl_dzp;808zfsvfs_t *zfsvfs = ZTOZSB(zp);809uint64_t value;810int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);811sa_bulk_attr_t bulk[5];812uint64_t mtime[2], ctime[2];813uint64_t links;814int count = 0;815int error;816817mutex_enter(&zp->z_lock);818819if (!(flag & ZRENAMING)) {820if (zp->z_unlinked) { /* no new links to unlinked zp */821ASSERT(!(flag & (ZNEW | ZEXISTS)));822mutex_exit(&zp->z_lock);823return (SET_ERROR(ENOENT));824}825if (!(flag & ZNEW)) {826/*827* ZNEW nodes come from zfs_mknode() where the link828* count has already been initialised829*/830inc_nlink(ZTOI(zp));831links = ZTOI(zp)->i_nlink;832SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),833NULL, &links, sizeof (links));834}835}836837value = zfs_dirent(zp, zp->z_mode);838error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,839&value, tx);840841/*842* zap_add could fail to add the entry if it exceeds the capacity of the843* leaf-block and zap_leaf_split() failed to help.844* The caller of this routine is responsible for failing the transaction845* which will rollback the SA updates done above.846*/847if (error != 0) {848if (!(flag & ZRENAMING) && !(flag & ZNEW))849drop_nlink(ZTOI(zp));850mutex_exit(&zp->z_lock);851return (error);852}853854/*855* If we added a longname activate the SPA_FEATURE_LONGNAME.856*/857if (strlen(dl->dl_name) >= ZAP_MAXNAMELEN) {858dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);859ds->ds_feature_activation[SPA_FEATURE_LONGNAME] =860(void *)B_TRUE;861}862863SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,864&dzp->z_id, sizeof (dzp->z_id));865SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,866&zp->z_pflags, sizeof (zp->z_pflags));867868if (!(flag & ZNEW)) {869SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,870ctime, sizeof (ctime));871zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,872ctime);873}874error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);875ASSERT0(error);876877mutex_exit(&zp->z_lock);878879mutex_enter(&dzp->z_lock);880dzp->z_size++;881if (zp_is_dir)882inc_nlink(ZTOI(dzp));883links = ZTOI(dzp)->i_nlink;884count = 0;885SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,886&dzp->z_size, sizeof (dzp->z_size));887SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,888&links, sizeof (links));889SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,890mtime, sizeof (mtime));891SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,892ctime, sizeof (ctime));893SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,894&dzp->z_pflags, sizeof (dzp->z_pflags));895zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);896error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);897ASSERT0(error);898mutex_exit(&dzp->z_lock);899900return (0);901}902903/*904* The match type in the code for this function should conform to:905*906* ------------------------------------------------------------------------907* fs type | z_norm | lookup type | match type908* ---------|-------------|-------------|----------------------------------909* CS !norm | 0 | 0 | 0 (exact)910* CS norm | formX | 0 | MT_NORMALIZE911* CI !norm | upper | !ZCIEXACT | MT_NORMALIZE912* CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE913* CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE914* CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE915* CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE916* CM !norm | upper | ZCILOOK | MT_NORMALIZE917* CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE918* CM norm | upper|formX | ZCILOOK | MT_NORMALIZE919*920* Abbreviations:921* CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed922* upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)923* formX = unicode normalization form set on fs creation924*/925static int926zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,927int flag)928{929int error;930931if (ZTOZSB(zp)->z_norm) {932matchtype_t mt = MT_NORMALIZE;933934if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&935(flag & ZCIEXACT)) ||936(ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&937!(flag & ZCILOOK))) {938mt |= MT_MATCH_CASE;939}940941error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,942dl->dl_name, mt, tx);943} else {944error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,945tx);946}947948return (error);949}950951static int952zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)953{954zfsvfs_t *zfsvfs = ZTOZSB(zp);955int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);956boolean_t unlinked = B_FALSE;957sa_bulk_attr_t bulk[3];958uint64_t mtime[2], ctime[2];959uint64_t links;960int count = 0;961int error;962963if (zp_is_dir && !zfs_dirempty(zp))964return (SET_ERROR(ENOTEMPTY));965966if (ZTOI(zp)->i_nlink <= zp_is_dir) {967zfs_panic_recover("zfs: link count on %lu is %u, "968"should be at least %u", zp->z_id,969(int)ZTOI(zp)->i_nlink, zp_is_dir + 1);970set_nlink(ZTOI(zp), zp_is_dir + 1);971}972drop_nlink(ZTOI(zp));973if (ZTOI(zp)->i_nlink == zp_is_dir) {974zp->z_unlinked = B_TRUE;975clear_nlink(ZTOI(zp));976unlinked = B_TRUE;977} else {978SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),979NULL, &ctime, sizeof (ctime));980SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),981NULL, &zp->z_pflags, sizeof (zp->z_pflags));982zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,983ctime);984}985links = ZTOI(zp)->i_nlink;986SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),987NULL, &links, sizeof (links));988error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);989ASSERT0(error);990991if (unlinkedp != NULL)992*unlinkedp = unlinked;993else if (unlinked)994zfs_unlinked_add(zp, tx);995996return (0);997}998999/*1000* Forcefully drop an nlink reference from (zp) and mark it for deletion if it1001* was the last link. This *must* only be done to znodes which have already1002* been zfs_link_destroy()'d with ZRENAMING. This is explicitly only used in1003* the error path of zfs_rename(), where we have to correct the nlink count if1004* we failed to link the target as well as failing to re-link the original1005* znodes.1006*/1007int1008zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)1009{1010int error;10111012mutex_enter(&zp->z_lock);1013error = zfs_drop_nlink_locked(zp, tx, unlinkedp);1014mutex_exit(&zp->z_lock);10151016return (error);1017}10181019/*1020* Unlink zp from dl, and mark zp for deletion if this was the last link. Can1021* fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).1022* If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.1023* If it's non-NULL, we use it to indicate whether the znode needs deletion,1024* and it's the caller's job to do it.1025*/1026int1027zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,1028boolean_t *unlinkedp)1029{1030znode_t *dzp = dl->dl_dzp;1031zfsvfs_t *zfsvfs = ZTOZSB(dzp);1032int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);1033boolean_t unlinked = B_FALSE;1034sa_bulk_attr_t bulk[5];1035uint64_t mtime[2], ctime[2];1036uint64_t links;1037int count = 0;1038int error;10391040if (!(flag & ZRENAMING)) {1041mutex_enter(&zp->z_lock);10421043if (zp_is_dir && !zfs_dirempty(zp)) {1044mutex_exit(&zp->z_lock);1045return (SET_ERROR(ENOTEMPTY));1046}10471048/*1049* If we get here, we are going to try to remove the object.1050* First try removing the name from the directory; if that1051* fails, return the error.1052*/1053error = zfs_dropname(dl, zp, dzp, tx, flag);1054if (error != 0) {1055mutex_exit(&zp->z_lock);1056return (error);1057}10581059/* The only error is !zfs_dirempty() and we checked earlier. */1060error = zfs_drop_nlink_locked(zp, tx, &unlinked);1061ASSERT0(error);1062mutex_exit(&zp->z_lock);1063} else {1064error = zfs_dropname(dl, zp, dzp, tx, flag);1065if (error != 0)1066return (error);1067}10681069mutex_enter(&dzp->z_lock);1070dzp->z_size--; /* one dirent removed */1071if (zp_is_dir)1072drop_nlink(ZTOI(dzp)); /* ".." link from zp */1073links = ZTOI(dzp)->i_nlink;1074SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),1075NULL, &links, sizeof (links));1076SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),1077NULL, &dzp->z_size, sizeof (dzp->z_size));1078SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),1079NULL, ctime, sizeof (ctime));1080SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),1081NULL, mtime, sizeof (mtime));1082SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),1083NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));1084zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);1085error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);1086ASSERT0(error);1087mutex_exit(&dzp->z_lock);10881089if (unlinkedp != NULL)1090*unlinkedp = unlinked;1091else if (unlinked)1092zfs_unlinked_add(zp, tx);10931094return (0);1095}10961097/*1098* Indicate whether the directory is empty. Works with or without z_lock1099* held, but can only be consider a hint in the latter case. Returns true1100* if only "." and ".." remain and there's no work in progress.1101*1102* The internal ZAP size, rather than zp->z_size, needs to be checked since1103* some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.1104*/1105boolean_t1106zfs_dirempty(znode_t *dzp)1107{1108zfsvfs_t *zfsvfs = ZTOZSB(dzp);1109uint64_t count;1110int error;11111112if (dzp->z_dirlocks != NULL)1113return (B_FALSE);11141115error = zap_count(zfsvfs->z_os, dzp->z_id, &count);1116if (error != 0 || count != 0)1117return (B_FALSE);11181119return (B_TRUE);1120}11211122int1123zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)1124{1125zfsvfs_t *zfsvfs = ZTOZSB(zp);1126znode_t *xzp;1127dmu_tx_t *tx;1128int error;1129zfs_acl_ids_t acl_ids;1130boolean_t fuid_dirtied;1131#ifdef ZFS_DEBUG1132uint64_t parent;1133#endif11341135*xzpp = NULL;11361137if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,1138&acl_ids, zfs_init_idmap)) != 0)1139return (error);1140if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {1141zfs_acl_ids_free(&acl_ids);1142return (SET_ERROR(EDQUOT));1143}11441145tx = dmu_tx_create(zfsvfs->z_os);1146dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +1147ZFS_SA_BASE_ATTR_SIZE);1148dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);1149dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);1150fuid_dirtied = zfsvfs->z_fuid_dirty;1151if (fuid_dirtied)1152zfs_fuid_txhold(zfsvfs, tx);1153error = dmu_tx_assign(tx, DMU_TX_WAIT);1154if (error) {1155zfs_acl_ids_free(&acl_ids);1156dmu_tx_abort(tx);1157return (error);1158}1159zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);11601161if (fuid_dirtied)1162zfs_fuid_sync(zfsvfs, tx);11631164#ifdef ZFS_DEBUG1165error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),1166&parent, sizeof (parent));1167ASSERT(error == 0 && parent == zp->z_id);1168#endif11691170VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,1171sizeof (xzp->z_id), tx));11721173if (!zp->z_unlinked)1174zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL,1175acl_ids.z_fuidp, vap);11761177zfs_acl_ids_free(&acl_ids);1178dmu_tx_commit(tx);11791180*xzpp = xzp;11811182return (0);1183}11841185/*1186* Return a znode for the extended attribute directory for zp.1187* ** If the directory does not already exist, it is created **1188*1189* IN: zp - znode to obtain attribute directory from1190* cr - credentials of caller1191* flags - flags from the VOP_LOOKUP call1192*1193* OUT: xipp - pointer to extended attribute znode1194*1195* RETURN: 0 on success1196* error number on failure1197*/1198int1199zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)1200{1201zfsvfs_t *zfsvfs = ZTOZSB(zp);1202znode_t *xzp;1203zfs_dirlock_t *dl;1204vattr_t va;1205int error;1206top:1207error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);1208if (error)1209return (error);12101211if (xzp != NULL) {1212*xzpp = xzp;1213zfs_dirent_unlock(dl);1214return (0);1215}12161217if (!(flags & CREATE_XATTR_DIR)) {1218zfs_dirent_unlock(dl);1219return (SET_ERROR(ENOENT));1220}12211222if (zfs_is_readonly(zfsvfs)) {1223zfs_dirent_unlock(dl);1224return (SET_ERROR(EROFS));1225}12261227/*1228* The ability to 'create' files in an attribute1229* directory comes from the write_xattr permission on the base file.1230*1231* The ability to 'search' an attribute directory requires1232* read_xattr permission on the base file.1233*1234* Once in a directory the ability to read/write attributes1235* is controlled by the permissions on the attribute file.1236*/1237va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;1238va.va_mode = S_IFDIR | S_ISVTX | 0777;1239zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);12401241va.va_dentry = NULL;1242error = zfs_make_xattrdir(zp, &va, xzpp, cr);1243zfs_dirent_unlock(dl);12441245if (error == ERESTART) {1246/* NB: we already did dmu_tx_wait() if necessary */1247goto top;1248}12491250return (error);1251}12521253/*1254* Decide whether it is okay to remove within a sticky directory.1255*1256* In sticky directories, write access is not sufficient;1257* you can remove entries from a directory only if:1258*1259* you own the directory,1260* you own the entry,1261* you have write access to the entry,1262* or you are privileged (checked in secpolicy...).1263*1264* The function returns 0 if remove access is granted.1265*/1266int1267zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)1268{1269uid_t uid;1270uid_t downer;1271uid_t fowner;1272zfsvfs_t *zfsvfs = ZTOZSB(zdp);12731274if (zfsvfs->z_replay)1275return (0);12761277if ((zdp->z_mode & S_ISVTX) == 0)1278return (0);12791280downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),1281cr, ZFS_OWNER);1282fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),1283cr, ZFS_OWNER);12841285if ((uid = crgetuid(cr)) == downer || uid == fowner ||1286zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,1287zfs_init_idmap) == 0)1288return (0);1289else1290return (secpolicy_vnode_remove(cr));1291}129212931294