Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/random.h>
#include <sys/policy.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_vnops.h>
#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/atomic.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>

/*
 * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
 * of names after deciding which is the appropriate lookup interface.
 */
static int
zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
    matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp,
    uint64_t *zoid)
{
	boolean_t conflict = B_FALSE;
	int error;

	if (zfsvfs->z_norm) {
		size_t bufsz = 0;
		char *buf = NULL;

		if (rpnp) {
			buf = rpnp->pn_buf;
			bufsz = rpnp->pn_bufsize;
		}

		/*
		 * In the non-mixed case we only expect there would ever
		 * be one match, but we need to use the normalizing lookup.
		 */
		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
		    zoid, mt, buf, bufsz, &conflict);
	} else {
		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
	}

	/*
	 * Allow multiple entries provided the first entry is
	 * the object id.  Non-zpl consumers may safely make
	 * use of the additional space.
	 *
	 * XXX: This should be a feature flag for compatibility
	 */
	if (error == EOVERFLOW)
		error = 0;

	if (zfsvfs->z_norm && !error && deflags)
		*deflags = conflict ? ED_CASE_CONFLICT : 0;

	*zoid = ZFS_DIRENT_OBJ(*zoid);

	return (error);
}

/*
 * Lock a directory entry.  A dirlock on <dzp, name> protects that name
 * in dzp's directory zap object.  As long as you hold a dirlock, you can
 * assume two things: (1) dzp cannot be reaped, and (2) no other thread
 * can change the zap entry for (i.e. link or unlink) this name.
 *
 * Input arguments:
 *	dzp	- znode for directory
 *	name	- name of entry to lock
 *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
 *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
 *		  ZSHARED: allow concurrent access with other ZSHARED callers.
 *		  ZXATTR: we want dzp's xattr directory
 *		  ZCILOOK: On a mixed sensitivity file system,
 *			   this lookup should be case-insensitive.
 *		  ZCIEXACT: On a purely case-insensitive file system,
 *			    this lookup should be case-sensitive.
 *		  ZRENAMING: we are locking for renaming, force narrow locks
 *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
 *			     current thread already holds it.
 *
 * Output arguments:
 *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
 *	dlpp	- pointer to the dirlock for this entry (NULL on error)
 *	direntflags - (case-insensitive lookup only)
 *		flags if multiple case-sensitive matches exist in directory
 *	realpnp     - (case-insensitive lookup only)
 *		actual name matched within the directory
 *
 * Return value: 0 on success or errno on failure.
 *
 * NOTE: Always checks for, and rejects, '.' and '..'.
 * NOTE: For case-insensitive file systems we take wide locks (see below),
 *	 but return znode pointers to a single match.
 */
int
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name,
    znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zfs_dirlock_t *dl;
	boolean_t update;
	matchtype_t mt = 0;
	uint64_t zoid;
	int error = 0;
	int cmpflags;

	*zpp = NULL;
	*dlpp = NULL;

	/*
	 * Verify that we are not trying to lock '.', '..', or '.zfs'
	 */
	if ((name[0] == '.' &&
	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
		return (SET_ERROR(EEXIST));

	/*
	 * Case sensitivity and normalization preferences are set when
	 * the file system is created.  These are stored in the
	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
	 * affect what vnodes can be cached in the DNLC, how we
	 * perform zap lookups, and the "width" of our dirlocks.
	 *
	 * A normal dirlock locks a single name.  Note that with
	 * normalization a name can be composed multiple ways, but
	 * when normalized, these names all compare equal.  A wide
	 * dirlock locks multiple names.  We need these when the file
	 * system is supporting mixed-mode access.  It is sometimes
	 * necessary to lock all case permutations of file name at
	 * once so that simultaneous case-insensitive/case-sensitive
	 * behaves as rationally as possible.
	 */

	/*
	 * When matching we may need to normalize & change case according to
	 * FS settings.
	 *
	 * Note that a normalized match is necessary for a case insensitive
	 * filesystem when the lookup request is not exact because normalization
	 * can fold case independent of normalizing code point sequences.
	 *
	 * See the table above zfs_dropname().
	 */
	if (zfsvfs->z_norm != 0) {
		mt = MT_NORMALIZE;

		/*
		 * Determine if the match needs to honor the case specified in
		 * lookup, and if so keep track of that so that during
		 * normalization we don't fold case.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
		    (flag & ZCIEXACT)) ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
			mt |= MT_MATCH_CASE;
		}
	}

	/*
	 * Only look in or update the DNLC if we are looking for the
	 * name on a file system that does not require normalization
	 * or case folding.  We can also look there if we happen to be
	 * on a non-normalizing, mixed sensitivity file system IF we
	 * are looking for the exact name.
	 *
	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
	 * case for performance improvement?
	 */
	update = !zfsvfs->z_norm ||
	    (zfsvfs->z_case == ZFS_CASE_MIXED &&
	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));

	/*
	 * ZRENAMING indicates we are in a situation where we should
	 * take narrow locks regardless of the file system's
	 * preferences for normalizing and case folding.  This will
	 * prevent us deadlocking trying to grab the same wide lock
	 * twice if the two names happen to be case-insensitive
	 * matches.
	 */
	if (flag & ZRENAMING)
		cmpflags = 0;
	else
		cmpflags = zfsvfs->z_norm;

	/*
	 * Wait until there are no locks on this name.
	 *
	 * Don't grab the lock if it is already held. However, cannot
	 * have both ZSHARED and ZHAVELOCK together.
	 */
	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
	if (!(flag & ZHAVELOCK))
		rw_enter(&dzp->z_name_lock, RW_READER);

	mutex_enter(&dzp->z_lock);
	for (;;) {
		if (dzp->z_unlinked && !(flag & ZXATTR)) {
			mutex_exit(&dzp->z_lock);
			if (!(flag & ZHAVELOCK))
				rw_exit(&dzp->z_name_lock);
			return (SET_ERROR(ENOENT));
		}
		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
				break;
		}
		if (error != 0) {
			mutex_exit(&dzp->z_lock);
			if (!(flag & ZHAVELOCK))
				rw_exit(&dzp->z_name_lock);
			return (SET_ERROR(ENOENT));
		}
		if (dl == NULL) {
			/*
			 * Allocate a new dirlock and add it to the list.
			 */
			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
			dl->dl_name = name;
			dl->dl_sharecnt = 0;
			dl->dl_namelock = 0;
			dl->dl_namesize = 0;
			dl->dl_dzp = dzp;
			dl->dl_next = dzp->z_dirlocks;
			dzp->z_dirlocks = dl;
			break;
		}
		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
			break;
		cv_wait(&dl->dl_cv, &dzp->z_lock);
	}

	/*
	 * If the z_name_lock was NOT held for this dirlock, record it.
	 */
	if (flag & ZHAVELOCK)
		dl->dl_namelock = 1;

	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
		/*
		 * We're the second shared reference to dl.  Make a copy of
		 * dl_name in case the first thread goes away before we do.
		 * Note that we initialize the new name before storing its
		 * pointer into dl_name, because the first thread may load
		 * dl->dl_name at any time.  It'll either see the old value,
		 * which belongs to it, or the new shared copy; either is OK.
		 */
		dl->dl_namesize = strlen(dl->dl_name) + 1;
		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
		memcpy(name, dl->dl_name, dl->dl_namesize);
		dl->dl_name = name;
	}

	mutex_exit(&dzp->z_lock);

	/*
	 * We have a dirlock on the name.  (Note that it is the dirlock,
	 * not the dzp's z_lock, that protects the name in the zap object.)
	 * See if there's an object by this name; if so, put a hold on it.
	 */
	if (flag & ZXATTR) {
		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
		    sizeof (zoid));
		if (error == 0)
			error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
	} else {
		error = zfs_match_find(zfsvfs, dzp, name, mt,
		    update, direntflags, realpnp, &zoid);
	}
	if (error) {
		if (error != ENOENT || (flag & ZEXISTS)) {
			zfs_dirent_unlock(dl);
			return (error);
		}
	} else {
		if (flag & ZNEW) {
			zfs_dirent_unlock(dl);
			return (SET_ERROR(EEXIST));
		}
		error = zfs_zget(zfsvfs, zoid, zpp);
		if (error) {
			zfs_dirent_unlock(dl);
			return (error);
		}
	}

	*dlpp = dl;

	return (0);
}

/*
 * Unlock this directory entry and wake anyone who was waiting for it.
 */
void
zfs_dirent_unlock(zfs_dirlock_t *dl)
{
	znode_t *dzp = dl->dl_dzp;
	zfs_dirlock_t **prev_dl, *cur_dl;

	mutex_enter(&dzp->z_lock);

	if (!dl->dl_namelock)
		rw_exit(&dzp->z_name_lock);

	if (dl->dl_sharecnt > 1) {
		dl->dl_sharecnt--;
		mutex_exit(&dzp->z_lock);
		return;
	}
	prev_dl = &dzp->z_dirlocks;
	while ((cur_dl = *prev_dl) != dl)
		prev_dl = &cur_dl->dl_next;
	*prev_dl = dl->dl_next;
	cv_broadcast(&dl->dl_cv);
	mutex_exit(&dzp->z_lock);

	if (dl->dl_namesize != 0)
		kmem_free(dl->dl_name, dl->dl_namesize);
	cv_destroy(&dl->dl_cv);
	kmem_free(dl, sizeof (*dl));
}

/*
 * Look up an entry in a directory.
 *
 * NOTE: '.' and '..' are handled as special cases because
 *	no directory entries are actually stored for them.  If this is
 *	the root of a filesystem, then '.zfs' is also treated as a
 *	special pseudo-directory.
 */
int
zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags,
    int *deflg, pathname_t *rpnp)
{
	zfs_dirlock_t *dl;
	znode_t *zp;
	struct inode *ip;
	int error = 0;
	uint64_t parent;

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		*zpp = dzp;
		zhold(*zpp);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		zfsvfs_t *zfsvfs = ZTOZSB(dzp);

		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the inode pointer for the snapshot directory.
		 */
		if ((error = sa_lookup(dzp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			return (error);

		if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
			    "snapshot", &ip, 0, kcred, NULL, NULL);
			*zpp = ITOZ(ip);
			return (error);
		}
		rw_enter(&dzp->z_parent_lock, RW_READER);
		error = zfs_zget(zfsvfs, parent, &zp);
		if (error == 0)
			*zpp = zp;
		rw_exit(&dzp->z_parent_lock);
	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
		if (ZTOZSB(dzp)->z_show_ctldir == ZFS_SNAPDIR_DISABLED) {
			return (SET_ERROR(ENOENT));
		}
		ip = zfsctl_root(dzp);
		*zpp = ITOZ(ip);
	} else {
		int zf;

		zf = ZEXISTS | ZSHARED;
		if (flags & FIGNORECASE)
			zf |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
		if (error == 0) {
			*zpp = zp;
			zfs_dirent_unlock(dl);
			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
		}
		rpnp = NULL;
	}

	if ((flags & FIGNORECASE) && rpnp && !error)
		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);

	return (error);
}

/*
 * unlinked Set (formerly known as the "delete queue") Error Handling
 *
 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
 * don't specify the name of the entry that we will be manipulating.  We
 * also fib and say that we won't be adding any new entries to the
 * unlinked set, even though we might (this is to lower the minimum file
 * size that can be deleted in a full filesystem).  So on the small
 * chance that the nlink list is using a fat zap (ie. has more than
 * 2000 entries), we *may* not pre-read a block that's needed.
 * Therefore it is remotely possible for some of the assertions
 * regarding the unlinked set below to fail due to i/o error.  On a
 * nondebug system, this will result in the space being leaked.
 */
void
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	ASSERT(zp->z_unlinked);
	ASSERT0(ZTOI(zp)->i_nlink);

	VERIFY3U(0, ==,
	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));

	dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
}

/*
 * Clean up any znodes that had no links when we either crashed or
 * (force) umounted the file system.
 */
static void
zfs_unlinked_drain_task(void *arg)
{
	zfsvfs_t *zfsvfs = arg;
	zap_cursor_t zc;
	zap_attribute_t *zap = zap_attribute_alloc();
	dmu_object_info_t doi;
	znode_t *zp;
	int error;

	ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);

	/*
	 * Iterate over the contents of the unlinked set.
	 */
	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
	    zap_cursor_retrieve(&zc, zap) == 0 && !zfsvfs->z_drain_cancel;
	    zap_cursor_advance(&zc)) {

		/*
		 * See what kind of object we have in list
		 */

		error = dmu_object_info(zfsvfs->z_os,
		    zap->za_first_integer, &doi);
		if (error != 0)
			continue;

		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
		/*
		 * We need to re-mark these list entries for deletion,
		 * so we pull them back into core and set zp->z_unlinked.
		 */
		error = zfs_zget(zfsvfs, zap->za_first_integer, &zp);

		/*
		 * We may pick up znodes that are already marked for deletion.
		 * This could happen during the purge of an extended attribute
		 * directory.  All we need to do is skip over them, since they
		 * are already in the system marked z_unlinked.
		 */
		if (error != 0)
			continue;

		zp->z_unlinked = B_TRUE;

		/*
		 * zrele() decrements the znode's ref count and may cause
		 * it to be synchronously freed. We interrupt freeing
		 * of this znode by checking the return value of
		 * dmu_objset_zfs_unmounting() in dmu_free_long_range()
		 * when an unmount is requested.
		 */
		zrele(zp);
		ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
	}
	zap_cursor_fini(&zc);

	zfsvfs->z_draining = B_FALSE;
	zfsvfs->z_drain_task = TASKQID_INVALID;
	zap_attribute_free(zap);
}

/*
 * Sets z_draining then tries to dispatch async unlinked drain.
 * If that fails executes synchronous unlinked drain.
 */
void
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
{
	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
	ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);

	zfsvfs->z_draining = B_TRUE;
	zfsvfs->z_drain_cancel = B_FALSE;

	zfsvfs->z_drain_task = taskq_dispatch(
	    dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
	    zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
	if (zfsvfs->z_drain_task == TASKQID_INVALID) {
		zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
		zfs_unlinked_drain_task(zfsvfs);
	}
}

/*
 * Wait for the unlinked drain taskq task to stop. This will interrupt the
 * unlinked set processing if it is in progress.
 */
void
zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
{
	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);

	if (zfsvfs->z_draining) {
		zfsvfs->z_drain_cancel = B_TRUE;
		taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
		    dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
		zfsvfs->z_drain_task = TASKQID_INVALID;
		zfsvfs->z_draining = B_FALSE;
	}
}

/*
 * Delete the entire contents of a directory.  Return a count
 * of the number of entries that could not be deleted. If we encounter
 * an error, return a count of at least one so that the directory stays
 * in the unlinked set.
 *
 * NOTE: this function assumes that the directory is inactive,
 *	so there is no need to lock its entries before deletion.
 *	Also, it assumes the directory contents is *only* regular
 *	files.
 */
static int
zfs_purgedir(znode_t *dzp)
{
	zap_cursor_t zc;
	zap_attribute_t *zap = zap_attribute_alloc();
	znode_t *xzp;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zfs_dirlock_t dl;
	int skipped = 0;
	int error;

	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
	    (error = zap_cursor_retrieve(&zc, zap)) == 0;
	    zap_cursor_advance(&zc)) {
		error = zfs_zget(zfsvfs,
		    ZFS_DIRENT_OBJ(zap->za_first_integer), &xzp);
		if (error) {
			skipped += 1;
			continue;
		}

		ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
		    S_ISLNK(ZTOI(xzp)->i_mode));

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap->za_name);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
		/* Is this really needed ? */
		zfs_sa_upgrade_txholds(tx, xzp);
		dmu_tx_mark_netfree(tx);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			zfs_zrele_async(xzp);
			skipped += 1;
			continue;
		}
		memset(&dl, 0, sizeof (dl));
		dl.dl_dzp = dzp;
		dl.dl_name = zap->za_name;

		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
		if (error)
			skipped += 1;
		dmu_tx_commit(tx);

		zfs_zrele_async(xzp);
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);
	if (error != ENOENT)
		skipped += 1;
	return (skipped);
}

void
zfs_rmnode(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	znode_t *xzp = NULL;
	dmu_tx_t *tx;
	znode_hold_t *zh;
	uint64_t z_id = zp->z_id;
	uint64_t acl_obj;
	uint64_t xattr_obj;
	uint64_t links;
	int error;

	ASSERT0(ZTOI(zp)->i_nlink);
	ASSERT0(atomic_read(&ZTOI(zp)->i_count));

	/*
	 * If this is an attribute directory, purge its contents.
	 */
	if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
		if (zfs_purgedir(zp) != 0) {
			/*
			 * Not enough space to delete some xattrs.
			 * Leave it in the unlinked set.
			 */
			zh = zfs_znode_hold_enter(zfsvfs, z_id);
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return;
		}
	}

	/*
	 * Free up all the data in the file.  We don't do this for directories
	 * because we need truncate and remove to be in the same tx, like in
	 * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
	 * an inconsistent truncated zap object in the delete queue.  Note a
	 * truncated file is harmless since it only contains user data.
	 */
	if (S_ISREG(ZTOI(zp)->i_mode)) {
		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
		if (error) {
			/*
			 * Not enough space or we were interrupted by unmount.
			 * Leave the file in the unlinked set.
			 */
			zh = zfs_znode_hold_enter(zfsvfs, z_id);
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return;
		}
	}

	/*
	 * If the file has extended attributes, we're going to unlink
	 * the xattr dir.
	 */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	acl_obj = zfs_external_acl(zp);

	/*
	 * Set up the final transaction.
	 */
	tx = dmu_tx_create(os);
	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	if (xzp) {
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}
	if (acl_obj)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		/*
		 * Not enough space to delete the file.  Leave it in the
		 * unlinked set, leaking it until the fs is remounted (at
		 * which point we'll call zfs_unlinked_drain() to process it).
		 */
		dmu_tx_abort(tx);
		zh = zfs_znode_hold_enter(zfsvfs, z_id);
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		goto out;
	}

	if (xzp) {
		ASSERT0(error);
		mutex_enter(&xzp->z_lock);
		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
		clear_nlink(ZTOI(xzp));		/* no more links to it */
		links = 0;
		VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
		    &links, sizeof (links), tx));
		mutex_exit(&xzp->z_lock);
		zfs_unlinked_add(xzp, tx);
	}

	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);

	/*
	 * Remove this znode from the unlinked set.  If a rollback has
	 * occurred while a file is open and unlinked, then when the file
	 * is closed post-rollback it will not exist in the rolled back
	 * version of the unlinked object.
	 */
	error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
	    zp->z_id, tx);
	VERIFY(error == 0 || error == ENOENT);

	uint64_t count;
	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
	}

	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);

	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);

	zfs_znode_delete(zp, tx);

	dmu_tx_commit(tx);
out:
	if (xzp)
		zfs_zrele_async(xzp);
}

static uint64_t
zfs_dirent(znode_t *zp, uint64_t mode)
{
	uint64_t de = zp->z_id;

	if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
		de |= IFTODT(mode) << 60;
	return (de);
}

/*
 * Link zp into dl.  Can fail in the following cases:
 * - if zp has been unlinked.
 * - if the number of entries with the same hash (aka. colliding entries)
 *    exceeds the capacity of a leaf-block of fatzap and splitting of the
 *    leaf-block does not help.
 */
int
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
{
	znode_t *dzp = dl->dl_dzp;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t value;
	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
	sa_bulk_attr_t bulk[5];
	uint64_t mtime[2], ctime[2];
	uint64_t links;
	int count = 0;
	int error;

	mutex_enter(&zp->z_lock);

	if (!(flag & ZRENAMING)) {
		if (zp->z_unlinked) {	/* no new links to unlinked zp */
			ASSERT(!(flag & (ZNEW | ZEXISTS)));
			mutex_exit(&zp->z_lock);
			return (SET_ERROR(ENOENT));
		}
		if (!(flag & ZNEW)) {
			/*
			 * ZNEW nodes come from zfs_mknode() where the link
			 * count has already been initialised
			 */
			inc_nlink(ZTOI(zp));
			links = ZTOI(zp)->i_nlink;
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
			    NULL, &links, sizeof (links));
		}
	}

	value = zfs_dirent(zp, zp->z_mode);
	error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
	    &value, tx);

	/*
	 * zap_add could fail to add the entry if it exceeds the capacity of the
	 * leaf-block and zap_leaf_split() failed to help.
	 * The caller of this routine is responsible for failing the transaction
	 * which will rollback the SA updates done above.
	 */
	if (error != 0) {
		if (!(flag & ZRENAMING) && !(flag & ZNEW))
			drop_nlink(ZTOI(zp));
		mutex_exit(&zp->z_lock);
		return (error);
	}

	/*
	 * If we added a longname activate the SPA_FEATURE_LONGNAME.
	 */
	if (strlen(dl->dl_name) >= ZAP_MAXNAMELEN) {
		dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
		ds->ds_feature_activation[SPA_FEATURE_LONGNAME] =
		    (void *)B_TRUE;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &dzp->z_id, sizeof (dzp->z_id));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	if (!(flag & ZNEW)) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
		    ctime);
	}
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	mutex_exit(&zp->z_lock);

	mutex_enter(&dzp->z_lock);
	dzp->z_size++;
	if (zp_is_dir)
		inc_nlink(ZTOI(dzp));
	links = ZTOI(dzp)->i_nlink;
	count = 0;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &dzp->z_size, sizeof (dzp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    ctime, sizeof (ctime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &dzp->z_pflags, sizeof (dzp->z_pflags));
	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);
	mutex_exit(&dzp->z_lock);

	return (0);
}

/*
 * The match type in the code for this function should conform to:
 *
 * ------------------------------------------------------------------------
 * fs type  | z_norm      | lookup type | match type
 * ---------|-------------|-------------|----------------------------------
 * CS !norm | 0           |           0 | 0 (exact)
 * CS  norm | formX       |           0 | MT_NORMALIZE
 * CI !norm | upper       |   !ZCIEXACT | MT_NORMALIZE
 * CI !norm | upper       |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
 * CI  norm | upper|formX |   !ZCIEXACT | MT_NORMALIZE
 * CI  norm | upper|formX |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
 * CM !norm | upper       |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
 * CM !norm | upper       |     ZCILOOK | MT_NORMALIZE
 * CM  norm | upper|formX |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
 * CM  norm | upper|formX |     ZCILOOK | MT_NORMALIZE
 *
 * Abbreviations:
 *    CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
 *    upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
 *    formX = unicode normalization form set on fs creation
 */
static int
zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
    int flag)
{
	int error;

	if (ZTOZSB(zp)->z_norm) {
		matchtype_t mt = MT_NORMALIZE;

		if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
		    (flag & ZCIEXACT)) ||
		    (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
		    !(flag & ZCILOOK))) {
			mt |= MT_MATCH_CASE;
		}

		error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
		    dl->dl_name, mt, tx);
	} else {
		error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
		    tx);
	}

	return (error);
}

static int
zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
	boolean_t unlinked = B_FALSE;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];
	uint64_t links;
	int count = 0;
	int error;

	if (zp_is_dir && !zfs_dirempty(zp))
		return (SET_ERROR(ENOTEMPTY));

	if (ZTOI(zp)->i_nlink <= zp_is_dir) {
		zfs_panic_recover("zfs: link count on %lu is %u, "
		    "should be at least %u", zp->z_id,
		    (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
		set_nlink(ZTOI(zp), zp_is_dir + 1);
	}
	drop_nlink(ZTOI(zp));
	if (ZTOI(zp)->i_nlink == zp_is_dir) {
		zp->z_unlinked = B_TRUE;
		clear_nlink(ZTOI(zp));
		unlinked = B_TRUE;
	} else {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, sizeof (ctime));
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
		    ctime);
	}
	links = ZTOI(zp)->i_nlink;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
	    NULL, &links, sizeof (links));
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	if (unlinkedp != NULL)
		*unlinkedp = unlinked;
	else if (unlinked)
		zfs_unlinked_add(zp, tx);

	return (0);
}

/*
 * Forcefully drop an nlink reference from (zp) and mark it for deletion if it
 * was the last link.  This *must* only be done to znodes which have already
 * been zfs_link_destroy()'d with ZRENAMING.  This is explicitly only used in
 * the error path of zfs_rename(), where we have to correct the nlink count if
 * we failed to link the target as well as failing to re-link the original
 * znodes.
 */
int
zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
{
	int error;

	mutex_enter(&zp->z_lock);
	error = zfs_drop_nlink_locked(zp, tx, unlinkedp);
	mutex_exit(&zp->z_lock);

	return (error);
}

/*
 * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
 * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
 * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
 * If it's non-NULL, we use it to indicate whether the znode needs deletion,
 * and it's the caller's job to do it.
 */
int
zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
    boolean_t *unlinkedp)
{
	znode_t *dzp = dl->dl_dzp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
	boolean_t unlinked = B_FALSE;
	sa_bulk_attr_t bulk[5];
	uint64_t mtime[2], ctime[2];
	uint64_t links;
	int count = 0;
	int error;

	if (!(flag & ZRENAMING)) {
		mutex_enter(&zp->z_lock);

		if (zp_is_dir && !zfs_dirempty(zp)) {
			mutex_exit(&zp->z_lock);
			return (SET_ERROR(ENOTEMPTY));
		}

		/*
		 * If we get here, we are going to try to remove the object.
		 * First try removing the name from the directory; if that
		 * fails, return the error.
		 */
		error = zfs_dropname(dl, zp, dzp, tx, flag);
		if (error != 0) {
			mutex_exit(&zp->z_lock);
			return (error);
		}

		/* The only error is !zfs_dirempty() and we checked earlier. */
		error = zfs_drop_nlink_locked(zp, tx, &unlinked);
		ASSERT0(error);
		mutex_exit(&zp->z_lock);
	} else {
		error = zfs_dropname(dl, zp, dzp, tx, flag);
		if (error != 0)
			return (error);
	}

	mutex_enter(&dzp->z_lock);
	dzp->z_size--;			/* one dirent removed */
	if (zp_is_dir)
		drop_nlink(ZTOI(dzp));	/* ".." link from zp */
	links = ZTOI(dzp)->i_nlink;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
	    NULL, &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &dzp->z_size, sizeof (dzp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
	    NULL, ctime, sizeof (ctime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
	    NULL, mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);
	mutex_exit(&dzp->z_lock);

	if (unlinkedp != NULL)
		*unlinkedp = unlinked;
	else if (unlinked)
		zfs_unlinked_add(zp, tx);

	return (0);
}

/*
 * Indicate whether the directory is empty.  Works with or without z_lock
 * held, but can only be considered a hint in the latter case.  Returns true
 * if only "." and ".." remain and there's no work in progress.
 *
 * The internal ZAP size, rather than zp->z_size, needs to be checked since
 * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
 */
boolean_t
zfs_dirempty(znode_t *dzp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	uint64_t count;
	int error;

	if (dzp->z_dirlocks != NULL)
		return (B_FALSE);

	error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
	if (error != 0 || count != 0)
		return (B_FALSE);

	return (B_TRUE);
}

int
zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	znode_t *xzp;
	dmu_tx_t *tx;
	int error;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
#ifdef ZFS_DEBUG
	uint64_t parent;
#endif

	*xzpp = NULL;

	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
	    &acl_ids, zfs_init_idmap)) != 0)
		return (error);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
		zfs_acl_ids_free(&acl_ids);
		return (SET_ERROR(EDQUOT));
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		return (error);
	}
	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

#ifdef ZFS_DEBUG
	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent));
	ASSERT(error == 0 && parent == zp->z_id);
#endif

	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
	    sizeof (xzp->z_id), tx));

	if (!zp->z_unlinked)
		zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL,
		    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	*xzpp = xzp;

	return (0);
}

/*
 * Return a znode for the extended attribute directory for zp.
 * ** If the directory does not already exist, it is created **
 *
 *	IN:	zp	- znode to obtain attribute directory from
 *		cr	- credentials of caller
 *		flags	- flags from the VOP_LOOKUP call
 *
 *	OUT:	xzpp	- pointer to extended attribute znode
 *
 *	RETURN:	0 on success
 *		error number on failure
 */
int
zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	znode_t *xzp;
	zfs_dirlock_t *dl;
	vattr_t va;
	int error;
top:
	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
	if (error)
		return (error);

	if (xzp != NULL) {
		*xzpp = xzp;
		zfs_dirent_unlock(dl);
		return (0);
	}

	if (!(flags & CREATE_XATTR_DIR)) {
		zfs_dirent_unlock(dl);
		return (SET_ERROR(ENOENT));
	}

	if (zfs_is_readonly(zfsvfs)) {
		zfs_dirent_unlock(dl);
		return (SET_ERROR(EROFS));
	}

	/*
	 * The ability to 'create' files in an attribute
	 * directory comes from the write_xattr permission on the base file.
	 *
	 * The ability to 'search' an attribute directory requires
	 * read_xattr permission on the base file.
	 *
	 * Once in a directory the ability to read/write attributes
	 * is controlled by the permissions on the attribute file.
	 */
	va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
	va.va_mode = S_IFDIR | S_ISVTX | 0777;
	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);

	va.va_dentry = NULL;
	error = zfs_make_xattrdir(zp, &va, xzpp, cr);
	zfs_dirent_unlock(dl);

	if (error == ERESTART) {
		/* NB: we already did dmu_tx_wait() if necessary */
		goto top;
	}

	return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.
 *
 * In sticky directories, write access is not sufficient;
 * you can remove entries from a directory only if:
 *
 *	you own the directory,
 *	you own the entry,
 *	you have write access to the entry,
 *	or you are privileged (checked in secpolicy...).
 *
 * The function returns 0 if remove access is granted.
 */
int
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
{
	uid_t uid;
	uid_t downer;
	uid_t fowner;
	zfsvfs_t *zfsvfs = ZTOZSB(zdp);

	if (zfsvfs->z_replay)
		return (0);

	if ((zdp->z_mode & S_ISVTX) == 0)
		return (0);

	downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
	    cr, ZFS_OWNER);
	fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
	    cr, ZFS_OWNER);

	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
	    zfs_init_idmap) == 0)
		return (0);
	else
		return (secpolicy_vnode_remove(cr));
}