Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 *
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * LLNL-CODE-403049.
 * Rewritten for Linux by:
 *   Rohan Puri <[email protected]>
 *   Brian Behlendorf <[email protected]>
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright (c) 2018 George Melikov. All Rights Reserved.
 * Copyright (c) 2019 Datto, Inc. All rights reserved.
 * Copyright (c) 2020 The MathWorks, Inc. All rights reserved.
 */

/*
 * ZFS control directory (a.k.a. ".zfs")
 *
 * This directory provides a common location for all ZFS meta-objects.
 * Currently, this is only the 'snapshot' and 'shares' directory, but this may
 * expand in the future.  The elements are built dynamically, as the hierarchy
 * does not actually exist on disk.
 *
 * For 'snapshot', we don't want to have all snapshots always mounted, because
 * this would take up a huge amount of space in /etc/mnttab.  We have three
 * types of objects:
 *
 *     ctldir ------> snapshotdir -------> snapshot
 *                                             |
 *                                             |
 *                                             V
 *                                         mounted fs
 *
 * The 'snapshot' node contains just enough information to lookup '..' and
 * act as a mountpoint for the snapshot.  Whenever we lookup a specific
 * snapshot, we perform an automount of the underlying filesystem and return
 * the corresponding inode.
 *
 * All mounts are handled automatically by a user mode helper which invokes
 * the mount procedure.  Unmounts are handled by allowing the mount
 * point to expire so the kernel may automatically unmount it.
 *
 * The '.zfs', '.zfs/snapshot', and all directories created under
 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
 * zfsvfs_t as the head filesystem (what '.zfs' lives under).
 *
 * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
 * (ie: snapshots) are complete ZFS filesystems and have their own unique
 * zfsvfs_t.  However, the fsid reported by these mounts will be the same
 * as that used by the parent zfsvfs_t to make NFS happy.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/pathname.h>
#include <sys/vfs.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/stat.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_deleg.h>
#include <sys/zpl.h>
#include <sys/mntent.h>
#include "zfs_namecheck.h"

/*
 * Two AVL trees are maintained which contain all currently automounted
 * snapshots.  Every automounted snapshot maps to a single zfs_snapentry_t
 * entry which MUST:
 *
 *   - be attached to both trees, and
 *   - be unique, no duplicate entries are allowed.
 *
 * The zfs_snapshots_by_name tree is indexed by the full dataset name
 * while the zfs_snapshots_by_objsetid tree is indexed by the unique
 * objsetid.  This allows for fast lookups either by name or objsetid.
 */
static avl_tree_t zfs_snapshots_by_name;
static avl_tree_t zfs_snapshots_by_objsetid;
static krwlock_t zfs_snapshot_lock;

/*
 * Control Directory Tunables (.zfs)
 */
int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
static int zfs_admin_snapshot = 0;
static int zfs_snapshot_no_setuid = 0;
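
/*
 * zfs_expire_snapshot is expressed in seconds; a value of zero or less
 * disables the automatic expiration of automounted snapshots (see
 * zfsctl_snapshot_unmount_delay_impl() and snapentry_expire() below).
 * zfs_admin_snapshot gates mkdir/rmdir/mv under '.zfs/snapshot' and is
 * disabled by default.  zfs_snapshot_no_setuid causes automounted snapshots
 * to be mounted with the "nosuid" option.  All three are exported as module
 * parameters at the end of this file.
 */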

typedef struct {
	char		*se_name;	/* full snapshot name */
	char		*se_path;	/* full mount path */
	spa_t		*se_spa;	/* pool spa */
	uint64_t	se_objsetid;	/* snapshot objset id */
	struct dentry	*se_root_dentry; /* snapshot root dentry */
	krwlock_t	se_taskqid_lock; /* scheduled unmount taskqid lock */
	taskqid_t	se_taskqid;	/* scheduled unmount taskqid */
	avl_node_t	se_node_name;	/* zfs_snapshots_by_name link */
	avl_node_t	se_node_objsetid; /* zfs_snapshots_by_objsetid link */
	zfs_refcount_t	se_refcount;	/* reference count */
} zfs_snapentry_t;

static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);

/*
 * Allocate a new zfs_snapentry_t being careful to make a copy of the
 * snapshot name and provided mount point.  No reference is taken.
 */
static zfs_snapentry_t *
zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
    uint64_t objsetid, struct dentry *root_dentry)
{
	zfs_snapentry_t *se;

	se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);

	se->se_name = kmem_strdup(full_name);
	se->se_path = kmem_strdup(full_path);
	se->se_spa = spa;
	se->se_objsetid = objsetid;
	se->se_root_dentry = root_dentry;
	se->se_taskqid = TASKQID_INVALID;
	rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL);

	zfs_refcount_create(&se->se_refcount);

	return (se);
}

/*
 * Free a zfs_snapentry_t; the caller must ensure there are no active
 * references.
 */
static void
zfsctl_snapshot_free(zfs_snapentry_t *se)
{
	zfs_refcount_destroy(&se->se_refcount);
	kmem_strfree(se->se_name);
	kmem_strfree(se->se_path);
	rw_destroy(&se->se_taskqid_lock);

	kmem_free(se, sizeof (zfs_snapentry_t));
}

/*
 * Hold a reference on the zfs_snapentry_t.
 */
static void
zfsctl_snapshot_hold(zfs_snapentry_t *se)
{
	zfs_refcount_add(&se->se_refcount, NULL);
}

/*
 * Release a reference on the zfs_snapentry_t.  When the number of
 * references drops to zero the structure will be freed.
 */
static void
zfsctl_snapshot_rele(zfs_snapentry_t *se)
{
	if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
		zfsctl_snapshot_free(se);
}

/*
 * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
 * zfs_snapshots_by_objsetid trees.  While the zfs_snapentry_t is part
 * of the trees a reference is held.
 */
static void
zfsctl_snapshot_add(zfs_snapentry_t *se)
{
	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
	zfsctl_snapshot_hold(se);
	avl_add(&zfs_snapshots_by_name, se);
	avl_add(&zfs_snapshots_by_objsetid, se);
}

/*
 * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
 * zfs_snapshots_by_objsetid trees.  Upon removal a reference is dropped;
 * this can result in the structure being freed if that was the last
 * remaining reference.
 */
static void
zfsctl_snapshot_remove(zfs_snapentry_t *se)
{
	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
	avl_remove(&zfs_snapshots_by_name, se);
	avl_remove(&zfs_snapshots_by_objsetid, se);
	zfsctl_snapshot_rele(se);
}

/*
 * Snapshot name comparison function for the zfs_snapshots_by_name tree.
 */
static int
snapentry_compare_by_name(const void *a, const void *b)
{
	const zfs_snapentry_t *se_a = a;
	const zfs_snapentry_t *se_b = b;
	int ret;

	ret = strcmp(se_a->se_name, se_b->se_name);

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Objset id comparison function for the zfs_snapshots_by_objsetid tree.
 */
static int
snapentry_compare_by_objsetid(const void *a, const void *b)
{
	const zfs_snapentry_t *se_a = a;
	const zfs_snapentry_t *se_b = b;

	if (se_a->se_spa != se_b->se_spa)
		return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);

	if (se_a->se_objsetid < se_b->se_objsetid)
		return (-1);
	else if (se_a->se_objsetid > se_b->se_objsetid)
		return (1);
	else
		return (0);
}

/*
 * Find a zfs_snapentry_t in zfs_snapshots_by_name.  If the snapname
 * is found a pointer to the zfs_snapentry_t is returned and a reference
 * taken on the structure.  The caller is responsible for dropping the
 * reference with zfsctl_snapshot_rele().  If the snapname is not found
 * NULL will be returned.
 */
static zfs_snapentry_t *
zfsctl_snapshot_find_by_name(const char *snapname)
{
	zfs_snapentry_t *se, search;

	ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));

	search.se_name = (char *)snapname;
	se = avl_find(&zfs_snapshots_by_name, &search, NULL);
	if (se)
		zfsctl_snapshot_hold(se);

	return (se);
}

/*
 * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
 * rather than the snapname.  In all other respects it behaves the same
 * as zfsctl_snapshot_find_by_name().
 */
static zfs_snapentry_t *
zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
{
	zfs_snapentry_t *se, search;

	ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));

	search.se_spa = spa;
	search.se_objsetid = objsetid;
	se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
	if (se)
		zfsctl_snapshot_hold(se);

	return (se);
}

/*
 * Rename a zfs_snapentry_t in the zfs_snapshots_by_name.  The structure is
 * removed, renamed, and added back to the new correct location in the tree.
 */
static int
zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname)
{
	zfs_snapentry_t *se;

	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));

	se = zfsctl_snapshot_find_by_name(old_snapname);
	if (se == NULL)
		return (SET_ERROR(ENOENT));

	zfsctl_snapshot_remove(se);
	kmem_strfree(se->se_name);
	se->se_name = kmem_strdup(new_snapname);
	zfsctl_snapshot_add(se);
	zfsctl_snapshot_rele(se);

	return (0);
}

/*
 * Delayed task responsible for unmounting an expired automounted snapshot.
 */
static void
snapentry_expire(void *data)
{
	zfs_snapentry_t *se = (zfs_snapentry_t *)data;
	spa_t *spa = se->se_spa;
	uint64_t objsetid = se->se_objsetid;

	if (zfs_expire_snapshot <= 0) {
		zfsctl_snapshot_rele(se);
		return;
	}

	rw_enter(&se->se_taskqid_lock, RW_WRITER);
	se->se_taskqid = TASKQID_INVALID;
	rw_exit(&se->se_taskqid_lock);
	(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
	zfsctl_snapshot_rele(se);

	/*
	 * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
	 * This can occur when the snapshot is busy.
	 */
	rw_enter(&zfs_snapshot_lock, RW_READER);
	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
		zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
		zfsctl_snapshot_rele(se);
	}
	rw_exit(&zfs_snapshot_lock);
}

/*
 * Cancel an automatic unmount of a snapname.  This callback is responsible
 * for dropping the reference on the zfs_snapentry_t which was taken
 * during dispatch.
 */
static void
zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
{
	int err = 0;
	rw_enter(&se->se_taskqid_lock, RW_WRITER);
	err = taskq_cancel_id(system_delay_taskq, se->se_taskqid);
	/*
	 * If we get ENOENT, the taskq couldn't be found to be
	 * canceled, so we can just mark it as invalid because
	 * it's already gone.  If we got EBUSY, then we already
	 * blocked until it was gone _anyway_, so we don't care.
	 */
	se->se_taskqid = TASKQID_INVALID;
	rw_exit(&se->se_taskqid_lock);
	if (err == 0) {
		zfsctl_snapshot_rele(se);
	}
}

/*
 * Dispatch the unmount task for delayed handling with a hold protecting it.
 */
static void
zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
{

	if (delay <= 0)
		return;

	zfsctl_snapshot_hold(se);
	rw_enter(&se->se_taskqid_lock, RW_WRITER);
	/*
	 * If this condition happens, we managed to:
	 * - dispatch once
	 * - want to dispatch _again_ before it returned
	 *
	 * So let's just return - if that task fails at unmounting,
	 * we'll eventually dispatch again, and if it succeeds,
	 * no problem.
	 */
	if (se->se_taskqid != TASKQID_INVALID) {
		rw_exit(&se->se_taskqid_lock);
		zfsctl_snapshot_rele(se);
		return;
	}
	se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
	    snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
	rw_exit(&se->se_taskqid_lock);
}
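
/*
 * At most one delayed unmount task is outstanding per zfs_snapentry_t, and
 * se_taskqid records it.  zfsctl_snapshot_unmount_delay_impl() takes a hold
 * on the entry for the dispatched task; snapentry_expire() drops that hold
 * when it runs, and zfsctl_snapshot_unmount_cancel() drops it when a pending
 * task is cancelled before it has run.
 */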

/*
 * Schedule an automatic unmount of objset id to occur in delay seconds from
 * now.  Any previous delayed unmount will be cancelled in favor of the
 * updated deadline.  A reference is taken by
 * zfsctl_snapshot_find_by_objsetid() and held until the outstanding task is
 * handled or cancelled.
 */
int
zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
{
	zfs_snapentry_t *se;
	int error = ENOENT;

	rw_enter(&zfs_snapshot_lock, RW_READER);
	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
		zfsctl_snapshot_unmount_cancel(se);
		zfsctl_snapshot_unmount_delay_impl(se, delay);
		zfsctl_snapshot_rele(se);
		error = 0;
	}
	rw_exit(&zfs_snapshot_lock);

	return (error);
}

/*
 * Check if snapname is currently mounted.  Returns B_TRUE when mounted
 * and B_FALSE when unmounted.
 */
static boolean_t
zfsctl_snapshot_ismounted(const char *snapname)
{
	zfs_snapentry_t *se;
	boolean_t ismounted = B_FALSE;

	rw_enter(&zfs_snapshot_lock, RW_READER);
	if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
		zfsctl_snapshot_rele(se);
		ismounted = B_TRUE;
	}
	rw_exit(&zfs_snapshot_lock);

	return (ismounted);
}

/*
 * Check if the given inode is a part of the virtual .zfs directory.
 */
boolean_t
zfsctl_is_node(struct inode *ip)
{
	return (ITOZ(ip)->z_is_ctldir);
}

/*
 * Check if the given inode is a .zfs/snapshot/<snapname> directory.
 */
boolean_t
zfsctl_is_snapdir(struct inode *ip)
{
	return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
}
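
/*
 * Directories under '.zfs/snapshot' are assigned inode numbers counting
 * down from ZFSCTL_INO_SNAPDIRS: the entry for a snapshot with objset id N
 * is given inode ZFSCTL_INO_SNAPDIRS - N (see zfsctl_snapdir_lookup()
 * below).  This is why zfsctl_is_snapdir() can identify them with a simple
 * range check, and why the objset id can be recovered from the inode number
 * in zfsctl_snapdir_fid() and zfsctl_snapdir_vget().
 */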

/*
 * Allocate a new inode with the passed id and ops.
 */
static struct inode *
zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
    const struct file_operations *fops, const struct inode_operations *ops,
    uint64_t creation)
{
	struct inode *ip;
	znode_t *zp;
	inode_timespec_t now = {.tv_sec = creation};

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	if (!creation)
		now = current_time(ip);
	zp = ITOZ(ip);
	ASSERT0P(zp->z_dirlocks);
	ASSERT0P(zp->z_acl_cached);
	ASSERT0P(zp->z_xattr_cached);
	zp->z_id = id;
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
	zp->z_zn_prefetch = B_FALSE;
	zp->z_is_sa = B_FALSE;
	zp->z_is_ctldir = B_TRUE;
	zp->z_sa_hdl = NULL;
	zp->z_blksz = 0;
	zp->z_seq = 0;
	zp->z_mapcnt = 0;
	zp->z_size = 0;
	zp->z_pflags = 0;
	zp->z_mode = 0;
	zp->z_sync_cnt = 0;
	ip->i_generation = 0;
	ip->i_ino = id;
	ip->i_mode = (S_IFDIR | S_IRWXUGO);
	ip->i_uid = SUID_TO_KUID(0);
	ip->i_gid = SGID_TO_KGID(0);
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	zpl_inode_set_atime_to_ts(ip, now);
	zpl_inode_set_mtime_to_ts(ip, now);
	zpl_inode_set_ctime_to_ts(ip, now);
	ip->i_fop = fops;
	ip->i_op = ops;
#if defined(IOP_XATTR)
	ip->i_opflags &= ~IOP_XATTR;
#endif

	if (insert_inode_locked(ip)) {
		unlock_new_inode(ip);
		iput(ip);
		return (NULL);
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	mutex_exit(&zfsvfs->z_znodes_lock);

	unlock_new_inode(ip);

	return (ip);
}

/*
 * Lookup the inode with given id; it will be allocated if needed.
 */
static struct inode *
zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
    const struct file_operations *fops, const struct inode_operations *ops)
{
	struct inode *ip = NULL;
	uint64_t creation = 0;
	dsl_dataset_t *snap_ds;
	dsl_pool_t *pool;

	while (ip == NULL) {
		ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
		if (ip)
			break;

		if (id <= ZFSCTL_INO_SNAPDIRS && !creation) {
			pool = dmu_objset_pool(zfsvfs->z_os);
			dsl_pool_config_enter(pool, FTAG);
			if (!dsl_dataset_hold_obj(pool,
			    ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) {
				creation = dsl_get_creation(snap_ds);
				dsl_dataset_rele(snap_ds, FTAG);
			}
			dsl_pool_config_exit(pool, FTAG);
		}

		/* May fail due to concurrent zfsctl_inode_alloc() */
		ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation);
	}

	return (ip);
}

/*
 * Create the '.zfs' directory.  This directory is cached as part of the VFS
 * structure.  This results in a hold on the zfsvfs_t.  The code in
 * zfs_umount() therefore checks against a vfs_count of 2 instead of 1.  This
 * reference is removed when the ctldir is destroyed in the unmount.  All
 * other entities under the '.zfs' directory are created dynamically as
 * needed.
 *
 * Because the dynamically created '.zfs' directory entries assume the use
 * of 64-bit inode numbers this support must be disabled on 32-bit systems.
 */
int
zfsctl_create(zfsvfs_t *zfsvfs)
{
	ASSERT0P(zfsvfs->z_ctldir);

	zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
	    &zpl_fops_root, &zpl_ops_root, 0);
	if (zfsvfs->z_ctldir == NULL)
		return (SET_ERROR(ENOENT));

	return (0);
}

/*
 * Destroy the '.zfs' directory or remove a snapshot from
 * zfs_snapshots_by_name.  Only called when the filesystem is unmounted.
 */
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
	if (zfsvfs->z_issnap) {
		zfs_snapentry_t *se;
		spa_t *spa = zfsvfs->z_os->os_spa;
		uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);

		rw_enter(&zfs_snapshot_lock, RW_WRITER);
		se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
		if (se != NULL)
			zfsctl_snapshot_remove(se);
		rw_exit(&zfs_snapshot_lock);
		if (se != NULL) {
			zfsctl_snapshot_unmount_cancel(se);
			zfsctl_snapshot_rele(se);
		}
	} else if (zfsvfs->z_ctldir) {
		iput(zfsvfs->z_ctldir);
		zfsvfs->z_ctldir = NULL;
	}
}

/*
 * Given a root znode, retrieve the associated .zfs directory.
 * Add a hold to the vnode and return it.
 */
struct inode *
zfsctl_root(znode_t *zp)
{
	ASSERT(zfs_has_ctldir(zp));
	/* Must have an existing ref, so igrab() cannot return NULL */
	VERIFY3P(igrab(ZTOZSB(zp)->z_ctldir), !=, NULL);
	return (ZTOZSB(zp)->z_ctldir);
}

/*
 * Generate a long fid to indicate a snapdir.  We encode whether snapdir is
 * already mounted in gen field.  We do this because nfsd lookup will not
 * trigger automount.  Next time the nfsd does fh_to_dentry, we will notice
 * this and do automount and return ESTALE to force nfsd revalidate and follow
 * mount.
 */
static int
zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
{
	zfid_short_t *zfid = (zfid_short_t *)fidp;
	zfid_long_t *zlfid = (zfid_long_t *)fidp;
	uint32_t gen = 0;
	uint64_t object;
	uint64_t objsetid;
	int i;
	struct dentry *dentry;

	if (fidp->fid_len < LONG_FID_LEN) {
		fidp->fid_len = LONG_FID_LEN;
		return (SET_ERROR(ENOSPC));
	}

	object = ip->i_ino;
	objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
	zfid->zf_len = LONG_FID_LEN;

	dentry = d_obtain_alias(igrab(ip));
	if (!IS_ERR(dentry)) {
		gen = !!d_mountpoint(dentry);
		dput(dentry);
	}

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	for (i = 0; i < sizeof (zlfid->zf_setid); i++)
		zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

	for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
		zlfid->zf_setgen[i] = 0;

	return (0);
}

/*
 * Generate an appropriate fid for an entry in the .zfs directory.
 */
int
zfsctl_fid(struct inode *ip, fid_t *fidp)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	uint64_t object = zp->z_id;
	zfid_short_t *zfid;
	int i;
	int error;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	if (zfsctl_is_snapdir(ip)) {
		zfs_exit(zfsvfs, FTAG);
		return (zfsctl_snapdir_fid(ip, fidp));
	}

	if (fidp->fid_len < SHORT_FID_LEN) {
		fidp->fid_len = SHORT_FID_LEN;
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = SHORT_FID_LEN;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* .zfs znodes always have a generation number of 0 */
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = 0;

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

/*
 * Construct a full dataset name in full_name: "pool/dataset@snap_name"
 */
static int
zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
    char *full_name)
{
	objset_t *os = zfsvfs->z_os;

	if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
		return (SET_ERROR(EILSEQ));

	dmu_objset_name(os, full_name);
	if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
		return (SET_ERROR(ENAMETOOLONG));

	(void) strcat(full_name, "@");
	(void) strcat(full_name, snap_name);

	return (0);
}

/*
 * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
 */
static int
zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
    int path_len, char *full_path)
{
	objset_t *os = zfsvfs->z_os;
	fstrans_cookie_t cookie;
	char *snapname;
	boolean_t case_conflict;
	uint64_t id, pos = 0;
	int error = 0;

	cookie = spl_fstrans_mark();
	snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);

	while (error == 0) {
		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		error = dmu_snapshot_list_next(zfsvfs->z_os,
		    ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos,
		    &case_conflict);
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		if (error)
			goto out;

		if (id == objsetid)
			break;
	}

	mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
	if (zfsvfs->z_vfs->vfs_mntpoint != NULL) {
		snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
		    zfsvfs->z_vfs->vfs_mntpoint, snapname);
	} else
		error = SET_ERROR(ENOENT);
	mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);

out:
	kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
	spl_fstrans_unmark(cookie);

	return (error);
}

/*
 * Special case the handling of "..".
 */
int
zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp,
    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	int error = 0;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED) {
		*ipp = NULL;
	} else if (strcmp(name, "..") == 0) {
		*ipp = dip->i_sb->s_root->d_inode;
	} else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
		*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR,
		    &zpl_fops_snapdir, &zpl_ops_snapdir);
	} else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
		*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES,
		    &zpl_fops_shares, &zpl_ops_shares);
	} else {
		*ipp = NULL;
	}

	if (*ipp == NULL)
		error = SET_ERROR(ENOENT);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Lookup entry point for the 'snapshot' directory.  Try to open the
 * snapshot if it exists, creating the pseudo filesystem inode as necessary.
 */
int
zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp,
    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	uint64_t id;
	int error;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id,
	    &simple_dir_operations, &simple_dir_inode_operations);
	if (*ipp == NULL)
		error = SET_ERROR(ENOENT);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
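
/*
 * Note that the inode looked up above for a '.zfs/snapshot/<snapname>' entry
 * only needs to act as a mount point, so it is wired up with the kernel's
 * generic simple_dir_operations; the real file system operations are
 * provided by the snapshot itself once it has been automounted on top of
 * this inode.
 */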

/*
 * Renaming a directory under '.zfs/snapshot' will automatically trigger
 * a rename of the snapshot to the new given name.  The rename is confined
 * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
 */
int
zfsctl_snapdir_rename(struct inode *sdip, const char *snm,
    struct inode *tdip, const char *tnm, cred_t *cr, int flags)
{
	zfsvfs_t *zfsvfs = ITOZSB(sdip);
	char *to, *from, *real, *fsname;
	int error;

	if (!zfs_admin_snapshot)
		return (SET_ERROR(EACCES));

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);

	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		error = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
		    ZFS_MAX_DATASET_NAME_LEN, NULL);
		if (error == 0) {
			snm = real;
		} else if (error != ENOTSUP) {
			goto out;
		}
	}

	dmu_objset_name(zfsvfs->z_os, fsname);

	error = zfsctl_snapshot_name(ITOZSB(sdip), snm,
	    ZFS_MAX_DATASET_NAME_LEN, from);
	if (error == 0)
		error = zfsctl_snapshot_name(ITOZSB(tdip), tnm,
		    ZFS_MAX_DATASET_NAME_LEN, to);
	if (error == 0)
		error = zfs_secpolicy_rename_perms(from, to, cr);
	if (error != 0)
		goto out;

	/*
	 * Cannot move snapshots out of the snapdir.
	 */
	if (sdip != tdip) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * No-op when names are identical.
	 */
	if (strcmp(snm, tnm) == 0) {
		error = 0;
		goto out;
	}

	rw_enter(&zfs_snapshot_lock, RW_WRITER);

	error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
	if (error == 0)
		(void) zfsctl_snapshot_rename(snm, tnm);

	rw_exit(&zfs_snapshot_lock);
out:
	kmem_free(from, ZFS_MAX_DATASET_NAME_LEN);
	kmem_free(to, ZFS_MAX_DATASET_NAME_LEN);
	kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Removing a directory under '.zfs/snapshot' will automatically trigger
 * the removal of the snapshot with the given name.
 */
int
zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr,
    int flags)
{
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	char *snapname, *real;
	int error;

	if (!zfs_admin_snapshot)
		return (SET_ERROR(EACCES));

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);

	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		error = dmu_snapshot_realname(zfsvfs->z_os, name, real,
		    ZFS_MAX_DATASET_NAME_LEN, NULL);
		if (error == 0) {
			name = real;
		} else if (error != ENOTSUP) {
			goto out;
		}
	}

	error = zfsctl_snapshot_name(ITOZSB(dip), name,
	    ZFS_MAX_DATASET_NAME_LEN, snapname);
	if (error == 0)
		error = zfs_secpolicy_destroy_perms(snapname, cr);
	if (error != 0)
		goto out;

	error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
	if ((error == 0) || (error == ENOENT))
		error = dsl_destroy_snapshot(snapname, B_FALSE);
out:
	kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
	kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Creating a directory under '.zfs/snapshot' will automatically trigger
 * the creation of a new snapshot with the given name.
 */
int
zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, vattr_t *vap,
    struct inode **ipp, cred_t *cr, int flags)
{
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	char *dsname;
	int error;

	if (!zfs_admin_snapshot)
		return (SET_ERROR(EACCES));

	dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);

	if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
		error = SET_ERROR(EILSEQ);
		goto out;
	}

	dmu_objset_name(zfsvfs->z_os, dsname);

	error = zfs_secpolicy_snapshot_perms(dsname, cr);
	if (error != 0)
		goto out;

	if (error == 0) {
		error = dmu_objset_snapshot_one(dsname, dirname);
		if (error != 0)
			goto out;

		error = zfsctl_snapdir_lookup(dip, dirname, ipp,
		    0, cr, NULL, NULL);
	}
out:
	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);

	return (error);
}

/*
 * Flush everything out of the kernel's export table and such.
 * This is needed as once the snapshot is used over NFS, its
 * entries in svc_export and svc_expkey caches hold reference
 * to the snapshot mount point.  There is no known way of flushing
 * only the entries related to the snapshot.
 */
static void
exportfs_flush(void)
{
	char *argv[] = { "/usr/sbin/exportfs", "-f", NULL };
	char *envp[] = { NULL };

	(void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}

/*
 * Returns the path in char format for the given struct path.  Uses
 * d_path exported by the kernel to convert the struct path to char
 * format.  Returns the correct path for mountpoints and chroot
 * environments.
 *
 * If a chroot environment has directories that are mounted with the
 * --bind or --rbind flag, d_path returns the complete path inside the
 * chroot environment but does not return the absolute path, i.e.
 * the path to the chroot environment is missing.
 */
static int
get_root_path(struct path *path, char *buff, int len)
{
	char *path_buffer, *path_ptr;
	int error = 0;

	path_get(path);
	path_buffer = kmem_zalloc(len, KM_SLEEP);
	path_ptr = d_path(path, path_buffer, len);
	if (IS_ERR(path_ptr))
		error = SET_ERROR(-PTR_ERR(path_ptr));
	else
		strcpy(buff, path_ptr);

	kmem_free(path_buffer, len);
	path_put(path);
	return (error);
}

/*
 * Returns whether the root of the current process is chrooted.  The Linux
 * kernel exposes the task_struct for the current process and for init.
 * Since the init process root points to the actual root filesystem once the
 * Linux runtime is reached, we can compare the current process root with the
 * init process root; if they differ, the current process is running in a
 * chroot context.
 */
static int
is_current_chrooted(void)
{
	struct task_struct *curr = current, *global = &init_task;
	struct path cr_root, gl_root;

	task_lock(curr);
	get_fs_root(curr->fs, &cr_root);
	task_unlock(curr);

	task_lock(global);
	get_fs_root(global->fs, &gl_root);
	task_unlock(global);

	int chrooted = !path_equal(&cr_root, &gl_root);
	path_put(&gl_root);
	path_put(&cr_root);

	return (chrooted);
}
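
/*
 * Snapshot mounts and unmounts are delegated to user mode helpers.  The argv
 * arrays constructed in zfsctl_snapshot_unmount() and zfsctl_snapshot_mount()
 * below effectively run:
 *
 *	umount -t zfs -n <mountpoint>	(with -fn instead of -n for MNT_FORCE)
 *	mount -i -t zfs -n -o <suid|nosuid> <pool/dataset@snap> <mountpoint>
 */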

/*
 * Attempt to unmount a snapshot by making a call to user space.
 * There is no assurance that this can or will succeed; it is just a
 * best effort.  In the case where it does fail, perhaps because
 * it's in use, the unmount will fail harmlessly.
 */
int
zfsctl_snapshot_unmount(const char *snapname, int flags)
{
	char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL,
	    NULL };
	char *envp[] = { NULL };
	zfs_snapentry_t *se;
	int error;

	rw_enter(&zfs_snapshot_lock, RW_READER);
	if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
		rw_exit(&zfs_snapshot_lock);
		return (SET_ERROR(ENOENT));
	}
	rw_exit(&zfs_snapshot_lock);

	exportfs_flush();

	if (flags & MNT_FORCE)
		argv[4] = "-fn";
	argv[5] = se->se_path;
	dprintf("unmount; path=%s\n", se->se_path);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	zfsctl_snapshot_rele(se);

	/*
	 * The umount system utility will return 256 on error.  We must
	 * assume this error is because the file system is busy so it is
	 * converted to the more sensible EBUSY.
	 */
	if (error)
		error = SET_ERROR(EBUSY);

	return (error);
}

int
zfsctl_snapshot_mount(struct path *path, int flags)
{
	struct dentry *dentry = path->dentry;
	struct inode *ip = dentry->d_inode;
	zfsvfs_t *zfsvfs;
	zfsvfs_t *snap_zfsvfs;
	zfs_snapentry_t *se;
	char *full_name, *full_path, *options;
	char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
	    "-o", NULL, NULL, NULL, NULL };
	char *envp[] = { NULL };
	int error;
	struct path spath;

	if (ip == NULL)
		return (SET_ERROR(EISDIR));

	zfsvfs = ITOZSB(ip);
	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	options = kmem_zalloc(7, KM_SLEEP);

	error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
	    ZFS_MAX_DATASET_NAME_LEN, full_name);
	if (error)
		goto error;

	if (is_current_chrooted() == 0) {
		/*
		 * Current process is not in a chroot context
		 */

		char *m = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
		struct path mnt_path;
		mnt_path.mnt = path->mnt;
		mnt_path.dentry = path->mnt->mnt_root;

		/*
		 * Get path to current mountpoint
		 */
		error = get_root_path(&mnt_path, m, MAXPATHLEN);
		if (error != 0) {
			kmem_free(m, MAXPATHLEN);
			goto error;
		}
		mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
		if (zfsvfs->z_vfs->vfs_mntpoint != NULL) {
			/*
			 * If the current mountpoint and vfs_mntpoint are not
			 * the same, store the current mountpoint in
			 * vfs_mntpoint.
			 */
			if (strcmp(zfsvfs->z_vfs->vfs_mntpoint, m) != 0) {
				kmem_strfree(zfsvfs->z_vfs->vfs_mntpoint);
				zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m);
			}
		} else
			zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m);
		mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);
		kmem_free(m, MAXPATHLEN);
	}

	/*
	 * Construct a mount point path from the sb of the ctldir inode and
	 * the dirent name, instead of from d_path(), so that a chroot'd
	 * process doesn't fail on mount.zfs(8).
	 */
	mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
	snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
	    zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "",
	    dname(dentry));
	mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);

	snprintf(options, 7, "%s",
	    zfs_snapshot_no_setuid ? "nosuid" : "suid");

	/*
	 * Multiple concurrent automounts of a snapshot are never allowed.
	 * The snapshot may be manually mounted as many times as desired.
	 */
	if (zfsctl_snapshot_ismounted(full_name)) {
		error = 0;
		goto error;
	}

	/*
	 * Attempt to mount the snapshot from user space.  Normally this
	 * would be done using the vfs_kern_mount() function, however that
	 * function is marked GPL-only and cannot be used.  On error we are
	 * careful to log the real error to the console and return EISDIR
	 * to safely abort the automount.  This should be very rare.
	 *
	 * If the user mode helper happens to return EBUSY, a concurrent
	 * mount is already in progress in which case the error is ignored.
	 * Take note that if the program was executed successfully the return
	 * value from call_usermodehelper() will be (exitcode << 8 + signal).
	 */
	dprintf("mount; name=%s path=%s\n", full_name, full_path);
	argv[7] = options;
	argv[8] = full_name;
	argv[9] = full_path;
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	if (error) {
		if (!(error & MOUNT_BUSY << 8)) {
			zfs_dbgmsg("Unable to automount %s error=%d",
			    full_path, error);
			error = SET_ERROR(EISDIR);
		} else {
			/*
			 * EBUSY, this could mean a concurrent mount, or the
			 * snapshot has already been mounted at a completely
			 * different place.  We return 0 so VFS will retry.
			 * For the latter case the VFS will retry several
			 * times and return ELOOP, which is probably not a
			 * very good behavior.
			 */
			error = 0;
		}
		goto error;
	}

	/*
	 * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
	 * to identify this as an automounted filesystem.
	 */
	spath = *path;
	path_get(&spath);
	if (follow_down_one(&spath)) {
		snap_zfsvfs = ITOZSB(spath.dentry->d_inode);
		snap_zfsvfs->z_parent = zfsvfs;
		dentry = spath.dentry;
		spath.mnt->mnt_flags |= MNT_SHRINKABLE;

		rw_enter(&zfs_snapshot_lock, RW_WRITER);
		se = zfsctl_snapshot_alloc(full_name, full_path,
		    snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
		    dentry);
		zfsctl_snapshot_add(se);
		zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
		rw_exit(&zfs_snapshot_lock);
	}
	path_put(&spath);
error:
	kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
	kmem_free(full_path, MAXPATHLEN);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Get the snapdir inode from fid
 */
int
zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
    struct inode **ipp)
{
	int error;
	struct path path;
	char *mnt;
	struct dentry *dentry;

	mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
	    MAXPATHLEN, mnt);
	if (error)
		goto out;

	/* Trigger automount */
	error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
	if (error)
		goto out;

	path_put(&path);
	/*
	 * Get the snapdir inode.  Note, we don't want to use the above
	 * path because it contains the root of the snapshot rather
	 * than the snapdir.
	 */
	*ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid);
	if (*ipp == NULL) {
		error = SET_ERROR(ENOENT);
		goto out;
	}

	/* check gen, see zfsctl_snapdir_fid */
	dentry = d_obtain_alias(igrab(*ipp));
	if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) {
		iput(*ipp);
		*ipp = NULL;
		error = SET_ERROR(ENOENT);
	}
	if (!IS_ERR(dentry))
		dput(dentry);
out:
	kmem_free(mnt, MAXPATHLEN);
	return (error);
}

int
zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	znode_t *zp;
	znode_t *dzp;
	int error;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	if (zfsvfs->z_shares_dir == 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL);
		zrele(dzp);
	}

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Initialize the various pieces we'll need to create and manipulate .zfs
 * directories.  Currently this is unused but available.
 */
void
zfsctl_init(void)
{
	avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
	    se_node_name));
	avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
	    se_node_objsetid));
	rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Cleanup the various pieces we needed for .zfs directories.  In particular
 * ensure the expiry timer is canceled safely.
 */
void
zfsctl_fini(void)
{
	avl_destroy(&zfs_snapshots_by_name);
	avl_destroy(&zfs_snapshots_by_objsetid);
	rw_destroy(&zfs_snapshot_lock);
}

module_param(zfs_admin_snapshot, int, 0644);
MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");

module_param(zfs_expire_snapshot, int, 0644);
MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");

module_param(zfs_snapshot_no_setuid, int, 0644);
MODULE_PARM_DESC(zfs_snapshot_no_setuid,
	"Disable setuid/setgid for automounts in .zfs/snapshot");