/* Source path: sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c */
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright (c) 2011, Lawrence Livermore National Security, LLC.23* Copyright (c) 2023, Datto Inc. All rights reserved.24* Copyright (c) 2025, Klara, Inc.25* Copyright (c) 2025, Rob Norris <[email protected]>26*/272829#include <sys/zfs_znode.h>30#include <sys/zfs_vfsops.h>31#include <sys/zfs_vnops.h>32#include <sys/zfs_ctldir.h>33#include <sys/zpl.h>34#include <linux/iversion.h>35#include <linux/version.h>36#include <linux/vfs_compat.h>3738/*39* What to do when the last reference to an inode is released. If 0, the kernel40* will cache it on the superblock. If 1, the inode will be freed immediately.41* See zpl_drop_inode().42*/43int zfs_delete_inode = 0;4445/*46* What to do when the last reference to a dentry is released. If 0, the kernel47* will cache it until the entry (file) is destroyed. If 1, the dentry will be48* marked for cleanup, at which time its inode reference will be released. 
See49* zpl_dentry_delete().50*/51int zfs_delete_dentry = 0;5253static struct inode *54zpl_inode_alloc(struct super_block *sb)55{56struct inode *ip;5758VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);59inode_set_iversion(ip, 1);6061return (ip);62}6364#ifdef HAVE_SOPS_FREE_INODE65static void66zpl_inode_free(struct inode *ip)67{68ASSERT0(atomic_read(&ip->i_count));69zfs_inode_free(ip);70}71#endif7273static void74zpl_inode_destroy(struct inode *ip)75{76ASSERT0(atomic_read(&ip->i_count));77zfs_inode_destroy(ip);78}7980/*81* Called from __mark_inode_dirty() to reflect that something in the82* inode has changed. We use it to ensure the znode system attributes83* are always strictly update to date with respect to the inode.84*/85static void86zpl_dirty_inode(struct inode *ip, int flags)87{88fstrans_cookie_t cookie;8990cookie = spl_fstrans_mark();91zfs_dirty_inode(ip, flags);92spl_fstrans_unmark(cookie);93}9495/*96* ->drop_inode() is called when the last reference to an inode is released.97* Its return value indicates if the inode should be destroyed immediately, or98* cached on the superblock structure.99*100* By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns101* "destroy immediately" if the inode is unhashed and has no links (roughly: no102* longer exists on disk). 
On datasets with millions of rarely-accessed files,103* this can cause a large amount of memory to be "pinned" by cached inodes,104* which in turn pin their associated dnodes and dbufs, until the kernel starts105* reporting memory pressure and requests OpenZFS release some memory (see106* zfs_prune()).107*108* When set to 1, we call generic_delete_inode(), which always returns "destroy109* immediately", resulting in inodes being destroyed immediately, releasing110* their associated dnodes and dbufs to the dbuf cached and the ARC to be111* evicted as normal.112*113* Note that the "last reference" doesn't always mean the last _userspace_114* reference; the dentry cache also holds a reference, so "busy" inodes will115* still be kept alive that way (subject to dcache tuning).116*/117static int118zpl_drop_inode(struct inode *ip)119{120if (zfs_delete_inode)121return (generic_delete_inode(ip));122return (generic_drop_inode(ip));123}124125/*126* The ->evict_inode() callback must minimally truncate the inode pages,127* and call clear_inode(). For 2.6.35 and later kernels this will128* simply update the inode state, with the sync occurring before the129* truncate in evict(). For earlier kernels clear_inode() maps to130* end_writeback() which is responsible for completing all outstanding131* write back. 
In either case, once this is done it is safe to cleanup132* any remaining inode specific data via zfs_inactive().133* remaining filesystem specific data.134*/135static void136zpl_evict_inode(struct inode *ip)137{138fstrans_cookie_t cookie;139140cookie = spl_fstrans_mark();141truncate_setsize(ip, 0);142clear_inode(ip);143zfs_inactive(ip);144spl_fstrans_unmark(cookie);145}146147static void148zpl_put_super(struct super_block *sb)149{150fstrans_cookie_t cookie;151int error;152153cookie = spl_fstrans_mark();154error = -zfs_umount(sb);155spl_fstrans_unmark(cookie);156ASSERT3S(error, <=, 0);157}158159/*160* zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)161* syscalls, via sb->s_op->sync_fs().162*163* Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->164* sync_filesystem() would ignore the return from sync_fs(), instead only165* considing the error from syncing the underlying block device (sb->s_dev).166* Since OpenZFS doesn't _have_ an underlying block device, there's no way for167* us to report a sync directly.168*169* However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra170* error store `s_wb_err`, to carry errors seen on page writeback since the171* last call to syncfs(). If sync_filesystem() does not return an error, any172* existing writeback error on the superblock will be used instead (and cleared173* either way). We don't use this (page writeback is a different thing for us),174* so for 5.8-5.17 we can use that instead to get syncfs() to return the error.175*176* Before 5.8, we have no other good options - no matter what happens, the177* userspace program will be told the call has succeeded, and so we must make178* it so, Therefore, when we are asked to wait for sync to complete (wait ==179* 1), if zfs_sync() has returned an error we have no choice but to block,180* regardless of the reason.181*182* The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely183* to some vendor kernels. 
Meanwhile, s_wb_err is still in use in 6.15 (the184* mainline Linux series at time of writing), and has likely been backported to185* vendor kernels before 5.8. We don't really want to use a workaround when we186* don't have to, but we can't really detect whether or not sync_filesystem()187* will return our errors (without a difficult runtime test anyway). So, we use188* a static version check: any kernel reporting its version as 5.17+ will use a189* direct error return, otherwise, we'll either use s_wb_err if it was detected190* at configure (5.8-5.16 + vendor backports). If it's unavailable, we will191* block to ensure the correct semantics.192*193* See https://github.com/openzfs/zfs/issues/17416 for further discussion.194*/195static int196zpl_sync_fs(struct super_block *sb, int wait)197{198fstrans_cookie_t cookie;199cred_t *cr = CRED();200int error;201202crhold(cr);203cookie = spl_fstrans_mark();204error = -zfs_sync(sb, wait, cr);205206#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)207#ifdef HAVE_SUPER_BLOCK_S_WB_ERR208if (error && wait)209errseq_set(&sb->s_wb_err, error);210#else211if (error && wait) {212zfsvfs_t *zfsvfs = sb->s_fs_info;213ASSERT3P(zfsvfs, !=, NULL);214if (zfs_enter(zfsvfs, FTAG) == 0) {215txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);216zfs_exit(zfsvfs, FTAG);217error = 0;218}219}220#endif221#endif /* < 5.17.0 */222223spl_fstrans_unmark(cookie);224crfree(cr);225226ASSERT3S(error, <=, 0);227return (error);228}229230static int231zpl_statfs(struct dentry *dentry, struct kstatfs *statp)232{233fstrans_cookie_t cookie;234int error;235236cookie = spl_fstrans_mark();237error = -zfs_statvfs(dentry->d_inode, statp);238spl_fstrans_unmark(cookie);239ASSERT3S(error, <=, 0);240241/*242* If required by a 32-bit system call, dynamically scale the243* block size up to 16MiB and decrease the block counts. This244* allows for a maximum size of 64EiB to be reported. 
The file245* counts must be artificially capped at 2^32-1.246*/247if (unlikely(zpl_is_32bit_api())) {248while (statp->f_blocks > UINT32_MAX &&249statp->f_bsize < SPA_MAXBLOCKSIZE) {250statp->f_frsize <<= 1;251statp->f_bsize <<= 1;252253statp->f_blocks >>= 1;254statp->f_bfree >>= 1;255statp->f_bavail >>= 1;256}257258uint64_t usedobjs = statp->f_files - statp->f_ffree;259statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);260statp->f_files = statp->f_ffree + usedobjs;261}262263return (error);264}265266static int267zpl_remount_fs(struct super_block *sb, int *flags, char *data)268{269zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };270fstrans_cookie_t cookie;271int error;272273cookie = spl_fstrans_mark();274error = -zfs_remount(sb, flags, &zm);275spl_fstrans_unmark(cookie);276ASSERT3S(error, <=, 0);277278return (error);279}280281static int282__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)283{284int error;285if ((error = zpl_enter(zfsvfs, FTAG)) != 0)286return (error);287288char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);289dmu_objset_name(zfsvfs->z_os, fsname);290291for (int i = 0; fsname[i] != 0; i++) {292/*293* Spaces in the dataset name must be converted to their294* octal escape sequence for getmntent(3) to correctly295* parse then fsname portion of /proc/self/mounts.296*/297if (fsname[i] == ' ') {298seq_puts(seq, "\\040");299} else {300seq_putc(seq, fsname[i]);301}302}303304kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);305306zpl_exit(zfsvfs, FTAG);307308return (0);309}310311static int312zpl_show_devname(struct seq_file *seq, struct dentry *root)313{314return (__zpl_show_devname(seq, root->d_sb->s_fs_info));315}316317static int318__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)319{320seq_printf(seq, ",%s",321zfsvfs->z_flags & ZSB_XATTR ? 
"xattr" : "noxattr");322323#ifdef CONFIG_FS_POSIX_ACL324switch (zfsvfs->z_acl_type) {325case ZFS_ACLTYPE_POSIX:326seq_puts(seq, ",posixacl");327break;328default:329seq_puts(seq, ",noacl");330break;331}332#endif /* CONFIG_FS_POSIX_ACL */333334switch (zfsvfs->z_case) {335case ZFS_CASE_SENSITIVE:336seq_puts(seq, ",casesensitive");337break;338case ZFS_CASE_INSENSITIVE:339seq_puts(seq, ",caseinsensitive");340break;341default:342seq_puts(seq, ",casemixed");343break;344}345346return (0);347}348349static int350zpl_show_options(struct seq_file *seq, struct dentry *root)351{352return (__zpl_show_options(seq, root->d_sb->s_fs_info));353}354355static int356zpl_fill_super(struct super_block *sb, void *data, int silent)357{358zfs_mnt_t *zm = (zfs_mnt_t *)data;359fstrans_cookie_t cookie;360int error;361362cookie = spl_fstrans_mark();363error = -zfs_domount(sb, zm, silent);364spl_fstrans_unmark(cookie);365ASSERT3S(error, <=, 0);366367return (error);368}369370static int371zpl_test_super(struct super_block *s, void *data)372{373zfsvfs_t *zfsvfs = s->s_fs_info;374objset_t *os = data;375/*376* If the os doesn't match the z_os in the super_block, assume it is377* not a match. Matching would imply a multimount of a dataset. It is378* possible that during a multimount, there is a simultaneous operation379* that changes the z_os, e.g., rollback, where the match will be380* missed, but in that case the user will get an EBUSY.381*/382return (zfsvfs != NULL && os == zfsvfs->z_os);383}384385static struct super_block *386zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)387{388struct super_block *s;389objset_t *os;390boolean_t issnap = B_FALSE;391int err;392393err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);394if (err)395return (ERR_PTR(-err));396397/*398* The dsl pool lock must be released prior to calling sget().399* It is possible sget() may block on the lock in grab_super()400* while deactivate_super() holds that same lock and waits for401* a txg sync. 
If the dsl_pool lock is held over sget()402* this can prevent the pool sync and cause a deadlock.403*/404dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);405dsl_pool_rele(dmu_objset_pool(os), FTAG);406407s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);408409/*410* Recheck with the lock held to prevent mounting the wrong dataset411* since z_os can be stale when the teardown lock is held.412*413* We can't do this in zpl_test_super in since it's under spinlock and414* also s_umount lock is not held there so it would race with415* zfs_umount and zfsvfs can be freed.416*/417if (!IS_ERR(s) && s->s_fs_info != NULL) {418zfsvfs_t *zfsvfs = s->s_fs_info;419if (zpl_enter(zfsvfs, FTAG) == 0) {420if (os != zfsvfs->z_os)421err = -SET_ERROR(EBUSY);422issnap = zfsvfs->z_issnap;423zpl_exit(zfsvfs, FTAG);424} else {425err = -SET_ERROR(EBUSY);426}427}428dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);429dsl_dataset_rele(dmu_objset_ds(os), FTAG);430431if (IS_ERR(s))432return (ERR_CAST(s));433434if (err) {435deactivate_locked_super(s);436return (ERR_PTR(err));437}438439if (s->s_root == NULL) {440err = zpl_fill_super(s, zm, flags & SB_SILENT ? 
1 : 0);441if (err) {442deactivate_locked_super(s);443return (ERR_PTR(err));444}445s->s_flags |= SB_ACTIVE;446} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {447/*448* Skip ro check for snap since snap is always ro regardless449* ro flag is passed by mount or not.450*/451deactivate_locked_super(s);452return (ERR_PTR(-EBUSY));453}454455return (s);456}457458static struct dentry *459zpl_mount(struct file_system_type *fs_type, int flags,460const char *osname, void *data)461{462zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };463464struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);465if (IS_ERR(sb))466return (ERR_CAST(sb));467468return (dget(sb->s_root));469}470471static void472zpl_kill_sb(struct super_block *sb)473{474zfs_preumount(sb);475kill_anon_super(sb);476}477478void479zpl_prune_sb(uint64_t nr_to_scan, void *arg)480{481struct super_block *sb = (struct super_block *)arg;482int objects = 0;483484/*485* Ensure the superblock is not in the process of being torn down.486*/487#ifdef HAVE_SB_DYING488if (down_read_trylock(&sb->s_umount)) {489if (!(sb->s_flags & SB_DYING) && sb->s_root &&490(sb->s_flags & SB_BORN)) {491(void) zfs_prune(sb, nr_to_scan, &objects);492}493up_read(&sb->s_umount);494}495#else496if (down_read_trylock(&sb->s_umount)) {497if (!hlist_unhashed(&sb->s_instances) &&498sb->s_root && (sb->s_flags & SB_BORN)) {499(void) zfs_prune(sb, nr_to_scan, &objects);500}501up_read(&sb->s_umount);502}503#endif504}505506const struct super_operations zpl_super_operations = {507.alloc_inode = zpl_inode_alloc,508#ifdef HAVE_SOPS_FREE_INODE509.free_inode = zpl_inode_free,510#endif511.destroy_inode = zpl_inode_destroy,512.dirty_inode = zpl_dirty_inode,513.write_inode = NULL,514.drop_inode = zpl_drop_inode,515.evict_inode = zpl_evict_inode,516.put_super = zpl_put_super,517.sync_fs = zpl_sync_fs,518.statfs = zpl_statfs,519.remount_fs = zpl_remount_fs,520.show_devname = zpl_show_devname,521.show_options = zpl_show_options,522.show_stats = 
NULL,523};524525/*526* ->d_delete() is called when the last reference to a dentry is released. Its527* return value indicates if the dentry should be destroyed immediately, or528* retained in the dentry cache.529*530* By default (zfs_delete_dentry=0) the kernel will always cache unused531* entries. Each dentry holds an inode reference, so cached dentries can hold532* the final inode reference indefinitely, leading to the inode and its related533* data being pinned (see zpl_drop_inode()).534*535* When set to 1, we signal that the dentry should be destroyed immediately and536* never cached. This reduces memory usage, at the cost of higher overheads to537* lookup a file, as the inode and its underlying data (dnode/dbuf) need to be538* reloaded and reinflated.539*540* Note that userspace does not have direct control over dentry references and541* reclaim; rather, this is part of the kernel's caching and reclaim subsystems542* (eg vm.vfs_cache_pressure).543*/544static int545zpl_dentry_delete(const struct dentry *dentry)546{547return (zfs_delete_dentry ? 1 : 0);548}549550const struct dentry_operations zpl_dentry_operations = {551.d_delete = zpl_dentry_delete,552};553554struct file_system_type zpl_fs_type = {555.owner = THIS_MODULE,556.name = ZFS_DRIVER,557#if defined(HAVE_IDMAP_MNT_API)558.fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,559#else560.fs_flags = FS_USERNS_MOUNT,561#endif562.mount = zpl_mount,563.kill_sb = zpl_kill_sb,564};565566ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,567"Delete inodes as soon as the last reference is released.");568569ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,570"Delete dentries from dentry cache as soon as the last reference is "571"released.");572573574