Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 * Copyright (c) 2025, Rob Norris <[email protected]>
 */


#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#include <linux/migrate.h>
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
#include <linux/pagemap_compat.h>
#include <linux/fadvise.h>
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;
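/*
 * Illustrative example (not in the original source): with the default of
 * 110, a request to preallocate 1 GiB passes the capacity check in
 * zpl_fallocate_common() below only if roughly 1.1 GiB appears available,
 * leaving headroom for the metadata blocks mentioned above.
 */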
static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = generic_file_open(ip, filp);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	if (ITOZ(ip)->z_atime_dirty)
		zfs_mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_readdir(file_inode(filp), ctx, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data);

static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	znode_t *zp = ITOZ(inode);
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	/*
	 * Force dirty pages in the range out to the DMU and the log, ready
	 * for zil_commit() to write down.
	 *
	 * We call write_cache_pages() directly to ensure that zpl_putpage() is
	 * called with the flags we need. We need WB_SYNC_NONE to avoid a call
	 * to zil_commit() (since we're doing this as a kind of pre-sync); but
	 * we do need for_sync so that the pages remain in writeback until
	 * they're on disk, and so that we get an error if the DMU write fails.
	 */
	if (filemap_range_has_page(inode->i_mapping, start, end)) {
		int for_sync = 1;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = LONG_MAX,
			.range_start = start,
			.range_end = end,
		};
		error =
		    zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
		if (error != 0) {
			/*
			 * Unclear what state things are in. zfs_putpage() will
			 * ensure the pages remain dirty if they haven't been
			 * written down to the DMU, but because there may be
			 * nothing logged, we can't assume that zfs_sync() ->
			 * zil_commit() will give us a useful error. It's
			 * safest if we just error out here.
			 */
			return (error);
		}
	}

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(zp, datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static inline int
zfs_io_flags(struct kiocb *kiocb)
{
	int flags = 0;

#if defined(IOCB_DSYNC)
	if (kiocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
	if (kiocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
	if (kiocb->ki_flags & IOCB_APPEND)
		flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
	if (kiocb->ki_flags & IOCB_DIRECT)
		flags |= O_DIRECT;
#endif
	return (flags);
}
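/*
 * Example (illustrative only): a kiocb submitted with IOCB_DSYNC and
 * IOCB_APPEND set maps to O_DSYNC | O_APPEND above, so zfs_read() and
 * zfs_write() see the same semantics as an fd opened with those flags.
 */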
/*
 * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
 * is true. This is needed since datasets with an inherited "relatime" property
 * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after
 * `zfs set relatime=...`), which is what the VFS relatime test in
 * relatime_need_update() is based on.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
	struct inode *ip = filp->f_mapping->host;

	if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
		if (zfs_relatime_need_update(ip))
			file_accessed(filp);
	} else {
		file_accessed(filp);
	}
}

static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	ssize_t count = iov_iter_count(to);
	zfs_uio_t uio;

	zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
	ssize_t ret = generic_write_checks(kiocb, from);
	if (ret <= 0)
		return (ret);

	*countp = ret;

	return (0);
}

static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	zfs_uio_t uio;
	size_t count = 0;
	ssize_t ret;

	ret = zpl_generic_write_checks(kiocb, from, &count);
	if (ret)
		return (ret);

	zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ret = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}

static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
	/*
	 * All O_DIRECT requests should be handled by zpl_iter_read() and
	 * zpl_iter_write(). There is no way kernel generic code should call
	 * the direct_IO address_space_operations function. We set this code
	 * path to be fatal if it is executed.
	 */
	PANIC(0);
	return (0);
}

static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	fstrans_cookie_t cookie;

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock_shared(ip);
		cookie = spl_fstrans_mark();
		error = -zfs_holey(ITOZ(ip), whence, &offset);
		spl_fstrans_unmark(cookie);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock_shared(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}
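/*
 * Userland sketch (illustrative, not part of this file): hole/data queries
 * land in the SEEK_DATA/SEEK_HOLE branch above and are resolved by
 * zfs_holey() against the object's block layout.
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);	 // first data at/after 0
 *	off_t hole = lseek(fd, data, SEEK_HOLE); // first hole after that
 */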
/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2) will
 * ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note that because a read(2) of a mmap'ed file
 * will always check the page cache first, correct data will still be
 * returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks. The most obvious of which is that it increases
 * the required memory footprint when accessing mmap'ed files. It
 * also adds additional complexity to the code, keeping both caches
 * synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly onto the ARC buffers. The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index. The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both. It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = filp->f_mapping->host;
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
	spl_fstrans_unmark(cookie);

	if (error)
		return (error);

	return (generic_file_mmap(filp, vma));
}
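/*
 * Illustrative summary of the coherency flow described above (not in the
 * original source):
 *
 *	write(2)    -> zfs_write()   -> updates ARC + existing cached pages
 *	read(2)     -> zfs_read()    -> page cache first, then the ARC
 *	page fault  -> .readpage()   -> adds pages to the page cache
 *	sync/aging  -> .writepage()  -> pushes dirty pages back to the ARC
 */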
/*
 * Populate a page with data for the Linux page cache. This function is
 * only used to support mmap(2). There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 */
static inline int
zpl_readpage_common(struct page *pp)
{
	fstrans_cookie_t cookie;

	ASSERT(PageLocked(pp));

	cookie = spl_fstrans_mark();
	int error = -zfs_getpage(pp->mapping->host, pp);
	spl_fstrans_unmark(cookie);

	unlock_page(pp);

	return (error);
}

#ifdef HAVE_VFS_READ_FOLIO
static int
zpl_read_folio(struct file *filp, struct folio *folio)
{
	return (zpl_readpage_common(&folio->page));
}
#else
static int
zpl_readpage(struct file *filp, struct page *pp)
{
	return (zpl_readpage_common(pp));
}
#endif

static int
zpl_readpage_filler(void *data, struct page *pp)
{
	return (zpl_readpage_common(pp));
}

/*
 * Populate a set of pages with data for the Linux page cache. This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
#ifdef HAVE_VFS_READPAGES
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
#else
static void
zpl_readahead(struct readahead_control *ractl)
{
	struct page *page;

	while ((page = readahead_page(ractl)) != NULL) {
		int ret;

		ret = zpl_readpage_filler(NULL, page);
		put_page(page);
		if (ret)
			break;
	}
}
#endif

static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	boolean_t *for_sync = data;
	fstrans_cookie_t cookie;
	int ret;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));

	cookie = spl_fstrans_mark();
	ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
	spl_fstrans_unmark(cookie);

	return (ret);
}
#ifdef HAVE_WRITE_CACHE_PAGES
#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
	return (zpl_putpage(&pp->page, wbc, data));
}
#endif

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	int result;

#ifdef HAVE_WRITEPAGE_T_FOLIO
	result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
#else
	result = write_cache_pages(mapping, wbc, zpl_putpage, data);
#endif
	return (result);
}
#else
static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	pgoff_t start = wbc->range_start >> PAGE_SHIFT;
	pgoff_t end = wbc->range_end >> PAGE_SHIFT;

	struct folio_batch fbatch;
	folio_batch_init(&fbatch);

	/*
	 * This atomically (-ish) tags all DIRTY pages in the range with
	 * TOWRITE, allowing users to continue dirtying or undirtying pages
	 * while we get on with writeback, without us treading on each other.
	 */
	tag_pages_for_writeback(mapping, start, end);

	int err = 0;
	unsigned int npages;

	/*
	 * Grab references to the TOWRITE pages just flagged. This may not get
	 * all of them, so we do it in a loop until there are none left.
	 */
	while ((npages = filemap_get_folios_tag(mapping, &start, end,
	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {

		/* Loop over each page and write it out. */
		struct folio *folio;
		while ((folio = folio_batch_next(&fbatch)) != NULL) {
			folio_lock(folio);

			/*
			 * If the folio has been remapped, or is no longer
			 * dirty, then there's nothing to do.
			 */
			if (folio->mapping != mapping ||
			    !folio_test_dirty(folio)) {
				folio_unlock(folio);
				continue;
			}

			/*
			 * If writeback is already in progress, wait for it to
			 * finish. We continue after this even if the page
			 * ends up clean; zfs_putpage() will skip it if no
			 * further work is required.
			 */
			while (folio_test_writeback(folio))
				folio_wait_bit(folio, PG_writeback);

			/*
			 * Write it out and collect any error. zfs_putpage()
			 * will clear the TOWRITE and DIRTY flags, and return
			 * with the page unlocked.
			 */
			int ferr = zpl_putpage(&folio->page, wbc, data);
			if (err == 0 && ferr != 0)
				err = ferr;

			/* Housekeeping for the caller. */
			wbc->nr_to_write -= folio_nr_pages(folio);
		}

		/* Release any remaining references on the batch. */
		folio_batch_release(&fbatch);
	}

	return (err);
}
#endif

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	znode_t *zp = ITOZ(mapping->host);
	zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
	enum writeback_sync_modes sync_mode;
	int result;

	if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
		return (result);
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;
	zpl_exit(zfsvfs, FTAG);
	sync_mode = wbc->sync_mode;

	/*
	 * We don't want to run write_cache_pages() in SYNC mode here, because
	 * that would make putpage() wait for a single page to be committed to
	 * disk every single time, resulting in atrocious performance. Instead
	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
	 * and then we commit it all in one go.
	 */
	boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
	wbc->sync_mode = WB_SYNC_NONE;
	result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	if (sync_mode != wbc->sync_mode) {
		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			return (result);

		if (zfsvfs->z_log != NULL) {
			/*
			 * We don't want to block here if the pool suspends,
			 * because this is not a syncing op by itself, but
			 * might be part of one that the caller will
			 * coordinate.
			 */
			result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
			    ZIL_COMMIT_NOW);
		}

		zpl_exit(zfsvfs, FTAG);

		/*
		 * If zil_commit_flags() failed, it's unclear what state things
		 * are currently in. putpage() has written back out what it can
		 * to the DMU, but it may not be on disk. We have little choice
		 * but to escape.
		 */
		if (result != 0)
			return (result);

		/*
		 * We need to call write_cache_pages() again (we can't just
		 * return after the commit) because the previous call in
		 * non-SYNC mode does not guarantee that we got all the dirty
		 * pages (see the implementation of write_cache_pages() for
		 * details). That being said, this is a no-op in most cases.
		 */
		wbc->sync_mode = sync_mode;
		result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	}
	return (result);
}
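/*
 * Illustrative sequence (not in the original source) for a WB_SYNC_ALL
 * writepages call when os_sync != ZFS_SYNC_ALWAYS:
 *
 *	1. zpl_write_cache_pages(WB_SYNC_NONE)	dirty pages -> DMU/ZIL
 *	2. zil_commit_flags(ZIL_COMMIT_NOW)	one log flush for all of them
 *	3. zpl_write_cache_pages(WB_SYNC_ALL)	usually a no-op; catches
 *						pages missed by the first pass
 */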
#ifdef HAVE_VFS_WRITEPAGE
/*
 * Write out dirty pages to the ARC, this function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;

	boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

	return (zpl_putpage(pp, wbc, &for_sync));
}
#endif

/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
 */
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	loff_t olen;
	fstrans_cookie_t cookie;
	int error = 0;

	int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;

	if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
		return (-EOPNOTSUPP);

	if (offset < 0 || len <= 0)
		return (-EINVAL);

	spl_inode_lock(ip);
	olen = i_size_read(ip);

	crhold(cr);
	cookie = spl_fstrans_mark();
	if (mode & (test_mode)) {
		flock64_t bf;

		if (mode & FALLOC_FL_KEEP_SIZE) {
			if (offset > olen)
				goto out_unmark;

			if (offset + len > olen)
				len = olen - offset;
		}
		bf.l_type = F_WRLCK;
		bf.l_whence = SEEK_SET;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR,
		    offset, cr);
	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
		unsigned int percent = zfs_fallocate_reserve_percent;
		struct kstatfs statfs;

		/* Legacy mode, disable fallocate compatibility. */
		if (percent == 0) {
			error = -EOPNOTSUPP;
			goto out_unmark;
		}

		/*
		 * Use zfs_statvfs() instead of dmu_objset_space() since it
		 * also checks project quota limits, which are relevant here.
		 */
		error = zfs_statvfs(ip, &statfs);
		if (error)
			goto out_unmark;

		/*
		 * Shrink available space a bit to account for overhead/races.
		 * We know the product previously fit into availbytes from
		 * dmu_objset_space(), so the smaller product will also fit.
		 */
		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
			error = -ENOSPC;
			goto out_unmark;
		}
		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
	}
out_unmark:
	spl_fstrans_unmark(cookie);
	spl_inode_unlock(ip);

	crfree(cr);

	return (error);
}

static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return zpl_fallocate_common(file_inode(filp),
	    mode, offset, len);
}
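/*
 * Worked example of the capacity check above (illustrative, not in the
 * original source): with the default percent of 110 and, say, an f_bsize
 * of 131072, each available block is credited as 131072 * 100 / 110 =
 * 119156 bytes, i.e. only ~91% of the reported free space may be
 * "reserved", leaving ~10% headroom for metadata.
 */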
static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
	uint32_t generation = file_inode(filp)->i_generation;

	return (copy_to_user(arg, &generation, sizeof (generation)));
}

static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
	struct inode *ip = file_inode(filp);
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os = zfsvfs->z_os;
	int error = 0;

	if (S_ISFIFO(ip->i_mode))
		return (-ESPIPE);

	if (offset < 0 || len < 0)
		return (-EINVAL);

	if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	switch (advice) {
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
		if (zn_has_cached_data(zp, offset, offset + len - 1))
			error = generic_fadvise(filp, offset, len, advice);
#endif
		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it. If there
		 * really is a larger sequential access pattern, perhaps
		 * dmu_zfetch will detect it.
		 */
		if (len == 0)
			len = i_size_read(ip) - offset;

		dmu_prefetch(os, zp->z_id, 0, offset, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = -EINVAL;
		break;
	}

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
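/*
 * Userland sketch (illustrative): posix_fadvise(fd, 0, 0,
 * POSIX_FADV_WILLNEED) arrives here with len == 0, which the code above
 * expands to the remainder of the file before handing it to dmu_prefetch().
 */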
#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)

static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	uint32_t ioctl_flags = 0;

	if (zfs_flags & ZFS_IMMUTABLE)
		ioctl_flags |= FS_IMMUTABLE_FL;

	if (zfs_flags & ZFS_APPENDONLY)
		ioctl_flags |= FS_APPEND_FL;

	if (zfs_flags & ZFS_NODUMP)
		ioctl_flags |= FS_NODUMP_FL;

	if (zfs_flags & ZFS_PROJINHERIT)
		ioctl_flags |= ZFS_PROJINHERIT_FL;

	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
}

/*
 * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
 * attributes common to both Linux and Solaris are mapped.
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
	uint32_t flags;
	int err;

	flags = __zpl_ioctl_getflags(file_inode(filp));
	err = copy_to_user(arg, &flags, sizeof (flags));

	return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence of
 * how the Linux file attribute interface was designed. Another consequence is
 * that concurrent modification of files suffers from a TOCTOU race. Neither
 * are things we can fix without modifying the kernel-userland interface, which
 * is outside of our jurisdiction.
 */

#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
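/*
 * Example (illustrative, not in the original source):
 * fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) is true
 * exactly when one of the two flag sets has its immutable bit set and the
 * other does not, i.e. the caller is asking us to flip that flag.
 */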
static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
	    ZFS_PROJINHERIT_FL))
		return (-EOPNOTSUPP);

	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
		return (-EACCES);

	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

	FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
	    xoap->xoa_immutable);
	FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
	    xoap->xoa_appendonly);
	FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
	    xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
	    xoap->xoa_projinherit);

#undef	FLAG_CHANGE

	return (0);
}

static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint32_t flags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&flags, arg, sizeof (flags)))
		return (-EFAULT);

	err = __zpl_ioctl_setflags(ip, flags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
	zfsxattr_t fsx = { 0 };
	struct inode *ip = file_inode(filp);
	int err;

	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
	fsx.fsx_projid = ITOZ(ip)->z_projid;
	err = copy_to_user(arg, &fsx, sizeof (fsx));

	return (err);
}

static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfsxattr_t fsx;
	cred_t *cr = CRED();
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&fsx, arg, sizeof (fsx)))
		return (-EFAULT);

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (-EINVAL);

	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
	if (err)
		return (err);

	xoap = xva_getxoptattr(&xva);
	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

/*
 * Expose Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags = ITOZ(ip)->z_pflags;
	dosflags &= ZFS_DOS_FL_USER_VISIBLE;
	int err = copy_to_user(arg, &dosflags, sizeof (dosflags));

	return (err);
}

static int
__zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
		return (-EOPNOTSUPP);

	if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) ||	\
	    ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

	FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
	FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
	FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
	FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
	FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
	FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
	FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
	FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
	FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
	FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);

#undef	FLAG_CHANGE

	return (0);
}

/*
 * Set Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&dosflags, arg, sizeof (dosflags)))
		return (-EFAULT);

	err = __zpl_ioctl_setdosflags(ip, dosflags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_rewrite(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfs_rewrite_args_t args;
	fstrans_cookie_t cookie;
	int err;

	if (copy_from_user(&args, arg, sizeof (args)))
		return (-EFAULT);

	if (unlikely(!(filp->f_mode & FMODE_WRITE)))
		return (-EBADF);

	cookie = spl_fstrans_mark();
	err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
	spl_fstrans_unmark(cookie);

	return (err);
}
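/*
 * Userland sketch (illustrative, not part of this file; assumes the
 * zfs_rewrite_args_t layout consumed above):
 *
 *	zfs_rewrite_args_t args = { .off = 0, .len = 0, .flags = 0, .arg = 0 };
 *	if (ioctl(fd, ZFS_IOC_REWRITE, &args) == -1)
 *		perror("ZFS_IOC_REWRITE");
 *
 * The fd must be open for writing, or the handler above returns -EBADF.
 */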
static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
		return (zpl_ioctl_getversion(filp, (void *)arg));
	case FS_IOC_GETFLAGS:
		return (zpl_ioctl_getflags(filp, (void *)arg));
	case FS_IOC_SETFLAGS:
		return (zpl_ioctl_setflags(filp, (void *)arg));
	case ZFS_IOC_FSGETXATTR:
		return (zpl_ioctl_getxattr(filp, (void *)arg));
	case ZFS_IOC_FSSETXATTR:
		return (zpl_ioctl_setxattr(filp, (void *)arg));
	case ZFS_IOC_GETDOSFLAGS:
		return (zpl_ioctl_getdosflags(filp, (void *)arg));
	case ZFS_IOC_SETDOSFLAGS:
		return (zpl_ioctl_setdosflags(filp, (void *)arg));
	case ZFS_IOC_REWRITE:
		return (zpl_ioctl_rewrite(filp, (void *)arg));
	default:
		return (-ENOTTY);
	}
}

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC32_GETVERSION:
		cmd = FS_IOC_GETVERSION;
		break;
	case FS_IOC32_GETFLAGS:
		cmd = FS_IOC_GETFLAGS;
		break;
	case FS_IOC32_SETFLAGS:
		cmd = FS_IOC_SETFLAGS;
		break;
	default:
		return (-ENOTTY);
	}
	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */

const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
	.readpages	= zpl_readpages,
#else
	.readahead	= zpl_readahead,
#endif
#ifdef HAVE_VFS_READ_FOLIO
	.read_folio	= zpl_read_folio,
#else
	.readpage	= zpl_readpage,
#endif
#ifdef HAVE_VFS_WRITEPAGE
	.writepage	= zpl_writepage,
#endif
	.writepages	= zpl_writepages,
	.direct_IO	= zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
	.set_page_dirty	= __set_page_dirty_nobuffers,
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
	.dirty_folio	= filemap_dirty_folio,
#endif
#ifdef HAVE_VFS_MIGRATE_FOLIO
	.migrate_folio	= migrate_folio,
#elif defined(HAVE_VFS_MIGRATEPAGE)
	.migratepage	= migrate_page,
#endif
};

const struct file_operations zpl_file_operations = {
	.open		= zpl_open,
	.release	= zpl_release,
	.llseek		= zpl_llseek,
	.read_iter	= zpl_iter_read,
	.write_iter	= zpl_iter_write,
#ifdef HAVE_COPY_SPLICE_READ
	.splice_read	= copy_splice_read,
#else
	.splice_read	= generic_file_splice_read,
#endif
	.splice_write	= iter_file_splice_write,
	.mmap		= zpl_mmap,
	.fsync		= zpl_fsync,
	.fallocate	= zpl_fallocate,
	.copy_file_range	= zpl_copy_file_range,
#ifdef HAVE_VFS_CLONE_FILE_RANGE
	.clone_file_range	= zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
	.remap_file_range	= zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
	.dedupe_file_range	= zpl_dedupe_file_range,
#endif
	.fadvise	= zpl_fadvise,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= zpl_iterate,
	.fsync		= zpl_fsync,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
	"Percentage of length to use for the available capacity check");