Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 * Copyright (c) 2025, Rob Norris <[email protected]>
 */


#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#include <linux/migrate.h>
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
#include <linux/pagemap_compat.h>
#include <linux/fadvise.h>
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;
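/*
 * Illustrative example (not in the original source): with the default of
 * 110, a request to preallocate 1 GiB passes the capacity check in
 * zpl_fallocate_common() below only if roughly 1.1 GiB appears available,
 * leaving headroom for the metadata blocks mentioned above.
 */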
static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = generic_file_open(ip, filp);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	if (ITOZ(ip)->z_atime_dirty)
		zfs_mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_readdir(file_inode(filp), ctx, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data);

static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	znode_t *zp = ITOZ(inode);
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	/*
	 * Force dirty pages in the range out to the DMU and the log, ready
	 * for zil_commit() to write down.
	 *
	 * We call write_cache_pages() directly to ensure that zpl_putpage() is
	 * called with the flags we need. We need WB_SYNC_NONE to avoid a call
	 * to zil_commit() (since we're doing this as a kind of pre-sync); but
	 * we do need for_sync so that the pages remain in writeback until
	 * they're on disk, and so that we get an error if the DMU write fails.
	 */
	if (filemap_range_has_page(inode->i_mapping, start, end)) {
		int for_sync = 1;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = LONG_MAX,
			.range_start = start,
			.range_end = end,
		};
		error =
		    zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
		if (error != 0) {
			/*
			 * Unclear what state things are in. zfs_putpage() will
			 * ensure the pages remain dirty if they haven't been
			 * written down to the DMU, but because there may be
			 * nothing logged, we can't assume that zfs_sync() ->
			 * zil_commit() will give us a useful error. It's
			 * safest if we just error out here.
			 */
			return (error);
		}
	}

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(zp, datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static inline int
zfs_io_flags(struct kiocb *kiocb)
{
	int flags = 0;

#if defined(IOCB_DSYNC)
	if (kiocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
	if (kiocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
	if (kiocb->ki_flags & IOCB_APPEND)
		flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
	if (kiocb->ki_flags & IOCB_DIRECT)
		flags |= O_DIRECT;
#endif
	return (flags);
}
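/*
 * Example (illustrative only): a kiocb submitted with IOCB_DSYNC and
 * IOCB_APPEND set maps to O_DSYNC | O_APPEND above, so zfs_read() and
 * zfs_write() see the same semantics as an fd opened with those flags.
 */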
/*
 * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
 * is true. This is needed since datasets with an inherited "relatime" property
 * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after
 * `zfs set relatime=...`), which is what the VFS relatime test in
 * relatime_need_update() is based on.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
	struct inode *ip = filp->f_mapping->host;

	if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
		if (zfs_relatime_need_update(ip))
			file_accessed(filp);
	} else {
		file_accessed(filp);
	}
}

static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	ssize_t count = iov_iter_count(to);
	zfs_uio_t uio;

	zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
	ssize_t ret = generic_write_checks(kiocb, from);
	if (ret <= 0)
		return (ret);

	*countp = ret;

	return (0);
}

static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	zfs_uio_t uio;
	size_t count = 0;
	ssize_t ret;

	ret = zpl_generic_write_checks(kiocb, from, &count);
	if (ret)
		return (ret);

	zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ret = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}

static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
	/*
	 * All O_DIRECT requests should be handled by zpl_iter_read() and
	 * zpl_iter_write(). There is no way kernel generic code should call
	 * the direct_IO address_space_operations function. We set this code
	 * path to be fatal if it is executed.
	 */
	PANIC(0);
	return (0);
}

static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	fstrans_cookie_t cookie;

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock_shared(ip);
		cookie = spl_fstrans_mark();
		error = -zfs_holey(ITOZ(ip), whence, &offset);
		spl_fstrans_unmark(cookie);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock_shared(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}
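/*
 * Userland sketch (illustrative, not part of this file): hole/data queries
 * land in the SEEK_DATA/SEEK_HOLE branch above and are resolved by
 * zfs_holey() against the object's block layout.
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);	 // first data at/after 0
 *	off_t hole = lseek(fd, data, SEEK_HOLE); // first hole after that
 */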
/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2) will
 * ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note that because a read(2) of a mmap'ed file
 * will always check the page cache first, correct data will still be
 * returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks. The most obvious of which is that it increases
 * the required memory footprint when accessing mmap'ed files. It
 * also adds additional complexity to the code, keeping both caches
 * synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly onto the ARC buffers. The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index. The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both. It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = filp->f_mapping->host;
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
	spl_fstrans_unmark(cookie);

	if (error)
		return (error);

	return (generic_file_mmap(filp, vma));
}
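/*
 * Illustrative summary of the coherency flow described above (not in the
 * original source):
 *
 *	write(2)    -> zfs_write()   -> updates ARC + existing cached pages
 *	read(2)     -> zfs_read()    -> page cache first, then the ARC
 *	page fault  -> .readpage()   -> adds pages to the page cache
 *	sync/aging  -> .writepage()  -> pushes dirty pages back to the ARC
 */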
/*
 * Populate a page with data for the Linux page cache. This function is
 * only used to support mmap(2). There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 */
static inline int
zpl_readpage_common(struct page *pp)
{
	fstrans_cookie_t cookie;

	ASSERT(PageLocked(pp));

	cookie = spl_fstrans_mark();
	int error = -zfs_getpage(pp->mapping->host, pp);
	spl_fstrans_unmark(cookie);

	unlock_page(pp);

	return (error);
}

#ifdef HAVE_VFS_READ_FOLIO
static int
zpl_read_folio(struct file *filp, struct folio *folio)
{
	return (zpl_readpage_common(&folio->page));
}
#else
static int
zpl_readpage(struct file *filp, struct page *pp)
{
	return (zpl_readpage_common(pp));
}
#endif

static int
zpl_readpage_filler(void *data, struct page *pp)
{
	return (zpl_readpage_common(pp));
}

/*
 * Populate a set of pages with data for the Linux page cache. This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
#ifdef HAVE_VFS_READPAGES
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
#else
static void
zpl_readahead(struct readahead_control *ractl)
{
	struct page *page;

	while ((page = readahead_page(ractl)) != NULL) {
		int ret;

		ret = zpl_readpage_filler(NULL, page);
		put_page(page);
		if (ret)
			break;
	}
}
#endif

static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	boolean_t *for_sync = data;
	fstrans_cookie_t cookie;
	int ret;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));

	cookie = spl_fstrans_mark();
	ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
	spl_fstrans_unmark(cookie);

	return (ret);
}
#ifdef HAVE_WRITE_CACHE_PAGES
#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
	return (zpl_putpage(&pp->page, wbc, data));
}
#endif

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	int result;

#ifdef HAVE_WRITEPAGE_T_FOLIO
	result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
#else
	result = write_cache_pages(mapping, wbc, zpl_putpage, data);
#endif
	return (result);
}
#else
static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	pgoff_t start = wbc->range_start >> PAGE_SHIFT;
	pgoff_t end = wbc->range_end >> PAGE_SHIFT;

	struct folio_batch fbatch;
	folio_batch_init(&fbatch);

	/*
	 * This atomically (-ish) tags all DIRTY pages in the range with
	 * TOWRITE, allowing users to continue dirtying or undirtying pages
	 * while we get on with writeback, without us treading on each other.
	 */
	tag_pages_for_writeback(mapping, start, end);

	int err = 0;
	unsigned int npages;

	/*
	 * Grab references to the TOWRITE pages just flagged. This may not get
	 * all of them, so we do it in a loop until there are none left.
	 */
	while ((npages = filemap_get_folios_tag(mapping, &start, end,
	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {

		/* Loop over each page and write it out. */
		struct folio *folio;
		while ((folio = folio_batch_next(&fbatch)) != NULL) {
			folio_lock(folio);

			/*
			 * If the folio has been remapped, or is no longer
			 * dirty, then there's nothing to do.
			 */
			if (folio->mapping != mapping ||
			    !folio_test_dirty(folio)) {
				folio_unlock(folio);
				continue;
			}

			/*
			 * If writeback is already in progress, wait for it to
			 * finish. We continue after this even if the page
			 * ends up clean; zfs_putpage() will skip it if no
			 * further work is required.
			 */
			while (folio_test_writeback(folio))
				folio_wait_bit(folio, PG_writeback);

			/*
			 * Write it out and collect any error. zfs_putpage()
			 * will clear the TOWRITE and DIRTY flags, and return
			 * with the page unlocked.
			 */
			int ferr = zpl_putpage(&folio->page, wbc, data);
			if (err == 0 && ferr != 0)
				err = ferr;

			/* Housekeeping for the caller. */
			wbc->nr_to_write -= folio_nr_pages(folio);
		}

		/* Release any remaining references on the batch. */
		folio_batch_release(&fbatch);
	}

	return (err);
}
#endif

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	znode_t *zp = ITOZ(mapping->host);
	zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
	enum writeback_sync_modes sync_mode;
	int result;

	if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
		return (result);
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;
	zpl_exit(zfsvfs, FTAG);
	sync_mode = wbc->sync_mode;

	/*
	 * We don't want to run write_cache_pages() in SYNC mode here, because
	 * that would make putpage() wait for a single page to be committed to
	 * disk every single time, resulting in atrocious performance. Instead
	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
	 * and then we commit it all in one go.
	 */
	boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
	wbc->sync_mode = WB_SYNC_NONE;
	result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	if (sync_mode != wbc->sync_mode) {
		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			return (result);

		if (zfsvfs->z_log != NULL) {
			/*
			 * We don't want to block here if the pool suspends,
			 * because this is not a syncing op by itself, but
			 * might be part of one that the caller will
			 * coordinate.
			 */
			result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
			    ZIL_COMMIT_NOW);
		}

		zpl_exit(zfsvfs, FTAG);

		/*
		 * If zil_commit_flags() failed, it's unclear what state things
		 * are currently in. putpage() has written back out what it can
		 * to the DMU, but it may not be on disk. We have little choice
		 * but to escape.
		 */
		if (result != 0)
			return (result);

		/*
		 * We need to call write_cache_pages() again (we can't just
		 * return after the commit) because the previous call in
		 * non-SYNC mode does not guarantee that we got all the dirty
		 * pages (see the implementation of write_cache_pages() for
		 * details). That being said, this is a no-op in most cases.
		 */
		wbc->sync_mode = sync_mode;
		result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	}
	return (result);
}
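/*
 * Illustrative sequence (not in the original source) for a WB_SYNC_ALL
 * writepages call when os_sync != ZFS_SYNC_ALWAYS:
 *
 *	1. zpl_write_cache_pages(WB_SYNC_NONE)	dirty pages -> DMU/ZIL
 *	2. zil_commit_flags(ZIL_COMMIT_NOW)	one log flush for all of them
 *	3. zpl_write_cache_pages(WB_SYNC_ALL)	usually a no-op; catches
 *						pages missed by the first pass
 */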
#ifdef HAVE_VFS_WRITEPAGE
/*
 * Write out dirty pages to the ARC, this function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;

	boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

	return (zpl_putpage(pp, wbc, &for_sync));
}
#endif

/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
 */
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	loff_t olen;
	fstrans_cookie_t cookie;
	int error = 0;

	int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;

	if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
		return (-EOPNOTSUPP);

	if (offset < 0 || len <= 0)
		return (-EINVAL);

	spl_inode_lock(ip);
	olen = i_size_read(ip);

	crhold(cr);
	cookie = spl_fstrans_mark();
	if (mode & (test_mode)) {
		flock64_t bf;

		if (mode & FALLOC_FL_KEEP_SIZE) {
			if (offset > olen)
				goto out_unmark;

			if (offset + len > olen)
				len = olen - offset;
		}
		bf.l_type = F_WRLCK;
		bf.l_whence = SEEK_SET;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR,
		    offset, cr);
	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
		unsigned int percent = zfs_fallocate_reserve_percent;
		struct kstatfs statfs;

		/* Legacy mode, disable fallocate compatibility. */
		if (percent == 0) {
			error = -EOPNOTSUPP;
			goto out_unmark;
		}

		/*
		 * Use zfs_statvfs() instead of dmu_objset_space() since it
		 * also checks project quota limits, which are relevant here.
		 */
		error = zfs_statvfs(ip, &statfs);
		if (error)
			goto out_unmark;

		/*
		 * Shrink available space a bit to account for overhead/races.
		 * We know the product previously fit into availbytes from
		 * dmu_objset_space(), so the smaller product will also fit.
		 */
		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
			error = -ENOSPC;
			goto out_unmark;
		}
		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
	}
out_unmark:
	spl_fstrans_unmark(cookie);
	spl_inode_unlock(ip);

	crfree(cr);

	return (error);
}

static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return zpl_fallocate_common(file_inode(filp),
	    mode, offset, len);
}
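/*
 * Worked example of the capacity check above (illustrative, not in the
 * original source): with the default percent of 110 and, say, an f_bsize
 * of 131072, each available block is credited as 131072 * 100 / 110 =
 * 119156 bytes, i.e. only ~91% of the reported free space may be
 * "reserved", leaving ~10% headroom for metadata.
 */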
static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
	uint32_t generation = file_inode(filp)->i_generation;

	return (copy_to_user(arg, &generation, sizeof (generation)));
}

static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
	struct inode *ip = file_inode(filp);
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os = zfsvfs->z_os;
	int error = 0;

	if (S_ISFIFO(ip->i_mode))
		return (-ESPIPE);

	if (offset < 0 || len < 0)
		return (-EINVAL);

	if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	switch (advice) {
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
		if (zn_has_cached_data(zp, offset, offset + len - 1))
			error = generic_fadvise(filp, offset, len, advice);
#endif
		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it. If there
		 * really is a larger sequential access pattern, perhaps
		 * dmu_zfetch will detect it.
		 */
		if (len == 0)
			len = i_size_read(ip) - offset;

		dmu_prefetch(os, zp->z_id, 0, offset, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = -EINVAL;
		break;
	}

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
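/*
 * Userland sketch (illustrative): posix_fadvise(fd, 0, 0,
 * POSIX_FADV_WILLNEED) arrives here with len == 0, which the code above
 * expands to the remainder of the file before handing it to dmu_prefetch().
 */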
#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)

static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	uint32_t ioctl_flags = 0;

	if (zfs_flags & ZFS_IMMUTABLE)
		ioctl_flags |= FS_IMMUTABLE_FL;

	if (zfs_flags & ZFS_APPENDONLY)
		ioctl_flags |= FS_APPEND_FL;

	if (zfs_flags & ZFS_NODUMP)
		ioctl_flags |= FS_NODUMP_FL;

	if (zfs_flags & ZFS_PROJINHERIT)
		ioctl_flags |= ZFS_PROJINHERIT_FL;

	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
}

/*
 * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
 * attributes common to both Linux and Solaris are mapped.
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
	uint32_t flags;
	int err;

	flags = __zpl_ioctl_getflags(file_inode(filp));
	err = copy_to_user(arg, &flags, sizeof (flags));

	return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence of
 * how the Linux file attribute interface was designed. Another consequence is
 * that concurrent modification of files suffers from a TOCTOU race. Neither
 * are things we can fix without modifying the kernel-userland interface, which
 * is outside of our jurisdiction.
 */

#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
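/*
 * Example (illustrative, not in the original source):
 * fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) is true
 * exactly when one of the two flag sets has its immutable bit set and the
 * other does not, i.e. the caller is asking us to flip that flag.
 */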
static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
	    ZFS_PROJINHERIT_FL))
		return (-EOPNOTSUPP);

	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
		return (-EACCES);

	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

	FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
	    xoap->xoa_immutable);
	FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
	    xoap->xoa_appendonly);
	FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
	    xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
	    xoap->xoa_projinherit);

#undef	FLAG_CHANGE

	return (0);
}

static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint32_t flags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&flags, arg, sizeof (flags)))
		return (-EFAULT);

	err = __zpl_ioctl_setflags(ip, flags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
	zfsxattr_t fsx = { 0 };
	struct inode *ip = file_inode(filp);
	int err;

	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
	fsx.fsx_projid = ITOZ(ip)->z_projid;
	err = copy_to_user(arg, &fsx, sizeof (fsx));

	return (err);
}

static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfsxattr_t fsx;
	cred_t *cr = CRED();
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&fsx, arg, sizeof (fsx)))
		return (-EFAULT);

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (-EINVAL);

	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
	if (err)
		return (err);

	xoap = xva_getxoptattr(&xva);
	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

/*
 * Expose Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags = ITOZ(ip)->z_pflags;
	dosflags &= ZFS_DOS_FL_USER_VISIBLE;
	int err = copy_to_user(arg, &dosflags, sizeof (dosflags));

	return (err);
}

static int
__zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
		return (-EOPNOTSUPP);

	if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) ||	\
	    ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

	FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
	FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
	FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
	FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
	FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
	FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
	FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
	FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
	FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
	FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);

#undef	FLAG_CHANGE

	return (0);
}

/*
 * Set Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&dosflags, arg, sizeof (dosflags)))
		return (-EFAULT);

	err = __zpl_ioctl_setdosflags(ip, dosflags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_rewrite(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfs_rewrite_args_t args;
	fstrans_cookie_t cookie;
	int err;

	if (copy_from_user(&args, arg, sizeof (args)))
		return (-EFAULT);

	if (unlikely(!(filp->f_mode & FMODE_WRITE)))
		return (-EBADF);

	cookie = spl_fstrans_mark();
	err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
	spl_fstrans_unmark(cookie);

	return (err);
}
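/*
 * Userland sketch (illustrative, not part of this file; assumes the
 * zfs_rewrite_args_t layout consumed above):
 *
 *	zfs_rewrite_args_t args = { .off = 0, .len = 0, .flags = 0, .arg = 0 };
 *	if (ioctl(fd, ZFS_IOC_REWRITE, &args) == -1)
 *		perror("ZFS_IOC_REWRITE");
 *
 * The fd must be open for writing, or the handler above returns -EBADF.
 */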
static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
		return (zpl_ioctl_getversion(filp, (void *)arg));
	case FS_IOC_GETFLAGS:
		return (zpl_ioctl_getflags(filp, (void *)arg));
	case FS_IOC_SETFLAGS:
		return (zpl_ioctl_setflags(filp, (void *)arg));
	case ZFS_IOC_FSGETXATTR:
		return (zpl_ioctl_getxattr(filp, (void *)arg));
	case ZFS_IOC_FSSETXATTR:
		return (zpl_ioctl_setxattr(filp, (void *)arg));
	case ZFS_IOC_GETDOSFLAGS:
		return (zpl_ioctl_getdosflags(filp, (void *)arg));
	case ZFS_IOC_SETDOSFLAGS:
		return (zpl_ioctl_setdosflags(filp, (void *)arg));
	case ZFS_IOC_REWRITE:
		return (zpl_ioctl_rewrite(filp, (void *)arg));
	default:
		return (-ENOTTY);
	}
}

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC32_GETVERSION:
		cmd = FS_IOC_GETVERSION;
		break;
	case FS_IOC32_GETFLAGS:
		cmd = FS_IOC_GETFLAGS;
		break;
	case FS_IOC32_SETFLAGS:
		cmd = FS_IOC_SETFLAGS;
		break;
	default:
		return (-ENOTTY);
	}
	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */

const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
	.readpages	= zpl_readpages,
#else
	.readahead	= zpl_readahead,
#endif
#ifdef HAVE_VFS_READ_FOLIO
	.read_folio	= zpl_read_folio,
#else
	.readpage	= zpl_readpage,
#endif
#ifdef HAVE_VFS_WRITEPAGE
	.writepage	= zpl_writepage,
#endif
	.writepages	= zpl_writepages,
	.direct_IO	= zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
	.set_page_dirty	= __set_page_dirty_nobuffers,
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
	.dirty_folio	= filemap_dirty_folio,
#endif
#ifdef HAVE_VFS_MIGRATE_FOLIO
	.migrate_folio	= migrate_folio,
#elif defined(HAVE_VFS_MIGRATEPAGE)
	.migratepage	= migrate_page,
#endif
};

const struct file_operations zpl_file_operations = {
	.open		= zpl_open,
	.release	= zpl_release,
	.llseek		= zpl_llseek,
	.read_iter	= zpl_iter_read,
	.write_iter	= zpl_iter_write,
#ifdef HAVE_COPY_SPLICE_READ
	.splice_read	= copy_splice_read,
#else
	.splice_read	= generic_file_splice_read,
#endif
	.splice_write	= iter_file_splice_write,
	.mmap		= zpl_mmap,
	.fsync		= zpl_fsync,
	.fallocate	= zpl_fallocate,
	.copy_file_range	= zpl_copy_file_range,
#ifdef HAVE_VFS_CLONE_FILE_RANGE
	.clone_file_range	= zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
	.remap_file_range	= zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
	.dedupe_file_range	= zpl_dedupe_file_range,
#endif
	.fadvise	= zpl_fadvise,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= zpl_iterate,
	.fsync		= zpl_fsync,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
	"Percentage of length to use for the available capacity check");