GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 * Copyright (c) 2025, Rob Norris <[email protected]>
 */

#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#include <linux/migrate.h>
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
#include <linux/pagemap_compat.h>
#include <linux/fadvise.h>
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;
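
/*
 * Editorial note, not part of the upstream source: a minimal worked example
 * of the reserve check that zpl_fallocate_common() performs below, assuming
 * the default value of 110 and a hypothetical pool reporting
 * f_bsize = 4096 and f_bavail = 262144 (1 GiB appears free):
 *
 *	usable = f_bavail * (f_bsize * 100 / percent)
 *	       = 262144 * (409600 / 110)
 *	       = 262144 * 3723		// integer division
 *	       = ~931 MiB
 *
 * so a mode-0 fallocate(2) request for the full 1 GiB would fail with
 * ENOSPC, keeping roughly 10% of the requested length in reserve for
 * metadata overhead.
 */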

static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = generic_file_open(ip, filp);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	if (ITOZ(ip)->z_atime_dirty)
		zfs_mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_readdir(file_inode(filp), ctx, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data);

static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	znode_t *zp = ITOZ(inode);
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	/*
	 * Force dirty pages in the range out to the DMU and the log, ready
	 * for zil_commit() to write down.
	 *
	 * We call write_cache_pages() directly to ensure that zpl_putpage() is
	 * called with the flags we need. We need WB_SYNC_NONE to avoid a call
	 * to zil_commit() (since we're doing this as a kind of pre-sync); but
	 * we do need for_sync so that the pages remain in writeback until
	 * they're on disk, and so that we get an error if the DMU write fails.
	 */
	if (filemap_range_has_page(inode->i_mapping, start, end)) {
		int for_sync = 1;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = LONG_MAX,
			.range_start = start,
			.range_end = end,
		};
		error =
		    zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
		if (error != 0) {
			/*
			 * Unclear what state things are in. zfs_putpage() will
			 * ensure the pages remain dirty if they haven't been
			 * written down to the DMU, but because there may be
			 * nothing logged, we can't assume that zfs_sync() ->
			 * zil_commit() will give us a useful error. It's
			 * safest if we just error out here.
			 */
			return (error);
		}
	}

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(zp, datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static inline int
zfs_io_flags(struct kiocb *kiocb)
{
	int flags = 0;

#if defined(IOCB_DSYNC)
	if (kiocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
	if (kiocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
	if (kiocb->ki_flags & IOCB_APPEND)
		flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
	if (kiocb->ki_flags & IOCB_DIRECT)
		flags |= O_DIRECT;
#endif
	return (flags);
}

/*
 * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
 * is true. This is needed since datasets with an inherited "relatime" property
 * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after
 * `zfs set relatime=...`), which is what the relatime test in the VFS by
 * relatime_need_update() is based on.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
	struct inode *ip = filp->f_mapping->host;

	if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
		if (zfs_relatime_need_update(ip))
			file_accessed(filp);
	} else {
		file_accessed(filp);
	}
}

static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	ssize_t count = iov_iter_count(to);
	zfs_uio_t uio;

	zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
	ssize_t ret = generic_write_checks(kiocb, from);
	if (ret <= 0)
		return (ret);

	*countp = ret;

	return (0);
}

static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	zfs_uio_t uio;
	size_t count = 0;
	ssize_t ret;

	ret = zpl_generic_write_checks(kiocb, from, &count);
	if (ret)
		return (ret);

	zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ret = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}

static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
	/*
	 * All O_DIRECT requests should be handled by
	 * zpl_{iter_write/read}(). There is no way kernel generic code should
	 * call the direct_IO address_space_operations function. We set this
	 * code path to be fatal if it is executed.
	 */
	PANIC(0);
	return (0);
}

static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	fstrans_cookie_t cookie;

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock_shared(ip);
		cookie = spl_fstrans_mark();
		error = -zfs_holey(ITOZ(ip), whence, &offset);
		spl_fstrans_unmark(cookie);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock_shared(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}

/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note because a read(2) of a mmap'ed file
 * will always check the page cache first even when the ARC is out
 * of date correct data will still be returned.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks. The most obvious of which is that it
 * increases the required memory footprint when accessing mmap'ed
 * files. It also adds additional complexity to the code keeping
 * both caches synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly on to the ARC buffers. The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index. The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both. It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
 */
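
/*
 * Editorial sketch, not part of the upstream source: the userspace pattern
 * that exercises the paths described above. A store through a MAP_SHARED
 * mapping dirties a page-cache page; msync(2) then pushes it back through
 * .writepage()/zfs_putpage() so the ARC copy is brought up to date. The
 * path and length are hypothetical, and error handling is omitted.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("/tank/example.dat", O_RDWR);
 *		ftruncate(fd, 4096);
 *		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0);
 *		memcpy(p, "hello", 5);		// dirties a page-cache page
 *		msync(p, 4096, MS_SYNC);	// written back via .writepage()
 *		munmap(p, 4096);
 *		close(fd);
 *		return (0);
 *	}
 */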
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = filp->f_mapping->host;
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
	spl_fstrans_unmark(cookie);

	if (error)
		return (error);

	error = generic_file_mmap(filp, vma);
	if (error)
		return (error);

	return (error);
}

/*
 * Populate a page with data for the Linux page cache. This function is
 * only used to support mmap(2). There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 */
static inline int
zpl_readpage_common(struct page *pp)
{
	fstrans_cookie_t cookie;

	ASSERT(PageLocked(pp));

	cookie = spl_fstrans_mark();
	int error = -zfs_getpage(pp->mapping->host, pp);
	spl_fstrans_unmark(cookie);

	unlock_page(pp);

	return (error);
}

#ifdef HAVE_VFS_READ_FOLIO
static int
zpl_read_folio(struct file *filp, struct folio *folio)
{
	return (zpl_readpage_common(&folio->page));
}
#else
static int
zpl_readpage(struct file *filp, struct page *pp)
{
	return (zpl_readpage_common(pp));
}
#endif

static int
zpl_readpage_filler(void *data, struct page *pp)
{
	return (zpl_readpage_common(pp));
}

/*
 * Populate a set of pages with data for the Linux page cache. This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
#ifdef HAVE_VFS_READPAGES
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
#else
static void
zpl_readahead(struct readahead_control *ractl)
{
	struct page *page;

	while ((page = readahead_page(ractl)) != NULL) {
		int ret;

		ret = zpl_readpage_filler(NULL, page);
		put_page(page);
		if (ret)
			break;
	}
}
#endif

static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	boolean_t *for_sync = data;
	fstrans_cookie_t cookie;
	int ret;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));

	cookie = spl_fstrans_mark();
	ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
	spl_fstrans_unmark(cookie);

	return (ret);
}

#ifdef HAVE_WRITE_CACHE_PAGES
#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
	return (zpl_putpage(&pp->page, wbc, data));
}
#endif

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	int result;

#ifdef HAVE_WRITEPAGE_T_FOLIO
	result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
#else
	result = write_cache_pages(mapping, wbc, zpl_putpage, data);
#endif
	return (result);
}
#else
static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	pgoff_t start = wbc->range_start >> PAGE_SHIFT;
	pgoff_t end = wbc->range_end >> PAGE_SHIFT;

	struct folio_batch fbatch;
	folio_batch_init(&fbatch);

	/*
	 * This atomically (-ish) tags all DIRTY pages in the range with
	 * TOWRITE, allowing users to continue dirtying or undirtying pages
	 * while we get on with writeback, without us treading on each other.
	 */
	tag_pages_for_writeback(mapping, start, end);

	int err = 0;
	unsigned int npages;

	/*
	 * Grab references to the TOWRITE pages just flagged. This may not get
	 * all of them, so we do it in a loop until there are none left.
	 */
	while ((npages = filemap_get_folios_tag(mapping, &start, end,
	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {

		/* Loop over each page and write it out. */
		struct folio *folio;
		while ((folio = folio_batch_next(&fbatch)) != NULL) {
			folio_lock(folio);

			/*
			 * If the folio has been remapped, or is no longer
			 * dirty, then there's nothing to do.
			 */
			if (folio->mapping != mapping ||
			    !folio_test_dirty(folio)) {
				folio_unlock(folio);
				continue;
			}

			/*
			 * If writeback is already in progress, wait for it to
			 * finish. We continue after this even if the page
			 * ends up clean; zfs_putpage() will skip it if no
			 * further work is required.
			 */
			while (folio_test_writeback(folio))
				folio_wait_bit(folio, PG_writeback);

			/*
			 * Write it out and collect any error. zfs_putpage()
			 * will clear the TOWRITE and DIRTY flags, and return
			 * with the page unlocked.
			 */
			int ferr = zpl_putpage(&folio->page, wbc, data);
			if (err == 0 && ferr != 0)
				err = ferr;

			/* Housekeeping for the caller. */
			wbc->nr_to_write -= folio_nr_pages(folio);
		}

		/* Release any remaining references on the batch. */
		folio_batch_release(&fbatch);
	}

	return (err);
}
#endif

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	znode_t *zp = ITOZ(mapping->host);
	zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
	enum writeback_sync_modes sync_mode;
	int result;

	if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
		return (result);
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;
	zpl_exit(zfsvfs, FTAG);
	sync_mode = wbc->sync_mode;

	/*
	 * We don't want to run write_cache_pages() in SYNC mode here, because
	 * that would make putpage() wait for a single page to be committed to
	 * disk every single time, resulting in atrocious performance. Instead
	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
	 * and then we commit it all in one go.
	 */
	boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
	wbc->sync_mode = WB_SYNC_NONE;
	result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	if (sync_mode != wbc->sync_mode) {
		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			return (result);

		if (zfsvfs->z_log != NULL) {
			/*
			 * We don't want to block here if the pool suspends,
			 * because this is not a syncing op by itself, but
			 * might be part of one that the caller will
			 * coordinate.
			 */
			result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
			    ZIL_COMMIT_NOW);
		}

		zpl_exit(zfsvfs, FTAG);

		/*
		 * If zil_commit_flags() failed, it's unclear what state things
		 * are currently in. putpage() has written back out what it can
		 * to the DMU, but it may not be on disk. We have little choice
		 * but to escape.
		 */
		if (result != 0)
			return (result);

		/*
		 * We need to call write_cache_pages() again (we can't just
		 * return after the commit) because the previous call in
		 * non-SYNC mode does not guarantee that we got all the dirty
		 * pages (see the implementation of write_cache_pages() for
		 * details). That being said, this is a no-op in most cases.
		 */
		wbc->sync_mode = sync_mode;
		result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	}
	return (result);
}

#ifdef HAVE_VFS_WRITEPAGE
/*
 * Write out dirty pages to the ARC; this function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;

	boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

	return (zpl_putpage(pp, wbc, &for_sync));
}
#endif

/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
 */
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	loff_t olen;
	fstrans_cookie_t cookie;
	int error = 0;

	int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;

	if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
		return (-EOPNOTSUPP);

	if (offset < 0 || len <= 0)
		return (-EINVAL);

	spl_inode_lock(ip);
	olen = i_size_read(ip);

	crhold(cr);
	cookie = spl_fstrans_mark();
	if (mode & (test_mode)) {
		flock64_t bf;

		if (mode & FALLOC_FL_KEEP_SIZE) {
			if (offset > olen)
				goto out_unmark;

			if (offset + len > olen)
				len = olen - offset;
		}
		bf.l_type = F_WRLCK;
		bf.l_whence = SEEK_SET;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
		unsigned int percent = zfs_fallocate_reserve_percent;
		struct kstatfs statfs;

		/* Legacy mode, disable fallocate compatibility. */
		if (percent == 0) {
			error = -EOPNOTSUPP;
			goto out_unmark;
		}

		/*
		 * Use zfs_statvfs() instead of dmu_objset_space() since it
		 * also checks project quota limits, which are relevant here.
		 */
		error = zfs_statvfs(ip, &statfs);
		if (error)
			goto out_unmark;

		/*
		 * Shrink available space a bit to account for overhead/races.
		 * We know the product previously fit into availbytes from
		 * dmu_objset_space(), so the smaller product will also fit.
		 */
		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
			error = -ENOSPC;
			goto out_unmark;
		}
		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
	}
out_unmark:
	spl_fstrans_unmark(cookie);
	spl_inode_unlock(ip);

	crfree(cr);

	return (error);
}

static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return zpl_fallocate_common(file_inode(filp),
	    mode, offset, len);
}
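
/*
 * Editorial sketch, not part of the upstream source: how the two fallocate(2)
 * modes handled above look from userspace. Mode 0 is the emulated space
 * reservation gated by zfs_fallocate_reserve_percent; PUNCH_HOLE | KEEP_SIZE
 * is the combination forwarded to zfs_space(). Sizes are hypothetical.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	static int
 *	punch_then_reserve(int fd)
 *	{
 *		// Free the first 1 MiB without changing the file size.
 *		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		    0, 1 << 20) != 0)
 *			return (-1);
 *
 *		// Ask for 16 MiB of (sparse, emulated) preallocation.
 *		return (fallocate(fd, 0, 0, 16 << 20));
 *	}
 */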

static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
	uint32_t generation = file_inode(filp)->i_generation;

	return (copy_to_user(arg, &generation, sizeof (generation)));
}

static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
	struct inode *ip = file_inode(filp);
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os = zfsvfs->z_os;
	int error = 0;

	if (S_ISFIFO(ip->i_mode))
		return (-ESPIPE);

	if (offset < 0 || len < 0)
		return (-EINVAL);

	if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	switch (advice) {
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
		if (zn_has_cached_data(zp, offset, offset + len - 1))
			error = generic_fadvise(filp, offset, len, advice);
#endif
		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it. If there
		 * really is a larger sequential access pattern, perhaps
		 * dmu_zfetch will detect it.
		 */
		if (len == 0)
			len = i_size_read(ip) - offset;

		dmu_prefetch(os, zp->z_id, 0, offset, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = -EINVAL;
		break;
	}

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
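
/*
 * Editorial sketch, not part of the upstream source: a userspace hint that
 * reaches zpl_fadvise() above, where POSIX_FADV_WILLNEED is translated into
 * a dmu_prefetch() of the requested range. The length is hypothetical.
 *
 *	#include <fcntl.h>
 *
 *	// Hint that the first 8 MiB of the file will be read soon.
 *	static int
 *	prefetch_head(int fd)
 *	{
 *		return (posix_fadvise(fd, 0, 8 << 20, POSIX_FADV_WILLNEED));
 *	}
 */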

#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)

static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	uint32_t ioctl_flags = 0;

	if (zfs_flags & ZFS_IMMUTABLE)
		ioctl_flags |= FS_IMMUTABLE_FL;

	if (zfs_flags & ZFS_APPENDONLY)
		ioctl_flags |= FS_APPEND_FL;

	if (zfs_flags & ZFS_NODUMP)
		ioctl_flags |= FS_NODUMP_FL;

	if (zfs_flags & ZFS_PROJINHERIT)
		ioctl_flags |= ZFS_PROJINHERIT_FL;

	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
}

/*
 * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
 * attributes common to both Linux and Solaris are mapped.
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
	uint32_t flags;
	int err;

	flags = __zpl_ioctl_getflags(file_inode(filp));
	err = copy_to_user(arg, &flags, sizeof (flags));

	return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence of
 * how the Linux file attribute interface was designed. Another consequence is
 * that concurrent modification of files suffers from a TOCTOU race. Neither
 * are things we can fix without modifying the kernel-userland interface, which
 * is outside of our jurisdiction.
 */

#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))

static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
	    ZFS_PROJINHERIT_FL))
		return (-EOPNOTSUPP);

	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
		return (-EACCES);

	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \
	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \
		XVA_SET_REQ(xva, (xflag)); \
		(xfield) = ((ioctl_flags & (iflag)) != 0); \
	} \
} while (0)

	FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
	    xoap->xoa_immutable);
	FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
	    xoap->xoa_appendonly);
	FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
	    xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
	    xoap->xoa_projinherit);

#undef FLAG_CHANGE

	return (0);
}

static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint32_t flags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&flags, arg, sizeof (flags)))
		return (-EFAULT);

	err = __zpl_ioctl_setflags(ip, flags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
	zfsxattr_t fsx = { 0 };
	struct inode *ip = file_inode(filp);
	int err;

	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
	fsx.fsx_projid = ITOZ(ip)->z_projid;
	err = copy_to_user(arg, &fsx, sizeof (fsx));

	return (err);
}

static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfsxattr_t fsx;
	cred_t *cr = CRED();
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&fsx, arg, sizeof (fsx)))
		return (-EFAULT);

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (-EINVAL);

	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
	if (err)
		return (err);

	xoap = xva_getxoptattr(&xva);
	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

/*
 * Expose Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags = ITOZ(ip)->z_pflags;
	dosflags &= ZFS_DOS_FL_USER_VISIBLE;
	int err = copy_to_user(arg, &dosflags, sizeof (dosflags));

	return (err);
}

static int
__zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
		return (-EOPNOTSUPP);

	if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define FLAG_CHANGE(iflag, xflag, xfield) do { \
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) || \
	    ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) { \
		XVA_SET_REQ(xva, (xflag)); \
		(xfield) = ((ioctl_flags & (iflag)) != 0); \
	} \
} while (0)

	FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
	FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
	FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
	FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
	FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
	FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
	FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
	FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
	FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
	FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);

#undef FLAG_CHANGE

	return (0);
}

/*
 * Set Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&dosflags, arg, sizeof (dosflags)))
		return (-EFAULT);

	err = __zpl_ioctl_setdosflags(ip, dosflags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_rewrite(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfs_rewrite_args_t args;
	fstrans_cookie_t cookie;
	int err;

	if (copy_from_user(&args, arg, sizeof (args)))
		return (-EFAULT);

	if (unlikely(!(filp->f_mode & FMODE_WRITE)))
		return (-EBADF);

	cookie = spl_fstrans_mark();
	err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
	spl_fstrans_unmark(cookie);

	return (err);
}

static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
		return (zpl_ioctl_getversion(filp, (void *)arg));
	case FS_IOC_GETFLAGS:
		return (zpl_ioctl_getflags(filp, (void *)arg));
	case FS_IOC_SETFLAGS:
		return (zpl_ioctl_setflags(filp, (void *)arg));
	case ZFS_IOC_FSGETXATTR:
		return (zpl_ioctl_getxattr(filp, (void *)arg));
	case ZFS_IOC_FSSETXATTR:
		return (zpl_ioctl_setxattr(filp, (void *)arg));
	case ZFS_IOC_GETDOSFLAGS:
		return (zpl_ioctl_getdosflags(filp, (void *)arg));
	case ZFS_IOC_SETDOSFLAGS:
		return (zpl_ioctl_setdosflags(filp, (void *)arg));
	case ZFS_IOC_REWRITE:
		return (zpl_ioctl_rewrite(filp, (void *)arg));
	default:
		return (-ENOTTY);
	}
}
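
/*
 * Editorial sketch, not part of the upstream source: the FS_IOC_GETFLAGS /
 * FS_IOC_SETFLAGS pair handled above is the same interface used by
 * lsattr(1) and chattr(1). A minimal caller; note that setting
 * FS_APPEND_FL or FS_IMMUTABLE_FL requires CAP_LINUX_IMMUTABLE, as
 * enforced in __zpl_ioctl_setflags().
 *
 *	#include <linux/fs.h>
 *	#include <sys/ioctl.h>
 *
 *	// Set the append-only attribute on an open file descriptor.
 *	static int
 *	set_append_only(int fd)
 *	{
 *		unsigned int flags;
 *
 *		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0)
 *			return (-1);
 *		flags |= FS_APPEND_FL;
 *		return (ioctl(fd, FS_IOC_SETFLAGS, &flags));
 *	}
 */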

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC32_GETVERSION:
		cmd = FS_IOC_GETVERSION;
		break;
	case FS_IOC32_GETFLAGS:
		cmd = FS_IOC_GETFLAGS;
		break;
	case FS_IOC32_SETFLAGS:
		cmd = FS_IOC_SETFLAGS;
		break;
	default:
		return (-ENOTTY);
	}
	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */

const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
	.readpages = zpl_readpages,
#else
	.readahead = zpl_readahead,
#endif
#ifdef HAVE_VFS_READ_FOLIO
	.read_folio = zpl_read_folio,
#else
	.readpage = zpl_readpage,
#endif
#ifdef HAVE_VFS_WRITEPAGE
	.writepage = zpl_writepage,
#endif
	.writepages = zpl_writepages,
	.direct_IO = zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
	.set_page_dirty = __set_page_dirty_nobuffers,
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
	.dirty_folio = filemap_dirty_folio,
#endif
#ifdef HAVE_VFS_MIGRATE_FOLIO
	.migrate_folio = migrate_folio,
#elif defined(HAVE_VFS_MIGRATEPAGE)
	.migratepage = migrate_page,
#endif
};

const struct file_operations zpl_file_operations = {
	.open = zpl_open,
	.release = zpl_release,
	.llseek = zpl_llseek,
	.read_iter = zpl_iter_read,
	.write_iter = zpl_iter_write,
#ifdef HAVE_COPY_SPLICE_READ
	.splice_read = copy_splice_read,
#else
	.splice_read = generic_file_splice_read,
#endif
	.splice_write = iter_file_splice_write,
	.mmap = zpl_mmap,
	.fsync = zpl_fsync,
	.fallocate = zpl_fallocate,
	.copy_file_range = zpl_copy_file_range,
#ifdef HAVE_VFS_CLONE_FILE_RANGE
	.clone_file_range = zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
	.remap_file_range = zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
	.dedupe_file_range = zpl_dedupe_file_range,
#endif
	.fadvise = zpl_fadvise,
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
	.llseek = generic_file_llseek,
	.read = generic_read_dir,
	.iterate_shared = zpl_iterate,
	.fsync = zpl_fsync,
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = zpl_compat_ioctl,
#endif
};

module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
	"Percentage of length to use for the available capacity check");