GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c

// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 * Copyright (c) 2025, Rob Norris <[email protected]>
 */

#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#include <linux/migrate.h>
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
#include <linux/pagemap_compat.h>
#include <linux/fadvise.h>
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;

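/*
 * Open a regular file. Run the kernel's generic open checks first, then
 * let zfs_open() apply its own open-time checks with the caller's
 * credentials held across an fstrans mark.
 */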
static int
zpl_open(struct inode *ip, struct file *filp)
{
    cred_t *cr = CRED();
    int error;
    fstrans_cookie_t cookie;

    error = generic_file_open(ip, filp);
    if (error)
        return (error);

    crhold(cr);
    cookie = spl_fstrans_mark();
    error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
    spl_fstrans_unmark(cookie);
    crfree(cr);
    ASSERT3S(error, <=, 0);

    return (error);
}

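/*
 * Release a file. If an atime update is still pending (z_atime_dirty),
 * mark the inode dirty so it gets written back, then hand the close off
 * to zfs_close().
 */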
static int
zpl_release(struct inode *ip, struct file *filp)
{
    cred_t *cr = CRED();
    int error;
    fstrans_cookie_t cookie;

    cookie = spl_fstrans_mark();
    if (ITOZ(ip)->z_atime_dirty)
        zfs_mark_inode_dirty(ip);

    crhold(cr);
    error = -zfs_close(ip, filp->f_flags, cr);
    spl_fstrans_unmark(cookie);
    crfree(cr);
    ASSERT3S(error, <=, 0);

    return (error);
}

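/*
 * Directory ->iterate_shared hook: emit directory entries into the
 * dir_context via zfs_readdir().
 */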
static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
    cred_t *cr = CRED();
    int error;
    fstrans_cookie_t cookie;

    crhold(cr);
    cookie = spl_fstrans_mark();
    error = -zfs_readdir(file_inode(filp), ctx, cr);
    spl_fstrans_unmark(cookie);
    crfree(cr);
    ASSERT3S(error, <=, 0);

    return (error);
}

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data);

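/*
 * ->fsync hook: push any dirty pages in the range through the page
 * writeback path (see the comment below), then have zfs_fsync() commit
 * the relevant log records to stable storage.
 */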
static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
    struct inode *inode = filp->f_mapping->host;
    znode_t *zp = ITOZ(inode);
    cred_t *cr = CRED();
    int error;
    fstrans_cookie_t cookie;

    /*
     * Force dirty pages in the range out to the DMU and the log, ready
     * for zil_commit() to write down.
     *
     * We call write_cache_pages() directly to ensure that zpl_putpage() is
     * called with the flags we need. We need WB_SYNC_NONE to avoid a call
     * to zil_commit() (since we're doing this as a kind of pre-sync); but
     * we do need for_sync so that the pages remain in writeback until
     * they're on disk, and so that we get an error if the DMU write fails.
     */
    if (filemap_range_has_page(inode->i_mapping, start, end)) {
        int for_sync = 1;
        struct writeback_control wbc = {
            .sync_mode = WB_SYNC_NONE,
            .nr_to_write = LONG_MAX,
            .range_start = start,
            .range_end = end,
        };
        error =
            zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
        if (error != 0) {
            /*
             * Unclear what state things are in. zfs_putpage() will
             * ensure the pages remain dirty if they haven't been
             * written down to the DMU, but because there may be
             * nothing logged, we can't assume that zfs_sync() ->
             * zil_commit() will give us a useful error. It's
             * safest if we just error out here.
             */
            return (error);
        }
    }

    crhold(cr);
    cookie = spl_fstrans_mark();
    error = -zfs_fsync(zp, datasync, cr);
    spl_fstrans_unmark(cookie);
    crfree(cr);
    ASSERT3S(error, <=, 0);

    return (error);
}

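/*
 * Map the I/O flags carried on a kiocb to the equivalent O_* flags
 * understood by the common zfs_read()/zfs_write() code.
 */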
static inline int
zfs_io_flags(struct kiocb *kiocb)
{
    int flags = 0;

#if defined(IOCB_DSYNC)
    if (kiocb->ki_flags & IOCB_DSYNC)
        flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
    if (kiocb->ki_flags & IOCB_SYNC)
        flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
    if (kiocb->ki_flags & IOCB_APPEND)
        flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
    if (kiocb->ki_flags & IOCB_DIRECT)
        flags |= O_DIRECT;
#endif
    return (flags);
}

/*
 * If relatime is enabled, call file_accessed() only if
 * zfs_relatime_need_update() is true. This is needed since datasets with an
 * inherited "relatime" property aren't necessarily mounted with the
 * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what the
 * VFS relatime test in relatime_need_update() is based on.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
    struct inode *ip = filp->f_mapping->host;

    if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
        if (zfs_relatime_need_update(ip))
            file_accessed(filp);
    } else {
        file_accessed(filp);
    }
}

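/*
 * ->read_iter hook: wrap the destination iov_iter in a zfs_uio_t, pass it
 * to zfs_read(), then advance the file position by the number of bytes
 * actually read and update atime via zpl_file_accessed().
 */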
static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
    cred_t *cr = CRED();
    fstrans_cookie_t cookie;
    struct file *filp = kiocb->ki_filp;
    ssize_t count = iov_iter_count(to);
    zfs_uio_t uio;

    zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count);

    crhold(cr);
    cookie = spl_fstrans_mark();

    ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
        filp->f_flags | zfs_io_flags(kiocb), cr);

    spl_fstrans_unmark(cookie);
    crfree(cr);

    if (ret < 0)
        return (ret);

    ssize_t read = count - uio.uio_resid;
    kiocb->ki_pos += read;

    zpl_file_accessed(filp);

    return (read);
}

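/*
 * ->write_iter hook and its helper: run the kernel's generic write checks
 * (returning the possibly clamped byte count through *countp), then feed
 * the source iov_iter to zfs_write() and advance the file position by the
 * number of bytes actually written.
 */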
static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
    ssize_t ret = generic_write_checks(kiocb, from);
    if (ret <= 0)
        return (ret);

    *countp = ret;

    return (0);
}

static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
    cred_t *cr = CRED();
    fstrans_cookie_t cookie;
    struct file *filp = kiocb->ki_filp;
    struct inode *ip = filp->f_mapping->host;
    zfs_uio_t uio;
    size_t count = 0;
    ssize_t ret;

    ret = zpl_generic_write_checks(kiocb, from, &count);
    if (ret)
        return (ret);

    zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count);

    crhold(cr);
    cookie = spl_fstrans_mark();

    ret = -zfs_write(ITOZ(ip), &uio,
        filp->f_flags | zfs_io_flags(kiocb), cr);

    spl_fstrans_unmark(cookie);
    crfree(cr);

    if (ret < 0)
        return (ret);

    ssize_t wrote = count - uio.uio_resid;
    kiocb->ki_pos += wrote;

    return (wrote);
}

static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
    /*
     * All O_DIRECT requests should be handled by
     * zpl_iter_write/read(). There is no way kernel generic code should
     * call the direct_IO address_space_operations function. We set this
     * code path to be fatal if it is executed.
     */
    PANIC(0);
    return (0);
}

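/*
 * ->llseek hook: SEEK_HOLE and SEEK_DATA are resolved by zfs_holey() under
 * the shared inode lock; every other whence value falls through to
 * generic_file_llseek().
 */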
static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
    fstrans_cookie_t cookie;

    if (whence == SEEK_DATA || whence == SEEK_HOLE) {
        struct inode *ip = filp->f_mapping->host;
        loff_t maxbytes = ip->i_sb->s_maxbytes;
        loff_t error;

        spl_inode_lock_shared(ip);
        cookie = spl_fstrans_mark();
        error = -zfs_holey(ITOZ(ip), whence, &offset);
        spl_fstrans_unmark(cookie);
        if (error == 0)
            error = lseek_execute(filp, ip, offset, maxbytes);
        spl_inode_unlock_shared(ip);

        return (error);
    }
#endif /* SEEK_HOLE && SEEK_DATA */

    return (generic_file_llseek(filp, offset, whence));
}

/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2) will
 * ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note that because a read(2) of an mmap'ed file
 * will always check the page cache first, correct data will still be
 * returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks. The most obvious of these is that it increases the
 * required memory footprint when accessing mmap'ed files. It also adds
 * additional complexity to the code keeping both caches synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly on to the ARC buffers. The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index. The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both. It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
    struct inode *ip = filp->f_mapping->host;
    int error;
    fstrans_cookie_t cookie;

    cookie = spl_fstrans_mark();
    error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
        (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
    spl_fstrans_unmark(cookie);

    if (error)
        return (error);

    error = generic_file_mmap(filp, vma);
    if (error)
        return (error);

    return (error);
}

/*
 * Populate a page with data for the Linux page cache. This function is
 * only used to support mmap(2). There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 */
static inline int
zpl_readpage_common(struct page *pp)
{
    fstrans_cookie_t cookie;

    ASSERT(PageLocked(pp));

    cookie = spl_fstrans_mark();
    int error = -zfs_getpage(pp->mapping->host, pp);
    spl_fstrans_unmark(cookie);

    unlock_page(pp);

    return (error);
}

#ifdef HAVE_VFS_READ_FOLIO
static int
zpl_read_folio(struct file *filp, struct folio *folio)
{
    return (zpl_readpage_common(&folio->page));
}
#else
static int
zpl_readpage(struct file *filp, struct page *pp)
{
    return (zpl_readpage_common(pp));
}
#endif

static int
zpl_readpage_filler(void *data, struct page *pp)
{
    return (zpl_readpage_common(pp));
}

/*
 * Populate a set of pages with data for the Linux page cache. This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
#ifdef HAVE_VFS_READPAGES
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
    return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
#else
static void
zpl_readahead(struct readahead_control *ractl)
{
    struct page *page;

    while ((page = readahead_page(ractl)) != NULL) {
        int ret;

        ret = zpl_readpage_filler(NULL, page);
        put_page(page);
        if (ret)
            break;
    }
}
#endif

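/*
 * Write a single dirty page back to the DMU via zfs_putpage(). The opaque
 * data pointer carries the for_sync flag supplied by the writeback callers.
 */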
static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
    boolean_t *for_sync = data;
    fstrans_cookie_t cookie;
    int ret;

    ASSERT(PageLocked(pp));
    ASSERT(!PageWriteback(pp));

    cookie = spl_fstrans_mark();
    ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
    spl_fstrans_unmark(cookie);

    return (ret);
}

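/*
 * zpl_write_cache_pages() pushes every dirty page in the requested range
 * through zpl_putpage(). When the kernel still provides write_cache_pages()
 * it is simply wrapped; otherwise an equivalent folio-based walk is done
 * locally below.
 */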
#ifdef HAVE_WRITE_CACHE_PAGES
#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
    return (zpl_putpage(&pp->page, wbc, data));
}
#endif

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
    int result;

#ifdef HAVE_WRITEPAGE_T_FOLIO
    result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
#else
    result = write_cache_pages(mapping, wbc, zpl_putpage, data);
#endif
    return (result);
}
#else
static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
    pgoff_t start = wbc->range_start >> PAGE_SHIFT;
    pgoff_t end = wbc->range_end >> PAGE_SHIFT;

    struct folio_batch fbatch;
    folio_batch_init(&fbatch);

    /*
     * This atomically (-ish) tags all DIRTY pages in the range with
     * TOWRITE, allowing users to continue dirtying or undirtying pages
     * while we get on with writeback, without us treading on each other.
     */
    tag_pages_for_writeback(mapping, start, end);

    int err = 0;
    unsigned int npages;

    /*
     * Grab references to the TOWRITE pages just flagged. This may not get
     * all of them, so we do it in a loop until there are none left.
     */
    while ((npages = filemap_get_folios_tag(mapping, &start, end,
        PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {

        /* Loop over each page and write it out. */
        struct folio *folio;
        while ((folio = folio_batch_next(&fbatch)) != NULL) {
            folio_lock(folio);

            /*
             * If the folio has been remapped, or is no longer
             * dirty, then there's nothing to do.
             */
            if (folio->mapping != mapping ||
                !folio_test_dirty(folio)) {
                folio_unlock(folio);
                continue;
            }

            /*
             * If writeback is already in progress, wait for it to
             * finish. We continue after this even if the page
             * ends up clean; zfs_putpage() will skip it if no
             * further work is required.
             */
            while (folio_test_writeback(folio))
                folio_wait_bit(folio, PG_writeback);

            /*
             * Write it out and collect any error. zfs_putpage()
             * will clear the TOWRITE and DIRTY flags, and return
             * with the page unlocked.
             */
            int ferr = zpl_putpage(&folio->page, wbc, data);
            if (err == 0 && ferr != 0)
                err = ferr;

            /* Housekeeping for the caller. */
            wbc->nr_to_write -= folio_nr_pages(folio);
        }

        /* Release any remaining references on the batch. */
        folio_batch_release(&fbatch);
    }

    return (err);
}
#endif

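/*
 * ->writepages hook: write the mapping's dirty pages, honoring the
 * sync_mode requested by the caller (forced to WB_SYNC_ALL when the
 * dataset uses sync=always); see the comments below for why the page walk
 * is always run in non-SYNC mode first.
 */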
static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
    znode_t *zp = ITOZ(mapping->host);
    zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
    enum writeback_sync_modes sync_mode;
    int result;

    if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
        return (result);
    if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
        wbc->sync_mode = WB_SYNC_ALL;
    zpl_exit(zfsvfs, FTAG);
    sync_mode = wbc->sync_mode;

    /*
     * We don't want to run write_cache_pages() in SYNC mode here, because
     * that would make putpage() wait for a single page to be committed to
     * disk every single time, resulting in atrocious performance. Instead
     * we run it once in non-SYNC mode so that the ZIL gets all the data,
     * and then we commit it all in one go.
     */
    boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
    wbc->sync_mode = WB_SYNC_NONE;
    result = zpl_write_cache_pages(mapping, wbc, &for_sync);
    if (sync_mode != wbc->sync_mode) {
        if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
            return (result);

        if (zfsvfs->z_log != NULL) {
            /*
             * We don't want to block here if the pool suspends,
             * because this is not a syncing op by itself, but
             * might be part of one that the caller will
             * coordinate.
             */
            result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
                ZIL_COMMIT_NOW);
        }

        zpl_exit(zfsvfs, FTAG);

        /*
         * If zil_commit_flags() failed, it's unclear what state things
         * are currently in. putpage() has written back out what it can
         * to the DMU, but it may not be on disk. We have little choice
         * but to escape.
         */
        if (result != 0)
            return (result);

        /*
         * We need to call write_cache_pages() again (we can't just
         * return after the commit) because the previous call in
         * non-SYNC mode does not guarantee that we got all the dirty
         * pages (see the implementation of write_cache_pages() for
         * details). That being said, this is a no-op in most cases.
         */
        wbc->sync_mode = sync_mode;
        result = zpl_write_cache_pages(mapping, wbc, &for_sync);
    }
    return (result);
}

#ifdef HAVE_VFS_WRITEPAGE
/*
 * Write out dirty pages to the ARC; this function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
    if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
        wbc->sync_mode = WB_SYNC_ALL;

    boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

    return (zpl_putpage(pp, wbc, &for_sync));
}
#endif

/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
 */
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
    cred_t *cr = CRED();
    loff_t olen;
    fstrans_cookie_t cookie;
    int error = 0;

    int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;

    if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
        return (-EOPNOTSUPP);

    if (offset < 0 || len <= 0)
        return (-EINVAL);

    spl_inode_lock(ip);
    olen = i_size_read(ip);

    crhold(cr);
    cookie = spl_fstrans_mark();
    if (mode & (test_mode)) {
        flock64_t bf;

        if (mode & FALLOC_FL_KEEP_SIZE) {
            if (offset > olen)
                goto out_unmark;

            if (offset + len > olen)
                len = olen - offset;
        }
        bf.l_type = F_WRLCK;
        bf.l_whence = SEEK_SET;
        bf.l_start = offset;
        bf.l_len = len;
        bf.l_pid = 0;

        error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
    } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
        unsigned int percent = zfs_fallocate_reserve_percent;
        struct kstatfs statfs;

        /* Legacy mode, disable fallocate compatibility. */
        if (percent == 0) {
            error = -EOPNOTSUPP;
            goto out_unmark;
        }

        /*
         * Use zfs_statvfs() instead of dmu_objset_space() since it
         * also checks project quota limits, which are relevant here.
         */
        error = zfs_statvfs(ip, &statfs);
        if (error)
            goto out_unmark;

        /*
         * Shrink available space a bit to account for overhead/races.
         * We know the product previously fit into availbytes from
         * dmu_objset_space(), so the smaller product will also fit.
         */
        if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
            error = -ENOSPC;
            goto out_unmark;
        }
        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
            error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
    }
out_unmark:
    spl_fstrans_unmark(cookie);
    spl_inode_unlock(ip);

    crfree(cr);

    return (error);
}

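/* fallocate(2) entry point; resolve the inode and call the common code. */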
static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
    return zpl_fallocate_common(file_inode(filp),
        mode, offset, len);
}

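/* FS_IOC_GETVERSION: copy the inode generation number out to user space. */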
static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
    uint32_t generation = file_inode(filp)->i_generation;

    return (copy_to_user(arg, &generation, sizeof (generation)));
}

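/*
 * ->fadvise hook: POSIX_FADV_SEQUENTIAL and POSIX_FADV_WILLNEED are turned
 * into dmu_prefetch() requests (plus generic_fadvise() for any data already
 * in the page cache); the remaining advice values are accepted but ignored.
 */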
static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
    struct inode *ip = file_inode(filp);
    znode_t *zp = ITOZ(ip);
    zfsvfs_t *zfsvfs = ITOZSB(ip);
    objset_t *os = zfsvfs->z_os;
    int error = 0;

    if (S_ISFIFO(ip->i_mode))
        return (-ESPIPE);

    if (offset < 0 || len < 0)
        return (-EINVAL);

    if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
        return (error);

    switch (advice) {
    case POSIX_FADV_SEQUENTIAL:
    case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
        if (zn_has_cached_data(zp, offset, offset + len - 1))
            error = generic_fadvise(filp, offset, len, advice);
#endif
        /*
         * Pass on the caller's size directly, but note that
         * dmu_prefetch_max will effectively cap it. If there
         * really is a larger sequential access pattern, perhaps
         * dmu_zfetch will detect it.
         */
        if (len == 0)
            len = i_size_read(ip) - offset;

        dmu_prefetch(os, zp->z_id, 0, offset, len,
            ZIO_PRIORITY_ASYNC_READ);
        break;
    case POSIX_FADV_NORMAL:
    case POSIX_FADV_RANDOM:
    case POSIX_FADV_DONTNEED:
    case POSIX_FADV_NOREUSE:
        /* ignored for now */
        break;
    default:
        error = -EINVAL;
        break;
    }

    zfs_exit(zfsvfs, FTAG);

    return (error);
}

#define ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL)
#define ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL)

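/*
 * Table mapping ZFS z_pflags bits to the corresponding FS_*_FL ioctl flags
 * and FS_XFLAG_* fsxattr flags.
 */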
static struct {
    uint64_t zfs_flag;
    uint32_t fs_flag;
    uint32_t xflag;
} flags_lookup[] = {
    {ZFS_IMMUTABLE, FS_IMMUTABLE_FL, FS_XFLAG_IMMUTABLE},
    {ZFS_APPENDONLY, FS_APPEND_FL, FS_XFLAG_APPEND},
    {ZFS_NODUMP, FS_NODUMP_FL, FS_XFLAG_NODUMP},
    {ZFS_PROJINHERIT, FS_PROJINHERIT_FL, FS_XFLAG_PROJINHERIT}
};

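/*
 * Collect the FS_*_FL (resp. FS_XFLAG_*) bits that correspond to the flags
 * currently set in the znode's z_pflags.
 */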
static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
    uint64_t zfs_flags = ITOZ(ip)->z_pflags;
    uint32_t ioctl_flags = 0;

    for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++)
        if (zfs_flags & flags_lookup[i].zfs_flag)
            ioctl_flags |= flags_lookup[i].fs_flag;

    return (ioctl_flags);
}

static uint32_t
__zpl_ioctl_getxflags(struct inode *ip)
{
    uint64_t zfs_flags = ITOZ(ip)->z_pflags;
    uint32_t ioctl_flags = 0;

    for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++)
        if (zfs_flags & flags_lookup[i].zfs_flag)
            ioctl_flags |= flags_lookup[i].xflag;

    return (ioctl_flags);
}

/*
 * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
 * attributes common to both Linux and Solaris are mapped.
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
    uint32_t flags;
    int err;

    flags = __zpl_ioctl_getflags(file_inode(filp));
    flags = flags & ZFS_FL_USER_VISIBLE;
    err = copy_to_user(arg, &flags, sizeof (flags));

    return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence of
 * how the Linux file attribute interface was designed. Another consequence is
 * that concurrent modification of files suffers from a TOCTOU race. Neither
 * are things we can fix without modifying the kernel-userland interface, which
 * is outside of our jurisdiction.
 */

#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))

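/*
 * Validate a FS_IOC_SETFLAGS (resp. FS_IOC_FSSETXATTR) request and convert
 * the requested flag changes into an xvattr_t suitable for zfs_setattr().
 * Only changes relative to the current z_pflags are requested.
 */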
static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
    uint64_t zfs_flags = ITOZ(ip)->z_pflags;
    xoptattr_t *xoap;

    if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
        FS_PROJINHERIT_FL))
        return (-EOPNOTSUPP);

    if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
        return (-EACCES);

    if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
        fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
        !capable(CAP_LINUX_IMMUTABLE))
        return (-EPERM);

    if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
        return (-EACCES);

    xva_init(xva);
    xoap = xva_getxoptattr(xva);

#define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \
    if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \
        ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \
        XVA_SET_REQ(xva, (xflag)); \
        (xfield) = ((ioctl_flags & (iflag)) != 0); \
    } \
} while (0)

    FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
        xoap->xoa_immutable);
    FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
        xoap->xoa_appendonly);
    FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
        xoap->xoa_nodump);
    FLAG_CHANGE(FS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
        xoap->xoa_projinherit);

#undef FLAG_CHANGE

    return (0);
}

static int
__zpl_ioctl_setxflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
    uint64_t zfs_flags = ITOZ(ip)->z_pflags;
    xoptattr_t *xoap;

    if (ioctl_flags & ~(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND |
        FS_XFLAG_NODUMP | FS_XFLAG_PROJINHERIT))
        return (-EOPNOTSUPP);

    if ((fchange(ioctl_flags, zfs_flags, FS_XFLAG_IMMUTABLE,
        ZFS_IMMUTABLE) ||
        fchange(ioctl_flags, zfs_flags, FS_XFLAG_APPEND, ZFS_APPENDONLY)) &&
        !capable(CAP_LINUX_IMMUTABLE))
        return (-EPERM);

    if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
        return (-EACCES);

    xva_init(xva);
    xoap = xva_getxoptattr(xva);

#define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \
    if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \
        ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \
        XVA_SET_REQ(xva, (xflag)); \
        (xfield) = ((ioctl_flags & (iflag)) != 0); \
    } \
} while (0)

    FLAG_CHANGE(FS_XFLAG_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
        xoap->xoa_immutable);
    FLAG_CHANGE(FS_XFLAG_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
        xoap->xoa_appendonly);
    FLAG_CHANGE(FS_XFLAG_NODUMP, ZFS_NODUMP, XAT_NODUMP,
        xoap->xoa_nodump);
    FLAG_CHANGE(FS_XFLAG_PROJINHERIT, ZFS_PROJINHERIT, XAT_PROJINHERIT,
        xoap->xoa_projinherit);

#undef FLAG_CHANGE

    return (0);
}

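/* FS_IOC_SETFLAGS: translate the new flags and apply them via zfs_setattr(). */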
static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
    struct inode *ip = file_inode(filp);
    uint32_t flags;
    cred_t *cr = CRED();
    xvattr_t xva;
    int err;
    fstrans_cookie_t cookie;

    if (copy_from_user(&flags, arg, sizeof (flags)))
        return (-EFAULT);

    err = __zpl_ioctl_setflags(ip, flags, &xva);
    if (err)
        return (err);

    crhold(cr);
    cookie = spl_fstrans_mark();
    err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
    spl_fstrans_unmark(cookie);
    crfree(cr);

    return (err);
}

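/*
 * ZFS_IOC_FSGETXATTR/FSSETXATTR: report or update the FS_XFLAG_* bits and
 * the project ID associated with the file.
 */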
static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
    zfsxattr_t fsx = { 0 };
    struct inode *ip = file_inode(filp);
    int err;

    fsx.fsx_xflags = __zpl_ioctl_getxflags(ip);
    fsx.fsx_projid = ITOZ(ip)->z_projid;
    err = copy_to_user(arg, &fsx, sizeof (fsx));

    return (err);
}

static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
    struct inode *ip = file_inode(filp);
    zfsxattr_t fsx;
    cred_t *cr = CRED();
    xvattr_t xva;
    xoptattr_t *xoap;
    int err;
    fstrans_cookie_t cookie;

    if (copy_from_user(&fsx, arg, sizeof (fsx)))
        return (-EFAULT);

    if (!zpl_is_valid_projid(fsx.fsx_projid))
        return (-EINVAL);

    err = __zpl_ioctl_setxflags(ip, fsx.fsx_xflags, &xva);
    if (err)
        return (err);

    xoap = xva_getxoptattr(&xva);
    XVA_SET_REQ(&xva, XAT_PROJID);
    xoap->xoa_projid = fsx.fsx_projid;

    crhold(cr);
    cookie = spl_fstrans_mark();
    err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
    spl_fstrans_unmark(cookie);
    crfree(cr);

    return (err);
}

/*
 * Expose Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
{
    struct inode *ip = file_inode(filp);
    uint64_t dosflags = ITOZ(ip)->z_pflags;
    dosflags &= ZFS_DOS_FL_USER_VISIBLE;
    int err = copy_to_user(arg, &dosflags, sizeof (dosflags));

    return (err);
}

static int
__zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
{
    uint64_t zfs_flags = ITOZ(ip)->z_pflags;
    xoptattr_t *xoap;

    if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
        return (-EOPNOTSUPP);

    if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
        fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
        !capable(CAP_LINUX_IMMUTABLE))
        return (-EPERM);

    if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
        return (-EACCES);

    xva_init(xva);
    xoap = xva_getxoptattr(xva);

#define FLAG_CHANGE(iflag, xflag, xfield) do { \
    if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) || \
        ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) { \
        XVA_SET_REQ(xva, (xflag)); \
        (xfield) = ((ioctl_flags & (iflag)) != 0); \
    } \
} while (0)

    FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
    FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
    FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
    FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
    FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
    FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
    FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
    FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
    FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
    FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
    FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);

#undef FLAG_CHANGE

    return (0);
}

/*
 * Set Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
{
    struct inode *ip = file_inode(filp);
    uint64_t dosflags;
    cred_t *cr = CRED();
    xvattr_t xva;
    int err;
    fstrans_cookie_t cookie;

    if (copy_from_user(&dosflags, arg, sizeof (dosflags)))
        return (-EFAULT);

    err = __zpl_ioctl_setdosflags(ip, dosflags, &xva);
    if (err)
        return (err);

    crhold(cr);
    cookie = spl_fstrans_mark();
    err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
    spl_fstrans_unmark(cookie);
    crfree(cr);

    return (err);
}

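/*
 * ZFS_IOC_REWRITE: have zfs_rewrite() rewrite the requested file range;
 * the file must be open for writing.
 */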
static int
zpl_ioctl_rewrite(struct file *filp, void __user *arg)
{
    struct inode *ip = file_inode(filp);
    zfs_rewrite_args_t args;
    fstrans_cookie_t cookie;
    int err;

    if (copy_from_user(&args, arg, sizeof (args)))
        return (-EFAULT);

    if (unlikely(!(filp->f_mode & FMODE_WRITE)))
        return (-EBADF);

    cookie = spl_fstrans_mark();
    err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
    spl_fstrans_unmark(cookie);

    return (err);
}

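/* Dispatch the file ioctls supported on ZFS regular files and directories. */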
static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
    switch (cmd) {
    case FS_IOC_GETVERSION:
        return (zpl_ioctl_getversion(filp, (void *)arg));
    case FS_IOC_GETFLAGS:
        return (zpl_ioctl_getflags(filp, (void *)arg));
    case FS_IOC_SETFLAGS:
        return (zpl_ioctl_setflags(filp, (void *)arg));
    case ZFS_IOC_FSGETXATTR:
        return (zpl_ioctl_getxattr(filp, (void *)arg));
    case ZFS_IOC_FSSETXATTR:
        return (zpl_ioctl_setxattr(filp, (void *)arg));
    case ZFS_IOC_GETDOSFLAGS:
        return (zpl_ioctl_getdosflags(filp, (void *)arg));
    case ZFS_IOC_SETDOSFLAGS:
        return (zpl_ioctl_setdosflags(filp, (void *)arg));
    case ZFS_IOC_REWRITE:
        return (zpl_ioctl_rewrite(filp, (void *)arg));
    default:
        return (-ENOTTY);
    }
}

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
    switch (cmd) {
    case FS_IOC32_GETVERSION:
        cmd = FS_IOC_GETVERSION;
        break;
    case FS_IOC32_GETFLAGS:
        cmd = FS_IOC_GETFLAGS;
        break;
    case FS_IOC32_SETFLAGS:
        cmd = FS_IOC_SETFLAGS;
        break;
    default:
        return (-ENOTTY);
    }
    return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */

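/*
 * Page cache and file operation tables wired into the VFS; the #ifdefs
 * select the variants matching the interfaces provided by the kernel
 * being built against.
 */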
const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
    .readpages = zpl_readpages,
#else
    .readahead = zpl_readahead,
#endif
#ifdef HAVE_VFS_READ_FOLIO
    .read_folio = zpl_read_folio,
#else
    .readpage = zpl_readpage,
#endif
#ifdef HAVE_VFS_WRITEPAGE
    .writepage = zpl_writepage,
#endif
    .writepages = zpl_writepages,
    .direct_IO = zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
    .set_page_dirty = __set_page_dirty_nobuffers,
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
    .dirty_folio = filemap_dirty_folio,
#endif
#ifdef HAVE_VFS_MIGRATE_FOLIO
    .migrate_folio = migrate_folio,
#elif defined(HAVE_VFS_MIGRATEPAGE)
    .migratepage = migrate_page,
#endif
};

const struct file_operations zpl_file_operations = {
    .open = zpl_open,
    .release = zpl_release,
    .llseek = zpl_llseek,
    .read_iter = zpl_iter_read,
    .write_iter = zpl_iter_write,
#ifdef HAVE_COPY_SPLICE_READ
    .splice_read = copy_splice_read,
#else
    .splice_read = generic_file_splice_read,
#endif
    .splice_write = iter_file_splice_write,
    .mmap = zpl_mmap,
    .fsync = zpl_fsync,
    .fallocate = zpl_fallocate,
    .copy_file_range = zpl_copy_file_range,
#ifdef HAVE_VFS_CLONE_FILE_RANGE
    .clone_file_range = zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
    .remap_file_range = zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
    .dedupe_file_range = zpl_dedupe_file_range,
#endif
    .fadvise = zpl_fadvise,
    .unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
    .llseek = generic_file_llseek,
    .read = generic_read_dir,
    .iterate_shared = zpl_iterate,
    .fsync = zpl_fsync,
    .unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = zpl_compat_ioctl,
#endif
};

module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
    "Percentage of length to use for the available capacity check");