GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 * Copyright (c) 2025, Rob Norris <[email protected]>
 */

#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>
#include <linux/vfs_compat.h>

/*
 * What to do when the last reference to an inode is released. If 0, the kernel
 * will cache it on the superblock. If 1, the inode will be freed immediately.
 * See zpl_drop_inode().
 */
int zfs_delete_inode = 0;
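
/*
 * Editorial usage sketch (not part of the original source): the
 * ZFS_MODULE_PARAM() declaration at the bottom of this file exposes this
 * variable as a runtime tunable, so its behavior can be flipped with:
 *
 *	echo 1 > /sys/module/zfs/parameters/zfs_delete_inode
 */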

/*
 * What to do when the last reference to a dentry is released. If 0, the kernel
 * will cache it until the entry (file) is destroyed. If 1, the dentry will be
 * marked for cleanup, at which time its inode reference will be released. See
 * zpl_dentry_delete().
 */
int zfs_delete_dentry = 0;
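
/*
 * As above, an editorial usage sketch: this companion tunable can be set at
 * runtime with:
 *
 *	echo 1 > /sys/module/zfs/parameters/zfs_delete_dentry
 */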

static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}

#ifdef HAVE_SOPS_FREE_INODE
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_free(ip);
}
#endif

static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed. We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * ->drop_inode() is called when the last reference to an inode is released.
 * Its return value indicates if the inode should be destroyed immediately, or
 * cached on the superblock structure.
 *
 * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
 * "destroy immediately" if the inode is unhashed and has no links (roughly: no
 * longer exists on disk). On datasets with millions of rarely-accessed files,
 * this can cause a large amount of memory to be "pinned" by cached inodes,
 * which in turn pin their associated dnodes and dbufs, until the kernel starts
 * reporting memory pressure and requests OpenZFS release some memory (see
 * zfs_prune()).
 *
 * When set to 1, we call generic_delete_inode(), which always returns "destroy
 * immediately", resulting in inodes being destroyed immediately, releasing
 * their associated dnodes and dbufs to the dbuf cache and the ARC to be
 * evicted as normal.
 *
 * Note that the "last reference" doesn't always mean the last _userspace_
 * reference; the dentry cache also holds a reference, so "busy" inodes will
 * still be kept alive that way (subject to dcache tuning).
 */
static int
zpl_drop_inode(struct inode *ip)
{
	if (zfs_delete_inode)
		return (generic_delete_inode(ip));
	return (generic_drop_inode(ip));
}
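
/*
 * Editorial note, not from the original source: in current mainline kernels
 * generic_delete_inode() simply returns 1, while generic_drop_inode() is
 * roughly
 *
 *	return (!inode->i_nlink || inode_unhashed(inode));
 *
 * i.e. the "unhashed and has no links" test described above.
 */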

/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode(). For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict(). For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back. In either case, once this is done it is safe to clean up
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}

static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't use this (page writeback is a different thing for us),
 * so for 5.8-5.17 we can use that instead to get syncfs() to return the error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return; otherwise, we'll use s_wb_err if it was detected at
 * configure time (5.8-5.16 + vendor backports), and if it's unavailable, we
 * will block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}
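
/*
 * Editorial usage sketch (not part of the original source): the error
 * propagation above is what lets a plain syncfs(2) call in userspace see a
 * sync failure on 5.8+ kernels:
 *
 *	int fd = open("/tank/some/file", O_RDONLY);
 *	if (fd >= 0 && syncfs(fd) == -1)
 *		perror("syncfs");
 */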

static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts. This
	 * allows for a maximum size of 64PiB (2^32 blocks of 16MiB) to
	 * be reported. The file counts must be artificially capped at
	 * 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}
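
/*
 * Worked example (editorial, with hypothetical numbers): a pool reporting
 * 2^40 blocks of 1KiB would overflow a 32-bit statfs. Nine doublings of
 * f_bsize (1KiB -> 512KiB, still below the 16MiB cap) with matching
 * halvings of the counts yields 2^31 blocks, which fits in a u32 while
 * preserving the same total of 2^50 bytes (1PiB).
 */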

static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}
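
/*
 * For example (editorial note): a dataset named "tank/my data" is emitted
 * as "tank/my\040data", which getmntent(3) decodes back to the original
 * name when reading /proc/self/mounts.
 */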

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}
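
/*
 * Editorial example: for a typical dataset this appends something like
 * ",xattr,posixacl,casesensitive" to its line in /proc/self/mounts.
 */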

static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync. If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it runs under a spinlock,
	 * and s_umount is not held there, so it would race with zfs_umount
	 * and zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the read-only check for snapshots, since a snapshot
		 * is always read-only regardless of whether the ro flag was
		 * passed to mount.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}

static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}

void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}
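
/*
 * Editorial note: this is the memory-pressure entry point referenced in the
 * zpl_drop_inode() comment above; to our reading it is registered as an ARC
 * prune callback (via arc_add_prune_callback()) when the filesystem is
 * mounted, with the superblock as its argument.
 */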

const struct super_operations zpl_super_operations = {
	.alloc_inode = zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode = zpl_inode_free,
#endif
	.destroy_inode = zpl_inode_destroy,
	.dirty_inode = zpl_dirty_inode,
	.write_inode = NULL,
	.drop_inode = zpl_drop_inode,
	.evict_inode = zpl_evict_inode,
	.put_super = zpl_put_super,
	.sync_fs = zpl_sync_fs,
	.statfs = zpl_statfs,
	.remount_fs = zpl_remount_fs,
	.show_devname = zpl_show_devname,
	.show_options = zpl_show_options,
	.show_stats = NULL,
};

/*
 * ->d_delete() is called when the last reference to a dentry is released. Its
 * return value indicates if the dentry should be destroyed immediately, or
 * retained in the dentry cache.
 *
 * By default (zfs_delete_dentry=0) the kernel will always cache unused
 * entries. Each dentry holds an inode reference, so cached dentries can hold
 * the final inode reference indefinitely, leading to the inode and its related
 * data being pinned (see zpl_drop_inode()).
 *
 * When set to 1, we signal that the dentry should be destroyed immediately and
 * never cached. This reduces memory usage, at the cost of higher overhead when
 * looking up a file, as the inode and its underlying data (dnode/dbuf) need to
 * be reloaded and reinflated.
 *
 * Note that userspace does not have direct control over dentry references and
 * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
 * (e.g. vm.vfs_cache_pressure).
 */
static int
zpl_dentry_delete(const struct dentry *dentry)
{
	return (zfs_delete_dentry ? 1 : 0);
}

const struct dentry_operations zpl_dentry_operations = {
	.d_delete = zpl_dentry_delete,
};

struct file_system_type zpl_fs_type = {
	.owner = THIS_MODULE,
	.name = ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags = FS_USERNS_MOUNT,
#endif
	.mount = zpl_mount,
	.kill_sb = zpl_kill_sb,
};

ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
	"Delete inodes as soon as the last reference is released.");

ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
	"Delete dentries from dentry cache as soon as the last reference is "
	"released.");