freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/fs/unionfs/union_subr.c
1
/*-
2
* SPDX-License-Identifier: BSD-3-Clause
3
*
4
* Copyright (c) 1994 Jan-Simon Pendry
5
* Copyright (c) 1994
6
* The Regents of the University of California. All rights reserved.
7
* Copyright (c) 2005, 2006, 2012 Masanori Ozawa <[email protected]>, ONGS Inc.
8
* Copyright (c) 2006, 2012 Daichi Goto <[email protected]>
9
*
10
* This code is derived from software contributed to Berkeley by
11
* Jan-Simon Pendry.
12
*
13
* Redistribution and use in source and binary forms, with or without
14
* modification, are permitted provided that the following conditions
15
* are met:
16
* 1. Redistributions of source code must retain the above copyright
17
* notice, this list of conditions and the following disclaimer.
18
* 2. Redistributions in binary form must reproduce the above copyright
19
* notice, this list of conditions and the following disclaimer in the
20
* documentation and/or other materials provided with the distribution.
21
* 3. Neither the name of the University nor the names of its contributors
22
* may be used to endorse or promote products derived from this software
23
* without specific prior written permission.
24
*
25
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35
* SUCH DAMAGE.
36
*/
37
38
#include <sys/param.h>
39
#include <sys/systm.h>
40
#include <sys/kernel.h>
41
#include <sys/ktr.h>
42
#include <sys/lock.h>
43
#include <sys/mutex.h>
44
#include <sys/malloc.h>
45
#include <sys/mount.h>
46
#include <sys/namei.h>
47
#include <sys/proc.h>
48
#include <sys/vnode.h>
49
#include <sys/dirent.h>
50
#include <sys/fcntl.h>
51
#include <sys/filedesc.h>
52
#include <sys/stat.h>
53
#include <sys/sysctl.h>
54
#include <sys/taskqueue.h>
55
#include <sys/resourcevar.h>
56
57
#include <machine/atomic.h>
58
59
#include <security/mac/mac_framework.h>
60
61
#include <vm/uma.h>
62
63
#include <fs/unionfs/union.h>
64
65
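/*
 * Each unionfs directory vnode caches its unionfs child nodes in a small
 * hash table of NUNIONFSNODECACHE buckets; UNIONFSHASHMASK is the
 * corresponding index mask.
 */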
#define NUNIONFSNODECACHE 16
66
#define UNIONFSHASHMASK (NUNIONFSNODECACHE - 1)
67
68
static MALLOC_DEFINE(M_UNIONFSHASH, "UNIONFS hash", "UNIONFS hash table");
69
MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part");
70
MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part");
71
72
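/*
 * State used to defer the final vrele() of a unionfs node's parent
 * directory vnode to a dedicated taskqueue thread; see unionfs_noderem()
 * and unionfs_deferred_rele() below.
 */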
static struct task unionfs_deferred_rele_task;
73
static struct mtx unionfs_deferred_rele_lock;
74
static STAILQ_HEAD(, unionfs_node) unionfs_deferred_rele_list =
75
STAILQ_HEAD_INITIALIZER(unionfs_deferred_rele_list);
76
static TASKQUEUE_DEFINE_THREAD(unionfs_rele);
77
78
unsigned int unionfs_ndeferred = 0;
79
SYSCTL_UINT(_vfs, OID_AUTO, unionfs_ndeferred, CTLFLAG_RD,
80
&unionfs_ndeferred, 0, "unionfs deferred vnode release");
81
82
static void unionfs_deferred_rele(void *, int);
83
84
/*
85
* Initialize
86
*/
87
int
88
unionfs_init(struct vfsconf *vfsp)
89
{
90
UNIONFSDEBUG("unionfs_init\n"); /* printed during system boot */
91
TASK_INIT(&unionfs_deferred_rele_task, 0, unionfs_deferred_rele, NULL);
92
mtx_init(&unionfs_deferred_rele_lock, "uniondefr", NULL, MTX_DEF);
93
return (0);
94
}
95
96
/*
97
* Uninitialize
98
*/
99
int
100
unionfs_uninit(struct vfsconf *vfsp)
101
{
102
taskqueue_quiesce(taskqueue_unionfs_rele);
103
taskqueue_free(taskqueue_unionfs_rele);
104
mtx_destroy(&unionfs_deferred_rele_lock);
105
return (0);
106
}
107
108
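/*
 * Taskqueue handler: release the deferred parent-directory references and
 * free the corresponding unionfs nodes queued by unionfs_noderem().
 */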
static void
109
unionfs_deferred_rele(void *arg __unused, int pending __unused)
110
{
111
STAILQ_HEAD(, unionfs_node) local_rele_list;
112
struct unionfs_node *unp, *tunp;
113
unsigned int ndeferred;
114
115
ndeferred = 0;
116
STAILQ_INIT(&local_rele_list);
117
mtx_lock(&unionfs_deferred_rele_lock);
118
STAILQ_CONCAT(&local_rele_list, &unionfs_deferred_rele_list);
119
mtx_unlock(&unionfs_deferred_rele_lock);
120
STAILQ_FOREACH_SAFE(unp, &local_rele_list, un_rele, tunp) {
121
++ndeferred;
122
MPASS(unp->un_dvp != NULL);
123
vrele(unp->un_dvp);
124
free(unp, M_UNIONFSNODE);
125
}
126
127
/* We expect this function to be single-threaded, thus no atomic */
128
unionfs_ndeferred += ndeferred;
129
}
130
131
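/*
 * Return the hash chain in dvp's child-vnode cache that corresponds to the
 * given upper or lower vnode.
 */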
static struct unionfs_node_hashhead *
132
unionfs_get_hashhead(struct vnode *dvp, struct vnode *lookup)
133
{
134
struct unionfs_node *unp;
135
136
unp = VTOUNIONFS(dvp);
137
138
return (&(unp->un_hashtbl[vfs_hash_index(lookup) & UNIONFSHASHMASK]));
139
}
140
141
/*
142
* Attempt to look up a cached unionfs vnode by upper/lower vp
143
* from dvp, with dvp's interlock held.
144
*/
145
static struct vnode *
146
unionfs_get_cached_vnode_locked(struct vnode *lookup, struct vnode *dvp)
147
{
148
struct unionfs_node *unp;
149
struct unionfs_node_hashhead *hd;
150
struct vnode *vp;
151
152
hd = unionfs_get_hashhead(dvp, lookup);
153
154
LIST_FOREACH(unp, hd, un_hash) {
155
if (unp->un_uppervp == lookup ||
156
unp->un_lowervp == lookup) {
157
vp = UNIONFSTOV(unp);
158
VI_LOCK_FLAGS(vp, MTX_DUPOK);
159
vp->v_iflag &= ~VI_OWEINACT;
160
if (VN_IS_DOOMED(vp) ||
161
((vp->v_iflag & VI_DOINGINACT) != 0)) {
162
VI_UNLOCK(vp);
163
vp = NULL;
164
} else {
165
vrefl(vp);
166
VI_UNLOCK(vp);
167
}
168
return (vp);
169
}
170
}
171
172
return (NULL);
173
}
174
175
176
/*
177
* Get the cached vnode.
178
*/
179
static struct vnode *
180
unionfs_get_cached_vnode(struct vnode *uvp, struct vnode *lvp,
181
struct vnode *dvp)
182
{
183
struct vnode *vp;
184
185
vp = NULL;
186
VI_LOCK(dvp);
187
if (uvp != NULL)
188
vp = unionfs_get_cached_vnode_locked(uvp, dvp);
189
else if (lvp != NULL)
190
vp = unionfs_get_cached_vnode_locked(lvp, dvp);
191
VI_UNLOCK(dvp);
192
193
return (vp);
194
}
195
196
/*
197
* Add the new vnode to the cache, or return an already-cached matching vnode.
198
*/
199
static struct vnode *
200
unionfs_ins_cached_vnode(struct unionfs_node *uncp,
201
struct vnode *dvp)
202
{
203
struct unionfs_node_hashhead *hd;
204
struct vnode *vp;
205
206
vp = NULL;
207
VI_LOCK(dvp);
208
if (uncp->un_uppervp != NULL) {
209
ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
210
KASSERT(uncp->un_uppervp->v_type == VDIR,
211
("%s: v_type != VDIR", __func__));
212
vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp);
213
} else if (uncp->un_lowervp != NULL) {
214
ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
215
KASSERT(uncp->un_lowervp->v_type == VDIR,
216
("%s: v_type != VDIR", __func__));
217
vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp);
218
}
219
if (vp == NULL) {
220
hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULL ?
221
uncp->un_uppervp : uncp->un_lowervp));
222
LIST_INSERT_HEAD(hd, uncp, un_hash);
223
}
224
VI_UNLOCK(dvp);
225
226
return (vp);
227
}
228
229
/*
230
* Remove the vnode from its parent directory's cache.
231
*/
232
static void
233
unionfs_rem_cached_vnode(struct unionfs_node *unp, struct vnode *dvp)
234
{
235
KASSERT(unp != NULL, ("%s: null node", __func__));
236
KASSERT(dvp != NULL,
237
("%s: null parent vnode", __func__));
238
239
VI_LOCK(dvp);
240
if (unp->un_hash.le_prev != NULL) {
241
LIST_REMOVE(unp, un_hash);
242
unp->un_hash.le_next = NULL;
243
unp->un_hash.le_prev = NULL;
244
}
245
VI_UNLOCK(dvp);
246
}
247
248
/*
249
* Common cleanup handling for unionfs_nodeget
250
* Upper, lower, and parent directory vnodes are expected to be referenced by
251
* the caller. Upper and lower vnodes, if non-NULL, are also expected to be
252
* exclusively locked by the caller.
253
* This function will return with the caller's locks and references undone.
254
*/
255
static void
256
unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp)
257
{
258
259
/*
260
* Lock and reset the default vnode lock; vgone() expects a locked
261
* vnode, and we're going to reset the vnode ops.
262
*/
263
lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
264
265
/*
266
* Clear out private data and reset the vnode ops to avoid use of
267
* unionfs vnode ops on a partially constructed vnode.
268
*/
269
VI_LOCK(vp);
270
vp->v_data = NULL;
271
vp->v_vnlock = &vp->v_lock;
272
vp->v_op = &dead_vnodeops;
273
VI_UNLOCK(vp);
274
vgone(vp);
275
vput(vp);
276
277
if (unp->un_dvp != NULL)
278
vrele(unp->un_dvp);
279
if (unp->un_uppervp != NULL) {
280
vput(unp->un_uppervp);
281
if (unp->un_lowervp != NULL)
282
vrele(unp->un_lowervp);
283
} else if (unp->un_lowervp != NULL)
284
vput(unp->un_lowervp);
285
if (unp->un_hashtbl != NULL)
286
hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
287
free(unp->un_path, M_UNIONFSPATH);
288
free(unp, M_UNIONFSNODE);
289
}
290
291
/*
292
* Make a new or get existing unionfs node.
293
*
294
* uppervp and lowervp should be unlocked, because if the new unionfs vnode is
295
* locked, then uppervp or lowervp is locked as well. To prevent deadlock, do
296
* not hold multiple vnode locks at the same time.
297
*/
298
int
299
unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
300
struct vnode *lowervp, struct vnode *dvp, struct vnode **vpp,
301
struct componentname *cnp)
302
{
303
char *path;
304
struct unionfs_mount *ump;
305
struct unionfs_node *unp;
306
struct vnode *vp;
307
u_long hashmask;
308
int error;
309
int lkflags;
310
__enum_uint8(vtype) vt;
311
312
error = 0;
313
ump = MOUNTTOUNIONFSMOUNT(mp);
314
lkflags = (cnp ? cnp->cn_lkflags : 0);
315
path = (cnp ? cnp->cn_nameptr : NULL);
316
*vpp = NULL;
317
318
if (uppervp == NULL && lowervp == NULL)
319
panic("%s: upper and lower are both null", __func__);
320
321
vt = (uppervp != NULL ? uppervp->v_type : lowervp->v_type);
322
323
/* If the ISLASTCN flag is not set, skip the path check. */
324
if (cnp && !(cnp->cn_flags & ISLASTCN))
325
path = NULL;
326
327
/* check the cache */
328
if (dvp != NULL && vt == VDIR) {
329
vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp);
330
if (vp != NULL) {
331
*vpp = vp;
332
if (lkflags != 0)
333
vn_lock(*vpp, lkflags | LK_RETRY);
334
return (0);
335
}
336
}
337
338
unp = malloc(sizeof(struct unionfs_node),
339
M_UNIONFSNODE, M_WAITOK | M_ZERO);
340
341
error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp);
342
if (error != 0) {
343
free(unp, M_UNIONFSNODE);
344
return (error);
345
}
346
if (dvp != NULL)
347
vref(dvp);
348
if (uppervp != NULL)
349
vref(uppervp);
350
if (lowervp != NULL)
351
vref(lowervp);
352
353
if (vt == VDIR) {
354
unp->un_hashtbl = hashinit(NUNIONFSNODECACHE, M_UNIONFSHASH,
355
&hashmask);
356
KASSERT(hashmask == UNIONFSHASHMASK,
357
("unexpected unionfs hash mask 0x%lx", hashmask));
358
}
359
360
unp->un_vnode = vp;
361
unp->un_uppervp = uppervp;
362
unp->un_lowervp = lowervp;
363
unp->un_dvp = dvp;
364
if (uppervp != NULL)
365
vp->v_vnlock = uppervp->v_vnlock;
366
else
367
vp->v_vnlock = lowervp->v_vnlock;
368
369
if (path != NULL) {
370
unp->un_path = malloc(cnp->cn_namelen + 1,
371
M_UNIONFSPATH, M_WAITOK | M_ZERO);
372
bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen);
373
unp->un_path[cnp->cn_namelen] = '\0';
374
unp->un_pathlen = cnp->cn_namelen;
375
}
376
vp->v_type = vt;
377
vp->v_data = unp;
378
379
/*
380
* TODO: This is an imperfect check, as there's no guarantee that
381
* the underlying filesystems will always return vnode pointers
382
* for the root inodes that match our cached values. To reduce
383
* the likelihood of failure, for example in the case where either
384
* vnode has been forcibly doomed, we check both pointers and set
385
* VV_ROOT if either matches.
386
*/
387
if (ump->um_uppervp == uppervp || ump->um_lowervp == lowervp)
388
vp->v_vflag |= VV_ROOT;
389
KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0,
390
("%s: NULL dvp for non-root vp %p", __func__, vp));
391
392
393
/*
394
* NOTE: There is still a possibility for cross-filesystem locking here.
395
* If dvp has an upper FS component and is locked, while the new vnode
396
* created here only has a lower-layer FS component, then we will end
397
* up taking a lower-FS lock while holding an upper-FS lock.
398
* That situation could be dealt with here using vn_lock_pair().
399
* However, that would only address one instance out of many in which
400
* a child vnode lock is taken while holding a lock on its parent
401
* directory. This is done in many places in common VFS code, as well as
402
* a few places within unionfs (which could lead to the same cross-FS
403
* locking issue if, for example, the upper FS is another nested unionfs
404
* instance). Additionally, it is unclear under what circumstances this
405
* specific lock sequence (a directory on one FS followed by a child of
406
* its 'peer' directory on another FS) would present the practical
407
* possibility of deadlock due to some other agent on the system
408
* attempting to lock those two specific vnodes in the opposite order.
409
*/
410
if (uppervp != NULL)
411
vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY);
412
else
413
vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
414
error = insmntque1(vp, mp);
415
if (error != 0) {
416
unionfs_nodeget_cleanup(vp, unp);
417
return (error);
418
}
419
/*
420
* lowervp and uppervp should only be doomed by a forced unmount of
421
* their respective filesystems, but that can only happen if the
422
* unionfs instance is first unmounted. We also effectively hold the
423
* lock on the new unionfs vnode at this point. Therefore, if a
424
* unionfs umount has not yet reached the point at which the above
425
* insmntque1() would fail, then its vflush() call will end up
426
* blocked on our vnode lock, effectively also preventing unmount
427
* of the underlying filesystems.
428
*/
429
VNASSERT(lowervp == NULL || !VN_IS_DOOMED(lowervp), vp,
430
("%s: doomed lowervp %p", __func__, lowervp));
431
VNASSERT(uppervp == NULL || !VN_IS_DOOMED(uppervp), vp,
432
("%s: doomed lowervp %p", __func__, uppervp));
433
434
vn_set_state(vp, VSTATE_CONSTRUCTED);
435
436
if (dvp != NULL && vt == VDIR)
437
*vpp = unionfs_ins_cached_vnode(unp, dvp);
438
if (*vpp != NULL) {
439
unionfs_nodeget_cleanup(vp, unp);
440
if (lkflags != 0)
441
vn_lock(*vpp, lkflags | LK_RETRY);
442
return (0);
443
} else
444
*vpp = vp;
445
446
if ((lkflags & LK_SHARED) != 0)
447
vn_lock(vp, LK_DOWNGRADE);
448
else if ((lkflags & LK_EXCLUSIVE) == 0)
449
VOP_UNLOCK(vp);
450
451
return (0);
452
}
453
454
/*
455
* Clean up the unionfs node.
456
*/
457
void
458
unionfs_noderem(struct vnode *vp)
459
{
460
struct unionfs_node *unp, *unp_t1, *unp_t2;
461
struct unionfs_node_hashhead *hd;
462
struct unionfs_node_status *unsp, *unsp_tmp;
463
struct vnode *lvp;
464
struct vnode *uvp;
465
struct vnode *dvp;
466
int count;
467
int writerefs;
468
bool unlock_lvp;
469
470
/*
471
* The root vnode lock may be recursed during unmount, because
472
* it may share the same lock as the unionfs mount's covered vnode,
473
* which is locked across VFS_UNMOUNT(). This lock will then be
474
* recursively taken during the vflush() issued by unionfs_unmount().
475
* But we still only need to lock the unionfs lock once, because only
476
* one of those lock operations was taken against a unionfs vnode and
477
* will be undone against a unionfs vnode.
478
*/
479
KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0,
480
("%s: vnode %p locked recursively", __func__, vp));
481
482
unp = VTOUNIONFS(vp);
483
VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__));
484
lvp = unp->un_lowervp;
485
uvp = unp->un_uppervp;
486
dvp = unp->un_dvp;
487
unlock_lvp = (uvp == NULL);
488
489
/*
490
* Lock the lower vnode in addition to the upper vnode lock in order
491
* to synchronize against any unionfs_lock() operation which may still
492
* hold the lower vnode lock. We do not need to do this for the root
493
* vnode, as the root vnode should always have both upper and lower
494
* base vnodes for its entire lifecycle, so unionfs_lock() should
495
* never attempt to lock its lower vnode in the first place.
496
* Moreover, during unmount of a non-"below" unionfs mount, the lower
497
* root vnode will already be locked as it is the covered vnode.
498
*/
499
if (uvp != NULL && lvp != NULL && (vp->v_vflag & VV_ROOT) == 0) {
500
vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE);
501
unlock_lvp = true;
502
}
503
504
if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
505
panic("%s: failed to acquire lock for vnode lock", __func__);
506
/*
507
* Use the interlock to protect the clearing of v_data to
508
* prevent faults in unionfs_lock().
509
*/
510
VI_LOCK(vp);
511
unp->un_lowervp = unp->un_uppervp = NULL;
512
vp->v_vnlock = &(vp->v_lock);
513
vp->v_data = NULL;
514
vp->v_object = NULL;
515
if (unp->un_hashtbl != NULL) {
516
/*
517
* Clear out any cached child vnodes. This should only
518
* be necessary during forced unmount, when the vnode may
519
* be reclaimed with a non-zero use count. Otherwise the
520
* reference held by each child should prevent reclamation.
521
*/
522
for (count = 0; count <= UNIONFSHASHMASK; count++) {
523
hd = unp->un_hashtbl + count;
524
LIST_FOREACH_SAFE(unp_t1, hd, un_hash, unp_t2) {
525
LIST_REMOVE(unp_t1, un_hash);
526
unp_t1->un_hash.le_next = NULL;
527
unp_t1->un_hash.le_prev = NULL;
528
}
529
}
530
}
531
VI_UNLOCK(vp);
532
533
writerefs = atomic_load_int(&vp->v_writecount);
534
VNASSERT(writerefs >= 0, vp,
535
("%s: write count %d, unexpected text ref", __func__, writerefs));
536
/*
537
* If we were opened for write, we leased the write reference
538
* to the upper vnode. If this is a reclamation due to the
539
* forced unmount, undo the reference now.
540
*/
541
if (writerefs > 0) {
542
VNASSERT(uvp != NULL, vp,
543
("%s: write reference without upper vnode", __func__));
544
VOP_ADD_WRITECOUNT(uvp, -writerefs);
545
}
546
if (uvp != NULL)
547
vput(uvp);
548
if (unlock_lvp)
549
vput(lvp);
550
else if (lvp != NULL)
551
vrele(lvp);
552
553
if (dvp != NULL)
554
unionfs_rem_cached_vnode(unp, dvp);
555
556
if (unp->un_path != NULL) {
557
free(unp->un_path, M_UNIONFSPATH);
558
unp->un_path = NULL;
559
unp->un_pathlen = 0;
560
}
561
562
if (unp->un_hashtbl != NULL) {
563
hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
564
}
565
566
LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) {
567
LIST_REMOVE(unsp, uns_list);
568
free(unsp, M_TEMP);
569
}
570
if (dvp != NULL) {
571
mtx_lock(&unionfs_deferred_rele_lock);
572
STAILQ_INSERT_TAIL(&unionfs_deferred_rele_list, unp, un_rele);
573
mtx_unlock(&unionfs_deferred_rele_lock);
574
taskqueue_enqueue(taskqueue_unionfs_rele,
575
&unionfs_deferred_rele_task);
576
} else
577
free(unp, M_UNIONFSNODE);
578
}
579
580
/*
581
* Find the unionfs node status object for the vnode corresponding to unp,
582
* for the process that owns td. Return NULL if no such object exists.
583
*/
584
struct unionfs_node_status *
585
unionfs_find_node_status(struct unionfs_node *unp, struct thread *td)
586
{
587
struct unionfs_node_status *unsp;
588
pid_t pid;
589
590
MPASS(td != NULL);
591
pid = td->td_proc->p_pid;
592
593
ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);
594
595
LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) {
596
if (unsp->uns_pid == pid) {
597
return (unsp);
598
}
599
}
600
601
return (NULL);
602
}
603
604
/*
605
* Get the unionfs node status object for the vnode corresponding to unp,
606
* for the process that owns td. Allocate a new status object if one
607
* does not already exist.
608
*/
609
void
610
unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
611
struct unionfs_node_status **unspp)
612
{
613
struct unionfs_node_status *unsp;
614
pid_t pid;
615
616
MPASS(td != NULL);
617
pid = td->td_proc->p_pid;
618
619
KASSERT(NULL != unspp, ("%s: NULL status", __func__));
620
unsp = unionfs_find_node_status(unp, td);
621
if (unsp == NULL) {
622
/* create a new unionfs node status */
623
unsp = malloc(sizeof(struct unionfs_node_status),
624
M_TEMP, M_WAITOK | M_ZERO);
625
626
unsp->uns_pid = pid;
627
LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list);
628
}
629
630
*unspp = unsp;
631
}
632
633
/*
634
* Remove the unionfs node status, if possible.
635
* The vnode must be exclusively locked.
636
*/
637
void
638
unionfs_tryrem_node_status(struct unionfs_node *unp,
639
struct unionfs_node_status *unsp)
640
{
641
KASSERT(NULL != unsp, ("%s: NULL status", __func__));
642
ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);
643
644
if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt)
645
return;
646
647
LIST_REMOVE(unsp, uns_list);
648
free(unsp, M_TEMP);
649
}
650
651
/*
652
* Create upper node attr.
653
*/
654
void
655
unionfs_create_uppervattr_core(struct unionfs_mount *ump, struct vattr *lva,
656
struct vattr *uva, struct thread *td)
657
{
658
VATTR_NULL(uva);
659
uva->va_type = lva->va_type;
660
uva->va_atime = lva->va_atime;
661
uva->va_mtime = lva->va_mtime;
662
uva->va_ctime = lva->va_ctime;
663
664
switch (ump->um_copymode) {
665
case UNIONFS_TRANSPARENT:
666
uva->va_mode = lva->va_mode;
667
uva->va_uid = lva->va_uid;
668
uva->va_gid = lva->va_gid;
669
break;
670
case UNIONFS_MASQUERADE:
671
if (ump->um_uid == lva->va_uid) {
672
uva->va_mode = lva->va_mode & 077077;
673
uva->va_mode |= (lva->va_type == VDIR ?
674
ump->um_udir : ump->um_ufile) & 0700;
675
uva->va_uid = lva->va_uid;
676
uva->va_gid = lva->va_gid;
677
} else {
678
uva->va_mode = (lva->va_type == VDIR ?
679
ump->um_udir : ump->um_ufile);
680
uva->va_uid = ump->um_uid;
681
uva->va_gid = ump->um_gid;
682
}
683
break;
684
default: /* UNIONFS_TRADITIONAL */
685
uva->va_mode = 0777 & ~td->td_proc->p_pd->pd_cmask;
686
uva->va_uid = ump->um_uid;
687
uva->va_gid = ump->um_gid;
688
break;
689
}
690
}
691
692
/*
693
* Create upper node attr.
694
*/
695
int
696
unionfs_create_uppervattr(struct unionfs_mount *ump, struct vnode *lvp,
697
struct vattr *uva, struct ucred *cred, struct thread *td)
698
{
699
struct vattr lva;
700
int error;
701
702
if ((error = VOP_GETATTR(lvp, &lva, cred)))
703
return (error);
704
705
unionfs_create_uppervattr_core(ump, &lva, uva, td);
706
707
return (error);
708
}
709
710
/*
711
* relookup
712
*
713
* dvp should be locked on entry and will be locked on return.
714
*
715
* If an error is returned, *vpp will be invalid, otherwise it will hold a
716
* locked, referenced vnode. If *vpp == dvp then remember that only one
717
* LK_EXCLUSIVE lock is held.
718
*/
719
int
720
unionfs_relookup(struct vnode *dvp, struct vnode **vpp,
721
struct componentname *cnp, struct componentname *cn, struct thread *td,
722
char *path, int pathlen, u_long nameiop)
723
{
724
int error;
725
bool refstart;
726
727
cn->cn_namelen = pathlen;
728
cn->cn_pnbuf = path;
729
cn->cn_nameiop = nameiop;
730
cn->cn_flags = (LOCKPARENT | LOCKLEAF | ISLASTCN);
731
cn->cn_lkflags = LK_EXCLUSIVE;
732
cn->cn_cred = cnp->cn_cred;
733
cn->cn_nameptr = cn->cn_pnbuf;
734
735
refstart = false;
736
if (nameiop == DELETE) {
737
cn->cn_flags |= (cnp->cn_flags & DOWHITEOUT);
738
} else if (nameiop == RENAME) {
739
refstart = true;
740
} else if (nameiop == CREATE) {
741
cn->cn_flags |= NOCACHE;
742
}
743
744
vref(dvp);
745
VOP_UNLOCK(dvp);
746
747
if ((error = vfs_relookup(dvp, vpp, cn, refstart))) {
748
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
749
} else
750
vrele(dvp);
751
752
KASSERT(cn->cn_pnbuf == path, ("%s: cn_pnbuf changed", __func__));
753
754
return (error);
755
}
756
757
/*
758
* Update the unionfs_node.
759
*
760
* uvp is the new, locked upper vnode. The unionfs vnode's lock will be switched
761
* to uvp's lock, and the lower vnode's lock will be unlocked.
762
*/
763
static void
764
unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp,
765
struct thread *td)
766
{
767
struct unionfs_node_hashhead *hd;
768
struct vnode *vp;
769
struct vnode *lvp;
770
struct vnode *dvp;
771
unsigned count, lockrec;
772
773
vp = UNIONFSTOV(unp);
774
lvp = unp->un_lowervp;
775
ASSERT_VOP_ELOCKED(lvp, __func__);
776
ASSERT_VOP_ELOCKED(uvp, __func__);
777
dvp = unp->un_dvp;
778
779
VNASSERT(vp->v_writecount == 0, vp,
780
("%s: non-zero writecount", __func__));
781
/*
782
* Update the upper vnode's lock state to match the lower vnode,
783
* and then switch the unionfs vnode's lock to the upper vnode.
784
*/
785
lockrec = lvp->v_vnlock->lk_recurse;
786
for (count = 0; count < lockrec; count++)
787
vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
788
VI_LOCK(vp);
789
unp->un_uppervp = uvp;
790
vp->v_vnlock = uvp->v_vnlock;
791
VI_UNLOCK(vp);
792
793
for (count = 0; count < lockrec + 1; count++)
794
VOP_UNLOCK(lvp);
795
/*
796
* Re-cache the unionfs vnode against the upper vnode
797
*/
798
if (dvp != NULL && vp->v_type == VDIR) {
799
VI_LOCK(dvp);
800
if (unp->un_hash.le_prev != NULL) {
801
LIST_REMOVE(unp, un_hash);
802
hd = unionfs_get_hashhead(dvp, uvp);
803
LIST_INSERT_HEAD(hd, unp, un_hash);
804
}
805
VI_UNLOCK(unp->un_dvp);
806
}
807
}
808
809
/*
810
* Mark a unionfs operation as being in progress, sleeping if the
811
* same operation is already in progress.
812
* This is useful, for example, during copy-up operations in which
813
* we may drop the target vnode lock, but we want to avoid the
814
* possibility of a concurrent copy-up on the same vnode triggering
815
* a spurious failure.
816
*/
817
int
818
unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag)
819
{
820
struct unionfs_node *unp;
821
int error;
822
823
error = 0;
824
ASSERT_VOP_ELOCKED(vp, __func__);
825
VI_LOCK(vp);
826
unp = VTOUNIONFS(vp);
827
while (error == 0 && (unp->un_flag & flag) != 0) {
828
VOP_UNLOCK(vp);
829
error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0);
830
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
831
VI_LOCK(vp);
832
if (error == 0) {
833
/*
834
* If we waited on a concurrent copy-up and that
835
* copy-up was successful, return a non-fatal
836
* indication that the desired operation is already
837
* complete. If we waited on a concurrent lookup,
838
* return ERELOOKUP to indicate the VFS cache should
839
* be re-queried to avoid creating a duplicate unionfs
840
* vnode.
841
*/
842
unp = VTOUNIONFS(vp);
843
if (unp == NULL)
844
error = ENOENT;
845
else if (flag == UNIONFS_COPY_IN_PROGRESS &&
846
unp->un_uppervp != NULL)
847
error = EJUSTRETURN;
848
else if (flag == UNIONFS_LOOKUP_IN_PROGRESS)
849
error = ERELOOKUP;
850
}
851
}
852
if (error == 0)
853
unp->un_flag |= flag;
854
VI_UNLOCK(vp);
855
856
return (error);
857
}
858
859
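/*
 * Clear a previously set in-progress flag and wake up any threads waiting
 * on it in unionfs_set_in_progress_flag().
 */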
void
860
unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag)
861
{
862
struct unionfs_node *unp;
863
864
ASSERT_VOP_ELOCKED(vp, __func__);
865
unp = VTOUNIONFS(vp);
866
VI_LOCK(vp);
867
if (unp != NULL) {
868
VNASSERT((unp->un_flag & flag) != 0, vp,
869
("%s: copy not in progress", __func__));
870
unp->un_flag &= ~flag;
871
}
872
wakeup(vp);
873
VI_UNLOCK(vp);
874
}
875
876
/*
877
* Create a new shadow dir.
878
*
879
* dvp and vp are unionfs vnodes representing a parent directory and
880
* child file, should be locked on entry, and will be locked on return.
881
*
882
* If no error returned, unp will be updated.
883
*/
884
int
885
unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp,
886
struct componentname *cnp, struct thread *td)
887
{
888
struct vnode *lvp;
889
struct vnode *uvp;
890
struct vnode *udvp;
891
struct vattr va;
892
struct vattr lva;
893
struct nameidata nd;
894
struct mount *mp;
895
struct ucred *cred;
896
struct ucred *credbk;
897
struct uidinfo *rootinfo;
898
struct unionfs_mount *ump;
899
struct unionfs_node *dunp;
900
struct unionfs_node *unp;
901
int error;
902
903
ASSERT_VOP_ELOCKED(dvp, __func__);
904
ASSERT_VOP_ELOCKED(vp, __func__);
905
ump = MOUNTTOUNIONFSMOUNT(vp->v_mount);
906
unp = VTOUNIONFS(vp);
907
if (unp->un_uppervp != NULL)
908
return (EEXIST);
909
dunp = VTOUNIONFS(dvp);
910
udvp = dunp->un_uppervp;
911
912
error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
913
if (error == EJUSTRETURN)
914
return (0);
915
else if (error != 0)
916
return (error);
917
918
lvp = unp->un_lowervp;
919
uvp = NULL;
920
credbk = cnp->cn_cred;
921
922
/* Authority change to root */
923
rootinfo = uifind((uid_t)0);
924
cred = crdup(cnp->cn_cred);
925
change_euid(cred, rootinfo);
926
change_ruid(cred, rootinfo);
927
change_svuid(cred, (uid_t)0);
928
uifree(rootinfo);
929
cnp->cn_cred = cred;
930
931
memset(&nd.ni_cnd, 0, sizeof(struct componentname));
932
NDPREINIT(&nd);
933
934
if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred)))
935
goto unionfs_mkshadowdir_finish;
936
937
vref(udvp);
938
VOP_UNLOCK(vp);
939
if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td,
940
cnp->cn_nameptr, cnp->cn_namelen, CREATE))) {
941
/*
942
* When handling error cases here, we drop udvp's lock and
943
* then jump to exit code that relocks dvp, which in most
944
* cases will effectively relock udvp. However, this is
945
* not guaranteed to be the case, as various calls made
946
* here (such as unionfs_relookup() above and VOP_MKDIR()
947
* below) may unlock and then relock udvp, allowing dvp to
948
* be reclaimed in the meantime. In such a situation dvp
949
* will no longer share its lock with udvp. Since
950
* performance isn't a concern for these error cases, it
951
* makes more sense to reuse the common code that locks
952
* dvp on exit than to explicitly check for reclamation
953
* of dvp.
954
*/
955
vput(udvp);
956
goto unionfs_mkshadowdir_relock;
957
}
958
if (uvp != NULL) {
959
if (udvp == uvp)
960
vrele(uvp);
961
else
962
vput(uvp);
963
964
error = EEXIST;
965
vput(udvp);
966
goto unionfs_mkshadowdir_relock;
967
}
968
969
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) {
970
vput(udvp);
971
goto unionfs_mkshadowdir_relock;
972
}
973
unionfs_create_uppervattr_core(ump, &lva, &va, td);
974
975
/*
976
* Temporarily NUL-terminate the current pathname component.
977
* This function may be called during lookup operations in which
978
* the current pathname component is not the leaf, meaning that
979
* the NUL terminator is some distance beyond the end of the current
980
* component. This *should* be fine, as cn_namelen will still
981
* correctly indicate the length of only the current component,
982
* but ZFS in particular does not respect cn_namelen in its VOP_MKDIR
983
* implementation.
984
* Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by
985
* something like a local namei() operation and the temporary
986
* NUL-termination will not have an effect on other threads.
987
*/
988
char *pathend = &nd.ni_cnd.cn_nameptr[nd.ni_cnd.cn_namelen];
989
char pathterm = *pathend;
990
*pathend = '\0';
991
error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va);
992
*pathend = pathterm;
993
if (error != 0) {
994
/*
995
* See the comment after unionfs_relookup() above for an
996
* explanation of why we unlock udvp here only to relock
997
* dvp on exit.
998
*/
999
vput(udvp);
1000
vn_finished_write(mp);
1001
goto unionfs_mkshadowdir_relock;
1002
}
1003
1004
/*
1005
* XXX The bug that prevented setting uid/gid has been corrected.
1006
* Ignore errors.
1007
*/
1008
va.va_type = VNON;
1009
/*
1010
* VOP_SETATTR() may transiently drop uvp's lock, so it's
1011
* important to call it before unionfs_node_update() transfers
1012
* the unionfs vnode's lock from lvp to uvp; otherwise the
1013
* unionfs vnode itself would be transiently unlocked and
1014
* potentially doomed.
1015
*/
1016
VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
1017
1018
/*
1019
* uvp may become doomed during VOP_VPUT_PAIR() if the implementation
1020
* must temporarily drop uvp's lock. However, since we hold a
1021
* reference to uvp from the VOP_MKDIR() call above, this would require
1022
* a forcible unmount of uvp's filesystem, which in turn can only
1023
* happen if our unionfs instance is first forcibly unmounted. We'll
1024
* therefore catch this case in the NULL check of unp below.
1025
*/
1026
VOP_VPUT_PAIR(udvp, &uvp, false);
1027
vn_finished_write(mp);
1028
vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
1029
unp = VTOUNIONFS(vp);
1030
if (unp == NULL) {
1031
vput(uvp);
1032
error = ENOENT;
1033
} else
1034
unionfs_node_update(unp, uvp, td);
1035
VOP_UNLOCK(vp);
1036
1037
unionfs_mkshadowdir_relock:
1038
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
1039
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1040
if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)))
1041
error = ENOENT;
1042
1043
unionfs_mkshadowdir_finish:
1044
unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
1045
cnp->cn_cred = credbk;
1046
crfree(cred);
1047
1048
return (error);
1049
}
1050
1051
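/*
 * Record the current lock state of a base-layer vnode and take an extra
 * reference on it prior to forwarding a VOP that may drop its lock.
 */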
static inline void
1052
unionfs_forward_vop_ref(struct vnode *basevp, int *lkflags)
1053
{
1054
ASSERT_VOP_LOCKED(basevp, __func__);
1055
*lkflags = VOP_ISLOCKED(basevp);
1056
vref(basevp);
1057
}
1058
1059
/*
1060
* Prepare unionfs to issue a forwarded VOP to either the upper or lower
1061
* FS. This should be used for any VOP which may drop the vnode lock;
1062
* it is not required otherwise.
1063
* The unionfs vnode shares its lock with the base-layer vnode(s); if the
1064
* base FS must transiently drop its vnode lock, the unionfs vnode may
1065
* effectively become unlocked. During that window, a concurrent forced
1066
* unmount may doom the unionfs vnode, which leads to two significant
1067
* issues:
1068
* 1) Completion of, and return from, the unionfs VOP with the unionfs
1069
* vnode completely unlocked. When the unionfs vnode becomes doomed
1070
* it stops sharing its lock with the base vnode, so even if the
1071
* forwarded VOP reacquires the base vnode lock the unionfs vnode
1072
* lock will no longer be held. This can lead to violation of the
1073
* caller's synchronization requirements as well as various failed
1074
* locking assertions when INVARIANTS is enabled.
1075
* 2) Loss of reference on the base vnode. The caller is expected to
1076
* hold a v_usecount reference on the unionfs vnode, while the
1077
* unionfs vnode holds a reference on the base-layer vnode(s). But
1078
* these references are released when the unionfs vnode becomes
1079
* doomed, violating the base layer's expectation that its caller
1080
* must hold a reference to prevent vnode recycling.
1081
*
1082
* basevp1 and basevp2 represent two base-layer vnodes which are
1083
* expected to be locked when this function is called. basevp2
1084
* may be NULL, but if not NULL basevp1 and basevp2 should represent
1085
* a parent directory and a file linked to it, respectively.
1086
* lkflags1 and lkflags2 are output parameters that will store the
1087
* current lock status of basevp1 and basevp2, respectively. They
1088
* are intended to be passed as the lkflags1 and lkflags2 parameters
1089
* in the subsequent call to unionfs_forward_vop_finish_pair().
1090
* lkflags2 may be NULL iff basevp2 is NULL.
1091
*/
1092
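/*
 * Illustrative call pattern (sketch only; VOP_EXAMPLE and its arguments are
 * placeholders, not taken from an actual caller):
 *
 *	int lkflags;
 *
 *	unionfs_forward_vop_start_pair(basevp, &lkflags, NULL, NULL);
 *	error = VOP_EXAMPLE(basevp, ...);
 *	if (unionfs_forward_vop_finish_pair(unionvp, basevp, lkflags,
 *	    NULL, NULL, 0))
 *		error = (error == 0 ? ENOENT : error);
 */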
void
1093
unionfs_forward_vop_start_pair(struct vnode *basevp1, int *lkflags1,
1094
struct vnode *basevp2, int *lkflags2)
1095
{
1096
/*
1097
* Take an additional reference on the base-layer vnodes to
1098
* avoid loss of reference if the unionfs vnodes are doomed.
1099
*/
1100
unionfs_forward_vop_ref(basevp1, lkflags1);
1101
if (basevp2 != NULL)
1102
unionfs_forward_vop_ref(basevp2, lkflags2);
1103
}
1104
1105
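/*
 * Drop the extra base-layer reference taken at VOP start if the unionfs
 * vnode is still alive; otherwise keep it (it will be released along with
 * the base vnode lock in unionfs_forward_vop_finish_pair()) and report
 * that the unionfs vnode has been doomed.
 */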
static inline bool
1106
unionfs_forward_vop_rele(struct vnode *unionvp, struct vnode *basevp,
1107
int lkflags)
1108
{
1109
bool unionvp_doomed;
1110
1111
if (__predict_false(VTOUNIONFS(unionvp) == NULL)) {
1112
if ((lkflags & LK_EXCLUSIVE) != 0)
1113
ASSERT_VOP_ELOCKED(basevp, __func__);
1114
else
1115
ASSERT_VOP_LOCKED(basevp, __func__);
1116
unionvp_doomed = true;
1117
} else {
1118
vrele(basevp);
1119
unionvp_doomed = false;
1120
}
1121
1122
return (unionvp_doomed);
1123
}
1124
1125
1126
/*
1127
* Indicate completion of a forwarded VOP previously prepared by
1128
* unionfs_forward_vop_start_pair().
1129
* basevp1 and basevp2 must be the same values passed to the prior
1130
* call to unionfs_forward_vop_start_pair(). unionvp1 and unionvp2
1131
* must be the unionfs vnodes that were initially above basevp1 and
1132
* basevp2, respectively.
1133
* basevp1 and basevp2 (if not NULL) must be locked when this function
1134
* is called, while unionvp1 and/or unionvp2 may be unlocked if either
1135
* unionfs vnode has become doomed.
1136
* lkflags1 and lkflags2 represent the locking flags that should be
1137
* used to re-lock unionvp1 and unionvp2, respectively, if either
1138
* vnode has become doomed.
1139
*
1140
* Returns true if any unionfs vnode was found to be doomed, false
1141
* otherwise.
1142
*/
1143
bool
1144
unionfs_forward_vop_finish_pair(
1145
struct vnode *unionvp1, struct vnode *basevp1, int lkflags1,
1146
struct vnode *unionvp2, struct vnode *basevp2, int lkflags2)
1147
{
1148
bool vp1_doomed, vp2_doomed;
1149
1150
/*
1151
* If either vnode is found to have been doomed, set
1152
* a flag indicating that it needs to be re-locked.
1153
* Otherwise, simply drop the base-vnode reference that
1154
* was taken in unionfs_forward_vop_start().
1155
*/
1156
vp1_doomed = unionfs_forward_vop_rele(unionvp1, basevp1, lkflags1);
1157
1158
if (unionvp2 != NULL)
1159
vp2_doomed = unionfs_forward_vop_rele(unionvp2, basevp2, lkflags2);
1160
else
1161
vp2_doomed = false;
1162
1163
/*
1164
* If any of the unionfs vnodes need to be re-locked, that
1165
* means the unionfs vnode's lock is now de-coupled from the
1166
* corresponding base vnode. We therefore need to drop the
1167
* base vnode lock (since nothing else will after this point),
1168
* and also release the reference taken in
1169
* unionfs_forward_vop_start_pair().
1170
*/
1171
if (__predict_false(vp1_doomed && vp2_doomed))
1172
VOP_VPUT_PAIR(basevp1, &basevp2, true);
1173
else if (__predict_false(vp1_doomed)) {
1174
/*
1175
* If basevp1 needs to be unlocked, then we may not
1176
* be able to safely unlock it with basevp2 still locked,
1177
* for the same reason that an ordinary VFS call would
1178
* need to use VOP_VPUT_PAIR() here. We might be able
1179
* to use VOP_VPUT_PAIR(..., false) here, but then we
1180
* would need to deal with the possibility of basevp2
1181
* changing out from under us, which could result in
1182
* either the unionfs vnode becoming doomed or its
1183
* upper/lower vp no longer matching basevp2. Either
1184
* scenario would require at least re-locking the unionfs
1185
* vnode anyway.
1186
*/
1187
if (unionvp2 != NULL) {
1188
VOP_UNLOCK(unionvp2);
1189
vp2_doomed = true;
1190
}
1191
vput(basevp1);
1192
} else if (__predict_false(vp2_doomed))
1193
vput(basevp2);
1194
1195
if (__predict_false(vp1_doomed || vp2_doomed))
1196
vn_lock_pair(unionvp1, !vp1_doomed, lkflags1,
1197
unionvp2, !vp2_doomed, lkflags2);
1198
1199
return (vp1_doomed || vp2_doomed);
1200
}
1201
1202
/*
1203
* Create a new whiteout.
1204
*
1205
* dvp and vp are unionfs vnodes representing a parent directory and
1206
* child file, should be locked on entry, and will be locked on return.
1207
*/
1208
int
1209
unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp,
1210
struct componentname *cnp, struct thread *td, char *path, int pathlen)
1211
{
1212
struct vnode *udvp;
1213
struct vnode *wvp;
1214
struct nameidata nd;
1215
struct mount *mp;
1216
int error;
1217
bool dvp_locked;
1218
1219
ASSERT_VOP_ELOCKED(dvp, __func__);
1220
ASSERT_VOP_ELOCKED(vp, __func__);
1221
1222
udvp = VTOUNIONFS(dvp)->un_uppervp;
1223
wvp = NULL;
1224
NDPREINIT(&nd);
1225
vref(udvp);
1226
VOP_UNLOCK(vp);
1227
if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
1228
pathlen, CREATE))) {
1229
goto unionfs_mkwhiteout_cleanup;
1230
}
1231
if (wvp != NULL) {
1232
if (udvp == wvp)
1233
vrele(wvp);
1234
else
1235
vput(wvp);
1236
1237
if (nd.ni_cnd.cn_flags & ISWHITEOUT)
1238
error = 0;
1239
else
1240
error = EEXIST;
1241
goto unionfs_mkwhiteout_cleanup;
1242
}
1243
1244
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
1245
goto unionfs_mkwhiteout_cleanup;
1246
error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
1247
vn_finished_write(mp);
1248
1249
unionfs_mkwhiteout_cleanup:
1250
if (VTOUNIONFS(dvp) == NULL) {
1251
vput(udvp);
1252
dvp_locked = false;
1253
} else {
1254
vrele(udvp);
1255
dvp_locked = true;
1256
}
1257
vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
1258
return (error);
1259
}
1260
1261
/*
1262
* Create a new regular file on upper.
1263
*
1264
* If an error is returned, *vpp will be invalid, otherwise it will hold a
1265
* locked, referenced and opened vnode.
1266
*
1267
* unp is never updated.
1268
*/
1269
static int
1270
unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
1271
struct vnode *vp, struct vattr *uvap, struct thread *td)
1272
{
1273
struct unionfs_mount *ump;
1274
struct unionfs_node *unp;
1275
struct vnode *uvp;
1276
struct vnode *lvp;
1277
struct ucred *cred;
1278
struct vattr lva;
1279
struct nameidata nd;
1280
int fmode;
1281
int error;
1282
1283
ASSERT_VOP_ELOCKED(vp, __func__);
1284
unp = VTOUNIONFS(vp);
1285
ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
1286
uvp = NULL;
1287
lvp = unp->un_lowervp;
1288
cred = td->td_ucred;
1289
fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
1290
error = 0;
1291
1292
if ((error = VOP_GETATTR(lvp, &lva, cred)) != 0)
1293
return (error);
1294
unionfs_create_uppervattr_core(ump, &lva, uvap, td);
1295
1296
if (unp->un_path == NULL)
1297
panic("%s: NULL un_path", __func__);
1298
1299
nd.ni_cnd.cn_namelen = unp->un_pathlen;
1300
nd.ni_cnd.cn_pnbuf = unp->un_path;
1301
nd.ni_cnd.cn_nameiop = CREATE;
1302
nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | ISLASTCN;
1303
nd.ni_cnd.cn_lkflags = LK_EXCLUSIVE;
1304
nd.ni_cnd.cn_cred = cred;
1305
nd.ni_cnd.cn_nameptr = nd.ni_cnd.cn_pnbuf;
1306
NDPREINIT(&nd);
1307
1308
vref(udvp);
1309
VOP_UNLOCK(vp);
1310
if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) {
1311
vrele(udvp);
1312
return (error);
1313
}
1314
1315
if (uvp != NULL) {
1316
if (uvp == udvp)
1317
vrele(uvp);
1318
else
1319
vput(uvp);
1320
error = EEXIST;
1321
goto unionfs_vn_create_on_upper_cleanup;
1322
}
1323
1324
if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0)
1325
goto unionfs_vn_create_on_upper_cleanup;
1326
1327
if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) {
1328
vput(uvp);
1329
goto unionfs_vn_create_on_upper_cleanup;
1330
}
1331
error = VOP_ADD_WRITECOUNT(uvp, 1);
1332
CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
1333
__func__, uvp, uvp->v_writecount);
1334
if (error == 0) {
1335
*vpp = uvp;
1336
} else {
1337
VOP_CLOSE(uvp, fmode, cred, td);
1338
}
1339
1340
unionfs_vn_create_on_upper_cleanup:
1341
vput(udvp);
1342
return (error);
1343
}
1344
1345
/*
1346
* Copy contents of lvp to uvp.
1347
*
1348
* lvp and uvp should be locked and opened on entry and will be locked and
1349
* opened on return.
1350
*/
1351
static int
1352
unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp,
1353
struct ucred *cred, struct thread *td)
1354
{
1355
char *buf;
1356
struct uio uio;
1357
struct iovec iov;
1358
off_t offset;
1359
int count;
1360
int error;
1361
int bufoffset;
1362
1363
error = 0;
1364
memset(&uio, 0, sizeof(uio));
1365
1366
uio.uio_td = td;
1367
uio.uio_segflg = UIO_SYSSPACE;
1368
uio.uio_offset = 0;
1369
1370
buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
1371
1372
while (error == 0) {
1373
offset = uio.uio_offset;
1374
1375
uio.uio_iov = &iov;
1376
uio.uio_iovcnt = 1;
1377
iov.iov_base = buf;
1378
iov.iov_len = MAXBSIZE;
1379
uio.uio_resid = iov.iov_len;
1380
uio.uio_rw = UIO_READ;
1381
1382
if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0)
1383
break;
1384
if ((count = MAXBSIZE - uio.uio_resid) == 0)
1385
break;
1386
1387
bufoffset = 0;
1388
while (bufoffset < count) {
1389
uio.uio_iov = &iov;
1390
uio.uio_iovcnt = 1;
1391
iov.iov_base = buf + bufoffset;
1392
iov.iov_len = count - bufoffset;
1393
uio.uio_offset = offset + bufoffset;
1394
uio.uio_resid = iov.iov_len;
1395
uio.uio_rw = UIO_WRITE;
1396
1397
if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0)
1398
break;
1399
1400
bufoffset += (count - bufoffset) - uio.uio_resid;
1401
}
1402
1403
uio.uio_offset = offset + bufoffset;
1404
}
1405
1406
free(buf, M_TEMP);
1407
1408
return (error);
1409
}
1410
1411
/*
1412
* Copy file from lower to upper.
1413
*
1414
* If docopy is non-zero, copy the contents as well.
1415
*
1416
* vp is a unionfs vnode that should be locked on entry and will be
1417
* locked on return.
1418
*
1419
* If no error returned, unp will be updated.
1420
*/
1421
int
1422
unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred,
1423
struct thread *td)
1424
{
1425
struct unionfs_node *unp;
1426
struct unionfs_node *dunp;
1427
struct mount *mp;
1428
struct vnode *udvp;
1429
struct vnode *lvp;
1430
struct vnode *uvp;
1431
struct vattr uva;
1432
int error;
1433
1434
ASSERT_VOP_ELOCKED(vp, __func__);
1435
unp = VTOUNIONFS(vp);
1436
lvp = unp->un_lowervp;
1437
uvp = NULL;
1438
1439
if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY))
1440
return (EROFS);
1441
if (unp->un_dvp == NULL)
1442
return (EINVAL);
1443
if (unp->un_uppervp != NULL)
1444
return (EEXIST);
1445
1446
udvp = NULL;
1447
VI_LOCK(unp->un_dvp);
1448
dunp = VTOUNIONFS(unp->un_dvp);
1449
if (dunp != NULL)
1450
udvp = dunp->un_uppervp;
1451
VI_UNLOCK(unp->un_dvp);
1452
1453
if (udvp == NULL)
1454
return (EROFS);
1455
if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
1456
return (EROFS);
1457
ASSERT_VOP_UNLOCKED(udvp, __func__);
1458
1459
error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
1460
if (error == EJUSTRETURN)
1461
return (0);
1462
else if (error != 0)
1463
return (error);
1464
1465
error = VOP_ACCESS(lvp, VREAD, cred, td);
1466
if (error != 0)
1467
goto unionfs_copyfile_cleanup;
1468
1469
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
1470
goto unionfs_copyfile_cleanup;
1471
error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td);
1472
if (error != 0) {
1473
vn_finished_write(mp);
1474
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1475
goto unionfs_copyfile_cleanup;
1476
}
1477
1478
/*
1479
* Note that it's still possible for e.g. VOP_WRITE to relock
1480
* uvp below while holding vp[=lvp] locked. Replacing
1481
* unionfs_copyfile_core with vn_generic_copy_file_range() will
1482
* allow us to avoid the problem by moving this vn_lock_pair()
1483
* call much later.
1484
*/
1485
vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
1486
unp = VTOUNIONFS(vp);
1487
if (unp == NULL) {
1488
error = ENOENT;
1489
goto unionfs_copyfile_cleanup;
1490
}
1491
1492
if (docopy != 0) {
1493
error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
1494
if (error == 0) {
1495
error = unionfs_copyfile_core(lvp, uvp, cred, td);
1496
VOP_CLOSE(lvp, FREAD, cred, td);
1497
}
1498
}
1499
VOP_CLOSE(uvp, FWRITE, cred, td);
1500
VOP_ADD_WRITECOUNT_CHECKED(uvp, -1);
1501
CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
1502
__func__, uvp, uvp->v_writecount);
1503
1504
vn_finished_write(mp);
1505
1506
if (error == 0) {
1507
/* Reset the attributes. Ignore errors. */
1508
uva.va_type = VNON;
1509
VOP_SETATTR(uvp, &uva, cred);
1510
unionfs_node_update(unp, uvp, td);
1511
}
1512
1513
unionfs_copyfile_cleanup:
1514
unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
1515
return (error);
1516
}
1517
1518
/*
1519
* Create a new symbolic link on upper.
1520
*
1521
* If an error is returned, *vpp will be invalid, otherwise it will hold a
1522
* locked, referenced vnode.
1523
*
1524
* unp is never updated.
1525
*/
1526
static int
1527
unionfs_vn_symlink_on_upper(struct vnode **vpp, struct vnode *udvp,
1528
struct vnode *vp, struct vattr *uvap, const char *target,
1529
struct thread *td)
1530
{
1531
struct unionfs_mount *ump;
1532
struct unionfs_node *unp;
1533
struct vnode *uvp;
1534
struct vnode *lvp;
1535
struct ucred *cred;
1536
struct vattr lva;
1537
struct nameidata nd;
1538
int error;
1539
1540
ASSERT_VOP_ELOCKED(vp, __func__);
1541
unp = VTOUNIONFS(vp);
1542
ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
1543
uvp = NULL;
1544
lvp = unp->un_lowervp;
1545
cred = td->td_ucred;
1546
error = 0;
1547
1548
if ((error = VOP_GETATTR(lvp, &lva, cred)) != 0)
1549
return (error);
1550
unionfs_create_uppervattr_core(ump, &lva, uvap, td);
1551
1552
if (unp->un_path == NULL)
1553
panic("%s: NULL un_path", __func__);
1554
1555
nd.ni_cnd.cn_namelen = unp->un_pathlen;
1556
nd.ni_cnd.cn_pnbuf = unp->un_path;
1557
nd.ni_cnd.cn_nameiop = CREATE;
1558
nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | ISLASTCN;
1559
nd.ni_cnd.cn_lkflags = LK_EXCLUSIVE;
1560
nd.ni_cnd.cn_cred = cred;
1561
nd.ni_cnd.cn_nameptr = nd.ni_cnd.cn_pnbuf;
1562
NDPREINIT(&nd);
1563
1564
vref(udvp);
1565
VOP_UNLOCK(vp);
1566
if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) {
1567
vrele(udvp);
1568
return (error);
1569
}
1570
1571
if (uvp != NULL) {
1572
if (uvp == udvp)
1573
vrele(uvp);
1574
else
1575
vput(uvp);
1576
error = EEXIST;
1577
goto unionfs_vn_symlink_on_upper_cleanup;
1578
}
1579
1580
error = VOP_SYMLINK(udvp, &uvp, &nd.ni_cnd, uvap, target);
1581
if (error == 0)
1582
*vpp = uvp;
1583
1584
unionfs_vn_symlink_on_upper_cleanup:
1585
vput(udvp);
1586
return (error);
1587
}
1588
1589
/*
1590
* Copy symbolic link from lower to upper.
1591
*
1592
* vp is a unionfs vnode that should be locked on entry and will be
1593
* locked on return.
1594
*
1595
* If no error returned, unp will be updated.
1596
*/
1597
int
1598
unionfs_copylink(struct vnode *vp, struct ucred *cred,
1599
struct thread *td)
1600
{
1601
struct unionfs_node *unp;
1602
struct unionfs_node *dunp;
1603
struct mount *mp;
1604
struct vnode *udvp;
1605
struct vnode *lvp;
1606
struct vnode *uvp;
1607
struct vattr uva;
1608
char *buf = NULL;
1609
struct uio uio;
1610
struct iovec iov;
1611
int error;
1612
1613
ASSERT_VOP_ELOCKED(vp, __func__);
1614
unp = VTOUNIONFS(vp);
1615
lvp = unp->un_lowervp;
1616
uvp = NULL;
1617
1618
if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY))
1619
return (EROFS);
1620
if (unp->un_dvp == NULL)
1621
return (EINVAL);
1622
if (unp->un_uppervp != NULL)
1623
return (EEXIST);
1624
1625
udvp = NULL;
1626
VI_LOCK(unp->un_dvp);
1627
dunp = VTOUNIONFS(unp->un_dvp);
1628
if (dunp != NULL)
1629
udvp = dunp->un_uppervp;
1630
VI_UNLOCK(unp->un_dvp);
1631
1632
if (udvp == NULL)
1633
return (EROFS);
1634
if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
1635
return (EROFS);
1636
ASSERT_VOP_UNLOCKED(udvp, __func__);
1637
1638
error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
1639
if (error == EJUSTRETURN)
1640
return (0);
1641
else if (error != 0)
1642
return (error);
1643
1644
uio.uio_td = td;
1645
uio.uio_segflg = UIO_SYSSPACE;
1646
uio.uio_offset = 0;
1647
uio.uio_iov = &iov;
1648
uio.uio_iovcnt = 1;
1649
iov.iov_base = buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1650
uio.uio_resid = iov.iov_len = MAXPATHLEN;
1651
uio.uio_rw = UIO_READ;
1652
1653
if ((error = VOP_READLINK(lvp, &uio, cred)) != 0)
1654
goto unionfs_copylink_cleanup;
1655
buf[iov.iov_len - uio.uio_resid] = '\0';
1656
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
1657
goto unionfs_copylink_cleanup;
1658
error = unionfs_vn_symlink_on_upper(&uvp, udvp, vp, &uva, buf, td);
1659
vn_finished_write(mp);
1660
if (error != 0) {
1661
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1662
goto unionfs_copylink_cleanup;
1663
}
1664
1665
vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
1666
unp = VTOUNIONFS(vp);
1667
if (unp == NULL) {
1668
error = ENOENT;
1669
goto unionfs_copylink_cleanup;
1670
}
1671
1672
if (error == 0) {
1673
/* Reset the attributes. Ignore errors. */
1674
uva.va_type = VNON;
1675
VOP_SETATTR(uvp, &uva, cred);
1676
unionfs_node_update(unp, uvp, td);
1677
}
1678
1679
unionfs_copylink_cleanup:
1680
if (buf != NULL)
1681
free(buf, M_TEMP);
1682
unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
1683
return (error);
1684
}
1685
1686
/*
1687
* Determine if the unionfs view of a directory is empty such that
1688
* an rmdir operation can be permitted.
1689
*
1690
* We assume the VOP_RMDIR() against the upper layer vnode will take
1691
* care of this check for us where the upper FS is concerned, so here
1692
* we concentrate on the lower FS. We need to check for the presence
1693
* of files other than "." and ".." in the lower FS directory and
1694
* then cross-check any files we find against the upper FS to see if
1695
* a whiteout is present (in which case we treat the lower file as
1696
* non-present).
1697
*
1698
* The logic here is based heavily on vn_dir_check_empty().
1699
*
1700
* vp should be a locked unionfs node, and vp's lowervp should also be
1701
* locked.
1702
*/
1703
int
1704
unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
1705
{
1706
struct vnode *uvp;
1707
struct vnode *lvp;
1708
struct vnode *tvp;
1709
char *dirbuf;
1710
size_t dirbuflen, len;
1711
off_t off;
1712
struct dirent *dp;
1713
struct componentname cn;
1714
struct vattr va;
1715
int error;
1716
int eofflag;
1717
1718
eofflag = 0;
1719
lvp = UNIONFSVPTOLOWERVP(vp);
1720
uvp = UNIONFSVPTOUPPERVP(vp);
1721
1722
/*
1723
* Note that the locking here still isn't ideal: We expect the caller
1724
* to hold both the upper and lower layer locks as well as the upper
1725
* parent directory lock, which it can do in a manner that avoids
1726
* deadlock. However, if the cross-check logic below needs to call
1727
* VOP_LOOKUP(), that may relock the upper vnode and lock any found
1728
* child vnode in a way that doesn't protect against deadlock given
1729
* the other held locks. Beyond that, the various other VOPs we issue
1730
* below, such as VOP_OPEN() and VOP_READDIR(), may also re-lock the
1731
* lower vnode.
1732
* We might instead just hand off between the upper vnode lock
1733
* (and its parent directory lock) and the lower vnode lock as needed,
1734
* so that the lower lock is never held at the same time as the upper
1735
* locks, but that opens up a wider window in which the upper
1736
* directory (and also the lower directory if it isn't truly
1737
* read-only) may change while the relevant lock is dropped. But
1738
* since re-locking may happen here and open up such a window anyway,
1739
* perhaps that is a worthwhile tradeoff? Or perhaps we can ultimately
1740
* do sufficient tracking of empty state within the unionfs vnode
1741
* (in conjunction with upcalls from the lower FSes to notify us
1742
* of out-of-band state changes) that we can avoid these costly checks
1743
* altogether.
1744
*/
1745
ASSERT_VOP_LOCKED(lvp, __func__);
1746
ASSERT_VOP_ELOCKED(uvp, __func__);
1747
1748
if ((error = VOP_GETATTR(uvp, &va, cred)) != 0)
1749
return (error);
1750
if (va.va_flags & OPAQUE)
1751
return (0);
1752
1753
#ifdef MAC
1754
if ((error = mac_vnode_check_open(cred, lvp, VEXEC | VREAD)) != 0)
1755
return (error);
1756
#endif
1757
if ((error = VOP_ACCESS(lvp, VEXEC | VREAD, cred, td)) != 0)
1758
return (error);
1759
if ((error = VOP_OPEN(lvp, FREAD, cred, td, NULL)) != 0)
1760
return (error);
1761
if ((error = VOP_GETATTR(lvp, &va, cred)) != 0)
1762
return (error);
1763
1764
dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
1765
if (dirbuflen < va.va_blocksize)
1766
dirbuflen = va.va_blocksize;
1767
dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
1768
1769
len = 0;
1770
off = 0;
1771
eofflag = 0;
1772
1773
for (;;) {
1774
error = vn_dir_next_dirent(lvp, td, dirbuf, dirbuflen,
1775
&dp, &len, &off, &eofflag);
1776
if (error != 0)
1777
break;
1778
1779
if (len == 0) {
1780
/* EOF */
1781
error = 0;
1782
break;
1783
}
1784
1785
if (dp->d_type == DT_WHT)
1786
continue;
1787
1788
/*
1789
* Any file in the directory which is not '.' or '..' indicates
1790
* the directory is not empty.
1791
*/
1792
switch (dp->d_namlen) {
1793
case 2:
1794
if (dp->d_name[1] != '.') {
1795
/* Can't be '..' (nor '.') */
1796
break;
1797
}
1798
/* FALLTHROUGH */
1799
case 1:
1800
if (dp->d_name[0] != '.') {
1801
/* Can't be '..' nor '.' */
1802
break;
1803
}
1804
continue;
1805
default:
1806
break;
1807
}
1808
1809
cn.cn_namelen = dp->d_namlen;
1810
cn.cn_pnbuf = NULL;
1811
cn.cn_nameptr = dp->d_name;
1812
cn.cn_nameiop = LOOKUP;
1813
cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
1814
cn.cn_lkflags = LK_EXCLUSIVE;
1815
cn.cn_cred = cred;
1816
1817
error = VOP_LOOKUP(uvp, &tvp, &cn);
1818
if (tvp != NULL)
1819
vput(tvp);
1820
if (error != 0 && error != ENOENT && error != EJUSTRETURN)
1821
break;
1822
else if ((cn.cn_flags & ISWHITEOUT) == 0) {
1823
error = ENOTEMPTY;
1824
break;
1825
} else
1826
error = 0;
1827
}
1828
1829
VOP_CLOSE(lvp, FREAD, cred, td);
1830
free(dirbuf, M_TEMP);
1831
return (error);
1832
}
1833
1834