GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/fs/unionfs/union_subr.c
1
/*-
2
* SPDX-License-Identifier: BSD-3-Clause
3
*
4
* Copyright (c) 1994 Jan-Simon Pendry
5
* Copyright (c) 1994
6
* The Regents of the University of California. All rights reserved.
7
* Copyright (c) 2005, 2006, 2012 Masanori Ozawa <[email protected]>, ONGS Inc.
8
* Copyright (c) 2006, 2012 Daichi Goto <[email protected]>
9
*
10
* This code is derived from software contributed to Berkeley by
11
* Jan-Simon Pendry.
12
*
13
* Redistribution and use in source and binary forms, with or without
14
* modification, are permitted provided that the following conditions
15
* are met:
16
* 1. Redistributions of source code must retain the above copyright
17
* notice, this list of conditions and the following disclaimer.
18
* 2. Redistributions in binary form must reproduce the above copyright
19
* notice, this list of conditions and the following disclaimer in the
20
* documentation and/or other materials provided with the distribution.
21
* 3. Neither the name of the University nor the names of its contributors
22
* may be used to endorse or promote products derived from this software
23
* without specific prior written permission.
24
*
25
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35
* SUCH DAMAGE.
36
*/
37
38
#include <sys/param.h>
39
#include <sys/systm.h>
40
#include <sys/kernel.h>
41
#include <sys/ktr.h>
42
#include <sys/lock.h>
43
#include <sys/mutex.h>
44
#include <sys/malloc.h>
45
#include <sys/mount.h>
46
#include <sys/namei.h>
47
#include <sys/proc.h>
48
#include <sys/vnode.h>
49
#include <sys/dirent.h>
50
#include <sys/fcntl.h>
51
#include <sys/filedesc.h>
52
#include <sys/stat.h>
53
#include <sys/sysctl.h>
54
#include <sys/taskqueue.h>
55
#include <sys/resourcevar.h>
56
57
#include <machine/atomic.h>
58
59
#include <security/mac/mac_framework.h>
60
61
#include <vm/uma.h>
62
63
#include <fs/unionfs/union.h>
64
65
#define NUNIONFSNODECACHE 16
66
#define UNIONFSHASHMASK (NUNIONFSNODECACHE - 1)
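/*
 * Note (editorial, derived from the code below): each unionfs directory
 * vnode keeps a small hash table of NUNIONFSNODECACHE buckets holding its
 * child unionfs vnodes, keyed on the upper or lower base vnode, so that
 * repeated lookups can reuse an existing unionfs node.
 */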
67
68
static MALLOC_DEFINE(M_UNIONFSHASH, "UNIONFS hash", "UNIONFS hash table");
69
MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part");
70
MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part");
71
72
static struct task unionfs_deferred_rele_task;
73
static struct mtx unionfs_deferred_rele_lock;
74
static STAILQ_HEAD(, unionfs_node) unionfs_deferred_rele_list =
75
STAILQ_HEAD_INITIALIZER(unionfs_deferred_rele_list);
76
static TASKQUEUE_DEFINE_THREAD(unionfs_rele);
77
78
unsigned int unionfs_ndeferred = 0;
79
SYSCTL_UINT(_vfs, OID_AUTO, unionfs_ndeferred, CTLFLAG_RD,
80
&unionfs_ndeferred, 0, "unionfs deferred vnode release");
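/*
 * Note (editorial): releasing the reference on a node's parent directory
 * (un_dvp) is deferred to a dedicated taskqueue thread.  unionfs_noderem()
 * queues the node on unionfs_deferred_rele_list, and unionfs_deferred_rele()
 * performs the vrele() later, presumably to avoid dropping that reference
 * from within the vnode reclamation path itself.
 */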
81
82
static void unionfs_deferred_rele(void *, int);
83
84
/*
85
* Initialize
86
*/
87
int
88
unionfs_init(struct vfsconf *vfsp)
89
{
90
UNIONFSDEBUG("unionfs_init\n"); /* printed during system boot */
91
TASK_INIT(&unionfs_deferred_rele_task, 0, unionfs_deferred_rele, NULL);
92
mtx_init(&unionfs_deferred_rele_lock, "uniondefr", NULL, MTX_DEF);
93
return (0);
94
}
95
96
/*
97
* Uninitialize
98
*/
99
int
100
unionfs_uninit(struct vfsconf *vfsp)
101
{
102
taskqueue_quiesce(taskqueue_unionfs_rele);
103
taskqueue_free(taskqueue_unionfs_rele);
104
mtx_destroy(&unionfs_deferred_rele_lock);
105
return (0);
106
}
107
108
static void
109
unionfs_deferred_rele(void *arg __unused, int pending __unused)
110
{
111
STAILQ_HEAD(, unionfs_node) local_rele_list;
112
struct unionfs_node *unp, *tunp;
113
unsigned int ndeferred;
114
115
ndeferred = 0;
116
STAILQ_INIT(&local_rele_list);
117
mtx_lock(&unionfs_deferred_rele_lock);
118
STAILQ_CONCAT(&local_rele_list, &unionfs_deferred_rele_list);
119
mtx_unlock(&unionfs_deferred_rele_lock);
120
STAILQ_FOREACH_SAFE(unp, &local_rele_list, un_rele, tunp) {
121
++ndeferred;
122
MPASS(unp->un_dvp != NULL);
123
vrele(unp->un_dvp);
124
free(unp, M_UNIONFSNODE);
125
}
126
127
/* We expect this function to be single-threaded, thus no atomic */
128
unionfs_ndeferred += ndeferred;
129
}
130
131
static struct unionfs_node_hashhead *
132
unionfs_get_hashhead(struct vnode *dvp, struct vnode *lookup)
133
{
134
struct unionfs_node *unp;
135
136
unp = VTOUNIONFS(dvp);
137
138
return (&(unp->un_hashtbl[vfs_hash_index(lookup) & UNIONFSHASHMASK]));
139
}
140
141
/*
142
* Attempt to lookup a cached unionfs vnode by upper/lower vp
143
* from dvp, with dvp's interlock held.
144
*/
145
static struct vnode *
146
unionfs_get_cached_vnode_locked(struct vnode *lookup, struct vnode *dvp)
147
{
148
struct unionfs_node *unp;
149
struct unionfs_node_hashhead *hd;
150
struct vnode *vp;
151
152
hd = unionfs_get_hashhead(dvp, lookup);
153
154
LIST_FOREACH(unp, hd, un_hash) {
155
if (unp->un_uppervp == lookup ||
156
unp->un_lowervp == lookup) {
157
vp = UNIONFSTOV(unp);
158
VI_LOCK_FLAGS(vp, MTX_DUPOK);
159
vp->v_iflag &= ~VI_OWEINACT;
160
if (VN_IS_DOOMED(vp) ||
161
((vp->v_iflag & VI_DOINGINACT) != 0)) {
162
VI_UNLOCK(vp);
163
vp = NULL;
164
} else {
165
vrefl(vp);
166
VI_UNLOCK(vp);
167
}
168
return (vp);
169
}
170
}
171
172
return (NULL);
173
}
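/*
 * Note (editorial): vnodes that are doomed or currently in VOP_INACTIVE are
 * deliberately skipped by the lookup above (NULL is returned), so the caller
 * constructs a fresh unionfs vnode instead of reusing one being torn down.
 */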
174
175
176
/*
177
* Get the cached vnode.
178
*/
179
static struct vnode *
180
unionfs_get_cached_vnode(struct vnode *uvp, struct vnode *lvp,
181
struct vnode *dvp)
182
{
183
struct vnode *vp;
184
185
vp = NULL;
186
VI_LOCK(dvp);
187
if (uvp != NULL)
188
vp = unionfs_get_cached_vnode_locked(uvp, dvp);
189
else if (lvp != NULL)
190
vp = unionfs_get_cached_vnode_locked(lvp, dvp);
191
VI_UNLOCK(dvp);
192
193
return (vp);
194
}
195
196
/*
197
* Add the new vnode into cache.
198
*/
199
static struct vnode *
200
unionfs_ins_cached_vnode(struct unionfs_node *uncp,
201
struct vnode *dvp)
202
{
203
struct unionfs_node_hashhead *hd;
204
struct vnode *vp;
205
206
vp = NULL;
207
VI_LOCK(dvp);
208
if (uncp->un_uppervp != NULL) {
209
ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
210
KASSERT(uncp->un_uppervp->v_type == VDIR,
211
("%s: v_type != VDIR", __func__));
212
vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp);
213
} else if (uncp->un_lowervp != NULL) {
214
ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
215
KASSERT(uncp->un_lowervp->v_type == VDIR,
216
("%s: v_type != VDIR", __func__));
217
vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp);
218
}
219
if (vp == NULL) {
220
hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULL ?
221
uncp->un_uppervp : uncp->un_lowervp));
222
LIST_INSERT_HEAD(hd, uncp, un_hash);
223
}
224
VI_UNLOCK(dvp);
225
226
return (vp);
227
}
228
229
/*
230
* Remove the vnode from the cache.
231
*/
232
static void
233
unionfs_rem_cached_vnode(struct unionfs_node *unp, struct vnode *dvp)
234
{
235
KASSERT(unp != NULL, ("%s: null node", __func__));
236
KASSERT(dvp != NULL,
237
("%s: null parent vnode", __func__));
238
239
VI_LOCK(dvp);
240
if (unp->un_hash.le_prev != NULL) {
241
LIST_REMOVE(unp, un_hash);
242
unp->un_hash.le_next = NULL;
243
unp->un_hash.le_prev = NULL;
244
}
245
VI_UNLOCK(dvp);
246
}
247
248
/*
249
* Common cleanup handling for unionfs_nodeget
250
* Upper, lower, and parent directory vnodes are expected to be referenced by
251
* the caller. Upper and lower vnodes, if non-NULL, are also expected to be
252
* exclusively locked by the caller.
253
* This function will return with the caller's locks and references undone.
254
*/
255
static void
256
unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp)
257
{
258
259
/*
260
* Lock and reset the default vnode lock; vgone() expects a locked
261
* vnode, and we're going to reset the vnode ops.
262
*/
263
lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
264
265
/*
266
* Clear out private data and reset the vnode ops to avoid use of
267
* unionfs vnode ops on a partially constructed vnode.
268
*/
269
VI_LOCK(vp);
270
vp->v_data = NULL;
271
vp->v_vnlock = &vp->v_lock;
272
vp->v_op = &dead_vnodeops;
273
VI_UNLOCK(vp);
274
vgone(vp);
275
vput(vp);
276
277
if (unp->un_dvp != NULL)
278
vrele(unp->un_dvp);
279
if (unp->un_uppervp != NULL) {
280
vput(unp->un_uppervp);
281
if (unp->un_lowervp != NULL)
282
vrele(unp->un_lowervp);
283
} else if (unp->un_lowervp != NULL)
284
vput(unp->un_lowervp);
285
if (unp->un_hashtbl != NULL)
286
hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
287
free(unp->un_path, M_UNIONFSPATH);
288
free(unp, M_UNIONFSNODE);
289
}
290
291
/*
292
* Make a new or get existing unionfs node.
293
*
294
* uppervp and lowervp should be unlocked on entry. If the new unionfs vnode
295
* is locked, the corresponding uppervp or lowervp is locked as well. To
296
* prevent deadlock, do not hold more than one of these locks at a time.
297
*/
298
int
299
unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
300
struct vnode *lowervp, struct vnode *dvp, struct vnode **vpp,
301
struct componentname *cnp)
302
{
303
char *path;
304
struct unionfs_mount *ump;
305
struct unionfs_node *unp;
306
struct vnode *vp;
307
u_long hashmask;
308
int error;
309
int lkflags;
310
__enum_uint8(vtype) vt;
311
312
error = 0;
313
ump = MOUNTTOUNIONFSMOUNT(mp);
314
lkflags = (cnp ? cnp->cn_lkflags : 0);
315
path = (cnp ? cnp->cn_nameptr : NULL);
316
*vpp = NULL;
317
318
if (uppervp == NULL && lowervp == NULL)
319
panic("%s: upper and lower are both null", __func__);
320
321
vt = (uppervp != NULL ? uppervp->v_type : lowervp->v_type);
322
323
/* If the ISLASTCN flag is not set, the path check is skipped. */
324
if (cnp && !(cnp->cn_flags & ISLASTCN))
325
path = NULL;
326
327
/* check the cache */
328
if (dvp != NULL && vt == VDIR) {
329
vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp);
330
if (vp != NULL) {
331
*vpp = vp;
332
if (lkflags != 0)
333
vn_lock(*vpp, lkflags | LK_RETRY);
334
return (0);
335
}
336
}
337
338
unp = malloc(sizeof(struct unionfs_node),
339
M_UNIONFSNODE, M_WAITOK | M_ZERO);
340
341
error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp);
342
if (error != 0) {
343
free(unp, M_UNIONFSNODE);
344
return (error);
345
}
346
if (dvp != NULL)
347
vref(dvp);
348
if (uppervp != NULL)
349
vref(uppervp);
350
if (lowervp != NULL)
351
vref(lowervp);
352
353
if (vt == VDIR) {
354
unp->un_hashtbl = hashinit(NUNIONFSNODECACHE, M_UNIONFSHASH,
355
&hashmask);
356
KASSERT(hashmask == UNIONFSHASHMASK,
357
("unexpected unionfs hash mask 0x%lx", hashmask));
358
}
359
360
unp->un_vnode = vp;
361
unp->un_uppervp = uppervp;
362
unp->un_lowervp = lowervp;
363
unp->un_dvp = dvp;
364
if (uppervp != NULL)
365
vp->v_vnlock = uppervp->v_vnlock;
366
else
367
vp->v_vnlock = lowervp->v_vnlock;
368
369
if (path != NULL) {
370
unp->un_path = malloc(cnp->cn_namelen + 1,
371
M_UNIONFSPATH, M_WAITOK | M_ZERO);
372
bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen);
373
unp->un_path[cnp->cn_namelen] = '\0';
374
unp->un_pathlen = cnp->cn_namelen;
375
}
376
vp->v_type = vt;
377
vp->v_data = unp;
378
379
/*
380
* TODO: This is an imperfect check, as there's no guarantee that
381
* the underlying filesystems will always return vnode pointers
382
* for the root inodes that match our cached values. To reduce
383
* the likelihood of failure, for example in the case where either
384
* vnode has been forcibly doomed, we check both pointers and set
385
* VV_ROOT if either matches.
386
*/
387
if (ump->um_uppervp == uppervp || ump->um_lowervp == lowervp)
388
vp->v_vflag |= VV_ROOT;
389
KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0,
390
("%s: NULL dvp for non-root vp %p", __func__, vp));
391
392
393
/*
394
* NOTE: There is still a possibility for cross-filesystem locking here.
395
* If dvp has an upper FS component and is locked, while the new vnode
396
* created here only has a lower-layer FS component, then we will end
397
* up taking a lower-FS lock while holding an upper-FS lock.
398
* That situation could be dealt with here using vn_lock_pair().
399
* However, that would only address one instance out of many in which
400
* a child vnode lock is taken while holding a lock on its parent
401
* directory. This is done in many places in common VFS code, as well as
402
* a few places within unionfs (which could lead to the same cross-FS
403
* locking issue if, for example, the upper FS is another nested unionfs
404
* instance). Additionally, it is unclear under what circumstances this
405
* specific lock sequence (a directory on one FS followed by a child of
406
* its 'peer' directory on another FS) would present the practical
407
* possibility of deadlock due to some other agent on the system
408
* attempting to lock those two specific vnodes in the opposite order.
409
*/
410
if (uppervp != NULL)
411
vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY);
412
else
413
vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
414
error = insmntque1(vp, mp);
415
if (error != 0) {
416
unionfs_nodeget_cleanup(vp, unp);
417
return (error);
418
}
419
/*
420
* lowervp and uppervp should only be doomed by a forced unmount of
421
* their respective filesystems, but that can only happen if the
422
* unionfs instance is first unmounted. We also effectively hold the
423
* lock on the new unionfs vnode at this point. Therefore, if a
424
* unionfs umount has not yet reached the point at which the above
425
* insmntque1() would fail, then its vflush() call will end up
426
* blocked on our vnode lock, effectively also preventing unmount
427
* of the underlying filesystems.
428
*/
429
VNASSERT(lowervp == NULL || !VN_IS_DOOMED(lowervp), vp,
430
("%s: doomed lowervp %p", __func__, lowervp));
431
VNASSERT(uppervp == NULL || !VN_IS_DOOMED(uppervp), vp,
432
("%s: doomed lowervp %p", __func__, uppervp));
433
434
vn_set_state(vp, VSTATE_CONSTRUCTED);
435
436
if (dvp != NULL && vt == VDIR)
437
*vpp = unionfs_ins_cached_vnode(unp, dvp);
438
if (*vpp != NULL) {
439
unionfs_nodeget_cleanup(vp, unp);
440
if (lkflags != 0)
441
vn_lock(*vpp, lkflags | LK_RETRY);
442
return (0);
443
} else
444
*vpp = vp;
445
446
if ((lkflags & LK_SHARED) != 0)
447
vn_lock(vp, LK_DOWNGRADE);
448
else if ((lkflags & LK_EXCLUSIVE) == 0)
449
VOP_UNLOCK(vp);
450
451
return (0);
452
}
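/*
 * Illustrative sketch (not part of the original source): a typical caller,
 * e.g. a lookup path, passes referenced but unlocked base vnodes:
 *
 *	error = unionfs_nodeget(mp, uppervp, lowervp, dvp, &vp, cnp);
 *
 * unionfs_nodeget() takes its own references on uppervp, lowervp and dvp, so
 * the caller's references remain its own to release.  On success, *vpp is
 * returned locked according to cnp->cn_lkflags (or unlocked if no lock flags
 * were requested).
 */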
453
454
/*
455
* Clean up the unionfs node.
456
*/
457
void
458
unionfs_noderem(struct vnode *vp)
459
{
460
struct unionfs_node *unp, *unp_t1, *unp_t2;
461
struct unionfs_node_hashhead *hd;
462
struct unionfs_node_status *unsp, *unsp_tmp;
463
struct vnode *lvp;
464
struct vnode *uvp;
465
struct vnode *dvp;
466
int count;
467
int writerefs;
468
bool unlock_lvp;
469
470
/*
471
* The root vnode lock may be recursed during unmount, because
472
* it may share the same lock as the unionfs mount's covered vnode,
473
* which is locked across VFS_UNMOUNT(). This lock will then be
474
* recursively taken during the vflush() issued by unionfs_unmount().
475
* But we still only need to lock the unionfs lock once, because only
476
* one of those lock operations was taken against a unionfs vnode and
477
* will be undone against a unionfs vnode.
478
*/
479
KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0,
480
("%s: vnode %p locked recursively", __func__, vp));
481
482
unp = VTOUNIONFS(vp);
483
VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__));
484
lvp = unp->un_lowervp;
485
uvp = unp->un_uppervp;
486
dvp = unp->un_dvp;
487
unlock_lvp = (uvp == NULL);
488
489
/*
490
* Lock the lower vnode in addition to the upper vnode lock in order
491
* to synchronize against any unionfs_lock() operation which may still
492
* hold the lower vnode lock. We do not need to do this for the root
493
* vnode, as the root vnode should always have both upper and lower
494
* base vnodes for its entire lifecycle, so unionfs_lock() should
495
* never attempt to lock its lower vnode in the first place.
496
* Moreover, during unmount of a non-"below" unionfs mount, the lower
497
* root vnode will already be locked as it is the covered vnode.
498
*/
499
if (uvp != NULL && lvp != NULL && (vp->v_vflag & VV_ROOT) == 0) {
500
vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE);
501
unlock_lvp = true;
502
}
503
504
if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
505
panic("%s: failed to acquire lock for vnode lock", __func__);
506
/*
507
* Use the interlock to protect the clearing of v_data to
508
* prevent faults in unionfs_lock().
509
*/
510
VI_LOCK(vp);
511
unp->un_lowervp = unp->un_uppervp = NULL;
512
vp->v_vnlock = &(vp->v_lock);
513
vp->v_data = NULL;
514
vp->v_object = NULL;
515
if (unp->un_hashtbl != NULL) {
516
/*
517
* Clear out any cached child vnodes. This should only
518
* be necessary during forced unmount, when the vnode may
519
* be reclaimed with a non-zero use count. Otherwise the
520
* reference held by each child should prevent reclamation.
521
*/
522
for (count = 0; count <= UNIONFSHASHMASK; count++) {
523
hd = unp->un_hashtbl + count;
524
LIST_FOREACH_SAFE(unp_t1, hd, un_hash, unp_t2) {
525
LIST_REMOVE(unp_t1, un_hash);
526
unp_t1->un_hash.le_next = NULL;
527
unp_t1->un_hash.le_prev = NULL;
528
}
529
}
530
}
531
VI_UNLOCK(vp);
532
533
writerefs = atomic_load_int(&vp->v_writecount);
534
VNASSERT(writerefs >= 0, vp,
535
("%s: write count %d, unexpected text ref", __func__, writerefs));
536
/*
537
* If we were opened for write, we leased the write reference
538
* to the upper vnode. If this is a reclamation due to the
539
* forced unmount, undo the reference now.
540
*/
541
if (writerefs > 0) {
542
VNASSERT(uvp != NULL, vp,
543
("%s: write reference without upper vnode", __func__));
544
VOP_ADD_WRITECOUNT(uvp, -writerefs);
545
}
546
if (uvp != NULL)
547
vput(uvp);
548
if (unlock_lvp)
549
vput(lvp);
550
else if (lvp != NULL)
551
vrele(lvp);
552
553
if (dvp != NULL)
554
unionfs_rem_cached_vnode(unp, dvp);
555
556
if (unp->un_path != NULL) {
557
free(unp->un_path, M_UNIONFSPATH);
558
unp->un_path = NULL;
559
unp->un_pathlen = 0;
560
}
561
562
if (unp->un_hashtbl != NULL) {
563
hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
564
}
565
566
LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) {
567
LIST_REMOVE(unsp, uns_list);
568
free(unsp, M_TEMP);
569
}
570
if (dvp != NULL) {
571
mtx_lock(&unionfs_deferred_rele_lock);
572
STAILQ_INSERT_TAIL(&unionfs_deferred_rele_list, unp, un_rele);
573
mtx_unlock(&unionfs_deferred_rele_lock);
574
taskqueue_enqueue(taskqueue_unionfs_rele,
575
&unionfs_deferred_rele_task);
576
} else
577
free(unp, M_UNIONFSNODE);
578
}
579
580
/*
581
* Find the unionfs node status object for the vnode corresponding to unp,
582
* for the process that owns td. Return NULL if no such object exists.
583
*/
584
struct unionfs_node_status *
585
unionfs_find_node_status(struct unionfs_node *unp, struct thread *td)
586
{
587
struct unionfs_node_status *unsp;
588
pid_t pid;
589
590
pid = td->td_proc->p_pid;
591
592
ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);
593
594
LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) {
595
if (unsp->uns_pid == pid) {
596
return (unsp);
597
}
598
}
599
600
return (NULL);
601
}
602
603
/*
604
* Get the unionfs node status object for the vnode corresponding to unp,
605
* for the process that owns td. Allocate a new status object if one
606
* does not already exist.
607
*/
608
void
609
unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
610
struct unionfs_node_status **unspp)
611
{
612
struct unionfs_node_status *unsp;
613
pid_t pid;
614
615
pid = td->td_proc->p_pid;
616
617
KASSERT(NULL != unspp, ("%s: NULL status", __func__));
618
unsp = unionfs_find_node_status(unp, td);
619
if (unsp == NULL) {
620
/* create a new unionfs node status */
621
unsp = malloc(sizeof(struct unionfs_node_status),
622
M_TEMP, M_WAITOK | M_ZERO);
623
624
unsp->uns_pid = pid;
625
LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list);
626
}
627
628
*unspp = unsp;
629
}
630
631
/*
632
* Remove the unionfs node status if it is no longer in use.
633
* The vnode must be exclusively locked.
634
*/
635
void
636
unionfs_tryrem_node_status(struct unionfs_node *unp,
637
struct unionfs_node_status *unsp)
638
{
639
KASSERT(NULL != unsp, ("%s: NULL status", __func__));
640
ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);
641
642
if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt)
643
return;
644
645
LIST_REMOVE(unsp, uns_list);
646
free(unsp, M_TEMP);
647
}
648
649
/*
650
* Create upper node attr.
651
*/
652
void
653
unionfs_create_uppervattr_core(struct unionfs_mount *ump, struct vattr *lva,
654
struct vattr *uva, struct thread *td)
655
{
656
VATTR_NULL(uva);
657
uva->va_type = lva->va_type;
658
uva->va_atime = lva->va_atime;
659
uva->va_mtime = lva->va_mtime;
660
uva->va_ctime = lva->va_ctime;
661
662
switch (ump->um_copymode) {
663
case UNIONFS_TRANSPARENT:
664
uva->va_mode = lva->va_mode;
665
uva->va_uid = lva->va_uid;
666
uva->va_gid = lva->va_gid;
667
break;
668
case UNIONFS_MASQUERADE:
669
if (ump->um_uid == lva->va_uid) {
670
uva->va_mode = lva->va_mode & 077077;
671
uva->va_mode |= (lva->va_type == VDIR ?
672
ump->um_udir : ump->um_ufile) & 0700;
673
uva->va_uid = lva->va_uid;
674
uva->va_gid = lva->va_gid;
675
} else {
676
uva->va_mode = (lva->va_type == VDIR ?
677
ump->um_udir : ump->um_ufile);
678
uva->va_uid = ump->um_uid;
679
uva->va_gid = ump->um_gid;
680
}
681
break;
682
default: /* UNIONFS_TRADITIONAL */
683
uva->va_mode = 0777 & ~td->td_proc->p_pd->pd_cmask;
684
uva->va_uid = ump->um_uid;
685
uva->va_gid = ump->um_gid;
686
break;
687
}
688
}
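/*
 * Worked example for the UNIONFS_MASQUERADE case above: with um_ufile set to
 * 0644 and a lower regular file of mode 0755 owned by the mount owner, the
 * shadow file gets mode (0755 & 077077) | (0644 & 0700) = 0655, i.e. the
 * group/other bits come from the lower file and the owner bits from the
 * mount option.
 */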
689
690
/*
691
* Create upper node attr.
692
*/
693
int
694
unionfs_create_uppervattr(struct unionfs_mount *ump, struct vnode *lvp,
695
struct vattr *uva, struct ucred *cred, struct thread *td)
696
{
697
struct vattr lva;
698
int error;
699
700
if ((error = VOP_GETATTR(lvp, &lva, cred)))
701
return (error);
702
703
unionfs_create_uppervattr_core(ump, &lva, uva, td);
704
705
return (error);
706
}
707
708
/*
709
* relookup
710
*
711
* dvp should be locked on entry and will be locked on return.
712
*
713
* If an error is returned, *vpp will be invalid, otherwise it will hold a
714
* locked, referenced vnode. If *vpp == dvp then remember that only one
715
* LK_EXCLUSIVE lock is held.
716
*/
717
int
718
unionfs_relookup(struct vnode *dvp, struct vnode **vpp,
719
struct componentname *cnp, struct componentname *cn, struct thread *td,
720
char *path, int pathlen, u_long nameiop)
721
{
722
int error;
723
bool refstart;
724
725
cn->cn_namelen = pathlen;
726
cn->cn_pnbuf = path;
727
cn->cn_nameiop = nameiop;
728
cn->cn_flags = (LOCKPARENT | LOCKLEAF | ISLASTCN);
729
cn->cn_lkflags = LK_EXCLUSIVE;
730
cn->cn_cred = cnp->cn_cred;
731
cn->cn_nameptr = cn->cn_pnbuf;
732
733
refstart = false;
734
if (nameiop == DELETE) {
735
cn->cn_flags |= (cnp->cn_flags & DOWHITEOUT);
736
} else if (nameiop == RENAME) {
737
refstart = true;
738
} else if (nameiop == CREATE) {
739
cn->cn_flags |= NOCACHE;
740
}
741
742
vref(dvp);
743
VOP_UNLOCK(dvp);
744
745
if ((error = vfs_relookup(dvp, vpp, cn, refstart))) {
746
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
747
} else
748
vrele(dvp);
749
750
KASSERT(cn->cn_pnbuf == path, ("%s: cn_pnbuf changed", __func__));
751
752
return (error);
753
}
754
755
/*
756
* Update the unionfs_node.
757
*
758
* uvp is the new, locked upper vnode. The unionfs vnode's lock will be
759
* switched to uvp's lock, and the lower vnode's lock will be released.
760
*/
761
static void
762
unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp,
763
struct thread *td)
764
{
765
struct unionfs_node_hashhead *hd;
766
struct vnode *vp;
767
struct vnode *lvp;
768
struct vnode *dvp;
769
unsigned count, lockrec;
770
771
vp = UNIONFSTOV(unp);
772
lvp = unp->un_lowervp;
773
ASSERT_VOP_ELOCKED(lvp, __func__);
774
ASSERT_VOP_ELOCKED(uvp, __func__);
775
dvp = unp->un_dvp;
776
777
VNASSERT(vp->v_writecount == 0, vp,
778
("%s: non-zero writecount", __func__));
779
/*
780
* Update the upper vnode's lock state to match the lower vnode,
781
* and then switch the unionfs vnode's lock to the upper vnode.
782
*/
783
lockrec = lvp->v_vnlock->lk_recurse;
784
for (count = 0; count < lockrec; count++)
785
vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
786
VI_LOCK(vp);
787
unp->un_uppervp = uvp;
788
vp->v_vnlock = uvp->v_vnlock;
789
VI_UNLOCK(vp);
790
791
for (count = 0; count < lockrec + 1; count++)
792
VOP_UNLOCK(lvp);
793
/*
794
* Re-cache the unionfs vnode against the upper vnode
795
*/
796
if (dvp != NULL && vp->v_type == VDIR) {
797
VI_LOCK(dvp);
798
if (unp->un_hash.le_prev != NULL) {
799
LIST_REMOVE(unp, un_hash);
800
hd = unionfs_get_hashhead(dvp, uvp);
801
LIST_INSERT_HEAD(hd, unp, un_hash);
802
}
803
VI_UNLOCK(unp->un_dvp);
804
}
805
}
806
807
/*
808
* Mark a unionfs operation as being in progress, sleeping if the
809
* same operation is already in progress.
810
* This is useful, for example, during copy-up operations in which
811
* we may drop the target vnode lock, but we want to avoid the
812
* possibility of a concurrent copy-up on the same vnode triggering
813
* a spurious failure.
814
*/
815
int
816
unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag)
817
{
818
struct unionfs_node *unp;
819
int error;
820
821
error = 0;
822
ASSERT_VOP_ELOCKED(vp, __func__);
823
VI_LOCK(vp);
824
unp = VTOUNIONFS(vp);
825
while (error == 0 && (unp->un_flag & flag) != 0) {
826
VOP_UNLOCK(vp);
827
error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0);
828
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
829
VI_LOCK(vp);
830
if (error == 0) {
831
/*
832
* If we waited on a concurrent copy-up and that
833
* copy-up was successful, return a non-fatal
834
* indication that the desired operation is already
835
* complete. If we waited on a concurrent lookup,
836
* return ERELOOKUP to indicate the VFS cache should
837
* be re-queried to avoid creating a duplicate unionfs
838
* vnode.
839
*/
840
unp = VTOUNIONFS(vp);
841
if (unp == NULL)
842
error = ENOENT;
843
else if (flag == UNIONFS_COPY_IN_PROGRESS &&
844
unp->un_uppervp != NULL)
845
error = EJUSTRETURN;
846
else if (flag == UNIONFS_LOOKUP_IN_PROGRESS)
847
error = ERELOOKUP;
848
}
849
}
850
if (error == 0)
851
unp->un_flag |= flag;
852
VI_UNLOCK(vp);
853
854
return (error);
855
}
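/*
 * Illustrative sketch (not part of the original source) of the intended
 * usage, as seen in unionfs_mkshadowdir() and unionfs_copyfile() below:
 *
 *	error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
 *	if (error == EJUSTRETURN)
 *		return (0);	(a concurrent copy-up already succeeded)
 *	else if (error != 0)
 *		return (error);
 *	... perform the copy-up ...
 *	unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
 */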
856
857
void
858
unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag)
859
{
860
struct unionfs_node *unp;
861
862
ASSERT_VOP_ELOCKED(vp, __func__);
863
unp = VTOUNIONFS(vp);
864
VI_LOCK(vp);
865
if (unp != NULL) {
866
VNASSERT((unp->un_flag & flag) != 0, vp,
867
("%s: copy not in progress", __func__));
868
unp->un_flag &= ~flag;
869
}
870
wakeup(vp);
871
VI_UNLOCK(vp);
872
}
873
874
/*
875
* Create a new shadow dir.
876
*
877
* dvp and vp are unionfs vnodes representing a parent directory and
878
* child file, should be locked on entry, and will be locked on return.
879
*
880
* If no error returned, unp will be updated.
881
*/
882
int
883
unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp,
884
struct componentname *cnp, struct thread *td)
885
{
886
struct vnode *lvp;
887
struct vnode *uvp;
888
struct vnode *udvp;
889
struct vattr va;
890
struct vattr lva;
891
struct nameidata nd;
892
struct mount *mp;
893
struct ucred *cred;
894
struct ucred *credbk;
895
struct uidinfo *rootinfo;
896
struct unionfs_mount *ump;
897
struct unionfs_node *dunp;
898
struct unionfs_node *unp;
899
int error;
900
901
ASSERT_VOP_ELOCKED(dvp, __func__);
902
ASSERT_VOP_ELOCKED(vp, __func__);
903
ump = MOUNTTOUNIONFSMOUNT(vp->v_mount);
904
unp = VTOUNIONFS(vp);
905
if (unp->un_uppervp != NULL)
906
return (EEXIST);
907
dunp = VTOUNIONFS(dvp);
908
udvp = dunp->un_uppervp;
909
910
error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
911
if (error == EJUSTRETURN)
912
return (0);
913
else if (error != 0)
914
return (error);
915
916
lvp = unp->un_lowervp;
917
uvp = NULL;
918
credbk = cnp->cn_cred;
919
920
/* Authority change to root */
921
rootinfo = uifind((uid_t)0);
922
cred = crdup(cnp->cn_cred);
923
change_euid(cred, rootinfo);
924
change_ruid(cred, rootinfo);
925
change_svuid(cred, (uid_t)0);
926
uifree(rootinfo);
927
cnp->cn_cred = cred;
928
929
memset(&nd.ni_cnd, 0, sizeof(struct componentname));
930
NDPREINIT(&nd);
931
932
if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred)))
933
goto unionfs_mkshadowdir_finish;
934
935
vref(udvp);
936
VOP_UNLOCK(vp);
937
if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td,
938
cnp->cn_nameptr, cnp->cn_namelen, CREATE))) {
939
/*
940
* When handling error cases here, we drop udvp's lock and
941
* then jump to exit code that relocks dvp, which in most
942
* cases will effectively relock udvp. However, this is
943
* not guaranteed to be the case, as various calls made
944
* here (such as unionfs_relookup() above and VOP_MKDIR()
945
* below) may unlock and then relock udvp, allowing dvp to
946
* be reclaimed in the meantime. In such a situation dvp
947
* will no longer share its lock with udvp. Since
948
* performance isn't a concern for these error cases, it
949
* makes more sense to reuse the common code that locks
950
* dvp on exit than to explicitly check for reclamation
951
* of dvp.
952
*/
953
vput(udvp);
954
goto unionfs_mkshadowdir_relock;
955
}
956
if (uvp != NULL) {
957
if (udvp == uvp)
958
vrele(uvp);
959
else
960
vput(uvp);
961
962
error = EEXIST;
963
vput(udvp);
964
goto unionfs_mkshadowdir_relock;
965
}
966
967
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) {
968
vput(udvp);
969
goto unionfs_mkshadowdir_relock;
970
}
971
unionfs_create_uppervattr_core(ump, &lva, &va, td);
972
973
/*
974
* Temporarily NUL-terminate the current pathname component.
975
* This function may be called during lookup operations in which
976
* the current pathname component is not the leaf, meaning that
977
* the NUL terminator is some distance beyond the end of the current
978
* component. This *should* be fine, as cn_namelen will still
979
* correctly indicate the length of only the current component,
980
* but ZFS in particular does not respect cn_namelen in its VOP_MKDIR
981
* implementation.
982
* Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by
983
* something like a local namei() operation and the temporary
984
* NUL-termination will not have an effect on other threads.
985
*/
986
char *pathend = &nd.ni_cnd.cn_nameptr[nd.ni_cnd.cn_namelen];
987
char pathterm = *pathend;
988
*pathend = '\0';
989
error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va);
990
*pathend = pathterm;
991
if (error != 0) {
992
/*
993
* See the comment after unionfs_relookup() above for an
994
* explanation of why we unlock udvp here only to relock
995
* dvp on exit.
996
*/
997
vput(udvp);
998
vn_finished_write(mp);
999
goto unionfs_mkshadowdir_relock;
1000
}
1001
1002
/*
1003
* XXX Reset the attributes (notably uid/gid) that VOP_MKDIR() could not set.
1004
* Ignore errors.
1005
*/
1006
va.va_type = VNON;
1007
/*
1008
* VOP_SETATTR() may transiently drop uvp's lock, so it's
1009
* important to call it before unionfs_node_update() transfers
1010
* the unionfs vnode's lock from lvp to uvp; otherwise the
1011
* unionfs vnode itself would be transiently unlocked and
1012
* potentially doomed.
1013
*/
1014
VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
1015
1016
/*
1017
* uvp may become doomed during VOP_VPUT_PAIR() if the implementation
1018
* must temporarily drop uvp's lock. However, since we hold a
1019
* reference to uvp from the VOP_MKDIR() call above, this would require
1020
* a forcible unmount of uvp's filesystem, which in turn can only
1021
* happen if our unionfs instance is first forcibly unmounted. We'll
1022
* therefore catch this case in the NULL check of unp below.
1023
*/
1024
VOP_VPUT_PAIR(udvp, &uvp, false);
1025
vn_finished_write(mp);
1026
vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
1027
unp = VTOUNIONFS(vp);
1028
if (unp == NULL) {
1029
vput(uvp);
1030
error = ENOENT;
1031
} else
1032
unionfs_node_update(unp, uvp, td);
1033
VOP_UNLOCK(vp);
1034
1035
unionfs_mkshadowdir_relock:
1036
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
1037
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1038
if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)))
1039
error = ENOENT;
1040
1041
unionfs_mkshadowdir_finish:
1042
unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
1043
cnp->cn_cred = credbk;
1044
crfree(cred);
1045
1046
return (error);
1047
}
1048
1049
static inline void
1050
unionfs_forward_vop_ref(struct vnode *basevp, int *lkflags)
1051
{
1052
ASSERT_VOP_LOCKED(basevp, __func__);
1053
*lkflags = VOP_ISLOCKED(basevp);
1054
vref(basevp);
1055
}
1056
1057
/*
1058
* Prepare unionfs to issue a forwarded VOP to either the upper or lower
1059
* FS. This should be used for any VOP which may drop the vnode lock;
1060
* it is not required otherwise.
1061
* The unionfs vnode shares its lock with the base-layer vnode(s); if the
1062
* base FS must transiently drop its vnode lock, the unionfs vnode may
1063
* effectively become unlocked. During that window, a concurrent forced
1064
* unmount may doom the unionfs vnode, which leads to two significant
1065
* issues:
1066
* 1) Completion of, and return from, the unionfs VOP with the unionfs
1067
* vnode completely unlocked. When the unionfs vnode becomes doomed
1068
* it stops sharing its lock with the base vnode, so even if the
1069
* forwarded VOP reacquires the base vnode lock the unionfs vnode
1070
* lock will no longer be held. This can lead to violation of the
1071
* caller's sychronization requirements as well as various failed
1072
* locking assertions when DEBUG_VFS_LOCKS is enabled.
1073
* 2) Loss of reference on the base vnode. The caller is expected to
1074
* hold a v_usecount reference on the unionfs vnode, while the
1075
* unionfs vnode holds a reference on the base-layer vnode(s). But
1076
* these references are released when the unionfs vnode becomes
1077
* doomed, violating the base layer's expectation that its caller
1078
* must hold a reference to prevent vnode recycling.
1079
*
1080
* basevp1 and basevp2 represent two base-layer vnodes which are
1081
* expected to be locked when this function is called. basevp2
1082
* may be NULL, but if not NULL basevp1 and basevp2 should represent
1083
* a parent directory and a file linked to it, respectively.
1084
* lkflags1 and lkflags2 are output parameters that will store the
1085
* current lock status of basevp1 and basevp2, respectively. They
1086
* are intended to be passed as the lkflags1 and lkflags2 parameters
1087
* in the subsequent call to unionfs_forward_vop_finish_pair().
1088
* lkflags2 may be NULL iff basevp2 is NULL.
1089
*/
1090
void
1091
unionfs_forward_vop_start_pair(struct vnode *basevp1, int *lkflags1,
1092
struct vnode *basevp2, int *lkflags2)
1093
{
1094
/*
1095
* Take an additional reference on the base-layer vnodes to
1096
* avoid loss of reference if the unionfs vnodes are doomed.
1097
*/
1098
unionfs_forward_vop_ref(basevp1, lkflags1);
1099
if (basevp2 != NULL)
1100
unionfs_forward_vop_ref(basevp2, lkflags2);
1101
}
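/*
 * Illustrative sketch (not part of the original source) of the expected
 * pairing with unionfs_forward_vop_finish_pair() below; the names and
 * VOP_SOMETHING() are placeholders for any forwarded VOP that may drop
 * the vnode lock:
 *
 *	unionfs_forward_vop_start_pair(basedvp, &dvp_lkflags,
 *	    basevp, &vp_lkflags);
 *	error = VOP_SOMETHING(basedvp, basevp, ...);
 *	if (unionfs_forward_vop_finish_pair(uniondvp, basedvp, dvp_lkflags,
 *	    unionvp, basevp, vp_lkflags))
 *		... a unionfs vnode was doomed and has been re-locked ...
 */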
1102
1103
static inline bool
1104
unionfs_forward_vop_rele(struct vnode *unionvp, struct vnode *basevp,
1105
int lkflags)
1106
{
1107
bool unionvp_doomed;
1108
1109
if (__predict_false(VTOUNIONFS(unionvp) == NULL)) {
1110
if ((lkflags & LK_EXCLUSIVE) != 0)
1111
ASSERT_VOP_ELOCKED(basevp, __func__);
1112
else
1113
ASSERT_VOP_LOCKED(basevp, __func__);
1114
unionvp_doomed = true;
1115
} else {
1116
vrele(basevp);
1117
unionvp_doomed = false;
1118
}
1119
1120
return (unionvp_doomed);
1121
}
1122
1123
1124
/*
1125
* Indicate completion of a forwarded VOP previously prepared by
1126
* unionfs_forward_vop_start_pair().
1127
* basevp1 and basevp2 must be the same values passed to the prior
1128
* call to unionfs_forward_vop_start_pair(). unionvp1 and unionvp2
1129
* must be the unionfs vnodes that were initially above basevp1 and
1130
* basevp2, respectively.
1131
* basevp1 and basevp2 (if not NULL) must be locked when this function
1132
* is called, while unionvp1 and/or unionvp2 may be unlocked if either
1133
* unionfs vnode has become doomed.
1134
* lkflags1 and lkflags2 represent the locking flags that should be
1135
* used to re-lock unionvp1 and unionvp2, respectively, if either
1136
* vnode has become doomed.
1137
*
1138
* Returns true if any unionfs vnode was found to be doomed, false
1139
* otherwise.
1140
*/
1141
bool
1142
unionfs_forward_vop_finish_pair(
1143
struct vnode *unionvp1, struct vnode *basevp1, int lkflags1,
1144
struct vnode *unionvp2, struct vnode *basevp2, int lkflags2)
1145
{
1146
bool vp1_doomed, vp2_doomed;
1147
1148
/*
1149
* If either vnode is found to have been doomed, set
1150
* a flag indicating that it needs to be re-locked.
1151
* Otherwise, simply drop the base-vnode reference that
1152
* was taken in unionfs_forward_vop_start().
1153
*/
1154
vp1_doomed = unionfs_forward_vop_rele(unionvp1, basevp1, lkflags1);
1155
1156
if (unionvp2 != NULL)
1157
vp2_doomed = unionfs_forward_vop_rele(unionvp2, basevp2, lkflags2);
1158
else
1159
vp2_doomed = false;
1160
1161
/*
1162
* If any of the unionfs vnodes need to be re-locked, that
1163
* means the unionfs vnode's lock is now de-coupled from the
1164
* corresponding base vnode. We therefore need to drop the
1165
* base vnode lock (since nothing else will after this point),
1166
* and also release the reference taken in
1167
* unionfs_forward_vop_start_pair().
1168
*/
1169
if (__predict_false(vp1_doomed && vp2_doomed))
1170
VOP_VPUT_PAIR(basevp1, &basevp2, true);
1171
else if (__predict_false(vp1_doomed)) {
1172
/*
1173
* If basevp1 needs to be unlocked, then we may not
1174
* be able to safely unlock it with basevp2 still locked,
1175
* for the same reason that an ordinary VFS call would
1176
* need to use VOP_VPUT_PAIR() here. We might be able
1177
* to use VOP_VPUT_PAIR(..., false) here, but then we
1178
* would need to deal with the possibility of basevp2
1179
* changing out from under us, which could result in
1180
* either the unionfs vnode becoming doomed or its
1181
* upper/lower vp no longer matching basevp2. Either
1182
* scenario would require at least re-locking the unionfs
1183
* vnode anyway.
1184
*/
1185
if (unionvp2 != NULL) {
1186
VOP_UNLOCK(unionvp2);
1187
vp2_doomed = true;
1188
}
1189
vput(basevp1);
1190
} else if (__predict_false(vp2_doomed))
1191
vput(basevp2);
1192
1193
if (__predict_false(vp1_doomed || vp2_doomed))
1194
vn_lock_pair(unionvp1, !vp1_doomed, lkflags1,
1195
unionvp2, !vp2_doomed, lkflags2);
1196
1197
return (vp1_doomed || vp2_doomed);
1198
}
1199
1200
/*
1201
* Create a new whiteout.
1202
*
1203
* dvp and vp are unionfs vnodes representing a parent directory and
1204
* child file, should be locked on entry, and will be locked on return.
1205
*/
1206
int
1207
unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp,
1208
struct componentname *cnp, struct thread *td, char *path, int pathlen)
1209
{
1210
struct vnode *udvp;
1211
struct vnode *wvp;
1212
struct nameidata nd;
1213
struct mount *mp;
1214
int error;
1215
bool dvp_locked;
1216
1217
ASSERT_VOP_ELOCKED(dvp, __func__);
1218
ASSERT_VOP_ELOCKED(vp, __func__);
1219
1220
udvp = VTOUNIONFS(dvp)->un_uppervp;
1221
wvp = NULL;
1222
NDPREINIT(&nd);
1223
vref(udvp);
1224
VOP_UNLOCK(vp);
1225
if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
1226
pathlen, CREATE))) {
1227
goto unionfs_mkwhiteout_cleanup;
1228
}
1229
if (wvp != NULL) {
1230
if (udvp == wvp)
1231
vrele(wvp);
1232
else
1233
vput(wvp);
1234
1235
if (nd.ni_cnd.cn_flags & ISWHITEOUT)
1236
error = 0;
1237
else
1238
error = EEXIST;
1239
goto unionfs_mkwhiteout_cleanup;
1240
}
1241
1242
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
1243
goto unionfs_mkwhiteout_cleanup;
1244
error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
1245
vn_finished_write(mp);
1246
1247
unionfs_mkwhiteout_cleanup:
1248
if (VTOUNIONFS(dvp) == NULL) {
1249
vput(udvp);
1250
dvp_locked = false;
1251
} else {
1252
vrele(udvp);
1253
dvp_locked = true;
1254
}
1255
vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
1256
return (error);
1257
}
1258
1259
/*
1260
* Create a new vnode for a new shadow file.
1261
*
1262
* If an error is returned, *vpp will be invalid, otherwise it will hold a
1263
* locked, referenced and opened vnode.
1264
*
1265
* unp is never updated.
1266
*/
1267
static int
1268
unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
1269
struct vnode *vp, struct vattr *uvap, struct thread *td)
1270
{
1271
struct unionfs_mount *ump;
1272
struct unionfs_node *unp;
1273
struct vnode *uvp;
1274
struct vnode *lvp;
1275
struct ucred *cred;
1276
struct vattr lva;
1277
struct nameidata nd;
1278
int fmode;
1279
int error;
1280
1281
ASSERT_VOP_ELOCKED(vp, __func__);
1282
unp = VTOUNIONFS(vp);
1283
ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
1284
uvp = NULL;
1285
lvp = unp->un_lowervp;
1286
cred = td->td_ucred;
1287
fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
1288
error = 0;
1289
1290
if ((error = VOP_GETATTR(lvp, &lva, cred)) != 0)
1291
return (error);
1292
unionfs_create_uppervattr_core(ump, &lva, uvap, td);
1293
1294
if (unp->un_path == NULL)
1295
panic("%s: NULL un_path", __func__);
1296
1297
nd.ni_cnd.cn_namelen = unp->un_pathlen;
1298
nd.ni_cnd.cn_pnbuf = unp->un_path;
1299
nd.ni_cnd.cn_nameiop = CREATE;
1300
nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | ISLASTCN;
1301
nd.ni_cnd.cn_lkflags = LK_EXCLUSIVE;
1302
nd.ni_cnd.cn_cred = cred;
1303
nd.ni_cnd.cn_nameptr = nd.ni_cnd.cn_pnbuf;
1304
NDPREINIT(&nd);
1305
1306
vref(udvp);
1307
VOP_UNLOCK(vp);
1308
if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) {
1309
vrele(udvp);
1310
return (error);
1311
}
1312
1313
if (uvp != NULL) {
1314
if (uvp == udvp)
1315
vrele(uvp);
1316
else
1317
vput(uvp);
1318
error = EEXIST;
1319
goto unionfs_vn_create_on_upper_cleanup;
1320
}
1321
1322
if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0)
1323
goto unionfs_vn_create_on_upper_cleanup;
1324
1325
if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) {
1326
vput(uvp);
1327
goto unionfs_vn_create_on_upper_cleanup;
1328
}
1329
error = VOP_ADD_WRITECOUNT(uvp, 1);
1330
CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
1331
__func__, uvp, uvp->v_writecount);
1332
if (error == 0) {
1333
*vpp = uvp;
1334
} else {
1335
VOP_CLOSE(uvp, fmode, cred, td);
1336
}
1337
1338
unionfs_vn_create_on_upper_cleanup:
1339
vput(udvp);
1340
return (error);
1341
}
1342
1343
/*
1344
* Copy from lvp to uvp.
1345
*
1346
* lvp and uvp should be locked and opened on entry and will be locked and
1347
* opened on return.
1348
*/
1349
static int
1350
unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp,
1351
struct ucred *cred, struct thread *td)
1352
{
1353
char *buf;
1354
struct uio uio;
1355
struct iovec iov;
1356
off_t offset;
1357
int count;
1358
int error;
1359
int bufoffset;
1360
1361
error = 0;
1362
memset(&uio, 0, sizeof(uio));
1363
1364
uio.uio_td = td;
1365
uio.uio_segflg = UIO_SYSSPACE;
1366
uio.uio_offset = 0;
1367
1368
buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
1369
1370
while (error == 0) {
1371
offset = uio.uio_offset;
1372
1373
uio.uio_iov = &iov;
1374
uio.uio_iovcnt = 1;
1375
iov.iov_base = buf;
1376
iov.iov_len = MAXBSIZE;
1377
uio.uio_resid = iov.iov_len;
1378
uio.uio_rw = UIO_READ;
1379
1380
if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0)
1381
break;
1382
if ((count = MAXBSIZE - uio.uio_resid) == 0)
1383
break;
1384
1385
bufoffset = 0;
1386
while (bufoffset < count) {
1387
uio.uio_iov = &iov;
1388
uio.uio_iovcnt = 1;
1389
iov.iov_base = buf + bufoffset;
1390
iov.iov_len = count - bufoffset;
1391
uio.uio_offset = offset + bufoffset;
1392
uio.uio_resid = iov.iov_len;
1393
uio.uio_rw = UIO_WRITE;
1394
1395
if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0)
1396
break;
1397
1398
bufoffset += (count - bufoffset) - uio.uio_resid;
1399
}
1400
1401
uio.uio_offset = offset + bufoffset;
1402
}
1403
1404
free(buf, M_TEMP);
1405
1406
return (error);
1407
}
1408
1409
/*
1410
* Copy file from lower to upper.
1411
*
1412
* If the file contents should also be copied, set docopy to 1; otherwise set
414
* it to 0.
1414
*
1415
* vp is a unionfs vnode that should be locked on entry and will be
1416
* locked on return.
1417
*
1418
* If no error returned, unp will be updated.
1419
*/
1420
int
1421
unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred,
1422
struct thread *td)
1423
{
1424
struct unionfs_node *unp;
1425
struct unionfs_node *dunp;
1426
struct mount *mp;
1427
struct vnode *udvp;
1428
struct vnode *lvp;
1429
struct vnode *uvp;
1430
struct vattr uva;
1431
int error;
1432
1433
ASSERT_VOP_ELOCKED(vp, __func__);
1434
unp = VTOUNIONFS(vp);
1435
lvp = unp->un_lowervp;
1436
uvp = NULL;
1437
1438
if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY))
1439
return (EROFS);
1440
if (unp->un_dvp == NULL)
1441
return (EINVAL);
1442
if (unp->un_uppervp != NULL)
1443
return (EEXIST);
1444
1445
udvp = NULL;
1446
VI_LOCK(unp->un_dvp);
1447
dunp = VTOUNIONFS(unp->un_dvp);
1448
if (dunp != NULL)
1449
udvp = dunp->un_uppervp;
1450
VI_UNLOCK(unp->un_dvp);
1451
1452
if (udvp == NULL)
1453
return (EROFS);
1454
if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
1455
return (EROFS);
1456
ASSERT_VOP_UNLOCKED(udvp, __func__);
1457
1458
error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
1459
if (error == EJUSTRETURN)
1460
return (0);
1461
else if (error != 0)
1462
return (error);
1463
1464
error = VOP_ACCESS(lvp, VREAD, cred, td);
1465
if (error != 0)
1466
goto unionfs_copyfile_cleanup;
1467
1468
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
1469
goto unionfs_copyfile_cleanup;
1470
error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td);
1471
if (error != 0) {
1472
vn_finished_write(mp);
1473
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1474
goto unionfs_copyfile_cleanup;
1475
}
1476
1477
/*
1478
* Note that it's still possible for e.g. VOP_WRITE to relock
1479
* uvp below while holding vp[=lvp] locked. Replacing
1480
* unionfs_copyfile_core with vn_generic_copy_file_range() will
1481
* allow us to avoid the problem by moving this vn_lock_pair()
1482
* call much later.
1483
*/
1484
vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
1485
unp = VTOUNIONFS(vp);
1486
if (unp == NULL) {
1487
error = ENOENT;
1488
goto unionfs_copyfile_cleanup;
1489
}
1490
1491
if (docopy != 0) {
1492
error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
1493
if (error == 0) {
1494
error = unionfs_copyfile_core(lvp, uvp, cred, td);
1495
VOP_CLOSE(lvp, FREAD, cred, td);
1496
}
1497
}
1498
VOP_CLOSE(uvp, FWRITE, cred, td);
1499
VOP_ADD_WRITECOUNT_CHECKED(uvp, -1);
1500
CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
1501
__func__, uvp, uvp->v_writecount);
1502
1503
vn_finished_write(mp);
1504
1505
if (error == 0) {
1506
/* Reset the attributes. Ignore errors. */
1507
uva.va_type = VNON;
1508
VOP_SETATTR(uvp, &uva, cred);
1509
unionfs_node_update(unp, uvp, td);
1510
}
1511
1512
unionfs_copyfile_cleanup:
1513
unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
1514
return (error);
1515
}
1516
1517
/*
1518
* Determine if the unionfs view of a directory is empty such that
1519
* an rmdir operation can be permitted.
1520
*
1521
* We assume the VOP_RMDIR() against the upper layer vnode will take
1522
* care of this check for us where the upper FS is concerned, so here
1523
* we concentrate on the lower FS. We need to check for the presence
1524
* of files other than "." and ".." in the lower FS directory and
1525
* then cross-check any files we find against the upper FS to see if
1526
* a whiteout is present (in which case we treat the lower file as
1527
* non-present).
1528
*
1529
* The logic here is based heavily on vn_dir_check_empty().
1530
*
1531
* vp should be a locked unionfs node, and vp's lowervp should also be
1532
* locked.
1533
*/
1534
int
1535
unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
1536
{
1537
struct vnode *uvp;
1538
struct vnode *lvp;
1539
struct vnode *tvp;
1540
char *dirbuf;
1541
size_t dirbuflen, len;
1542
off_t off;
1543
struct dirent *dp;
1544
struct componentname cn;
1545
struct vattr va;
1546
int error;
1547
int eofflag;
1548
1549
eofflag = 0;
1550
lvp = UNIONFSVPTOLOWERVP(vp);
1551
uvp = UNIONFSVPTOUPPERVP(vp);
1552
1553
/*
1554
* Note that the locking here still isn't ideal: We expect the caller
1555
* to hold both the upper and lower layer locks as well as the upper
1556
* parent directory lock, which it can do in a manner that avoids
1557
* deadlock. However, if the cross-check logic below needs to call
1558
* VOP_LOOKUP(), that may relock the upper vnode and lock any found
1559
* child vnode in a way that doesn't protect against deadlock given
1560
* the other held locks. Beyond that, the various other VOPs we issue
1561
* below, such as VOP_OPEN() and VOP_READDIR(), may also re-lock the
1562
* lower vnode.
1563
* We might instead just hand off between the upper vnode lock
1564
* (and its parent directory lock) and the lower vnode lock as needed,
1565
* so that the lower lock is never held at the same time as the upper
1566
* locks, but that opens up a wider window in which the upper
1567
* directory (and also the lower directory if it isn't truly
1568
* read-only) may change while the relevant lock is dropped. But
1569
* since re-locking may happen here and open up such a window anyway,
1570
* perhaps that is a worthwhile tradeoff? Or perhaps we can ultimately
1571
* do sufficient tracking of empty state within the unionfs vnode
1572
* (in conjunction with upcalls from the lower FSes to notify us
1573
* of out-of-band state changes) that we can avoid these costly checks
1574
* altogether.
1575
*/
1576
ASSERT_VOP_LOCKED(lvp, __func__);
1577
ASSERT_VOP_ELOCKED(uvp, __func__);
1578
1579
if ((error = VOP_GETATTR(uvp, &va, cred)) != 0)
1580
return (error);
1581
if (va.va_flags & OPAQUE)
1582
return (0);
1583
1584
#ifdef MAC
1585
if ((error = mac_vnode_check_open(cred, lvp, VEXEC | VREAD)) != 0)
1586
return (error);
1587
#endif
1588
if ((error = VOP_ACCESS(lvp, VEXEC | VREAD, cred, td)) != 0)
1589
return (error);
1590
if ((error = VOP_OPEN(lvp, FREAD, cred, td, NULL)) != 0)
1591
return (error);
1592
if ((error = VOP_GETATTR(lvp, &va, cred)) != 0)
1593
return (error);
1594
1595
dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
1596
if (dirbuflen < va.va_blocksize)
1597
dirbuflen = va.va_blocksize;
1598
dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
1599
1600
len = 0;
1601
off = 0;
1602
eofflag = 0;
1603
1604
for (;;) {
1605
error = vn_dir_next_dirent(lvp, td, dirbuf, dirbuflen,
1606
&dp, &len, &off, &eofflag);
1607
if (error != 0)
1608
break;
1609
1610
if (len == 0) {
1611
/* EOF */
1612
error = 0;
1613
break;
1614
}
1615
1616
if (dp->d_type == DT_WHT)
1617
continue;
1618
1619
/*
1620
* Any file in the directory which is not '.' or '..' indicates
1621
* the directory is not empty.
1622
*/
1623
switch (dp->d_namlen) {
1624
case 2:
1625
if (dp->d_name[1] != '.') {
1626
/* Can't be '..' (nor '.') */
1627
break;
1628
}
1629
/* FALLTHROUGH */
1630
case 1:
1631
if (dp->d_name[0] != '.') {
1632
/* Can't be '..' nor '.' */
1633
break;
1634
}
1635
continue;
1636
default:
1637
break;
1638
}
1639
1640
cn.cn_namelen = dp->d_namlen;
1641
cn.cn_pnbuf = NULL;
1642
cn.cn_nameptr = dp->d_name;
1643
cn.cn_nameiop = LOOKUP;
1644
cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
1645
cn.cn_lkflags = LK_EXCLUSIVE;
1646
cn.cn_cred = cred;
1647
1648
error = VOP_LOOKUP(uvp, &tvp, &cn);
1649
if (tvp != NULL)
1650
vput(tvp);
1651
if (error != 0 && error != ENOENT && error != EJUSTRETURN)
1652
break;
1653
else if ((cn.cn_flags & ISWHITEOUT) == 0) {
1654
error = ENOTEMPTY;
1655
break;
1656
} else
1657
error = 0;
1658
}
1659
1660
VOP_CLOSE(lvp, FREAD, cred, td);
1661
free(dirbuf, M_TEMP);
1662
return (error);
1663
}
1664
1665