Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/random.h>
#include <sys/policy.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_vnops.h>
#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/atomic.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>

/*
 * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
 * of names after deciding which is the appropriate lookup interface.
 */
static int
zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
    matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp,
    uint64_t *zoid)
{
	boolean_t conflict = B_FALSE;
	int error;

	if (zfsvfs->z_norm) {
		size_t bufsz = 0;
		char *buf = NULL;

		if (rpnp) {
			buf = rpnp->pn_buf;
			bufsz = rpnp->pn_bufsize;
		}

		/*
		 * In the non-mixed case we only expect there would ever
		 * be one match, but we need to use the normalizing lookup.
		 */
		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
		    zoid, mt, buf, bufsz, &conflict);
	} else {
		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
	}

	/*
	 * Allow multiple entries provided the first entry is
	 * the object id.  Non-zpl consumers may safely make
	 * use of the additional space.
	 *
	 * XXX: This should be a feature flag for compatibility
	 */
	if (error == EOVERFLOW)
		error = 0;

	if (zfsvfs->z_norm && !error && deflags)
		*deflags = conflict ? ED_CASE_CONFLICT : 0;

	*zoid = ZFS_DIRENT_OBJ(*zoid);

	return (error);
}

/*
 * Lock a directory entry.  A dirlock on <dzp, name> protects that name
 * in dzp's directory zap object.  As long as you hold a dirlock, you can
 * assume two things: (1) dzp cannot be reaped, and (2) no other thread
 * can change the zap entry for (i.e. link or unlink) this name.
 *
 * Input arguments:
 *	dzp	- znode for directory
 *	name	- name of entry to lock
 *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
 *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
 *		  ZSHARED: allow concurrent access with other ZSHARED callers.
 *		  ZXATTR: we want dzp's xattr directory
 *		  ZCILOOK: On a mixed sensitivity file system,
 *			   this lookup should be case-insensitive.
 *		  ZCIEXACT: On a purely case-insensitive file system,
 *			    this lookup should be case-sensitive.
 *		  ZRENAMING: we are locking for renaming, force narrow locks
 *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
 *			     current thread already holds it.
 *
 * Output arguments:
 *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
 *	dlpp	- pointer to the dirlock for this entry (NULL on error)
 *	direntflags - (case-insensitive lookup only)
 *		flags if multiple case-sensitive matches exist in directory
 *	realpnp     - (case-insensitive lookup only)
 *		actual name matched within the directory
 *
 * Return value: 0 on success or errno on failure.
 *
 * NOTE: Always checks for, and rejects, '.' and '..'.
 * NOTE: For case-insensitive file systems we take wide locks (see below),
 *	 but return znode pointers to a single match.
 */
int
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name,
    znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zfs_dirlock_t *dl;
	boolean_t update;
	matchtype_t mt = 0;
	uint64_t zoid;
	int error = 0;
	int cmpflags;

	*zpp = NULL;
	*dlpp = NULL;

	/*
	 * Verify that we are not trying to lock '.', '..', or '.zfs'
	 */
	if ((name[0] == '.' &&
	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
		return (SET_ERROR(EEXIST));

	/*
	 * Case sensitivity and normalization preferences are set when
	 * the file system is created.  These are stored in the
	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
	 * affect what vnodes can be cached in the DNLC, how we
	 * perform zap lookups, and the "width" of our dirlocks.
	 *
	 * A normal dirlock locks a single name.  Note that with
	 * normalization a name can be composed multiple ways, but
	 * when normalized, these names all compare equal.  A wide
	 * dirlock locks multiple names.  We need these when the file
	 * system is supporting mixed-mode access.  It is sometimes
	 * necessary to lock all case permutations of file name at
	 * once so that simultaneous case-insensitive/case-sensitive
	 * behaves as rationally as possible.
	 */

	/*
	 * When matching we may need to normalize & change case according to
	 * FS settings.
	 *
	 * Note that a normalized match is necessary for a case insensitive
	 * filesystem when the lookup request is not exact because normalization
	 * can fold case independent of normalizing code point sequences.
	 *
	 * See the table above zfs_dropname().
	 */
	if (zfsvfs->z_norm != 0) {
		mt = MT_NORMALIZE;

		/*
		 * Determine if the match needs to honor the case specified in
		 * lookup, and if so keep track of that so that during
		 * normalization we don't fold case.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
		    (flag & ZCIEXACT)) ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
			mt |= MT_MATCH_CASE;
		}
	}

	/*
	 * Only look in or update the DNLC if we are looking for the
	 * name on a file system that does not require normalization
	 * or case folding.  We can also look there if we happen to be
	 * on a non-normalizing, mixed sensitivity file system IF we
	 * are looking for the exact name.
	 *
	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
	 * case for performance improvement?
	 */
	update = !zfsvfs->z_norm ||
	    (zfsvfs->z_case == ZFS_CASE_MIXED &&
	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));

	/*
	 * ZRENAMING indicates we are in a situation where we should
	 * take narrow locks regardless of the file system's
	 * preferences for normalizing and case folding.  This will
	 * prevent us deadlocking trying to grab the same wide lock
	 * twice if the two names happen to be case-insensitive
	 * matches.
	 */
	if (flag & ZRENAMING)
		cmpflags = 0;
	else
		cmpflags = zfsvfs->z_norm;

	/*
	 * Wait until there are no locks on this name.
	 *
	 * Don't grab the lock if it is already held. However, cannot
	 * have both ZSHARED and ZHAVELOCK together.
	 */
	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
	if (!(flag & ZHAVELOCK))
		rw_enter(&dzp->z_name_lock, RW_READER);

	mutex_enter(&dzp->z_lock);
	for (;;) {
		if (dzp->z_unlinked && !(flag & ZXATTR)) {
			mutex_exit(&dzp->z_lock);
			if (!(flag & ZHAVELOCK))
				rw_exit(&dzp->z_name_lock);
			return (SET_ERROR(ENOENT));
		}
		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
				break;
		}
		if (error != 0) {
			mutex_exit(&dzp->z_lock);
			if (!(flag & ZHAVELOCK))
				rw_exit(&dzp->z_name_lock);
			return (SET_ERROR(ENOENT));
		}
		if (dl == NULL) {
			/*
			 * Allocate a new dirlock and add it to the list.
			 */
			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
			dl->dl_name = name;
			dl->dl_sharecnt = 0;
			dl->dl_namelock = 0;
			dl->dl_namesize = 0;
			dl->dl_dzp = dzp;
			dl->dl_next = dzp->z_dirlocks;
			dzp->z_dirlocks = dl;
			break;
		}
		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
			break;
		cv_wait(&dl->dl_cv, &dzp->z_lock);
	}

	/*
	 * If the z_name_lock was NOT held for this dirlock, record it.
	 */
	if (flag & ZHAVELOCK)
		dl->dl_namelock = 1;

	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
		/*
		 * We're the second shared reference to dl.  Make a copy of
		 * dl_name in case the first thread goes away before we do.
		 * Note that we initialize the new name before storing its
		 * pointer into dl_name, because the first thread may load
		 * dl->dl_name at any time.  It'll either see the old value,
		 * which belongs to it, or the new shared copy; either is OK.
		 */
		dl->dl_namesize = strlen(dl->dl_name) + 1;
		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
		memcpy(name, dl->dl_name, dl->dl_namesize);
		dl->dl_name = name;
	}

	mutex_exit(&dzp->z_lock);

	/*
	 * We have a dirlock on the name.  (Note that it is the dirlock,
	 * not the dzp's z_lock, that protects the name in the zap object.)
	 * See if there's an object by this name; if so, put a hold on it.
	 */
	if (flag & ZXATTR) {
		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
		    sizeof (zoid));
		if (error == 0)
			error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
	} else {
		error = zfs_match_find(zfsvfs, dzp, name, mt,
		    update, direntflags, realpnp, &zoid);
	}
	if (error) {
		if (error != ENOENT || (flag & ZEXISTS)) {
			zfs_dirent_unlock(dl);
			return (error);
		}
	} else {
		if (flag & ZNEW) {
			zfs_dirent_unlock(dl);
			return (SET_ERROR(EEXIST));
		}
		error = zfs_zget(zfsvfs, zoid, zpp);
		if (error) {
			zfs_dirent_unlock(dl);
			return (error);
		}
	}

	*dlpp = dl;

	return (0);
}

/*
 * Unlock this directory entry and wake anyone who was waiting for it.
 */
void
zfs_dirent_unlock(zfs_dirlock_t *dl)
{
	znode_t *dzp = dl->dl_dzp;
	zfs_dirlock_t **prev_dl, *cur_dl;

	mutex_enter(&dzp->z_lock);

	if (!dl->dl_namelock)
		rw_exit(&dzp->z_name_lock);

	if (dl->dl_sharecnt > 1) {
		dl->dl_sharecnt--;
		mutex_exit(&dzp->z_lock);
		return;
	}
	prev_dl = &dzp->z_dirlocks;
	while ((cur_dl = *prev_dl) != dl)
		prev_dl = &cur_dl->dl_next;
	*prev_dl = dl->dl_next;
	cv_broadcast(&dl->dl_cv);
	mutex_exit(&dzp->z_lock);

	if (dl->dl_namesize != 0)
		kmem_free(dl->dl_name, dl->dl_namesize);
	cv_destroy(&dl->dl_cv);
	kmem_free(dl, sizeof (*dl));
}

/*
 * Look up an entry in a directory.
 *
 * NOTE: '.' and '..' are handled as special cases because
 *	no directory entries are actually stored for them.  If this is
 *	the root of a filesystem, then '.zfs' is also treated as a
 *	special pseudo-directory.
 */
int
zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags,
    int *deflg, pathname_t *rpnp)
{
	zfs_dirlock_t *dl;
	znode_t *zp;
	struct inode *ip;
	int error = 0;
	uint64_t parent;

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		*zpp = dzp;
		zhold(*zpp);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		zfsvfs_t *zfsvfs = ZTOZSB(dzp);

		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the inode pointer for the snapshot directory.
		 */
		if ((error = sa_lookup(dzp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			return (error);

		if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
			    "snapshot", &ip, 0, kcred, NULL, NULL);
			*zpp = ITOZ(ip);
			return (error);
		}
		rw_enter(&dzp->z_parent_lock, RW_READER);
		error = zfs_zget(zfsvfs, parent, &zp);
		if (error == 0)
			*zpp = zp;
		rw_exit(&dzp->z_parent_lock);
	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
		if (ZTOZSB(dzp)->z_show_ctldir == ZFS_SNAPDIR_DISABLED) {
			return (SET_ERROR(ENOENT));
		}
		ip = zfsctl_root(dzp);
		*zpp = ITOZ(ip);
	} else {
		int zf;

		zf = ZEXISTS | ZSHARED;
		if (flags & FIGNORECASE)
			zf |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
		if (error == 0) {
			*zpp = zp;
			zfs_dirent_unlock(dl);
			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
		}
		rpnp = NULL;
	}

	if ((flags & FIGNORECASE) && rpnp && !error)
		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);

	return (error);
}

/*
 * unlinked Set (formerly known as the "delete queue") Error Handling
 *
 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
 * don't specify the name of the entry that we will be manipulating.  We
 * also fib and say that we won't be adding any new entries to the
 * unlinked set, even though we might (this is to lower the minimum file
 * size that can be deleted in a full filesystem).  So on the small
 * chance that the nlink list is using a fat zap (ie. has more than
 * 2000 entries), we *may* not pre-read a block that's needed.
 * Therefore it is remotely possible for some of the assertions
 * regarding the unlinked set below to fail due to i/o error.  On a
 * nondebug system, this will result in the space being leaked.
 */
void
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	ASSERT(zp->z_unlinked);
	ASSERT0(ZTOI(zp)->i_nlink);

	VERIFY3U(0, ==,
	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));

	dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
}

/*
 * Clean up any znodes that had no links when we either crashed or
 * (force) umounted the file system.
 */
static void
zfs_unlinked_drain_task(void *arg)
{
	zfsvfs_t *zfsvfs = arg;
	zap_cursor_t zc;
	zap_attribute_t *zap = zap_attribute_alloc();
	dmu_object_info_t doi;
	znode_t *zp;
	int error;

	ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);

	/*
	 * Iterate over the contents of the unlinked set.
	 */
	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
	    zap_cursor_retrieve(&zc, zap) == 0 && !zfsvfs->z_drain_cancel;
	    zap_cursor_advance(&zc)) {

		/*
		 * See what kind of object we have in list
		 */

		error = dmu_object_info(zfsvfs->z_os,
		    zap->za_first_integer, &doi);
		if (error != 0)
			continue;

		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
		/*
		 * We need to re-mark these list entries for deletion,
		 * so we pull them back into core and set zp->z_unlinked.
		 */
		error = zfs_zget(zfsvfs, zap->za_first_integer, &zp);

		/*
		 * We may pick up znodes that are already marked for deletion.
		 * This could happen during the purge of an extended attribute
		 * directory.  All we need to do is skip over them, since they
		 * are already in the system marked z_unlinked.
		 */
		if (error != 0)
			continue;

		zp->z_unlinked = B_TRUE;

		/*
		 * zrele() decrements the znode's ref count and may cause
		 * it to be synchronously freed. We interrupt freeing
		 * of this znode by checking the return value of
		 * dmu_objset_zfs_unmounting() in dmu_free_long_range()
		 * when an unmount is requested.
		 */
		zrele(zp);
		ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
	}
	zap_cursor_fini(&zc);

	zfsvfs->z_draining = B_FALSE;
	zfsvfs->z_drain_task = TASKQID_INVALID;
	zap_attribute_free(zap);
}

/*
 * Sets z_draining then tries to dispatch async unlinked drain.
 * If that fails executes synchronous unlinked drain.
 */
void
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
{
	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
	ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);

	zfsvfs->z_draining = B_TRUE;
	zfsvfs->z_drain_cancel = B_FALSE;

	zfsvfs->z_drain_task = taskq_dispatch(
	    dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
	    zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
	if (zfsvfs->z_drain_task == TASKQID_INVALID) {
		zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
		zfs_unlinked_drain_task(zfsvfs);
	}
}

/*
 * Wait for the unlinked drain taskq task to stop. This will interrupt the
 * unlinked set processing if it is in progress.
 */
void
zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
{
	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);

	if (zfsvfs->z_draining) {
		zfsvfs->z_drain_cancel = B_TRUE;
		taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
		    dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
		zfsvfs->z_drain_task = TASKQID_INVALID;
		zfsvfs->z_draining = B_FALSE;
	}
}

/*
 * Delete the entire contents of a directory.  Return a count
 * of the number of entries that could not be deleted. If we encounter
 * an error, return a count of at least one so that the directory stays
 * in the unlinked set.
 *
 * NOTE: this function assumes that the directory is inactive,
 *	so there is no need to lock its entries before deletion.
 *	Also, it assumes the directory contents is *only* regular
 *	files.
 */
static int
zfs_purgedir(znode_t *dzp)
{
	zap_cursor_t zc;
	zap_attribute_t *zap = zap_attribute_alloc();
	znode_t *xzp;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zfs_dirlock_t dl;
	int skipped = 0;
	int error;

	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
	    (error = zap_cursor_retrieve(&zc, zap)) == 0;
	    zap_cursor_advance(&zc)) {
		error = zfs_zget(zfsvfs,
		    ZFS_DIRENT_OBJ(zap->za_first_integer), &xzp);
		if (error) {
			skipped += 1;
			continue;
		}

		ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
		    S_ISLNK(ZTOI(xzp)->i_mode));

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap->za_name);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
		/* Is this really needed ? */
		zfs_sa_upgrade_txholds(tx, xzp);
		dmu_tx_mark_netfree(tx);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			zfs_zrele_async(xzp);
			skipped += 1;
			continue;
		}
		memset(&dl, 0, sizeof (dl));
		dl.dl_dzp = dzp;
		dl.dl_name = zap->za_name;

		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
		if (error)
			skipped += 1;
		dmu_tx_commit(tx);

		zfs_zrele_async(xzp);
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);
	if (error != ENOENT)
		skipped += 1;
	return (skipped);
}

void
zfs_rmnode(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	znode_t *xzp = NULL;
	dmu_tx_t *tx;
	znode_hold_t *zh;
	uint64_t z_id = zp->z_id;
	uint64_t acl_obj;
	uint64_t xattr_obj;
	uint64_t links;
	int error;

	ASSERT0(ZTOI(zp)->i_nlink);
	ASSERT0(atomic_read(&ZTOI(zp)->i_count));

	/*
	 * If this is an attribute directory, purge its contents.
	 */
	if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
		if (zfs_purgedir(zp) != 0) {
			/*
			 * Not enough space to delete some xattrs.
			 * Leave it in the unlinked set.
			 */
			zh = zfs_znode_hold_enter(zfsvfs, z_id);
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return;
		}
	}

	/*
	 * Free up all the data in the file.  We don't do this for directories
	 * because we need truncate and remove to be in the same tx, like in
	 * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
	 * an inconsistent truncated zap object in the delete queue.  Note a
	 * truncated file is harmless since it only contains user data.
	 */
	if (S_ISREG(ZTOI(zp)->i_mode)) {
		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
		if (error) {
			/*
			 * Not enough space or we were interrupted by unmount.
			 * Leave the file in the unlinked set.
			 */
			zh = zfs_znode_hold_enter(zfsvfs, z_id);
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return;
		}
	}

	/*
	 * If the file has extended attributes, we're going to unlink
	 * the xattr dir.
	 */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	acl_obj = zfs_external_acl(zp);

	/*
	 * Set up the final transaction.
	 */
	tx = dmu_tx_create(os);
	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	if (xzp) {
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}
	if (acl_obj)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		/*
		 * Not enough space to delete the file.  Leave it in the
		 * unlinked set, leaking it until the fs is remounted (at
		 * which point we'll call zfs_unlinked_drain() to process it).
		 */
		dmu_tx_abort(tx);
		zh = zfs_znode_hold_enter(zfsvfs, z_id);
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		goto out;
	}

	if (xzp) {
		ASSERT0(error);
		mutex_enter(&xzp->z_lock);
		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
		clear_nlink(ZTOI(xzp));		/* no more links to it */
		links = 0;
		VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
		    &links, sizeof (links), tx));
		mutex_exit(&xzp->z_lock);
		zfs_unlinked_add(xzp, tx);
	}

	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);

	/*
	 * Remove this znode from the unlinked set.  If a rollback has
	 * occurred while a file is open and unlinked, then when the file
	 * is closed post-rollback it will not exist in the rolled back
	 * version of the unlinked object.
	 */
	error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
	    zp->z_id, tx);
	VERIFY(error == 0 || error == ENOENT);

	uint64_t count;
	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
	}

	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);

	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);

	zfs_znode_delete(zp, tx);

	dmu_tx_commit(tx);
out:
	if (xzp)
		zfs_zrele_async(xzp);
}

static uint64_t
zfs_dirent(znode_t *zp, uint64_t mode)
{
	uint64_t de = zp->z_id;

	if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
		de |= IFTODT(mode) << 60;
	return (de);
}

/*
 * Link zp into dl.  Can fail in the following cases:
 * - if zp has been unlinked.
 * - if the number of entries with the same hash (aka. colliding entries)
 *    exceeds the capacity of a leaf-block of fatzap and splitting of the
 *    leaf-block does not help.
 */
int
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
{
	znode_t *dzp = dl->dl_dzp;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t value;
	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
	sa_bulk_attr_t bulk[5];
	uint64_t mtime[2], ctime[2];
	uint64_t links;
	int count = 0;
	int error;

	mutex_enter(&zp->z_lock);

	if (!(flag & ZRENAMING)) {
		if (zp->z_unlinked) {	/* no new links to unlinked zp */
			ASSERT(!(flag & (ZNEW | ZEXISTS)));
			mutex_exit(&zp->z_lock);
			return (SET_ERROR(ENOENT));
		}
		if (!(flag & ZNEW)) {
			/*
			 * ZNEW nodes come from zfs_mknode() where the link
			 * count has already been initialised
			 */
			inc_nlink(ZTOI(zp));
			links = ZTOI(zp)->i_nlink;
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
			    NULL, &links, sizeof (links));
		}
	}

	value = zfs_dirent(zp, zp->z_mode);
	error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
	    &value, tx);

	/*
	 * zap_add could fail to add the entry if it exceeds the capacity of the
	 * leaf-block and zap_leaf_split() failed to help.
	 * The caller of this routine is responsible for failing the transaction
	 * which will rollback the SA updates done above.
	 */
	if (error != 0) {
		if (!(flag & ZRENAMING) && !(flag & ZNEW))
			drop_nlink(ZTOI(zp));
		mutex_exit(&zp->z_lock);
		return (error);
	}

	/*
	 * If we added a longname activate the SPA_FEATURE_LONGNAME.
	 */
	if (strlen(dl->dl_name) >= ZAP_MAXNAMELEN) {
		dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
		ds->ds_feature_activation[SPA_FEATURE_LONGNAME] =
		    (void *)B_TRUE;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &dzp->z_id, sizeof (dzp->z_id));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	if (!(flag & ZNEW)) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
		    ctime);
	}
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	mutex_exit(&zp->z_lock);

	mutex_enter(&dzp->z_lock);
	dzp->z_size++;
	if (zp_is_dir)
		inc_nlink(ZTOI(dzp));
	links = ZTOI(dzp)->i_nlink;
	count = 0;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &dzp->z_size, sizeof (dzp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    ctime, sizeof (ctime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &dzp->z_pflags, sizeof (dzp->z_pflags));
	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);
	mutex_exit(&dzp->z_lock);

	return (0);
}

/*
 * The match type in the code for this function should conform to:
 *
 * ------------------------------------------------------------------------
 * fs type  | z_norm      | lookup type | match type
 * ---------|-------------|-------------|----------------------------------
 * CS !norm | 0           |           0 | 0 (exact)
 * CS  norm | formX       |           0 | MT_NORMALIZE
 * CI !norm | upper       |   !ZCIEXACT | MT_NORMALIZE
 * CI !norm | upper       |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
 * CI  norm | upper|formX |   !ZCIEXACT | MT_NORMALIZE
 * CI  norm | upper|formX |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
 * CM !norm | upper       |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
 * CM !norm | upper       |     ZCILOOK | MT_NORMALIZE
 * CM  norm | upper|formX |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
 * CM  norm | upper|formX |     ZCILOOK | MT_NORMALIZE
 *
 * Abbreviations:
 *    CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
 *    upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
 *    formX = unicode normalization form set on fs creation
 */
static int
zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
    int flag)
{
	int error;

	if (ZTOZSB(zp)->z_norm) {
		matchtype_t mt = MT_NORMALIZE;

		if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
		    (flag & ZCIEXACT)) ||
		    (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
		    !(flag & ZCILOOK))) {
			mt |= MT_MATCH_CASE;
		}

		error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
		    dl->dl_name, mt, tx);
	} else {
		error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
		    tx);
	}

	return (error);
}

static int
zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
	boolean_t unlinked = B_FALSE;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];
	uint64_t links;
	int count = 0;
	int error;

	if (zp_is_dir && !zfs_dirempty(zp))
		return (SET_ERROR(ENOTEMPTY));

	if (ZTOI(zp)->i_nlink <= zp_is_dir) {
		zfs_panic_recover("zfs: link count on %lu is %u, "
		    "should be at least %u", zp->z_id,
		    (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
		set_nlink(ZTOI(zp), zp_is_dir + 1);
	}
	drop_nlink(ZTOI(zp));
	if (ZTOI(zp)->i_nlink == zp_is_dir) {
		zp->z_unlinked = B_TRUE;
		clear_nlink(ZTOI(zp));
		unlinked = B_TRUE;
	} else {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, sizeof (ctime));
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
		    ctime);
	}
	links = ZTOI(zp)->i_nlink;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
	    NULL, &links, sizeof (links));
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	if (unlinkedp != NULL)
		*unlinkedp = unlinked;
	else if (unlinked)
		zfs_unlinked_add(zp, tx);

	return (0);
}

/*
 * Forcefully drop an nlink reference from (zp) and mark it for deletion if it
 * was the last link.  This *must* only be done to znodes which have already
 * been zfs_link_destroy()'d with ZRENAMING.  This is explicitly only used in
 * the error path of zfs_rename(), where we have to correct the nlink count if
 * we failed to link the target as well as failing to re-link the original
 * znodes.
 */
int
zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
{
	int error;

	mutex_enter(&zp->z_lock);
	error = zfs_drop_nlink_locked(zp, tx, unlinkedp);
	mutex_exit(&zp->z_lock);

	return (error);
}

/*
 * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
 * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
 * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
 * If it's non-NULL, we use it to indicate whether the znode needs deletion,
 * and it's the caller's job to do it.
 */
int
zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
    boolean_t *unlinkedp)
{
	znode_t *dzp = dl->dl_dzp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
	boolean_t unlinked = B_FALSE;
	sa_bulk_attr_t bulk[5];
	uint64_t mtime[2], ctime[2];
	uint64_t links;
	int count = 0;
	int error;

	if (!(flag & ZRENAMING)) {
		mutex_enter(&zp->z_lock);

		if (zp_is_dir && !zfs_dirempty(zp)) {
			mutex_exit(&zp->z_lock);
			return (SET_ERROR(ENOTEMPTY));
		}

		/*
		 * If we get here, we are going to try to remove the object.
		 * First try removing the name from the directory; if that
		 * fails, return the error.
		 */
		error = zfs_dropname(dl, zp, dzp, tx, flag);
		if (error != 0) {
			mutex_exit(&zp->z_lock);
			return (error);
		}

		/* The only error is !zfs_dirempty() and we checked earlier. */
		error = zfs_drop_nlink_locked(zp, tx, &unlinked);
		ASSERT0(error);
		mutex_exit(&zp->z_lock);
	} else {
		error = zfs_dropname(dl, zp, dzp, tx, flag);
		if (error != 0)
			return (error);
	}

	mutex_enter(&dzp->z_lock);
	dzp->z_size--;			/* one dirent removed */
	if (zp_is_dir)
		drop_nlink(ZTOI(dzp));	/* ".." link from zp */
	links = ZTOI(dzp)->i_nlink;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
	    NULL, &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &dzp->z_size, sizeof (dzp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
	    NULL, ctime, sizeof (ctime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
	    NULL, mtime, sizeof (mtime));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);
	mutex_exit(&dzp->z_lock);

	if (unlinkedp != NULL)
		*unlinkedp = unlinked;
	else if (unlinked)
		zfs_unlinked_add(zp, tx);

	return (0);
}

/*
 * Indicate whether the directory is empty.  Works with or without z_lock
 * held, but can only be considered a hint in the latter case.  Returns true
 * if only "." and ".." remain and there's no work in progress.
 *
 * The internal ZAP size, rather than zp->z_size, needs to be checked since
 * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
 */
boolean_t
zfs_dirempty(znode_t *dzp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	uint64_t count;
	int error;

	if (dzp->z_dirlocks != NULL)
		return (B_FALSE);

	error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
	if (error != 0 || count != 0)
		return (B_FALSE);

	return (B_TRUE);
}

int
zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	znode_t *xzp;
	dmu_tx_t *tx;
	int error;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
#ifdef ZFS_DEBUG
	uint64_t parent;
#endif

	*xzpp = NULL;

	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
	    &acl_ids, zfs_init_idmap)) != 0)
		return (error);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
		zfs_acl_ids_free(&acl_ids);
		return (SET_ERROR(EDQUOT));
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		return (error);
	}
	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

#ifdef ZFS_DEBUG
	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent));
	ASSERT(error == 0 && parent == zp->z_id);
#endif

	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
	    sizeof (xzp->z_id), tx));

	if (!zp->z_unlinked)
		zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL,
		    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	*xzpp = xzp;

	return (0);
}

/*
 * Return a znode for the extended attribute directory for zp.
 * ** If the directory does not already exist, it is created **
 *
 *	IN:	zp	- znode to obtain attribute directory from
 *		cr	- credentials of caller
 *		flags	- flags from the VOP_LOOKUP call
 *
 *	OUT:	xzpp	- pointer to extended attribute znode
 *
 *	RETURN:	0 on success
 *		error number on failure
 */
int
zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	znode_t *xzp;
	zfs_dirlock_t *dl;
	vattr_t va;
	int error;
top:
	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
	if (error)
		return (error);

	if (xzp != NULL) {
		*xzpp = xzp;
		zfs_dirent_unlock(dl);
		return (0);
	}

	if (!(flags & CREATE_XATTR_DIR)) {
		zfs_dirent_unlock(dl);
		return (SET_ERROR(ENOENT));
	}

	if (zfs_is_readonly(zfsvfs)) {
		zfs_dirent_unlock(dl);
		return (SET_ERROR(EROFS));
	}

	/*
	 * The ability to 'create' files in an attribute
	 * directory comes from the write_xattr permission on the base file.
	 *
	 * The ability to 'search' an attribute directory requires
	 * read_xattr permission on the base file.
	 *
	 * Once in a directory the ability to read/write attributes
	 * is controlled by the permissions on the attribute file.
	 */
	va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
	va.va_mode = S_IFDIR | S_ISVTX | 0777;
	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);

	va.va_dentry = NULL;
	error = zfs_make_xattrdir(zp, &va, xzpp, cr);
	zfs_dirent_unlock(dl);

	if (error == ERESTART) {
		/* NB: we already did dmu_tx_wait() if necessary */
		goto top;
	}

	return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.
 *
 * In sticky directories, write access is not sufficient;
 * you can remove entries from a directory only if:
 *
 *	you own the directory,
 *	you own the entry,
 *	you have write access to the entry,
 *	or you are privileged (checked in secpolicy...).
 *
 * The function returns 0 if remove access is granted.
 */
int
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
{
	uid_t uid;
	uid_t downer;
	uid_t fowner;
	zfsvfs_t *zfsvfs = ZTOZSB(zdp);

	if (zfsvfs->z_replay)
		return (0);

	if ((zdp->z_mode & S_ISVTX) == 0)
		return (0);

	downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
	    cr, ZFS_OWNER);
	fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
	    cr, ZFS_OWNER);

	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
	    zfs_init_idmap) == 0)
		return (0);
	else
		return (secpolicy_vnode_remove(cr));
}