CoCalc -- dnode.c

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/dnode.c
⁴⁸³⁸³ views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
 * CDDL HEADER START
4
 *
5
 * The contents of this file are subject to the terms of the
6
 * Common Development and Distribution License (the "License").
7
 * You may not use this file except in compliance with the License.
8
 *
9
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
 * or https://opensource.org/licenses/CDDL-1.0.
11
 * See the License for the specific language governing permissions
12
 * and limitations under the License.
13
 *
14
 * When distributing Covered Code, include this CDDL HEADER in each
15
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
 * If applicable, add the following below this CDDL HEADER, with the
17
 * fields enclosed by brackets "[]" replaced with your own identifying
18
 * information: Portions Copyright [yyyy] [name of copyright owner]
19
 *
20
 * CDDL HEADER END
21
 */
22
/*
23
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26
 */
27

28
#include <sys/zfs_context.h>
29
#include <sys/dbuf.h>
30
#include <sys/dnode.h>
31
#include <sys/dmu.h>
32
#include <sys/dmu_impl.h>
33
#include <sys/dmu_tx.h>
34
#include <sys/dmu_objset.h>
35
#include <sys/dsl_dir.h>
36
#include <sys/dsl_dataset.h>
37
#include <sys/spa.h>
38
#include <sys/zio.h>
39
#include <sys/dmu_zfetch.h>
40
#include <sys/range_tree.h>
41
#include <sys/trace_zfs.h>
42
#include <sys/zfs_project.h>
43

44
dnode_stats_t dnode_stats = {
45
	{ "dnode_hold_dbuf_hold",		KSTAT_DATA_UINT64 },
46
	{ "dnode_hold_dbuf_read",		KSTAT_DATA_UINT64 },
47
	{ "dnode_hold_alloc_hits",		KSTAT_DATA_UINT64 },
48
	{ "dnode_hold_alloc_misses",		KSTAT_DATA_UINT64 },
49
	{ "dnode_hold_alloc_interior",		KSTAT_DATA_UINT64 },
50
	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
51
	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
52
	{ "dnode_hold_alloc_type_none",		KSTAT_DATA_UINT64 },
53
	{ "dnode_hold_free_hits",		KSTAT_DATA_UINT64 },
54
	{ "dnode_hold_free_misses",		KSTAT_DATA_UINT64 },
55
	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
56
	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
57
	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
58
	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
59
	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
60
	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
61
	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
62
	{ "dnode_buf_evict",			KSTAT_DATA_UINT64 },
63
	{ "dnode_alloc_next_chunk",		KSTAT_DATA_UINT64 },
64
	{ "dnode_alloc_race",			KSTAT_DATA_UINT64 },
65
	{ "dnode_alloc_next_block",		KSTAT_DATA_UINT64 },
66
	{ "dnode_move_invalid",			KSTAT_DATA_UINT64 },
67
	{ "dnode_move_recheck1",		KSTAT_DATA_UINT64 },
68
	{ "dnode_move_recheck2",		KSTAT_DATA_UINT64 },
69
	{ "dnode_move_special",			KSTAT_DATA_UINT64 },
70
	{ "dnode_move_handle",			KSTAT_DATA_UINT64 },
71
	{ "dnode_move_rwlock",			KSTAT_DATA_UINT64 },
72
	{ "dnode_move_active",			KSTAT_DATA_UINT64 },
73
};
74

75
dnode_sums_t dnode_sums;
76

77
static kstat_t *dnode_ksp;
78
static kmem_cache_t *dnode_cache;
79

80
static dnode_phys_t dnode_phys_zero __maybe_unused;
81

82
int zfs_default_bs = SPA_MINBLOCKSHIFT;
83
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
84

85
#ifdef	_KERNEL
86
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
87
#endif /* _KERNEL */
88

89
static char *
90
rt_name(dnode_t *dn, const char *name)
91
{
92
	struct objset *os = dn->dn_objset;
93

94
	return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
95
	    spa_name(os->os_spa),
96
	    (u_longlong_t)(os->os_dsl_dataset ?
97
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
98
	    (u_longlong_t)dn->dn_object,
99
	    name));
100
}
101

102
static int
103
dbuf_compare(const void *x1, const void *x2)
104
{
105
	const dmu_buf_impl_t *d1 = x1;
106
	const dmu_buf_impl_t *d2 = x2;
107

108
	int cmp = TREE_CMP(d1->db_level, d2->db_level);
109
	if (likely(cmp))
110
		return (cmp);
111

112
	cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
113
	if (likely(cmp))
114
		return (cmp);
115

116
	if (d1->db_state == DB_MARKER) {
117
		ASSERT3S(d2->db_state, !=, DB_MARKER);
118
		return (TREE_PCMP(d1->db_parent, d2));
119
	} else if (d2->db_state == DB_MARKER) {
120
		ASSERT3S(d1->db_state, !=, DB_MARKER);
121
		return (TREE_PCMP(d1, d2->db_parent));
122
	}
123

124
	if (d1->db_state == DB_SEARCH) {
125
		ASSERT3S(d2->db_state, !=, DB_SEARCH);
126
		return (-1);
127
	} else if (d2->db_state == DB_SEARCH) {
128
		ASSERT3S(d1->db_state, !=, DB_SEARCH);
129
		return (1);
130
	}
131

132
	return (TREE_PCMP(d1, d2));
133
}
134

135
static int
136
dnode_cons(void *arg, void *unused, int kmflag)
137
{
138
	(void) unused, (void) kmflag;
139
	dnode_t *dn = arg;
140

141
	rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
142
	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
143
	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
144
	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
145
	cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);
146

147
	/*
148
	 * Every dbuf has a reference, and dropping a tracked reference is
149
	 * O(number of references), so don't track dn_holds.
150
	 */
151
	zfs_refcount_create_untracked(&dn->dn_holds);
152
	zfs_refcount_create(&dn->dn_tx_holds);
153
	list_link_init(&dn->dn_link);
154

155
	memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type));
156
	memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr));
157
	memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels));
158
	memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift));
159
	memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype));
160
	memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk));
161
	memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen));
162
	memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz));
163
	memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid));
164

165
	for (int i = 0; i < TXG_SIZE; i++) {
166
		multilist_link_init(&dn->dn_dirty_link[i]);
167
		dn->dn_free_ranges[i] = NULL;
168
		list_create(&dn->dn_dirty_records[i],
169
		    sizeof (dbuf_dirty_record_t),
170
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
171
	}
172

173
	dn->dn_allocated_txg = 0;
174
	dn->dn_free_txg = 0;
175
	dn->dn_assigned_txg = 0;
176
	dn->dn_dirtycnt = 0;
177
	dn->dn_bonus = NULL;
178
	dn->dn_have_spill = B_FALSE;
179
	dn->dn_zio = NULL;
180
	dn->dn_oldused = 0;
181
	dn->dn_oldflags = 0;
182
	dn->dn_olduid = 0;
183
	dn->dn_oldgid = 0;
184
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
185
	dn->dn_newuid = 0;
186
	dn->dn_newgid = 0;
187
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
188
	dn->dn_id_flags = 0;
189

190
	dn->dn_dbufs_count = 0;
191
	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
192
	    offsetof(dmu_buf_impl_t, db_link));
193

194
	dn->dn_moved = 0;
195
	return (0);
196
}
197

198
static void
199
dnode_dest(void *arg, void *unused)
200
{
201
	(void) unused;
202
	dnode_t *dn = arg;
203

204
	rw_destroy(&dn->dn_struct_rwlock);
205
	mutex_destroy(&dn->dn_mtx);
206
	mutex_destroy(&dn->dn_dbufs_mtx);
207
	cv_destroy(&dn->dn_notxholds);
208
	cv_destroy(&dn->dn_nodnholds);
209
	zfs_refcount_destroy(&dn->dn_holds);
210
	zfs_refcount_destroy(&dn->dn_tx_holds);
211
	ASSERT(!list_link_active(&dn->dn_link));
212

213
	for (int i = 0; i < TXG_SIZE; i++) {
214
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
215
		ASSERT0P(dn->dn_free_ranges[i]);
216
		list_destroy(&dn->dn_dirty_records[i]);
217
		ASSERT0(dn->dn_next_nblkptr[i]);
218
		ASSERT0(dn->dn_next_nlevels[i]);
219
		ASSERT0(dn->dn_next_indblkshift[i]);
220
		ASSERT0(dn->dn_next_bonustype[i]);
221
		ASSERT0(dn->dn_rm_spillblk[i]);
222
		ASSERT0(dn->dn_next_bonuslen[i]);
223
		ASSERT0(dn->dn_next_blksz[i]);
224
		ASSERT0(dn->dn_next_maxblkid[i]);
225
	}
226

227
	ASSERT0(dn->dn_allocated_txg);
228
	ASSERT0(dn->dn_free_txg);
229
	ASSERT0(dn->dn_assigned_txg);
230
	ASSERT0(dn->dn_dirtycnt);
231
	ASSERT0P(dn->dn_bonus);
232
	ASSERT(!dn->dn_have_spill);
233
	ASSERT0P(dn->dn_zio);
234
	ASSERT0(dn->dn_oldused);
235
	ASSERT0(dn->dn_oldflags);
236
	ASSERT0(dn->dn_olduid);
237
	ASSERT0(dn->dn_oldgid);
238
	ASSERT0(dn->dn_oldprojid);
239
	ASSERT0(dn->dn_newuid);
240
	ASSERT0(dn->dn_newgid);
241
	ASSERT0(dn->dn_newprojid);
242
	ASSERT0(dn->dn_id_flags);
243

244
	ASSERT0(dn->dn_dbufs_count);
245
	avl_destroy(&dn->dn_dbufs);
246
}
247

248
static int
249
dnode_kstats_update(kstat_t *ksp, int rw)
250
{
251
	dnode_stats_t *ds = ksp->ks_data;
252

253
	if (rw == KSTAT_WRITE)
254
		return (EACCES);
255
	ds->dnode_hold_dbuf_hold.value.ui64 =
256
	    wmsum_value(&dnode_sums.dnode_hold_dbuf_hold);
257
	ds->dnode_hold_dbuf_read.value.ui64 =
258
	    wmsum_value(&dnode_sums.dnode_hold_dbuf_read);
259
	ds->dnode_hold_alloc_hits.value.ui64 =
260
	    wmsum_value(&dnode_sums.dnode_hold_alloc_hits);
261
	ds->dnode_hold_alloc_misses.value.ui64 =
262
	    wmsum_value(&dnode_sums.dnode_hold_alloc_misses);
263
	ds->dnode_hold_alloc_interior.value.ui64 =
264
	    wmsum_value(&dnode_sums.dnode_hold_alloc_interior);
265
	ds->dnode_hold_alloc_lock_retry.value.ui64 =
266
	    wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry);
267
	ds->dnode_hold_alloc_lock_misses.value.ui64 =
268
	    wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses);
269
	ds->dnode_hold_alloc_type_none.value.ui64 =
270
	    wmsum_value(&dnode_sums.dnode_hold_alloc_type_none);
271
	ds->dnode_hold_free_hits.value.ui64 =
272
	    wmsum_value(&dnode_sums.dnode_hold_free_hits);
273
	ds->dnode_hold_free_misses.value.ui64 =
274
	    wmsum_value(&dnode_sums.dnode_hold_free_misses);
275
	ds->dnode_hold_free_lock_misses.value.ui64 =
276
	    wmsum_value(&dnode_sums.dnode_hold_free_lock_misses);
277
	ds->dnode_hold_free_lock_retry.value.ui64 =
278
	    wmsum_value(&dnode_sums.dnode_hold_free_lock_retry);
279
	ds->dnode_hold_free_refcount.value.ui64 =
280
	    wmsum_value(&dnode_sums.dnode_hold_free_refcount);
281
	ds->dnode_hold_free_overflow.value.ui64 =
282
	    wmsum_value(&dnode_sums.dnode_hold_free_overflow);
283
	ds->dnode_free_interior_lock_retry.value.ui64 =
284
	    wmsum_value(&dnode_sums.dnode_free_interior_lock_retry);
285
	ds->dnode_allocate.value.ui64 =
286
	    wmsum_value(&dnode_sums.dnode_allocate);
287
	ds->dnode_reallocate.value.ui64 =
288
	    wmsum_value(&dnode_sums.dnode_reallocate);
289
	ds->dnode_buf_evict.value.ui64 =
290
	    wmsum_value(&dnode_sums.dnode_buf_evict);
291
	ds->dnode_alloc_next_chunk.value.ui64 =
292
	    wmsum_value(&dnode_sums.dnode_alloc_next_chunk);
293
	ds->dnode_alloc_race.value.ui64 =
294
	    wmsum_value(&dnode_sums.dnode_alloc_race);
295
	ds->dnode_alloc_next_block.value.ui64 =
296
	    wmsum_value(&dnode_sums.dnode_alloc_next_block);
297
	ds->dnode_move_invalid.value.ui64 =
298
	    wmsum_value(&dnode_sums.dnode_move_invalid);
299
	ds->dnode_move_recheck1.value.ui64 =
300
	    wmsum_value(&dnode_sums.dnode_move_recheck1);
301
	ds->dnode_move_recheck2.value.ui64 =
302
	    wmsum_value(&dnode_sums.dnode_move_recheck2);
303
	ds->dnode_move_special.value.ui64 =
304
	    wmsum_value(&dnode_sums.dnode_move_special);
305
	ds->dnode_move_handle.value.ui64 =
306
	    wmsum_value(&dnode_sums.dnode_move_handle);
307
	ds->dnode_move_rwlock.value.ui64 =
308
	    wmsum_value(&dnode_sums.dnode_move_rwlock);
309
	ds->dnode_move_active.value.ui64 =
310
	    wmsum_value(&dnode_sums.dnode_move_active);
311
	return (0);
312
}
313

314
void
315
dnode_init(void)
316
{
317
	ASSERT0P(dnode_cache);
318
	dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
319
	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
320
	kmem_cache_set_move(dnode_cache, dnode_move);
321

322
	wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0);
323
	wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0);
324
	wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0);
325
	wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0);
326
	wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0);
327
	wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0);
328
	wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0);
329
	wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0);
330
	wmsum_init(&dnode_sums.dnode_hold_free_hits, 0);
331
	wmsum_init(&dnode_sums.dnode_hold_free_misses, 0);
332
	wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0);
333
	wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0);
334
	wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0);
335
	wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0);
336
	wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0);
337
	wmsum_init(&dnode_sums.dnode_allocate, 0);
338
	wmsum_init(&dnode_sums.dnode_reallocate, 0);
339
	wmsum_init(&dnode_sums.dnode_buf_evict, 0);
340
	wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0);
341
	wmsum_init(&dnode_sums.dnode_alloc_race, 0);
342
	wmsum_init(&dnode_sums.dnode_alloc_next_block, 0);
343
	wmsum_init(&dnode_sums.dnode_move_invalid, 0);
344
	wmsum_init(&dnode_sums.dnode_move_recheck1, 0);
345
	wmsum_init(&dnode_sums.dnode_move_recheck2, 0);
346
	wmsum_init(&dnode_sums.dnode_move_special, 0);
347
	wmsum_init(&dnode_sums.dnode_move_handle, 0);
348
	wmsum_init(&dnode_sums.dnode_move_rwlock, 0);
349
	wmsum_init(&dnode_sums.dnode_move_active, 0);
350

351
	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
352
	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
353
	    KSTAT_FLAG_VIRTUAL);
354
	if (dnode_ksp != NULL) {
355
		dnode_ksp->ks_data = &dnode_stats;
356
		dnode_ksp->ks_update = dnode_kstats_update;
357
		kstat_install(dnode_ksp);
358
	}
359
}
360

361
void
362
dnode_fini(void)
363
{
364
	if (dnode_ksp != NULL) {
365
		kstat_delete(dnode_ksp);
366
		dnode_ksp = NULL;
367
	}
368

369
	wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold);
370
	wmsum_fini(&dnode_sums.dnode_hold_dbuf_read);
371
	wmsum_fini(&dnode_sums.dnode_hold_alloc_hits);
372
	wmsum_fini(&dnode_sums.dnode_hold_alloc_misses);
373
	wmsum_fini(&dnode_sums.dnode_hold_alloc_interior);
374
	wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry);
375
	wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses);
376
	wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none);
377
	wmsum_fini(&dnode_sums.dnode_hold_free_hits);
378
	wmsum_fini(&dnode_sums.dnode_hold_free_misses);
379
	wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses);
380
	wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry);
381
	wmsum_fini(&dnode_sums.dnode_hold_free_refcount);
382
	wmsum_fini(&dnode_sums.dnode_hold_free_overflow);
383
	wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry);
384
	wmsum_fini(&dnode_sums.dnode_allocate);
385
	wmsum_fini(&dnode_sums.dnode_reallocate);
386
	wmsum_fini(&dnode_sums.dnode_buf_evict);
387
	wmsum_fini(&dnode_sums.dnode_alloc_next_chunk);
388
	wmsum_fini(&dnode_sums.dnode_alloc_race);
389
	wmsum_fini(&dnode_sums.dnode_alloc_next_block);
390
	wmsum_fini(&dnode_sums.dnode_move_invalid);
391
	wmsum_fini(&dnode_sums.dnode_move_recheck1);
392
	wmsum_fini(&dnode_sums.dnode_move_recheck2);
393
	wmsum_fini(&dnode_sums.dnode_move_special);
394
	wmsum_fini(&dnode_sums.dnode_move_handle);
395
	wmsum_fini(&dnode_sums.dnode_move_rwlock);
396
	wmsum_fini(&dnode_sums.dnode_move_active);
397

398
	kmem_cache_destroy(dnode_cache);
399
	dnode_cache = NULL;
400
}
401

402

403
#ifdef ZFS_DEBUG
404
void
405
dnode_verify(dnode_t *dn)
406
{
407
	int drop_struct_lock = FALSE;
408

409
	ASSERT(dn->dn_phys);
410
	ASSERT(dn->dn_objset);
411
	ASSERT(dn->dn_handle->dnh_dnode == dn);
412

413
	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
414

415
	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
416
		return;
417

418
	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
419
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
420
		drop_struct_lock = TRUE;
421
	}
422
	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
423
		int i;
424
		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
425
		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
426
		if (dn->dn_datablkshift) {
427
			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
428
			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
429
			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
430
		}
431
		ASSERT3U(dn->dn_nlevels, <=, 30);
432
		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
433
		ASSERT3U(dn->dn_nblkptr, >=, 1);
434
		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
435
		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
436
		ASSERT3U(dn->dn_datablksz, ==,
437
		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
438
		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
439
		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
440
		    dn->dn_bonuslen, <=, max_bonuslen);
441
		for (i = 0; i < TXG_SIZE; i++) {
442
			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
443
		}
444
	}
445
	if (dn->dn_phys->dn_type != DMU_OT_NONE)
446
		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
447
	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
448
	if (dn->dn_dbuf != NULL) {
449
		ASSERT3P(dn->dn_phys, ==,
450
		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
451
		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
452
	}
453
	if (drop_struct_lock)
454
		rw_exit(&dn->dn_struct_rwlock);
455
}
456
#endif
457

458
void
459
dnode_byteswap(dnode_phys_t *dnp)
460
{
461
	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
462
	int i;
463

464
	if (dnp->dn_type == DMU_OT_NONE) {
465
		memset(dnp, 0, sizeof (dnode_phys_t));
466
		return;
467
	}
468

469
	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
470
	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
471
	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
472
	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
473
	dnp->dn_used = BSWAP_64(dnp->dn_used);
474

475
	/*
476
	 * dn_nblkptr is only one byte, so it's OK to read it in either
477
	 * byte order.  We can't read dn_bouslen.
478
	 */
479
	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
480
	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
481
	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
482
		buf64[i] = BSWAP_64(buf64[i]);
483

484
	/*
485
	 * OK to check dn_bonuslen for zero, because it won't matter if
486
	 * we have the wrong byte order.  This is necessary because the
487
	 * dnode dnode is smaller than a regular dnode.
488
	 */
489
	if (dnp->dn_bonuslen != 0) {
490
		dmu_object_byteswap_t byteswap;
491
		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
492
		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
493
		dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
494
		    DN_MAX_BONUS_LEN(dnp));
495
	}
496

497
	/* Swap SPILL block if we have one */
498
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
499
		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
500
}
501

502
void
503
dnode_buf_byteswap(void *vbuf, size_t size)
504
{
505
	int i = 0;
506

507
	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
508
	ASSERT0((size & (sizeof (dnode_phys_t)-1)));
509

510
	while (i < size) {
511
		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
512
		dnode_byteswap(dnp);
513

514
		i += DNODE_MIN_SIZE;
515
		if (dnp->dn_type != DMU_OT_NONE)
516
			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
517
	}
518
}
519

520
void
521
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
522
{
523
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
524

525
	dnode_setdirty(dn, tx);
526
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
527
	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
528
	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
529

530
	if (newsize < dn->dn_bonuslen) {
531
		/* clear any data after the end of the new size */
532
		size_t diff = dn->dn_bonuslen - newsize;
533
		char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
534
		memset(data_end, 0, diff);
535
	}
536

537
	dn->dn_bonuslen = newsize;
538
	if (newsize == 0)
539
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
540
	else
541
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
542
	rw_exit(&dn->dn_struct_rwlock);
543
}
544

545
void
546
dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
547
{
548
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
549
	dnode_setdirty(dn, tx);
550
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
551
	dn->dn_bonustype = newtype;
552
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
553
	rw_exit(&dn->dn_struct_rwlock);
554
}
555

556
void
557
dnode_set_storage_type(dnode_t *dn, dmu_object_type_t newtype)
558
{
559
	/*
560
	 * This is not in the dnode_phys, but it should be, and perhaps one day
561
	 * will. For now we require it be set after taking a hold.
562
	 */
563
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
564
	dn->dn_storage_type = newtype;
565
}
566

567
void
568
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
569
{
570
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
571
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
572
	dnode_setdirty(dn, tx);
573
	dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
574
	dn->dn_have_spill = B_FALSE;
575
}
576

577
static void
578
dnode_setdblksz(dnode_t *dn, int size)
579
{
580
	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
581
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
582
	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
583
	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
584
	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
585
	dn->dn_datablksz = size;
586
	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
587
	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
588
}
589

590
static dnode_t *
591
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
592
    uint64_t object, dnode_handle_t *dnh)
593
{
594
	dnode_t *dn;
595

596
	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
597
	dn->dn_moved = 0;
598

599
	/*
600
	 * Defer setting dn_objset until the dnode is ready to be a candidate
601
	 * for the dnode_move() callback.
602
	 */
603
	dn->dn_object = object;
604
	dn->dn_dbuf = db;
605
	dn->dn_handle = dnh;
606
	dn->dn_phys = dnp;
607

608
	if (dnp->dn_datablkszsec) {
609
		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
610
	} else {
611
		dn->dn_datablksz = 0;
612
		dn->dn_datablkszsec = 0;
613
		dn->dn_datablkshift = 0;
614
	}
615
	dn->dn_indblkshift = dnp->dn_indblkshift;
616
	dn->dn_nlevels = dnp->dn_nlevels;
617
	dn->dn_type = dnp->dn_type;
618
	dn->dn_nblkptr = dnp->dn_nblkptr;
619
	dn->dn_checksum = dnp->dn_checksum;
620
	dn->dn_compress = dnp->dn_compress;
621
	dn->dn_bonustype = dnp->dn_bonustype;
622
	dn->dn_bonuslen = dnp->dn_bonuslen;
623
	dn->dn_num_slots = dnp->dn_extra_slots + 1;
624
	dn->dn_maxblkid = dnp->dn_maxblkid;
625
	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
626
	dn->dn_id_flags = 0;
627

628
	dn->dn_storage_type = DMU_OT_NONE;
629

630
	dmu_zfetch_init(&dn->dn_zfetch, dn);
631

632
	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
633
	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
634
	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
635

636
	mutex_enter(&os->os_lock);
637

638
	/*
639
	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
640
	 * signifies that the special dnodes have no references from
641
	 * their children (the entries in os_dnodes).  This allows
642
	 * dnode_destroy() to easily determine if the last child has
643
	 * been removed and then complete eviction of the objset.
644
	 */
645
	if (!DMU_OBJECT_IS_SPECIAL(object))
646
		list_insert_head(&os->os_dnodes, dn);
647
	membar_producer();
648

649
	/*
650
	 * Everything else must be valid before assigning dn_objset
651
	 * makes the dnode eligible for dnode_move().
652
	 */
653
	dn->dn_objset = os;
654

655
	dnh->dnh_dnode = dn;
656
	mutex_exit(&os->os_lock);
657

658
	arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
659

660
	return (dn);
661
}
662

663
/*
664
 * Caller must be holding the dnode handle, which is released upon return.
665
 */
666
static void
667
dnode_destroy(dnode_t *dn)
668
{
669
	objset_t *os = dn->dn_objset;
670
	boolean_t complete_os_eviction = B_FALSE;
671

672
	ASSERT0((dn->dn_id_flags & DN_ID_NEW_EXIST));
673

674
	mutex_enter(&os->os_lock);
675
	POINTER_INVALIDATE(&dn->dn_objset);
676
	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
677
		list_remove(&os->os_dnodes, dn);
678
		complete_os_eviction =
679
		    list_is_empty(&os->os_dnodes) &&
680
		    list_link_active(&os->os_evicting_node);
681
	}
682
	mutex_exit(&os->os_lock);
683

684
	/* the dnode can no longer move, so we can release the handle */
685
	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
686
		zrl_remove(&dn->dn_handle->dnh_zrlock);
687

688
	dn->dn_allocated_txg = 0;
689
	dn->dn_free_txg = 0;
690
	dn->dn_assigned_txg = 0;
691
	dn->dn_dirtycnt = 0;
692

693
	if (dn->dn_bonus != NULL) {
694
		mutex_enter(&dn->dn_bonus->db_mtx);
695
		dbuf_destroy(dn->dn_bonus);
696
		dn->dn_bonus = NULL;
697
	}
698
	dn->dn_zio = NULL;
699

700
	dn->dn_have_spill = B_FALSE;
701
	dn->dn_oldused = 0;
702
	dn->dn_oldflags = 0;
703
	dn->dn_olduid = 0;
704
	dn->dn_oldgid = 0;
705
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
706
	dn->dn_newuid = 0;
707
	dn->dn_newgid = 0;
708
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
709
	dn->dn_id_flags = 0;
710

711
	dn->dn_storage_type = DMU_OT_NONE;
712

713
	dmu_zfetch_fini(&dn->dn_zfetch);
714
	kmem_cache_free(dnode_cache, dn);
715
	arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
716

717
	if (complete_os_eviction)
718
		dmu_objset_evict_done(os);
719
}
720

721
void
722
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
723
    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
724
{
725
	int i;
726

727
	ASSERT3U(dn_slots, >, 0);
728
	ASSERT3U(dn_slots << DNODE_SHIFT, <=,
729
	    spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
730
	ASSERT3U(blocksize, <=,
731
	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
732
	if (blocksize == 0)
733
		blocksize = 1 << zfs_default_bs;
734
	else
735
		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
736

737
	if (ibs == 0)
738
		ibs = zfs_default_ibs;
739

740
	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
741

742
	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
743
	    dn->dn_objset, (u_longlong_t)dn->dn_object,
744
	    (u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots);
745
	DNODE_STAT_BUMP(dnode_allocate);
746

747
	ASSERT(dn->dn_type == DMU_OT_NONE);
748
	ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)));
749
	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
750
	ASSERT(ot != DMU_OT_NONE);
751
	ASSERT(DMU_OT_IS_VALID(ot));
752
	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
753
	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
754
	    (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) ||
755
	    (bonustype != DMU_OT_NONE && bonuslen != 0));
756
	ASSERT(DMU_OT_IS_VALID(bonustype));
757
	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
758
	ASSERT(dn->dn_type == DMU_OT_NONE);
759
	ASSERT0(dn->dn_maxblkid);
760
	ASSERT0(dn->dn_allocated_txg);
761
	ASSERT0(dn->dn_assigned_txg);
762
	ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
763
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
764
	ASSERT(avl_is_empty(&dn->dn_dbufs));
765

766
	for (i = 0; i < TXG_SIZE; i++) {
767
		ASSERT0(dn->dn_next_nblkptr[i]);
768
		ASSERT0(dn->dn_next_nlevels[i]);
769
		ASSERT0(dn->dn_next_indblkshift[i]);
770
		ASSERT0(dn->dn_next_bonuslen[i]);
771
		ASSERT0(dn->dn_next_bonustype[i]);
772
		ASSERT0(dn->dn_rm_spillblk[i]);
773
		ASSERT0(dn->dn_next_blksz[i]);
774
		ASSERT0(dn->dn_next_maxblkid[i]);
775
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
776
		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
777
		ASSERT0P(dn->dn_free_ranges[i]);
778
	}
779

780
	dn->dn_type = ot;
781
	dnode_setdblksz(dn, blocksize);
782
	dn->dn_indblkshift = ibs;
783
	dn->dn_nlevels = 1;
784
	dn->dn_num_slots = dn_slots;
785
	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
786
		dn->dn_nblkptr = 1;
787
	else {
788
		dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
789
		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
790
		    SPA_BLKPTRSHIFT));
791
	}
792

793
	dn->dn_bonustype = bonustype;
794
	dn->dn_bonuslen = bonuslen;
795
	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
796
	dn->dn_compress = ZIO_COMPRESS_INHERIT;
797

798
	dn->dn_free_txg = 0;
799
	dn->dn_dirtycnt = 0;
800

801
	dn->dn_allocated_txg = tx->tx_txg;
802
	dn->dn_id_flags = 0;
803

804
	dnode_setdirty(dn, tx);
805
	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
806
	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
807
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
808
	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
809
}
810

811
void
812
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
813
    dmu_object_type_t bonustype, int bonuslen, int dn_slots,
814
    boolean_t keep_spill, dmu_tx_t *tx)
815
{
816
	int nblkptr;
817

818
	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
819
	ASSERT3U(blocksize, <=,
820
	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
821
	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
822
	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
823
	ASSERT(tx->tx_txg != 0);
824
	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
825
	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
826
	    (bonustype == DMU_OT_SA && bonuslen == 0));
827
	ASSERT(DMU_OT_IS_VALID(bonustype));
828
	ASSERT3U(bonuslen, <=,
829
	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
830
	ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
831

832
	dnode_free_interior_slots(dn);
833
	DNODE_STAT_BUMP(dnode_reallocate);
834

835
	/* clean up any unreferenced dbufs */
836
	dnode_evict_dbufs(dn);
837

838
	dn->dn_id_flags = 0;
839

840
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
841
	dnode_setdirty(dn, tx);
842
	if (dn->dn_datablksz != blocksize) {
843
		/* change blocksize */
844
		ASSERT0(dn->dn_maxblkid);
845
		ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
846
		    dnode_block_freed(dn, 0));
847

848
		dnode_setdblksz(dn, blocksize);
849
		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize;
850
	}
851
	if (dn->dn_bonuslen != bonuslen)
852
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen;
853

854
	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
855
		nblkptr = 1;
856
	else
857
		nblkptr = MIN(DN_MAX_NBLKPTR,
858
		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
859
		    SPA_BLKPTRSHIFT));
860
	if (dn->dn_bonustype != bonustype)
861
		dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
862
	if (dn->dn_nblkptr != nblkptr)
863
		dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
864
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
865
		dbuf_rm_spill(dn, tx);
866
		dnode_rm_spill(dn, tx);
867
	}
868

869
	rw_exit(&dn->dn_struct_rwlock);
870

871
	/* change type */
872
	dn->dn_type = ot;
873

874
	/* change bonus size and type */
875
	mutex_enter(&dn->dn_mtx);
876
	dn->dn_bonustype = bonustype;
877
	dn->dn_bonuslen = bonuslen;
878
	dn->dn_num_slots = dn_slots;
879
	dn->dn_nblkptr = nblkptr;
880
	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
881
	dn->dn_compress = ZIO_COMPRESS_INHERIT;
882
	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
883

884
	/* fix up the bonus db_size */
885
	if (dn->dn_bonus) {
886
		dn->dn_bonus->db.db_size =
887
		    DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
888
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
889
		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
890
	}
891

892
	dn->dn_allocated_txg = tx->tx_txg;
893
	mutex_exit(&dn->dn_mtx);
894
}
895

896
#ifdef	_KERNEL
897
static void
898
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
899
{
900
	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
901
	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
902
	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
903

904
	/* Copy fields. */
905
	ndn->dn_objset = odn->dn_objset;
906
	ndn->dn_object = odn->dn_object;
907
	ndn->dn_dbuf = odn->dn_dbuf;
908
	ndn->dn_handle = odn->dn_handle;
909
	ndn->dn_phys = odn->dn_phys;
910
	ndn->dn_type = odn->dn_type;
911
	ndn->dn_bonuslen = odn->dn_bonuslen;
912
	ndn->dn_bonustype = odn->dn_bonustype;
913
	ndn->dn_nblkptr = odn->dn_nblkptr;
914
	ndn->dn_checksum = odn->dn_checksum;
915
	ndn->dn_compress = odn->dn_compress;
916
	ndn->dn_nlevels = odn->dn_nlevels;
917
	ndn->dn_indblkshift = odn->dn_indblkshift;
918
	ndn->dn_datablkshift = odn->dn_datablkshift;
919
	ndn->dn_datablkszsec = odn->dn_datablkszsec;
920
	ndn->dn_datablksz = odn->dn_datablksz;
921
	ndn->dn_maxblkid = odn->dn_maxblkid;
922
	ndn->dn_num_slots = odn->dn_num_slots;
923
	memcpy(ndn->dn_next_type, odn->dn_next_type,
924
	    sizeof (odn->dn_next_type));
925
	memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr,
926
	    sizeof (odn->dn_next_nblkptr));
927
	memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels,
928
	    sizeof (odn->dn_next_nlevels));
929
	memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift,
930
	    sizeof (odn->dn_next_indblkshift));
931
	memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype,
932
	    sizeof (odn->dn_next_bonustype));
933
	memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk,
934
	    sizeof (odn->dn_rm_spillblk));
935
	memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen,
936
	    sizeof (odn->dn_next_bonuslen));
937
	memcpy(ndn->dn_next_blksz, odn->dn_next_blksz,
938
	    sizeof (odn->dn_next_blksz));
939
	memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid,
940
	    sizeof (odn->dn_next_maxblkid));
941
	for (int i = 0; i < TXG_SIZE; i++) {
942
		list_move_tail(&ndn->dn_dirty_records[i],
943
		    &odn->dn_dirty_records[i]);
944
	}
945
	memcpy(ndn->dn_free_ranges, odn->dn_free_ranges,
946
	    sizeof (odn->dn_free_ranges));
947
	ndn->dn_allocated_txg = odn->dn_allocated_txg;
948
	ndn->dn_free_txg = odn->dn_free_txg;
949
	ndn->dn_assigned_txg = odn->dn_assigned_txg;
950
	ndn->dn_dirtycnt = odn->dn_dirtycnt;
951
	ASSERT0(zfs_refcount_count(&odn->dn_tx_holds));
952
	zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
953
	ASSERT(avl_is_empty(&ndn->dn_dbufs));
954
	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
955
	ndn->dn_dbufs_count = odn->dn_dbufs_count;
956
	ndn->dn_bonus = odn->dn_bonus;
957
	ndn->dn_have_spill = odn->dn_have_spill;
958
	ndn->dn_zio = odn->dn_zio;
959
	ndn->dn_oldused = odn->dn_oldused;
960
	ndn->dn_oldflags = odn->dn_oldflags;
961
	ndn->dn_olduid = odn->dn_olduid;
962
	ndn->dn_oldgid = odn->dn_oldgid;
963
	ndn->dn_oldprojid = odn->dn_oldprojid;
964
	ndn->dn_newuid = odn->dn_newuid;
965
	ndn->dn_newgid = odn->dn_newgid;
966
	ndn->dn_newprojid = odn->dn_newprojid;
967
	ndn->dn_id_flags = odn->dn_id_flags;
968
	ndn->dn_storage_type = odn->dn_storage_type;
969
	dmu_zfetch_init(&ndn->dn_zfetch, ndn);
970

971
	/*
972
	 * Update back pointers. Updating the handle fixes the back pointer of
973
	 * every descendant dbuf as well as the bonus dbuf.
974
	 */
975
	ASSERT(ndn->dn_handle->dnh_dnode == odn);
976
	ndn->dn_handle->dnh_dnode = ndn;
977

978
	/*
979
	 * Invalidate the original dnode by clearing all of its back pointers.
980
	 */
981
	odn->dn_dbuf = NULL;
982
	odn->dn_handle = NULL;
983
	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
984
	    offsetof(dmu_buf_impl_t, db_link));
985
	odn->dn_dbufs_count = 0;
986
	odn->dn_bonus = NULL;
987
	dmu_zfetch_fini(&odn->dn_zfetch);
988

989
	/*
990
	 * Set the low bit of the objset pointer to ensure that dnode_move()
991
	 * recognizes the dnode as invalid in any subsequent callback.
992
	 */
993
	POINTER_INVALIDATE(&odn->dn_objset);
994

995
	/*
996
	 * Satisfy the destructor.
997
	 */
998
	for (int i = 0; i < TXG_SIZE; i++) {
999
		list_create(&odn->dn_dirty_records[i],
1000
		    sizeof (dbuf_dirty_record_t),
1001
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1002
		odn->dn_free_ranges[i] = NULL;
1003
		odn->dn_next_nlevels[i] = 0;
1004
		odn->dn_next_indblkshift[i] = 0;
1005
		odn->dn_next_bonustype[i] = 0;
1006
		odn->dn_rm_spillblk[i] = 0;
1007
		odn->dn_next_bonuslen[i] = 0;
1008
		odn->dn_next_blksz[i] = 0;
1009
	}
1010
	odn->dn_allocated_txg = 0;
1011
	odn->dn_free_txg = 0;
1012
	odn->dn_assigned_txg = 0;
1013
	odn->dn_dirtycnt = 0;
1014
	odn->dn_have_spill = B_FALSE;
1015
	odn->dn_zio = NULL;
1016
	odn->dn_oldused = 0;
1017
	odn->dn_oldflags = 0;
1018
	odn->dn_olduid = 0;
1019
	odn->dn_oldgid = 0;
1020
	odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
1021
	odn->dn_newuid = 0;
1022
	odn->dn_newgid = 0;
1023
	odn->dn_newprojid = ZFS_DEFAULT_PROJID;
1024
	odn->dn_id_flags = 0;
1025
	odn->dn_storage_type = DMU_OT_NONE;
1026

1027
	/*
1028
	 * Mark the dnode.
1029
	 */
1030
	ndn->dn_moved = 1;
1031
	odn->dn_moved = (uint8_t)-1;
1032
}
1033

1034
static kmem_cbrc_t
1035
dnode_move(void *buf, void *newbuf, size_t size, void *arg)
1036
{
1037
	dnode_t *odn = buf, *ndn = newbuf;
1038
	objset_t *os;
1039
	int64_t refcount;
1040
	uint32_t dbufs;
1041

1042
#ifndef USE_DNODE_HANDLE
1043
	/*
1044
	 * We can't move dnodes if dbufs reference them directly without
1045
	 * using handles and respecitve locking.  Unless USE_DNODE_HANDLE
1046
	 * is defined the code below is only to make sure it still builds,
1047
	 * but it should never be used, since it is unsafe.
1048
	 */
1049
#ifdef ZFS_DEBUG
1050
	PANIC("dnode_move() called without USE_DNODE_HANDLE");
1051
#endif
1052
	return (KMEM_CBRC_NO);
1053
#endif
1054

1055
	/*
1056
	 * The dnode is on the objset's list of known dnodes if the objset
1057
	 * pointer is valid. We set the low bit of the objset pointer when
1058
	 * freeing the dnode to invalidate it, and the memory patterns written
1059
	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
1060
	 * A newly created dnode sets the objset pointer last of all to indicate
1061
	 * that the dnode is known and in a valid state to be moved by this
1062
	 * function.
1063
	 */
1064
	os = odn->dn_objset;
1065
	if (!POINTER_IS_VALID(os)) {
1066
		DNODE_STAT_BUMP(dnode_move_invalid);
1067
		return (KMEM_CBRC_DONT_KNOW);
1068
	}
1069

1070
	/*
1071
	 * Ensure that the objset does not go away during the move.
1072
	 */
1073
	rw_enter(&os_lock, RW_WRITER);
1074
	if (os != odn->dn_objset) {
1075
		rw_exit(&os_lock);
1076
		DNODE_STAT_BUMP(dnode_move_recheck1);
1077
		return (KMEM_CBRC_DONT_KNOW);
1078
	}
1079

1080
	/*
1081
	 * If the dnode is still valid, then so is the objset. We know that no
1082
	 * valid objset can be freed while we hold os_lock, so we can safely
1083
	 * ensure that the objset remains in use.
1084
	 */
1085
	mutex_enter(&os->os_lock);
1086

1087
	/*
1088
	 * Recheck the objset pointer in case the dnode was removed just before
1089
	 * acquiring the lock.
1090
	 */
1091
	if (os != odn->dn_objset) {
1092
		mutex_exit(&os->os_lock);
1093
		rw_exit(&os_lock);
1094
		DNODE_STAT_BUMP(dnode_move_recheck2);
1095
		return (KMEM_CBRC_DONT_KNOW);
1096
	}
1097

1098
	/*
1099
	 * At this point we know that as long as we hold os->os_lock, the dnode
1100
	 * cannot be freed and fields within the dnode can be safely accessed.
1101
	 * The objset listing this dnode cannot go away as long as this dnode is
1102
	 * on its list.
1103
	 */
1104
	rw_exit(&os_lock);
1105
	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
1106
		mutex_exit(&os->os_lock);
1107
		DNODE_STAT_BUMP(dnode_move_special);
1108
		return (KMEM_CBRC_NO);
1109
	}
1110
	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
1111

1112
	/*
1113
	 * Lock the dnode handle to prevent the dnode from obtaining any new
1114
	 * holds. This also prevents the descendant dbufs and the bonus dbuf
1115
	 * from accessing the dnode, so that we can discount their holds. The
1116
	 * handle is safe to access because we know that while the dnode cannot
1117
	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
1118
	 * safely move any dnode referenced only by dbufs.
1119
	 */
1120
	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
1121
		mutex_exit(&os->os_lock);
1122
		DNODE_STAT_BUMP(dnode_move_handle);
1123
		return (KMEM_CBRC_LATER);
1124
	}
1125

1126
	/*
1127
	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
1128
	 * We need to guarantee that there is a hold for every dbuf in order to
1129
	 * determine whether the dnode is actively referenced. Falsely matching
1130
	 * a dbuf to an active hold would lead to an unsafe move. It's possible
1131
	 * that a thread already having an active dnode hold is about to add a
1132
	 * dbuf, and we can't compare hold and dbuf counts while the add is in
1133
	 * progress.
1134
	 */
1135
	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
1136
		zrl_exit(&odn->dn_handle->dnh_zrlock);
1137
		mutex_exit(&os->os_lock);
1138
		DNODE_STAT_BUMP(dnode_move_rwlock);
1139
		return (KMEM_CBRC_LATER);
1140
	}
1141

1142
	/*
1143
	 * A dbuf may be removed (evicted) without an active dnode hold. In that
1144
	 * case, the dbuf count is decremented under the handle lock before the
1145
	 * dbuf's hold is released. This order ensures that if we count the hold
1146
	 * after the dbuf is removed but before its hold is released, we will
1147
	 * treat the unmatched hold as active and exit safely. If we count the
1148
	 * hold before the dbuf is removed, the hold is discounted, and the
1149
	 * removal is blocked until the move completes.
1150
	 */
1151
	refcount = zfs_refcount_count(&odn->dn_holds);
1152
	ASSERT(refcount >= 0);
1153
	dbufs = DN_DBUFS_COUNT(odn);
1154

1155
	/* We can't have more dbufs than dnode holds. */
1156
	ASSERT3U(dbufs, <=, refcount);
1157
	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
1158
	    uint32_t, dbufs);
1159

1160
	if (refcount > dbufs) {
1161
		rw_exit(&odn->dn_struct_rwlock);
1162
		zrl_exit(&odn->dn_handle->dnh_zrlock);
1163
		mutex_exit(&os->os_lock);
1164
		DNODE_STAT_BUMP(dnode_move_active);
1165
		return (KMEM_CBRC_LATER);
1166
	}
1167

1168
	rw_exit(&odn->dn_struct_rwlock);
1169

1170
	/*
1171
	 * At this point we know that anyone with a hold on the dnode is not
1172
	 * actively referencing it. The dnode is known and in a valid state to
1173
	 * move. We're holding the locks needed to execute the critical section.
1174
	 */
1175
	dnode_move_impl(odn, ndn);
1176

1177
	list_link_replace(&odn->dn_link, &ndn->dn_link);
1178
	/* If the dnode was safe to move, the refcount cannot have changed. */
1179
	ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
1180
	ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
1181
	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
1182
	mutex_exit(&os->os_lock);
1183

1184
	return (KMEM_CBRC_YES);
1185
}
1186
#endif	/* _KERNEL */
1187

1188
static void
1189
dnode_slots_hold(dnode_children_t *children, int idx, int slots)
1190
{
1191
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1192

1193
	for (int i = idx; i < idx + slots; i++) {
1194
		dnode_handle_t *dnh = &children->dnc_children[i];
1195
		zrl_add(&dnh->dnh_zrlock);
1196
	}
1197
}
1198

1199
static void
1200
dnode_slots_rele(dnode_children_t *children, int idx, int slots)
1201
{
1202
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1203

1204
	for (int i = idx; i < idx + slots; i++) {
1205
		dnode_handle_t *dnh = &children->dnc_children[i];
1206

1207
		if (zrl_is_locked(&dnh->dnh_zrlock))
1208
			zrl_exit(&dnh->dnh_zrlock);
1209
		else
1210
			zrl_remove(&dnh->dnh_zrlock);
1211
	}
1212
}
1213

1214
static int
1215
dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
1216
{
1217
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1218

1219
	for (int i = idx; i < idx + slots; i++) {
1220
		dnode_handle_t *dnh = &children->dnc_children[i];
1221

1222
		if (!zrl_tryenter(&dnh->dnh_zrlock)) {
1223
			for (int j = idx; j < i; j++) {
1224
				dnh = &children->dnc_children[j];
1225
				zrl_exit(&dnh->dnh_zrlock);
1226
			}
1227

1228
			return (0);
1229
		}
1230
	}
1231

1232
	return (1);
1233
}
1234

1235
static void
1236
dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
1237
{
1238
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1239

1240
	for (int i = idx; i < idx + slots; i++) {
1241
		dnode_handle_t *dnh = &children->dnc_children[i];
1242
		dnh->dnh_dnode = ptr;
1243
	}
1244
}
1245

1246
static boolean_t
1247
dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
1248
{
1249
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1250

1251
	/*
1252
	 * If all dnode slots are either already free or
1253
	 * evictable return B_TRUE.
1254
	 */
1255
	for (int i = idx; i < idx + slots; i++) {
1256
		dnode_handle_t *dnh = &children->dnc_children[i];
1257
		dnode_t *dn = dnh->dnh_dnode;
1258

1259
		if (dn == DN_SLOT_FREE) {
1260
			continue;
1261
		} else if (DN_SLOT_IS_PTR(dn)) {
1262
			mutex_enter(&dn->dn_mtx);
1263
			boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
1264
			    dn->dn_dirtycnt == 0 &&
1265
			    zfs_refcount_is_zero(&dn->dn_holds));
1266
			mutex_exit(&dn->dn_mtx);
1267

1268
			if (!can_free)
1269
				return (B_FALSE);
1270
			else
1271
				continue;
1272
		} else {
1273
			return (B_FALSE);
1274
		}
1275
	}
1276

1277
	return (B_TRUE);
1278
}
1279

1280
static uint_t
1281
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
1282
{
1283
	uint_t reclaimed = 0;
1284

1285
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1286

1287
	for (int i = idx; i < idx + slots; i++) {
1288
		dnode_handle_t *dnh = &children->dnc_children[i];
1289

1290
		ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
1291

1292
		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1293
			ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
1294
			dnode_destroy(dnh->dnh_dnode);
1295
			dnh->dnh_dnode = DN_SLOT_FREE;
1296
			reclaimed++;
1297
		}
1298
	}
1299

1300
	return (reclaimed);
1301
}
1302

1303
void
1304
dnode_free_interior_slots(dnode_t *dn)
1305
{
1306
	dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
1307
	int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
1308
	int idx = (dn->dn_object & (epb - 1)) + 1;
1309
	int slots = dn->dn_num_slots - 1;
1310

1311
	if (slots == 0)
1312
		return;
1313

1314
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1315

1316
	while (!dnode_slots_tryenter(children, idx, slots)) {
1317
		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
1318
		kpreempt(KPREEMPT_SYNC);
1319
	}
1320

1321
	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
1322
	dnode_slots_rele(children, idx, slots);
1323
}
1324

1325
void
1326
dnode_special_close(dnode_handle_t *dnh)
1327
{
1328
	dnode_t *dn = dnh->dnh_dnode;
1329

1330
	/*
1331
	 * Ensure dnode_rele_and_unlock() has released dn_mtx, after final
1332
	 * zfs_refcount_remove()
1333
	 */
1334
	mutex_enter(&dn->dn_mtx);
1335
	if (zfs_refcount_count(&dn->dn_holds) > 0)
1336
		cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
1337
	mutex_exit(&dn->dn_mtx);
1338
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
1339

1340
	ASSERT(dn->dn_dbuf == NULL ||
1341
	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
1342
	zrl_add(&dnh->dnh_zrlock);
1343
	dnode_destroy(dn); /* implicit zrl_remove() */
1344
	zrl_destroy(&dnh->dnh_zrlock);
1345
	dnh->dnh_dnode = NULL;
1346
}
1347

1348
void
1349
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
1350
    dnode_handle_t *dnh)
1351
{
1352
	dnode_t *dn;
1353

1354
	zrl_init(&dnh->dnh_zrlock);
1355
	VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
1356

1357
	dn = dnode_create(os, dnp, NULL, object, dnh);
1358
	DNODE_VERIFY(dn);
1359

1360
	zrl_exit(&dnh->dnh_zrlock);
1361
}
1362

1363
static void
1364
dnode_buf_evict_async(void *dbu)
1365
{
1366
	dnode_children_t *dnc = dbu;
1367

1368
	DNODE_STAT_BUMP(dnode_buf_evict);
1369

1370
	for (int i = 0; i < dnc->dnc_count; i++) {
1371
		dnode_handle_t *dnh = &dnc->dnc_children[i];
1372
		dnode_t *dn;
1373

1374
		/*
1375
		 * The dnode handle lock guards against the dnode moving to
1376
		 * another valid address, so there is no need here to guard
1377
		 * against changes to or from NULL.
1378
		 */
1379
		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1380
			zrl_destroy(&dnh->dnh_zrlock);
1381
			dnh->dnh_dnode = DN_SLOT_UNINIT;
1382
			continue;
1383
		}
1384

1385
		zrl_add(&dnh->dnh_zrlock);
1386
		dn = dnh->dnh_dnode;
1387
		/*
1388
		 * If there are holds on this dnode, then there should
1389
		 * be holds on the dnode's containing dbuf as well; thus
1390
		 * it wouldn't be eligible for eviction and this function
1391
		 * would not have been called.
1392
		 */
1393
		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
1394
		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
1395

1396
		dnode_destroy(dn); /* implicit zrl_remove() for first slot */
1397
		zrl_destroy(&dnh->dnh_zrlock);
1398
		dnh->dnh_dnode = DN_SLOT_UNINIT;
1399
	}
1400
	kmem_free(dnc, sizeof (dnode_children_t) +
1401
	    dnc->dnc_count * sizeof (dnode_handle_t));
1402
}
1403

1404
/*
1405
 * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
1406
 * to ensure the hole at the specified object offset is large enough to
1407
 * hold the dnode being created. The slots parameter is also used to ensure
1408
 * a dnode does not span multiple dnode blocks. In both of these cases, if
1409
 * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
1410
 * are only possible when using DNODE_MUST_BE_FREE.
1411
 *
1412
 * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
1413
 * dnode_hold_impl() will check if the requested dnode is already consumed
1414
 * as an extra dnode slot by an large dnode, in which case it returns
1415
 * ENOENT.
1416
 *
1417
 * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
1418
 * return whether the hold would succeed or not. tag and dnp should set to
1419
 * NULL in this case.
1420
 *
1421
 * errors:
1422
 * EINVAL - Invalid object number or flags.
1423
 * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
1424
 * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
1425
 *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
1426
 *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
1427
 * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
1428
 *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
1429
 * EIO    - I/O error when reading the meta dnode dbuf.
1430
 *
1431
 * succeeds even for free dnodes.
1432
 */
1433
int
1434
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
1435
    const void *tag, dnode_t **dnp)
1436
{
1437
	int epb, idx, err;
1438
	int drop_struct_lock = FALSE;
1439
	int type;
1440
	uint64_t blk;
1441
	dnode_t *mdn, *dn;
1442
	dmu_buf_impl_t *db;
1443
	dnode_children_t *dnc;
1444
	dnode_phys_t *dn_block;
1445
	dnode_handle_t *dnh;
1446

1447
	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
1448
	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
1449
	IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
1450

1451
	/*
1452
	 * If you are holding the spa config lock as writer, you shouldn't
1453
	 * be asking the DMU to do *anything* unless it's the root pool
1454
	 * which may require us to read from the root filesystem while
1455
	 * holding some (not all) of the locks as writer.
1456
	 */
1457
	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
1458
	    (spa_is_root(os->os_spa) &&
1459
	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
1460

1461
	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
1462

1463
	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
1464
	    object == DMU_PROJECTUSED_OBJECT) {
1465
		if (object == DMU_USERUSED_OBJECT)
1466
			dn = DMU_USERUSED_DNODE(os);
1467
		else if (object == DMU_GROUPUSED_OBJECT)
1468
			dn = DMU_GROUPUSED_DNODE(os);
1469
		else
1470
			dn = DMU_PROJECTUSED_DNODE(os);
1471
		if (dn == NULL)
1472
			return (SET_ERROR(ENOENT));
1473
		type = dn->dn_type;
1474
		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
1475
			return (SET_ERROR(ENOENT));
1476
		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
1477
			return (SET_ERROR(EEXIST));
1478
		DNODE_VERIFY(dn);
1479
		/* Don't actually hold if dry run, just return 0 */
1480
		if (!(flag & DNODE_DRY_RUN)) {
1481
			(void) zfs_refcount_add(&dn->dn_holds, tag);
1482
			*dnp = dn;
1483
		}
1484
		return (0);
1485
	}
1486

1487
	if (object == 0 || object >= DN_MAX_OBJECT)
1488
		return (SET_ERROR(EINVAL));
1489

1490
	mdn = DMU_META_DNODE(os);
1491
	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
1492

1493
	DNODE_VERIFY(mdn);
1494

1495
	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
1496
		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
1497
		drop_struct_lock = TRUE;
1498
	}
1499

1500
	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
1501
	db = dbuf_hold(mdn, blk, FTAG);
1502
	if (drop_struct_lock)
1503
		rw_exit(&mdn->dn_struct_rwlock);
1504
	if (db == NULL) {
1505
		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
1506
		return (SET_ERROR(EIO));
1507
	}
1508

1509
	/*
1510
	 * We do not need to decrypt to read the dnode so it doesn't matter
1511
	 * if we get the encrypted or decrypted version.
1512
	 */
1513
	err = dbuf_read(db, NULL, DB_RF_CANFAIL |
1514
	    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
1515
	if (err) {
1516
		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
1517
		dbuf_rele(db, FTAG);
1518
		return (err);
1519
	}
1520

1521
	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1522
	epb = db->db.db_size >> DNODE_SHIFT;
1523

1524
	idx = object & (epb - 1);
1525
	dn_block = (dnode_phys_t *)db->db.db_data;
1526

1527
	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1528
	dnc = dmu_buf_get_user(&db->db);
1529
	dnh = NULL;
1530
	if (dnc == NULL) {
1531
		dnode_children_t *winner;
1532
		int skip = 0;
1533

1534
		dnc = kmem_zalloc(sizeof (dnode_children_t) +
1535
		    epb * sizeof (dnode_handle_t), KM_SLEEP);
1536
		dnc->dnc_count = epb;
1537
		dnh = &dnc->dnc_children[0];
1538

1539
		/* Initialize dnode slot status from dnode_phys_t */
1540
		for (int i = 0; i < epb; i++) {
1541
			zrl_init(&dnh[i].dnh_zrlock);
1542

1543
			if (skip) {
1544
				skip--;
1545
				continue;
1546
			}
1547

1548
			if (dn_block[i].dn_type != DMU_OT_NONE) {
1549
				int interior = dn_block[i].dn_extra_slots;
1550

1551
				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
1552
				dnode_set_slots(dnc, i + 1, interior,
1553
				    DN_SLOT_INTERIOR);
1554
				skip = interior;
1555
			} else {
1556
				dnh[i].dnh_dnode = DN_SLOT_FREE;
1557
				skip = 0;
1558
			}
1559
		}
1560

1561
		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
1562
		    dnode_buf_evict_async, NULL);
1563
		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
1564
		if (winner != NULL) {
1565

1566
			for (int i = 0; i < epb; i++)
1567
				zrl_destroy(&dnh[i].dnh_zrlock);
1568

1569
			kmem_free(dnc, sizeof (dnode_children_t) +
1570
			    epb * sizeof (dnode_handle_t));
1571
			dnc = winner;
1572
		}
1573
	}
1574

1575
	ASSERT(dnc->dnc_count == epb);
1576

1577
	if (flag & DNODE_MUST_BE_ALLOCATED) {
1578
		slots = 1;
1579

1580
		dnode_slots_hold(dnc, idx, slots);
1581
		dnh = &dnc->dnc_children[idx];
1582

1583
		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1584
			dn = dnh->dnh_dnode;
1585
		} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
1586
			DNODE_STAT_BUMP(dnode_hold_alloc_interior);
1587
			dnode_slots_rele(dnc, idx, slots);
1588
			dbuf_rele(db, FTAG);
1589
			return (SET_ERROR(EEXIST));
1590
		} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
1591
			DNODE_STAT_BUMP(dnode_hold_alloc_misses);
1592
			dnode_slots_rele(dnc, idx, slots);
1593
			dbuf_rele(db, FTAG);
1594
			return (SET_ERROR(ENOENT));
1595
		} else {
1596
			dnode_slots_rele(dnc, idx, slots);
1597
			while (!dnode_slots_tryenter(dnc, idx, slots)) {
1598
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
1599
				kpreempt(KPREEMPT_SYNC);
1600
			}
1601

1602
			/*
1603
			 * Someone else won the race and called dnode_create()
1604
			 * after we checked DN_SLOT_IS_PTR() above but before
1605
			 * we acquired the lock.
1606
			 */
1607
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1608
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
1609
				dn = dnh->dnh_dnode;
1610
			} else {
1611
				dn = dnode_create(os, dn_block + idx, db,
1612
				    object, dnh);
1613
				dmu_buf_add_user_size(&db->db,
1614
				    sizeof (dnode_t));
1615
			}
1616
		}
1617

1618
		mutex_enter(&dn->dn_mtx);
1619
		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
1620
			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
1621
			mutex_exit(&dn->dn_mtx);
1622
			dnode_slots_rele(dnc, idx, slots);
1623
			dbuf_rele(db, FTAG);
1624
			return (SET_ERROR(ENOENT));
1625
		}
1626

1627
		/* Don't actually hold if dry run, just return 0 */
1628
		if (flag & DNODE_DRY_RUN) {
1629
			mutex_exit(&dn->dn_mtx);
1630
			dnode_slots_rele(dnc, idx, slots);
1631
			dbuf_rele(db, FTAG);
1632
			return (0);
1633
		}
1634

1635
		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
1636
	} else if (flag & DNODE_MUST_BE_FREE) {
1637

1638
		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
1639
			DNODE_STAT_BUMP(dnode_hold_free_overflow);
1640
			dbuf_rele(db, FTAG);
1641
			return (SET_ERROR(ENOSPC));
1642
		}
1643

1644
		dnode_slots_hold(dnc, idx, slots);
1645

1646
		if (!dnode_check_slots_free(dnc, idx, slots)) {
1647
			DNODE_STAT_BUMP(dnode_hold_free_misses);
1648
			dnode_slots_rele(dnc, idx, slots);
1649
			dbuf_rele(db, FTAG);
1650
			return (SET_ERROR(ENOSPC));
1651
		}
1652

1653
		dnode_slots_rele(dnc, idx, slots);
1654
		while (!dnode_slots_tryenter(dnc, idx, slots)) {
1655
			DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
1656
			kpreempt(KPREEMPT_SYNC);
1657
		}
1658

1659
		if (!dnode_check_slots_free(dnc, idx, slots)) {
1660
			DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
1661
			dnode_slots_rele(dnc, idx, slots);
1662
			dbuf_rele(db, FTAG);
1663
			return (SET_ERROR(ENOSPC));
1664
		}
1665

1666
		/*
1667
		 * Allocated but otherwise free dnodes which would
1668
		 * be in the interior of a multi-slot dnodes need
1669
		 * to be freed.  Single slot dnodes can be safely
1670
		 * re-purposed as a performance optimization.
1671
		 */
1672
		if (slots > 1) {
1673
			uint_t reclaimed =
1674
			    dnode_reclaim_slots(dnc, idx + 1, slots - 1);
1675
			if (reclaimed > 0)
1676
				dmu_buf_sub_user_size(&db->db,
1677
				    reclaimed * sizeof (dnode_t));
1678
		}
1679

1680
		dnh = &dnc->dnc_children[idx];
1681
		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1682
			dn = dnh->dnh_dnode;
1683
		} else {
1684
			dn = dnode_create(os, dn_block + idx, db,
1685
			    object, dnh);
1686
			dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
1687
		}
1688

1689
		mutex_enter(&dn->dn_mtx);
1690
		if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
1691
			DNODE_STAT_BUMP(dnode_hold_free_refcount);
1692
			mutex_exit(&dn->dn_mtx);
1693
			dnode_slots_rele(dnc, idx, slots);
1694
			dbuf_rele(db, FTAG);
1695
			return (SET_ERROR(EEXIST));
1696
		}
1697

1698
		/* Don't actually hold if dry run, just return 0 */
1699
		if (flag & DNODE_DRY_RUN) {
1700
			mutex_exit(&dn->dn_mtx);
1701
			dnode_slots_rele(dnc, idx, slots);
1702
			dbuf_rele(db, FTAG);
1703
			return (0);
1704
		}
1705

1706
		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
1707
		DNODE_STAT_BUMP(dnode_hold_free_hits);
1708
	} else {
1709
		dbuf_rele(db, FTAG);
1710
		return (SET_ERROR(EINVAL));
1711
	}
1712

1713
	ASSERT0(dn->dn_free_txg);
1714

1715
	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
1716
		dbuf_add_ref(db, dnh);
1717

1718
	mutex_exit(&dn->dn_mtx);
1719

1720
	/* Now we can rely on the hold to prevent the dnode from moving. */
1721
	dnode_slots_rele(dnc, idx, slots);
1722

1723
	DNODE_VERIFY(dn);
1724
	ASSERT3P(dnp, !=, NULL);
1725
	ASSERT3P(dn->dn_dbuf, ==, db);
1726
	ASSERT3U(dn->dn_object, ==, object);
1727
	dbuf_rele(db, FTAG);
1728

1729
	*dnp = dn;
1730
	return (0);
1731
}
1732

1733
/*
1734
 * Return held dnode if the object is allocated, NULL if not.
1735
 */
1736
int
1737
dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp)
1738
{
1739
	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
1740
	    dnp));
1741
}
1742

1743
/*
1744
 * Can only add a reference if there is already at least one
1745
 * reference on the dnode.  Returns FALSE if unable to add a
1746
 * new reference.
1747
 */
1748
static boolean_t
1749
dnode_add_ref_locked(dnode_t *dn, const void *tag)
1750
{
1751
	ASSERT(MUTEX_HELD(&dn->dn_mtx));
1752
	if (zfs_refcount_is_zero(&dn->dn_holds))
1753
		return (FALSE);
1754
	VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
1755
	return (TRUE);
1756
}
1757

1758
boolean_t
1759
dnode_add_ref(dnode_t *dn, const void *tag)
1760
{
1761
	mutex_enter(&dn->dn_mtx);
1762
	boolean_t r = dnode_add_ref_locked(dn, tag);
1763
	mutex_exit(&dn->dn_mtx);
1764
	return (r);
1765
}
1766

1767
void
1768
dnode_rele(dnode_t *dn, const void *tag)
1769
{
1770
	mutex_enter(&dn->dn_mtx);
1771
	dnode_rele_and_unlock(dn, tag, B_FALSE);
1772
}
1773

1774
void
1775
dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting)
1776
{
1777
	uint64_t refs;
1778
	/* Get while the hold prevents the dnode from moving. */
1779
	dmu_buf_impl_t *db = dn->dn_dbuf;
1780
	dnode_handle_t *dnh = dn->dn_handle;
1781

1782
	refs = zfs_refcount_remove(&dn->dn_holds, tag);
1783
	if (refs == 0)
1784
		cv_broadcast(&dn->dn_nodnholds);
1785
	mutex_exit(&dn->dn_mtx);
1786
	/* dnode could get destroyed at this point, so don't use it anymore */
1787

1788
	/*
1789
	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
1790
	 * indirectly by dbuf_rele() while relying on the dnode handle to
1791
	 * prevent the dnode from moving, since releasing the last hold could
1792
	 * result in the dnode's parent dbuf evicting its dnode handles. For
1793
	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
1794
	 * other direct or indirect hold on the dnode must first drop the dnode
1795
	 * handle.
1796
	 */
1797
#ifdef ZFS_DEBUG
1798
	ASSERT(refs > 0 || zrl_owner(&dnh->dnh_zrlock) != curthread);
1799
#endif
1800

1801
	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1802
	if (refs == 0 && db != NULL) {
1803
		/*
1804
		 * Another thread could add a hold to the dnode handle in
1805
		 * dnode_hold_impl() while holding the parent dbuf. Since the
1806
		 * hold on the parent dbuf prevents the handle from being
1807
		 * destroyed, the hold on the handle is OK. We can't yet assert
1808
		 * that the handle has zero references, but that will be
1809
		 * asserted anyway when the handle gets destroyed.
1810
		 */
1811
		mutex_enter(&db->db_mtx);
1812
		dbuf_rele_and_unlock(db, dnh, evicting);
1813
	}
1814
}
1815

1816
/*
1817
 * Test whether we can create a dnode at the specified location.
1818
 */
1819
int
1820
dnode_try_claim(objset_t *os, uint64_t object, int slots)
1821
{
1822
	return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
1823
	    slots, NULL, NULL));
1824
}
1825

1826
/*
1827
 * Test if the dnode is dirty, or carrying uncommitted records.
1828
 *
1829
 * dn_dirtycnt is the number of txgs this dnode is dirty on. It's incremented
1830
 * in dnode_setdirty() the first time the dnode is dirtied on a txg, and
1831
 * decremented in either dnode_rele_task() or userquota_updates_task() when the
1832
 * txg is synced out.
1833
 */
1834
boolean_t
1835
dnode_is_dirty(dnode_t *dn)
1836
{
1837
	mutex_enter(&dn->dn_mtx);
1838
	boolean_t dirty = (dn->dn_dirtycnt != 0);
1839
	mutex_exit(&dn->dn_mtx);
1840
	return (dirty);
1841
}
1842

1843
void
1844
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1845
{
1846
	objset_t *os = dn->dn_objset;
1847
	uint64_t txg = tx->tx_txg;
1848

1849
	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1850
		dsl_dataset_dirty(os->os_dsl_dataset, tx);
1851
		return;
1852
	}
1853

1854
	DNODE_VERIFY(dn);
1855

1856
#ifdef ZFS_DEBUG
1857
	mutex_enter(&dn->dn_mtx);
1858
	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1859
	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1860
	mutex_exit(&dn->dn_mtx);
1861
#endif
1862

1863
	/*
1864
	 * Determine old uid/gid when necessary
1865
	 */
1866
	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
1867

1868
	multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK];
1869
	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
1870

1871
	/*
1872
	 * If we are already marked dirty, we're done.
1873
	 */
1874
	if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
1875
		multilist_sublist_unlock(mls);
1876
		return;
1877
	}
1878

1879
	ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
1880
	    !avl_is_empty(&dn->dn_dbufs));
1881
	ASSERT(dn->dn_datablksz != 0);
1882
	ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]);
1883
	ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]);
1884
	ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]);
1885

1886
	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1887
	    (u_longlong_t)dn->dn_object, (u_longlong_t)txg);
1888

1889
	multilist_sublist_insert_head(mls, dn);
1890

1891
	multilist_sublist_unlock(mls);
1892

1893
	/*
1894
	 * The dnode maintains a hold on its containing dbuf as
1895
	 * long as there are holds on it.  Each instantiated child
1896
	 * dbuf maintains a hold on the dnode.  When the last child
1897
	 * drops its hold, the dnode will drop its hold on the
1898
	 * containing dbuf. We add a "dirty hold" here so that the
1899
	 * dnode will hang around after we finish processing its
1900
	 * children.
1901
	 */
1902
	mutex_enter(&dn->dn_mtx);
1903
	VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg));
1904
	dn->dn_dirtycnt++;
1905
	ASSERT3U(dn->dn_dirtycnt, <=, 3);
1906
	mutex_exit(&dn->dn_mtx);
1907

1908
	(void) dbuf_dirty(dn->dn_dbuf, tx);
1909

1910
	dsl_dataset_dirty(os->os_dsl_dataset, tx);
1911
}
1912

1913
void
1914
dnode_free(dnode_t *dn, dmu_tx_t *tx)
1915
{
1916
	mutex_enter(&dn->dn_mtx);
1917
	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1918
		mutex_exit(&dn->dn_mtx);
1919
		return;
1920
	}
1921
	dn->dn_free_txg = tx->tx_txg;
1922
	mutex_exit(&dn->dn_mtx);
1923

1924
	dnode_setdirty(dn, tx);
1925
}
1926

1927
/*
1928
 * Try to change the block size for the indicated dnode.  This can only
1929
 * succeed if there are no blocks allocated or dirty beyond first block
1930
 */
1931
int
1932
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
1933
{
1934
	dmu_buf_impl_t *db;
1935
	int err;
1936

1937
	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
1938
	if (size == 0)
1939
		size = SPA_MINBLOCKSIZE;
1940
	else
1941
		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
1942

1943
	if (ibs == dn->dn_indblkshift)
1944
		ibs = 0;
1945

1946
	if (size == dn->dn_datablksz && ibs == 0)
1947
		return (0);
1948

1949
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1950

1951
	/* Check for any allocated blocks beyond the first */
1952
	if (dn->dn_maxblkid != 0)
1953
		goto fail;
1954

1955
	mutex_enter(&dn->dn_dbufs_mtx);
1956
	for (db = avl_first(&dn->dn_dbufs); db != NULL;
1957
	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
1958
		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
1959
		    db->db_blkid != DMU_SPILL_BLKID) {
1960
			mutex_exit(&dn->dn_dbufs_mtx);
1961
			goto fail;
1962
		}
1963
	}
1964
	mutex_exit(&dn->dn_dbufs_mtx);
1965

1966
	if (ibs && dn->dn_nlevels != 1)
1967
		goto fail;
1968

1969
	dnode_setdirty(dn, tx);
1970
	if (size != dn->dn_datablksz) {
1971
		/* resize the old block */
1972
		err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
1973
		if (err == 0) {
1974
			dbuf_new_size(db, size, tx);
1975
		} else if (err != ENOENT) {
1976
			goto fail;
1977
		}
1978

1979
		dnode_setdblksz(dn, size);
1980
		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
1981
		if (db)
1982
			dbuf_rele(db, FTAG);
1983
	}
1984
	if (ibs) {
1985
		dn->dn_indblkshift = ibs;
1986
		dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
1987
	}
1988

1989
	rw_exit(&dn->dn_struct_rwlock);
1990
	return (0);
1991

1992
fail:
1993
	rw_exit(&dn->dn_struct_rwlock);
1994
	return (SET_ERROR(ENOTSUP));
1995
}
1996

1997
static void
1998
dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
1999
{
2000
	uint64_t txgoff = tx->tx_txg & TXG_MASK;
2001
	int old_nlevels = dn->dn_nlevels;
2002
	dmu_buf_impl_t *db;
2003
	list_t *list;
2004
	dbuf_dirty_record_t *new, *dr, *dr_next;
2005

2006
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2007

2008
	ASSERT3U(new_nlevels, >, dn->dn_nlevels);
2009
	dn->dn_nlevels = new_nlevels;
2010

2011
	ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
2012
	dn->dn_next_nlevels[txgoff] = new_nlevels;
2013

2014
	/* dirty the left indirects */
2015
	db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
2016
	ASSERT(db != NULL);
2017
	new = dbuf_dirty(db, tx);
2018
	dbuf_rele(db, FTAG);
2019

2020
	/* transfer the dirty records to the new indirect */
2021
	mutex_enter(&dn->dn_mtx);
2022
	mutex_enter(&new->dt.di.dr_mtx);
2023
	list = &dn->dn_dirty_records[txgoff];
2024
	for (dr = list_head(list); dr; dr = dr_next) {
2025
		dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
2026

2027
		IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1);
2028
		if (dr->dr_dbuf == NULL ||
2029
		    (dr->dr_dbuf->db_level == old_nlevels - 1 &&
2030
		    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
2031
		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) {
2032
			list_remove(&dn->dn_dirty_records[txgoff], dr);
2033
			list_insert_tail(&new->dt.di.dr_children, dr);
2034
			dr->dr_parent = new;
2035
		}
2036
	}
2037
	mutex_exit(&new->dt.di.dr_mtx);
2038
	mutex_exit(&dn->dn_mtx);
2039
}
2040

2041
int
2042
dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
2043
{
2044
	int ret = 0;
2045

2046
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2047

2048
	if (dn->dn_nlevels == nlevels) {
2049
		ret = 0;
2050
		goto out;
2051
	} else if (nlevels < dn->dn_nlevels) {
2052
		ret = SET_ERROR(EINVAL);
2053
		goto out;
2054
	}
2055

2056
	dnode_set_nlevels_impl(dn, nlevels, tx);
2057

2058
out:
2059
	rw_exit(&dn->dn_struct_rwlock);
2060
	return (ret);
2061
}
2062

2063
/* read-holding callers must not rely on the lock being continuously held */
2064
void
2065
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
2066
    boolean_t force)
2067
{
2068
	int epbs, new_nlevels;
2069
	uint64_t sz;
2070

2071
	ASSERT(blkid != DMU_BONUS_BLKID);
2072

2073
	ASSERT(have_read ?
2074
	    RW_READ_HELD(&dn->dn_struct_rwlock) :
2075
	    RW_WRITE_HELD(&dn->dn_struct_rwlock));
2076

2077
	/*
2078
	 * if we have a read-lock, check to see if we need to do any work
2079
	 * before upgrading to a write-lock.
2080
	 */
2081
	if (have_read) {
2082
		if (blkid <= dn->dn_maxblkid)
2083
			return;
2084

2085
		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
2086
			rw_exit(&dn->dn_struct_rwlock);
2087
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2088
		}
2089
	}
2090

2091
	/*
2092
	 * Raw sends (indicated by the force flag) require that we take the
2093
	 * given blkid even if the value is lower than the current value.
2094
	 */
2095
	if (!force && blkid <= dn->dn_maxblkid)
2096
		goto out;
2097

2098
	/*
2099
	 * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
2100
	 * to indicate that this field is set. This allows us to set the
2101
	 * maxblkid to 0 on an existing object in dnode_sync().
2102
	 */
2103
	dn->dn_maxblkid = blkid;
2104
	dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
2105
	    blkid | DMU_NEXT_MAXBLKID_SET;
2106

2107
	/*
2108
	 * Compute the number of levels necessary to support the new maxblkid.
2109
	 * Raw sends will ensure nlevels is set correctly for us.
2110
	 */
2111
	new_nlevels = 1;
2112
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2113
	for (sz = dn->dn_nblkptr;
2114
	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
2115
		new_nlevels++;
2116

2117
	ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
2118

2119
	if (!force) {
2120
		if (new_nlevels > dn->dn_nlevels)
2121
			dnode_set_nlevels_impl(dn, new_nlevels, tx);
2122
	} else {
2123
		ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
2124
	}
2125

2126
out:
2127
	if (have_read)
2128
		rw_downgrade(&dn->dn_struct_rwlock);
2129
}
2130

2131
static void
2132
dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
2133
{
2134
	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
2135
	if (db != NULL) {
2136
		dmu_buf_will_dirty(&db->db, tx);
2137
		dbuf_rele(db, FTAG);
2138
	}
2139
}
2140

2141
/*
2142
 * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
2143
 * and end_blkid.
2144
 */
2145
static void
2146
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
2147
    dmu_tx_t *tx)
2148
{
2149
	dmu_buf_impl_t *db_search;
2150
	dmu_buf_impl_t *db;
2151
	avl_index_t where;
2152

2153
	db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
2154

2155
	mutex_enter(&dn->dn_dbufs_mtx);
2156

2157
	db_search->db_level = 1;
2158
	db_search->db_blkid = start_blkid + 1;
2159
	db_search->db_state = DB_SEARCH;
2160
	for (;;) {
2161

2162
		db = avl_find(&dn->dn_dbufs, db_search, &where);
2163
		if (db == NULL)
2164
			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
2165

2166
		if (db == NULL || db->db_level != 1 ||
2167
		    db->db_blkid >= end_blkid) {
2168
			break;
2169
		}
2170

2171
		/*
2172
		 * Setup the next blkid we want to search for.
2173
		 */
2174
		db_search->db_blkid = db->db_blkid + 1;
2175
		ASSERT3U(db->db_blkid, >=, start_blkid);
2176

2177
		/*
2178
		 * If the dbuf transitions to DB_EVICTING while we're trying
2179
		 * to dirty it, then we will be unable to discover it in
2180
		 * the dbuf hash table. This will result in a call to
2181
		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
2182
		 * lock. To avoid a deadlock, we drop the lock before
2183
		 * dirtying the level-1 dbuf.
2184
		 */
2185
		mutex_exit(&dn->dn_dbufs_mtx);
2186
		dnode_dirty_l1(dn, db->db_blkid, tx);
2187
		mutex_enter(&dn->dn_dbufs_mtx);
2188
	}
2189

2190
#ifdef ZFS_DEBUG
2191
	/*
2192
	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
2193
	 */
2194
	db_search->db_level = 1;
2195
	db_search->db_blkid = start_blkid + 1;
2196
	db_search->db_state = DB_SEARCH;
2197
	db = avl_find(&dn->dn_dbufs, db_search, &where);
2198
	if (db == NULL)
2199
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
2200
	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
2201
		if (db->db_level != 1 || db->db_blkid >= end_blkid)
2202
			break;
2203
		if (db->db_state != DB_EVICTING)
2204
			ASSERT(db->db_dirtycnt > 0);
2205
	}
2206
#endif
2207
	kmem_free(db_search, sizeof (dmu_buf_impl_t));
2208
	mutex_exit(&dn->dn_dbufs_mtx);
2209
}
2210

2211
static void
2212
dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len,
2213
    dmu_tx_t *tx)
2214
{
2215
	dmu_buf_impl_t *db;
2216
	int res;
2217

2218
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
2219
	res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE,
2220
	    FTAG, &db);
2221
	rw_exit(&dn->dn_struct_rwlock);
2222
	if (res == 0) {
2223
		db_lock_type_t dblt;
2224
		boolean_t dirty;
2225

2226
		dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
2227
		/* don't dirty if not on disk and not dirty */
2228
		dirty = !list_is_empty(&db->db_dirty_records) ||
2229
		    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
2230
		dmu_buf_unlock_parent(db, dblt, FTAG);
2231
		if (dirty) {
2232
			caddr_t data;
2233

2234
			dmu_buf_will_dirty(&db->db, tx);
2235
			data = db->db.db_data;
2236
			memset(data + blkoff, 0, len);
2237
		}
2238
		dbuf_rele(db, FTAG);
2239
	}
2240
}
2241

2242
void
2243
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
2244
{
2245
	uint64_t blkoff, blkid, nblks;
2246
	int blksz, blkshift, head, tail;
2247
	int trunc = FALSE;
2248
	int epbs;
2249

2250
	blksz = dn->dn_datablksz;
2251
	blkshift = dn->dn_datablkshift;
2252
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2253

2254
	if (len == DMU_OBJECT_END) {
2255
		len = UINT64_MAX - off;
2256
		trunc = TRUE;
2257
	}
2258

2259
	/*
2260
	 * First, block align the region to free:
2261
	 */
2262
	if (ISP2(blksz)) {
2263
		head = P2NPHASE(off, blksz);
2264
		blkoff = P2PHASE(off, blksz);
2265
		if ((off >> blkshift) > dn->dn_maxblkid)
2266
			return;
2267
	} else {
2268
		ASSERT0(dn->dn_maxblkid);
2269
		if (off == 0 && len >= blksz) {
2270
			/*
2271
			 * Freeing the whole block; fast-track this request.
2272
			 */
2273
			blkid = 0;
2274
			nblks = 1;
2275
			if (dn->dn_nlevels > 1) {
2276
				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2277
				dnode_dirty_l1(dn, 0, tx);
2278
				rw_exit(&dn->dn_struct_rwlock);
2279
			}
2280
			goto done;
2281
		} else if (off >= blksz) {
2282
			/* Freeing past end-of-data */
2283
			return;
2284
		} else {
2285
			/* Freeing part of the block. */
2286
			head = blksz - off;
2287
			ASSERT3U(head, >, 0);
2288
		}
2289
		blkoff = off;
2290
	}
2291
	/* zero out any partial block data at the start of the range */
2292
	if (head) {
2293
		ASSERT3U(blkoff + head, ==, blksz);
2294
		if (len < head)
2295
			head = len;
2296
		dnode_partial_zero(dn, off, blkoff, head, tx);
2297
		off += head;
2298
		len -= head;
2299
	}
2300

2301
	/* If the range was less than one block, we're done */
2302
	if (len == 0)
2303
		return;
2304

2305
	/* If the remaining range is past end of file, we're done */
2306
	if ((off >> blkshift) > dn->dn_maxblkid)
2307
		return;
2308

2309
	ASSERT(ISP2(blksz));
2310
	if (trunc)
2311
		tail = 0;
2312
	else
2313
		tail = P2PHASE(len, blksz);
2314

2315
	ASSERT0(P2PHASE(off, blksz));
2316
	/* zero out any partial block data at the end of the range */
2317
	if (tail) {
2318
		if (len < tail)
2319
			tail = len;
2320
		dnode_partial_zero(dn, off + len, 0, tail, tx);
2321
		len -= tail;
2322
	}
2323

2324
	/* If the range did not include a full block, we are done */
2325
	if (len == 0)
2326
		return;
2327

2328
	ASSERT(IS_P2ALIGNED(off, blksz));
2329
	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
2330
	blkid = off >> blkshift;
2331
	nblks = len >> blkshift;
2332
	if (trunc)
2333
		nblks += 1;
2334

2335
	/*
2336
	 * Dirty all the indirect blocks in this range.  Note that only
2337
	 * the first and last indirect blocks can actually be written
2338
	 * (if they were partially freed) -- they must be dirtied, even if
2339
	 * they do not exist on disk yet.  The interior blocks will
2340
	 * be freed by free_children(), so they will not actually be written.
2341
	 * Even though these interior blocks will not be written, we
2342
	 * dirty them for two reasons:
2343
	 *
2344
	 *  - It ensures that the indirect blocks remain in memory until
2345
	 *    syncing context.  (They have already been prefetched by
2346
	 *    dmu_tx_hold_free(), so we don't have to worry about reading
2347
	 *    them serially here.)
2348
	 *
2349
	 *  - The dirty space accounting will put pressure on the txg sync
2350
	 *    mechanism to begin syncing, and to delay transactions if there
2351
	 *    is a large amount of freeing.  Even though these indirect
2352
	 *    blocks will not be written, we could need to write the same
2353
	 *    amount of space if we copy the freed BPs into deadlists.
2354
	 */
2355
	if (dn->dn_nlevels > 1) {
2356
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2357
		uint64_t first, last;
2358

2359
		first = blkid >> epbs;
2360
		dnode_dirty_l1(dn, first, tx);
2361
		if (trunc)
2362
			last = dn->dn_maxblkid >> epbs;
2363
		else
2364
			last = (blkid + nblks - 1) >> epbs;
2365
		if (last != first)
2366
			dnode_dirty_l1(dn, last, tx);
2367

2368
		dnode_dirty_l1range(dn, first, last, tx);
2369

2370
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
2371
		    SPA_BLKPTRSHIFT;
2372
		for (uint64_t i = first + 1; i < last; i++) {
2373
			/*
2374
			 * Set i to the blockid of the next non-hole
2375
			 * level-1 indirect block at or after i.  Note
2376
			 * that dnode_next_offset() operates in terms of
2377
			 * level-0-equivalent bytes.
2378
			 */
2379
			uint64_t ibyte = i << shift;
2380
			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
2381
			    &ibyte, 2, 1, 0);
2382
			i = ibyte >> shift;
2383
			if (i >= last)
2384
				break;
2385

2386
			/*
2387
			 * Normally we should not see an error, either
2388
			 * from dnode_next_offset() or dbuf_hold_level()
2389
			 * (except for ESRCH from dnode_next_offset).
2390
			 * If there is an i/o error, then when we read
2391
			 * this block in syncing context, it will use
2392
			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
2393
			 * to the "failmode" property.  dnode_next_offset()
2394
			 * doesn't have a flag to indicate MUSTSUCCEED.
2395
			 */
2396
			if (err != 0)
2397
				break;
2398

2399
			dnode_dirty_l1(dn, i, tx);
2400
		}
2401
		rw_exit(&dn->dn_struct_rwlock);
2402
	}
2403

2404
done:
2405
	/*
2406
	 * Add this range to the dnode range list.
2407
	 * We will finish up this free operation in the syncing phase.
2408
	 */
2409
	mutex_enter(&dn->dn_mtx);
2410
	{
2411
		int txgoff = tx->tx_txg & TXG_MASK;
2412
		if (dn->dn_free_ranges[txgoff] == NULL) {
2413
			dn->dn_free_ranges[txgoff] =
2414
			    zfs_range_tree_create_flags(
2415
			    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
2416
			    ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
2417
		}
2418
		zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
2419
		zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
2420
	}
2421
	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
2422
	    (u_longlong_t)blkid, (u_longlong_t)nblks,
2423
	    (u_longlong_t)tx->tx_txg);
2424
	mutex_exit(&dn->dn_mtx);
2425

2426
	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
2427
	dnode_setdirty(dn, tx);
2428
}
2429

2430
static boolean_t
2431
dnode_spill_freed(dnode_t *dn)
2432
{
2433
	int i;
2434

2435
	mutex_enter(&dn->dn_mtx);
2436
	for (i = 0; i < TXG_SIZE; i++) {
2437
		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
2438
			break;
2439
	}
2440
	mutex_exit(&dn->dn_mtx);
2441
	return (i < TXG_SIZE);
2442
}
2443

2444
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
2445
uint64_t
2446
dnode_block_freed(dnode_t *dn, uint64_t blkid)
2447
{
2448
	int i;
2449

2450
	if (blkid == DMU_BONUS_BLKID)
2451
		return (FALSE);
2452

2453
	if (dn->dn_free_txg)
2454
		return (TRUE);
2455

2456
	if (blkid == DMU_SPILL_BLKID)
2457
		return (dnode_spill_freed(dn));
2458

2459
	mutex_enter(&dn->dn_mtx);
2460
	for (i = 0; i < TXG_SIZE; i++) {
2461
		if (dn->dn_free_ranges[i] != NULL &&
2462
		    zfs_range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
2463
			break;
2464
	}
2465
	mutex_exit(&dn->dn_mtx);
2466
	return (i < TXG_SIZE);
2467
}
2468

2469
/* call from syncing context when we actually write/free space for this dnode */
2470
void
2471
dnode_diduse_space(dnode_t *dn, int64_t delta)
2472
{
2473
	uint64_t space;
2474
	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
2475
	    dn, dn->dn_phys,
2476
	    (u_longlong_t)dn->dn_phys->dn_used,
2477
	    (longlong_t)delta);
2478

2479
	mutex_enter(&dn->dn_mtx);
2480
	space = DN_USED_BYTES(dn->dn_phys);
2481
	if (delta > 0) {
2482
		ASSERT3U(space + delta, >=, space); /* no overflow */
2483
	} else {
2484
		ASSERT3U(space, >=, -delta); /* no underflow */
2485
	}
2486
	space += delta;
2487
	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
2488
		ASSERT0((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES));
2489
		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
2490
		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
2491
	} else {
2492
		dn->dn_phys->dn_used = space;
2493
		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
2494
	}
2495
	mutex_exit(&dn->dn_mtx);
2496
}
2497

2498
/*
2499
 * Scans a block at the indicated "level" looking for a hole or data,
2500
 * depending on 'flags'.
2501
 *
2502
 * If level > 0, then we are scanning an indirect block looking at its
2503
 * pointers.  If level == 0, then we are looking at a block of dnodes.
2504
 *
2505
 * If we don't find what we are looking for in the block, we return ESRCH.
2506
 * Otherwise, return with *offset pointing to the beginning (if searching
2507
 * forwards) or end (if searching backwards) of the range covered by the
2508
 * block pointer we matched on (or dnode).
2509
 *
2510
 * The basic search algorithm used below by dnode_next_offset() is to
2511
 * use this function to search up the block tree (widen the search) until
2512
 * we find something (i.e., we don't return ESRCH) and then search back
2513
 * down the tree (narrow the search) until we reach our original search
2514
 * level.
2515
 */
2516
static int
2517
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
2518
    int lvl, uint64_t blkfill, uint64_t txg)
2519
{
2520
	dmu_buf_impl_t *db = NULL;
2521
	void *data = NULL;
2522
	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2523
	uint64_t epb = 1ULL << epbs;
2524
	uint64_t minfill, maxfill;
2525
	boolean_t hole;
2526
	int i, inc, error, span;
2527

2528
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2529

2530
	hole = ((flags & DNODE_FIND_HOLE) != 0);
2531
	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
2532
	ASSERT(txg == 0 || !hole);
2533

2534
	if (lvl == dn->dn_phys->dn_nlevels) {
2535
		error = 0;
2536
		epb = dn->dn_phys->dn_nblkptr;
2537
		data = dn->dn_phys->dn_blkptr;
2538
		if (dn->dn_dbuf != NULL)
2539
			rw_enter(&dn->dn_dbuf->db_rwlock, RW_READER);
2540
		else if (dmu_objset_ds(dn->dn_objset) != NULL)
2541
			rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
2542
			    RW_READER, FTAG);
2543
	} else {
2544
		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
2545
		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
2546
		if (error) {
2547
			if (error != ENOENT)
2548
				return (error);
2549
			if (hole)
2550
				return (0);
2551
			/*
2552
			 * This can only happen when we are searching up
2553
			 * the block tree for data.  We don't really need to
2554
			 * adjust the offset, as we will just end up looking
2555
			 * at the pointer to this block in its parent, and its
2556
			 * going to be unallocated, so we will skip over it.
2557
			 */
2558
			return (SET_ERROR(ESRCH));
2559
		}
2560
		error = dbuf_read(db, NULL,
2561
		    DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
2562
		    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
2563
		if (error) {
2564
			dbuf_rele(db, FTAG);
2565
			return (error);
2566
		}
2567
		data = db->db.db_data;
2568
		rw_enter(&db->db_rwlock, RW_READER);
2569
	}
2570

2571
	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
2572
	    BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
2573
	    BP_IS_HOLE(db->db_blkptr))) {
2574
		/*
2575
		 * This can only happen when we are searching up the tree
2576
		 * and these conditions mean that we need to keep climbing.
2577
		 */
2578
		error = SET_ERROR(ESRCH);
2579
	} else if (lvl == 0) {
2580
		dnode_phys_t *dnp = data;
2581

2582
		ASSERT(dn->dn_type == DMU_OT_DNODE);
2583
		ASSERT(!(flags & DNODE_FIND_BACKWARDS));
2584

2585
		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
2586
		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
2587
			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
2588
				break;
2589
		}
2590

2591
		if (i == blkfill)
2592
			error = SET_ERROR(ESRCH);
2593

2594
		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
2595
		    (i << DNODE_SHIFT);
2596
	} else {
2597
		blkptr_t *bp = data;
2598
		uint64_t start = *offset;
2599
		span = (lvl - 1) * epbs + dn->dn_datablkshift;
2600
		minfill = 0;
2601
		maxfill = blkfill << ((lvl - 1) * epbs);
2602

2603
		if (hole)
2604
			maxfill--;
2605
		else
2606
			minfill++;
2607

2608
		if (span >= 8 * sizeof (*offset)) {
2609
			/* This only happens on the highest indirection level */
2610
			ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
2611
			*offset = 0;
2612
		} else {
2613
			*offset = *offset >> span;
2614
		}
2615

2616
		for (i = BF64_GET(*offset, 0, epbs);
2617
		    i >= 0 && i < epb; i += inc) {
2618
			if (BP_GET_FILL(&bp[i]) >= minfill &&
2619
			    BP_GET_FILL(&bp[i]) <= maxfill &&
2620
			    (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
2621
				break;
2622
			if (inc > 0 || *offset > 0)
2623
				*offset += inc;
2624
		}
2625

2626
		if (span >= 8 * sizeof (*offset)) {
2627
			*offset = start;
2628
		} else {
2629
			*offset = *offset << span;
2630
		}
2631

2632
		if (inc < 0) {
2633
			/* traversing backwards; position offset at the end */
2634
			if (span < 8 * sizeof (*offset))
2635
				*offset = MIN(*offset + (1ULL << span) - 1,
2636
				    start);
2637
		} else if (*offset < start) {
2638
			*offset = start;
2639
		}
2640
		if (i < 0 || i >= epb)
2641
			error = SET_ERROR(ESRCH);
2642
	}
2643

2644
	if (db != NULL) {
2645
		rw_exit(&db->db_rwlock);
2646
		dbuf_rele(db, FTAG);
2647
	} else {
2648
		if (dn->dn_dbuf != NULL)
2649
			rw_exit(&dn->dn_dbuf->db_rwlock);
2650
		else if (dmu_objset_ds(dn->dn_objset) != NULL)
2651
			rrw_exit(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
2652
			    FTAG);
2653
	}
2654

2655
	return (error);
2656
}
2657

2658
/*
2659
 * Adjust *offset to the next (or previous) block byte offset at lvl.
2660
 * Returns FALSE if *offset would overflow or underflow.
2661
 */
2662
static boolean_t
2663
dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
2664
{
2665
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2666
	int span = lvl * epbs + dn->dn_datablkshift;
2667
	uint64_t blkid, maxblkid;
2668

2669
	if (span >= 8 * sizeof (uint64_t))
2670
		return (B_FALSE);
2671

2672
	blkid = *offset >> span;
2673
	maxblkid = 1ULL << (8 * sizeof (*offset) - span);
2674
	if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
2675
		*offset = (blkid + 1) << span;
2676
	else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
2677
		*offset = (blkid << span) - 1;
2678
	else
2679
		return (B_FALSE);
2680

2681
	return (B_TRUE);
2682
}
2683

2684
/*
2685
 * Find the next hole, data, or sparse region at or after *offset.
2686
 * The value 'blkfill' tells us how many items we expect to find
2687
 * in an L0 data block; this value is 1 for normal objects,
2688
 * DNODES_PER_BLOCK for the meta dnode, and some fraction of
2689
 * DNODES_PER_BLOCK when searching for sparse regions thereof.
2690
 *
2691
 * Examples:
2692
 *
2693
 * dnode_next_offset(dn, flags, offset, 1, 1, 0);
2694
 *	Finds the next/previous hole/data in a file.
2695
 *	Used in dmu_offset_next().
2696
 *
2697
 * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
2698
 *	Finds the next free/allocated dnode an objset's meta-dnode.
2699
 *	Only finds objects that have new contents since txg (ie.
2700
 *	bonus buffer changes and content removal are ignored).
2701
 *	Used in dmu_object_next().
2702
 *
2703
 * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
2704
 *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
2705
 *	Used in dmu_object_alloc().
2706
 */
2707
int
2708
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
2709
    int minlvl, uint64_t blkfill, uint64_t txg)
2710
{
2711
	uint64_t matched = *offset;
2712
	int lvl, maxlvl;
2713
	int error = 0;
2714

2715
	if (!(flags & DNODE_FIND_HAVELOCK))
2716
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
2717

2718
	if (dn->dn_phys->dn_nlevels == 0) {
2719
		error = SET_ERROR(ESRCH);
2720
		goto out;
2721
	}
2722

2723
	if (dn->dn_datablkshift == 0) {
2724
		if (*offset < dn->dn_datablksz) {
2725
			if (flags & DNODE_FIND_HOLE)
2726
				*offset = dn->dn_datablksz;
2727
		} else {
2728
			error = SET_ERROR(ESRCH);
2729
		}
2730
		goto out;
2731
	}
2732

2733
	maxlvl = dn->dn_phys->dn_nlevels;
2734

2735
	for (lvl = minlvl; lvl <= maxlvl; ) {
2736
		error = dnode_next_offset_level(dn,
2737
		    flags, offset, lvl, blkfill, txg);
2738
		if (error == 0 && lvl > minlvl) {
2739
			--lvl;
2740
			matched = *offset;
2741
		} else if (error == ESRCH && lvl < maxlvl &&
2742
		    dnode_next_block(dn, flags, &matched, lvl)) {
2743
			/*
2744
			 * Continue search at next/prev offset in lvl+1 block.
2745
			 *
2746
			 * Usually we only search upwards at the start of the
2747
			 * search as higher level blocks point at a matching
2748
			 * minlvl block in most cases, but we backtrack if not.
2749
			 *
2750
			 * This can happen for txg > 0 searches if the block
2751
			 * contains only BPs/dnodes freed at that txg. It also
2752
			 * happens if we are still syncing out the tree, and
2753
			 * some BP's at higher levels are not updated yet.
2754
			 *
2755
			 * We must adjust offset to avoid coming back to the
2756
			 * same offset and getting stuck looping forever. This
2757
			 * also deals with the case where offset is already at
2758
			 * the beginning or end of the object.
2759
			 */
2760
			++lvl;
2761
			*offset = matched;
2762
		} else {
2763
			break;
2764
		}
2765
	}
2766

2767
	/*
2768
	 * There's always a "virtual hole" at the end of the object, even
2769
	 * if all BP's which physically exist are non-holes.
2770
	 */
2771
	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
2772
	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
2773
		error = 0;
2774
	}
2775

2776
out:
2777
	if (!(flags & DNODE_FIND_HAVELOCK))
2778
		rw_exit(&dn->dn_struct_rwlock);
2779

2780
	return (error);
2781
}
2782

2783
#if defined(_KERNEL)
2784
EXPORT_SYMBOL(dnode_hold);
2785
EXPORT_SYMBOL(dnode_rele);
2786
EXPORT_SYMBOL(dnode_set_nlevels);
2787
EXPORT_SYMBOL(dnode_set_blksz);
2788
EXPORT_SYMBOL(dnode_free_range);
2789
EXPORT_SYMBOL(dnode_evict_dbufs);
2790
EXPORT_SYMBOL(dnode_evict_bonus);
2791
#endif
2792

2793
ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW,
2794
	"Default dnode block shift");
2795
ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW,
2796
	"Default dnode indirect block shift");
2797

2798
Product

Resources

Company