Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/dnode.c
48383 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
/*
23
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26
*/
27
28
#include <sys/zfs_context.h>
29
#include <sys/dbuf.h>
30
#include <sys/dnode.h>
31
#include <sys/dmu.h>
32
#include <sys/dmu_impl.h>
33
#include <sys/dmu_tx.h>
34
#include <sys/dmu_objset.h>
35
#include <sys/dsl_dir.h>
36
#include <sys/dsl_dataset.h>
37
#include <sys/spa.h>
38
#include <sys/zio.h>
39
#include <sys/dmu_zfetch.h>
40
#include <sys/range_tree.h>
41
#include <sys/trace_zfs.h>
42
#include <sys/zfs_project.h>
43
44
dnode_stats_t dnode_stats = {
45
{ "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
46
{ "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
47
{ "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
48
{ "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
49
{ "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
50
{ "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
51
{ "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
52
{ "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
53
{ "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
54
{ "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
55
{ "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
56
{ "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
57
{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
58
{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
59
{ "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
60
{ "dnode_allocate", KSTAT_DATA_UINT64 },
61
{ "dnode_reallocate", KSTAT_DATA_UINT64 },
62
{ "dnode_buf_evict", KSTAT_DATA_UINT64 },
63
{ "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
64
{ "dnode_alloc_race", KSTAT_DATA_UINT64 },
65
{ "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
66
{ "dnode_move_invalid", KSTAT_DATA_UINT64 },
67
{ "dnode_move_recheck1", KSTAT_DATA_UINT64 },
68
{ "dnode_move_recheck2", KSTAT_DATA_UINT64 },
69
{ "dnode_move_special", KSTAT_DATA_UINT64 },
70
{ "dnode_move_handle", KSTAT_DATA_UINT64 },
71
{ "dnode_move_rwlock", KSTAT_DATA_UINT64 },
72
{ "dnode_move_active", KSTAT_DATA_UINT64 },
73
};
74
75
dnode_sums_t dnode_sums;
76
77
static kstat_t *dnode_ksp;
78
static kmem_cache_t *dnode_cache;
79
80
static dnode_phys_t dnode_phys_zero __maybe_unused;
81
82
int zfs_default_bs = SPA_MINBLOCKSHIFT;
83
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
84
85
#ifdef _KERNEL
86
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
87
#endif /* _KERNEL */
88
89
static char *
90
rt_name(dnode_t *dn, const char *name)
91
{
92
struct objset *os = dn->dn_objset;
93
94
return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
95
spa_name(os->os_spa),
96
(u_longlong_t)(os->os_dsl_dataset ?
97
os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
98
(u_longlong_t)dn->dn_object,
99
name));
100
}
101
102
static int
103
dbuf_compare(const void *x1, const void *x2)
104
{
105
const dmu_buf_impl_t *d1 = x1;
106
const dmu_buf_impl_t *d2 = x2;
107
108
int cmp = TREE_CMP(d1->db_level, d2->db_level);
109
if (likely(cmp))
110
return (cmp);
111
112
cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
113
if (likely(cmp))
114
return (cmp);
115
116
if (d1->db_state == DB_MARKER) {
117
ASSERT3S(d2->db_state, !=, DB_MARKER);
118
return (TREE_PCMP(d1->db_parent, d2));
119
} else if (d2->db_state == DB_MARKER) {
120
ASSERT3S(d1->db_state, !=, DB_MARKER);
121
return (TREE_PCMP(d1, d2->db_parent));
122
}
123
124
if (d1->db_state == DB_SEARCH) {
125
ASSERT3S(d2->db_state, !=, DB_SEARCH);
126
return (-1);
127
} else if (d2->db_state == DB_SEARCH) {
128
ASSERT3S(d1->db_state, !=, DB_SEARCH);
129
return (1);
130
}
131
132
return (TREE_PCMP(d1, d2));
133
}
134
135
static int
136
dnode_cons(void *arg, void *unused, int kmflag)
137
{
138
(void) unused, (void) kmflag;
139
dnode_t *dn = arg;
140
141
rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
142
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
143
mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
144
cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
145
cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);
146
147
/*
148
* Every dbuf has a reference, and dropping a tracked reference is
149
* O(number of references), so don't track dn_holds.
150
*/
151
zfs_refcount_create_untracked(&dn->dn_holds);
152
zfs_refcount_create(&dn->dn_tx_holds);
153
list_link_init(&dn->dn_link);
154
155
memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type));
156
memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr));
157
memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels));
158
memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift));
159
memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype));
160
memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk));
161
memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen));
162
memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz));
163
memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid));
164
165
for (int i = 0; i < TXG_SIZE; i++) {
166
multilist_link_init(&dn->dn_dirty_link[i]);
167
dn->dn_free_ranges[i] = NULL;
168
list_create(&dn->dn_dirty_records[i],
169
sizeof (dbuf_dirty_record_t),
170
offsetof(dbuf_dirty_record_t, dr_dirty_node));
171
}
172
173
dn->dn_allocated_txg = 0;
174
dn->dn_free_txg = 0;
175
dn->dn_assigned_txg = 0;
176
dn->dn_dirtycnt = 0;
177
dn->dn_bonus = NULL;
178
dn->dn_have_spill = B_FALSE;
179
dn->dn_zio = NULL;
180
dn->dn_oldused = 0;
181
dn->dn_oldflags = 0;
182
dn->dn_olduid = 0;
183
dn->dn_oldgid = 0;
184
dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
185
dn->dn_newuid = 0;
186
dn->dn_newgid = 0;
187
dn->dn_newprojid = ZFS_DEFAULT_PROJID;
188
dn->dn_id_flags = 0;
189
190
dn->dn_dbufs_count = 0;
191
avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
192
offsetof(dmu_buf_impl_t, db_link));
193
194
dn->dn_moved = 0;
195
return (0);
196
}
197
198
static void
199
dnode_dest(void *arg, void *unused)
200
{
201
(void) unused;
202
dnode_t *dn = arg;
203
204
rw_destroy(&dn->dn_struct_rwlock);
205
mutex_destroy(&dn->dn_mtx);
206
mutex_destroy(&dn->dn_dbufs_mtx);
207
cv_destroy(&dn->dn_notxholds);
208
cv_destroy(&dn->dn_nodnholds);
209
zfs_refcount_destroy(&dn->dn_holds);
210
zfs_refcount_destroy(&dn->dn_tx_holds);
211
ASSERT(!list_link_active(&dn->dn_link));
212
213
for (int i = 0; i < TXG_SIZE; i++) {
214
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
215
ASSERT0P(dn->dn_free_ranges[i]);
216
list_destroy(&dn->dn_dirty_records[i]);
217
ASSERT0(dn->dn_next_nblkptr[i]);
218
ASSERT0(dn->dn_next_nlevels[i]);
219
ASSERT0(dn->dn_next_indblkshift[i]);
220
ASSERT0(dn->dn_next_bonustype[i]);
221
ASSERT0(dn->dn_rm_spillblk[i]);
222
ASSERT0(dn->dn_next_bonuslen[i]);
223
ASSERT0(dn->dn_next_blksz[i]);
224
ASSERT0(dn->dn_next_maxblkid[i]);
225
}
226
227
ASSERT0(dn->dn_allocated_txg);
228
ASSERT0(dn->dn_free_txg);
229
ASSERT0(dn->dn_assigned_txg);
230
ASSERT0(dn->dn_dirtycnt);
231
ASSERT0P(dn->dn_bonus);
232
ASSERT(!dn->dn_have_spill);
233
ASSERT0P(dn->dn_zio);
234
ASSERT0(dn->dn_oldused);
235
ASSERT0(dn->dn_oldflags);
236
ASSERT0(dn->dn_olduid);
237
ASSERT0(dn->dn_oldgid);
238
ASSERT0(dn->dn_oldprojid);
239
ASSERT0(dn->dn_newuid);
240
ASSERT0(dn->dn_newgid);
241
ASSERT0(dn->dn_newprojid);
242
ASSERT0(dn->dn_id_flags);
243
244
ASSERT0(dn->dn_dbufs_count);
245
avl_destroy(&dn->dn_dbufs);
246
}
247
248
static int
249
dnode_kstats_update(kstat_t *ksp, int rw)
250
{
251
dnode_stats_t *ds = ksp->ks_data;
252
253
if (rw == KSTAT_WRITE)
254
return (EACCES);
255
ds->dnode_hold_dbuf_hold.value.ui64 =
256
wmsum_value(&dnode_sums.dnode_hold_dbuf_hold);
257
ds->dnode_hold_dbuf_read.value.ui64 =
258
wmsum_value(&dnode_sums.dnode_hold_dbuf_read);
259
ds->dnode_hold_alloc_hits.value.ui64 =
260
wmsum_value(&dnode_sums.dnode_hold_alloc_hits);
261
ds->dnode_hold_alloc_misses.value.ui64 =
262
wmsum_value(&dnode_sums.dnode_hold_alloc_misses);
263
ds->dnode_hold_alloc_interior.value.ui64 =
264
wmsum_value(&dnode_sums.dnode_hold_alloc_interior);
265
ds->dnode_hold_alloc_lock_retry.value.ui64 =
266
wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry);
267
ds->dnode_hold_alloc_lock_misses.value.ui64 =
268
wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses);
269
ds->dnode_hold_alloc_type_none.value.ui64 =
270
wmsum_value(&dnode_sums.dnode_hold_alloc_type_none);
271
ds->dnode_hold_free_hits.value.ui64 =
272
wmsum_value(&dnode_sums.dnode_hold_free_hits);
273
ds->dnode_hold_free_misses.value.ui64 =
274
wmsum_value(&dnode_sums.dnode_hold_free_misses);
275
ds->dnode_hold_free_lock_misses.value.ui64 =
276
wmsum_value(&dnode_sums.dnode_hold_free_lock_misses);
277
ds->dnode_hold_free_lock_retry.value.ui64 =
278
wmsum_value(&dnode_sums.dnode_hold_free_lock_retry);
279
ds->dnode_hold_free_refcount.value.ui64 =
280
wmsum_value(&dnode_sums.dnode_hold_free_refcount);
281
ds->dnode_hold_free_overflow.value.ui64 =
282
wmsum_value(&dnode_sums.dnode_hold_free_overflow);
283
ds->dnode_free_interior_lock_retry.value.ui64 =
284
wmsum_value(&dnode_sums.dnode_free_interior_lock_retry);
285
ds->dnode_allocate.value.ui64 =
286
wmsum_value(&dnode_sums.dnode_allocate);
287
ds->dnode_reallocate.value.ui64 =
288
wmsum_value(&dnode_sums.dnode_reallocate);
289
ds->dnode_buf_evict.value.ui64 =
290
wmsum_value(&dnode_sums.dnode_buf_evict);
291
ds->dnode_alloc_next_chunk.value.ui64 =
292
wmsum_value(&dnode_sums.dnode_alloc_next_chunk);
293
ds->dnode_alloc_race.value.ui64 =
294
wmsum_value(&dnode_sums.dnode_alloc_race);
295
ds->dnode_alloc_next_block.value.ui64 =
296
wmsum_value(&dnode_sums.dnode_alloc_next_block);
297
ds->dnode_move_invalid.value.ui64 =
298
wmsum_value(&dnode_sums.dnode_move_invalid);
299
ds->dnode_move_recheck1.value.ui64 =
300
wmsum_value(&dnode_sums.dnode_move_recheck1);
301
ds->dnode_move_recheck2.value.ui64 =
302
wmsum_value(&dnode_sums.dnode_move_recheck2);
303
ds->dnode_move_special.value.ui64 =
304
wmsum_value(&dnode_sums.dnode_move_special);
305
ds->dnode_move_handle.value.ui64 =
306
wmsum_value(&dnode_sums.dnode_move_handle);
307
ds->dnode_move_rwlock.value.ui64 =
308
wmsum_value(&dnode_sums.dnode_move_rwlock);
309
ds->dnode_move_active.value.ui64 =
310
wmsum_value(&dnode_sums.dnode_move_active);
311
return (0);
312
}
313
314
void
315
dnode_init(void)
316
{
317
ASSERT0P(dnode_cache);
318
dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
319
0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
320
kmem_cache_set_move(dnode_cache, dnode_move);
321
322
wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0);
323
wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0);
324
wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0);
325
wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0);
326
wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0);
327
wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0);
328
wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0);
329
wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0);
330
wmsum_init(&dnode_sums.dnode_hold_free_hits, 0);
331
wmsum_init(&dnode_sums.dnode_hold_free_misses, 0);
332
wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0);
333
wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0);
334
wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0);
335
wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0);
336
wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0);
337
wmsum_init(&dnode_sums.dnode_allocate, 0);
338
wmsum_init(&dnode_sums.dnode_reallocate, 0);
339
wmsum_init(&dnode_sums.dnode_buf_evict, 0);
340
wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0);
341
wmsum_init(&dnode_sums.dnode_alloc_race, 0);
342
wmsum_init(&dnode_sums.dnode_alloc_next_block, 0);
343
wmsum_init(&dnode_sums.dnode_move_invalid, 0);
344
wmsum_init(&dnode_sums.dnode_move_recheck1, 0);
345
wmsum_init(&dnode_sums.dnode_move_recheck2, 0);
346
wmsum_init(&dnode_sums.dnode_move_special, 0);
347
wmsum_init(&dnode_sums.dnode_move_handle, 0);
348
wmsum_init(&dnode_sums.dnode_move_rwlock, 0);
349
wmsum_init(&dnode_sums.dnode_move_active, 0);
350
351
dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
352
KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
353
KSTAT_FLAG_VIRTUAL);
354
if (dnode_ksp != NULL) {
355
dnode_ksp->ks_data = &dnode_stats;
356
dnode_ksp->ks_update = dnode_kstats_update;
357
kstat_install(dnode_ksp);
358
}
359
}
360
361
void
362
dnode_fini(void)
363
{
364
if (dnode_ksp != NULL) {
365
kstat_delete(dnode_ksp);
366
dnode_ksp = NULL;
367
}
368
369
wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold);
370
wmsum_fini(&dnode_sums.dnode_hold_dbuf_read);
371
wmsum_fini(&dnode_sums.dnode_hold_alloc_hits);
372
wmsum_fini(&dnode_sums.dnode_hold_alloc_misses);
373
wmsum_fini(&dnode_sums.dnode_hold_alloc_interior);
374
wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry);
375
wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses);
376
wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none);
377
wmsum_fini(&dnode_sums.dnode_hold_free_hits);
378
wmsum_fini(&dnode_sums.dnode_hold_free_misses);
379
wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses);
380
wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry);
381
wmsum_fini(&dnode_sums.dnode_hold_free_refcount);
382
wmsum_fini(&dnode_sums.dnode_hold_free_overflow);
383
wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry);
384
wmsum_fini(&dnode_sums.dnode_allocate);
385
wmsum_fini(&dnode_sums.dnode_reallocate);
386
wmsum_fini(&dnode_sums.dnode_buf_evict);
387
wmsum_fini(&dnode_sums.dnode_alloc_next_chunk);
388
wmsum_fini(&dnode_sums.dnode_alloc_race);
389
wmsum_fini(&dnode_sums.dnode_alloc_next_block);
390
wmsum_fini(&dnode_sums.dnode_move_invalid);
391
wmsum_fini(&dnode_sums.dnode_move_recheck1);
392
wmsum_fini(&dnode_sums.dnode_move_recheck2);
393
wmsum_fini(&dnode_sums.dnode_move_special);
394
wmsum_fini(&dnode_sums.dnode_move_handle);
395
wmsum_fini(&dnode_sums.dnode_move_rwlock);
396
wmsum_fini(&dnode_sums.dnode_move_active);
397
398
kmem_cache_destroy(dnode_cache);
399
dnode_cache = NULL;
400
}
401
402
403
#ifdef ZFS_DEBUG
404
void
405
dnode_verify(dnode_t *dn)
406
{
407
int drop_struct_lock = FALSE;
408
409
ASSERT(dn->dn_phys);
410
ASSERT(dn->dn_objset);
411
ASSERT(dn->dn_handle->dnh_dnode == dn);
412
413
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
414
415
if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
416
return;
417
418
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
419
rw_enter(&dn->dn_struct_rwlock, RW_READER);
420
drop_struct_lock = TRUE;
421
}
422
if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
423
int i;
424
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
425
ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
426
if (dn->dn_datablkshift) {
427
ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
428
ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
429
ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
430
}
431
ASSERT3U(dn->dn_nlevels, <=, 30);
432
ASSERT(DMU_OT_IS_VALID(dn->dn_type));
433
ASSERT3U(dn->dn_nblkptr, >=, 1);
434
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
435
ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
436
ASSERT3U(dn->dn_datablksz, ==,
437
dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
438
ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
439
ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
440
dn->dn_bonuslen, <=, max_bonuslen);
441
for (i = 0; i < TXG_SIZE; i++) {
442
ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
443
}
444
}
445
if (dn->dn_phys->dn_type != DMU_OT_NONE)
446
ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
447
ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
448
if (dn->dn_dbuf != NULL) {
449
ASSERT3P(dn->dn_phys, ==,
450
(dnode_phys_t *)dn->dn_dbuf->db.db_data +
451
(dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
452
}
453
if (drop_struct_lock)
454
rw_exit(&dn->dn_struct_rwlock);
455
}
456
#endif
457
458
void
459
dnode_byteswap(dnode_phys_t *dnp)
460
{
461
uint64_t *buf64 = (void*)&dnp->dn_blkptr;
462
int i;
463
464
if (dnp->dn_type == DMU_OT_NONE) {
465
memset(dnp, 0, sizeof (dnode_phys_t));
466
return;
467
}
468
469
dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
470
dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
471
dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
472
dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
473
dnp->dn_used = BSWAP_64(dnp->dn_used);
474
475
/*
476
* dn_nblkptr is only one byte, so it's OK to read it in either
477
* byte order. We can't read dn_bouslen.
478
*/
479
ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
480
ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
481
for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
482
buf64[i] = BSWAP_64(buf64[i]);
483
484
/*
485
* OK to check dn_bonuslen for zero, because it won't matter if
486
* we have the wrong byte order. This is necessary because the
487
* dnode dnode is smaller than a regular dnode.
488
*/
489
if (dnp->dn_bonuslen != 0) {
490
dmu_object_byteswap_t byteswap;
491
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
492
byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
493
dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
494
DN_MAX_BONUS_LEN(dnp));
495
}
496
497
/* Swap SPILL block if we have one */
498
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
499
byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
500
}
501
502
void
503
dnode_buf_byteswap(void *vbuf, size_t size)
504
{
505
int i = 0;
506
507
ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
508
ASSERT0((size & (sizeof (dnode_phys_t)-1)));
509
510
while (i < size) {
511
dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
512
dnode_byteswap(dnp);
513
514
i += DNODE_MIN_SIZE;
515
if (dnp->dn_type != DMU_OT_NONE)
516
i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
517
}
518
}
519
520
void
521
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
522
{
523
ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
524
525
dnode_setdirty(dn, tx);
526
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
527
ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
528
(dn->dn_nblkptr-1) * sizeof (blkptr_t));
529
530
if (newsize < dn->dn_bonuslen) {
531
/* clear any data after the end of the new size */
532
size_t diff = dn->dn_bonuslen - newsize;
533
char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
534
memset(data_end, 0, diff);
535
}
536
537
dn->dn_bonuslen = newsize;
538
if (newsize == 0)
539
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
540
else
541
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
542
rw_exit(&dn->dn_struct_rwlock);
543
}
544
545
void
546
dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
547
{
548
ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
549
dnode_setdirty(dn, tx);
550
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
551
dn->dn_bonustype = newtype;
552
dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
553
rw_exit(&dn->dn_struct_rwlock);
554
}
555
556
void
557
dnode_set_storage_type(dnode_t *dn, dmu_object_type_t newtype)
558
{
559
/*
560
* This is not in the dnode_phys, but it should be, and perhaps one day
561
* will. For now we require it be set after taking a hold.
562
*/
563
ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
564
dn->dn_storage_type = newtype;
565
}
566
567
void
568
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
569
{
570
ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
571
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
572
dnode_setdirty(dn, tx);
573
dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
574
dn->dn_have_spill = B_FALSE;
575
}
576
577
static void
578
dnode_setdblksz(dnode_t *dn, int size)
579
{
580
ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
581
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
582
ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
583
ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
584
1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
585
dn->dn_datablksz = size;
586
dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
587
dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
588
}
589
590
static dnode_t *
591
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
592
uint64_t object, dnode_handle_t *dnh)
593
{
594
dnode_t *dn;
595
596
dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
597
dn->dn_moved = 0;
598
599
/*
600
* Defer setting dn_objset until the dnode is ready to be a candidate
601
* for the dnode_move() callback.
602
*/
603
dn->dn_object = object;
604
dn->dn_dbuf = db;
605
dn->dn_handle = dnh;
606
dn->dn_phys = dnp;
607
608
if (dnp->dn_datablkszsec) {
609
dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
610
} else {
611
dn->dn_datablksz = 0;
612
dn->dn_datablkszsec = 0;
613
dn->dn_datablkshift = 0;
614
}
615
dn->dn_indblkshift = dnp->dn_indblkshift;
616
dn->dn_nlevels = dnp->dn_nlevels;
617
dn->dn_type = dnp->dn_type;
618
dn->dn_nblkptr = dnp->dn_nblkptr;
619
dn->dn_checksum = dnp->dn_checksum;
620
dn->dn_compress = dnp->dn_compress;
621
dn->dn_bonustype = dnp->dn_bonustype;
622
dn->dn_bonuslen = dnp->dn_bonuslen;
623
dn->dn_num_slots = dnp->dn_extra_slots + 1;
624
dn->dn_maxblkid = dnp->dn_maxblkid;
625
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
626
dn->dn_id_flags = 0;
627
628
dn->dn_storage_type = DMU_OT_NONE;
629
630
dmu_zfetch_init(&dn->dn_zfetch, dn);
631
632
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
633
ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
634
ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
635
636
mutex_enter(&os->os_lock);
637
638
/*
639
* Exclude special dnodes from os_dnodes so an empty os_dnodes
640
* signifies that the special dnodes have no references from
641
* their children (the entries in os_dnodes). This allows
642
* dnode_destroy() to easily determine if the last child has
643
* been removed and then complete eviction of the objset.
644
*/
645
if (!DMU_OBJECT_IS_SPECIAL(object))
646
list_insert_head(&os->os_dnodes, dn);
647
membar_producer();
648
649
/*
650
* Everything else must be valid before assigning dn_objset
651
* makes the dnode eligible for dnode_move().
652
*/
653
dn->dn_objset = os;
654
655
dnh->dnh_dnode = dn;
656
mutex_exit(&os->os_lock);
657
658
arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
659
660
return (dn);
661
}
662
663
/*
664
* Caller must be holding the dnode handle, which is released upon return.
665
*/
666
static void
667
dnode_destroy(dnode_t *dn)
668
{
669
objset_t *os = dn->dn_objset;
670
boolean_t complete_os_eviction = B_FALSE;
671
672
ASSERT0((dn->dn_id_flags & DN_ID_NEW_EXIST));
673
674
mutex_enter(&os->os_lock);
675
POINTER_INVALIDATE(&dn->dn_objset);
676
if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
677
list_remove(&os->os_dnodes, dn);
678
complete_os_eviction =
679
list_is_empty(&os->os_dnodes) &&
680
list_link_active(&os->os_evicting_node);
681
}
682
mutex_exit(&os->os_lock);
683
684
/* the dnode can no longer move, so we can release the handle */
685
if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
686
zrl_remove(&dn->dn_handle->dnh_zrlock);
687
688
dn->dn_allocated_txg = 0;
689
dn->dn_free_txg = 0;
690
dn->dn_assigned_txg = 0;
691
dn->dn_dirtycnt = 0;
692
693
if (dn->dn_bonus != NULL) {
694
mutex_enter(&dn->dn_bonus->db_mtx);
695
dbuf_destroy(dn->dn_bonus);
696
dn->dn_bonus = NULL;
697
}
698
dn->dn_zio = NULL;
699
700
dn->dn_have_spill = B_FALSE;
701
dn->dn_oldused = 0;
702
dn->dn_oldflags = 0;
703
dn->dn_olduid = 0;
704
dn->dn_oldgid = 0;
705
dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
706
dn->dn_newuid = 0;
707
dn->dn_newgid = 0;
708
dn->dn_newprojid = ZFS_DEFAULT_PROJID;
709
dn->dn_id_flags = 0;
710
711
dn->dn_storage_type = DMU_OT_NONE;
712
713
dmu_zfetch_fini(&dn->dn_zfetch);
714
kmem_cache_free(dnode_cache, dn);
715
arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
716
717
if (complete_os_eviction)
718
dmu_objset_evict_done(os);
719
}
720
721
void
722
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
723
dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
724
{
725
int i;
726
727
ASSERT3U(dn_slots, >, 0);
728
ASSERT3U(dn_slots << DNODE_SHIFT, <=,
729
spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
730
ASSERT3U(blocksize, <=,
731
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
732
if (blocksize == 0)
733
blocksize = 1 << zfs_default_bs;
734
else
735
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
736
737
if (ibs == 0)
738
ibs = zfs_default_ibs;
739
740
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
741
742
dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
743
dn->dn_objset, (u_longlong_t)dn->dn_object,
744
(u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots);
745
DNODE_STAT_BUMP(dnode_allocate);
746
747
ASSERT(dn->dn_type == DMU_OT_NONE);
748
ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)));
749
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
750
ASSERT(ot != DMU_OT_NONE);
751
ASSERT(DMU_OT_IS_VALID(ot));
752
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
753
(bonustype == DMU_OT_SA && bonuslen == 0) ||
754
(bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) ||
755
(bonustype != DMU_OT_NONE && bonuslen != 0));
756
ASSERT(DMU_OT_IS_VALID(bonustype));
757
ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
758
ASSERT(dn->dn_type == DMU_OT_NONE);
759
ASSERT0(dn->dn_maxblkid);
760
ASSERT0(dn->dn_allocated_txg);
761
ASSERT0(dn->dn_assigned_txg);
762
ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
763
ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
764
ASSERT(avl_is_empty(&dn->dn_dbufs));
765
766
for (i = 0; i < TXG_SIZE; i++) {
767
ASSERT0(dn->dn_next_nblkptr[i]);
768
ASSERT0(dn->dn_next_nlevels[i]);
769
ASSERT0(dn->dn_next_indblkshift[i]);
770
ASSERT0(dn->dn_next_bonuslen[i]);
771
ASSERT0(dn->dn_next_bonustype[i]);
772
ASSERT0(dn->dn_rm_spillblk[i]);
773
ASSERT0(dn->dn_next_blksz[i]);
774
ASSERT0(dn->dn_next_maxblkid[i]);
775
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
776
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
777
ASSERT0P(dn->dn_free_ranges[i]);
778
}
779
780
dn->dn_type = ot;
781
dnode_setdblksz(dn, blocksize);
782
dn->dn_indblkshift = ibs;
783
dn->dn_nlevels = 1;
784
dn->dn_num_slots = dn_slots;
785
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
786
dn->dn_nblkptr = 1;
787
else {
788
dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
789
1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
790
SPA_BLKPTRSHIFT));
791
}
792
793
dn->dn_bonustype = bonustype;
794
dn->dn_bonuslen = bonuslen;
795
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
796
dn->dn_compress = ZIO_COMPRESS_INHERIT;
797
798
dn->dn_free_txg = 0;
799
dn->dn_dirtycnt = 0;
800
801
dn->dn_allocated_txg = tx->tx_txg;
802
dn->dn_id_flags = 0;
803
804
dnode_setdirty(dn, tx);
805
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
806
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
807
dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
808
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
809
}
810
811
void
812
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
813
dmu_object_type_t bonustype, int bonuslen, int dn_slots,
814
boolean_t keep_spill, dmu_tx_t *tx)
815
{
816
int nblkptr;
817
818
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
819
ASSERT3U(blocksize, <=,
820
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
821
ASSERT0(blocksize % SPA_MINBLOCKSIZE);
822
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
823
ASSERT(tx->tx_txg != 0);
824
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
825
(bonustype != DMU_OT_NONE && bonuslen != 0) ||
826
(bonustype == DMU_OT_SA && bonuslen == 0));
827
ASSERT(DMU_OT_IS_VALID(bonustype));
828
ASSERT3U(bonuslen, <=,
829
DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
830
ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
831
832
dnode_free_interior_slots(dn);
833
DNODE_STAT_BUMP(dnode_reallocate);
834
835
/* clean up any unreferenced dbufs */
836
dnode_evict_dbufs(dn);
837
838
dn->dn_id_flags = 0;
839
840
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
841
dnode_setdirty(dn, tx);
842
if (dn->dn_datablksz != blocksize) {
843
/* change blocksize */
844
ASSERT0(dn->dn_maxblkid);
845
ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
846
dnode_block_freed(dn, 0));
847
848
dnode_setdblksz(dn, blocksize);
849
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize;
850
}
851
if (dn->dn_bonuslen != bonuslen)
852
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen;
853
854
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
855
nblkptr = 1;
856
else
857
nblkptr = MIN(DN_MAX_NBLKPTR,
858
1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
859
SPA_BLKPTRSHIFT));
860
if (dn->dn_bonustype != bonustype)
861
dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
862
if (dn->dn_nblkptr != nblkptr)
863
dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
864
if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
865
dbuf_rm_spill(dn, tx);
866
dnode_rm_spill(dn, tx);
867
}
868
869
rw_exit(&dn->dn_struct_rwlock);
870
871
/* change type */
872
dn->dn_type = ot;
873
874
/* change bonus size and type */
875
mutex_enter(&dn->dn_mtx);
876
dn->dn_bonustype = bonustype;
877
dn->dn_bonuslen = bonuslen;
878
dn->dn_num_slots = dn_slots;
879
dn->dn_nblkptr = nblkptr;
880
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
881
dn->dn_compress = ZIO_COMPRESS_INHERIT;
882
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
883
884
/* fix up the bonus db_size */
885
if (dn->dn_bonus) {
886
dn->dn_bonus->db.db_size =
887
DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
888
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
889
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
890
}
891
892
dn->dn_allocated_txg = tx->tx_txg;
893
mutex_exit(&dn->dn_mtx);
894
}
895
896
#ifdef _KERNEL
897
static void
898
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
899
{
900
ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
901
ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
902
ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
903
904
/* Copy fields. */
905
ndn->dn_objset = odn->dn_objset;
906
ndn->dn_object = odn->dn_object;
907
ndn->dn_dbuf = odn->dn_dbuf;
908
ndn->dn_handle = odn->dn_handle;
909
ndn->dn_phys = odn->dn_phys;
910
ndn->dn_type = odn->dn_type;
911
ndn->dn_bonuslen = odn->dn_bonuslen;
912
ndn->dn_bonustype = odn->dn_bonustype;
913
ndn->dn_nblkptr = odn->dn_nblkptr;
914
ndn->dn_checksum = odn->dn_checksum;
915
ndn->dn_compress = odn->dn_compress;
916
ndn->dn_nlevels = odn->dn_nlevels;
917
ndn->dn_indblkshift = odn->dn_indblkshift;
918
ndn->dn_datablkshift = odn->dn_datablkshift;
919
ndn->dn_datablkszsec = odn->dn_datablkszsec;
920
ndn->dn_datablksz = odn->dn_datablksz;
921
ndn->dn_maxblkid = odn->dn_maxblkid;
922
ndn->dn_num_slots = odn->dn_num_slots;
923
memcpy(ndn->dn_next_type, odn->dn_next_type,
924
sizeof (odn->dn_next_type));
925
memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr,
926
sizeof (odn->dn_next_nblkptr));
927
memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels,
928
sizeof (odn->dn_next_nlevels));
929
memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift,
930
sizeof (odn->dn_next_indblkshift));
931
memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype,
932
sizeof (odn->dn_next_bonustype));
933
memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk,
934
sizeof (odn->dn_rm_spillblk));
935
memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen,
936
sizeof (odn->dn_next_bonuslen));
937
memcpy(ndn->dn_next_blksz, odn->dn_next_blksz,
938
sizeof (odn->dn_next_blksz));
939
memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid,
940
sizeof (odn->dn_next_maxblkid));
941
for (int i = 0; i < TXG_SIZE; i++) {
942
list_move_tail(&ndn->dn_dirty_records[i],
943
&odn->dn_dirty_records[i]);
944
}
945
memcpy(ndn->dn_free_ranges, odn->dn_free_ranges,
946
sizeof (odn->dn_free_ranges));
947
ndn->dn_allocated_txg = odn->dn_allocated_txg;
948
ndn->dn_free_txg = odn->dn_free_txg;
949
ndn->dn_assigned_txg = odn->dn_assigned_txg;
950
ndn->dn_dirtycnt = odn->dn_dirtycnt;
951
ASSERT0(zfs_refcount_count(&odn->dn_tx_holds));
952
zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
953
ASSERT(avl_is_empty(&ndn->dn_dbufs));
954
avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
955
ndn->dn_dbufs_count = odn->dn_dbufs_count;
956
ndn->dn_bonus = odn->dn_bonus;
957
ndn->dn_have_spill = odn->dn_have_spill;
958
ndn->dn_zio = odn->dn_zio;
959
ndn->dn_oldused = odn->dn_oldused;
960
ndn->dn_oldflags = odn->dn_oldflags;
961
ndn->dn_olduid = odn->dn_olduid;
962
ndn->dn_oldgid = odn->dn_oldgid;
963
ndn->dn_oldprojid = odn->dn_oldprojid;
964
ndn->dn_newuid = odn->dn_newuid;
965
ndn->dn_newgid = odn->dn_newgid;
966
ndn->dn_newprojid = odn->dn_newprojid;
967
ndn->dn_id_flags = odn->dn_id_flags;
968
ndn->dn_storage_type = odn->dn_storage_type;
969
dmu_zfetch_init(&ndn->dn_zfetch, ndn);
970
971
/*
972
* Update back pointers. Updating the handle fixes the back pointer of
973
* every descendant dbuf as well as the bonus dbuf.
974
*/
975
ASSERT(ndn->dn_handle->dnh_dnode == odn);
976
ndn->dn_handle->dnh_dnode = ndn;
977
978
/*
979
* Invalidate the original dnode by clearing all of its back pointers.
980
*/
981
odn->dn_dbuf = NULL;
982
odn->dn_handle = NULL;
983
avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
984
offsetof(dmu_buf_impl_t, db_link));
985
odn->dn_dbufs_count = 0;
986
odn->dn_bonus = NULL;
987
dmu_zfetch_fini(&odn->dn_zfetch);
988
989
/*
990
* Set the low bit of the objset pointer to ensure that dnode_move()
991
* recognizes the dnode as invalid in any subsequent callback.
992
*/
993
POINTER_INVALIDATE(&odn->dn_objset);
994
995
/*
996
* Satisfy the destructor.
997
*/
998
for (int i = 0; i < TXG_SIZE; i++) {
999
list_create(&odn->dn_dirty_records[i],
1000
sizeof (dbuf_dirty_record_t),
1001
offsetof(dbuf_dirty_record_t, dr_dirty_node));
1002
odn->dn_free_ranges[i] = NULL;
1003
odn->dn_next_nlevels[i] = 0;
1004
odn->dn_next_indblkshift[i] = 0;
1005
odn->dn_next_bonustype[i] = 0;
1006
odn->dn_rm_spillblk[i] = 0;
1007
odn->dn_next_bonuslen[i] = 0;
1008
odn->dn_next_blksz[i] = 0;
1009
}
1010
odn->dn_allocated_txg = 0;
1011
odn->dn_free_txg = 0;
1012
odn->dn_assigned_txg = 0;
1013
odn->dn_dirtycnt = 0;
1014
odn->dn_have_spill = B_FALSE;
1015
odn->dn_zio = NULL;
1016
odn->dn_oldused = 0;
1017
odn->dn_oldflags = 0;
1018
odn->dn_olduid = 0;
1019
odn->dn_oldgid = 0;
1020
odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
1021
odn->dn_newuid = 0;
1022
odn->dn_newgid = 0;
1023
odn->dn_newprojid = ZFS_DEFAULT_PROJID;
1024
odn->dn_id_flags = 0;
1025
odn->dn_storage_type = DMU_OT_NONE;
1026
1027
/*
1028
* Mark the dnode.
1029
*/
1030
ndn->dn_moved = 1;
1031
odn->dn_moved = (uint8_t)-1;
1032
}
1033
1034
static kmem_cbrc_t
1035
dnode_move(void *buf, void *newbuf, size_t size, void *arg)
1036
{
1037
dnode_t *odn = buf, *ndn = newbuf;
1038
objset_t *os;
1039
int64_t refcount;
1040
uint32_t dbufs;
1041
1042
#ifndef USE_DNODE_HANDLE
1043
/*
1044
* We can't move dnodes if dbufs reference them directly without
1045
* using handles and respecitve locking. Unless USE_DNODE_HANDLE
1046
* is defined the code below is only to make sure it still builds,
1047
* but it should never be used, since it is unsafe.
1048
*/
1049
#ifdef ZFS_DEBUG
1050
PANIC("dnode_move() called without USE_DNODE_HANDLE");
1051
#endif
1052
return (KMEM_CBRC_NO);
1053
#endif
1054
1055
/*
1056
* The dnode is on the objset's list of known dnodes if the objset
1057
* pointer is valid. We set the low bit of the objset pointer when
1058
* freeing the dnode to invalidate it, and the memory patterns written
1059
* by kmem (baddcafe and deadbeef) set at least one of the two low bits.
1060
* A newly created dnode sets the objset pointer last of all to indicate
1061
* that the dnode is known and in a valid state to be moved by this
1062
* function.
1063
*/
1064
os = odn->dn_objset;
1065
if (!POINTER_IS_VALID(os)) {
1066
DNODE_STAT_BUMP(dnode_move_invalid);
1067
return (KMEM_CBRC_DONT_KNOW);
1068
}
1069
1070
/*
1071
* Ensure that the objset does not go away during the move.
1072
*/
1073
rw_enter(&os_lock, RW_WRITER);
1074
if (os != odn->dn_objset) {
1075
rw_exit(&os_lock);
1076
DNODE_STAT_BUMP(dnode_move_recheck1);
1077
return (KMEM_CBRC_DONT_KNOW);
1078
}
1079
1080
/*
1081
* If the dnode is still valid, then so is the objset. We know that no
1082
* valid objset can be freed while we hold os_lock, so we can safely
1083
* ensure that the objset remains in use.
1084
*/
1085
mutex_enter(&os->os_lock);
1086
1087
/*
1088
* Recheck the objset pointer in case the dnode was removed just before
1089
* acquiring the lock.
1090
*/
1091
if (os != odn->dn_objset) {
1092
mutex_exit(&os->os_lock);
1093
rw_exit(&os_lock);
1094
DNODE_STAT_BUMP(dnode_move_recheck2);
1095
return (KMEM_CBRC_DONT_KNOW);
1096
}
1097
1098
/*
1099
* At this point we know that as long as we hold os->os_lock, the dnode
1100
* cannot be freed and fields within the dnode can be safely accessed.
1101
* The objset listing this dnode cannot go away as long as this dnode is
1102
* on its list.
1103
*/
1104
rw_exit(&os_lock);
1105
if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
1106
mutex_exit(&os->os_lock);
1107
DNODE_STAT_BUMP(dnode_move_special);
1108
return (KMEM_CBRC_NO);
1109
}
1110
ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
1111
1112
/*
1113
* Lock the dnode handle to prevent the dnode from obtaining any new
1114
* holds. This also prevents the descendant dbufs and the bonus dbuf
1115
* from accessing the dnode, so that we can discount their holds. The
1116
* handle is safe to access because we know that while the dnode cannot
1117
* go away, neither can its handle. Once we hold dnh_zrlock, we can
1118
* safely move any dnode referenced only by dbufs.
1119
*/
1120
if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
1121
mutex_exit(&os->os_lock);
1122
DNODE_STAT_BUMP(dnode_move_handle);
1123
return (KMEM_CBRC_LATER);
1124
}
1125
1126
/*
1127
* Ensure a consistent view of the dnode's holds and the dnode's dbufs.
1128
* We need to guarantee that there is a hold for every dbuf in order to
1129
* determine whether the dnode is actively referenced. Falsely matching
1130
* a dbuf to an active hold would lead to an unsafe move. It's possible
1131
* that a thread already having an active dnode hold is about to add a
1132
* dbuf, and we can't compare hold and dbuf counts while the add is in
1133
* progress.
1134
*/
1135
if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
1136
zrl_exit(&odn->dn_handle->dnh_zrlock);
1137
mutex_exit(&os->os_lock);
1138
DNODE_STAT_BUMP(dnode_move_rwlock);
1139
return (KMEM_CBRC_LATER);
1140
}
1141
1142
/*
1143
* A dbuf may be removed (evicted) without an active dnode hold. In that
1144
* case, the dbuf count is decremented under the handle lock before the
1145
* dbuf's hold is released. This order ensures that if we count the hold
1146
* after the dbuf is removed but before its hold is released, we will
1147
* treat the unmatched hold as active and exit safely. If we count the
1148
* hold before the dbuf is removed, the hold is discounted, and the
1149
* removal is blocked until the move completes.
1150
*/
1151
refcount = zfs_refcount_count(&odn->dn_holds);
1152
ASSERT(refcount >= 0);
1153
dbufs = DN_DBUFS_COUNT(odn);
1154
1155
/* We can't have more dbufs than dnode holds. */
1156
ASSERT3U(dbufs, <=, refcount);
1157
DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
1158
uint32_t, dbufs);
1159
1160
if (refcount > dbufs) {
1161
rw_exit(&odn->dn_struct_rwlock);
1162
zrl_exit(&odn->dn_handle->dnh_zrlock);
1163
mutex_exit(&os->os_lock);
1164
DNODE_STAT_BUMP(dnode_move_active);
1165
return (KMEM_CBRC_LATER);
1166
}
1167
1168
rw_exit(&odn->dn_struct_rwlock);
1169
1170
/*
1171
* At this point we know that anyone with a hold on the dnode is not
1172
* actively referencing it. The dnode is known and in a valid state to
1173
* move. We're holding the locks needed to execute the critical section.
1174
*/
1175
dnode_move_impl(odn, ndn);
1176
1177
list_link_replace(&odn->dn_link, &ndn->dn_link);
1178
/* If the dnode was safe to move, the refcount cannot have changed. */
1179
ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
1180
ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
1181
zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
1182
mutex_exit(&os->os_lock);
1183
1184
return (KMEM_CBRC_YES);
1185
}
1186
#endif /* _KERNEL */
1187
1188
static void
1189
dnode_slots_hold(dnode_children_t *children, int idx, int slots)
1190
{
1191
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1192
1193
for (int i = idx; i < idx + slots; i++) {
1194
dnode_handle_t *dnh = &children->dnc_children[i];
1195
zrl_add(&dnh->dnh_zrlock);
1196
}
1197
}
1198
1199
static void
1200
dnode_slots_rele(dnode_children_t *children, int idx, int slots)
1201
{
1202
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1203
1204
for (int i = idx; i < idx + slots; i++) {
1205
dnode_handle_t *dnh = &children->dnc_children[i];
1206
1207
if (zrl_is_locked(&dnh->dnh_zrlock))
1208
zrl_exit(&dnh->dnh_zrlock);
1209
else
1210
zrl_remove(&dnh->dnh_zrlock);
1211
}
1212
}
1213
1214
static int
1215
dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
1216
{
1217
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1218
1219
for (int i = idx; i < idx + slots; i++) {
1220
dnode_handle_t *dnh = &children->dnc_children[i];
1221
1222
if (!zrl_tryenter(&dnh->dnh_zrlock)) {
1223
for (int j = idx; j < i; j++) {
1224
dnh = &children->dnc_children[j];
1225
zrl_exit(&dnh->dnh_zrlock);
1226
}
1227
1228
return (0);
1229
}
1230
}
1231
1232
return (1);
1233
}
1234
1235
static void
1236
dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
1237
{
1238
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1239
1240
for (int i = idx; i < idx + slots; i++) {
1241
dnode_handle_t *dnh = &children->dnc_children[i];
1242
dnh->dnh_dnode = ptr;
1243
}
1244
}
1245
1246
static boolean_t
1247
dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
1248
{
1249
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1250
1251
/*
1252
* If all dnode slots are either already free or
1253
* evictable return B_TRUE.
1254
*/
1255
for (int i = idx; i < idx + slots; i++) {
1256
dnode_handle_t *dnh = &children->dnc_children[i];
1257
dnode_t *dn = dnh->dnh_dnode;
1258
1259
if (dn == DN_SLOT_FREE) {
1260
continue;
1261
} else if (DN_SLOT_IS_PTR(dn)) {
1262
mutex_enter(&dn->dn_mtx);
1263
boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
1264
dn->dn_dirtycnt == 0 &&
1265
zfs_refcount_is_zero(&dn->dn_holds));
1266
mutex_exit(&dn->dn_mtx);
1267
1268
if (!can_free)
1269
return (B_FALSE);
1270
else
1271
continue;
1272
} else {
1273
return (B_FALSE);
1274
}
1275
}
1276
1277
return (B_TRUE);
1278
}
1279
1280
static uint_t
1281
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
1282
{
1283
uint_t reclaimed = 0;
1284
1285
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1286
1287
for (int i = idx; i < idx + slots; i++) {
1288
dnode_handle_t *dnh = &children->dnc_children[i];
1289
1290
ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
1291
1292
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1293
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
1294
dnode_destroy(dnh->dnh_dnode);
1295
dnh->dnh_dnode = DN_SLOT_FREE;
1296
reclaimed++;
1297
}
1298
}
1299
1300
return (reclaimed);
1301
}
1302
1303
void
1304
dnode_free_interior_slots(dnode_t *dn)
1305
{
1306
dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
1307
int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
1308
int idx = (dn->dn_object & (epb - 1)) + 1;
1309
int slots = dn->dn_num_slots - 1;
1310
1311
if (slots == 0)
1312
return;
1313
1314
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1315
1316
while (!dnode_slots_tryenter(children, idx, slots)) {
1317
DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
1318
kpreempt(KPREEMPT_SYNC);
1319
}
1320
1321
dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
1322
dnode_slots_rele(children, idx, slots);
1323
}
1324
1325
void
1326
dnode_special_close(dnode_handle_t *dnh)
1327
{
1328
dnode_t *dn = dnh->dnh_dnode;
1329
1330
/*
1331
* Ensure dnode_rele_and_unlock() has released dn_mtx, after final
1332
* zfs_refcount_remove()
1333
*/
1334
mutex_enter(&dn->dn_mtx);
1335
if (zfs_refcount_count(&dn->dn_holds) > 0)
1336
cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
1337
mutex_exit(&dn->dn_mtx);
1338
ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
1339
1340
ASSERT(dn->dn_dbuf == NULL ||
1341
dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
1342
zrl_add(&dnh->dnh_zrlock);
1343
dnode_destroy(dn); /* implicit zrl_remove() */
1344
zrl_destroy(&dnh->dnh_zrlock);
1345
dnh->dnh_dnode = NULL;
1346
}
1347
1348
void
1349
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
1350
dnode_handle_t *dnh)
1351
{
1352
dnode_t *dn;
1353
1354
zrl_init(&dnh->dnh_zrlock);
1355
VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
1356
1357
dn = dnode_create(os, dnp, NULL, object, dnh);
1358
DNODE_VERIFY(dn);
1359
1360
zrl_exit(&dnh->dnh_zrlock);
1361
}
1362
1363
static void
1364
dnode_buf_evict_async(void *dbu)
1365
{
1366
dnode_children_t *dnc = dbu;
1367
1368
DNODE_STAT_BUMP(dnode_buf_evict);
1369
1370
for (int i = 0; i < dnc->dnc_count; i++) {
1371
dnode_handle_t *dnh = &dnc->dnc_children[i];
1372
dnode_t *dn;
1373
1374
/*
1375
* The dnode handle lock guards against the dnode moving to
1376
* another valid address, so there is no need here to guard
1377
* against changes to or from NULL.
1378
*/
1379
if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1380
zrl_destroy(&dnh->dnh_zrlock);
1381
dnh->dnh_dnode = DN_SLOT_UNINIT;
1382
continue;
1383
}
1384
1385
zrl_add(&dnh->dnh_zrlock);
1386
dn = dnh->dnh_dnode;
1387
/*
1388
* If there are holds on this dnode, then there should
1389
* be holds on the dnode's containing dbuf as well; thus
1390
* it wouldn't be eligible for eviction and this function
1391
* would not have been called.
1392
*/
1393
ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
1394
ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
1395
1396
dnode_destroy(dn); /* implicit zrl_remove() for first slot */
1397
zrl_destroy(&dnh->dnh_zrlock);
1398
dnh->dnh_dnode = DN_SLOT_UNINIT;
1399
}
1400
kmem_free(dnc, sizeof (dnode_children_t) +
1401
dnc->dnc_count * sizeof (dnode_handle_t));
1402
}
1403
1404
/*
1405
* When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
1406
* to ensure the hole at the specified object offset is large enough to
1407
* hold the dnode being created. The slots parameter is also used to ensure
1408
* a dnode does not span multiple dnode blocks. In both of these cases, if
1409
* a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
1410
* are only possible when using DNODE_MUST_BE_FREE.
1411
*
1412
* If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
1413
* dnode_hold_impl() will check if the requested dnode is already consumed
1414
* as an extra dnode slot by an large dnode, in which case it returns
1415
* ENOENT.
1416
*
1417
* If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
1418
* return whether the hold would succeed or not. tag and dnp should set to
1419
* NULL in this case.
1420
*
1421
* errors:
1422
* EINVAL - Invalid object number or flags.
1423
* ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
1424
* EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
1425
* - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
1426
* - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
1427
* ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
1428
* - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
1429
* EIO - I/O error when reading the meta dnode dbuf.
1430
*
1431
* succeeds even for free dnodes.
1432
*/
1433
int
1434
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
1435
const void *tag, dnode_t **dnp)
1436
{
1437
int epb, idx, err;
1438
int drop_struct_lock = FALSE;
1439
int type;
1440
uint64_t blk;
1441
dnode_t *mdn, *dn;
1442
dmu_buf_impl_t *db;
1443
dnode_children_t *dnc;
1444
dnode_phys_t *dn_block;
1445
dnode_handle_t *dnh;
1446
1447
ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
1448
ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
1449
IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
1450
1451
/*
1452
* If you are holding the spa config lock as writer, you shouldn't
1453
* be asking the DMU to do *anything* unless it's the root pool
1454
* which may require us to read from the root filesystem while
1455
* holding some (not all) of the locks as writer.
1456
*/
1457
ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
1458
(spa_is_root(os->os_spa) &&
1459
spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
1460
1461
ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
1462
1463
if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
1464
object == DMU_PROJECTUSED_OBJECT) {
1465
if (object == DMU_USERUSED_OBJECT)
1466
dn = DMU_USERUSED_DNODE(os);
1467
else if (object == DMU_GROUPUSED_OBJECT)
1468
dn = DMU_GROUPUSED_DNODE(os);
1469
else
1470
dn = DMU_PROJECTUSED_DNODE(os);
1471
if (dn == NULL)
1472
return (SET_ERROR(ENOENT));
1473
type = dn->dn_type;
1474
if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
1475
return (SET_ERROR(ENOENT));
1476
if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
1477
return (SET_ERROR(EEXIST));
1478
DNODE_VERIFY(dn);
1479
/* Don't actually hold if dry run, just return 0 */
1480
if (!(flag & DNODE_DRY_RUN)) {
1481
(void) zfs_refcount_add(&dn->dn_holds, tag);
1482
*dnp = dn;
1483
}
1484
return (0);
1485
}
1486
1487
if (object == 0 || object >= DN_MAX_OBJECT)
1488
return (SET_ERROR(EINVAL));
1489
1490
mdn = DMU_META_DNODE(os);
1491
ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
1492
1493
DNODE_VERIFY(mdn);
1494
1495
if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
1496
rw_enter(&mdn->dn_struct_rwlock, RW_READER);
1497
drop_struct_lock = TRUE;
1498
}
1499
1500
blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
1501
db = dbuf_hold(mdn, blk, FTAG);
1502
if (drop_struct_lock)
1503
rw_exit(&mdn->dn_struct_rwlock);
1504
if (db == NULL) {
1505
DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
1506
return (SET_ERROR(EIO));
1507
}
1508
1509
/*
1510
* We do not need to decrypt to read the dnode so it doesn't matter
1511
* if we get the encrypted or decrypted version.
1512
*/
1513
err = dbuf_read(db, NULL, DB_RF_CANFAIL |
1514
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
1515
if (err) {
1516
DNODE_STAT_BUMP(dnode_hold_dbuf_read);
1517
dbuf_rele(db, FTAG);
1518
return (err);
1519
}
1520
1521
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1522
epb = db->db.db_size >> DNODE_SHIFT;
1523
1524
idx = object & (epb - 1);
1525
dn_block = (dnode_phys_t *)db->db.db_data;
1526
1527
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1528
dnc = dmu_buf_get_user(&db->db);
1529
dnh = NULL;
1530
if (dnc == NULL) {
1531
dnode_children_t *winner;
1532
int skip = 0;
1533
1534
dnc = kmem_zalloc(sizeof (dnode_children_t) +
1535
epb * sizeof (dnode_handle_t), KM_SLEEP);
1536
dnc->dnc_count = epb;
1537
dnh = &dnc->dnc_children[0];
1538
1539
/* Initialize dnode slot status from dnode_phys_t */
1540
for (int i = 0; i < epb; i++) {
1541
zrl_init(&dnh[i].dnh_zrlock);
1542
1543
if (skip) {
1544
skip--;
1545
continue;
1546
}
1547
1548
if (dn_block[i].dn_type != DMU_OT_NONE) {
1549
int interior = dn_block[i].dn_extra_slots;
1550
1551
dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
1552
dnode_set_slots(dnc, i + 1, interior,
1553
DN_SLOT_INTERIOR);
1554
skip = interior;
1555
} else {
1556
dnh[i].dnh_dnode = DN_SLOT_FREE;
1557
skip = 0;
1558
}
1559
}
1560
1561
dmu_buf_init_user(&dnc->dnc_dbu, NULL,
1562
dnode_buf_evict_async, NULL);
1563
winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
1564
if (winner != NULL) {
1565
1566
for (int i = 0; i < epb; i++)
1567
zrl_destroy(&dnh[i].dnh_zrlock);
1568
1569
kmem_free(dnc, sizeof (dnode_children_t) +
1570
epb * sizeof (dnode_handle_t));
1571
dnc = winner;
1572
}
1573
}
1574
1575
ASSERT(dnc->dnc_count == epb);
1576
1577
if (flag & DNODE_MUST_BE_ALLOCATED) {
1578
slots = 1;
1579
1580
dnode_slots_hold(dnc, idx, slots);
1581
dnh = &dnc->dnc_children[idx];
1582
1583
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1584
dn = dnh->dnh_dnode;
1585
} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
1586
DNODE_STAT_BUMP(dnode_hold_alloc_interior);
1587
dnode_slots_rele(dnc, idx, slots);
1588
dbuf_rele(db, FTAG);
1589
return (SET_ERROR(EEXIST));
1590
} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
1591
DNODE_STAT_BUMP(dnode_hold_alloc_misses);
1592
dnode_slots_rele(dnc, idx, slots);
1593
dbuf_rele(db, FTAG);
1594
return (SET_ERROR(ENOENT));
1595
} else {
1596
dnode_slots_rele(dnc, idx, slots);
1597
while (!dnode_slots_tryenter(dnc, idx, slots)) {
1598
DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
1599
kpreempt(KPREEMPT_SYNC);
1600
}
1601
1602
/*
1603
* Someone else won the race and called dnode_create()
1604
* after we checked DN_SLOT_IS_PTR() above but before
1605
* we acquired the lock.
1606
*/
1607
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1608
DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
1609
dn = dnh->dnh_dnode;
1610
} else {
1611
dn = dnode_create(os, dn_block + idx, db,
1612
object, dnh);
1613
dmu_buf_add_user_size(&db->db,
1614
sizeof (dnode_t));
1615
}
1616
}
1617
1618
mutex_enter(&dn->dn_mtx);
1619
if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
1620
DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
1621
mutex_exit(&dn->dn_mtx);
1622
dnode_slots_rele(dnc, idx, slots);
1623
dbuf_rele(db, FTAG);
1624
return (SET_ERROR(ENOENT));
1625
}
1626
1627
/* Don't actually hold if dry run, just return 0 */
1628
if (flag & DNODE_DRY_RUN) {
1629
mutex_exit(&dn->dn_mtx);
1630
dnode_slots_rele(dnc, idx, slots);
1631
dbuf_rele(db, FTAG);
1632
return (0);
1633
}
1634
1635
DNODE_STAT_BUMP(dnode_hold_alloc_hits);
1636
} else if (flag & DNODE_MUST_BE_FREE) {
1637
1638
if (idx + slots - 1 >= DNODES_PER_BLOCK) {
1639
DNODE_STAT_BUMP(dnode_hold_free_overflow);
1640
dbuf_rele(db, FTAG);
1641
return (SET_ERROR(ENOSPC));
1642
}
1643
1644
dnode_slots_hold(dnc, idx, slots);
1645
1646
if (!dnode_check_slots_free(dnc, idx, slots)) {
1647
DNODE_STAT_BUMP(dnode_hold_free_misses);
1648
dnode_slots_rele(dnc, idx, slots);
1649
dbuf_rele(db, FTAG);
1650
return (SET_ERROR(ENOSPC));
1651
}
1652
1653
dnode_slots_rele(dnc, idx, slots);
1654
while (!dnode_slots_tryenter(dnc, idx, slots)) {
1655
DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
1656
kpreempt(KPREEMPT_SYNC);
1657
}
1658
1659
if (!dnode_check_slots_free(dnc, idx, slots)) {
1660
DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
1661
dnode_slots_rele(dnc, idx, slots);
1662
dbuf_rele(db, FTAG);
1663
return (SET_ERROR(ENOSPC));
1664
}
1665
1666
/*
1667
* Allocated but otherwise free dnodes which would
1668
* be in the interior of a multi-slot dnodes need
1669
* to be freed. Single slot dnodes can be safely
1670
* re-purposed as a performance optimization.
1671
*/
1672
if (slots > 1) {
1673
uint_t reclaimed =
1674
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
1675
if (reclaimed > 0)
1676
dmu_buf_sub_user_size(&db->db,
1677
reclaimed * sizeof (dnode_t));
1678
}
1679
1680
dnh = &dnc->dnc_children[idx];
1681
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1682
dn = dnh->dnh_dnode;
1683
} else {
1684
dn = dnode_create(os, dn_block + idx, db,
1685
object, dnh);
1686
dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
1687
}
1688
1689
mutex_enter(&dn->dn_mtx);
1690
if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
1691
DNODE_STAT_BUMP(dnode_hold_free_refcount);
1692
mutex_exit(&dn->dn_mtx);
1693
dnode_slots_rele(dnc, idx, slots);
1694
dbuf_rele(db, FTAG);
1695
return (SET_ERROR(EEXIST));
1696
}
1697
1698
/* Don't actually hold if dry run, just return 0 */
1699
if (flag & DNODE_DRY_RUN) {
1700
mutex_exit(&dn->dn_mtx);
1701
dnode_slots_rele(dnc, idx, slots);
1702
dbuf_rele(db, FTAG);
1703
return (0);
1704
}
1705
1706
dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
1707
DNODE_STAT_BUMP(dnode_hold_free_hits);
1708
} else {
1709
dbuf_rele(db, FTAG);
1710
return (SET_ERROR(EINVAL));
1711
}
1712
1713
ASSERT0(dn->dn_free_txg);
1714
1715
if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
1716
dbuf_add_ref(db, dnh);
1717
1718
mutex_exit(&dn->dn_mtx);
1719
1720
/* Now we can rely on the hold to prevent the dnode from moving. */
1721
dnode_slots_rele(dnc, idx, slots);
1722
1723
DNODE_VERIFY(dn);
1724
ASSERT3P(dnp, !=, NULL);
1725
ASSERT3P(dn->dn_dbuf, ==, db);
1726
ASSERT3U(dn->dn_object, ==, object);
1727
dbuf_rele(db, FTAG);
1728
1729
*dnp = dn;
1730
return (0);
1731
}
1732
1733
/*
1734
* Return 0 with a held dnode in *dnp if the object is allocated,
* or an error if not.
1735
*/
1736
int
1737
dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp)
1738
{
1739
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
1740
dnp));
1741
}
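
/*
 * Usage sketch (illustrative only): callers pair dnode_hold() with
 * dnode_rele() on the same tag, and only touch the dnode while the
 * hold is outstanding:
 *
 *     dnode_t *dn;
 *     int err = dnode_hold(os, object, FTAG, &dn);
 *     if (err == 0) {
 *             ... use dn; the hold keeps it from being evicted or moved ...
 *             dnode_rele(dn, FTAG);
 *     }
 */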
1742
1743
/*
1744
* Can only add a reference if there is already at least one
1745
* reference on the dnode. Returns FALSE if unable to add a
1746
* new reference.
1747
*/
1748
static boolean_t
1749
dnode_add_ref_locked(dnode_t *dn, const void *tag)
1750
{
1751
ASSERT(MUTEX_HELD(&dn->dn_mtx));
1752
if (zfs_refcount_is_zero(&dn->dn_holds))
1753
return (FALSE);
1754
VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
1755
return (TRUE);
1756
}
1757
1758
boolean_t
1759
dnode_add_ref(dnode_t *dn, const void *tag)
1760
{
1761
mutex_enter(&dn->dn_mtx);
1762
boolean_t r = dnode_add_ref_locked(dn, tag);
1763
mutex_exit(&dn->dn_mtx);
1764
return (r);
1765
}
1766
1767
void
1768
dnode_rele(dnode_t *dn, const void *tag)
1769
{
1770
mutex_enter(&dn->dn_mtx);
1771
dnode_rele_and_unlock(dn, tag, B_FALSE);
1772
}
1773
1774
void
1775
dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting)
1776
{
1777
uint64_t refs;
1778
/* Grab these while the hold prevents the dnode from moving. */
1779
dmu_buf_impl_t *db = dn->dn_dbuf;
1780
dnode_handle_t *dnh = dn->dn_handle;
1781
1782
refs = zfs_refcount_remove(&dn->dn_holds, tag);
1783
if (refs == 0)
1784
cv_broadcast(&dn->dn_nodnholds);
1785
mutex_exit(&dn->dn_mtx);
1786
/* dnode could get destroyed at this point, so don't use it anymore */
1787
1788
/*
1789
* It's unsafe to release the last hold on a dnode by dnode_rele() or
1790
* indirectly by dbuf_rele() while relying on the dnode handle to
1791
* prevent the dnode from moving, since releasing the last hold could
1792
* result in the dnode's parent dbuf evicting its dnode handles. For
1793
* that reason anyone calling dnode_rele() or dbuf_rele() without some
1794
* other direct or indirect hold on the dnode must first drop the dnode
1795
* handle.
1796
*/
1797
#ifdef ZFS_DEBUG
1798
ASSERT(refs > 0 || zrl_owner(&dnh->dnh_zrlock) != curthread);
1799
#endif
1800
1801
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1802
if (refs == 0 && db != NULL) {
1803
/*
1804
* Another thread could add a hold to the dnode handle in
1805
* dnode_hold_impl() while holding the parent dbuf. Since the
1806
* hold on the parent dbuf prevents the handle from being
1807
* destroyed, the hold on the handle is OK. We can't yet assert
1808
* that the handle has zero references, but that will be
1809
* asserted anyway when the handle gets destroyed.
1810
*/
1811
mutex_enter(&db->db_mtx);
1812
dbuf_rele_and_unlock(db, dnh, evicting);
1813
}
1814
}
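
/*
 * Rough sketch (illustrative only) of the ordering described above: a
 * caller that is pinning the dnode only through its handle must drop the
 * handle before dropping what may be the last hold:
 *
 *     zrl_remove(&dnh->dnh_zrlock);   (drop the handle first)
 *     dnode_rele(dn, tag);            (the last hold may now go away)
 */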
1815
1816
/*
1817
* Test whether we can create a dnode at the specified location.
1818
*/
1819
int
1820
dnode_try_claim(objset_t *os, uint64_t object, int slots)
1821
{
1822
return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
1823
slots, NULL, NULL));
1824
}
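
/*
 * Example (illustrative only): probing whether 'object' could host a
 * 2-slot (1K) dnode, without taking a hold or creating anything:
 *
 *     if (dnode_try_claim(os, object, 2) == 0) {
 *             ... the slot at 'object' and the one after it are free ...
 *     }
 */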
1825
1826
/*
1827
* Test if the dnode is dirty, or carrying uncommitted records.
1828
*
1829
* dn_dirtycnt is the number of txgs this dnode is dirty on. It's incremented
1830
* in dnode_setdirty() the first time the dnode is dirtied on a txg, and
1831
* decremented in either dnode_rele_task() or userquota_updates_task() when the
1832
* txg is synced out.
1833
*/
1834
boolean_t
1835
dnode_is_dirty(dnode_t *dn)
1836
{
1837
mutex_enter(&dn->dn_mtx);
1838
boolean_t dirty = (dn->dn_dirtycnt != 0);
1839
mutex_exit(&dn->dn_mtx);
1840
return (dirty);
1841
}
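
/*
 * Example (illustrative only): a dnode dirtied in three consecutive open
 * txgs, none of which has synced out yet, has dn_dirtycnt == 3. Since at
 * most TXG_CONCURRENT_STATES (3) txgs can be in flight at once,
 * dnode_setdirty() asserts dn_dirtycnt <= 3.
 */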
1842
1843
void
1844
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1845
{
1846
objset_t *os = dn->dn_objset;
1847
uint64_t txg = tx->tx_txg;
1848
1849
if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1850
dsl_dataset_dirty(os->os_dsl_dataset, tx);
1851
return;
1852
}
1853
1854
DNODE_VERIFY(dn);
1855
1856
#ifdef ZFS_DEBUG
1857
mutex_enter(&dn->dn_mtx);
1858
ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1859
ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1860
mutex_exit(&dn->dn_mtx);
1861
#endif
1862
1863
/*
1864
* Determine old uid/gid when necessary
1865
*/
1866
dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
1867
1868
multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK];
1869
multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
1870
1871
/*
1872
* If we are already marked dirty, we're done.
1873
*/
1874
if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
1875
multilist_sublist_unlock(mls);
1876
return;
1877
}
1878
1879
ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
1880
!avl_is_empty(&dn->dn_dbufs));
1881
ASSERT(dn->dn_datablksz != 0);
1882
ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]);
1883
ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]);
1884
ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]);
1885
1886
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1887
(u_longlong_t)dn->dn_object, (u_longlong_t)txg);
1888
1889
multilist_sublist_insert_head(mls, dn);
1890
1891
multilist_sublist_unlock(mls);
1892
1893
/*
1894
* The dnode maintains a hold on its containing dbuf as
1895
* long as there are holds on it. Each instantiated child
1896
* dbuf maintains a hold on the dnode. When the last child
1897
* drops its hold, the dnode will drop its hold on the
1898
* containing dbuf. We add a "dirty hold" here so that the
1899
* dnode will hang around after we finish processing its
1900
* children.
1901
*/
1902
mutex_enter(&dn->dn_mtx);
1903
VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg));
1904
dn->dn_dirtycnt++;
1905
ASSERT3U(dn->dn_dirtycnt, <=, 3);
1906
mutex_exit(&dn->dn_mtx);
1907
1908
(void) dbuf_dirty(dn->dn_dbuf, tx);
1909
1910
dsl_dataset_dirty(os->os_dsl_dataset, tx);
1911
}
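
/*
 * Note on the indexing used above (illustrative values): per-txg state
 * such as os_dirty_dnodes[] and the dn_next_*[] arrays is indexed by the
 * low bits of the txg number, e.g.:
 *
 *     int txgoff = tx->tx_txg & TXG_MASK;      (txg 1025 -> slot 1)
 *     multilist_t *dl = &os->os_dirty_dnodes[txgoff];
 */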
1912
1913
void
1914
dnode_free(dnode_t *dn, dmu_tx_t *tx)
1915
{
1916
mutex_enter(&dn->dn_mtx);
1917
if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1918
mutex_exit(&dn->dn_mtx);
1919
return;
1920
}
1921
dn->dn_free_txg = tx->tx_txg;
1922
mutex_exit(&dn->dn_mtx);
1923
1924
dnode_setdirty(dn, tx);
1925
}
1926
1927
/*
1928
* Try to change the block size for the indicated dnode. This can only
1929
* succeed if there are no blocks allocated or dirty beyond the first block.
1930
*/
1931
int
1932
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
1933
{
1934
dmu_buf_impl_t *db;
1935
int err;
1936
1937
ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
1938
if (size == 0)
1939
size = SPA_MINBLOCKSIZE;
1940
else
1941
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
1942
1943
if (ibs == dn->dn_indblkshift)
1944
ibs = 0;
1945
1946
if (size == dn->dn_datablksz && ibs == 0)
1947
return (0);
1948
1949
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1950
1951
/* Check for any allocated blocks beyond the first */
1952
if (dn->dn_maxblkid != 0)
1953
goto fail;
1954
1955
mutex_enter(&dn->dn_dbufs_mtx);
1956
for (db = avl_first(&dn->dn_dbufs); db != NULL;
1957
db = AVL_NEXT(&dn->dn_dbufs, db)) {
1958
if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
1959
db->db_blkid != DMU_SPILL_BLKID) {
1960
mutex_exit(&dn->dn_dbufs_mtx);
1961
goto fail;
1962
}
1963
}
1964
mutex_exit(&dn->dn_dbufs_mtx);
1965
1966
if (ibs && dn->dn_nlevels != 1)
1967
goto fail;
1968
1969
dnode_setdirty(dn, tx);
1970
if (size != dn->dn_datablksz) {
1971
/* resize the old block */
1972
err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
1973
if (err == 0) {
1974
dbuf_new_size(db, size, tx);
1975
} else if (err != ENOENT) {
1976
goto fail;
1977
}
1978
1979
dnode_setdblksz(dn, size);
1980
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
1981
if (db)
1982
dbuf_rele(db, FTAG);
1983
}
1984
if (ibs) {
1985
dn->dn_indblkshift = ibs;
1986
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
1987
}
1988
1989
rw_exit(&dn->dn_struct_rwlock);
1990
return (0);
1991
1992
fail:
1993
rw_exit(&dn->dn_struct_rwlock);
1994
return (SET_ERROR(ENOTSUP));
1995
}
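
/*
 * Example (illustrative numbers): the requested size is rounded up to a
 * multiple of SPA_MINBLOCKSIZE (512), so on a single-block object
 *
 *     err = dnode_set_blksz(dn, 3000, 0, tx);
 *
 * leaves dn_datablksz at 3072 on success, and returns ENOTSUP if blocks
 * beyond the first are already allocated or dirty.
 */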
1996
1997
static void
1998
dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
1999
{
2000
uint64_t txgoff = tx->tx_txg & TXG_MASK;
2001
int old_nlevels = dn->dn_nlevels;
2002
dmu_buf_impl_t *db;
2003
list_t *list;
2004
dbuf_dirty_record_t *new, *dr, *dr_next;
2005
2006
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2007
2008
ASSERT3U(new_nlevels, >, dn->dn_nlevels);
2009
dn->dn_nlevels = new_nlevels;
2010
2011
ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
2012
dn->dn_next_nlevels[txgoff] = new_nlevels;
2013
2014
/* dirty the left indirects */
2015
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
2016
ASSERT(db != NULL);
2017
new = dbuf_dirty(db, tx);
2018
dbuf_rele(db, FTAG);
2019
2020
/* transfer the dirty records to the new indirect */
2021
mutex_enter(&dn->dn_mtx);
2022
mutex_enter(&new->dt.di.dr_mtx);
2023
list = &dn->dn_dirty_records[txgoff];
2024
for (dr = list_head(list); dr; dr = dr_next) {
2025
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
2026
2027
IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1);
2028
if (dr->dr_dbuf == NULL ||
2029
(dr->dr_dbuf->db_level == old_nlevels - 1 &&
2030
dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
2031
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) {
2032
list_remove(&dn->dn_dirty_records[txgoff], dr);
2033
list_insert_tail(&new->dt.di.dr_children, dr);
2034
dr->dr_parent = new;
2035
}
2036
}
2037
mutex_exit(&new->dt.di.dr_mtx);
2038
mutex_exit(&dn->dn_mtx);
2039
}
2040
2041
int
2042
dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
2043
{
2044
int ret = 0;
2045
2046
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2047
2048
if (dn->dn_nlevels == nlevels) {
2049
ret = 0;
2050
goto out;
2051
} else if (nlevels < dn->dn_nlevels) {
2052
ret = SET_ERROR(EINVAL);
2053
goto out;
2054
}
2055
2056
dnode_set_nlevels_impl(dn, nlevels, tx);
2057
2058
out:
2059
rw_exit(&dn->dn_struct_rwlock);
2060
return (ret);
2061
}
2062
2063
/* read-holding callers must not rely on the lock being continuously held */
2064
void
2065
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
2066
boolean_t force)
2067
{
2068
int epbs, new_nlevels;
2069
uint64_t sz;
2070
2071
ASSERT(blkid != DMU_BONUS_BLKID);
2072
2073
ASSERT(have_read ?
2074
RW_READ_HELD(&dn->dn_struct_rwlock) :
2075
RW_WRITE_HELD(&dn->dn_struct_rwlock));
2076
2077
/*
2078
* if we have a read-lock, check to see if we need to do any work
2079
* before upgrading to a write-lock.
2080
*/
2081
if (have_read) {
2082
if (blkid <= dn->dn_maxblkid)
2083
return;
2084
2085
if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
2086
rw_exit(&dn->dn_struct_rwlock);
2087
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2088
}
2089
}
2090
2091
/*
2092
* Raw sends (indicated by the force flag) require that we take the
2093
* given blkid even if the value is lower than the current value.
2094
*/
2095
if (!force && blkid <= dn->dn_maxblkid)
2096
goto out;
2097
2098
/*
2099
* We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
2100
* to indicate that this field is set. This allows us to set the
2101
* maxblkid to 0 on an existing object in dnode_sync().
2102
*/
2103
dn->dn_maxblkid = blkid;
2104
dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
2105
blkid | DMU_NEXT_MAXBLKID_SET;
2106
2107
/*
2108
* Compute the number of levels necessary to support the new maxblkid.
2109
* Raw sends will ensure nlevels is set correctly for us.
2110
*/
2111
new_nlevels = 1;
2112
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2113
for (sz = dn->dn_nblkptr;
2114
sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
2115
new_nlevels++;
2116
2117
ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
2118
2119
if (!force) {
2120
if (new_nlevels > dn->dn_nlevels)
2121
dnode_set_nlevels_impl(dn, new_nlevels, tx);
2122
} else {
2123
ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
2124
}
2125
2126
out:
2127
if (have_read)
2128
rw_downgrade(&dn->dn_struct_rwlock);
2129
}
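
/*
 * Worked example (illustrative values) of the nlevels computation above,
 * for dn_nblkptr == 3 and epbs == 10 (128K indirect blocks), with a new
 * maxblkid of 5000:
 *
 *     sz = 3       <= 5000 -> new_nlevels = 2
 *     sz = 3072    <= 5000 -> new_nlevels = 3
 *     sz = 3145728 >  5000 -> stop; three levels are enough
 */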
2130
2131
static void
2132
dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
2133
{
2134
dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
2135
if (db != NULL) {
2136
dmu_buf_will_dirty(&db->db, tx);
2137
dbuf_rele(db, FTAG);
2138
}
2139
}
2140
2141
/*
2142
* Dirty all the in-core level-1 dbufs in the range specified by start_blkid
2143
* and end_blkid.
2144
*/
2145
static void
2146
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
2147
dmu_tx_t *tx)
2148
{
2149
dmu_buf_impl_t *db_search;
2150
dmu_buf_impl_t *db;
2151
avl_index_t where;
2152
2153
db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
2154
2155
mutex_enter(&dn->dn_dbufs_mtx);
2156
2157
db_search->db_level = 1;
2158
db_search->db_blkid = start_blkid + 1;
2159
db_search->db_state = DB_SEARCH;
2160
for (;;) {
2161
2162
db = avl_find(&dn->dn_dbufs, db_search, &where);
2163
if (db == NULL)
2164
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
2165
2166
if (db == NULL || db->db_level != 1 ||
2167
db->db_blkid >= end_blkid) {
2168
break;
2169
}
2170
2171
/*
2172
* Set up the next blkid we want to search for.
2173
*/
2174
db_search->db_blkid = db->db_blkid + 1;
2175
ASSERT3U(db->db_blkid, >=, start_blkid);
2176
2177
/*
2178
* If the dbuf transitions to DB_EVICTING while we're trying
2179
* to dirty it, then we will be unable to discover it in
2180
* the dbuf hash table. This will result in a call to
2181
* dbuf_create() which needs to acquire the dn_dbufs_mtx
2182
* lock. To avoid a deadlock, we drop the lock before
2183
* dirtying the level-1 dbuf.
2184
*/
2185
mutex_exit(&dn->dn_dbufs_mtx);
2186
dnode_dirty_l1(dn, db->db_blkid, tx);
2187
mutex_enter(&dn->dn_dbufs_mtx);
2188
}
2189
2190
#ifdef ZFS_DEBUG
2191
/*
2192
* Walk all the in-core level-1 dbufs and verify they have been dirtied.
2193
*/
2194
db_search->db_level = 1;
2195
db_search->db_blkid = start_blkid + 1;
2196
db_search->db_state = DB_SEARCH;
2197
db = avl_find(&dn->dn_dbufs, db_search, &where);
2198
if (db == NULL)
2199
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
2200
for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
2201
if (db->db_level != 1 || db->db_blkid >= end_blkid)
2202
break;
2203
if (db->db_state != DB_EVICTING)
2204
ASSERT(db->db_dirtycnt > 0);
2205
}
2206
#endif
2207
kmem_free(db_search, sizeof (dmu_buf_impl_t));
2208
mutex_exit(&dn->dn_dbufs_mtx);
2209
}
2210
2211
static void
2212
dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len,
2213
dmu_tx_t *tx)
2214
{
2215
dmu_buf_impl_t *db;
2216
int res;
2217
2218
rw_enter(&dn->dn_struct_rwlock, RW_READER);
2219
res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE,
2220
FTAG, &db);
2221
rw_exit(&dn->dn_struct_rwlock);
2222
if (res == 0) {
2223
db_lock_type_t dblt;
2224
boolean_t dirty;
2225
2226
dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
2227
/* don't dirty if not on disk and not dirty */
2228
dirty = !list_is_empty(&db->db_dirty_records) ||
2229
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
2230
dmu_buf_unlock_parent(db, dblt, FTAG);
2231
if (dirty) {
2232
caddr_t data;
2233
2234
dmu_buf_will_dirty(&db->db, tx);
2235
data = db->db.db_data;
2236
memset(data + blkoff, 0, len);
2237
}
2238
dbuf_rele(db, FTAG);
2239
}
2240
}
2241
2242
void
2243
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
2244
{
2245
uint64_t blkoff, blkid, nblks;
2246
int blksz, blkshift, head, tail;
2247
int trunc = FALSE;
2248
int epbs;
2249
2250
blksz = dn->dn_datablksz;
2251
blkshift = dn->dn_datablkshift;
2252
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2253
2254
if (len == DMU_OBJECT_END) {
2255
len = UINT64_MAX - off;
2256
trunc = TRUE;
2257
}
2258
2259
/*
2260
* First, block align the region to free:
2261
*/
2262
if (ISP2(blksz)) {
2263
head = P2NPHASE(off, blksz);
2264
blkoff = P2PHASE(off, blksz);
2265
if ((off >> blkshift) > dn->dn_maxblkid)
2266
return;
2267
} else {
2268
ASSERT0(dn->dn_maxblkid);
2269
if (off == 0 && len >= blksz) {
2270
/*
2271
* Freeing the whole block; fast-track this request.
2272
*/
2273
blkid = 0;
2274
nblks = 1;
2275
if (dn->dn_nlevels > 1) {
2276
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2277
dnode_dirty_l1(dn, 0, tx);
2278
rw_exit(&dn->dn_struct_rwlock);
2279
}
2280
goto done;
2281
} else if (off >= blksz) {
2282
/* Freeing past end-of-data */
2283
return;
2284
} else {
2285
/* Freeing part of the block. */
2286
head = blksz - off;
2287
ASSERT3U(head, >, 0);
2288
}
2289
blkoff = off;
2290
}
2291
/* zero out any partial block data at the start of the range */
2292
if (head) {
2293
ASSERT3U(blkoff + head, ==, blksz);
2294
if (len < head)
2295
head = len;
2296
dnode_partial_zero(dn, off, blkoff, head, tx);
2297
off += head;
2298
len -= head;
2299
}
2300
2301
/* If the range was less than one block, we're done */
2302
if (len == 0)
2303
return;
2304
2305
/* If the remaining range is past end of file, we're done */
2306
if ((off >> blkshift) > dn->dn_maxblkid)
2307
return;
2308
2309
ASSERT(ISP2(blksz));
2310
if (trunc)
2311
tail = 0;
2312
else
2313
tail = P2PHASE(len, blksz);
2314
2315
ASSERT0(P2PHASE(off, blksz));
2316
/* zero out any partial block data at the end of the range */
2317
if (tail) {
2318
if (len < tail)
2319
tail = len;
2320
dnode_partial_zero(dn, off + len, 0, tail, tx);
2321
len -= tail;
2322
}
2323
2324
/* If the range did not include a full block, we are done */
2325
if (len == 0)
2326
return;
2327
2328
ASSERT(IS_P2ALIGNED(off, blksz));
2329
ASSERT(trunc || IS_P2ALIGNED(len, blksz));
2330
blkid = off >> blkshift;
2331
nblks = len >> blkshift;
2332
if (trunc)
2333
nblks += 1;
2334
2335
/*
2336
* Dirty all the indirect blocks in this range. Note that only
2337
* the first and last indirect blocks can actually be written
2338
* (if they were partially freed) -- they must be dirtied, even if
2339
* they do not exist on disk yet. The interior blocks will
2340
* be freed by free_children(), so they will not actually be written.
2341
* Even though these interior blocks will not be written, we
2342
* dirty them for two reasons:
2343
*
2344
* - It ensures that the indirect blocks remain in memory until
2345
* syncing context. (They have already been prefetched by
2346
* dmu_tx_hold_free(), so we don't have to worry about reading
2347
* them serially here.)
2348
*
2349
* - The dirty space accounting will put pressure on the txg sync
2350
* mechanism to begin syncing, and to delay transactions if there
2351
* is a large amount of freeing. Even though these indirect
2352
* blocks will not be written, we could need to write the same
2353
* amount of space if we copy the freed BPs into deadlists.
2354
*/
2355
if (dn->dn_nlevels > 1) {
2356
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2357
uint64_t first, last;
2358
2359
first = blkid >> epbs;
2360
dnode_dirty_l1(dn, first, tx);
2361
if (trunc)
2362
last = dn->dn_maxblkid >> epbs;
2363
else
2364
last = (blkid + nblks - 1) >> epbs;
2365
if (last != first)
2366
dnode_dirty_l1(dn, last, tx);
2367
2368
dnode_dirty_l1range(dn, first, last, tx);
2369
2370
int shift = dn->dn_datablkshift + dn->dn_indblkshift -
2371
SPA_BLKPTRSHIFT;
2372
for (uint64_t i = first + 1; i < last; i++) {
2373
/*
2374
* Set i to the blockid of the next non-hole
2375
* level-1 indirect block at or after i. Note
2376
* that dnode_next_offset() operates in terms of
2377
* level-0-equivalent bytes.
2378
*/
2379
uint64_t ibyte = i << shift;
2380
int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
2381
&ibyte, 2, 1, 0);
2382
i = ibyte >> shift;
2383
if (i >= last)
2384
break;
2385
2386
/*
2387
* Normally we should not see an error, either
2388
* from dnode_next_offset() or dbuf_hold_level()
2389
* (except for ESRCH from dnode_next_offset).
2390
* If there is an i/o error, then when we read
2391
* this block in syncing context, it will use
2392
* ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
2393
* to the "failmode" property. dnode_next_offset()
2394
* doesn't have a flag to indicate MUSTSUCCEED.
2395
*/
2396
if (err != 0)
2397
break;
2398
2399
dnode_dirty_l1(dn, i, tx);
2400
}
2401
rw_exit(&dn->dn_struct_rwlock);
2402
}
2403
2404
done:
2405
/*
2406
* Add this range to the dnode range list.
2407
* We will finish up this free operation in the syncing phase.
2408
*/
2409
mutex_enter(&dn->dn_mtx);
2410
{
2411
int txgoff = tx->tx_txg & TXG_MASK;
2412
if (dn->dn_free_ranges[txgoff] == NULL) {
2413
dn->dn_free_ranges[txgoff] =
2414
zfs_range_tree_create_flags(
2415
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
2416
ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
2417
}
2418
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
2419
zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
2420
}
2421
dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
2422
(u_longlong_t)blkid, (u_longlong_t)nblks,
2423
(u_longlong_t)tx->tx_txg);
2424
mutex_exit(&dn->dn_mtx);
2425
2426
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
2427
dnode_setdirty(dn, tx);
2428
}
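
/*
 * Worked example (illustrative values) of the head/tail split above, for
 * a 128K-block object with off = 0x21000 and len = 0x40000:
 *
 *     head = P2NPHASE(0x21000, 0x20000) = 0x1f000   (zeroed in place)
 *     then off = 0x40000, len = 0x21000
 *     tail = P2PHASE(0x21000, 0x20000) = 0x1000     (zeroed in place)
 *     full blocks freed: blkid = 2, nblks = 1
 */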
2429
2430
static boolean_t
2431
dnode_spill_freed(dnode_t *dn)
2432
{
2433
int i;
2434
2435
mutex_enter(&dn->dn_mtx);
2436
for (i = 0; i < TXG_SIZE; i++) {
2437
if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
2438
break;
2439
}
2440
mutex_exit(&dn->dn_mtx);
2441
return (i < TXG_SIZE);
2442
}
2443
2444
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
2445
uint64_t
2446
dnode_block_freed(dnode_t *dn, uint64_t blkid)
2447
{
2448
int i;
2449
2450
if (blkid == DMU_BONUS_BLKID)
2451
return (FALSE);
2452
2453
if (dn->dn_free_txg)
2454
return (TRUE);
2455
2456
if (blkid == DMU_SPILL_BLKID)
2457
return (dnode_spill_freed(dn));
2458
2459
mutex_enter(&dn->dn_mtx);
2460
for (i = 0; i < TXG_SIZE; i++) {
2461
if (dn->dn_free_ranges[i] != NULL &&
2462
zfs_range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
2463
break;
2464
}
2465
mutex_exit(&dn->dn_mtx);
2466
return (i < TXG_SIZE);
2467
}
2468
2469
/* call from syncing context when we actually write/free space for this dnode */
2470
void
2471
dnode_diduse_space(dnode_t *dn, int64_t delta)
2472
{
2473
uint64_t space;
2474
dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
2475
dn, dn->dn_phys,
2476
(u_longlong_t)dn->dn_phys->dn_used,
2477
(longlong_t)delta);
2478
2479
mutex_enter(&dn->dn_mtx);
2480
space = DN_USED_BYTES(dn->dn_phys);
2481
if (delta > 0) {
2482
ASSERT3U(space + delta, >=, space); /* no overflow */
2483
} else {
2484
ASSERT3U(space, >=, -delta); /* no underflow */
2485
}
2486
space += delta;
2487
if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
2488
ASSERT0((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES));
2489
ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
2490
dn->dn_phys->dn_used = space >> DEV_BSHIFT;
2491
} else {
2492
dn->dn_phys->dn_used = space;
2493
dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
2494
}
2495
mutex_exit(&dn->dn_mtx);
2496
}
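
/*
 * Example (illustrative values): on a pool older than
 * SPA_VERSION_DNODE_BYTES the running total must be a multiple of 512 and
 * is stored in sectors, so space == 0x6000 is recorded as dn_used = 0x30;
 * newer pools store the byte count directly and set DNODE_FLAG_USED_BYTES.
 */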
2497
2498
/*
2499
* Scans a block at the indicated "level" looking for a hole or data,
2500
* depending on 'flags'.
2501
*
2502
* If level > 0, then we are scanning an indirect block looking at its
2503
* pointers. If level == 0, then we are looking at a block of dnodes.
2504
*
2505
* If we don't find what we are looking for in the block, we return ESRCH.
2506
* Otherwise, return with *offset pointing to the beginning (if searching
2507
* forwards) or end (if searching backwards) of the range covered by the
2508
* block pointer we matched on (or dnode).
2509
*
2510
* The basic search algorithm used below by dnode_next_offset() is to
2511
* use this function to search up the block tree (widen the search) until
2512
* we find something (i.e., we don't return ESRCH) and then search back
2513
* down the tree (narrow the search) until we reach our original search
2514
* level.
2515
*/
2516
static int
2517
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
2518
int lvl, uint64_t blkfill, uint64_t txg)
2519
{
2520
dmu_buf_impl_t *db = NULL;
2521
void *data = NULL;
2522
uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2523
uint64_t epb = 1ULL << epbs;
2524
uint64_t minfill, maxfill;
2525
boolean_t hole;
2526
int i, inc, error, span;
2527
2528
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2529
2530
hole = ((flags & DNODE_FIND_HOLE) != 0);
2531
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
2532
ASSERT(txg == 0 || !hole);
2533
2534
if (lvl == dn->dn_phys->dn_nlevels) {
2535
error = 0;
2536
epb = dn->dn_phys->dn_nblkptr;
2537
data = dn->dn_phys->dn_blkptr;
2538
if (dn->dn_dbuf != NULL)
2539
rw_enter(&dn->dn_dbuf->db_rwlock, RW_READER);
2540
else if (dmu_objset_ds(dn->dn_objset) != NULL)
2541
rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
2542
RW_READER, FTAG);
2543
} else {
2544
uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
2545
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
2546
if (error) {
2547
if (error != ENOENT)
2548
return (error);
2549
if (hole)
2550
return (0);
2551
/*
2552
* This can only happen when we are searching up
2553
* the block tree for data. We don't really need to
2554
* adjust the offset, as we will just end up looking
2555
* at the pointer to this block in its parent, and it's
2556
* going to be unallocated, so we will skip over it.
2557
*/
2558
return (SET_ERROR(ESRCH));
2559
}
2560
error = dbuf_read(db, NULL,
2561
DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
2562
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
2563
if (error) {
2564
dbuf_rele(db, FTAG);
2565
return (error);
2566
}
2567
data = db->db.db_data;
2568
rw_enter(&db->db_rwlock, RW_READER);
2569
}
2570
2571
if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
2572
BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
2573
BP_IS_HOLE(db->db_blkptr))) {
2574
/*
2575
* This can only happen when we are searching up the tree
2576
* and these conditions mean that we need to keep climbing.
2577
*/
2578
error = SET_ERROR(ESRCH);
2579
} else if (lvl == 0) {
2580
dnode_phys_t *dnp = data;
2581
2582
ASSERT(dn->dn_type == DMU_OT_DNODE);
2583
ASSERT(!(flags & DNODE_FIND_BACKWARDS));
2584
2585
for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
2586
i < blkfill; i += dnp[i].dn_extra_slots + 1) {
2587
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
2588
break;
2589
}
2590
2591
if (i == blkfill)
2592
error = SET_ERROR(ESRCH);
2593
2594
*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
2595
(i << DNODE_SHIFT);
2596
} else {
2597
blkptr_t *bp = data;
2598
uint64_t start = *offset;
2599
span = (lvl - 1) * epbs + dn->dn_datablkshift;
2600
minfill = 0;
2601
maxfill = blkfill << ((lvl - 1) * epbs);
2602
2603
if (hole)
2604
maxfill--;
2605
else
2606
minfill++;
2607
2608
if (span >= 8 * sizeof (*offset)) {
2609
/* This only happens on the highest indirection level */
2610
ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
2611
*offset = 0;
2612
} else {
2613
*offset = *offset >> span;
2614
}
2615
2616
for (i = BF64_GET(*offset, 0, epbs);
2617
i >= 0 && i < epb; i += inc) {
2618
if (BP_GET_FILL(&bp[i]) >= minfill &&
2619
BP_GET_FILL(&bp[i]) <= maxfill &&
2620
(hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
2621
break;
2622
if (inc > 0 || *offset > 0)
2623
*offset += inc;
2624
}
2625
2626
if (span >= 8 * sizeof (*offset)) {
2627
*offset = start;
2628
} else {
2629
*offset = *offset << span;
2630
}
2631
2632
if (inc < 0) {
2633
/* traversing backwards; position offset at the end */
2634
if (span < 8 * sizeof (*offset))
2635
*offset = MIN(*offset + (1ULL << span) - 1,
2636
start);
2637
} else if (*offset < start) {
2638
*offset = start;
2639
}
2640
if (i < 0 || i >= epb)
2641
error = SET_ERROR(ESRCH);
2642
}
2643
2644
if (db != NULL) {
2645
rw_exit(&db->db_rwlock);
2646
dbuf_rele(db, FTAG);
2647
} else {
2648
if (dn->dn_dbuf != NULL)
2649
rw_exit(&dn->dn_dbuf->db_rwlock);
2650
else if (dmu_objset_ds(dn->dn_objset) != NULL)
2651
rrw_exit(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
2652
FTAG);
2653
}
2654
2655
return (error);
2656
}
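
/*
 * Example (illustrative values) of the span arithmetic above, for 128K
 * data blocks (dn_datablkshift == 17) and 128K indirects (epbs == 10):
 *
 *     lvl == 1: span = 0 * 10 + 17 = 17, each BP covers 128K of data
 *     lvl == 2: span = 1 * 10 + 17 = 27, each BP covers 128M of data
 */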
2657
2658
/*
2659
* Adjust *offset to the next (or previous) block byte offset at lvl.
2660
* Returns FALSE if *offset would overflow or underflow.
2661
*/
2662
static boolean_t
2663
dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
2664
{
2665
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2666
int span = lvl * epbs + dn->dn_datablkshift;
2667
uint64_t blkid, maxblkid;
2668
2669
if (span >= 8 * sizeof (uint64_t))
2670
return (B_FALSE);
2671
2672
blkid = *offset >> span;
2673
maxblkid = 1ULL << (8 * sizeof (*offset) - span);
2674
if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
2675
*offset = (blkid + 1) << span;
2676
else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
2677
*offset = (blkid << span) - 1;
2678
else
2679
return (B_FALSE);
2680
2681
return (B_TRUE);
2682
}
2683
2684
/*
2685
* Find the next hole, data, or sparse region at or after *offset.
2686
* The value 'blkfill' tells us how many items we expect to find
2687
* in an L0 data block; this value is 1 for normal objects,
2688
* DNODES_PER_BLOCK for the meta dnode, and some fraction of
2689
* DNODES_PER_BLOCK when searching for sparse regions thereof.
2690
*
2691
* Examples:
2692
*
2693
* dnode_next_offset(dn, flags, offset, 1, 1, 0);
2694
* Finds the next/previous hole/data in a file.
2695
* Used in dmu_offset_next().
2696
*
2697
* dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
2698
* Finds the next free/allocated dnode in an objset's meta-dnode.
2699
* Only finds objects that have new contents since txg (i.e.
2700
* bonus buffer changes and content removal are ignored).
2701
* Used in dmu_object_next().
2702
*
2703
* dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
2704
* Finds the next L2 meta-dnode bp that's at most 1/4 full.
2705
* Used in dmu_object_alloc().
2706
*/
2707
int
2708
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
2709
int minlvl, uint64_t blkfill, uint64_t txg)
2710
{
2711
uint64_t matched = *offset;
2712
int lvl, maxlvl;
2713
int error = 0;
2714
2715
if (!(flags & DNODE_FIND_HAVELOCK))
2716
rw_enter(&dn->dn_struct_rwlock, RW_READER);
2717
2718
if (dn->dn_phys->dn_nlevels == 0) {
2719
error = SET_ERROR(ESRCH);
2720
goto out;
2721
}
2722
2723
if (dn->dn_datablkshift == 0) {
2724
if (*offset < dn->dn_datablksz) {
2725
if (flags & DNODE_FIND_HOLE)
2726
*offset = dn->dn_datablksz;
2727
} else {
2728
error = SET_ERROR(ESRCH);
2729
}
2730
goto out;
2731
}
2732
2733
maxlvl = dn->dn_phys->dn_nlevels;
2734
2735
for (lvl = minlvl; lvl <= maxlvl; ) {
2736
error = dnode_next_offset_level(dn,
2737
flags, offset, lvl, blkfill, txg);
2738
if (error == 0 && lvl > minlvl) {
2739
--lvl;
2740
matched = *offset;
2741
} else if (error == ESRCH && lvl < maxlvl &&
2742
dnode_next_block(dn, flags, &matched, lvl)) {
2743
/*
2744
* Continue search at next/prev offset in lvl+1 block.
2745
*
2746
* Usually we only search upwards at the start of the
2747
* search as higher level blocks point at a matching
2748
* minlvl block in most cases, but we backtrack if not.
2749
*
2750
* This can happen for txg > 0 searches if the block
2751
* contains only BPs/dnodes freed at that txg. It also
2752
* happens if we are still syncing out the tree, and
2753
* some BP's at higher levels are not updated yet.
2754
*
2755
* We must adjust offset to avoid coming back to the
2756
* same offset and getting stuck looping forever. This
2757
* also deals with the case where offset is already at
2758
* the beginning or end of the object.
2759
*/
2760
++lvl;
2761
*offset = matched;
2762
} else {
2763
break;
2764
}
2765
}
2766
2767
/*
2768
* There's always a "virtual hole" at the end of the object, even
2769
* if all BP's which physically exist are non-holes.
2770
*/
2771
if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
2772
minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
2773
error = 0;
2774
}
2775
2776
out:
2777
if (!(flags & DNODE_FIND_HAVELOCK))
2778
rw_exit(&dn->dn_struct_rwlock);
2779
2780
return (error);
2781
}
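
/*
 * Usage sketch (illustrative only): finding the next data region at or
 * after 'off' in a plain file object, in the spirit of dmu_offset_next():
 *
 *     uint64_t off = start;
 *     int err = dnode_next_offset(dn, 0, &off, 1, 1, 0);
 *     (err == 0: off is within the next non-hole block;
 *      err == ESRCH: no more data before the end of the object)
 */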
2782
2783
#if defined(_KERNEL)
2784
EXPORT_SYMBOL(dnode_hold);
2785
EXPORT_SYMBOL(dnode_rele);
2786
EXPORT_SYMBOL(dnode_set_nlevels);
2787
EXPORT_SYMBOL(dnode_set_blksz);
2788
EXPORT_SYMBOL(dnode_free_range);
2789
EXPORT_SYMBOL(dnode_evict_dbufs);
2790
EXPORT_SYMBOL(dnode_evict_bonus);
2791
#endif
2792
2793
ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW,
2794
"Default dnode block shift");
2795
ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW,
2796
"Default dnode indirect block shift");
2797
2798