GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/dmu_traverse.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>
#include <sys/zfeature.h>

static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
static int32_t send_holes_without_birth_time = 1;
static uint_t zfs_traverse_indirect_prefetch_limit = 32;

typedef struct prefetch_data {
	kmutex_t pd_mtx;
	kcondvar_t pd_cv;
	int32_t pd_bytes_fetched;
	int pd_flags;
	boolean_t pd_cancel;
	boolean_t pd_exited;
	zbookmark_phys_t pd_resume;
} prefetch_data_t;

typedef struct traverse_data {
	spa_t *td_spa;
	uint64_t td_objset;
	blkptr_t *td_rootbp;
	uint64_t td_min_txg;
	zbookmark_phys_t *td_resume;
	int td_flags;
	prefetch_data_t *td_pfd;
	boolean_t td_paused;
	uint64_t td_hole_birth_enabled_txg;
	blkptr_cb_t *td_func;
	void *td_arg;
	boolean_t td_realloc_possible;
} traverse_data_t;

static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
    const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
    uint64_t objset, uint64_t object);

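/*
 * Return the birth txg used for traversal comparisons: the logical birth
 * txg when TRAVERSE_LOGICAL is set, otherwise the txg from BP_GET_BIRTH().
 */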
static inline uint64_t
get_birth_time(traverse_data_t *td, const blkptr_t *bp)
{
	if (td->td_flags & TRAVERSE_LOGICAL)
		return (BP_GET_LOGICAL_BIRTH(bp));
	else
		return (BP_GET_BIRTH(bp));
}

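/*
 * zil_parse() callback for ZIL log blocks: visit each claimed log block
 * through td_func with a ZIL bookmark.
 */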
static int
traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
    uint64_t claim_txg)
{
	traverse_data_t *td = arg;
	zbookmark_phys_t zb;

	if (BP_IS_HOLE(bp))
		return (0);

	if (claim_txg == 0 &&
	    get_birth_time(td, bp) >= spa_min_claim_txg(td->td_spa))
		return (-1);

	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);

	return (0);
}

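/*
 * zil_parse() callback for ZIL records: visit the block pointer embedded
 * in claimed TX_WRITE records through td_func.
 */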
static int
traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
    uint64_t claim_txg)
{
	traverse_data_t *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_phys_t zb;

		if (BP_IS_HOLE(bp))
			return (0);

		if (claim_txg == 0 || get_birth_time(td, bp) < claim_txg)
			return (0);

		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}

static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed; plus blocks that are already stable in read-only mode.
	 */
	if (claim_txg == 0 && spa_writeable(td->td_spa))
		return;

	zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
	zil_free(zilog);
}

typedef enum resume_skip {
	RESUME_SKIP_ALL,
	RESUME_SKIP_NONE,
	RESUME_SKIP_CHILDREN
} resume_skip_t;

/*
 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
 * the block indicated by zb does not need to be visited at all. Returns
 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
 * resume point. This indicates that this block should be visited but not its
 * children (since they must have been visited in a previous traversal).
 * Otherwise returns RESUME_SKIP_NONE.
 */
static resume_skip_t
resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	if (td->td_resume != NULL) {
		/*
		 * If we already visited this bp & everything below,
		 * don't bother doing it again.
		 */
		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
			return (RESUME_SKIP_ALL);

		if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
			if (td->td_flags & TRAVERSE_POST)
				return (RESUME_SKIP_CHILDREN);
		}
	}
	return (RESUME_SKIP_NONE);
}

/*
 * Returns B_TRUE if a prefetch read is issued, otherwise B_FALSE.
 */
static boolean_t
traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_PRESCIENT_PREFETCH;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;

	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
		return (B_FALSE);
	/*
	 * If this bp is before the resume point, it may have already been
	 * freed.
	 */
	if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
		return (B_FALSE);
	if (BP_IS_HOLE(bp) || get_birth_time(td, bp) <= td->td_min_txg)
		return (B_FALSE);
	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
		return (B_FALSE);
	ASSERT(!BP_IS_REDACTED(bp));

	if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
	return (B_TRUE);
}

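/*
 * Decide whether a block is worth prefetching: holes, embedded blocks,
 * intent-log blocks, and redacted blocks are skipped.
 */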
static boolean_t
prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
{
	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp))
		return (B_FALSE);
	return (B_TRUE);
}

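/*
 * Core recursive visitor: apply the resume and birth-txg filters, pace the
 * traversal against the prefetch thread, invoke td_func in pre- and/or
 * post-order as requested, and recurse into indirect, dnode, and objset
 * blocks.
 */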
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (BP_GET_LOGICAL_BIRTH(bp) == 0) {
		/*
		 * Since this block has a birth time of 0 it must be one of
		 * two things: a hole created before the
		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
		 * which has always been a hole in an object.
		 *
		 * If a file is written sparsely, then the unwritten parts of
		 * the file were "always holes" -- that is, they have been
		 * holes since this object was allocated. However, we (and
		 * our callers) cannot necessarily tell when an object was
		 * allocated. Therefore, if it's possible that this object
		 * was freed and then its object number reused, we need to
		 * visit all the holes with birth==0.
		 *
		 * If it isn't possible that the object number was reused,
		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
		 * all the blocks we will visit as part of this traversal,
		 * then this hole must have always existed, so we can skip
		 * it. We visit blocks born after (exclusive) td_min_txg.
		 *
		 * Note that the meta-dnode cannot be reallocated.
		 */
		if (!send_holes_without_birth_time &&
		    (!td->td_realloc_possible ||
		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
			return (0);
	} else if (get_birth_time(td, bp) <= td->td_min_txg) {
		return (0);
	}

	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		uint64_t size = BP_GET_LSIZE(bp);
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_bytes_fetched >= 0);
		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
			cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_bytes_fetched -= size;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			goto post;
	}

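	/* Indirect block: read it and recurse into its children. */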
	if (BP_GET_LEVEL(bp) > 0) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int32_t i, ptidx, pidx;
		uint32_t prefetchlimit;
		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		zbookmark_phys_t *czb;

		ASSERT(!BP_IS_PROTECTED(bp));

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;

		czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

		/*
		 * When performing a traversal it is beneficial to
		 * asynchronously read-ahead the upcoming indirect
		 * blocks since they will be needed shortly. However,
		 * since a 128k indirect (non-L0) block may contain up
		 * to 1024 128-byte block pointers, it's preferable to not
		 * prefetch them all at once. Issuing a large number of
		 * async reads may affect performance, and the earlier
		 * the indirect blocks are prefetched the less likely
		 * they are to still be resident in the ARC when needed.
		 * Therefore, prefetching indirect blocks is limited to
		 * zfs_traverse_indirect_prefetch_limit=32 blocks by
		 * default.
		 *
		 * pidx: Index of the next prefetch to be issued.
		 * ptidx: Index at which the next prefetch is triggered.
		 */
		ptidx = 0;
		pidx = 1;
		prefetchlimit = zfs_traverse_indirect_prefetch_limit;
		for (i = 0; i < epb; i++) {
			if (prefetchlimit && i == ptidx) {
				ASSERT3S(ptidx, <=, pidx);
				for (uint32_t prefetched = 0; pidx < epb &&
				    prefetched < prefetchlimit; pidx++) {
					SET_BOOKMARK(czb, zb->zb_objset,
					    zb->zb_object, zb->zb_level - 1,
					    zb->zb_blkid * epb + pidx);
					if (traverse_prefetch_metadata(td, dnp,
					    &((blkptr_t *)buf->b_data)[pidx],
					    czb) == B_TRUE) {
						prefetched++;
						if (prefetched ==
						    MAX(prefetchlimit / 2, 1))
							ptidx = pidx;
					}
				}
			}

			/* recursively visitbp() blocks below this */
			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp,
			    &((blkptr_t *)buf->b_data)[i], czb);
			if (err != 0)
				break;
		}

		kmem_free(czb, sizeof (zbookmark_phys_t));

	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
		int32_t i;
		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		dnode_phys_t *child_dnp;

		/*
		 * dnode blocks might have their bonus buffers encrypted, so
		 * we must be careful to honor TRAVERSE_NO_DECRYPT
		 */
		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		child_dnp = buf->b_data;

		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			prefetch_dnode_metadata(td, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			err = traverse_dnode(td, bp, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		osp = buf->b_data;
		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		/*
		 * See the block comment above for the goal of this variable.
		 * If the maxblkid of the meta-dnode is 0, then we know that
		 * we've never had more than DNODES_PER_BLOCK objects in the
		 * dataset, which means we can't have reused any object ids.
		 */
		if (osp->os_meta_dnode.dn_maxblkid == 0)
			td->td_realloc_possible = B_FALSE;

		if (OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				prefetch_dnode_metadata(td,
				    &osp->os_projectused_dnode,
				    zb->zb_objset, DMU_PROJECTUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
			    zb->zb_objset, DMU_USERUSED_OBJECT);
		}

		err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				err = traverse_dnode(td, bp,
				    &osp->os_projectused_dnode, zb->zb_objset,
				    DMU_PROJECTUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td, bp,
				    &osp->os_groupused_dnode, zb->zb_objset,
				    DMU_GROUPUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td, bp,
				    &osp->os_userused_dnode, zb->zb_objset,
				    DMU_USERUSED_OBJECT);
		}
	}

	if (buf)
		arc_buf_destroy(buf, &buf);

post:
	if (err == 0 && (td->td_flags & TRAVERSE_POST))
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);

	if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
		/*
		 * Ignore this disk error as requested by the HARD flag,
		 * and continue traversal.
		 */
		err = 0;
	}

	/*
	 * If we are stopping here, set td_resume.
	 */
	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
		td->td_resume->zb_objset = zb->zb_objset;
		td->td_resume->zb_object = zb->zb_object;
		td->td_resume->zb_level = 0;
		/*
		 * If we have stopped on an indirect block (e.g. due to
		 * i/o error), we have not visited anything below it.
		 * Set the bookmark to the first level-0 block that we need
		 * to visit. This way, the resuming code does not need to
		 * deal with resuming from indirect blocks.
		 *
		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
		 * to dereference it.
		 */
		td->td_resume->zb_blkid = zb->zb_blkid;
		if (zb->zb_level > 0) {
			td->td_resume->zb_blkid <<= zb->zb_level *
			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
		}
		td->td_paused = B_TRUE;
	}

	return (err);
}

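/*
 * Issue metadata prefetches for each of a dnode's block pointers, and for
 * its spill block if present.
 */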
static void
prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j;
	zbookmark_phys_t czb;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb);
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
	}
}

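/*
 * Visit a single dnode: invoke td_func at ZB_DNODE_LEVEL in pre- and/or
 * post-order as requested, and walk the dnode's block pointer trees and
 * spill block in between.
 */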
static int
traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j, err = 0;
	zbookmark_phys_t czb;

	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
	    object < td->td_resume->zb_object)
		return (0);

	if (td->td_flags & TRAVERSE_PRE) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
		if (err != 0)
			break;
	}

	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
	}

	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}
	return (err);
}

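/*
 * blkptr_cb_t used by the prefetch thread: throttle outstanding prefetch
 * I/O to zfs_pd_bytes_max and issue asynchronous ARC reads for the blocks
 * the main traversal will visit next.
 */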
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	(void) zilog, (void) dnp;
	prefetch_data_t *pfd = arg;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_PRESCIENT_PREFETCH;

	ASSERT(pfd->pd_bytes_fetched >= 0);
	if (zb->zb_level == ZB_DNODE_LEVEL)
		return (0);
	if (pfd->pd_cancel)
		return (SET_ERROR(EINTR));

	if (!prefetch_needed(pfd, bp))
		return (0);

	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
		cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    zio_flags, &aflags, zb);

	return (0);
}

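/*
 * Taskq thread that runs the same traversal ahead of the caller, with
 * traverse_prefetcher as the callback, and signals pd_exited on completion.
 */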
static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	zbookmark_phys_t czb;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;
	td.td_pfd = NULL;
	td.td_resume = &td_main->td_pfd->pd_resume;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);

	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
	spl_fstrans_unmark(cookie);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	traverse_data_t *td;
	prefetch_data_t *pd;
	zbookmark_phys_t *czb;
	int err;

	ASSERT(ds == NULL || objset == ds->ds_object);
	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));

	td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
	pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
	czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

	td->td_spa = spa;
	td->td_objset = objset;
	td->td_rootbp = rootbp;
	td->td_min_txg = txg_start;
	td->td_resume = resume;
	td->td_func = func;
	td->td_arg = arg;
	td->td_pfd = pd;
	td->td_flags = flags;
	td->td_paused = B_FALSE;
	td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);

	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
		VERIFY(spa_feature_enabled_txg(spa,
		    SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
	} else {
		td->td_hole_birth_enabled_txg = UINT64_MAX;
	}

	pd->pd_flags = flags;
	if (resume != NULL)
		pd->pd_resume = *resume;
	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);

	SET_BOOKMARK(czb, td->td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;
		ASSERT(!BP_IS_REDACTED(rootbp));

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
		    BP_IS_PROTECTED(rootbp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
		    &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
		if (err != 0) {
			/*
			 * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
			 * continue to visitbp so that td_func can be called
			 * in pre stage, and err will reset to zero.
			 */
			if (!(td->td_flags & TRAVERSE_HARD) ||
			    !(td->td_flags & TRAVERSE_PRE))
				goto out;
		} else {
			osp = buf->b_data;
			traverse_zil(td, &osp->os_zil_header);
			arc_buf_destroy(buf, &buf);
		}
	}

	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
	    taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread,
	    td, TQ_NOQUEUE) == TASKQID_INVALID)
		pd->pd_exited = B_TRUE;

	err = traverse_visitbp(td, NULL, rootbp, czb);

	mutex_enter(&pd->pd_mtx);
	pd->pd_cancel = B_TRUE;
	cv_broadcast(&pd->pd_cv);
	while (!pd->pd_exited)
		cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
	mutex_exit(&pd->pd_mtx);
out:
	mutex_destroy(&pd->pd_mtx);
	cv_destroy(&pd->pd_cv);

	kmem_free(czb, sizeof (zbookmark_phys_t));
	kmem_free(pd, sizeof (struct prefetch_data));
	kmem_free(td, sizeof (struct traverse_data));

	return (err);
}

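/*
 * Usage sketch (illustrative only, not part of this file): a consumer
 * supplies a blkptr_cb_t and walks a dataset with traverse_dataset().
 * The callback name "count_blocks_cb" and its counter argument are
 * hypothetical.
 *
 *	static int
 *	count_blocks_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 *	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 *	{
 *		(void) spa, (void) zilog, (void) dnp;
 *		uint64_t *count = arg;
 *
 *		if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp))
 *			return (0);
 *		(*count)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0;
 *	int err = traverse_dataset(ds, 0,
 *	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, count_blocks_cb, &count);
 */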
/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 */
int
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
    zbookmark_phys_t *resume,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}

int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}

int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
	    blkptr, txg_start, resume, flags, func, arg));
}

/*
 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
	    txg_start, NULL, flags, func, arg);
	if (err != 0)
		return (err);

	/* visit each dataset */
	for (uint64_t obj = 1; err == 0;
	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err != 0) {
			if (hard)
				continue;
			break;
		}

		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			dsl_pool_config_enter(dp, FTAG);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				if (hard)
					continue;
				break;
			}
			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err != 0)
				break;
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err);
}

EXPORT_SYMBOL(traverse_dataset);
EXPORT_SYMBOL(traverse_pool);

ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
	"Max number of bytes to prefetch");

ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, UINT, ZMOD_RW,
	"Traverse prefetch number of blocks pointed by indirect block");

ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW,
	"Ignore hole_birth txg for zfs send");