Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/bpobj.c
48383 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
/*
23
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25
* Copyright (c) 2017 Datto Inc.
26
*/
27
28
#include <sys/bpobj.h>
29
#include <sys/zfs_context.h>
30
#include <sys/zfs_refcount.h>
31
#include <sys/dsl_pool.h>
32
#include <sys/zfeature.h>
33
#include <sys/zap.h>
34
35
/*
36
* Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
37
*/
38
uint64_t
39
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
40
{
41
spa_t *spa = dmu_objset_spa(os);
42
dsl_pool_t *dp = dmu_objset_pool(os);
43
44
if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
45
if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
46
ASSERT0(dp->dp_empty_bpobj);
47
dp->dp_empty_bpobj =
48
bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
49
VERIFY(zap_add(os,
50
DMU_POOL_DIRECTORY_OBJECT,
51
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
52
&dp->dp_empty_bpobj, tx) == 0);
53
}
54
spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
55
ASSERT(dp->dp_empty_bpobj != 0);
56
return (dp->dp_empty_bpobj);
57
} else {
58
return (bpobj_alloc(os, blocksize, tx));
59
}
60
}
61
62
void
63
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
64
{
65
dsl_pool_t *dp = dmu_objset_pool(os);
66
67
spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
68
if (!spa_feature_is_active(dmu_objset_spa(os),
69
SPA_FEATURE_EMPTY_BPOBJ)) {
70
VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
71
DMU_POOL_DIRECTORY_OBJECT,
72
DMU_POOL_EMPTY_BPOBJ, tx));
73
VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
74
dp->dp_empty_bpobj = 0;
75
}
76
}
77
78
uint64_t
79
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
80
{
81
int size;
82
83
if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
84
size = BPOBJ_SIZE_V0;
85
else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
86
size = BPOBJ_SIZE_V1;
87
else if (!spa_feature_is_active(dmu_objset_spa(os),
88
SPA_FEATURE_LIVELIST))
89
size = BPOBJ_SIZE_V2;
90
else
91
size = sizeof (bpobj_phys_t);
92
93
return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
94
DMU_OT_BPOBJ_HDR, size, tx));
95
}
96
97
void
98
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
99
{
100
int64_t i;
101
bpobj_t bpo;
102
dmu_object_info_t doi;
103
int epb;
104
dmu_buf_t *dbuf = NULL;
105
106
ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
107
VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
108
109
mutex_enter(&bpo.bpo_lock);
110
111
if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
112
goto out;
113
114
VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
115
epb = doi.doi_data_block_size / sizeof (uint64_t);
116
117
for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
118
uint64_t *objarray;
119
uint64_t offset, blkoff;
120
121
offset = i * sizeof (uint64_t);
122
blkoff = P2PHASE(i, epb);
123
124
if (dbuf == NULL || dbuf->db_offset > offset) {
125
if (dbuf)
126
dmu_buf_rele(dbuf, FTAG);
127
VERIFY3U(0, ==, dmu_buf_hold(os,
128
bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
129
}
130
131
ASSERT3U(offset, >=, dbuf->db_offset);
132
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
133
134
objarray = dbuf->db_data;
135
bpobj_free(os, objarray[blkoff], tx);
136
}
137
if (dbuf) {
138
dmu_buf_rele(dbuf, FTAG);
139
dbuf = NULL;
140
}
141
VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
142
143
out:
144
mutex_exit(&bpo.bpo_lock);
145
bpobj_close(&bpo);
146
147
VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
148
}
149
150
int
151
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
152
{
153
dmu_object_info_t doi;
154
int err;
155
156
err = dmu_object_info(os, object, &doi);
157
if (err)
158
return (err);
159
160
memset(bpo, 0, sizeof (*bpo));
161
mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
162
163
ASSERT0P(bpo->bpo_dbuf);
164
ASSERT0P(bpo->bpo_phys);
165
ASSERT(object != 0);
166
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
167
ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
168
169
err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
170
if (err)
171
return (err);
172
173
bpo->bpo_os = os;
174
bpo->bpo_object = object;
175
bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
176
bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
177
bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
178
bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
179
bpo->bpo_phys = bpo->bpo_dbuf->db_data;
180
return (0);
181
}
182
183
boolean_t
184
bpobj_is_open(const bpobj_t *bpo)
185
{
186
return (bpo->bpo_object != 0);
187
}
188
189
void
190
bpobj_close(bpobj_t *bpo)
191
{
192
/* Lame workaround for closing a bpobj that was never opened. */
193
if (bpo->bpo_object == 0)
194
return;
195
196
dmu_buf_rele(bpo->bpo_dbuf, bpo);
197
if (bpo->bpo_cached_dbuf != NULL)
198
dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
199
bpo->bpo_dbuf = NULL;
200
bpo->bpo_phys = NULL;
201
bpo->bpo_cached_dbuf = NULL;
202
bpo->bpo_object = 0;
203
204
mutex_destroy(&bpo->bpo_lock);
205
}
206
207
static boolean_t
208
bpobj_is_empty_impl(bpobj_t *bpo)
209
{
210
ASSERT(MUTEX_HELD(&bpo->bpo_lock));
211
return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
212
(!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
213
}
214
215
boolean_t
216
bpobj_is_empty(bpobj_t *bpo)
217
{
218
mutex_enter(&bpo->bpo_lock);
219
boolean_t is_empty = bpobj_is_empty_impl(bpo);
220
mutex_exit(&bpo->bpo_lock);
221
return (is_empty);
222
}
223
224
/*
225
* A recursive iteration of the bpobjs would be nice here but we run the risk
226
* of overflowing function stack space. Instead, find each subobj and add it
227
* to the head of our list so it can be scanned for subobjs. Like a
228
* recursive implementation, the "deepest" subobjs will be freed first.
229
* When a subobj is found to have no additional subobjs, free it.
230
*/
231
typedef struct bpobj_info {
232
bpobj_t *bpi_bpo;
233
/*
234
* This object is a subobj of bpi_parent,
235
* at bpi_index in its subobj array.
236
*/
237
struct bpobj_info *bpi_parent;
238
uint64_t bpi_index;
239
/* How many of our subobj's are left to process. */
240
uint64_t bpi_unprocessed_subobjs;
241
/* True after having visited this bpo's directly referenced BPs. */
242
boolean_t bpi_visited;
243
list_node_t bpi_node;
244
} bpobj_info_t;
245
246
static bpobj_info_t *
247
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
248
{
249
bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
250
bpi->bpi_bpo = bpo;
251
bpi->bpi_parent = parent;
252
bpi->bpi_index = index;
253
if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
254
bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
255
}
256
return (bpi);
257
}
258
259
/*
260
* Update bpobj and all of its parents with new space accounting.
261
*/
262
static void
263
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
264
int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
265
{
266
267
for (; bpi != NULL; bpi = bpi->bpi_parent) {
268
bpobj_t *p = bpi->bpi_bpo;
269
ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
270
p->bpo_phys->bpo_bytes -= freed;
271
ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
272
if (p->bpo_havecomp) {
273
p->bpo_phys->bpo_comp -= comp_freed;
274
p->bpo_phys->bpo_uncomp -= uncomp_freed;
275
}
276
}
277
}
278
279
static int
280
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
281
int64_t start, dmu_tx_t *tx, boolean_t free)
282
{
283
int err = 0;
284
int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
285
dmu_buf_t *dbuf = NULL;
286
bpobj_t *bpo = bpi->bpi_bpo;
287
288
int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
289
uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
290
sizeof (blkptr_t);
291
uint64_t ps = start * sizeof (blkptr_t);
292
uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
293
ps);
294
if (pe > pb) {
295
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
296
ZIO_PRIORITY_ASYNC_READ);
297
}
298
for (; i >= start; i--) {
299
uint64_t offset = i * sizeof (blkptr_t);
300
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
301
302
if (dbuf == NULL || dbuf->db_offset > offset) {
303
if (dbuf)
304
dmu_buf_rele(dbuf, FTAG);
305
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
306
offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
307
if (err)
308
break;
309
pe = pb;
310
pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
311
dbuf->db_offset - dmu_prefetch_max : 0, ps);
312
if (pe > pb) {
313
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
314
pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
315
}
316
}
317
318
ASSERT3U(offset, >=, dbuf->db_offset);
319
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
320
321
blkptr_t *bparray = dbuf->db_data;
322
blkptr_t *bp = &bparray[blkoff];
323
324
boolean_t bp_freed = BP_GET_FREE(bp);
325
err = func(arg, bp, bp_freed, tx);
326
if (err)
327
break;
328
329
if (free) {
330
int sign = bp_freed ? -1 : +1;
331
spa_t *spa = dmu_objset_spa(bpo->bpo_os);
332
freed += sign * bp_get_dsize_sync(spa, bp);
333
comp_freed += sign * BP_GET_PSIZE(bp);
334
uncomp_freed += sign * BP_GET_UCSIZE(bp);
335
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
336
bpo->bpo_phys->bpo_num_blkptrs--;
337
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
338
if (bp_freed) {
339
ASSERT(bpo->bpo_havefreed);
340
bpo->bpo_phys->bpo_num_freed--;
341
ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
342
}
343
}
344
}
345
if (free) {
346
propagate_space_reduction(bpi, freed, comp_freed,
347
uncomp_freed, tx);
348
VERIFY0(dmu_free_range(bpo->bpo_os,
349
bpo->bpo_object,
350
bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
351
DMU_OBJECT_END, tx));
352
}
353
if (dbuf) {
354
dmu_buf_rele(dbuf, FTAG);
355
dbuf = NULL;
356
}
357
return (err);
358
}
359
360
/*
361
* Given an initial bpo, start by freeing the BPs that are directly referenced
362
* by that bpo. If the bpo has subobjs, read in its last subobj and push the
363
* subobj to our stack. By popping items off our stack, eventually we will
364
* encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
365
* requested also free the now-empty bpo from disk and decrement
366
* its parent's subobj count. We continue popping each subobj from our stack,
367
* visiting its last subobj until they too have no more subobjs, and so on.
368
*/
369
static int
370
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
371
dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
372
{
373
list_t stack;
374
bpobj_info_t *bpi;
375
int err = 0;
376
377
/*
378
* Create a "stack" for us to work with without worrying about
379
* stack overflows. Initialize it with the initial_bpo.
380
*/
381
list_create(&stack, sizeof (bpobj_info_t),
382
offsetof(bpobj_info_t, bpi_node));
383
mutex_enter(&initial_bpo->bpo_lock);
384
385
if (bpobj_size != NULL)
386
*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
387
388
list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
389
390
while ((bpi = list_head(&stack)) != NULL) {
391
bpobj_t *bpo = bpi->bpi_bpo;
392
393
ASSERT3P(bpo, !=, NULL);
394
ASSERT(MUTEX_HELD(&bpo->bpo_lock));
395
ASSERT(bpobj_is_open(bpo));
396
397
if (free)
398
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
399
400
if (bpi->bpi_visited == B_FALSE) {
401
err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
402
free);
403
bpi->bpi_visited = B_TRUE;
404
if (err != 0)
405
break;
406
}
407
/*
408
* We've finished with this bpo's directly-referenced BP's and
409
* it has no more unprocessed subobjs. We can free its
410
* bpobj_info_t (unless it is the topmost, initial_bpo).
411
* If we are freeing from disk, we can also do that.
412
*/
413
if (bpi->bpi_unprocessed_subobjs == 0) {
414
/*
415
* If there are no entries, there should
416
* be no bytes.
417
*/
418
if (bpobj_is_empty_impl(bpo)) {
419
ASSERT0(bpo->bpo_phys->bpo_bytes);
420
ASSERT0(bpo->bpo_phys->bpo_comp);
421
ASSERT0(bpo->bpo_phys->bpo_uncomp);
422
}
423
424
/* The initial_bpo has no parent and is not closed. */
425
if (bpi->bpi_parent != NULL) {
426
if (free) {
427
bpobj_t *p = bpi->bpi_parent->bpi_bpo;
428
429
ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
430
ASSERT3U(p->bpo_phys->bpo_num_subobjs,
431
>, 0);
432
ASSERT3U(bpi->bpi_index, ==,
433
p->bpo_phys->bpo_num_subobjs - 1);
434
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
435
tx));
436
437
p->bpo_phys->bpo_num_subobjs--;
438
439
VERIFY0(dmu_free_range(p->bpo_os,
440
p->bpo_phys->bpo_subobjs,
441
bpi->bpi_index * sizeof (uint64_t),
442
sizeof (uint64_t), tx));
443
444
/* eliminate the empty subobj list */
445
if (bpo->bpo_havesubobj &&
446
bpo->bpo_phys->bpo_subobjs != 0) {
447
ASSERT0(bpo->bpo_phys->
448
bpo_num_subobjs);
449
err = dmu_object_free(
450
bpo->bpo_os,
451
bpo->bpo_phys->bpo_subobjs,
452
tx);
453
if (err)
454
break;
455
bpo->bpo_phys->bpo_subobjs = 0;
456
}
457
err = dmu_object_free(p->bpo_os,
458
bpo->bpo_object, tx);
459
if (err)
460
break;
461
}
462
463
mutex_exit(&bpo->bpo_lock);
464
bpobj_close(bpo);
465
kmem_free(bpo, sizeof (bpobj_t));
466
} else {
467
mutex_exit(&bpo->bpo_lock);
468
}
469
470
/*
471
* Finished processing this bpo. Unlock, and free
472
* our "stack" info.
473
*/
474
list_remove_head(&stack);
475
kmem_free(bpi, sizeof (bpobj_info_t));
476
} else {
477
/*
478
* We have unprocessed subobjs. Process the next one.
479
*/
480
ASSERT(bpo->bpo_havecomp);
481
ASSERT0P(bpobj_size);
482
483
/* Add the last subobj to stack. */
484
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
485
uint64_t offset = i * sizeof (uint64_t);
486
487
uint64_t subobj;
488
err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
489
offset, sizeof (uint64_t), &subobj,
490
DMU_READ_NO_PREFETCH);
491
if (err)
492
break;
493
494
bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
495
KM_SLEEP);
496
err = bpobj_open(subbpo, bpo->bpo_os, subobj);
497
if (err) {
498
kmem_free(subbpo, sizeof (bpobj_t));
499
break;
500
}
501
502
if (subbpo->bpo_havesubobj &&
503
subbpo->bpo_phys->bpo_subobjs != 0) {
504
dmu_prefetch(subbpo->bpo_os,
505
subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
506
ZIO_PRIORITY_ASYNC_READ);
507
}
508
509
list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
510
mutex_enter(&subbpo->bpo_lock);
511
bpi->bpi_unprocessed_subobjs--;
512
}
513
}
514
/*
515
* Cleanup anything left on the "stack" after we left the loop.
516
* Every bpo on the stack is locked so we must remember to undo
517
* that now (in LIFO order).
518
*/
519
while ((bpi = list_remove_head(&stack)) != NULL) {
520
bpobj_t *bpo = bpi->bpi_bpo;
521
ASSERT(err != 0);
522
ASSERT3P(bpo, !=, NULL);
523
524
mutex_exit(&bpo->bpo_lock);
525
526
/* do not free the initial_bpo */
527
if (bpi->bpi_parent != NULL) {
528
bpobj_close(bpi->bpi_bpo);
529
kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
530
}
531
kmem_free(bpi, sizeof (bpobj_info_t));
532
}
533
534
list_destroy(&stack);
535
536
return (err);
537
}
538
539
/*
540
* Iterate and remove the entries. If func returns nonzero, iteration
541
* will stop and that entry will not be removed.
542
*/
543
int
544
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
545
{
546
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
547
}
548
549
/*
550
* Iterate the entries. If func returns nonzero, iteration will stop.
551
*
552
* If there are no subobjs:
553
*
554
* *bpobj_size can be used to return the number of block pointers in the
555
* bpobj. Note that this may be different from the number of block pointers
556
* that are iterated over, if iteration is terminated early (e.g. by the func
557
* returning nonzero).
558
*
559
* If there are concurrent (or subsequent) modifications to the bpobj then the
560
* returned *bpobj_size can be passed as "start" to
561
* livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
562
*/
563
int
564
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
565
uint64_t *bpobj_size)
566
{
567
return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
568
}
569
570
/*
571
* Iterate over the blkptrs in the bpobj beginning at index start. If func
572
* returns nonzero, iteration will stop. This is a livelist specific function
573
* since it assumes that there are no subobjs present.
574
*/
575
int
576
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
577
int64_t start)
578
{
579
if (bpo->bpo_havesubobj)
580
VERIFY0(bpo->bpo_phys->bpo_subobjs);
581
bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
582
int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
583
kmem_free(bpi, sizeof (bpobj_info_t));
584
return (err);
585
}
586
587
/*
588
* Logically add subobj's contents to the parent bpobj.
589
*
590
* In the most general case, this is accomplished in constant time by adding
591
* a reference to subobj. This case is used when enqueuing a large subobj:
592
* +--------------+ +--------------+
593
* | bpobj |----------------------->| subobj list |
594
* +----+----+----+----+----+ +-----+-----+--+--+
595
* | bp | bp | bp | bp | bp | | obj | obj | obj |
596
* +----+----+----+----+----+ +-----+-----+-----+
597
*
598
* +--------------+ +--------------+
599
* | sub-bpobj |----------------------> | subsubobj |
600
* +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
601
* | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
602
* +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
603
*
604
* Result: sub-bpobj added to parent's subobj list.
605
* +--------------+ +--------------+
606
* | bpobj |----------------------->| subobj list |
607
* +----+----+----+----+----+ +-----+-----+--+--+-----+
608
* | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ |
609
* +----+----+----+----+----+ +-----+-----+-----+--|--+
610
* |
611
* /-----------------------------------------------------/
612
* v
613
* +--------------+ +--------------+
614
* | sub-bpobj |----------------------> | subsubobj |
615
* +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
616
* | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
617
* +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
618
*
619
*
620
* In a common case, the subobj is small: its bp's and its list of subobj's
621
* are each stored in a single block. In this case we copy the subobj's
622
* contents to the parent:
623
* +--------------+ +--------------+
624
* | bpobj |----------------------->| subobj list |
625
* +----+----+----+----+----+ +-----+-----+--+--+
626
* | bp | bp | bp | bp | bp | | obj | obj | obj |
627
* +----+----+----+----+----+ +-----+-----+-----+
628
* ^ ^
629
* +--------------+ | +--------------+ |
630
* | sub-bpobj |---------^------------> | subsubobj | ^
631
* +----+----+----+ | +-----+-----+--+ |
632
* | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/
633
* +----+----+ +-----+-----+
634
*
635
* Result: subobj destroyed, contents copied to parent:
636
* +--------------+ +--------------+
637
* | bpobj |----------------------->| subobj list |
638
* +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+
639
* | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ |
640
* +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+
641
*
642
*
643
* If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
644
* but retain the sub-bpobj:
645
* +--------------+ +--------------+
646
* | bpobj |----------------------->| subobj list |
647
* +----+----+----+----+----+ +-----+-----+--+--+
648
* | bp | bp | bp | bp | bp | | obj | obj | obj |
649
* +----+----+----+----+----+ +-----+-----+-----+
650
* ^
651
* +--------------+ +--------------+ |
652
* | sub-bpobj |----------------------> | subsubobj | ^
653
* +----+----+----+----+---------+----+ +-----+-----+--+ |
654
* | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/
655
* +----+----+----+----+---------+----+ +-----+-----+
656
*
657
* Result: sub-sub-bpobjs and subobj added to parent's subobj list.
658
* +--------------+ +--------------+
659
* | bpobj |-------------------->| subobj list |
660
* +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+
661
* | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* |
662
* +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+
663
* |
664
* /--------------------------------------------------------------/
665
* v
666
* +--------------+
667
* | sub-bpobj |
668
* +----+----+----+----+---------+----+
669
* | bp | bp | bp | bp | ... | bp |
670
* +----+----+----+----+---------+----+
671
*/
672
void
673
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
674
{
675
bpobj_t subbpo;
676
uint64_t used, comp, uncomp, subsubobjs;
677
boolean_t copy_subsub = B_TRUE;
678
boolean_t copy_bps = B_TRUE;
679
680
ASSERT(bpobj_is_open(bpo));
681
ASSERT(subobj != 0);
682
ASSERT(bpo->bpo_havesubobj);
683
ASSERT(bpo->bpo_havecomp);
684
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
685
686
if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
687
bpobj_decr_empty(bpo->bpo_os, tx);
688
return;
689
}
690
691
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
692
if (bpobj_is_empty(&subbpo)) {
693
/* No point in having an empty subobj. */
694
bpobj_close(&subbpo);
695
bpobj_free(bpo->bpo_os, subobj, tx);
696
return;
697
}
698
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
699
700
mutex_enter(&bpo->bpo_lock);
701
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
702
703
dmu_object_info_t doi;
704
705
if (bpo->bpo_phys->bpo_subobjs != 0) {
706
ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
707
&doi));
708
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
709
}
710
711
/*
712
* If subobj has only one block of subobjs, then move subobj's
713
* subobjs to bpo's subobj list directly. This reduces recursion in
714
* bpobj_iterate due to nested subobjs.
715
*/
716
subsubobjs = subbpo.bpo_phys->bpo_subobjs;
717
if (subsubobjs != 0) {
718
VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
719
if (doi.doi_max_offset > doi.doi_data_block_size) {
720
copy_subsub = B_FALSE;
721
}
722
}
723
724
/*
725
* If, in addition to having only one block of subobj's, subobj has
726
* only one block of bp's, then move subobj's bp's to bpo's bp list
727
* directly. This reduces recursion in bpobj_iterate due to nested
728
* subobjs.
729
*/
730
VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
731
if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
732
copy_bps = B_FALSE;
733
}
734
735
if (copy_subsub && subsubobjs != 0) {
736
dmu_buf_t *subdb;
737
uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
738
739
VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
740
0, FTAG, &subdb, 0));
741
/*
742
* Make sure that we are not asking dmu_write()
743
* to write more data than we have in our buffer.
744
*/
745
VERIFY3U(subdb->db_size, >=,
746
numsubsub * sizeof (subobj));
747
if (bpo->bpo_phys->bpo_subobjs == 0) {
748
bpo->bpo_phys->bpo_subobjs =
749
dmu_object_alloc(bpo->bpo_os,
750
DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
751
DMU_OT_NONE, 0, tx);
752
}
753
dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
754
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
755
numsubsub * sizeof (subobj), subdb->db_data, tx);
756
dmu_buf_rele(subdb, FTAG);
757
bpo->bpo_phys->bpo_num_subobjs += numsubsub;
758
759
dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
760
subbpo.bpo_phys->bpo_subobjs = 0;
761
VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
762
}
763
764
if (copy_bps) {
765
dmu_buf_t *bps;
766
uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;
767
768
ASSERT(copy_subsub);
769
VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
770
0, FTAG, &bps, 0));
771
772
/*
773
* Make sure that we are not asking dmu_write()
774
* to write more data than we have in our buffer.
775
*/
776
VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
777
dmu_write(bpo->bpo_os, bpo->bpo_object,
778
bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
779
numbps * sizeof (blkptr_t),
780
bps->db_data, tx);
781
dmu_buf_rele(bps, FTAG);
782
bpo->bpo_phys->bpo_num_blkptrs += numbps;
783
784
bpobj_close(&subbpo);
785
VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
786
} else {
787
bpobj_close(&subbpo);
788
if (bpo->bpo_phys->bpo_subobjs == 0) {
789
bpo->bpo_phys->bpo_subobjs =
790
dmu_object_alloc(bpo->bpo_os,
791
DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
792
DMU_OT_NONE, 0, tx);
793
}
794
795
dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
796
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
797
sizeof (subobj), &subobj, tx);
798
bpo->bpo_phys->bpo_num_subobjs++;
799
}
800
801
bpo->bpo_phys->bpo_bytes += used;
802
bpo->bpo_phys->bpo_comp += comp;
803
bpo->bpo_phys->bpo_uncomp += uncomp;
804
mutex_exit(&bpo->bpo_lock);
805
806
}
807
808
/*
809
* Prefetch metadata required for bpobj_enqueue_subobj().
810
*/
811
void
812
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
813
{
814
dmu_object_info_t doi;
815
bpobj_t subbpo;
816
uint64_t subsubobjs;
817
boolean_t copy_subsub = B_TRUE;
818
boolean_t copy_bps = B_TRUE;
819
820
ASSERT(bpobj_is_open(bpo));
821
ASSERT(subobj != 0);
822
823
if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
824
return;
825
826
if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
827
return;
828
if (bpobj_is_empty(&subbpo)) {
829
bpobj_close(&subbpo);
830
return;
831
}
832
subsubobjs = subbpo.bpo_phys->bpo_subobjs;
833
bpobj_close(&subbpo);
834
835
if (subsubobjs != 0) {
836
if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
837
return;
838
if (doi.doi_max_offset > doi.doi_data_block_size)
839
copy_subsub = B_FALSE;
840
}
841
842
if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
843
return;
844
if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
845
copy_bps = B_FALSE;
846
847
if (copy_subsub && subsubobjs != 0) {
848
if (bpo->bpo_phys->bpo_subobjs) {
849
dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
850
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
851
ZIO_PRIORITY_ASYNC_READ);
852
}
853
dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
854
ZIO_PRIORITY_ASYNC_READ);
855
}
856
857
if (copy_bps) {
858
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
859
bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
860
ZIO_PRIORITY_ASYNC_READ);
861
dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
862
ZIO_PRIORITY_ASYNC_READ);
863
} else if (bpo->bpo_phys->bpo_subobjs) {
864
dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
865
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
866
ZIO_PRIORITY_ASYNC_READ);
867
}
868
}
869
870
void
871
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
872
dmu_tx_t *tx)
873
{
874
blkptr_t stored_bp = *bp;
875
uint64_t offset;
876
int blkoff;
877
blkptr_t *bparray;
878
879
ASSERT(bpobj_is_open(bpo));
880
ASSERT(!BP_IS_HOLE(bp));
881
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
882
883
if (BP_IS_EMBEDDED(bp)) {
884
/*
885
* The bpobj will compress better without the payload.
886
*
887
* Note that we store EMBEDDED bp's because they have an
888
* uncompressed size, which must be accounted for. An
889
* alternative would be to add their size to bpo_uncomp
890
* without storing the bp, but that would create additional
891
* complications: bpo_uncomp would be inconsistent with the
892
* set of BP's stored, and bpobj_iterate() wouldn't visit
893
* all the space accounted for in the bpobj.
894
*/
895
memset(&stored_bp, 0, sizeof (stored_bp));
896
stored_bp.blk_prop = bp->blk_prop;
897
BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
898
} else if (!BP_GET_DEDUP(bp)) {
899
/* The bpobj will compress better without the checksum */
900
memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
901
}
902
903
stored_bp.blk_fill = 0;
904
BP_SET_FREE(&stored_bp, bp_freed);
905
906
mutex_enter(&bpo->bpo_lock);
907
908
offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
909
blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
910
911
if (bpo->bpo_cached_dbuf == NULL ||
912
offset < bpo->bpo_cached_dbuf->db_offset ||
913
offset >= bpo->bpo_cached_dbuf->db_offset +
914
bpo->bpo_cached_dbuf->db_size) {
915
if (bpo->bpo_cached_dbuf)
916
dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
917
VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
918
offset, bpo, &bpo->bpo_cached_dbuf, 0));
919
ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
920
}
921
922
dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
923
bparray = bpo->bpo_cached_dbuf->db_data;
924
bparray[blkoff] = stored_bp;
925
926
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
927
bpo->bpo_phys->bpo_num_blkptrs++;
928
int sign = bp_freed ? -1 : +1;
929
bpo->bpo_phys->bpo_bytes += sign *
930
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
931
if (bpo->bpo_havecomp) {
932
bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
933
bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
934
}
935
if (bp_freed) {
936
ASSERT(bpo->bpo_havefreed);
937
bpo->bpo_phys->bpo_num_freed++;
938
}
939
mutex_exit(&bpo->bpo_lock);
940
}
941
942
struct space_range_arg {
943
spa_t *spa;
944
uint64_t mintxg;
945
uint64_t maxtxg;
946
uint64_t used;
947
uint64_t comp;
948
uint64_t uncomp;
949
};
950
951
static int
952
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
953
{
954
(void) bp_freed, (void) tx;
955
struct space_range_arg *sra = arg;
956
957
if (BP_GET_BIRTH(bp) > sra->mintxg &&
958
BP_GET_BIRTH(bp) <= sra->maxtxg) {
959
if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
960
sra->used += bp_get_dsize_sync(sra->spa, bp);
961
else
962
sra->used += bp_get_dsize(sra->spa, bp);
963
sra->comp += BP_GET_PSIZE(bp);
964
sra->uncomp += BP_GET_UCSIZE(bp);
965
}
966
return (0);
967
}
968
969
int
970
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
971
{
972
ASSERT(bpobj_is_open(bpo));
973
mutex_enter(&bpo->bpo_lock);
974
975
*usedp = bpo->bpo_phys->bpo_bytes;
976
if (bpo->bpo_havecomp) {
977
*compp = bpo->bpo_phys->bpo_comp;
978
*uncompp = bpo->bpo_phys->bpo_uncomp;
979
mutex_exit(&bpo->bpo_lock);
980
return (0);
981
} else {
982
mutex_exit(&bpo->bpo_lock);
983
return (bpobj_space_range(bpo, 0, UINT64_MAX,
984
usedp, compp, uncompp));
985
}
986
}
987
988
/*
989
* Return the amount of space in the bpobj which is:
990
* mintxg < logical birth <= maxtxg
991
*/
992
int
993
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
994
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
995
{
996
struct space_range_arg sra = { 0 };
997
int err;
998
999
ASSERT(bpobj_is_open(bpo));
1000
1001
/*
1002
* As an optimization, if they want the whole txg range, just
1003
* get bpo_bytes rather than iterating over the bps.
1004
*/
1005
if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
1006
return (bpobj_space(bpo, usedp, compp, uncompp));
1007
1008
sra.spa = dmu_objset_spa(bpo->bpo_os);
1009
sra.mintxg = mintxg;
1010
sra.maxtxg = maxtxg;
1011
1012
err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
1013
*usedp = sra.used;
1014
*compp = sra.comp;
1015
*uncompp = sra.uncomp;
1016
return (err);
1017
}
1018
1019
/*
1020
* A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
1021
* bpobj are designated as free or allocated that information is not preserved
1022
* in bplists.
1023
*/
1024
int
1025
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
1026
dmu_tx_t *tx)
1027
{
1028
(void) bp_freed, (void) tx;
1029
bplist_t *bpl = arg;
1030
bplist_append(bpl, bp);
1031
return (0);
1032
}
1033
1034