GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/bpobj.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}

void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_EMPTY_BPOBJ)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}

uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_LIVELIST))
		size = BPOBJ_SIZE_V2;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}

void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	memset(bpo, 0, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT0P(bpo->bpo_dbuf);
	ASSERT0P(bpo->bpo_phys);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}

boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
	return (bpo->bpo_object != 0);
}

void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}
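
/*
 * Illustrative open/close lifecycle (a minimal sketch, not code taken from
 * this file): a reader that only needs the on-disk space accounting of a
 * bpobj might do the following, where "obj" is a bpobj object number
 * obtained elsewhere (e.g. from a ZAP entry):
 *
 *	bpobj_t bpo;
 *	uint64_t used, comp, uncomp;
 *
 *	if (bpobj_open(&bpo, os, obj) == 0) {
 *		VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
 *		bpobj_close(&bpo);
 *	}
 *
 * bpobj_open() fully initializes the bpobj_t, so the caller needs no other
 * setup; bpobj_close() drops the bonus-buffer hold and destroys the lock.
 */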

static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
	ASSERT(MUTEX_HELD(&bpo->bpo_lock));
	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
	mutex_enter(&bpo->bpo_lock);
	boolean_t is_empty = bpobj_is_empty_impl(bpo);
	mutex_exit(&bpo->bpo_lock);
	return (is_empty);
}

/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space. Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subobjs. Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
	bpobj_t *bpi_bpo;
	/*
	 * This object is a subobj of bpi_parent,
	 * at bpi_index in its subobj array.
	 */
	struct bpobj_info *bpi_parent;
	uint64_t bpi_index;
	/* How many of our subobj's are left to process. */
	uint64_t bpi_unprocessed_subobjs;
	/* True after having visited this bpo's directly referenced BPs. */
	boolean_t bpi_visited;
	list_node_t bpi_node;
} bpobj_info_t;

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
	bpi->bpi_bpo = bpo;
	bpi->bpi_parent = parent;
	bpi->bpi_index = index;
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
	}
	return (bpi);
}

/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{

	for (; bpi != NULL; bpi = bpi->bpi_parent) {
		bpobj_t *p = bpi->bpi_bpo;
		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
		p->bpo_phys->bpo_bytes -= freed;
		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
		if (p->bpo_havecomp) {
			p->bpo_phys->bpo_comp -= comp_freed;
			p->bpo_phys->bpo_uncomp -= uncomp_freed;
		}
	}
}

static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
	int err = 0;
	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
	dmu_buf_t *dbuf = NULL;
	bpobj_t *bpo = bpi->bpi_bpo;

	int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
	uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
	    sizeof (blkptr_t);
	uint64_t ps = start * sizeof (blkptr_t);
	uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
	    ps);
	if (pe > pb) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
		    ZIO_PRIORITY_ASYNC_READ);
	}
	for (; i >= start; i--) {
		uint64_t offset = i * sizeof (blkptr_t);
		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
			    offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
			if (err)
				break;
			pe = pb;
			pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
			    dbuf->db_offset - dmu_prefetch_max : 0, ps);
			if (pe > pb) {
				dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
				    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
			}
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		blkptr_t *bparray = dbuf->db_data;
		blkptr_t *bp = &bparray[blkoff];

		boolean_t bp_freed = BP_GET_FREE(bp);
		err = func(arg, bp, bp_freed, tx);
		if (err)
			break;

		if (free) {
			int sign = bp_freed ? -1 : +1;
			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
			freed += sign * bp_get_dsize_sync(spa, bp);
			comp_freed += sign * BP_GET_PSIZE(bp);
			uncomp_freed += sign * BP_GET_UCSIZE(bp);
			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
			if (bp_freed) {
				ASSERT(bpo->bpo_havefreed);
				bpo->bpo_phys->bpo_num_freed--;
				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
			}
		}
	}
	if (free) {
		propagate_space_reduction(bpi, freed, comp_freed,
		    uncomp_freed, tx);
		VERIFY0(dmu_free_range(bpo->bpo_os,
		    bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    DMU_OBJECT_END, tx));
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	return (err);
}

/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
	list_t stack;
	bpobj_info_t *bpi;
	int err = 0;

	/*
	 * Create a "stack" for us to work with without worrying about
	 * stack overflows. Initialize it with the initial_bpo.
	 */
	list_create(&stack, sizeof (bpobj_info_t),
	    offsetof(bpobj_info_t, bpi_node));
	mutex_enter(&initial_bpo->bpo_lock);

	if (bpobj_size != NULL)
		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

	while ((bpi = list_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;

		ASSERT3P(bpo, !=, NULL);
		ASSERT(MUTEX_HELD(&bpo->bpo_lock));
		ASSERT(bpobj_is_open(bpo));

		if (free)
			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

		if (bpi->bpi_visited == B_FALSE) {
			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
			    free);
			bpi->bpi_visited = B_TRUE;
			if (err != 0)
				break;
		}
		/*
		 * We've finished with this bpo's directly-referenced BP's and
		 * it has no more unprocessed subobjs. We can free its
		 * bpobj_info_t (unless it is the topmost, initial_bpo).
		 * If we are freeing from disk, we can also do that.
		 */
		if (bpi->bpi_unprocessed_subobjs == 0) {
			/*
			 * If there are no entries, there should
			 * be no bytes.
			 */
			if (bpobj_is_empty_impl(bpo)) {
				ASSERT0(bpo->bpo_phys->bpo_bytes);
				ASSERT0(bpo->bpo_phys->bpo_comp);
				ASSERT0(bpo->bpo_phys->bpo_uncomp);
			}

			/* The initial_bpo has no parent and is not closed. */
			if (bpi->bpi_parent != NULL) {
				if (free) {
					bpobj_t *p = bpi->bpi_parent->bpi_bpo;

					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
					    >, 0);
					ASSERT3U(bpi->bpi_index, ==,
					    p->bpo_phys->bpo_num_subobjs - 1);
					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
					    tx));

					p->bpo_phys->bpo_num_subobjs--;

					VERIFY0(dmu_free_range(p->bpo_os,
					    p->bpo_phys->bpo_subobjs,
					    bpi->bpi_index * sizeof (uint64_t),
					    sizeof (uint64_t), tx));

					/* eliminate the empty subobj list */
					if (bpo->bpo_havesubobj &&
					    bpo->bpo_phys->bpo_subobjs != 0) {
						ASSERT0(bpo->bpo_phys->
						    bpo_num_subobjs);
						err = dmu_object_free(
						    bpo->bpo_os,
						    bpo->bpo_phys->bpo_subobjs,
						    tx);
						if (err)
							break;
						bpo->bpo_phys->bpo_subobjs = 0;
					}
					err = dmu_object_free(p->bpo_os,
					    bpo->bpo_object, tx);
					if (err)
						break;
				}

				mutex_exit(&bpo->bpo_lock);
				bpobj_close(bpo);
				kmem_free(bpo, sizeof (bpobj_t));
			} else {
				mutex_exit(&bpo->bpo_lock);
			}

			/*
			 * Finished processing this bpo. Unlock, and free
			 * our "stack" info.
			 */
			list_remove_head(&stack);
			kmem_free(bpi, sizeof (bpobj_info_t));
		} else {
			/*
			 * We have unprocessed subobjs. Process the next one.
			 */
			ASSERT(bpo->bpo_havecomp);
			ASSERT0P(bpobj_size);

			/* Add the last subobj to stack. */
			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
			uint64_t offset = i * sizeof (uint64_t);

			uint64_t subobj;
			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
			    offset, sizeof (uint64_t), &subobj,
			    DMU_READ_NO_PREFETCH);
			if (err)
				break;

			bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
			    KM_SLEEP);
			err = bpobj_open(subbpo, bpo->bpo_os, subobj);
			if (err) {
				kmem_free(subbpo, sizeof (bpobj_t));
				break;
			}

			if (subbpo->bpo_havesubobj &&
			    subbpo->bpo_phys->bpo_subobjs != 0) {
				dmu_prefetch(subbpo->bpo_os,
				    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
				    ZIO_PRIORITY_ASYNC_READ);
			}

			list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
			mutex_enter(&subbpo->bpo_lock);
			bpi->bpi_unprocessed_subobjs--;
		}
	}
	/*
	 * Cleanup anything left on the "stack" after we left the loop.
	 * Every bpo on the stack is locked so we must remember to undo
	 * that now (in LIFO order).
	 */
	while ((bpi = list_remove_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;
		ASSERT(err != 0);
		ASSERT3P(bpo, !=, NULL);

		mutex_exit(&bpo->bpo_lock);

		/* do not free the initial_bpo */
		if (bpi->bpi_parent != NULL) {
			bpobj_close(bpi->bpi_bpo);
			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
		}
		kmem_free(bpi, sizeof (bpobj_info_t));
	}

	list_destroy(&stack);

	return (err);
}

/*
 * Iterate and remove the entries. If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}

/*
 * Iterate the entries. If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj. Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}
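
/*
 * Illustrative callback (a sketch, not part of this file): a bpobj_itor_t
 * takes (arg, bp, bp_freed, tx) and returns nonzero to stop iteration, as
 * space_range_cb() and bplist_append_cb() below demonstrate. A hypothetical
 * counting callback used with the non-freeing iterator could look like:
 *
 *	static int
 *	count_bps_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 *	    dmu_tx_t *tx)
 *	{
 *		(void) bp, (void) bp_freed, (void) tx;
 *		uint64_t *count = arg;
 *		(*count)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0, size;
 *	int err = bpobj_iterate_nofree(bpo, count_bps_cb, &count, &size);
 *
 * No dmu_tx_t is needed because the nofree variant never modifies the bpobj.
 */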

/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist-specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
	if (bpo->bpo_havesubobj)
		VERIFY0(bpo->bpo_phys->bpo_subobjs);
	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
	kmem_free(bpi, sizeof (bpobj_info_t));
	return (err);
}
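
/*
 * Illustrative resume pattern (a sketch built on the hypothetical
 * count_bps_cb above, not code from this file): after a nofree pass over a
 * livelist bpobj, the size it reported can be used as the starting index of
 * a later pass that only visits entries appended in the meantime:
 *
 *	uint64_t count = 0, size;
 *	int err = bpobj_iterate_nofree(bpo, count_bps_cb, &count, &size);
 *	... bpobj_enqueue() may append more entries in a later txg ...
 *	err = livelist_bpobj_iterate_from_nofree(bpo, count_bps_cb,
 *	    &count, size);
 *
 * This only works when the bpobj has no subobjs, which livelists guarantee.
 */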

/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj. This case is used when enqueuing a large subobj:
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+              +-----+-----+--+--+
 *     | bp | bp | bp | bp | bp |              | obj | obj | obj |
 *     +----+----+----+----+----+              +-----+-----+-----+
 *
 *     +--------------+                        +--------------+
 *     | sub-bpobj    |----------------------> | subsubobj    |
 *     +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 *     | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 *     +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+              +-----+-----+--+--+-----+
 *     | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 *     +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                                  |
 *       /----------------------------------------------------------/
 *       v
 *     +--------------+                        +--------------+
 *     | sub-bpobj    |----------------------> | subsubobj    |
 *     +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 *     | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 *     +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block. In this case we copy the subobj's
 * contents to the parent:
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+              +-----+-----+--+--+
 *     | bp | bp | bp | bp | bp |              | obj | obj | obj |
 *     +----+----+----+----+----+              +-----+-----+-----+
 *                              ^                                 ^
 *     +--------------+         |              +--------------+   |
 *     | sub-bpobj    |---------^------------> | subsubobj    |   ^
 *     +----+----+----+         |              +-----+-----+--+   |
 *     | BP | BP |-->-->-->-->--/              | OBJ | OBJ |-->---/
 *     +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 *     | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 *     +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+              +-----+-----+--+--+
 *     | bp | bp | bp | bp | bp |              | obj | obj | obj |
 *     +----+----+----+----+----+              +-----+-----+-----+
 *                                                                 ^
 *     +--------------+                        +--------------+    |
 *     | sub-bpobj    |----------------------> | subsubobj    |    ^
 *     +----+----+----+----+---------+----+    +-----+-----+--+    |
 *     | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->----/
 *     +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 *     +--------------+                     +--------------+
 *     | bpobj        |-------------------->| subobj list  |
 *     +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 *     | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 *     +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                           |
 *       /-------------------------------------------------------------------/
 *       v
 *     +--------------+
 *     | sub-bpobj    |
 *     +----+----+----+----+---------+----+
 *     | bp | bp | bp | bp |   ...   | bp |
 *     +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	if (bpobj_is_empty(&subbpo)) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	dmu_object_info_t doi;

	if (bpo->bpo_phys->bpo_subobjs != 0) {
		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    &doi));
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	}

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly. This reduces recursion in
	 * bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset > doi.doi_data_block_size) {
			copy_subsub = B_FALSE;
		}
	}

	/*
	 * If, in addition to having only one block of subobj's, subobj has
	 * only one block of bp's, then move subobj's bp's to bpo's bp list
	 * directly. This reduces recursion in bpobj_iterate due to nested
	 * subobjs.
	 */
	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
		copy_bps = B_FALSE;
	}

	if (copy_subsub && subsubobjs != 0) {
		dmu_buf_t *subdb;
		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
		    0, FTAG, &subdb, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(subdb->db_size, >=,
		    numsubsub * sizeof (subobj));
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}
		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    numsubsub * sizeof (subobj), subdb->db_data, tx,
		    DMU_READ_NO_PREFETCH);
		dmu_buf_rele(subdb, FTAG);
		bpo->bpo_phys->bpo_num_subobjs += numsubsub;

		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
		subbpo.bpo_phys->bpo_subobjs = 0;
		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
	}

	if (copy_bps) {
		dmu_buf_t *bps;
		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

		ASSERT(copy_subsub);
		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
		    0, FTAG, &bps, 0));

		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
		dmu_write(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    numbps * sizeof (blkptr_t),
		    bps->db_data, tx, DMU_READ_NO_PREFETCH);
		dmu_buf_rele(bps, FTAG);
		bpo->bpo_phys->bpo_num_blkptrs += numbps;

		bpobj_close(&subbpo);
		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
	} else {
		bpobj_close(&subbpo);
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}

		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    sizeof (subobj), &subobj, tx, DMU_READ_NO_PREFETCH);
		bpo->bpo_phys->bpo_num_subobjs++;
	}

	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);

}
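
/*
 * Illustrative call site (a sketch, not from this file): merging one bpobj
 * into another in syncing context is a single call, e.g. folding a dataset's
 * deadlist bpobj "obj" into a pool-wide list such as dp_free_bpobj:
 *
 *	bpobj_enqueue_subobj(&dp->dp_free_bpobj, obj, tx);
 *
 * The callee decides whether to copy the subobj's contents or merely
 * reference it, based on the block sizes examined above.
 */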

/*
 * Prefetch metadata required for bpobj_enqueue_subobj().
 */
void
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
{
	dmu_object_info_t doi;
	bpobj_t subbpo;
	uint64_t subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
		return;

	if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
		return;
	if (bpobj_is_empty(&subbpo)) {
		bpobj_close(&subbpo);
		return;
	}
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	bpobj_close(&subbpo);

	if (subsubobjs != 0) {
		if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
			return;
		if (doi.doi_max_offset > doi.doi_data_block_size)
			copy_subsub = B_FALSE;
	}

	if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
		return;
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
		copy_bps = B_FALSE;

	if (copy_subsub && subsubobjs != 0) {
		if (bpo->bpo_phys->bpo_subobjs) {
			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
			    ZIO_PRIORITY_ASYNC_READ);
		}
		dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}

	if (copy_bps) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
		    ZIO_PRIORITY_ASYNC_READ);
		dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	} else if (bpo->bpo_phys->bpo_subobjs) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}
}

void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * The bpobj will compress better without the payload.
		 *
		 * Note that we store EMBEDDED bp's because they have an
		 * uncompressed size, which must be accounted for. An
		 * alternative would be to add their size to bpo_uncomp
		 * without storing the bp, but that would create additional
		 * complications: bpo_uncomp would be inconsistent with the
		 * set of BP's stored, and bpobj_iterate() wouldn't visit
		 * all the space accounted for in the bpobj.
		 */
		memset(&stored_bp, 0, sizeof (stored_bp));
		stored_bp.blk_prop = bp->blk_prop;
		BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
	} else if (!BP_GET_DEDUP(bp)) {
		/* The bpobj will compress better without the checksum */
		memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
	}

	stored_bp.blk_fill = 0;
	BP_SET_FREE(&stored_bp, bp_freed);

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
		ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	int sign = bp_freed ? -1 : +1;
	bpo->bpo_phys->bpo_bytes += sign *
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
	}
	if (bp_freed) {
		ASSERT(bpo->bpo_havefreed);
		bpo->bpo_phys->bpo_num_freed++;
	}
	mutex_exit(&bpo->bpo_lock);
}
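
/*
 * Illustrative use (a sketch, not from this file): bpobj_enqueue() is called
 * in syncing context with an assigned transaction, for example to record
 * either a freed or an allocated block in a livelist-style bpobj:
 *
 *	bpobj_enqueue(bpo, bp, B_TRUE, tx);	// record a free
 *	bpobj_enqueue(bpo, bp, B_FALSE, tx);	// record an allocation
 *
 * Only the fields needed for accounting are stored; the checksum (or, for
 * embedded BPs, the payload) is zeroed so the object compresses better, as
 * the comments above explain.
 */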

struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	struct space_range_arg *sra = arg;

	if (BP_GET_BIRTH(bp) > sra->mintxg &&
	    BP_GET_BIRTH(bp) <= sra->maxtxg) {
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}

int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < logical birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	ASSERT(bpobj_is_open(bpo));

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}
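
/*
 * Illustrative query (a sketch, not from this file): to find how much space
 * in a bpobj was born after txg "t1" but no later than txg "t2":
 *
 *	uint64_t used, comp, uncomp;
 *	int err = bpobj_space_range(bpo, t1, t2, &used, &comp, &uncomp);
 *
 * Passing mintxg = 0 and maxtxg = UINT64_MAX takes the fast path above when
 * the bpobj has compression accounting, returning the cached
 * bpo_bytes/bpo_comp/bpo_uncomp totals instead of iterating.
 */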

/*
 * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
 * bpobj are designated as free or allocated, that information is not
 * preserved in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	bplist_t *bpl = arg;
	bplist_append(bpl, bp);
	return (0);
}