Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/brt.c
107264 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
23
/*
24
* Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
25
*/
26
27
#include <sys/zfs_context.h>
28
#include <sys/spa.h>
29
#include <sys/spa_impl.h>
30
#include <sys/zio.h>
31
#include <sys/brt.h>
32
#include <sys/brt_impl.h>
33
#include <sys/ddt.h>
34
#include <sys/bitmap.h>
35
#include <sys/zap.h>
36
#include <sys/dmu_tx.h>
37
#include <sys/arc.h>
38
#include <sys/dsl_pool.h>
39
#include <sys/dsl_scan.h>
40
#include <sys/vdev_impl.h>
41
#include <sys/kstat.h>
42
#include <sys/wmsum.h>
43
44
/*
45
* Block Cloning design.
46
*
47
* Block Cloning allows manually cloning a file (or a subset of its blocks)
48
* into another (or the same) file by just creating additional references to
49
* the data blocks without copying the data itself. Those references are kept
50
* in the Block Reference Tables (BRTs).
51
*
52
* In many ways this is similar to the existing deduplication, but there are
53
* some important differences:
54
*
55
* - Deduplication is automatic and Block Cloning is not - one has to use a
56
* dedicated system call(s) to clone the given file/blocks.
57
* - Deduplication keeps all data blocks in its table, even those referenced
58
* just once. Block Cloning creates an entry in its tables only when there
59
* are at least two references to the given data block. If the block was
60
* never explicitly cloned or the second to last reference was dropped,
61
* there will be neither space nor performance overhead.
62
* - Deduplication needs data to work - one needs to pass real data to the
63
* write(2) syscall, so hash can be calculated. Block Cloning doesn't require
64
* data, just block pointers to the data, so it is extremely fast, as we pay
65
* neither the cost of reading the data, nor the cost of writing the data -
66
* we operate exclusively on metadata.
67
* - If the D (dedup) bit is not set in the block pointer, it means that
68
* the block is not in the dedup table (DDT) and we won't consult the DDT
69
* when we need to free the block. Block Cloning must be consulted on every
70
* free, because we cannot modify the source BP (eg. by setting something
71
* similar to the D bit), thus we have no hint if the block is in the
72
* Block Reference Table (BRT), so we need to look into the BRT. There is
73
* an optimization in place that allows us to eliminate the majority of BRT
74
* lookups which is described below in the "Minimizing free penalty" section.
75
* - The BRT entry is much smaller than the DDT entry - for BRT we only store
76
* 64bit offset and 64bit reference counter.
77
* - Dedup keys are cryptographic hashes, so two blocks that are close to each
78
* other on disk are most likely in totally different parts of the DDT.
79
* The BRT entry keys are offsets into a single top-level VDEV, so data blocks
80
* from one file should have BRT entries close to each other.
81
* - Scrub will only do a single pass over a block that is referenced multiple
82
* times in the DDT. Unfortunately it is not currently (if at all) possible
83
* with Block Cloning and a block referenced multiple times will be scrubbed
84
* multiple times. The new, sorted scrub should be able to eliminate
85
* duplicated reads given enough memory.
86
* - Deduplication requires cryptographically strong hash as a checksum or
87
* additional data verification. Block Cloning works with any checksum
88
* algorithm or even with checksumming disabled.
89
*
90
* As mentioned above, the BRT entries are much smaller than the DDT entries.
91
* To uniquely identify a block we just need its vdev id and offset. We also
92
* need to maintain a reference counter. The vdev id will often repeat, as there
93
* is a small number of top-level VDEVs and a large number of blocks stored in
94
* each VDEV. We take advantage of that to reduce the BRT entry size further by
95
* maintaining one BRT for each top-level VDEV, so we can then have only offset
96
* and counter as the BRT entry.
97
*
98
* Minimizing free penalty.
99
*
100
* Block Cloning allows creating additional references to any existing block.
101
* When we free a block there is no hint in the block pointer whether the block
102
* was cloned or not, so on each free we have to check if there is a
103
* corresponding entry in the BRT or not. If there is, we need to decrease
104
* the reference counter. Doing BRT lookup on every free can potentially be
105
* expensive by requiring additional I/Os if the BRT doesn't fit into memory.
106
* This is the main problem with deduplication, so we've learned our lesson and
107
* try not to repeat the same mistake here. How do we do that? We divide each
108
* top-level VDEV into 16MB regions. For each region we maintain a counter that
109
* is a sum of all the BRT entries that have offsets within the region. This
110
* creates the entries count array of 16bit numbers for each top-level VDEV.
111
* The entries count array is always kept in memory and updated on disk in the
112
* same transaction group as the BRT updates to keep everything in-sync. We can
113
* keep the array in memory, because it is very small. With 16MB regions and
114
* 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
115
* the region size even further in the future). Now, when we want to free
116
* a block, we first consult the array. If the counter for the whole region is
117
* zero, there is no need to look for the BRT entry, as there isn't one for
118
* sure. If the counter for the region is greater than zero, only then we will
119
* do a BRT lookup and if an entry is found we will decrease the reference
120
* counter in the BRT entry and in the entry counters array.
121
*
122
* The entry counters array is small, but can potentially be larger for very
123
* large VDEVs or smaller regions. In this case we don't want to rewrite entire
124
* array on every change. We then divide the array into 32kB blocks and keep
125
* a bitmap of dirty blocks within a transaction group. When we sync the
126
* transaction group we can only update the parts of the entry counters array
127
* that were modified. Note: Keeping track of the dirty parts of the entry
128
* counters array is implemented, but updating only parts of the array on disk
129
* is not yet implemented - for now we will update entire array if there was
130
* any change.
131
*
132
* The implementation tries to be economic: if BRT is not used, or no longer
133
* used, there will be no entries in the MOS and no additional memory used (eg.
134
* the entry counters array is only allocated if needed).
135
*
136
* Interaction between Deduplication and Block Cloning.
137
*
138
* If both functionalities are in use, we could end up with a block that is
139
* referenced multiple times in both DDT and BRT. When we free one of the
140
* references we couldn't tell where it belongs, so we would have to decide
141
* what table takes the precedence: do we first clear DDT references or BRT
142
* references? To avoid this dilemma BRT cooperates with DDT - if a given block
143
* is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
144
* lookup DDT entry instead and increase the counter there. No BRT entry
145
* will be created for a block which has the D (dedup) bit set.
146
* BRT may be more efficient for manual deduplication, but if the block is
147
* already in the DDT, then creating additional BRT entry would be less
148
* efficient. This clever idea was proposed by Allan Jude.
149
*
150
* Block Cloning across datasets.
151
*
152
* Block Cloning is not limited to cloning blocks within the same dataset.
153
* It is possible (and very useful) to clone blocks between different datasets.
154
* One use case is recovering files from snapshots. By cloning the files into
155
* dataset we need no additional storage. Without Block Cloning we would need
156
* additional space for those files.
157
* Another interesting use case is moving the files between datasets
158
* (copying the file content to the new dataset and removing the source file).
159
* In that case Block Cloning will only be used briefly, because the BRT entries
160
* will be removed when the source is removed.
161
* Block Cloning across encrypted datasets is supported as long as both
162
* datasets share the same master key (e.g. snapshots and clones).
163
*
164
* Block Cloning flow through ZFS layers.
165
*
166
* Note: Block Cloning can be used both for cloning file system blocks and ZVOL
167
* blocks. As of this writing no interface is implemented that allows for block
168
* cloning within a ZVOL.
169
* FreeBSD and Linux provide copy_file_range(2) system call and we will use it
170
* for block cloning.
171
*
172
* ssize_t
173
* copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
174
* size_t len, unsigned int flags);
175
*
176
* Even though offsets and length represent bytes, they have to be
177
* block-aligned or we will return an error so the upper layer can
178
* fallback to the generic mechanism that will just copy the data.
179
* Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
180
* This function was implemented based on zfs_write(), but instead of writing
181
* the given data we first read block pointers using the new dmu_read_l0_bps()
182
* function from the source file. Once we have BPs from the source file we call
183
* the dmu_brt_clone() function on the destination file. This function
184
* allocates BPs for us. We iterate over all source BPs. If the given BP is
185
* a hole or an embedded block, we just copy BP as-is. If it points to a real
186
* data we place this BP on a BRT pending list using the brt_pending_add()
187
* function.
188
*
189
* We use this pending list to keep track of all BPs that got new references
190
* within this transaction group.
191
*
192
* Some special cases to consider and how we address them:
193
* - The block we want to clone may have been created within the same
194
* transaction group that we are trying to clone. Such block has no BP
195
* allocated yet, so cannot be immediately cloned. We return EAGAIN.
196
* - The block we want to clone may have been modified within the same
197
* transaction group. We return EAGAIN.
198
* - A block may be cloned multiple times during one transaction group (that's
199
* why pending list is actually a tree and not an append-only list - this
200
* way we can figure out faster if this block is cloned for the first time
201
* in this txg or consecutive time).
202
* - A block may be cloned and freed within the same transaction group
203
* (see dbuf_undirty()).
204
* - A block may be cloned and within the same transaction group the clone
205
* can be cloned again (see dmu_read_l0_bps()).
206
* - A file might have been deleted, but the caller still has a file descriptor
207
* open to this file and clones it.
208
*
209
* When we free a block we have an additional step in the ZIO pipeline where we
210
* call the zio_brt_free() function. We then call the brt_entry_decref()
211
* that loads the corresponding BRT entry (if one exists) and decreases
212
* reference counter. If this is not the last reference we will stop ZIO
213
* pipeline here. If this is the last reference or the block is not in the
214
* BRT, we continue the pipeline and free the block as usual.
215
*
216
* At the beginning of spa_sync() where there can be no more block cloning,
217
* but before issuing frees we call brt_pending_apply(). This function applies
218
* all the new clones to the BRT table - we load BRT entries and update
219
* reference counters. To sync new BRT entries to disk, we use brt_sync()
220
* function. This function will sync all dirty per-top-level-vdev BRTs,
221
* the entry counters arrays, etc.
222
*
223
* Block Cloning and ZIL.
224
*
225
* Every clone operation is divided into chunks (similar to write) and each
226
* chunk is cloned in a separate transaction. The chunk size is determined by
227
* how many BPs we can fit into a single ZIL entry.
228
* Replaying clone operation is different from the regular clone operation,
229
* as when we log clone operations we cannot use the source object - it may
230
* reside on a different dataset, so we log BPs we want to clone.
231
* The ZIL is replayed when we mount the given dataset, not when the pool is
232
* imported. Taking this into account it is possible that the pool is imported
233
* without mounting datasets and the source dataset is destroyed before the
234
* destination dataset is mounted and its ZIL replayed.
235
* To address this situation we leverage zil_claim() mechanism where ZFS will
236
* parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
237
* entries, we will bump reference counters for their BPs in the BRT. Then
238
* on mount and ZIL replay we bump the reference counters once more, while the
239
* first references are dropped during ZIL destroy by zil_free_clone_range().
240
* It is possible that after zil_claim() we never mount the destination, so
241
* we never replay its ZIL and just destroy it. In this case the only taken
242
* references will be dropped by zil_free_clone_range(), since the cloning is
243
* not going to ever take place.
244
*/
245
246
/*
 * kmem cache handle; going by its name it presumably backs brt_entry_t
 * allocations (its creation and use are outside this part of the file).
 */
static kmem_cache_t *brt_entry_cache;
247
248
/*
 * Prefetching of the BRT entries that we are going to modify can be
 * switched on or off with this variable.  It defaults to 1.
 */
static int brt_zap_prefetch = 1;
252
253
/*
 * BRT_DEBUG() forwards its printf-style arguments to __dprintf(), tagged
 * with the calling file, function and line, but only when ZFS_DEBUG_BRT is
 * set in zfs_flags.  Without ZFS_DEBUG it expands to an empty statement.
 */
#ifdef ZFS_DEBUG
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#else
#define	BRT_DEBUG(...)	do { } while (0)
#endif
262
263
/*
 * Block-shift arguments that brt_vdev_create() hands to zap_create_flags()
 * when it creates the per-vdev entries ZAP.
 */
static int brt_zap_default_bs = 13;
static int brt_zap_default_ibs = 13;
265
266
/*
 * kstat handle; presumably the one exporting brt_stats (it is installed
 * outside this part of the file).
 */
static kstat_t *brt_ksp;
267
268
typedef struct brt_stats {
269
kstat_named_t brt_addref_entry_not_on_disk;
270
kstat_named_t brt_addref_entry_on_disk;
271
kstat_named_t brt_decref_entry_in_memory;
272
kstat_named_t brt_decref_entry_loaded_from_disk;
273
kstat_named_t brt_decref_entry_not_in_memory;
274
kstat_named_t brt_decref_entry_read_lost_race;
275
kstat_named_t brt_decref_entry_still_referenced;
276
kstat_named_t brt_decref_free_data_later;
277
kstat_named_t brt_decref_free_data_now;
278
kstat_named_t brt_decref_no_entry;
279
} brt_stats_t;
280
281
static brt_stats_t brt_stats = {
282
{ "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
283
{ "addref_entry_on_disk", KSTAT_DATA_UINT64 },
284
{ "decref_entry_in_memory", KSTAT_DATA_UINT64 },
285
{ "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
286
{ "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
287
{ "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
288
{ "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
289
{ "decref_free_data_later", KSTAT_DATA_UINT64 },
290
{ "decref_free_data_now", KSTAT_DATA_UINT64 },
291
{ "decref_no_entry", KSTAT_DATA_UINT64 }
292
};
293
294
struct {
295
wmsum_t brt_addref_entry_not_on_disk;
296
wmsum_t brt_addref_entry_on_disk;
297
wmsum_t brt_decref_entry_in_memory;
298
wmsum_t brt_decref_entry_loaded_from_disk;
299
wmsum_t brt_decref_entry_not_in_memory;
300
wmsum_t brt_decref_entry_read_lost_race;
301
wmsum_t brt_decref_entry_still_referenced;
302
wmsum_t brt_decref_free_data_later;
303
wmsum_t brt_decref_free_data_now;
304
wmsum_t brt_decref_no_entry;
305
} brt_sums;
306
307
/* Add one to the brt_sums counter whose member name is given by stat. */
#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
308
309
static int brt_entry_compare(const void *x1, const void *x2);
310
static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
311
312
static void
313
brt_rlock(spa_t *spa)
314
{
315
rw_enter(&spa->spa_brt_lock, RW_READER);
316
}
317
318
static void
319
brt_wlock(spa_t *spa)
320
{
321
rw_enter(&spa->spa_brt_lock, RW_WRITER);
322
}
323
324
static void
325
brt_unlock(spa_t *spa)
326
{
327
rw_exit(&spa->spa_brt_lock);
328
}
329
330
static uint16_t
331
brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
332
{
333
334
ASSERT3U(idx, <, brtvd->bv_size);
335
336
if (unlikely(brtvd->bv_need_byteswap)) {
337
return (BSWAP_16(brtvd->bv_entcount[idx]));
338
} else {
339
return (brtvd->bv_entcount[idx]);
340
}
341
}
342
343
static void
344
brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
345
{
346
347
ASSERT3U(idx, <, brtvd->bv_size);
348
349
if (unlikely(brtvd->bv_need_byteswap)) {
350
brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
351
} else {
352
brtvd->bv_entcount[idx] = entcnt;
353
}
354
}
355
356
static void
357
brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
358
{
359
uint16_t entcnt;
360
361
ASSERT3U(idx, <, brtvd->bv_size);
362
363
entcnt = brt_vdev_entcount_get(brtvd, idx);
364
ASSERT(entcnt < UINT16_MAX);
365
366
brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
367
}
368
369
static void
370
brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
371
{
372
uint16_t entcnt;
373
374
ASSERT3U(idx, <, brtvd->bv_size);
375
376
entcnt = brt_vdev_entcount_get(brtvd, idx);
377
ASSERT(entcnt > 0);
378
379
brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
380
}
381
382
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: log the state of one BRT VDEV via zfs_dbgmsg().
 * It prints a summary line, then every non-zero entry counter (only when
 * bv_totalcount is non-zero), then the dirty-block bitmap rendered as a
 * string of 'x' (set) and '.' (clear) when the counters are dirty.
 */
static void
brt_vdev_dump(brt_vdev_t *brtvd)
{
	const uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);

	zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
	    (u_longlong_t)brtvd->bv_vdevid,
	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
	    (u_longlong_t)brtvd->bv_size,
	    (u_longlong_t)brtvd->bv_totalcount,
	    (u_longlong_t)nblocks,
	    (size_t)BT_SIZEOFMAP(nblocks));

	if (brtvd->bv_totalcount > 0) {
		zfs_dbgmsg(" entcounts:");
		for (uint64_t i = 0; i < brtvd->bv_size; i++) {
			uint16_t entcnt = brt_vdev_entcount_get(brtvd, i);

			if (entcnt == 0)
				continue;
			zfs_dbgmsg(" [%04llu] %hu",
			    (u_longlong_t)i, entcnt);
		}
	}

	if (!brtvd->bv_entcount_dirty)
		return;

	/* One character per block plus the terminating NUL. */
	char *dirty = kmem_alloc(nblocks + 1, KM_SLEEP);

	for (uint64_t blk = 0; blk < nblocks; blk++)
		dirty[blk] = BT_TEST(brtvd->bv_bitmap, blk) ? 'x' : '.';
	dirty[nblocks] = '\0';
	zfs_dbgmsg(" dirty: %s", dirty);
	kmem_free(dirty, nblocks + 1);
}
#endif
421
422
static brt_vdev_t *
423
brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
424
{
425
brt_vdev_t *brtvd = NULL;
426
427
brt_rlock(spa);
428
if (vdevid < spa->spa_brt_nvdevs) {
429
brtvd = spa->spa_brt_vdevs[vdevid];
430
} else if (alloc) {
431
/* New VDEV was added. */
432
brt_unlock(spa);
433
brt_wlock(spa);
434
if (vdevid >= spa->spa_brt_nvdevs)
435
brt_vdevs_expand(spa, vdevid + 1);
436
brtvd = spa->spa_brt_vdevs[vdevid];
437
}
438
brt_unlock(spa);
439
return (brtvd);
440
}
441
442
static void
443
brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
444
{
445
char name[64];
446
447
ASSERT(brtvd->bv_initiated);
448
ASSERT0(brtvd->bv_mos_brtvdev);
449
ASSERT0(brtvd->bv_mos_entries);
450
451
uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
452
ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
453
brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
454
VERIFY(mos_entries != 0);
455
VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
456
&brtvd->bv_mos_entries_dnode));
457
dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP);
458
rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
459
brtvd->bv_mos_entries = mos_entries;
460
rw_exit(&brtvd->bv_mos_entries_lock);
461
BRT_DEBUG("MOS entries created, object=%llu",
462
(u_longlong_t)brtvd->bv_mos_entries);
463
464
/*
465
* We allocate DMU buffer to store the bv_entcount[] array.
466
* We will keep array size (bv_size) and cummulative count for all
467
* bv_entcount[]s (bv_totalcount) in the bonus buffer.
468
*/
469
brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
470
DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
471
DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
472
VERIFY(brtvd->bv_mos_brtvdev != 0);
473
BRT_DEBUG("MOS BRT VDEV created, object=%llu",
474
(u_longlong_t)brtvd->bv_mos_brtvdev);
475
476
snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
477
(u_longlong_t)brtvd->bv_vdevid);
478
VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
479
sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
480
BRT_DEBUG("Pool directory object created, object=%s", name);
481
482
/*
483
* Activate the endian-fixed feature if this is the first BRT ZAP
484
* (i.e., BLOCK_CLONING is not yet active) and the feature is enabled.
485
*/
486
if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) &&
487
!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
488
spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
489
} else if (spa_feature_is_active(spa,
490
SPA_FEATURE_BLOCK_CLONING_ENDIAN)) {
491
spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
492
}
493
494
spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
495
}
496
497
static void
498
brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
499
{
500
vdev_t *vd;
501
uint16_t *entcount;
502
ulong_t *bitmap;
503
uint64_t nblocks, onblocks, size;
504
505
ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
506
507
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
508
vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
509
size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
510
spa_config_exit(spa, SCL_VDEV, FTAG);
511
512
nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
513
entcount = vmem_zalloc(nblocks * BRT_BLOCKSIZE, KM_SLEEP);
514
bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
515
516
if (!brtvd->bv_initiated) {
517
ASSERT0(brtvd->bv_size);
518
ASSERT0P(brtvd->bv_entcount);
519
ASSERT0P(brtvd->bv_bitmap);
520
} else {
521
ASSERT(brtvd->bv_size > 0);
522
ASSERT(brtvd->bv_entcount != NULL);
523
ASSERT(brtvd->bv_bitmap != NULL);
524
/*
525
* TODO: Allow vdev shrinking. We only need to implement
526
* shrinking the on-disk BRT VDEV object.
527
* dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
528
* offset, size, tx);
529
*/
530
ASSERT3U(brtvd->bv_size, <=, size);
531
532
memcpy(entcount, brtvd->bv_entcount,
533
sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
534
onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
535
vmem_free(brtvd->bv_entcount, onblocks * BRT_BLOCKSIZE);
536
memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
537
BT_SIZEOFMAP(onblocks)));
538
kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
539
}
540
541
brtvd->bv_size = size;
542
brtvd->bv_entcount = entcount;
543
brtvd->bv_bitmap = bitmap;
544
if (!brtvd->bv_initiated) {
545
brtvd->bv_need_byteswap = FALSE;
546
brtvd->bv_initiated = TRUE;
547
BRT_DEBUG("BRT VDEV %llu initiated.",
548
(u_longlong_t)brtvd->bv_vdevid);
549
}
550
}
551
552
static int
553
brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
554
{
555
dmu_buf_t *db;
556
brt_vdev_phys_t *bvphys;
557
int error;
558
559
ASSERT(!brtvd->bv_initiated);
560
ASSERT(brtvd->bv_mos_brtvdev != 0);
561
562
error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
563
FTAG, &db);
564
if (error != 0)
565
return (error);
566
567
bvphys = db->db_data;
568
if (spa->spa_brt_rangesize == 0) {
569
spa->spa_brt_rangesize = bvphys->bvp_rangesize;
570
} else {
571
ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
572
}
573
574
brt_vdev_realloc(spa, brtvd);
575
576
/* TODO: We don't support VDEV shrinking. */
577
ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
578
579
/*
580
* If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
581
*/
582
error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
583
MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
584
brtvd->bv_entcount, DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO);
585
if (error != 0)
586
return (error);
587
588
ASSERT(bvphys->bvp_mos_entries != 0);
589
VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
590
&brtvd->bv_mos_entries_dnode));
591
dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP);
592
rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
593
brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
594
rw_exit(&brtvd->bv_mos_entries_lock);
595
brtvd->bv_need_byteswap =
596
(bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
597
brtvd->bv_totalcount = bvphys->bvp_totalcount;
598
brtvd->bv_usedspace = bvphys->bvp_usedspace;
599
brtvd->bv_savedspace = bvphys->bvp_savedspace;
600
601
dmu_buf_rele(db, FTAG);
602
603
BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
604
(u_longlong_t)brtvd->bv_vdevid,
605
(u_longlong_t)brtvd->bv_mos_brtvdev,
606
(u_longlong_t)brtvd->bv_mos_entries);
607
return (0);
608
}
609
610
static void
611
brt_vdev_dealloc(brt_vdev_t *brtvd)
612
{
613
ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
614
ASSERT(brtvd->bv_initiated);
615
ASSERT0(avl_numnodes(&brtvd->bv_tree));
616
617
uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
618
vmem_free(brtvd->bv_entcount, nblocks * BRT_BLOCKSIZE);
619
brtvd->bv_entcount = NULL;
620
kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
621
brtvd->bv_bitmap = NULL;
622
623
brtvd->bv_size = 0;
624
625
brtvd->bv_initiated = FALSE;
626
BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
627
}
628
629
static void
630
brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
631
{
632
char name[64];
633
uint64_t count;
634
635
ASSERT(brtvd->bv_initiated);
636
ASSERT(brtvd->bv_mos_brtvdev != 0);
637
ASSERT(brtvd->bv_mos_entries != 0);
638
ASSERT0(brtvd->bv_totalcount);
639
ASSERT0(brtvd->bv_usedspace);
640
ASSERT0(brtvd->bv_savedspace);
641
642
uint64_t mos_entries = brtvd->bv_mos_entries;
643
rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
644
brtvd->bv_mos_entries = 0;
645
rw_exit(&brtvd->bv_mos_entries_lock);
646
dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
647
brtvd->bv_mos_entries_dnode = NULL;
648
ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
649
ASSERT0(count);
650
VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
651
BRT_DEBUG("MOS entries destroyed, object=%llu",
652
(u_longlong_t)mos_entries);
653
654
VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
655
tx));
656
BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
657
(u_longlong_t)brtvd->bv_mos_brtvdev);
658
brtvd->bv_mos_brtvdev = 0;
659
brtvd->bv_entcount_dirty = FALSE;
660
661
snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
662
(u_longlong_t)brtvd->bv_vdevid);
663
VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
664
name, tx));
665
BRT_DEBUG("Pool directory object removed, object=%s", name);
666
667
brtvd->bv_meta_dirty = FALSE;
668
669
rw_enter(&brtvd->bv_lock, RW_WRITER);
670
brt_vdev_dealloc(brtvd);
671
rw_exit(&brtvd->bv_lock);
672
673
spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
674
if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN))
675
spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
676
}
677
678
static void
679
brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
680
{
681
brt_vdev_t **vdevs;
682
683
ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
684
ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
685
686
if (nvdevs == spa->spa_brt_nvdevs)
687
return;
688
689
vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
690
if (spa->spa_brt_nvdevs > 0) {
691
ASSERT(spa->spa_brt_vdevs != NULL);
692
693
memcpy(vdevs, spa->spa_brt_vdevs,
694
sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
695
kmem_free(spa->spa_brt_vdevs,
696
sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
697
}
698
spa->spa_brt_vdevs = vdevs;
699
700
for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
701
brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
702
rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
703
brtvd->bv_vdevid = vdevid;
704
brtvd->bv_initiated = FALSE;
705
rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
706
avl_create(&brtvd->bv_tree, brt_entry_compare,
707
sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
708
for (int i = 0; i < TXG_SIZE; i++) {
709
avl_create(&brtvd->bv_pending_tree[i],
710
brt_entry_compare, sizeof (brt_entry_t),
711
offsetof(brt_entry_t, bre_node));
712
}
713
mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
714
spa->spa_brt_vdevs[vdevid] = brtvd;
715
}
716
717
BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
718
(u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
719
spa->spa_brt_nvdevs = nvdevs;
720
}
721
722
static boolean_t
723
brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
724
{
725
uint64_t idx = offset / spa->spa_brt_rangesize;
726
if (idx < brtvd->bv_size) {
727
/* VDEV wasn't expanded. */
728
return (brt_vdev_entcount_get(brtvd, idx) > 0);
729
}
730
return (FALSE);
731
}
732
733
static void
734
brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
735
uint64_t dsize, uint64_t count)
736
{
737
uint64_t idx;
738
739
ASSERT(brtvd->bv_initiated);
740
741
brtvd->bv_savedspace += dsize * count;
742
brtvd->bv_meta_dirty = TRUE;
743
744
if (bre->bre_count > 0)
745
return;
746
747
brtvd->bv_usedspace += dsize;
748
749
idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
750
if (idx >= brtvd->bv_size) {
751
/* VDEV has been expanded. */
752
rw_enter(&brtvd->bv_lock, RW_WRITER);
753
brt_vdev_realloc(spa, brtvd);
754
rw_exit(&brtvd->bv_lock);
755
}
756
757
ASSERT3U(idx, <, brtvd->bv_size);
758
759
brtvd->bv_totalcount++;
760
brt_vdev_entcount_inc(brtvd, idx);
761
brtvd->bv_entcount_dirty = TRUE;
762
idx = idx / BRT_BLOCKSIZE / 8;
763
BT_SET(brtvd->bv_bitmap, idx);
764
}
765
766
static void
767
brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
768
uint64_t dsize)
769
{
770
uint64_t idx;
771
772
ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
773
ASSERT(brtvd->bv_initiated);
774
775
brtvd->bv_savedspace -= dsize;
776
brtvd->bv_meta_dirty = TRUE;
777
778
if (bre->bre_count > 0)
779
return;
780
781
brtvd->bv_usedspace -= dsize;
782
783
idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
784
ASSERT3U(idx, <, brtvd->bv_size);
785
786
ASSERT(brtvd->bv_totalcount > 0);
787
brtvd->bv_totalcount--;
788
brt_vdev_entcount_dec(brtvd, idx);
789
brtvd->bv_entcount_dirty = TRUE;
790
idx = idx / BRT_BLOCKSIZE / 8;
791
BT_SET(brtvd->bv_bitmap, idx);
792
}
793
794
static void
795
brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
796
{
797
dmu_buf_t *db;
798
brt_vdev_phys_t *bvphys;
799
800
ASSERT(brtvd->bv_meta_dirty);
801
ASSERT(brtvd->bv_mos_brtvdev != 0);
802
ASSERT(dmu_tx_is_syncing(tx));
803
804
VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
805
FTAG, &db));
806
807
if (brtvd->bv_entcount_dirty) {
808
/*
809
* TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
810
*/
811
uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
812
dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
813
nblocks * BRT_BLOCKSIZE, brtvd->bv_entcount, tx,
814
DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO);
815
memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
816
brtvd->bv_entcount_dirty = FALSE;
817
}
818
819
dmu_buf_will_dirty(db, tx);
820
bvphys = db->db_data;
821
bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
822
bvphys->bvp_size = brtvd->bv_size;
823
if (brtvd->bv_need_byteswap) {
824
bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
825
} else {
826
bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
827
}
828
bvphys->bvp_totalcount = brtvd->bv_totalcount;
829
bvphys->bvp_rangesize = spa->spa_brt_rangesize;
830
bvphys->bvp_usedspace = brtvd->bv_usedspace;
831
bvphys->bvp_savedspace = brtvd->bv_savedspace;
832
dmu_buf_rele(db, FTAG);
833
834
brtvd->bv_meta_dirty = FALSE;
835
}
836
837
static void
838
brt_vdevs_free(spa_t *spa)
839
{
840
if (spa->spa_brt_vdevs == 0)
841
return;
842
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
843
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
844
rw_enter(&brtvd->bv_lock, RW_WRITER);
845
if (brtvd->bv_initiated)
846
brt_vdev_dealloc(brtvd);
847
rw_exit(&brtvd->bv_lock);
848
rw_destroy(&brtvd->bv_lock);
849
if (brtvd->bv_mos_entries != 0)
850
dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
851
rw_destroy(&brtvd->bv_mos_entries_lock);
852
avl_destroy(&brtvd->bv_tree);
853
for (int i = 0; i < TXG_SIZE; i++)
854
avl_destroy(&brtvd->bv_pending_tree[i]);
855
mutex_destroy(&brtvd->bv_pending_lock);
856
kmem_free(brtvd, sizeof (*brtvd));
857
}
858
kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
859
spa->spa_brt_nvdevs);
860
}
861
862
static void
863
brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
864
{
865
866
bre->bre_bp = *bp;
867
bre->bre_count = 0;
868
bre->bre_pcount = 0;
869
870
*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
871
}
872
873
static boolean_t
874
brt_has_endian_fixed(spa_t *spa)
875
{
876
return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN));
877
}
878
879
static int
880
brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre)
881
{
882
uint64_t off = BRE_OFFSET(bre);
883
884
if (brtvd->bv_mos_entries == 0)
885
return (SET_ERROR(ENOENT));
886
887
if (brt_has_endian_fixed(spa)) {
888
return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
889
&off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
890
&bre->bre_count));
891
} else {
892
return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
893
&off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
894
&bre->bre_count));
895
}
896
}
897
898
/*
899
* Return TRUE if we _can_ have BRT entry for this bp. It might be false
900
* positive, but gives us quick answer if we should look into BRT, which
901
* may require reads and thus will be more expensive.
902
*/
903
boolean_t
904
brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
905
{
906
907
if (spa->spa_brt_nvdevs == 0)
908
return (B_FALSE);
909
910
uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
911
brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
912
if (brtvd == NULL || !brtvd->bv_initiated)
913
return (FALSE);
914
915
/*
916
* We don't need locks here, since bv_entcount pointer must be
917
* stable at this point, and we don't care about false positive
918
* races here, while false negative should be impossible, since
919
* all brt_vdev_addref() have already completed by this point.
920
*/
921
uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
922
return (brt_vdev_lookup(spa, brtvd, off));
923
}
924
925
uint64_t
926
brt_get_dspace(spa_t *spa)
927
{
928
if (spa->spa_brt_nvdevs == 0)
929
return (0);
930
931
brt_rlock(spa);
932
uint64_t s = 0;
933
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
934
s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
935
brt_unlock(spa);
936
return (s);
937
}
938
939
uint64_t
940
brt_get_used(spa_t *spa)
941
{
942
if (spa->spa_brt_nvdevs == 0)
943
return (0);
944
945
brt_rlock(spa);
946
uint64_t s = 0;
947
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
948
s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
949
brt_unlock(spa);
950
return (s);
951
}
952
953
uint64_t
954
brt_get_saved(spa_t *spa)
955
{
956
return (brt_get_dspace(spa));
957
}
958
959
uint64_t
960
brt_get_ratio(spa_t *spa)
961
{
962
uint64_t used = brt_get_used(spa);
963
if (used == 0)
964
return (100);
965
return ((used + brt_get_saved(spa)) * 100 / used);
966
}
967
968
static int
969
brt_kstats_update(kstat_t *ksp, int rw)
970
{
971
brt_stats_t *bs = ksp->ks_data;
972
973
if (rw == KSTAT_WRITE)
974
return (EACCES);
975
976
bs->brt_addref_entry_not_on_disk.value.ui64 =
977
wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
978
bs->brt_addref_entry_on_disk.value.ui64 =
979
wmsum_value(&brt_sums.brt_addref_entry_on_disk);
980
bs->brt_decref_entry_in_memory.value.ui64 =
981
wmsum_value(&brt_sums.brt_decref_entry_in_memory);
982
bs->brt_decref_entry_loaded_from_disk.value.ui64 =
983
wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
984
bs->brt_decref_entry_not_in_memory.value.ui64 =
985
wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
986
bs->brt_decref_entry_read_lost_race.value.ui64 =
987
wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
988
bs->brt_decref_entry_still_referenced.value.ui64 =
989
wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
990
bs->brt_decref_free_data_later.value.ui64 =
991
wmsum_value(&brt_sums.brt_decref_free_data_later);
992
bs->brt_decref_free_data_now.value.ui64 =
993
wmsum_value(&brt_sums.brt_decref_free_data_now);
994
bs->brt_decref_no_entry.value.ui64 =
995
wmsum_value(&brt_sums.brt_decref_no_entry);
996
997
return (0);
998
}
999
1000
static void
1001
brt_stat_init(void)
1002
{
1003
1004
wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1005
wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1006
wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1007
wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1008
wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1009
wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1010
wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1011
wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1012
wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1013
wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1014
1015
brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1016
sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1017
if (brt_ksp != NULL) {
1018
brt_ksp->ks_data = &brt_stats;
1019
brt_ksp->ks_update = brt_kstats_update;
1020
kstat_install(brt_ksp);
1021
}
1022
}
1023
1024
static void
1025
brt_stat_fini(void)
1026
{
1027
if (brt_ksp != NULL) {
1028
kstat_delete(brt_ksp);
1029
brt_ksp = NULL;
1030
}
1031
1032
wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1033
wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1034
wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1035
wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1036
wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1037
wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1038
wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1039
wmsum_fini(&brt_sums.brt_decref_free_data_later);
1040
wmsum_fini(&brt_sums.brt_decref_free_data_now);
1041
wmsum_fini(&brt_sums.brt_decref_no_entry);
1042
}
1043
1044
void
1045
brt_init(void)
1046
{
1047
brt_entry_cache = kmem_cache_create("brt_entry_cache",
1048
sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1049
1050
brt_stat_init();
1051
}
1052
1053
void
1054
brt_fini(void)
1055
{
1056
brt_stat_fini();
1057
1058
kmem_cache_destroy(brt_entry_cache);
1059
}
1060
1061
/* Return TRUE if block should be freed immediately. */
1062
boolean_t
1063
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1064
{
1065
brt_entry_t *bre, *racebre;
1066
brt_entry_t bre_search;
1067
avl_index_t where;
1068
uint64_t vdevid;
1069
int error;
1070
1071
brt_entry_fill(bp, &bre_search, &vdevid);
1072
1073
brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1074
ASSERT(brtvd != NULL);
1075
1076
rw_enter(&brtvd->bv_lock, RW_WRITER);
1077
ASSERT(brtvd->bv_initiated);
1078
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1079
if (bre != NULL) {
1080
BRTSTAT_BUMP(brt_decref_entry_in_memory);
1081
goto out;
1082
} else {
1083
BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1084
}
1085
rw_exit(&brtvd->bv_lock);
1086
1087
error = brt_entry_lookup(spa, brtvd, &bre_search);
1088
/* bre_search now contains correct bre_count */
1089
if (error == ENOENT) {
1090
BRTSTAT_BUMP(brt_decref_no_entry);
1091
return (B_TRUE);
1092
}
1093
ASSERT0(error);
1094
1095
rw_enter(&brtvd->bv_lock, RW_WRITER);
1096
racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1097
if (racebre != NULL) {
1098
/* The entry was added when the lock was dropped. */
1099
BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1100
bre = racebre;
1101
goto out;
1102
}
1103
1104
BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1105
bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1106
bre->bre_bp = bre_search.bre_bp;
1107
bre->bre_count = bre_search.bre_count;
1108
bre->bre_pcount = 0;
1109
avl_insert(&brtvd->bv_tree, bre, where);
1110
1111
out:
1112
if (bre->bre_count == 0) {
1113
rw_exit(&brtvd->bv_lock);
1114
BRTSTAT_BUMP(brt_decref_free_data_now);
1115
return (B_TRUE);
1116
}
1117
1118
bre->bre_pcount--;
1119
ASSERT(bre->bre_count > 0);
1120
bre->bre_count--;
1121
if (bre->bre_count == 0)
1122
BRTSTAT_BUMP(brt_decref_free_data_later);
1123
else
1124
BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1125
brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
1126
1127
rw_exit(&brtvd->bv_lock);
1128
1129
return (B_FALSE);
1130
}
1131
1132
uint64_t
1133
brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1134
{
1135
brt_entry_t bre_search, *bre;
1136
uint64_t vdevid, refcnt;
1137
int error;
1138
1139
brt_entry_fill(bp, &bre_search, &vdevid);
1140
1141
brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1142
ASSERT(brtvd != NULL);
1143
1144
rw_enter(&brtvd->bv_lock, RW_READER);
1145
ASSERT(brtvd->bv_initiated);
1146
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1147
if (bre == NULL) {
1148
rw_exit(&brtvd->bv_lock);
1149
error = brt_entry_lookup(spa, brtvd, &bre_search);
1150
if (error == ENOENT) {
1151
refcnt = 0;
1152
} else {
1153
ASSERT0(error);
1154
refcnt = bre_search.bre_count;
1155
}
1156
} else {
1157
refcnt = bre->bre_count;
1158
rw_exit(&brtvd->bv_lock);
1159
}
1160
1161
return (refcnt);
1162
}
1163
1164
static void
1165
brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
1166
{
1167
if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
1168
return;
1169
1170
uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1171
rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1172
if (brtvd->bv_mos_entries != 0) {
1173
(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1174
&off, BRT_KEY_WORDS);
1175
}
1176
rw_exit(&brtvd->bv_mos_entries_lock);
1177
}
1178
1179
static int
1180
brt_entry_compare(const void *x1, const void *x2)
1181
{
1182
const brt_entry_t *bre1 = x1, *bre2 = x2;
1183
const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
1184
1185
return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1186
DVA_GET_OFFSET(&bp2->blk_dva[0])));
1187
}
1188
1189
void
1190
brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1191
{
1192
brt_entry_t *bre, *newbre;
1193
avl_index_t where;
1194
uint64_t txg;
1195
1196
txg = dmu_tx_get_txg(tx);
1197
ASSERT3U(txg, !=, 0);
1198
1199
uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1200
brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1201
avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1202
1203
newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1204
newbre->bre_bp = *bp;
1205
newbre->bre_count = 0;
1206
newbre->bre_pcount = 1;
1207
1208
mutex_enter(&brtvd->bv_pending_lock);
1209
bre = avl_find(pending_tree, newbre, &where);
1210
if (bre == NULL) {
1211
avl_insert(pending_tree, newbre, where);
1212
newbre = NULL;
1213
} else {
1214
bre->bre_pcount++;
1215
}
1216
mutex_exit(&brtvd->bv_pending_lock);
1217
1218
if (newbre != NULL) {
1219
ASSERT(bre != NULL);
1220
ASSERT(bre != newbre);
1221
kmem_cache_free(brt_entry_cache, newbre);
1222
} else {
1223
ASSERT0P(bre);
1224
1225
/* Prefetch BRT entry for the syncing context. */
1226
brt_prefetch(brtvd, bp);
1227
}
1228
}
1229
1230
void
1231
brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1232
{
1233
brt_entry_t *bre, bre_search;
1234
uint64_t txg;
1235
1236
txg = dmu_tx_get_txg(tx);
1237
ASSERT3U(txg, !=, 0);
1238
1239
uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1240
brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1241
ASSERT(brtvd != NULL);
1242
avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1243
1244
bre_search.bre_bp = *bp;
1245
1246
mutex_enter(&brtvd->bv_pending_lock);
1247
bre = avl_find(pending_tree, &bre_search, NULL);
1248
ASSERT(bre != NULL);
1249
ASSERT(bre->bre_pcount > 0);
1250
bre->bre_pcount--;
1251
if (bre->bre_pcount == 0)
1252
avl_remove(pending_tree, bre);
1253
else
1254
bre = NULL;
1255
mutex_exit(&brtvd->bv_pending_lock);
1256
1257
if (bre)
1258
kmem_cache_free(brt_entry_cache, bre);
1259
}
1260
1261
static void
1262
brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
1263
{
1264
brt_entry_t *bre, *nbre;
1265
1266
/*
1267
* We are in syncing context, so no other bv_pending_tree accesses
1268
* are possible for the TXG. So we don't need bv_pending_lock.
1269
*/
1270
ASSERT(avl_is_empty(&brtvd->bv_tree));
1271
avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
1272
1273
for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
1274
nbre = AVL_NEXT(&brtvd->bv_tree, bre);
1275
1276
/*
1277
* If the block has DEDUP bit set, it means that it
1278
* already exists in the DEDUP table, so we can just
1279
* use that instead of creating new entry in the BRT.
1280
*/
1281
if (BP_GET_DEDUP(&bre->bre_bp)) {
1282
while (bre->bre_pcount > 0) {
1283
if (!ddt_addref(spa, &bre->bre_bp))
1284
break;
1285
bre->bre_pcount--;
1286
}
1287
if (bre->bre_pcount == 0) {
1288
avl_remove(&brtvd->bv_tree, bre);
1289
kmem_cache_free(brt_entry_cache, bre);
1290
continue;
1291
}
1292
}
1293
1294
/*
1295
* Unless we know that the block is definitely not in ZAP,
1296
* try to get its reference count from there.
1297
*/
1298
uint64_t off = BRE_OFFSET(bre);
1299
if (brtvd->bv_mos_entries != 0 &&
1300
brt_vdev_lookup(spa, brtvd, off)) {
1301
int error;
1302
if (brt_has_endian_fixed(spa)) {
1303
error = zap_lookup_uint64_by_dnode(
1304
brtvd->bv_mos_entries_dnode, &off,
1305
BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
1306
&bre->bre_count);
1307
} else {
1308
error = zap_lookup_uint64_by_dnode(
1309
brtvd->bv_mos_entries_dnode, &off,
1310
BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1311
&bre->bre_count);
1312
}
1313
if (error == 0) {
1314
BRTSTAT_BUMP(brt_addref_entry_on_disk);
1315
} else {
1316
ASSERT3U(error, ==, ENOENT);
1317
BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1318
}
1319
}
1320
}
1321
1322
/*
1323
* If all the cloned blocks we had were handled by DDT, we don't need
1324
* to initiate the vdev.
1325
*/
1326
if (avl_is_empty(&brtvd->bv_tree))
1327
return;
1328
1329
if (!brtvd->bv_initiated) {
1330
rw_enter(&brtvd->bv_lock, RW_WRITER);
1331
brt_vdev_realloc(spa, brtvd);
1332
rw_exit(&brtvd->bv_lock);
1333
}
1334
1335
/*
1336
* Convert pending references into proper ones. This has to be a
1337
* separate loop, since entcount modifications would cause false
1338
* positives for brt_vdev_lookup() on following iterations.
1339
*/
1340
for (bre = avl_first(&brtvd->bv_tree); bre;
1341
bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
1342
brt_vdev_addref(spa, brtvd, bre,
1343
bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
1344
bre->bre_count += bre->bre_pcount;
1345
}
1346
}
1347
1348
void
1349
brt_pending_apply(spa_t *spa, uint64_t txg)
1350
{
1351
1352
brt_rlock(spa);
1353
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1354
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1355
brt_unlock(spa);
1356
1357
brt_pending_apply_vdev(spa, brtvd, txg);
1358
1359
brt_rlock(spa);
1360
}
1361
brt_unlock(spa);
1362
}
1363
1364
static void
1365
brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1366
{
1367
uint64_t off = BRE_OFFSET(bre);
1368
1369
if (bre->bre_pcount == 0) {
1370
/* The net change is zero, nothing to do in ZAP. */
1371
} else if (bre->bre_count == 0) {
1372
int error = zap_remove_uint64_by_dnode(dn, &off,
1373
BRT_KEY_WORDS, tx);
1374
VERIFY(error == 0 || error == ENOENT);
1375
} else {
1376
if (brt_has_endian_fixed(spa)) {
1377
VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1378
BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
1379
&bre->bre_count, tx));
1380
} else {
1381
VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1382
BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1383
&bre->bre_count, tx));
1384
}
1385
}
1386
}
1387
1388
static void
1389
brt_sync_table(spa_t *spa, dmu_tx_t *tx)
1390
{
1391
brt_entry_t *bre;
1392
1393
brt_rlock(spa);
1394
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1395
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1396
brt_unlock(spa);
1397
1398
if (!brtvd->bv_meta_dirty) {
1399
ASSERT(!brtvd->bv_entcount_dirty);
1400
ASSERT0(avl_numnodes(&brtvd->bv_tree));
1401
brt_rlock(spa);
1402
continue;
1403
}
1404
1405
ASSERT(!brtvd->bv_entcount_dirty ||
1406
avl_numnodes(&brtvd->bv_tree) != 0);
1407
1408
if (brtvd->bv_mos_brtvdev == 0)
1409
brt_vdev_create(spa, brtvd, tx);
1410
1411
void *c = NULL;
1412
while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1413
brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre,
1414
tx);
1415
kmem_cache_free(brt_entry_cache, bre);
1416
}
1417
1418
#ifdef ZFS_DEBUG
1419
if (zfs_flags & ZFS_DEBUG_BRT)
1420
brt_vdev_dump(brtvd);
1421
#endif
1422
if (brtvd->bv_totalcount == 0)
1423
brt_vdev_destroy(spa, brtvd, tx);
1424
else
1425
brt_vdev_sync(spa, brtvd, tx);
1426
brt_rlock(spa);
1427
}
1428
brt_unlock(spa);
1429
}
1430
1431
void
1432
brt_sync(spa_t *spa, uint64_t txg)
1433
{
1434
dmu_tx_t *tx;
1435
uint64_t vdevid;
1436
1437
ASSERT3U(spa_syncing_txg(spa), ==, txg);
1438
1439
brt_rlock(spa);
1440
for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1441
if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1442
break;
1443
}
1444
if (vdevid >= spa->spa_brt_nvdevs) {
1445
brt_unlock(spa);
1446
return;
1447
}
1448
brt_unlock(spa);
1449
1450
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1451
brt_sync_table(spa, tx);
1452
dmu_tx_commit(tx);
1453
}
1454
1455
static void
1456
brt_alloc(spa_t *spa)
1457
{
1458
rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1459
spa->spa_brt_vdevs = NULL;
1460
spa->spa_brt_nvdevs = 0;
1461
spa->spa_brt_rangesize = 0;
1462
}
1463
1464
void
1465
brt_create(spa_t *spa)
1466
{
1467
brt_alloc(spa);
1468
spa->spa_brt_rangesize = BRT_RANGESIZE;
1469
}
1470
1471
int
1472
brt_load(spa_t *spa)
1473
{
1474
int error = 0;
1475
1476
brt_alloc(spa);
1477
brt_wlock(spa);
1478
for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1479
vdevid++) {
1480
char name[64];
1481
uint64_t mos_brtvdev;
1482
1483
/* Look if this vdev had active block cloning. */
1484
snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1485
(u_longlong_t)vdevid);
1486
error = zap_lookup(spa->spa_meta_objset,
1487
DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1488
&mos_brtvdev);
1489
if (error == ENOENT) {
1490
error = 0;
1491
continue;
1492
}
1493
if (error != 0)
1494
break;
1495
1496
/* If it did, then allocate them all and load this one. */
1497
brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1498
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1499
rw_enter(&brtvd->bv_lock, RW_WRITER);
1500
brtvd->bv_mos_brtvdev = mos_brtvdev;
1501
error = brt_vdev_load(spa, brtvd);
1502
rw_exit(&brtvd->bv_lock);
1503
if (error != 0)
1504
break;
1505
}
1506
1507
if (spa->spa_brt_rangesize == 0)
1508
spa->spa_brt_rangesize = BRT_RANGESIZE;
1509
brt_unlock(spa);
1510
return (error);
1511
}
1512
1513
void
1514
brt_prefetch_all(spa_t *spa)
1515
{
1516
/*
1517
* Load all BRT entries for each vdev. This is intended to perform
1518
* a prefetch on all such blocks. For the same reason that brt_prefetch
1519
* (called from brt_pending_add) isn't locked, this is also not locked.
1520
*/
1521
brt_rlock(spa);
1522
for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1523
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1524
brt_unlock(spa);
1525
1526
rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1527
if (brtvd->bv_mos_entries != 0) {
1528
(void) zap_prefetch_object(spa->spa_meta_objset,
1529
brtvd->bv_mos_entries);
1530
}
1531
rw_exit(&brtvd->bv_mos_entries_lock);
1532
1533
brt_rlock(spa);
1534
}
1535
brt_unlock(spa);
1536
}
1537
1538
void
1539
brt_unload(spa_t *spa)
1540
{
1541
if (spa->spa_brt_rangesize == 0)
1542
return;
1543
brt_vdevs_free(spa);
1544
rw_destroy(&spa->spa_brt_lock);
1545
spa->spa_brt_rangesize = 0;
1546
}
1547
1548
/*
 * Read-write module parameters of the BRT.
 */

/* Toggle consulted by brt_prefetch() before it issues a ZAP prefetch. */
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
    "Enable prefetching of BRT ZAP entries");

/* Block shift for the leaf blocks of the BRT entries ZAP. */
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
    "BRT ZAP leaf blockshift");

/* Block shift for the indirect blocks of the BRT entries ZAP. */
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
    "BRT ZAP indirect blockshift");
1554
1555