GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/ddt_log.c

// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;

static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

#define	DDT_LOG_ENTRY_FLAT_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define	DDT_LOG_ENTRY_TRAD_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

#define	DDT_LOG_ENTRY_SIZE(ddt) \
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)

void
ddt_log_init(void)
{
	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Max memory for log AVL entries. At least 1M, because we need
	 * something (that's ~3800 entries per tree). They can say 100% if they
	 * want; it just means they're at the mercy of the txg flush limit.
	 */
	if (zfs_dedup_log_mem_max == 0) {
		zfs_dedup_log_mem_max_percent =
		    MIN(zfs_dedup_log_mem_max_percent, 100);
		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
		    zfs_dedup_log_mem_max_percent / 100;
	}
	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
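
/*
 * Worked example (illustrative only): with the default
 * zfs_dedup_log_mem_max_percent = 1 on a machine with 32 GiB of physical
 * memory, zfs_dedup_log_mem_max comes out at ~328 MiB. ddt_log_swap()
 * lets each of the two log trees use half of that before it considers
 * the active tree too large.
 */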

void
ddt_log_fini(void)
{
	kmem_cache_destroy(ddt_log_entry_trad_cache);
	kmem_cache_destroy(ddt_log_entry_flat_cache);
}

static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}

static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);

	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
	DLH_SET_VERSION(hdr, 1);
	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
	hdr->dlh_length = ddl->ddl_length;
	hdr->dlh_first_txg = ddl->ddl_first_txg;
	hdr->dlh_checkpoint = ddl->ddl_checkpoint;

	dmu_buf_rele(db, FTAG);
}

static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);
	ASSERT0(ddl->ddl_object);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
	ddl->ddl_length = 0;
	ddl->ddl_first_txg = tx->tx_txg;
	ddt_log_update_header(ddt, ddl, tx);
}

static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);

	if (ddl->ddl_object == 0)
		return;

	ASSERT0(ddl->ddl_length);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

	ddl->ddl_object = 0;
}

void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_update_stats(ddt_t *ddt)
{
	/*
	 * Log object stats. We count the number of live entries in the log
	 * tree, even if there are more than are on disk, and even if the
	 * same entry is on both the append and flush trees, because that's
	 * more what the user expects to see. This does mean the on-disk size
	 * is not really correlated with the number of entries, but I don't
	 * think that's reasonable to expect anyway.
	 */
	dmu_object_info_t doi;
	uint64_t nblocks = 0;
	if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object,
	    &doi) == 0)
		nblocks += doi.doi_physical_blocks_512;
	if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object,
	    &doi) == 0)
		nblocks += doi.doi_physical_blocks_512;

	ddt_object_t *ddo = &ddt->ddt_log_stats;
	ddo->ddo_count =
	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
	ddo->ddo_dspace = nblocks << 9;
}

void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
	ASSERT3U(nentries, >, 0);
	ASSERT0P(dlu->dlu_dbp);

	if (ddt->ddt_log_active->ddl_object == 0)
		ddt_log_create(ddt, tx);

	/*
	 * We want to store as many entries as we can in a block, but never
	 * split an entry across block boundaries.
	 */
	size_t reclen = P2ALIGN_TYPED(
	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
	ASSERT3U(reclen, <=, UINT16_MAX);
	dlu->dlu_reclen = reclen;

	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
	    &dlu->dlu_dn));
	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

	uint64_t nblocks = howmany(nentries,
	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
	uint64_t offset = ddt->ddt_log_active->ddl_length;
	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
	    DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO));

	dlu->dlu_tx = tx;
	dlu->dlu_block = dlu->dlu_offset = 0;
}
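
/*
 * Note on the append path (summary of ddt_log_begin()/ddt_log_entry()/
 * ddt_log_commit(), added for orientation): each block holds
 * floor(dn_datablksz / reclen) records, so ddt_log_begin() holds
 * howmany(nentries, per_block) blocks starting at the current log length.
 * Records never straddle a block boundary; any unused tail of a block is
 * left zeroed, which decodes as DLR_INVALID when the log is loaded.
 */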

static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
	ddt_log_entry_t *ddle;

	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
	} else {
		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
	}

	return (ddle);
}

static void
ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
{
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
}

static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe,
    boolean_t hist)
{
	/* Create the log tree entry from a live or stored entry */
	avl_index_t where;
	ddt_log_entry_t *ddle =
	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
	if (ddle == NULL) {
		ddle = ddt_log_alloc_entry(ddt);
		ddle->ddle_key = ddlwe->ddlwe_key;
		avl_insert(&ddl->ddl_tree, ddle, where);
	} else if (hist) {
		ddt_lightweight_entry_t oddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &oddlwe);
		ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &oddlwe);
	}
	if (hist)
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
	ddle->ddle_type = ddlwe->ddlwe_type;
	ddle->ddle_class = ddlwe->ddlwe_class;
	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}

void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);

	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe, B_TRUE);

	/* Get our block */
	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

	/*
	 * If this would take us past the end of the block, finish it and
	 * move to the next one.
	 */
	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
		ASSERT3U(dlu->dlu_offset, >, 0);
		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
		dlu->dlu_block++;
		dlu->dlu_offset = 0;
		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
		db = dlu->dlu_dbp[dlu->dlu_block];
	}

	/*
	 * If this is the first time touching the block, inform the DMU that
	 * we will fill it, and zero it out.
	 */
	if (dlu->dlu_offset == 0) {
		dmu_buf_will_fill_flags(db, dlu->dlu_tx, B_FALSE,
		    DMU_UNCACHEDIO);
		memset(db->db_data, 0, db->db_size);
	}

	/* Create the log record directly in the buffer */
	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
	dlre->dlre_key = ddlwe->ddlwe_key;
	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

	/* Advance offset for next record. */
	dlu->dlu_offset += dlu->dlu_reclen;
}

void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);
	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
	ASSERT3U(dlu->dlu_offset, >, 0);

	/*
	 * Close out the last block. Whatever we haven't used will be zeroed,
	 * which matches DLR_INVALID, so we can detect this during load.
	 */
	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

	ddt->ddt_log_active->ddl_length +=
	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
	dnode_rele(dlu->dlu_dn, FTAG);

	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

	memset(dlu, 0, sizeof (ddt_log_update_t));

	ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	ddt_log_free_entry(ddt, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
	if (ddle == NULL)
		return (B_FALSE);

	ddt_lightweight_entry_t ddlwe;
	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	ddt_log_free_entry(ddt, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing)
{
	ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree,
	    ddk, NULL);
	if (ddle) {
		if (from_flushing)
			*from_flushing = B_FALSE;
	} else {
		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
		if (!ddle)
			return (B_FALSE);
		if (from_flushing)
			*from_flushing = B_TRUE;
	}
	if (ddlwe)
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
	return (B_TRUE);
}

void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
	/*
	 * There should not be any entries on the log tree before the given
	 * checkpoint. Assert that this is the case.
	 */
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle != NULL)
		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
		    >, 0);
#endif

	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}
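
/*
 * Note (added for orientation): the checkpoint records the key of the last
 * entry flushed from the flushing log. If the pool is reimported before the
 * log is fully flushed, ddt_log_load_entry() uses it to skip records at or
 * before the checkpoint, since those entries have already been flushed.
 */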

void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	if (ddl->ddl_object == 0)
		return;

	ASSERT(avl_is_empty(&ddl->ddl_tree));

	/* Eject the entire object */
	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

	ddl->ddl_length = 0;
	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
	/* Swap the logs. The old flushing one must be empty */
	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

	/*
	 * If there are still blocks on the flushing log, truncate it first.
	 * This can happen if there were entries on the flushing log that were
	 * removed in memory via ddt_lookup(); their vestigial remains are
	 * on disk.
	 */
	if (ddt->ddt_log_flushing->ddl_length > 0)
		ddt_log_truncate(ddt, tx);

	/*
	 * Swap policy. We swap the logs (and so begin flushing) when the
	 * active tree grows too large, or when we haven't swapped it in
	 * some amount of time, or if something has requested the logs be
	 * flushed ASAP (see ddt_walk_init()).
	 */

	/*
	 * The log tree is too large if the memory usage of its entries is over
	 * half of the memory limit. This effectively gives each log tree half
	 * the available memory.
	 */
	const boolean_t too_large =
	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

	const boolean_t too_old =
	    tx->tx_txg >=
	    (ddt->ddt_log_active->ddl_first_txg +
	    MAX(1, zfs_dedup_log_txg_max));

	const boolean_t force =
	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

	if (!(too_large || too_old || force))
		return (B_FALSE);

	ddt_log_t *swap = ddt->ddt_log_active;
	ddt->ddt_log_active = ddt->ddt_log_flushing;
	ddt->ddt_log_flushing = swap;

	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
	ddt->ddt_log_active->ddl_flags &=
	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

	ddt_log_update_stats(ddt);

	return (B_TRUE);
}
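
/*
 * Worked example of the swap policy above (illustrative): with the defaults
 * zfs_dedup_log_txg_max = 8 and zfs_dedup_log_mem_max_percent = 1, a log
 * that became active at txg T is swapped no later than txg T+8, and sooner
 * if its in-memory entries exceed half of zfs_dedup_log_mem_max (e.g. with
 * a 1 GiB limit, each tree may hold up to 512 MiB before forcing a swap).
 */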

static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
    const ddt_key_t *checkpoint)
{
	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)dlr->dlr_payload;
	if (checkpoint != NULL &&
	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
		/* Skip pre-checkpoint entries; they're already flushed. */
		return;
	}

	ddt_lightweight_entry_t ddlwe;
	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);

	ddlwe.ddlwe_key = dlre->dlre_key;
	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));

	ddt_log_update_entry(ddt, ddl, &ddlwe, B_FALSE);
}

static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
	void *cookie = NULL;
	ddt_log_entry_t *ddle;
	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
	while ((ddle =
	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
		ddt_log_free_entry(ddt, ddle);
	}
	ASSERT(avl_is_empty(&ddl->ddl_tree));
}

static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
	ASSERT3U(n, <, 2);

	ddt_log_t *ddl = &ddt->ddt_log[n];

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	uint64_t obj;
	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &obj);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	dnode_t *dn;
	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
	if (err != 0)
		return (err);

	ddt_log_header_t hdr;
	dmu_buf_t *db;
	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, FTAG);
		return (err);
	}
	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
	dmu_buf_rele(db, FTAG);

	if (DLH_GET_VERSION(&hdr) != 1) {
		dnode_rele(dn, FTAG);
		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
		    (u_longlong_t)DLH_GET_VERSION(&hdr));
		return (SET_ERROR(EINVAL));
	}

	ddt_key_t *checkpoint = NULL;
	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
		/*
		 * If the log has a checkpoint, then we can ignore any entries
		 * that have already been flushed.
		 */
		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
		checkpoint = &hdr.dlh_checkpoint;
	}

	if (hdr.dlh_length > 0) {
		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
		    ZIO_PRIORITY_SYNC_READ);

		for (uint64_t offset = 0; offset < hdr.dlh_length;
		    offset += dn->dn_datablksz) {
			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
			    DMU_READ_PREFETCH | DMU_UNCACHEDIO);
			if (err != 0) {
				dnode_rele(dn, FTAG);
				ddt_log_empty(ddt, ddl);
				return (err);
			}

			uint64_t boffset = 0;
			while (boffset < db->db_size) {
				ddt_log_record_t *dlr =
				    (ddt_log_record_t *)(db->db_data + boffset);

				/* Partially-filled block, skip the rest */
				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
					break;

				switch (DLR_GET_TYPE(dlr)) {
				case DLR_ENTRY:
					ddt_log_load_entry(ddt, ddl, dlr,
					    checkpoint);
					break;

				default:
					dmu_buf_rele(db, FTAG);
					dnode_rele(dn, FTAG);
					ddt_log_empty(ddt, ddl);
					return (SET_ERROR(EINVAL));
				}

				boffset += DLR_GET_RECLEN(dlr);
			}

			dmu_buf_rele(db, FTAG);
		}
	}

	dnode_rele(dn, FTAG);

	ddl->ddl_object = obj;
	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
	ddl->ddl_length = hdr.dlh_length;
	ddl->ddl_first_txg = hdr.dlh_first_txg;

	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
		ddt->ddt_log_flushing = ddl;
	else
		ddt->ddt_log_active = ddl;

	return (0);
}

int
ddt_log_load(ddt_t *ddt)
{
	int err;

	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
		/*
		 * The DDT is going to be freed again in a moment, so there's
		 * no point loading the log; it'll just slow down import.
		 */
		return (0);
	}

	ASSERT0(ddt->ddt_log[0].ddl_object);
	ASSERT0(ddt->ddt_log[1].ddl_object);
	if (ddt->ddt_dir_object == 0) {
		/*
		 * If we're configured but the containing dir doesn't exist
		 * yet, then the log object can't possibly exist either.
		 */
		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
		return (SET_ERROR(ENOENT));
	}

	if ((err = ddt_log_load_one(ddt, 0)) != 0)
		return (err);
	if ((err = ddt_log_load_one(ddt, 1)) != 0)
		return (err);

	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

	/*
	 * We have two finalisation tasks:
	 *
	 * - rebuild the histogram. We do this at the end rather than while
	 *   we're loading so we don't need to uncount and recount entries that
	 *   appear multiple times in the log.
	 *
	 * - remove entries from the flushing tree that are on both trees. This
	 *   happens when ddt_lookup() rehydrates an entry from the flushing
	 *   tree, as ddt_log_remove_key() removes the entry from the in-memory
	 *   tree but doesn't remove it from disk.
	 */

	/*
	 * We don't technically need a config lock here, since there shouldn't
	 * be pool config changes during DDT load. dva_get_dsize_sync() via
	 * ddt_stat_generate() is expecting it though, and it won't hurt
	 * anything, so we take it.
	 */
	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
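
	/*
	 * Two-pointer merge (added for orientation): both trees are sorted
	 * by key, so we walk them together. A key on only one tree is
	 * counted into the histogram directly; a key on both trees is
	 * dropped from the flushing tree first and the active copy counted,
	 * so each logical entry is counted exactly once.
	 */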
	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
	ddt_log_entry_t *ae = avl_first(al);
	ddt_log_entry_t *fe = avl_first(fl);
	while (ae != NULL || fe != NULL) {
		ddt_log_entry_t *ddle;
		if (ae == NULL) {
			/* active exhausted, take flushing */
			ddle = fe;
			fe = AVL_NEXT(fl, fe);
		} else if (fe == NULL) {
			/* flushing exhausted, take active */
			ddle = ae;
			ae = AVL_NEXT(al, ae);
		} else {
			/* compare active and flushing */
			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
			if (c < 0) {
				/* active behind, take and advance */
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			} else if (c > 0) {
				/* flushing behind, take and advance */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
			} else {
				/* match. remove from flushing, take active */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
				avl_remove(fl, ddle);
				ddt_log_free_entry(ddt, ddle);
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			}
		}

		ddt_lightweight_entry_t ddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
	}

	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

	ddt_log_update_stats(ddt);

	return (0);
}

void
ddt_log_alloc(ddt_t *ddt)
{
	ASSERT0P(ddt->ddt_log_active);
	ASSERT0P(ddt->ddt_log_flushing);

	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	ddt->ddt_log_active = &ddt->ddt_log[0];
	ddt->ddt_log_flushing = &ddt->ddt_log[1];
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}

void
ddt_log_free(ddt_t *ddt)
{
	ddt_log_empty(ddt, &ddt->ddt_log[0]);
	ddt_log_empty(ddt, &ddt->ddt_log[1]);
	avl_destroy(&ddt->ddt_log[0].ddl_tree);
	avl_destroy(&ddt->ddt_log[1].ddl_tree);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");