// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2022 by Pawel Jakub Dawidek
 * Copyright (c) 2019, 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/ddt.h>
#include <sys/ddt_impl.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/zfeature.h>

/*
 * # DDT: Deduplication tables
 *
 * The dedup subsystem provides block-level deduplication. When enabled, blocks
 * to be written will have the dedup (D) bit set, which causes them to be
 * tracked in a "dedup table", or DDT. If a block has been seen before (exists
 * in the DDT), it is not written again; instead, it is made to reference the
 * existing on-disk data, and a refcount is bumped in the DDT.
 *
 * ## Dedup tables and entries
 *
 * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
 * (ddt_key_t) made up of a block's checksum and certain properties, and a
 * "value" (one or more ddt_phys_t) containing valid DVAs for the block's data,
 * birth time and refcount. Together these are enough to track references to a
 * specific block, to build a valid block pointer to reference that block (for
 * freeing, scrubbing, etc), and to fill a new block pointer with the missing
 * pieces to make it seem like it was written.
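 *
 * As a rough sketch (names are indicative only; see ddt.h and ddt_impl.h for
 * the real layouts), a flat-phys entry can be pictured as:
 *
 *	key:   { cksum, lsize, psize, compress, crypt }
 *	value: { dva[0..2], phys_birth, refcnt }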
 *
 * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
 * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
 * object data formats, each with their own implementations) and "classes"
 * (ddt_class_t, instances of a storage type object, for entries with a
 * specific characteristic). An entry (key) will only ever exist on one of
 * these objects at any given time, but may be moved from one to another if
 * its type or class changes.
 *
 * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
 * is to be written, before DVAs have been allocated, ddt_lookup() is called to
 * see if the block has been seen before. If it's not found, the write proceeds
 * as normal, and after it succeeds, a new entry is created. If it is found, we
 * fill the BP with the DVAs from the entry, increment the refcount and cause
 * the write IO to return immediately.
 *
 * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
 * block for the same content/checksum. The slot is selected based on the
 * zp_copies parameter the block is written with, that is, the number of DVAs
 * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for the
 * now-removed "dedupditto" feature. These are no longer written, and will be
 * freed if encountered on old pools.
 *
 * If the "fast_dedup" feature is enabled, new dedup tables will be created
 * with the "flat phys" option. In this mode, there is only one ddt_phys_t
 * slot. If a write is issued for an entry that exists, but has fewer DVAs,
 * then only as many new DVAs are allocated and written to make up the
 * shortfall. The existing entry is then extended (ddt_phys_extend()) with the
 * new DVAs.
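 *
 * For example (illustrative): if an existing flat-phys entry holds a single
 * DVA and a new write arrives with zp_copies = 2, only one additional DVA is
 * allocated and written, and ddt_phys_extend() fills the entry's remaining
 * empty DVA slot with it.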
 *
 * ## Lifetime of an entry
 *
 * A DDT can be enormous, and typically is not held in memory all at once.
 * Instead, the changes to an entry are tracked in memory, and written down to
 * disk at the end of each txg.
 *
 * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
 * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
 * required for IO, ddt_lookup() is called. If an entry already exists on
 * ddt_tree, it is returned. Otherwise, a new one is created, and the
 * type/class objects for the DDT are searched for that key. If it's found, its
 * value is copied into the live entry. If not, an empty entry is created.
 *
 * The live entry will be modified during the txg, usually by modifying the
 * refcount, but sometimes by adding or updating DVAs. At the end of the txg
 * (during spa_sync()), type and class are recalculated for the entry (see
 * ddt_sync_entry()), and the entry is written to the appropriate storage
 * object and (if necessary) removed from an old one. ddt_tree is cleared and
 * the next txg can start.
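 *
 * In outline, ddt_lookup() (below) checks each layer in turn:
 *
 *	ddt_tree (live) -> dedup log (if DDT_FLAG_LOG) -> type/class store objects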
 *
 * ## Dedup quota
 *
 * A maximum size for all DDTs on the pool can be set with the
 * dedup_table_quota property. This is determined in ddt_over_quota() and
 * enforced during ddt_lookup(). If the pool is at or over its quota limit,
 * ddt_lookup() will only return entries for existing blocks, as updates are
 * still possible. New entries will not be created; instead, ddt_lookup() will
 * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
 * the D bit on the block and reissue the IO as a regular write. The block will
 * not be deduplicated.
 *
 * Note that this is based on the on-disk size of the dedup store. Reclaiming
 * this space after deleting entries relies on the ZAP "shrinking" behaviour,
 * without which no space would be recovered and the DDT would continue to be
 * considered "over quota". See zap_shrink_enabled.
 *
 * ## Dedup table pruning
 *
 * As a complement to the dedup quota feature, ddtprune allows removal of older
 * non-duplicate entries to make room for newer duplicate entries. The amount
 * to prune can be based on a target percentage of the unique entries or based
 * on the age (i.e., prune unique entries older than N days).
 *
 * ## Dedup log
 *
 * Historically, all entries modified in a txg were written back to dedup
 * storage objects at the end of every txg. This could cause significant
 * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
 * and so required reading the whole node, updating the entry, and writing it
 * back. On busy pools, this added serious IO and memory overhead.
 *
 * To address this, the dedup log was added. If the "fast_dedup" feature is
 * enabled, at the end of each txg, modified entries will be copied to an
 * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
 * same block is requested again, the in-memory object will be checked first,
 * and if it's there, the entry is inflated back onto the live tree without
 * going to storage. The on-disk log is only read at pool import time, to
 * reload the in-memory log.
 *
 * Each txg, some amount of the in-memory log will be flushed out to a DDT
 * storage object (i.e. ZAP) as normal. OpenZFS will try hard to flush enough
 * to keep up with the rate of change on dedup entries, but not so much that it
 * would impact overall throughput, and without using too much memory. See the
 * zfs_dedup_log_* tunables in zfs(4) for more details.
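 *
 * Internally there are two in-memory log trees: an "active" log that new
 * entries are appended to, and a "flushing" log that is being written out to
 * the store (see the dds_log_active_entries and dds_log_flushing_entries
 * kstats below). A lookup checks both, which is why an entry can be found
 * "from flushing" (DDE_FLAG_FROM_FLUSHING).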
 *
 * ## Repair IO
 *
 * If a read on a dedup block fails, but there are other copies of the block in
 * the other ddt_phys_t slots, reads will be issued for those instead
 * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
 * the caller, and a copy is stashed on the entry's dde_repair_abd.
 *
 * During the end-of-txg sync, any entries with a dde_repair_abd get a
 * "rewrite" write issued for the original block pointer, with the data read
 * from the alternate block. If the block is actually damaged, this will invoke
 * the pool's "self-healing" mechanism, and repair the block.
 *
 * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
 * use, so there is only ever one ddt_phys_t slot. The repair process will
 * still happen in this case, though it is unlikely to succeed as there will
 * usually be no other equivalent blocks to fall back on (though there might
 * be, if this was an early version of a dedup'd block that has since been
 * extended).
 *
 * Note that this repair mechanism is in addition to and separate from the
 * regular OpenZFS scrub and self-healing mechanisms.
 *
 * ## Scanning (scrub/resilver)
 *
 * If dedup is active, the scrub machinery will walk the dedup table first,
 * scrubbing all blocks with refcnt > 1. After that it will move on to the
 * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
 * In this way, heavily deduplicated blocks are only scrubbed once. See the
 * commentary on dsl_scan_ddt() for more details.
 *
 * Walking the DDT is done via ddt_walk(). The current position is stored in a
 * ddt_bookmark_t, which represents a stable position in the storage object.
 * This bookmark is stored by the scan machinery, and must reference the same
 * position on the object even if the object changes, the pool is exported, or
 * OpenZFS is upgraded.
 *
 * If the "fast_dedup" feature is enabled and the table has a log, the scan
 * cannot begin until entries on the log are flushed, as the on-disk log has no
 * concept of a "stable position". Instead, the log flushing process will enter
 * a more aggressive mode, to flush out as much as is necessary as soon as
 * possible, in order to begin the scan as soon as possible.
 *
 * ## Interaction with block cloning
 *
 * If block cloning and dedup are both enabled on a pool, BRT will look for the
 * dedup bit on an incoming block pointer. If set, it will call into the DDT
 * (ddt_addref()) to add a reference to the block, instead of adding a
 * reference to the BRT. See brt_pending_apply().
 */

/*
209
* These are the only checksums valid for dedup. They must match the list
210
* from dedup_table in zfs_prop.c
211
*/
212
#define DDT_CHECKSUM_VALID(c) \
213
(c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
214
c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
215
c == ZIO_CHECKSUM_BLAKE3)
216
217
static kmem_cache_t *ddt_cache;
218
219
static kmem_cache_t *ddt_entry_flat_cache;
220
static kmem_cache_t *ddt_entry_trad_cache;
221
222
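/*
 * An in-memory entry is a fixed ddt_entry_t header followed immediately by
 * its phys array; the size of that array depends on whether the table uses
 * the flat or the traditional phys layout.
 */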
#define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
223
#define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
224
225
#define DDT_ENTRY_SIZE(ddt) \
226
_DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
227
228
/*
229
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
230
*/
231
int zfs_dedup_prefetch = 0;
232
233
/*
234
* If the dedup class cannot satisfy a DDT allocation, treat as over quota
235
* for this many TXGs.
236
*/
237
uint_t dedup_class_wait_txgs = 5;
238
239
/*
 * How many DDT prune entries to add to the DDT sync AVL tree.
 * Note these additional entries have a memory footprint of a
 * ddt_entry_t (216 bytes).
 */
244
static uint32_t zfs_ddt_prunes_per_txg = 50000;
245
246
/*
247
* For testing, synthesize aged DDT entries
248
* (in global scope for ztest)
249
*/
250
boolean_t ddt_prune_artificial_age = B_FALSE;
251
boolean_t ddt_dump_prune_histogram = B_FALSE;
252
253
/*
254
* Minimum time to flush per txg.
255
*/
256
uint_t zfs_dedup_log_flush_min_time_ms = 1000;
257
258
/*
259
* Minimum entries to flush per txg.
260
*/
261
uint_t zfs_dedup_log_flush_entries_min = 200;
262
263
/*
264
* Target number of TXGs until the whole dedup log has been flushed.
265
* The log size will float around this value times the ingest rate.
266
*/
267
uint_t zfs_dedup_log_flush_txgs = 100;
268
269
/*
270
* Maximum entries to flush per txg. Used for testing the dedup log.
271
*/
272
uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
273
274
/*
275
* Soft cap for the size of the current dedup log. If the log is larger
276
* than this size, we slightly increase the aggressiveness of the flushing to
277
* try to bring it back down to the soft cap.
278
*/
279
uint_t zfs_dedup_log_cap = UINT_MAX;
280
281
/*
282
* If this is set to B_TRUE, the cap above acts more like a hard cap:
283
* flushing is significantly more aggressive, increasing the minimum amount we
284
* flush per txg, as well as the maximum.
285
*/
286
boolean_t zfs_dedup_log_hard_cap = B_FALSE;
287
288
/*
289
* Number of txgs to average flow rates across.
290
*/
291
uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;
292
293
static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
294
&ddt_zap_ops,
295
};
296
297
static const char *const ddt_class_name[DDT_CLASSES] = {
298
"ditto",
299
"duplicate",
300
"unique",
301
};
302
303
/*
304
* DDT feature flags automatically enabled for each on-disk version. Note that
305
* versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled.
306
*/
307
static const uint64_t ddt_version_flags[] = {
308
[DDT_VERSION_LEGACY] = 0,
309
[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
310
};
311
312
/* per-DDT kstats */
313
typedef struct {
314
/* total lookups and whether they returned new or existing entries */
315
kstat_named_t dds_lookup;
316
kstat_named_t dds_lookup_new;
317
kstat_named_t dds_lookup_existing;
318
319
/* entries found on live tree, and if we had to wait for load */
320
kstat_named_t dds_lookup_live_hit;
321
kstat_named_t dds_lookup_live_wait;
322
kstat_named_t dds_lookup_live_miss;
323
324
/* entries found on log trees */
325
kstat_named_t dds_lookup_log_hit;
326
kstat_named_t dds_lookup_log_active_hit;
327
kstat_named_t dds_lookup_log_flushing_hit;
328
kstat_named_t dds_lookup_log_miss;
329
330
/* entries found on store objects */
331
kstat_named_t dds_lookup_stored_hit;
332
kstat_named_t dds_lookup_stored_miss;
333
334
/* number of entries on log trees */
335
kstat_named_t dds_log_active_entries;
336
kstat_named_t dds_log_flushing_entries;
337
338
/* avg updated/flushed entries per txg */
339
kstat_named_t dds_log_ingest_rate;
340
kstat_named_t dds_log_flush_rate;
341
kstat_named_t dds_log_flush_time_rate;
342
} ddt_kstats_t;
343
344
static const ddt_kstats_t ddt_kstats_template = {
345
{ "lookup", KSTAT_DATA_UINT64 },
346
{ "lookup_new", KSTAT_DATA_UINT64 },
347
{ "lookup_existing", KSTAT_DATA_UINT64 },
348
{ "lookup_live_hit", KSTAT_DATA_UINT64 },
349
{ "lookup_live_wait", KSTAT_DATA_UINT64 },
350
{ "lookup_live_miss", KSTAT_DATA_UINT64 },
351
{ "lookup_log_hit", KSTAT_DATA_UINT64 },
352
{ "lookup_log_active_hit", KSTAT_DATA_UINT64 },
353
{ "lookup_log_flushing_hit", KSTAT_DATA_UINT64 },
354
{ "lookup_log_miss", KSTAT_DATA_UINT64 },
355
{ "lookup_stored_hit", KSTAT_DATA_UINT64 },
356
{ "lookup_stored_miss", KSTAT_DATA_UINT64 },
357
{ "log_active_entries", KSTAT_DATA_UINT64 },
358
{ "log_flushing_entries", KSTAT_DATA_UINT64 },
359
{ "log_ingest_rate", KSTAT_DATA_UINT32 },
360
{ "log_flush_rate", KSTAT_DATA_UINT32 },
361
{ "log_flush_time_rate", KSTAT_DATA_UINT32 },
362
};
363
364
#ifdef _KERNEL
365
/*
366
* Hot-path lookup counters use wmsums to avoid cache line bouncing.
367
* DDT_KSTAT_BUMP: Increment a wmsum counter (lookup stats).
368
*
369
* Sync-only counters use direct kstat assignment (no atomics needed).
370
* DDT_KSTAT_SET: Set a value (log entry counts, rates).
371
* DDT_KSTAT_SUB: Subtract from a value (decrement log entry counts).
372
* DDT_KSTAT_ZERO: Zero a value (clear log entry counts).
373
*/
374
#define _DDT_KSTAT_STAT(ddt, stat) \
375
&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
376
#define DDT_KSTAT_BUMP(ddt, stat) \
377
wmsum_add(&(ddt)->ddt_kstat_##stat, 1)
378
#define DDT_KSTAT_SUB(ddt, stat, val) \
379
do { *_DDT_KSTAT_STAT(ddt, stat) -= (val); } while (0)
380
#define DDT_KSTAT_SET(ddt, stat, val) \
381
do { *_DDT_KSTAT_STAT(ddt, stat) = (val); } while (0)
382
#define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0)
383
#else
384
#define DDT_KSTAT_BUMP(ddt, stat) do {} while (0)
385
#define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0)
386
#define DDT_KSTAT_SET(ddt, stat, val) do {} while (0)
387
#define DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
388
#endif /* _KERNEL */
389
390
391
static void
392
ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
393
dmu_tx_t *tx)
394
{
395
spa_t *spa = ddt->ddt_spa;
396
objset_t *os = ddt->ddt_os;
397
uint64_t *objectp = &ddt->ddt_object[type][class];
398
boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
399
ZCHECKSUM_FLAG_DEDUP;
400
char name[DDT_NAMELEN];
401
402
ASSERT3U(ddt->ddt_dir_object, >, 0);
403
404
ddt_object_name(ddt, type, class, name);
405
406
ASSERT0(*objectp);
407
VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
408
ASSERT3U(*objectp, !=, 0);
409
410
VERIFY0(dnode_hold(os, *objectp, ddt,
411
&ddt->ddt_object_dnode[type][class]));
412
413
ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
414
415
VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1,
416
objectp, tx));
417
418
VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
419
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
420
&ddt->ddt_histogram[type][class], tx));
421
}
422
423
static void
424
ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
425
dmu_tx_t *tx)
426
{
427
spa_t *spa = ddt->ddt_spa;
428
objset_t *os = ddt->ddt_os;
429
uint64_t *objectp = &ddt->ddt_object[type][class];
430
uint64_t count;
431
char name[DDT_NAMELEN];
432
433
ASSERT3U(ddt->ddt_dir_object, >, 0);
434
435
ddt_object_name(ddt, type, class, name);
436
437
ASSERT3U(*objectp, !=, 0);
438
ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
439
VERIFY0(ddt_object_count(ddt, type, class, &count));
440
VERIFY0(count);
441
VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx));
442
VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
443
if (ddt->ddt_object_dnode[type][class] != NULL) {
444
dnode_rele(ddt->ddt_object_dnode[type][class], ddt);
445
ddt->ddt_object_dnode[type][class] = NULL;
446
}
447
VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
448
memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
449
450
*objectp = 0;
451
}
452
453
static int
454
ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
455
{
456
ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
457
dmu_object_info_t doi;
458
uint64_t count;
459
char name[DDT_NAMELEN];
460
int error;
461
462
if (ddt->ddt_dir_object == 0) {
463
/*
464
* If we're configured but the containing dir doesn't exist
465
* yet, then this object can't possibly exist either.
466
*/
467
ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
468
return (SET_ERROR(ENOENT));
469
}
470
471
ddt_object_name(ddt, type, class, name);
472
473
error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
474
sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
475
if (error != 0)
476
return (error);
477
478
error = dnode_hold(ddt->ddt_os, ddt->ddt_object[type][class], ddt,
479
&ddt->ddt_object_dnode[type][class]);
480
if (error != 0)
481
return (error);
482
483
error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
484
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
485
&ddt->ddt_histogram[type][class]);
486
if (error != 0)
487
goto error;
488
489
/*
490
* Seed the cached statistics.
491
*/
492
error = ddt_object_info(ddt, type, class, &doi);
493
if (error)
494
goto error;
495
496
error = ddt_object_count(ddt, type, class, &count);
497
if (error)
498
goto error;
499
500
ddo->ddo_count = count;
501
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
502
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
503
504
return (0);
505
506
error:
507
dnode_rele(ddt->ddt_object_dnode[type][class], ddt);
508
ddt->ddt_object_dnode[type][class] = NULL;
509
return (error);
510
}
511
512
static void
513
ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
514
dmu_tx_t *tx)
515
{
516
ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
517
dmu_object_info_t doi;
518
uint64_t count;
519
char name[DDT_NAMELEN];
520
521
ddt_object_name(ddt, type, class, name);
522
523
VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
524
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
525
&ddt->ddt_histogram[type][class], tx));
526
527
/*
528
* Cache DDT statistics; this is the only time they'll change.
529
*/
530
VERIFY0(ddt_object_info(ddt, type, class, &doi));
531
VERIFY0(ddt_object_count(ddt, type, class, &count));
532
533
ddo->ddo_count = count;
534
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
535
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
536
}
537
538
static boolean_t
539
ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
540
{
541
return (!!ddt->ddt_object[type][class]);
542
}
543
544
static int
545
ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
546
ddt_entry_t *dde)
547
{
548
dnode_t *dn = ddt->ddt_object_dnode[type][class];
549
if (dn == NULL)
550
return (SET_ERROR(ENOENT));
551
552
return (ddt_ops[type]->ddt_op_lookup(dn, &dde->dde_key,
553
dde->dde_phys, DDT_PHYS_SIZE(ddt)));
554
}
555
556
static int
557
ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
558
const ddt_key_t *ddk)
559
{
560
dnode_t *dn = ddt->ddt_object_dnode[type][class];
561
if (dn == NULL)
562
return (SET_ERROR(ENOENT));
563
564
return (ddt_ops[type]->ddt_op_contains(dn, ddk));
565
}
566
567
static void
568
ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
569
const ddt_key_t *ddk)
570
{
571
dnode_t *dn = ddt->ddt_object_dnode[type][class];
572
if (dn == NULL)
573
return;
574
575
ddt_ops[type]->ddt_op_prefetch(dn, ddk);
576
}
577
578
static void
579
ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
580
{
581
dnode_t *dn = ddt->ddt_object_dnode[type][class];
582
if (dn == NULL)
583
return;
584
585
ddt_ops[type]->ddt_op_prefetch_all(dn);
586
}
587
588
static int
589
ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
590
const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
591
{
592
dnode_t *dn = ddt->ddt_object_dnode[type][class];
593
ASSERT(dn != NULL);
594
595
return (ddt_ops[type]->ddt_op_update(dn, &ddlwe->ddlwe_key,
596
&ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
597
}
598
599
static int
600
ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
601
const ddt_key_t *ddk, dmu_tx_t *tx)
602
{
603
dnode_t *dn = ddt->ddt_object_dnode[type][class];
604
ASSERT(dn != NULL);
605
606
return (ddt_ops[type]->ddt_op_remove(dn, ddk, tx));
607
}
608
609
int
610
ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
611
uint64_t *walk, ddt_lightweight_entry_t *ddlwe)
612
{
613
dnode_t *dn = ddt->ddt_object_dnode[type][class];
614
ASSERT(dn != NULL);
615
616
int error = ddt_ops[type]->ddt_op_walk(dn, walk, &ddlwe->ddlwe_key,
617
&ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
618
if (error == 0) {
619
ddlwe->ddlwe_type = type;
620
ddlwe->ddlwe_class = class;
621
return (0);
622
}
623
return (error);
624
}
625
626
int
627
ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
628
uint64_t *count)
629
{
630
dnode_t *dn = ddt->ddt_object_dnode[type][class];
631
ASSERT(dn != NULL);
632
633
return (ddt_ops[type]->ddt_op_count(dn, count));
634
}
635
636
int
637
ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
638
dmu_object_info_t *doi)
639
{
640
if (!ddt_object_exists(ddt, type, class))
641
return (SET_ERROR(ENOENT));
642
643
return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
644
doi));
645
}
646
647
void
648
ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
649
char *name)
650
{
651
(void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
652
zio_checksum_table[ddt->ddt_checksum].ci_name,
653
ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
654
}
655
656
void
657
ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
658
blkptr_t *bp, uint64_t txg)
659
{
660
ASSERT3U(txg, !=, 0);
661
ASSERT3U(v, <, DDT_PHYS_NONE);
662
uint64_t phys_birth;
663
const dva_t *dvap;
664
665
if (v == DDT_PHYS_FLAT) {
666
phys_birth = ddp->ddp_flat.ddp_phys_birth;
667
dvap = ddp->ddp_flat.ddp_dva;
668
} else {
669
phys_birth = ddp->ddp_trad[v].ddp_phys_birth;
670
dvap = ddp->ddp_trad[v].ddp_dva;
671
}
672
673
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
674
bp->blk_dva[d] = dvap[d];
675
BP_SET_BIRTH(bp, txg, phys_birth);
676
}
677
678
/*
679
* The bp created via this function may be used for repairs and scrub, but it
680
* will be missing the salt / IV required to do a full decrypting read.
681
*/
682
void
683
ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
684
const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp)
685
{
686
BP_ZERO(bp);
687
688
if (ddp != NULL)
689
ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v));
690
691
bp->blk_cksum = ddk->ddk_cksum;
692
693
BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
694
BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
695
BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
696
BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
697
BP_SET_FILL(bp, 1);
698
BP_SET_CHECKSUM(bp, checksum);
699
BP_SET_TYPE(bp, DMU_OT_DEDUP);
700
BP_SET_LEVEL(bp, 0);
701
BP_SET_DEDUP(bp, 1);
702
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
703
}
704
705
void
706
ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
707
{
708
ddk->ddk_cksum = bp->blk_cksum;
709
ddk->ddk_prop = 0;
710
711
ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
712
713
DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
714
DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
715
DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
716
DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
717
}
718
719
void
720
ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
721
{
722
ASSERT3U(v, <, DDT_PHYS_NONE);
723
int bp_ndvas = BP_GET_NDVAS(bp);
724
int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
725
SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
726
dva_t *dvas = (v == DDT_PHYS_FLAT) ?
727
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
728
729
int s = 0, d = 0;
730
while (s < bp_ndvas && d < ddp_max_dvas) {
731
if (DVA_IS_VALID(&dvas[d])) {
732
d++;
733
continue;
734
}
735
dvas[d] = bp->blk_dva[s];
736
s++; d++;
737
}
738
739
/*
740
* If the caller offered us more DVAs than we can fit, something has
741
* gone wrong in their accounting. zio_ddt_write() should never ask for
742
* more than we need.
743
*/
744
ASSERT3U(s, ==, bp_ndvas);
745
746
if (BP_IS_ENCRYPTED(bp))
747
dvas[2] = bp->blk_dva[2];
748
749
if (ddt_phys_birth(ddp, v) == 0) {
750
if (v == DDT_PHYS_FLAT) {
751
ddp->ddp_flat.ddp_phys_birth =
752
BP_GET_PHYSICAL_BIRTH(bp);
753
} else {
754
ddp->ddp_trad[v].ddp_phys_birth =
755
BP_GET_PHYSICAL_BIRTH(bp);
756
}
757
}
758
}
759
760
void
761
ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig,
762
ddt_phys_variant_t v)
763
{
764
ASSERT3U(v, <, DDT_PHYS_NONE);
765
dva_t *cur_dvas = (v == DDT_PHYS_FLAT) ?
766
cur->ddp_flat.ddp_dva : cur->ddp_trad[v].ddp_dva;
767
dva_t *orig_dvas = (v == DDT_PHYS_FLAT) ?
768
orig->ddp_flat.ddp_dva : orig->ddp_trad[v].ddp_dva;
769
770
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
771
cur_dvas[d] = orig_dvas[d];
772
773
if (ddt_phys_birth(orig, v) == 0) {
774
if (v == DDT_PHYS_FLAT)
775
cur->ddp_flat.ddp_phys_birth = 0;
776
else
777
cur->ddp_trad[v].ddp_phys_birth = 0;
778
}
779
}
780
781
void
782
ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
783
ddt_phys_variant_t v)
784
{
785
ASSERT3U(v, <, DDT_PHYS_NONE);
786
787
if (v == DDT_PHYS_FLAT)
788
dst->ddp_flat = src->ddp_flat;
789
else
790
dst->ddp_trad[v] = src->ddp_trad[v];
791
}
792
793
void
794
ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
795
{
796
ASSERT3U(v, <, DDT_PHYS_NONE);
797
798
if (v == DDT_PHYS_FLAT)
799
memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE);
800
else
801
memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
802
}
803
804
static uint64_t
805
ddt_class_start(void)
806
{
807
uint64_t start = gethrestime_sec();
808
809
if (unlikely(ddt_prune_artificial_age)) {
810
/*
 * debug aid -- simulate a wider distribution
 * so we don't have to wait for an aged DDT
 * to test prune.
 */
815
int range = 1 << 21;
816
int percent = random_in_range(100);
817
if (percent < 50) {
818
range = range >> 4;
819
} else if (percent > 75) {
820
range /= 2;
821
}
822
start -= random_in_range(range);
823
}
824
825
return (start);
826
}
827
828
void
829
ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
830
{
831
ASSERT3U(v, <, DDT_PHYS_NONE);
832
833
if (v == DDT_PHYS_FLAT)
834
ddp->ddp_flat.ddp_refcnt++;
835
else
836
ddp->ddp_trad[v].ddp_refcnt++;
837
}
838
839
uint64_t
840
ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
841
{
842
ASSERT3U(v, <, DDT_PHYS_NONE);
843
844
uint64_t *refcntp;
845
846
if (v == DDT_PHYS_FLAT)
847
refcntp = &ddp->ddp_flat.ddp_refcnt;
848
else
849
refcntp = &ddp->ddp_trad[v].ddp_refcnt;
850
851
ASSERT3U(*refcntp, >, 0);
852
(*refcntp)--;
853
return (*refcntp);
854
}
855
856
static void
857
ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp,
858
ddt_phys_variant_t v, uint64_t txg)
859
{
860
blkptr_t blk;
861
862
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
863
864
/*
865
* We clear the dedup bit so that zio_free() will actually free the
866
* space, rather than just decrementing the refcount in the DDT.
867
*/
868
BP_SET_DEDUP(&blk, 0);
869
870
ddt_phys_clear(ddp, v);
871
zio_free(ddt->ddt_spa, txg, &blk);
872
}
873
874
uint64_t
875
ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
876
{
877
ASSERT3U(v, <, DDT_PHYS_NONE);
878
879
if (v == DDT_PHYS_FLAT)
880
return (ddp->ddp_flat.ddp_phys_birth);
881
else
882
return (ddp->ddp_trad[v].ddp_phys_birth);
883
}
884
885
int
886
ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
887
{
888
ASSERT3U(v, <, DDT_PHYS_NONE);
889
890
const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
891
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
892
893
return (DVA_GET_GANG(&dvas[0]));
894
}
895
896
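/*
 * Count the valid DVAs in a phys. For encrypted blocks the third DVA slot
 * holds the salt/IV rather than a data copy (see ddt_bp_create() and
 * ddt_phys_extend() above), so it is not counted as a copy here.
 */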
int
897
ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
898
boolean_t encrypted)
899
{
900
ASSERT3U(v, <, DDT_PHYS_NONE);
901
902
const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
903
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
904
905
return (DVA_IS_VALID(&dvas[0]) +
906
DVA_IS_VALID(&dvas[1]) +
907
DVA_IS_VALID(&dvas[2]) * !encrypted);
908
}
909
910
ddt_phys_variant_t
911
ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
912
{
913
if (dde == NULL)
914
return (DDT_PHYS_NONE);
915
916
const ddt_univ_phys_t *ddp = dde->dde_phys;
917
918
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
919
if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
920
BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
921
return (DDT_PHYS_FLAT);
922
}
923
} else /* traditional phys */ {
924
for (int p = 0; p < DDT_PHYS_MAX; p++) {
925
if (DVA_EQUAL(BP_IDENTITY(bp),
926
&ddp->ddp_trad[p].ddp_dva[0]) &&
927
BP_GET_PHYSICAL_BIRTH(bp) ==
928
ddp->ddp_trad[p].ddp_phys_birth) {
929
return (p);
930
}
931
}
932
}
933
return (DDT_PHYS_NONE);
934
}
935
936
uint64_t
937
ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
938
{
939
ASSERT3U(v, <, DDT_PHYS_NONE);
940
941
if (v == DDT_PHYS_FLAT)
942
return (ddp->ddp_flat.ddp_refcnt);
943
else
944
return (ddp->ddp_trad[v].ddp_refcnt);
945
}
946
947
uint64_t
948
ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp)
949
{
950
uint64_t refcnt = 0;
951
952
if (ddt->ddt_flags & DDT_FLAG_FLAT)
953
refcnt = ddp->ddp_flat.ddp_refcnt;
954
else
955
for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++)
956
refcnt += ddp->ddp_trad[v].ddp_refcnt;
957
958
return (refcnt);
959
}
960
961
ddt_t *
962
ddt_select(spa_t *spa, const blkptr_t *bp)
963
{
964
ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
965
return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
966
}
967
968
void
969
ddt_enter(ddt_t *ddt)
970
{
971
mutex_enter(&ddt->ddt_lock);
972
}
973
974
void
975
ddt_exit(ddt_t *ddt)
976
{
977
mutex_exit(&ddt->ddt_lock);
978
}
979
980
void
981
ddt_init(void)
982
{
983
ddt_cache = kmem_cache_create("ddt_cache",
984
sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
985
ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
986
DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
987
ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
988
DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
989
990
ddt_log_init();
991
}
992
993
void
994
ddt_fini(void)
995
{
996
ddt_log_fini();
997
998
kmem_cache_destroy(ddt_entry_trad_cache);
999
kmem_cache_destroy(ddt_entry_flat_cache);
1000
kmem_cache_destroy(ddt_cache);
1001
}
1002
1003
static ddt_entry_t *
1004
ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
1005
{
1006
ddt_entry_t *dde;
1007
1008
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
1009
dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
1010
memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
1011
} else {
1012
dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
1013
memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
1014
}
1015
1016
cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
1017
1018
dde->dde_key = *ddk;
1019
1020
return (dde);
1021
}
1022
1023
void
1024
ddt_alloc_entry_io(ddt_entry_t *dde)
1025
{
1026
if (dde->dde_io != NULL)
1027
return;
1028
1029
dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
1030
mutex_init(&dde->dde_io->dde_io_lock, NULL, MUTEX_DEFAULT, NULL);
1031
}
1032
1033
static void
1034
ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
1035
{
1036
if (dde->dde_io != NULL) {
1037
for (int p = 0; p < DDT_NPHYS(ddt); p++)
1038
ASSERT0P(dde->dde_io->dde_lead_zio[p]);
1039
1040
if (dde->dde_io->dde_repair_abd != NULL)
1041
abd_free(dde->dde_io->dde_repair_abd);
1042
1043
mutex_destroy(&dde->dde_io->dde_io_lock);
1044
kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
1045
}
1046
1047
cv_destroy(&dde->dde_cv);
1048
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
1049
ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
1050
}
1051
1052
void
1053
ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
1054
{
1055
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1056
1057
avl_remove(&ddt->ddt_tree, dde);
1058
ddt_free(ddt, dde);
1059
}
1060
1061
/*
1062
* We're considered over quota when we hit 85% full, or for larger drives,
1063
* when there is less than 8GB free.
1064
*/
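/*
 * For example (illustrative): on a 1 TiB class, capacity - 8 GiB (1016 GiB)
 * exceeds 85% of capacity (~870 GiB), so the "8GB free" rule is the effective
 * limit; below roughly 53 GiB the 85% rule dominates instead.
 */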
1065
static boolean_t
1066
ddt_special_over_quota(metaslab_class_t *mc)
1067
{
1068
uint64_t allocated = metaslab_class_get_alloc(mc);
1069
uint64_t capacity = metaslab_class_get_space(mc);
1070
uint64_t limit = MAX(capacity * 85 / 100,
1071
(capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
1072
return (allocated >= limit);
1073
}
1074
1075
/*
1076
* Check if the DDT is over its quota. This can be due to a few conditions:
1077
* 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
1078
* exceeds this limit
1079
*
1080
* 2. 'dedup_table_quota' property is set to automatic and
1081
* a. the dedup or special allocation class could not satisfy a DDT
1082
* allocation in a recent transaction
1083
* b. the dedup or special allocation class has exceeded its 85% limit
1084
*/
1085
static boolean_t
1086
ddt_over_quota(spa_t *spa)
1087
{
1088
if (spa->spa_dedup_table_quota == 0)
1089
return (B_FALSE);
1090
1091
if (spa->spa_dedup_table_quota != UINT64_MAX)
1092
return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
1093
1094
/*
 * Over quota if we have to allocate outside of the dedup/special class.
 */
1097
if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
1098
dedup_class_wait_txgs) {
1099
/* Waiting for some deferred frees to be processed */
1100
return (B_TRUE);
1101
}
1102
1103
/*
1104
* For automatic quota, table size is limited by dedup or special class
1105
*/
1106
if (spa_has_dedup(spa))
1107
return (ddt_special_over_quota(spa_dedup_class(spa)));
1108
else if (spa_special_has_ddt(spa))
1109
return (ddt_special_over_quota(spa_special_class(spa)));
1110
1111
return (B_FALSE);
1112
}
1113
1114
void
1115
ddt_prefetch_all(spa_t *spa)
1116
{
1117
/*
 * Load all DDT entries for each type/class combination. This is
 * intended to perform a prefetch on all such blocks. For the same
 * reason that ddt_prefetch isn't locked, this is also not locked.
 */
1122
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1123
ddt_t *ddt = spa->spa_ddt[c];
1124
if (!ddt)
1125
continue;
1126
1127
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1128
for (ddt_class_t class = 0; class < DDT_CLASSES;
1129
class++) {
1130
ddt_object_prefetch_all(ddt, type, class);
1131
}
1132
}
1133
}
1134
}
1135
1136
static int ddt_configure(ddt_t *ddt, boolean_t new);
1137
1138
/*
1139
* If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
1140
* to the ones in the entry. If they're different, then the passed-in BP is
1141
* from a previous generation of this entry (ie was previously pruned) and we
1142
* have to act like the entry doesn't exist at all.
1143
*
1144
* This should only happen during a lookup to free the block (zio_ddt_free()).
1145
*
1146
* XXX this is similar in spirit to ddt_phys_select(), maybe can combine
1147
* -- robn, 2024-02-09
1148
*/
1149
static boolean_t
1150
ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde)
1151
{
1152
/* If the BP has no DVAs, then this entry is good */
1153
uint_t ndvas = BP_GET_NDVAS(bp);
1154
if (ndvas == 0)
1155
return (B_TRUE);
1156
1157
/*
1158
* Only checking the phys for the copies. For flat, there's only one;
1159
* for trad it'll be the one that has the matching set of DVAs.
1160
*/
1161
const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
1162
dde->dde_phys->ddp_flat.ddp_dva :
1163
dde->dde_phys->ddp_trad[ndvas].ddp_dva;
1164
1165
/*
 * Compare entry DVAs with the BP. They should all be there, but
 * there's not really anything we can do if it's only partial anyway;
 * that's an error somewhere else, maybe long ago.
 */
1170
uint_t d;
1171
for (d = 0; d < ndvas; d++)
1172
if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d]))
1173
return (B_FALSE);
1174
ASSERT3U(d, ==, ndvas);
1175
1176
return (B_TRUE);
1177
}
1178
1179
ddt_entry_t *
1180
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
1181
{
1182
spa_t *spa = ddt->ddt_spa;
1183
ddt_key_t search;
1184
ddt_entry_t *dde;
1185
ddt_type_t type;
1186
ddt_class_t class;
1187
avl_index_t where;
1188
int error;
1189
1190
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1191
1192
if (unlikely(ddt->ddt_version == DDT_VERSION_UNCONFIGURED)) {
1193
/*
1194
* This is the first use of this DDT since the pool was
1195
* created; finish getting it ready for use.
1196
*/
1197
VERIFY0(ddt_configure(ddt, B_TRUE));
1198
ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
1199
}
1200
1201
DDT_KSTAT_BUMP(ddt, dds_lookup);
1202
1203
ddt_key_fill(&search, bp);
1204
1205
/* Find an existing live entry */
1206
dde = avl_find(&ddt->ddt_tree, &search, &where);
1207
if (dde != NULL) {
1208
/* If we went over quota, act like we didn't find it */
1209
if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
1210
return (NULL);
1211
1212
/* If it's already loaded, we can just return it. */
1213
DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
1214
if (dde->dde_flags & DDE_FLAG_LOADED) {
1215
if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1216
return (dde);
1217
return (NULL);
1218
}
1219
1220
/* Someone else is loading it, wait for it. */
1221
dde->dde_waiters++;
1222
DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait);
1223
while (!(dde->dde_flags & DDE_FLAG_LOADED))
1224
cv_wait(&dde->dde_cv, &ddt->ddt_lock);
1225
dde->dde_waiters--;
1226
1227
/* Loaded but over quota, forget we were ever here */
1228
if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
1229
if (dde->dde_waiters == 0) {
1230
avl_remove(&ddt->ddt_tree, dde);
1231
ddt_free(ddt, dde);
1232
}
1233
return (NULL);
1234
}
1235
1236
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1237
1238
/* Make sure the loaded entry matches the BP */
1239
if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1240
return (dde);
1241
return (NULL);
1242
} else
1243
DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
1244
1245
/* Time to make a new entry. */
1246
dde = ddt_alloc(ddt, &search);
1247
avl_insert(&ddt->ddt_tree, dde, where);
1248
1249
/*
1250
* The entry in ddt_tree has no DDE_FLAG_LOADED, so other possible
1251
* threads will wait even while we drop the lock.
1252
*/
1253
ddt_exit(ddt);
1254
1255
/*
1256
* If there is a log, we should try to "load" from there first.
1257
*/
1258
if (ddt->ddt_flags & DDT_FLAG_LOG) {
1259
ddt_lightweight_entry_t ddlwe;
1260
boolean_t from_flushing;
1261
1262
/* Read-only search, no locks needed (logs stable during I/O) */
1263
if (ddt_log_find_key(ddt, &search, &ddlwe, &from_flushing)) {
1264
dde->dde_type = ddlwe.ddlwe_type;
1265
dde->dde_class = ddlwe.ddlwe_class;
1266
memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
1267
DDT_PHYS_SIZE(ddt));
1268
1269
/*
1270
* Check validity. If invalid and no waiters, clean up
1271
* immediately. Otherwise continue setup for waiters.
1272
*/
1273
boolean_t valid = !verify ||
1274
ddt_entry_lookup_is_valid(ddt, bp, dde);
1275
ddt_enter(ddt);
1276
if (!valid && dde->dde_waiters == 0) {
1277
avl_remove(&ddt->ddt_tree, dde);
1278
ddt_free(ddt, dde);
1279
return (NULL);
1280
}
1281
1282
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
1283
if (from_flushing) {
1284
dde->dde_flags |= DDE_FLAG_FROM_FLUSHING;
1285
DDT_KSTAT_BUMP(ddt,
1286
dds_lookup_log_flushing_hit);
1287
} else {
1288
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
1289
}
1290
1291
DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
1292
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1293
1294
cv_broadcast(&dde->dde_cv);
1295
1296
return (valid ? dde : NULL);
1297
}
1298
1299
DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
1300
}
1301
1302
/* Search all store objects for the entry. */
1303
error = ENOENT;
1304
for (type = 0; type < DDT_TYPES; type++) {
1305
for (class = 0; class < DDT_CLASSES; class++) {
1306
error = ddt_object_lookup(ddt, type, class, dde);
1307
if (error != ENOENT) {
1308
ASSERT0(error);
1309
break;
1310
}
1311
}
1312
if (error != ENOENT)
1313
break;
1314
}
1315
1316
ddt_enter(ddt);
1317
1318
ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED));
1319
1320
dde->dde_type = type; /* will be DDT_TYPES if no entry found */
1321
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
1322
1323
boolean_t valid = B_TRUE;
1324
1325
if (dde->dde_type == DDT_TYPES &&
1326
dde->dde_class == DDT_CLASSES &&
1327
ddt_over_quota(spa)) {
1328
/* Over quota. If no one is waiting, clean up right now. */
1329
if (dde->dde_waiters == 0) {
1330
avl_remove(&ddt->ddt_tree, dde);
1331
ddt_free(ddt, dde);
1332
return (NULL);
1333
}
1334
1335
/* Flag cleanup required */
1336
dde->dde_flags |= DDE_FLAG_OVERQUOTA;
1337
} else if (error == 0) {
1338
/*
 * If what we loaded is no good for this BP and there's no one
 * waiting for it, we can just remove it and get out. If it's no
 * good but there are waiters, we have to leave it, because we
 * don't know what they want. If it's not needed we'll end up
 * taking an entry update at log/sync, but that can only happen if
 * more than one previous version of this block is being deleted at
 * the same time. This is extremely unlikely to happen and not
 * worth the effort to deal with without taking an entry
 * update.
 */
1349
valid = !verify || ddt_entry_lookup_is_valid(ddt, bp, dde);
1350
if (!valid && dde->dde_waiters == 0) {
1351
avl_remove(&ddt->ddt_tree, dde);
1352
ddt_free(ddt, dde);
1353
return (NULL);
1354
}
1355
1356
DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
1357
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1358
1359
/*
 * The histograms only track inactive (stored or logged) blocks.
 * We've just put an entry onto the live list, so we need to
 * remove its counts. When it's synced back, it'll be re-added
 * to the right one.
 *
 * We only do this when we successfully found it in the store.
 * error == ENOENT means this is a new entry, and so it's already
 * not counted.
 */
1369
ddt_histogram_t *ddh =
1370
&ddt->ddt_histogram[dde->dde_type][dde->dde_class];
1371
1372
ddt_lightweight_entry_t ddlwe;
1373
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
1374
ddt_histogram_sub_entry(ddt, ddh, &ddlwe);
1375
} else {
1376
DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss);
1377
DDT_KSTAT_BUMP(ddt, dds_lookup_new);
1378
}
1379
1380
/* Entry loaded, everyone can proceed now */
1381
dde->dde_flags |= DDE_FLAG_LOADED;
1382
cv_broadcast(&dde->dde_cv);
1383
1384
if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid)
1385
return (NULL);
1386
1387
return (dde);
1388
}
1389
1390
void
1391
ddt_prefetch(spa_t *spa, const blkptr_t *bp)
1392
{
1393
ddt_t *ddt;
1394
ddt_key_t ddk;
1395
1396
if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
1397
return;
1398
1399
/*
1400
* We only remove the DDT once all tables are empty and only
1401
* prefetch dedup blocks when there are entries in the DDT.
1402
* Thus no locking is required as the DDT can't disappear on us.
1403
*/
1404
ddt = ddt_select(spa, bp);
1405
ddt_key_fill(&ddk, bp);
1406
1407
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1408
for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1409
ddt_object_prefetch(ddt, type, class, &ddk);
1410
}
1411
}
1412
}
1413
1414
/*
1415
* ddt_key_t comparison. Any struct wanting to make use of this function must
1416
* have the key as the first element. Casts it to N uint64_ts, and checks until
1417
* we find there's a difference. This is intended to match how ddt_zap.c drives
1418
* the ZAPs (first uint64_t as the key prehash), which will minimise the number
1419
* of ZAP blocks touched when flushing logged entries from an AVL walk. This is
1420
* not an invariant for this function though, should you wish to change it.
1421
*/
1422
int
1423
ddt_key_compare(const void *x1, const void *x2)
1424
{
1425
const uint64_t *k1 = (const uint64_t *)x1;
1426
const uint64_t *k2 = (const uint64_t *)x2;
1427
1428
int cmp;
1429
for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++)
1430
if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0))
1431
return (cmp);
1432
1433
return (0);
1434
}
1435
1436
/* Create the containing dir for this DDT and bump the feature count */
1437
static void
1438
ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx)
1439
{
1440
ASSERT0(ddt->ddt_dir_object);
1441
ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1442
1443
char name[DDT_NAMELEN];
1444
snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1445
zio_checksum_table[ddt->ddt_checksum].ci_name);
1446
1447
ddt->ddt_dir_object = zap_create_link(ddt->ddt_os,
1448
DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx);
1449
1450
VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION,
1451
sizeof (uint64_t), 1, &ddt->ddt_version, tx));
1452
VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS,
1453
sizeof (uint64_t), 1, &ddt->ddt_flags, tx));
1454
1455
spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1456
}
1457
1458
/* Destroy the containing dir and deactivate the feature */
1459
static void
1460
ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
1461
{
1462
ASSERT3U(ddt->ddt_dir_object, !=, 0);
1463
ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT);
1464
ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1465
1466
char name[DDT_NAMELEN];
1467
snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1468
zio_checksum_table[ddt->ddt_checksum].ci_name);
1469
1470
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1471
for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1472
ASSERT(!ddt_object_exists(ddt, type, class));
1473
}
1474
}
1475
1476
ddt_log_destroy(ddt, tx);
1477
1478
uint64_t count;
1479
ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
1480
ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
1481
DDT_DIR_VERSION));
1482
ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS));
1483
ASSERT3U(count, ==, 2);
1484
1485
VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
1486
VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx));
1487
1488
ddt->ddt_dir_object = 0;
1489
1490
spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1491
}
1492
1493
/*
 * Determine version, flags and on-disk layout from what's already stored. If
 * there's nothing stored, then if new is false, returns ENOENT, and if true,
 * selects based on pool config.
 */
1498
static int
1499
ddt_configure(ddt_t *ddt, boolean_t new)
1500
{
1501
spa_t *spa = ddt->ddt_spa;
1502
char name[DDT_NAMELEN];
1503
int error;
1504
1505
ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE);
1506
1507
boolean_t fdt_enabled =
1508
spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP);
1509
boolean_t fdt_active =
1510
spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP);
1511
1512
/*
 * First, look for the global DDT stats object. If it's not there, then
 * there's never been a DDT written before, and we know we're
 * starting from scratch.
 */
1517
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1518
DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1519
&spa->spa_ddt_stat_object);
1520
if (error != 0) {
1521
if (error != ENOENT)
1522
return (error);
1523
goto not_found;
1524
}
1525
1526
if (fdt_active) {
1527
/*
1528
* Now look for a DDT directory. If it exists, then it has
1529
* everything we need.
1530
*/
1531
snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1532
zio_checksum_table[ddt->ddt_checksum].ci_name);
1533
1534
error = zap_lookup(spa->spa_meta_objset,
1535
DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1536
&ddt->ddt_dir_object);
1537
if (error == 0) {
1538
ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os);
1539
1540
error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1541
DDT_DIR_VERSION, sizeof (uint64_t), 1,
1542
&ddt->ddt_version);
1543
if (error != 0)
1544
return (error);
1545
1546
error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1547
DDT_DIR_FLAGS, sizeof (uint64_t), 1,
1548
&ddt->ddt_flags);
1549
if (error != 0)
1550
return (error);
1551
1552
if (ddt->ddt_version != DDT_VERSION_FDT) {
1553
zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1554
"unknown version %llu", spa_name(spa),
1555
name, (u_longlong_t)ddt->ddt_version);
1556
return (SET_ERROR(EINVAL));
1557
}
1558
1559
if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) {
1560
zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1561
"version=%llu unknown flags %llx",
1562
spa_name(spa), name,
1563
(u_longlong_t)ddt->ddt_flags,
1564
(u_longlong_t)ddt->ddt_version);
1565
return (SET_ERROR(EINVAL));
1566
}
1567
1568
return (0);
1569
}
1570
if (error != ENOENT)
1571
return (error);
1572
}
1573
1574
/* Any object in the root indicates a traditional setup. */
1575
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1576
for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1577
ddt_object_name(ddt, type, class, name);
1578
uint64_t obj;
1579
error = zap_lookup(spa->spa_meta_objset,
1580
DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t),
1581
1, &obj);
1582
if (error == ENOENT)
1583
continue;
1584
if (error != 0)
1585
return (error);
1586
1587
ddt->ddt_version = DDT_VERSION_LEGACY;
1588
ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1589
ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1590
1591
return (0);
1592
}
1593
}
1594
1595
not_found:
1596
if (!new)
1597
return (SET_ERROR(ENOENT));
1598
1599
/* Nothing on disk, so set up for the best version we can */
1600
if (fdt_enabled) {
1601
ddt->ddt_version = DDT_VERSION_FDT;
1602
ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1603
ddt->ddt_dir_object = 0; /* create on first use */
1604
} else {
1605
ddt->ddt_version = DDT_VERSION_LEGACY;
1606
ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1607
ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1608
}
1609
1610
return (0);
1611
}
1612
1613
static int
1614
ddt_kstat_update(kstat_t *ksp, int rw)
1615
{
1616
ddt_t *ddt = ksp->ks_private;
1617
ddt_kstats_t *dds = ksp->ks_data;
1618
1619
if (rw == KSTAT_WRITE)
1620
return (SET_ERROR(EACCES));
1621
1622
/* Aggregate wmsum counters for lookup stats */
1623
dds->dds_lookup.value.ui64 =
1624
wmsum_value(&ddt->ddt_kstat_dds_lookup);
1625
dds->dds_lookup_live_hit.value.ui64 =
1626
wmsum_value(&ddt->ddt_kstat_dds_lookup_live_hit);
1627
dds->dds_lookup_live_wait.value.ui64 =
1628
wmsum_value(&ddt->ddt_kstat_dds_lookup_live_wait);
1629
dds->dds_lookup_live_miss.value.ui64 =
1630
wmsum_value(&ddt->ddt_kstat_dds_lookup_live_miss);
1631
dds->dds_lookup_existing.value.ui64 =
1632
wmsum_value(&ddt->ddt_kstat_dds_lookup_existing);
1633
dds->dds_lookup_new.value.ui64 =
1634
wmsum_value(&ddt->ddt_kstat_dds_lookup_new);
1635
dds->dds_lookup_log_hit.value.ui64 =
1636
wmsum_value(&ddt->ddt_kstat_dds_lookup_log_hit);
1637
dds->dds_lookup_log_active_hit.value.ui64 =
1638
wmsum_value(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1639
dds->dds_lookup_log_flushing_hit.value.ui64 =
1640
wmsum_value(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1641
dds->dds_lookup_log_miss.value.ui64 =
1642
wmsum_value(&ddt->ddt_kstat_dds_lookup_log_miss);
1643
dds->dds_lookup_stored_hit.value.ui64 =
1644
wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_hit);
1645
dds->dds_lookup_stored_miss.value.ui64 =
1646
wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_miss);
1647
1648
/* Sync-only counters are already set directly in kstats */
1649
1650
return (0);
1651
}
1652
1653
static void
1654
ddt_table_alloc_kstats(ddt_t *ddt)
1655
{
1656
char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
1657
char *name = kmem_asprintf("ddt_stats_%s",
1658
zio_checksum_table[ddt->ddt_checksum].ci_name);
1659
1660
/* Initialize wmsums for lookup counters */
1661
wmsum_init(&ddt->ddt_kstat_dds_lookup, 0);
1662
wmsum_init(&ddt->ddt_kstat_dds_lookup_live_hit, 0);
1663
wmsum_init(&ddt->ddt_kstat_dds_lookup_live_wait, 0);
1664
wmsum_init(&ddt->ddt_kstat_dds_lookup_live_miss, 0);
1665
wmsum_init(&ddt->ddt_kstat_dds_lookup_existing, 0);
1666
wmsum_init(&ddt->ddt_kstat_dds_lookup_new, 0);
1667
wmsum_init(&ddt->ddt_kstat_dds_lookup_log_hit, 0);
1668
wmsum_init(&ddt->ddt_kstat_dds_lookup_log_active_hit, 0);
1669
wmsum_init(&ddt->ddt_kstat_dds_lookup_log_flushing_hit, 0);
1670
wmsum_init(&ddt->ddt_kstat_dds_lookup_log_miss, 0);
1671
wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_hit, 0);
1672
wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_miss, 0);
1673
1674
ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED,
1675
sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1676
if (ddt->ddt_ksp != NULL) {
1677
ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP);
1678
memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t));
1679
ddt->ddt_ksp->ks_data = dds;
1680
ddt->ddt_ksp->ks_update = ddt_kstat_update;
1681
ddt->ddt_ksp->ks_private = ddt;
1682
kstat_install(ddt->ddt_ksp);
1683
}
1684
1685
kmem_strfree(name);
1686
kmem_strfree(mod);
1687
}
1688
1689
static ddt_t *
1690
ddt_table_alloc(spa_t *spa, enum zio_checksum c)
1691
{
1692
ddt_t *ddt;
1693
1694
ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
1695
memset(ddt, 0, sizeof (ddt_t));
1696
mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
1697
avl_create(&ddt->ddt_tree, ddt_key_compare,
1698
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1699
avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
1700
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1701
1702
ddt->ddt_checksum = c;
1703
ddt->ddt_spa = spa;
1704
ddt->ddt_os = spa->spa_meta_objset;
1705
ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
1706
ddt->ddt_log_flush_pressure = 10;
1707
1708
ddt_log_alloc(ddt);
1709
ddt_table_alloc_kstats(ddt);
1710
1711
return (ddt);
1712
}

static void
ddt_table_free(ddt_t *ddt)
{
	if (ddt->ddt_ksp != NULL) {
		kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
		ddt->ddt_ksp->ks_data = NULL;
		kstat_delete(ddt->ddt_ksp);
	}

	/* Cleanup wmsums for lookup counters */
	wmsum_fini(&ddt->ddt_kstat_dds_lookup);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_hit);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_wait);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_miss);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_existing);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_new);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_hit);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_active_hit);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_miss);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_hit);
	wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_miss);

	ddt_log_free(ddt);
	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
			if (ddt->ddt_object_dnode[type][class] != NULL) {
				dnode_rele(ddt->ddt_object_dnode[type][class],
				    ddt);
				ddt->ddt_object_dnode[type][class] = NULL;
			}
		}
	}
	ASSERT0(avl_numnodes(&ddt->ddt_tree));
	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
	avl_destroy(&ddt->ddt_tree);
	avl_destroy(&ddt->ddt_repair_tree);
	mutex_destroy(&ddt->ddt_lock);
	kmem_cache_free(ddt_cache, ddt);
}

void
ddt_create(spa_t *spa)
{
	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		if (DDT_CHECKSUM_VALID(c))
			spa->spa_ddt[c] = ddt_table_alloc(spa, c);
	}
}

int
ddt_load(spa_t *spa)
{
	int error;

	ddt_create(spa);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
	    &spa->spa_ddt_stat_object);
	if (error)
		return (error == ENOENT ? 0 : error);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		if (!DDT_CHECKSUM_VALID(c))
			continue;

		ddt_t *ddt = spa->spa_ddt[c];
		error = ddt_configure(ddt, B_FALSE);
		if (error == ENOENT)
			continue;
		if (error != 0)
			return (error);

		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
			for (ddt_class_t class = 0; class < DDT_CLASSES;
			    class++) {
				error = ddt_object_load(ddt, type, class);
				if (error != 0 && error != ENOENT)
					return (error);
			}
		}

		if (ddt->ddt_flags & DDT_FLAG_LOG) {
			error = ddt_log_load(ddt);
			if (error != 0 && error != ENOENT)
				return (error);
		}

		DDT_KSTAT_SET(ddt, dds_log_active_entries,
		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));
		DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
		    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));

		/*
		 * Seed the cached histograms.
		 */
		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
		    sizeof (ddt->ddt_histogram));
	}

	spa->spa_dedup_dspace = ~0ULL;
	spa->spa_dedup_dsize = ~0ULL;

	return (0);
}

void
ddt_unload(spa_t *spa)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		if (spa->spa_ddt[c]) {
			ddt_table_free(spa->spa_ddt[c]);
			spa->spa_ddt[c] = NULL;
		}
	}
}

boolean_t
ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_key_t ddk;

	if (!BP_GET_DEDUP(bp))
		return (B_FALSE);

	if (max_class == DDT_CLASS_UNIQUE)
		return (B_TRUE);

	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];

	ddt_key_fill(&ddk, bp);

	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
		for (ddt_class_t class = 0; class <= max_class; class++) {
			if (ddt_object_contains(ddt, type, class, &ddk) == 0)
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

ddt_entry_t *
ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
{
	ddt_key_t ddk;
	ddt_entry_t *dde;

	ddt_key_fill(&ddk, bp);

	dde = ddt_alloc(ddt, &ddk);
	ddt_alloc_entry_io(dde);

	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
			/*
			 * We can only do repair if there are multiple copies
			 * of the block. For anything in the UNIQUE class,
			 * there's definitely only one copy, so don't even try.
			 */
			if (class != DDT_CLASS_UNIQUE &&
			    ddt_object_lookup(ddt, type, class, dde) == 0)
				return (dde);
		}
	}

	memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt));

	return (dde);
}

void
ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
{
	avl_index_t where;

	ddt_enter(ddt);

	if (dde->dde_io->dde_repair_abd != NULL &&
	    spa_writeable(ddt->ddt_spa) &&
	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
		avl_insert(&ddt->ddt_repair_tree, dde, where);
	else
		ddt_free(ddt, dde);

	ddt_exit(ddt);
}

static void
ddt_repair_entry_done(zio_t *zio)
{
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *rdde = zio->io_private;

	ddt_free(ddt, rdde);
}

static void
ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
{
	ddt_key_t *ddk = &dde->dde_key;
	ddt_key_t *rddk = &rdde->dde_key;
	zio_t *zio;
	blkptr_t blk;

	zio = zio_null(rio, rio->io_spa, NULL,
	    ddt_repair_entry_done, rdde, rio->io_flags);

	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		ddt_univ_phys_t *ddp = dde->dde_phys;
		ddt_univ_phys_t *rddp = rdde->dde_phys;
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
		uint64_t phys_birth = ddt_phys_birth(ddp, v);
		const dva_t *dvas, *rdvas;

		if (ddt->ddt_flags & DDT_FLAG_FLAT) {
			dvas = ddp->ddp_flat.ddp_dva;
			rdvas = rddp->ddp_flat.ddp_dva;
		} else {
			dvas = ddp->ddp_trad[p].ddp_dva;
			rdvas = rddp->ddp_trad[p].ddp_dva;
		}

		if (phys_birth == 0 ||
		    phys_birth != ddt_phys_birth(rddp, v) ||
		    memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP))
			continue;

		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
		    rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
		    NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
		    ZIO_DDT_CHILD_FLAGS(zio), NULL));
	}

	zio_nowait(zio);
}

static void
ddt_repair_table(ddt_t *ddt, zio_t *rio)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_entry_t *dde, *rdde_next, *rdde;
	avl_tree_t *t = &ddt->ddt_repair_tree;
	blkptr_t blk;

	if (spa_sync_pass(spa) > 1)
		return;

	ddt_enter(ddt);
	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
		rdde_next = AVL_NEXT(t, rdde);
		avl_remove(&ddt->ddt_repair_tree, rdde);
		ddt_exit(ddt);
		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL,
		    DDT_PHYS_NONE, &blk);
		dde = ddt_repair_start(ddt, &blk);
		ddt_repair_entry(ddt, dde, rdde, rio);
		ddt_repair_done(ddt, dde);
		ddt_enter(ddt);
	}
	ddt_exit(ddt);
}

static void
ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx)
{
	/*
	 * Count all the entries stored for each type/class, and update the
	 * stats within (ddt_object_sync()). If there are no entries for the
	 * type/class, the whole object is removed. If all objects for the DDT
	 * are removed, its containing dir is removed, effectively resetting
	 * the entire DDT to an empty slate.
	 */
	uint64_t count = 0;
	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
		uint64_t add, tcount = 0;
		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
			if (ddt_object_exists(ddt, type, class)) {
				ddt_object_sync(ddt, type, class, tx);
				VERIFY0(ddt_object_count(ddt, type, class,
				    &add));
				tcount += add;
			}
		}
		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
			if (tcount == 0 && ddt_object_exists(ddt, type, class))
				ddt_object_destroy(ddt, type, class, tx);
		}
		count += tcount;
	}

	if (ddt->ddt_flags & DDT_FLAG_LOG) {
		/* Include logged entries in the total count */
		count += avl_numnodes(&ddt->ddt_log_active->ddl_tree);
		count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	}

	if (count == 0) {
		/*
		 * No entries left on the DDT, so reset the version for next
		 * time. This allows us to handle the feature being changed
		 * since the DDT was originally created. New entries should get
		 * whatever the feature currently demands.
		 */
		if (ddt->ddt_version == DDT_VERSION_FDT)
			ddt_destroy_dir(ddt, tx);

		ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
		ddt->ddt_flags = 0;
	}

	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
	    sizeof (ddt->ddt_histogram));
	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
}

static void
ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;

	/*
	 * Compute the target class, so we can decide whether or not to inform
	 * the scrub traversal (below). Note that we don't store this in the
	 * entry, as it might change multiple times before finally being
	 * committed (if we're logging). Instead, we recompute it in
	 * ddt_sync_entry().
	 */
	uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys);
	ddt_class_t nclass =
	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;

	/*
	 * If the class changes, the order that we scan this bp changes. If it
	 * decreases, we could miss it, so scan it right now. (This covers both
	 * class changing while we are doing ddt_walk(), and when we are
	 * traversing.)
	 *
	 * We also do this when the refcnt goes to zero, because that change is
	 * only in the log so far; the blocks on disk won't be freed until
	 * the log is flushed, and the refcnt might increase before that. If it
	 * does, then we could miss it in the same way.
	 */
	if (refcnt == 0 || nclass < ddlwe->ddlwe_class)
		dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt,
		    ddlwe, tx);
}

static void
ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
    ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx)
{
	ddt_key_t *ddk = &ddlwe->ddlwe_key;
	ddt_type_t ntype = DDT_TYPE_DEFAULT;
	uint64_t refcnt = 0;

	/*
	 * Compute the total refcnt. Along the way, issue frees for any DVAs
	 * we no longer want.
	 */
	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);

		if (ddt_phys_birth(ddp, v) == 0) {
			ASSERT0(phys_refcnt);
			continue;
		}
		if (DDT_PHYS_IS_DITTO(ddt, p)) {
			/*
			 * We don't want to keep any obsolete slots (eg ditto),
			 * regardless of their refcount, but we don't want to
			 * leak them either. So, free them.
			 */
			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
			continue;
		}
		if (phys_refcnt == 0)
			/* No remaining references, free it! */
			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
		refcnt += phys_refcnt;
	}

	/* Select the best class for the entry. */
	ddt_class_t nclass =
	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;

	/*
	 * If an existing entry changed type or class, or its refcount reached
	 * zero, delete it from the DDT object
	 */
	if (otype != DDT_TYPES &&
	    (otype != ntype || oclass != nclass || refcnt == 0)) {
		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
		ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT);
	}

	/*
	 * Add or update the entry
	 */
	if (refcnt != 0) {
		ddt_histogram_t *ddh =
		    &ddt->ddt_histogram[ntype][nclass];

		ddt_histogram_add_entry(ddt, ddh, ddlwe);

		if (!ddt_object_exists(ddt, ntype, nclass))
			ddt_object_create(ddt, ntype, nclass, tx);
		VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx));
	}
}

/* Calculate an exponential weighted moving average, lower limited to zero */
static inline int32_t
_ewma(int32_t val, int32_t prev, uint32_t weight)
{
	ASSERT3U(val, >=, 0);
	ASSERT3U(prev, >=, 0);
	const int32_t new =
	    MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
	ASSERT3U(new, >=, 0);
	return (new);
}
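
/*
 * Worked example (illustrative values only): with weight = 32, prev = 100 and
 * val = 164, the new average is 100 + (164 - 100) / 32 = 102. Each call moves
 * the average roughly 1/weight of the way toward the new sample, so a larger
 * weight (here, zfs_dedup_log_flush_flow_rate_txgs) makes the rates computed
 * below respond more slowly to change.
 */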

static inline void
ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
{
	/*
	 * If we're not forcing flush, and not being asked to start, then
	 * there's nothing more to do.
	 */
	if (txg == 0) {
		/* Update requested, are we currently forcing flush? */
		if (ddt->ddt_flush_force_txg == 0)
			return;
		txg = ddt->ddt_flush_force_txg;
	}

	/*
	 * If either of the logs has unflushed entries before the wanted
	 * txg, set the force txg, otherwise clear it.
	 */

	if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
	    ddt->ddt_log_active->ddl_first_txg <= txg) ||
	    (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
	    ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
		ddt->ddt_flush_force_txg = txg;
		return;
	}

	/*
	 * Nothing to flush behind the given txg, so we can clear force flush
	 * state.
	 */
	ddt->ddt_flush_force_txg = 0;
}
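
/*
 * For reference: ddt_walk_init() calls this with a real txg to request that
 * everything logged up to that txg be flushed out before a walk proceeds,
 * while ddt_sync_flush_log() calls it with txg == 0 after flushing, to
 * re-evaluate and drop the force-flush state once nothing older remains.
 */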

static void
ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	ASSERT(avl_is_empty(&ddt->ddt_tree));

	/*
	 * Don't do any flushing when the pool is ready to shut down, or in
	 * passes beyond the first.
	 */
	if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa))
		return;

	hrtime_t flush_start = gethrtime();
	uint32_t count = 0;

	/*
	 * How many entries we need to flush. We need to at
	 * least match the ingest rate, and also consider the
	 * current backlog of entries.
	 */
	uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) +
	    avl_numnodes(&ddt->ddt_log_active->ddl_tree);

	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree))
		goto housekeeping;

	uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs);
	uint64_t cap = MAX(1, zfs_dedup_log_cap);
	uint64_t flush_min = MAX(backlog / txgs,
	    zfs_dedup_log_flush_entries_min);

	/*
	 * The theory for this block is that if we increase the pressure while
	 * we're growing above the cap, and remove it when we're significantly
	 * below the cap, we'll stay near cap while not bouncing around too
	 * much.
	 *
	 * The factor of 10 is to smooth the pressure effect by expressing it
	 * in tenths. The addition of the cap to the backlog in the second
	 * block is to round up, instead of down. We never let the pressure go
	 * below 1 (10 tenths).
	 */
	if (cap != UINT_MAX && backlog > cap &&
	    backlog > ddt->ddt_log_flush_prev_backlog) {
		ddt->ddt_log_flush_pressure += 10 * backlog / cap;
	} else if (cap != UINT_MAX && backlog < cap) {
		ddt->ddt_log_flush_pressure -=
		    11 - (((10 * backlog) + cap - 1) / cap);
		ddt->ddt_log_flush_pressure =
		    MAX(ddt->ddt_log_flush_pressure, 10);
	}
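
	/*
	 * Worked example (illustrative numbers only): with a cap of 100000
	 * entries, a backlog that has grown to 150000 adds
	 * 10 * 150000 / 100000 = 15 tenths of pressure; once the backlog
	 * drops to, say, 40000, each such txg subtracts 11 - 4 = 7 tenths
	 * until the pressure settles back at its floor of 10 (a 1.0x
	 * multiplier).
	 */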

	if (zfs_dedup_log_hard_cap && cap != UINT_MAX)
		flush_min = MAX(flush_min, MIN(backlog - cap,
		    (flush_min * ddt->ddt_log_flush_pressure) / 10));

	uint64_t flush_max;

	/*
	 * If we've been asked to flush everything in a hurry,
	 * try to dump as much as possible on this txg. In
	 * this case we're only limited by time, not amount.
	 *
	 * Otherwise, if we are over the cap, try to get back down to it.
	 *
	 * Finally if there is no cap (or no pressure), just set the max a
	 * little higher than the min to help smooth out variations in flush
	 * times.
	 */
	if (ddt->ddt_flush_force_txg > 0)
		flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap)
		flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap,
		    (flush_min * ddt->ddt_log_flush_pressure) / 10));
	else
		flush_max = flush_min * 5 / 4;
	flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max);
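
	/*
	 * Illustrative sizing (made-up numbers): with a 200000-entry backlog
	 * and zfs_dedup_log_flush_txgs = 100, flush_min works out to
	 * 200000 / 100 = 2000 entries, assuming
	 * zfs_dedup_log_flush_entries_min is smaller than that. With no cap
	 * configured, flush_max then becomes 2000 * 5 / 4 = 2500 before being
	 * clamped by zfs_dedup_log_flush_entries_max.
	 */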

	/*
	 * When the pool is busy or someone is explicitly waiting for this txg
	 * to complete, use the zfs_dedup_log_flush_min_time_ms. Otherwise use
	 * half of the time in the txg timeout.
	 */
	uint64_t target_time;

	if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ||
	    vdev_queue_pool_busy(spa)) {
		target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
		    SEC2NSEC(zfs_txg_timeout) / 2);
	} else {
		target_time = SEC2NSEC(zfs_txg_timeout) / 2;
	}

	ddt_lightweight_entry_t ddlwe;
	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
		ddt_sync_flush_entry(ddt, &ddlwe,
		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);

		/* End if we've synced as much as we needed to. */
		if (++count >= flush_max)
			break;

		/*
		 * As long as we've flushed the absolute minimum,
		 * stop if we're way over our target time.
		 */
		uint64_t diff = gethrtime() - flush_start;
		if (count > zfs_dedup_log_flush_entries_min &&
		    diff >= target_time * 2)
			break;

		/*
		 * End if we've passed the minimum flush and we're out of time.
		 */
		if (count > flush_min && diff >= target_time)
			break;
	}

	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
		/* We emptied it, so truncate on-disk */
		DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
		ddt_log_truncate(ddt, tx);
	} else {
		/* More to do next time, save checkpoint */
		DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
		ddt_log_checkpoint(ddt, &ddlwe, tx);
	}

	ddt_sync_update_stats(ddt, tx);

housekeeping:
	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
	    !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
		/*
		 * No more to flush, and the active list has stuff, so
		 * try to swap the logs for next time.
		 */
		if (ddt_log_swap(ddt, tx)) {
			DDT_KSTAT_ZERO(ddt, dds_log_active_entries);
			DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
			    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
		}
	}

	/* If force flush is no longer necessary, turn it off. */
	ddt_flush_force_update_txg(ddt, 0);

	ddt->ddt_log_flush_prev_backlog = backlog;

	/*
	 * Update flush rate. This is an exponential weighted moving
	 * average of the number of entries flushed over recent txgs.
	 */
	ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate,
	    zfs_dedup_log_flush_flow_rate_txgs);
	DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);

	/*
	 * Update flush time rate. This is an exponential weighted moving
	 * average of the total time taken to flush over recent txgs.
	 */
	ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate,
	    (int32_t)NSEC2MSEC(gethrtime() - flush_start),
	    zfs_dedup_log_flush_flow_rate_txgs);
	DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
	    ddt->ddt_log_flush_time_rate);
	if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 &&
	    zfs_flags & ZFS_DEBUG_DDT) {
		zfs_dbgmsg("%lu entries remain(%lu in active), flushed %u @ "
		    "txg %llu, in %llu ms, flush rate %d, time rate %d",
		    (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree),
		    (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree),
		    count, (u_longlong_t)tx->tx_txg,
		    (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start),
		    ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate);
	}
}

static void
ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
{
	uint64_t count = avl_numnodes(&ddt->ddt_tree);

	if (count > 0) {
		ddt_log_update_t dlu = {0};
		ddt_log_begin(ddt, count, tx, &dlu);

		ddt_entry_t *dde;
		void *cookie = NULL;
		ddt_lightweight_entry_t ddlwe;
		while ((dde =
		    avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);

			/* If from flushing log, remove it. */
			if (dde->dde_flags & DDE_FLAG_FROM_FLUSHING) {
				VERIFY(ddt_log_remove_key(ddt,
				    ddt->ddt_log_flushing, &ddlwe.ddlwe_key));
			}

			/* Update class_start to track last modification time */
			if (ddt->ddt_flags & DDT_FLAG_FLAT) {
				ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
				    ddt_class_start();
			}

			ddt_log_entry(ddt, &ddlwe, &dlu);
			ddt_sync_scan_entry(ddt, &ddlwe, tx);
			ddt_free(ddt, dde);
		}

		ddt_log_commit(ddt, &dlu);

		DDT_KSTAT_SET(ddt, dds_log_active_entries,
		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));

		/*
		 * Sync the stats for the store objects. Even though we haven't
		 * modified anything on those objects, they're no longer the
		 * source of truth for entries that are now in the log, and we
		 * need the on-disk counts to reflect that, otherwise we'll
		 * miscount later when importing.
		 */
		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
			for (ddt_class_t class = 0;
			    class < DDT_CLASSES; class++) {
				if (ddt_object_exists(ddt, type, class))
					ddt_object_sync(ddt, type, class, tx);
			}
		}

		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
		    sizeof (ddt->ddt_histogram));
		ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
		ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
	}

	if (spa_sync_pass(ddt->ddt_spa) == 1) {
		/*
		 * Update ingest rate. This is an exponential weighted moving
		 * average of the number of entries changed over recent txgs.
		 * The ramp-up cost shouldn't matter too much because the
		 * flusher will be trying to take at least the minimum anyway.
		 */
		ddt->ddt_log_ingest_rate = _ewma(
		    count, ddt->ddt_log_ingest_rate,
		    zfs_dedup_log_flush_flow_rate_txgs);
		DDT_KSTAT_SET(ddt, dds_log_ingest_rate,
		    ddt->ddt_log_ingest_rate);
	}
}

static void
ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
{
	if (avl_numnodes(&ddt->ddt_tree) == 0)
		return;

	ddt_entry_t *dde;
	void *cookie = NULL;
	while ((dde = avl_destroy_nodes(
	    &ddt->ddt_tree, &cookie)) != NULL) {
		ASSERT(dde->dde_flags & DDE_FLAG_LOADED);

		ddt_lightweight_entry_t ddlwe;
		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);

		/* Update class_start to track last modification time */
		if (ddt->ddt_flags & DDT_FLAG_FLAT) {
			ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
			    ddt_class_start();
		}

		ddt_sync_flush_entry(ddt, &ddlwe,
		    dde->dde_type, dde->dde_class, tx);
		ddt_sync_scan_entry(ddt, &ddlwe, tx);
		ddt_free(ddt, dde);
	}

	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
	    sizeof (ddt->ddt_histogram));
	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
	ddt_sync_update_stats(ddt, tx);
}

static void
ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;

	if (ddt->ddt_version == UINT64_MAX)
		return;

	if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
		ASSERT0(avl_numnodes(&ddt->ddt_tree));
		return;
	}

	if (spa->spa_ddt_stat_object == 0) {
		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_DDT_STATS, tx);
	}

	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
		ddt_create_dir(ddt, tx);

	if (ddt->ddt_flags & DDT_FLAG_LOG)
		ddt_sync_table_log(ddt, tx);
	else
		ddt_sync_table_flush(ddt, tx);
}

void
ddt_sync(spa_t *spa, uint64_t txg)
{
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
	dmu_tx_t *tx;
	zio_t *rio;

	ASSERT3U(spa_syncing_txg(spa), ==, txg);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	rio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);

	/*
	 * This function may cause an immediate scan of ddt blocks (see
	 * the comment above dsl_scan_ddt() for details). We set the
	 * scan's root zio here so that we can wait for any scan IOs in
	 * addition to the regular ddt IOs.
	 */
	ASSERT0P(scn->scn_zio_root);
	scn->scn_zio_root = rio;

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL)
			continue;
		ddt_sync_table(ddt, tx);
		if (ddt->ddt_flags & DDT_FLAG_LOG)
			ddt_sync_flush_log(ddt, tx);
		ddt_repair_table(ddt, rio);
	}

	(void) zio_wait(rio);
	scn->scn_zio_root = NULL;

	dmu_tx_commit(tx);
}

void
ddt_walk_init(spa_t *spa, uint64_t txg)
{
	if (txg == 0)
		txg = spa_syncing_txg(spa);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
			continue;

		ddt_enter(ddt);
		ddt_flush_force_update_txg(ddt, txg);
		ddt_exit(ddt);
	}
}

boolean_t
ddt_walk_ready(spa_t *spa)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
			continue;

		if (ddt->ddt_flush_force_txg > 0)
			return (B_FALSE);
	}

	return (B_TRUE);
}

static int
ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
    uint64_t flags, boolean_t wait)
{
	do {
		do {
			do {
				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
				if (ddt == NULL)
					continue;

				if (flags != 0 &&
				    (ddt->ddt_flags & flags) != flags)
					continue;

				if (wait && ddt->ddt_flush_force_txg > 0)
					return (EAGAIN);

				int error = ENOENT;
				if (ddt_object_exists(ddt, ddb->ddb_type,
				    ddb->ddb_class)) {
					error = ddt_object_walk(ddt,
					    ddb->ddb_type, ddb->ddb_class,
					    &ddb->ddb_cursor, ddlwe);
				}
				if (error == 0)
					return (0);
				if (error != ENOENT)
					return (error);
				ddb->ddb_cursor = 0;
			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
			ddb->ddb_checksum = 0;
		} while (++ddb->ddb_type < DDT_TYPES);
		ddb->ddb_type = 0;
	} while (++ddb->ddb_class < DDT_CLASSES);

	return (SET_ERROR(ENOENT));
}

int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
{
	return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE));
}
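
/*
 * Usage sketch (illustrative only; process_entry() is a hypothetical
 * consumer, not part of this file): callers iterate with a persistent
 * bookmark, e.g.
 *
 *	ddt_bookmark_t ddb = { 0 };
 *	ddt_lightweight_entry_t ddlwe;
 *	int err;
 *
 *	ddt_walk_init(spa, 0);
 *	while ((err = ddt_walk(spa, &ddb, &ddlwe)) == 0)
 *		process_entry(&ddlwe);
 *
 * On EAGAIN a forced log flush is still pending and the caller should retry
 * later; ENOENT means the walk has visited everything.
 */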

/*
 * This function is used by Block Cloning (brt.c) to increase the reference
 * count on the DDT entry if the block is already in the DDT.
 *
 * Return false if the block, despite having the D bit set, is not present
 * in the DDT. This is possible when the DDT has been pruned by an admin
 * or by the DDT quota mechanism.
 */
boolean_t
ddt_addref(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_entry_t *dde;
	boolean_t result;

	spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
	ddt = ddt_select(spa, bp);
	ddt_enter(ddt);

	dde = ddt_lookup(ddt, bp, B_TRUE);

	/* Can be NULL if the entry for this block was pruned. */
	if (dde == NULL) {
		ddt_exit(ddt);
		spa_config_exit(spa, SCL_ZIO, FTAG);
		return (B_FALSE);
	}

	if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
		/*
		 * This entry was either synced to a store object (dde_type is
		 * real) or was logged. It must be properly on disk at this
		 * point, so we can just bump its refcount.
		 */
		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

		ddt_phys_addref(dde->dde_phys, v);
		result = B_TRUE;
	} else {
		/*
		 * If the block has the DEDUP flag set it still might not
		 * exist in the DEDUP table due to DDT pruning of entries
		 * where refcnt=1.
		 */
		ddt_remove(ddt, dde);
		result = B_FALSE;
	}

	ddt_exit(ddt);
	spa_config_exit(spa, SCL_ZIO, FTAG);

	return (result);
}

typedef struct ddt_prune_entry {
	ddt_t		*dpe_ddt;
	ddt_key_t	dpe_key;
	list_node_t	dpe_node;
	ddt_univ_phys_t	dpe_phys[];
} ddt_prune_entry_t;

typedef struct ddt_prune_info {
	spa_t		*dpi_spa;
	uint64_t	dpi_txg_syncs;
	uint64_t	dpi_pruned;
	list_t		dpi_candidates;
} ddt_prune_info_t;

/*
 * Add prune candidates for ddt_sync during spa_sync
 */
static void
prune_candidates_sync(void *arg, dmu_tx_t *tx)
{
	(void) tx;
	ddt_prune_info_t *dpi = arg;
	ddt_prune_entry_t *dpe;

	spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);

	/* Process the prune candidates collected so far */
	while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
		blkptr_t blk;
		ddt_t *ddt = dpe->dpe_ddt;

		ddt_enter(ddt);

		/*
		 * If it's on the live list, then it was loaded for update
		 * this txg and is no longer stale; skip it.
		 */
		if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) {
			ddt_exit(ddt);
			kmem_free(dpe, sizeof (*dpe));
			continue;
		}

		ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
		    dpe->dpe_phys, DDT_PHYS_FLAT, &blk);

		ddt_entry_t *dde = ddt_lookup(ddt, &blk, B_TRUE);
		if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) {
			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
			/*
			 * Zero the physical, so we don't try to free DVAs
			 * at flush nor try to reuse this entry.
			 */
			ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);

			dpi->dpi_pruned++;
		}

		ddt_exit(ddt);
		kmem_free(dpe, sizeof (*dpe));
	}

	spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
	dpi->dpi_txg_syncs++;
}

/*
 * Prune candidates are collected in open context and processed
 * in sync context as part of ddt_sync_table().
 */
static void
ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
    const ddt_univ_phys_t *ddp)
{
	ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);

	size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;
	ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP);

	dpe->dpe_ddt = ddt;
	dpe->dpe_key = *ddk;
	memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
	list_insert_head(list, dpe);
}

/*
 * Iterate over all the entries in the DDT unique class.
 * The walk will perform one of the following operations:
 * (a) build a histogram that can be used when pruning
 * (b) prune entries older than the cutoff
 *
 * Also called by zdb(8) to dump the age histogram
 */
void
ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
{
	ddt_bookmark_t ddb = {
		.ddb_class = DDT_CLASS_UNIQUE,
		.ddb_type = 0,
		.ddb_checksum = 0,
		.ddb_cursor = 0
	};
	ddt_lightweight_entry_t ddlwe = {0};
	int error;
	int valid = 0;
	int candidates = 0;
	uint64_t now = gethrestime_sec();
	ddt_prune_info_t dpi;
	boolean_t pruning = (cutoff != 0);

	if (pruning) {
		dpi.dpi_txg_syncs = 0;
		dpi.dpi_pruned = 0;
		dpi.dpi_spa = spa;
		list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
		    offsetof(ddt_prune_entry_t, dpe_node));
	}

	if (histogram != NULL)
		memset(histogram, 0, sizeof (ddt_age_histo_t));

	while ((error =
	    ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) {
		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
		VERIFY(ddt);

		if (spa_shutting_down(spa) || issig())
			break;

		ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
		ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);

		uint64_t class_start =
		    ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;

		/*
		 * If this entry is on the log, then the stored entry is stale
		 * and we should skip it.
		 */
		if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL, NULL))
			continue;

		/* prune older entries */
		if (pruning && class_start < cutoff) {
			if (candidates++ >= zfs_ddt_prunes_per_txg) {
				/* sync prune candidates in batches */
				VERIFY0(dsl_sync_task(spa_name(spa),
				    NULL, prune_candidates_sync,
				    &dpi, 0, ZFS_SPACE_CHECK_NONE));
				candidates = 1;
			}
			ddt_prune_entry(&dpi.dpi_candidates, ddt,
			    &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
		}

		/* build a histogram */
		if (histogram != NULL) {
			uint64_t age = MAX(1, (now - class_start) / 3600);
			int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
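			/*
			 * Illustrative example: an entry whose class_start is
			 * about ten days old gives age = 240 (hours), so
			 * highbit64(240) - 1 = 7 and it is counted in bin 7.
			 */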
			histogram->dah_entries++;
			histogram->dah_age_histo[bin]++;
		}

		valid++;
	}

	if (pruning && valid > 0) {
		if (!list_is_empty(&dpi.dpi_candidates)) {
			/* sync out final batch of prune candidates */
			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
			    prune_candidates_sync, &dpi, 0,
			    ZFS_SPACE_CHECK_NONE));
		}
		list_destroy(&dpi.dpi_candidates);

		zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs",
		    (u_longlong_t)dpi.dpi_pruned,
		    (int)((dpi.dpi_pruned * 100) / valid),
		    (u_longlong_t)dpi.dpi_txg_syncs);
	}
}

static uint64_t
ddt_total_entries(spa_t *spa)
{
	ddt_object_t ddo;
	ddt_get_dedup_object_stats(spa, &ddo);

	return (ddo.ddo_count);
}

int
ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
    uint64_t amount)
{
	uint64_t cutoff;
	uint64_t start_time = gethrtime();

	if (spa->spa_active_ddt_prune)
		return (SET_ERROR(EALREADY));
	if (ddt_total_entries(spa) == 0)
		return (0);

	spa->spa_active_ddt_prune = B_TRUE;

	zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
	    unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");

	if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
		ddt_age_histo_t histogram;
		uint64_t oldest = 0;

		/* Make a pass over DDT to build a histogram */
		ddt_prune_walk(spa, 0, &histogram);

		int target = (histogram.dah_entries * amount) / 100;

		/*
		 * Figure out our cutoff date
		 * (i.e., which bins to prune from)
		 */
		for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
			if (histogram.dah_age_histo[i] != 0) {
				/* less than this bucket remaining */
				if (target < histogram.dah_age_histo[i]) {
					oldest = MAX(1, (1<<i) * 3600);
					target = 0;
				} else {
					target -= histogram.dah_age_histo[i];
				}
			}
		}
		cutoff = gethrestime_sec() - oldest;

		if (ddt_dump_prune_histogram)
			ddt_dump_age_histogram(&histogram, cutoff);
	} else if (unit == ZPOOL_DDT_PRUNE_AGE) {
		cutoff = gethrestime_sec() - amount;
	} else {
		return (EINVAL);
	}

	if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
		/* Traverse the DDT to prune entries older than our cutoff */
		ddt_prune_walk(spa, cutoff, NULL);
	}

	zfs_dbgmsg("%s: prune completed in %llu ms",
	    spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));

	spa->spa_active_ddt_prune = B_FALSE;
	return (0);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
	"Enable prefetching dedup-ed blks");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
	"Min time to spend on incremental dedup log flush each transaction");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
	"Min number of log entries to flush each transaction");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW,
	"Max number of log entries to flush each transaction");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW,
	"Number of TXGs to try to rotate the log in");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW,
	"Soft cap for the size of the current dedup log");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW,
	"Whether to use the soft cap as a hard cap");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
	"Number of txgs to average flow rates across");