Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/cmd/zstream/zstream_redup.c
48383 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* This file and its contents are supplied under the terms of the
6
* Common Development and Distribution License ("CDDL"), version 1.0.
7
* You may only use this file in accordance with the terms of version
8
* 1.0 of the CDDL.
9
*
10
* A full copy of the text of the CDDL should have accompanied this
11
* source. A copy of the CDDL is also available via the Internet at
12
* http://www.illumos.org/license/CDDL.
13
*
14
* CDDL HEADER END
15
*/
16
17
/*
18
* Copyright (c) 2020 by Delphix. All rights reserved.
19
*/
20
21
#include <assert.h>
22
#include <cityhash.h>
23
#include <ctype.h>
24
#include <errno.h>
25
#include <fcntl.h>
26
#include <libzfs.h>
27
#include <libzutil.h>
28
#include <stddef.h>
29
#include <stdio.h>
30
#include <stdlib.h>
31
#include <string.h>
32
#include <umem.h>
33
#include <unistd.h>
34
#include <sys/debug.h>
35
#include <sys/stat.h>
36
#include <sys/zfs_ioctl.h>
37
#include <sys/zio_checksum.h>
38
#include "zfs_fletcher.h"
39
#include "zstream.h"
40
41
42
#define MAX_RDT_PHYSMEM_PERCENT 20
43
#define SMALLEST_POSSIBLE_MAX_RDT_MB 128
44
45
/*
 * One node of the redup hash table.  Each node records where in the input
 * stream a WRITE record for a given (guid, object, offset) triple starts,
 * so that a later WRITE_BYREF record referring to that triple can be
 * replaced by a copy of the original WRITE.
 */
typedef struct redup_entry {
	struct redup_entry *rde_next;	/* next node in the same hash bucket */
	uint64_t rde_guid;		/* lookup key: guid */
	uint64_t rde_object;		/* lookup key: object number */
	uint64_t rde_offset;		/* lookup key: offset within object */
	uint64_t rde_stream_offset;	/* position of the record in the input */
} redup_entry_t;
52
53
typedef struct redup_table {
54
redup_entry_t **redup_hash_array;
55
umem_cache_t *ddecache;
56
uint64_t ddt_count;
57
int numhashbits;
58
} redup_table_t;
59
60
/*
 * Allocate n bytes of zero-filled memory, exiting the program with an
 * error message if the allocation fails.  Never returns NULL.
 */
void *
safe_calloc(size_t n)
{
	/*
	 * calloc() may legitimately return NULL for a zero-byte request;
	 * ask for at least one byte so that a NULL result always means
	 * "out of memory".
	 */
	void *rv = calloc(1, n != 0 ? n : 1);

	if (rv == NULL) {
		/* %zu matches size_t; avoids truncating large sizes. */
		fprintf(stderr,
		    "Error: could not allocate %zu bytes of memory\n", n);
		exit(1);
	}
	return (rv);
}
72
73
/*
 * Safe version of fread(), exits on error.
 *
 * Reads a single item of `size` bytes from fp into buf.  Returns 1 if the
 * whole item was read and 0 if it was not (for example at end-of-file).
 * If the stream's error indicator is set after a failed read, a message is
 * printed and the program exits.
 */
int
sfread(void *buf, size_t size, FILE *fp)
{
	/* fread() returns a size_t item count; here it is only ever 0 or 1. */
	size_t nitems = fread(buf, size, 1, fp);

	if (nitems == 0 && ferror(fp)) {
		(void) fprintf(stderr, "Error while reading file: %s\n",
		    strerror(errno));
		exit(1);
	}
	return ((int)nitems);
}
87
88
/*
 * Safe version of pread(), exits on error.
 *
 * Reads exactly `count` bytes from fd at `offset` into buf.  A read that
 * is interrupted or returns fewer bytes than requested is continued from
 * where it stopped; the program exits if pread() fails or if end-of-file
 * is reached before `count` bytes have been read.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	char *dst = buf;
	size_t done = 0;

	while (done < count) {
		ssize_t n = pread(fd, dst + done, count - done,
		    offset + (off_t)done);

		if (n == -1) {
			/* Interrupted before any data was read: retry. */
			if (errno == EINTR)
				continue;
			(void) fprintf(stderr,
			    "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}
		if (n == 0) {
			/* End-of-file before the requested range ended. */
			(void) fprintf(stderr,
			    "Error while reading file: short read\n");
			exit(1);
		}
		done += (size_t)n;
	}
}
106
107
static int
108
dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
109
zio_cksum_t *zc, int outfd)
110
{
111
assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
112
== sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
113
fletcher_4_incremental_native(drr,
114
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
115
if (drr->drr_type != DRR_BEGIN) {
116
assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
117
drr_checksum.drr_checksum));
118
drr->drr_u.drr_checksum.drr_checksum = *zc;
119
}
120
fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
121
sizeof (zio_cksum_t), zc);
122
if (write(outfd, drr, sizeof (*drr)) == -1)
123
return (errno);
124
if (payload_len != 0) {
125
fletcher_4_incremental_native(payload, payload_len, zc);
126
if (write(outfd, payload, payload_len) == -1)
127
return (errno);
128
}
129
return (0);
130
}
131
132
static void
133
rdt_insert(redup_table_t *rdt,
134
uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
135
{
136
uint64_t ch = cityhash3(guid, object, offset);
137
uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
138
redup_entry_t **rdepp;
139
140
rdepp = &(rdt->redup_hash_array[hashcode]);
141
redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
142
rde->rde_next = *rdepp;
143
rde->rde_guid = guid;
144
rde->rde_object = object;
145
rde->rde_offset = offset;
146
rde->rde_stream_offset = stream_offset;
147
*rdepp = rde;
148
rdt->ddt_count++;
149
}
150
151
static void
152
rdt_lookup(redup_table_t *rdt,
153
uint64_t guid, uint64_t object, uint64_t offset,
154
uint64_t *stream_offsetp)
155
{
156
uint64_t ch = cityhash3(guid, object, offset);
157
uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
158
159
for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
160
rde != NULL; rde = rde->rde_next) {
161
if (rde->rde_guid == guid &&
162
rde->rde_object == object &&
163
rde->rde_offset == offset) {
164
*stream_offsetp = rde->rde_stream_offset;
165
return;
166
}
167
}
168
assert(!"could not find expected redup table entry");
169
}
170
171
/*
172
* Convert a dedup stream (generated by "zfs send -D") to a
173
* non-deduplicated stream. The entire infd will be converted, including
174
* any substreams in a stream package (generated by "zfs send -RD"). The
175
* infd must be seekable.
176
*/
177
static void
178
zfs_redup_stream(int infd, int outfd, boolean_t verbose)
179
{
180
int bufsz = SPA_MAXBLOCKSIZE;
181
dmu_replay_record_t thedrr;
182
dmu_replay_record_t *drr = &thedrr;
183
redup_table_t rdt;
184
zio_cksum_t stream_cksum;
185
uint64_t numbuckets;
186
uint64_t num_records = 0;
187
uint64_t num_write_byref_records = 0;
188
189
memset(&thedrr, 0, sizeof (dmu_replay_record_t));
190
191
#ifdef _ILP32
192
uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
193
#else
194
uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
195
uint64_t max_rde_size =
196
MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
197
SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
198
#endif
199
200
numbuckets = max_rde_size / (sizeof (redup_entry_t));
201
202
/*
203
* numbuckets must be a power of 2. Increase number to
204
* a power of 2 if necessary.
205
*/
206
if (!ISP2(numbuckets))
207
numbuckets = 1ULL << highbit64(numbuckets);
208
209
rdt.redup_hash_array =
210
safe_calloc(numbuckets * sizeof (redup_entry_t *));
211
rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
212
NULL, NULL, NULL, NULL, NULL, 0);
213
rdt.numhashbits = highbit64(numbuckets) - 1;
214
rdt.ddt_count = 0;
215
216
char *buf = safe_calloc(bufsz);
217
FILE *ofp = fdopen(infd, "r");
218
long offset = ftell(ofp);
219
int begin = 0;
220
boolean_t seen = B_FALSE;
221
while (sfread(drr, sizeof (*drr), ofp) != 0) {
222
num_records++;
223
224
/*
225
* We need to regenerate the checksum.
226
*/
227
if (drr->drr_type != DRR_BEGIN) {
228
memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
229
sizeof (drr->drr_u.drr_checksum.drr_checksum));
230
}
231
232
uint64_t payload_size = 0;
233
switch (drr->drr_type) {
234
case DRR_BEGIN:
235
{
236
struct drr_begin *drrb = &drr->drr_u.drr_begin;
237
int fflags;
238
ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
239
VERIFY0(begin++);
240
seen = B_TRUE;
241
242
assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
243
244
/* clear the DEDUP feature flag for this stream */
245
fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
246
fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
247
DMU_BACKUP_FEATURE_DEDUPPROPS);
248
/* cppcheck-suppress syntaxError */
249
DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
250
251
uint32_t sz = drr->drr_payloadlen;
252
253
VERIFY3U(sz, <=, 1U << 28);
254
255
if (sz != 0) {
256
if (sz > bufsz) {
257
free(buf);
258
buf = safe_calloc(sz);
259
bufsz = sz;
260
}
261
(void) sfread(buf, sz, ofp);
262
}
263
payload_size = sz;
264
break;
265
}
266
267
case DRR_END:
268
{
269
struct drr_end *drre = &drr->drr_u.drr_end;
270
/*
271
* We would prefer to just check --begin == 0, but
272
* replication streams have an end of stream END
273
* record, so we must avoid tripping it.
274
*/
275
VERIFY3B(seen, ==, B_TRUE);
276
begin--;
277
/*
278
* Use the recalculated checksum, unless this is
279
* the END record of a stream package, which has
280
* no checksum.
281
*/
282
if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
283
drre->drr_checksum = stream_cksum;
284
break;
285
}
286
287
case DRR_OBJECT:
288
{
289
struct drr_object *drro = &drr->drr_u.drr_object;
290
VERIFY3S(begin, ==, 1);
291
292
if (drro->drr_bonuslen > 0) {
293
payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
294
(void) sfread(buf, payload_size, ofp);
295
}
296
break;
297
}
298
299
case DRR_SPILL:
300
{
301
struct drr_spill *drrs = &drr->drr_u.drr_spill;
302
VERIFY3S(begin, ==, 1);
303
payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
304
(void) sfread(buf, payload_size, ofp);
305
break;
306
}
307
308
case DRR_WRITE_BYREF:
309
{
310
struct drr_write_byref drrwb =
311
drr->drr_u.drr_write_byref;
312
VERIFY3S(begin, ==, 1);
313
314
num_write_byref_records++;
315
316
/*
317
* Look up in hash table by drrwb->drr_refguid,
318
* drr_refobject, drr_refoffset. Replace this
319
* record with the found WRITE record, but with
320
* drr_object,drr_offset,drr_toguid replaced with ours.
321
*/
322
uint64_t stream_offset = 0;
323
rdt_lookup(&rdt, drrwb.drr_refguid,
324
drrwb.drr_refobject, drrwb.drr_refoffset,
325
&stream_offset);
326
327
spread(infd, drr, sizeof (*drr), stream_offset);
328
329
assert(drr->drr_type == DRR_WRITE);
330
struct drr_write *drrw = &drr->drr_u.drr_write;
331
assert(drrw->drr_toguid == drrwb.drr_refguid);
332
assert(drrw->drr_object == drrwb.drr_refobject);
333
assert(drrw->drr_offset == drrwb.drr_refoffset);
334
335
payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
336
spread(infd, buf, payload_size,
337
stream_offset + sizeof (*drr));
338
339
drrw->drr_toguid = drrwb.drr_toguid;
340
drrw->drr_object = drrwb.drr_object;
341
drrw->drr_offset = drrwb.drr_offset;
342
break;
343
}
344
345
case DRR_WRITE:
346
{
347
struct drr_write *drrw = &drr->drr_u.drr_write;
348
VERIFY3S(begin, ==, 1);
349
payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
350
(void) sfread(buf, payload_size, ofp);
351
352
rdt_insert(&rdt, drrw->drr_toguid,
353
drrw->drr_object, drrw->drr_offset, offset);
354
break;
355
}
356
357
case DRR_WRITE_EMBEDDED:
358
{
359
struct drr_write_embedded *drrwe =
360
&drr->drr_u.drr_write_embedded;
361
VERIFY3S(begin, ==, 1);
362
payload_size =
363
P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
364
(void) sfread(buf, payload_size, ofp);
365
break;
366
}
367
368
case DRR_FREEOBJECTS:
369
case DRR_FREE:
370
case DRR_OBJECT_RANGE:
371
VERIFY3S(begin, ==, 1);
372
break;
373
374
default:
375
(void) fprintf(stderr, "INVALID record type 0x%x\n",
376
drr->drr_type);
377
/* should never happen, so assert */
378
assert(B_FALSE);
379
}
380
381
if (feof(ofp)) {
382
fprintf(stderr, "Error: unexpected end-of-file\n");
383
exit(1);
384
}
385
if (ferror(ofp)) {
386
fprintf(stderr, "Error while reading file: %s\n",
387
strerror(errno));
388
exit(1);
389
}
390
391
/*
392
* We need to recalculate the checksum, and it needs to be
393
* initially zero to do that. BEGIN records don't have
394
* a checksum.
395
*/
396
if (drr->drr_type != DRR_BEGIN) {
397
memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
398
sizeof (drr->drr_u.drr_checksum.drr_checksum));
399
}
400
if (dump_record(drr, buf, payload_size,
401
&stream_cksum, outfd) != 0)
402
break;
403
if (drr->drr_type == DRR_END) {
404
/*
405
* Typically the END record is either the last
406
* thing in the stream, or it is followed
407
* by a BEGIN record (which also zeros the checksum).
408
* However, a stream package ends with two END
409
* records. The last END record's checksum starts
410
* from zero.
411
*/
412
ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
413
}
414
offset = ftell(ofp);
415
}
416
417
if (verbose) {
418
char mem_str[16];
419
zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
420
mem_str, sizeof (mem_str));
421
fprintf(stderr, "converted stream with %llu total records, "
422
"including %llu dedup records, using %sB memory.\n",
423
(long long)num_records,
424
(long long)num_write_byref_records,
425
mem_str);
426
}
427
428
umem_cache_destroy(rdt.ddecache);
429
free(rdt.redup_hash_array);
430
free(buf);
431
(void) fclose(ofp);
432
}
433
434
int
435
zstream_do_redup(int argc, char *argv[])
436
{
437
boolean_t verbose = B_FALSE;
438
int c;
439
440
while ((c = getopt(argc, argv, "v")) != -1) {
441
switch (c) {
442
case 'v':
443
verbose = B_TRUE;
444
break;
445
case '?':
446
(void) fprintf(stderr, "invalid option '%c'\n",
447
optopt);
448
zstream_usage();
449
break;
450
}
451
}
452
453
argc -= optind;
454
argv += optind;
455
456
if (argc != 1)
457
zstream_usage();
458
459
const char *filename = argv[0];
460
461
if (isatty(STDOUT_FILENO)) {
462
(void) fprintf(stderr,
463
"Error: Stream can not be written to a terminal.\n"
464
"You must redirect standard output.\n");
465
return (1);
466
}
467
468
int fd = open(filename, O_RDONLY);
469
if (fd == -1) {
470
(void) fprintf(stderr,
471
"Error while opening file '%s': %s\n",
472
filename, strerror(errno));
473
exit(1);
474
}
475
476
fletcher_4_init();
477
zfs_redup_stream(fd, STDOUT_FILENO, verbose);
478
fletcher_4_fini();
479
480
close(fd);
481
482
return (0);
483
}
484
485