GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/dbuf.c
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
/*
23
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
26
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
27
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28
* Copyright (c) 2019, Klara Inc.
29
* Copyright (c) 2019, Allan Jude
30
* Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
31
*/
32
33
#include <sys/zfs_context.h>
34
#include <sys/arc.h>
35
#include <sys/dmu.h>
36
#include <sys/dmu_send.h>
37
#include <sys/dmu_impl.h>
38
#include <sys/dbuf.h>
39
#include <sys/dmu_objset.h>
40
#include <sys/dsl_dataset.h>
41
#include <sys/dsl_dir.h>
42
#include <sys/dmu_tx.h>
43
#include <sys/spa.h>
44
#include <sys/zio.h>
45
#include <sys/dmu_zfetch.h>
46
#include <sys/sa.h>
47
#include <sys/sa_impl.h>
48
#include <sys/zfeature.h>
49
#include <sys/blkptr.h>
50
#include <sys/range_tree.h>
51
#include <sys/trace_zfs.h>
52
#include <sys/callb.h>
53
#include <sys/abd.h>
54
#include <sys/brt.h>
55
#include <sys/vdev.h>
56
#include <cityhash.h>
57
#include <sys/spa_impl.h>
58
#include <sys/wmsum.h>
59
#include <sys/vdev_impl.h>
60
61
static kstat_t *dbuf_ksp;
62
63
typedef struct dbuf_stats {
64
/*
65
* Various statistics about the size of the dbuf cache.
66
*/
67
kstat_named_t cache_count;
68
kstat_named_t cache_size_bytes;
69
kstat_named_t cache_size_bytes_max;
70
/*
71
* Statistics regarding the bounds on the dbuf cache size.
72
*/
73
kstat_named_t cache_target_bytes;
74
kstat_named_t cache_lowater_bytes;
75
kstat_named_t cache_hiwater_bytes;
76
/*
77
* Total number of dbuf cache evictions that have occurred.
78
*/
79
kstat_named_t cache_total_evicts;
80
/*
81
* The distribution of dbuf levels in the dbuf cache and
82
* the total size of all dbufs at each level.
83
*/
84
kstat_named_t cache_levels[DN_MAX_LEVELS];
85
kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
86
/*
87
* Statistics about the dbuf hash table.
88
*/
89
kstat_named_t hash_hits;
90
kstat_named_t hash_misses;
91
kstat_named_t hash_collisions;
92
kstat_named_t hash_elements;
93
/*
94
* Number of sublists containing more than one dbuf in the dbuf
95
* hash table. Keep track of the longest hash chain.
96
*/
97
kstat_named_t hash_chains;
98
kstat_named_t hash_chain_max;
99
/*
100
* Number of times a dbuf_create() discovers that a dbuf was
101
* already created and in the dbuf hash table.
102
*/
103
kstat_named_t hash_insert_race;
104
/*
105
* Number of entries in the hash table dbuf and mutex arrays.
106
*/
107
kstat_named_t hash_table_count;
108
kstat_named_t hash_mutex_count;
109
/*
110
* Statistics about the size of the metadata dbuf cache.
111
*/
112
kstat_named_t metadata_cache_count;
113
kstat_named_t metadata_cache_size_bytes;
114
kstat_named_t metadata_cache_size_bytes_max;
115
/*
116
* For diagnostic purposes, this is incremented whenever we can't add
117
* something to the metadata cache because it's full, and instead put
118
* the data in the regular dbuf cache.
119
*/
120
kstat_named_t metadata_cache_overflow;
121
} dbuf_stats_t;
122
123
dbuf_stats_t dbuf_stats = {
124
{ "cache_count", KSTAT_DATA_UINT64 },
125
{ "cache_size_bytes", KSTAT_DATA_UINT64 },
126
{ "cache_size_bytes_max", KSTAT_DATA_UINT64 },
127
{ "cache_target_bytes", KSTAT_DATA_UINT64 },
128
{ "cache_lowater_bytes", KSTAT_DATA_UINT64 },
129
{ "cache_hiwater_bytes", KSTAT_DATA_UINT64 },
130
{ "cache_total_evicts", KSTAT_DATA_UINT64 },
131
{ { "cache_levels_N", KSTAT_DATA_UINT64 } },
132
{ { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },
133
{ "hash_hits", KSTAT_DATA_UINT64 },
134
{ "hash_misses", KSTAT_DATA_UINT64 },
135
{ "hash_collisions", KSTAT_DATA_UINT64 },
136
{ "hash_elements", KSTAT_DATA_UINT64 },
137
{ "hash_chains", KSTAT_DATA_UINT64 },
138
{ "hash_chain_max", KSTAT_DATA_UINT64 },
139
{ "hash_insert_race", KSTAT_DATA_UINT64 },
140
{ "hash_table_count", KSTAT_DATA_UINT64 },
141
{ "hash_mutex_count", KSTAT_DATA_UINT64 },
142
{ "metadata_cache_count", KSTAT_DATA_UINT64 },
143
{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
144
{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
145
{ "metadata_cache_overflow", KSTAT_DATA_UINT64 }
146
};
147
148
struct {
149
wmsum_t cache_count;
150
wmsum_t cache_total_evicts;
151
wmsum_t cache_levels[DN_MAX_LEVELS];
152
wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
153
wmsum_t hash_hits;
154
wmsum_t hash_misses;
155
wmsum_t hash_collisions;
156
wmsum_t hash_elements;
157
wmsum_t hash_chains;
158
wmsum_t hash_insert_race;
159
wmsum_t metadata_cache_count;
160
wmsum_t metadata_cache_overflow;
161
} dbuf_sums;
162
163
#define DBUF_STAT_INCR(stat, val) \
164
wmsum_add(&dbuf_sums.stat, val)
165
#define DBUF_STAT_DECR(stat, val) \
166
DBUF_STAT_INCR(stat, -(val))
167
#define DBUF_STAT_BUMP(stat) \
168
DBUF_STAT_INCR(stat, 1)
169
#define DBUF_STAT_BUMPDOWN(stat) \
170
DBUF_STAT_INCR(stat, -1)
171
#define DBUF_STAT_MAX(stat, v) { \
172
uint64_t _m; \
173
while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
174
(_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
175
continue; \
176
}
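/*
 * Editorial note (not part of the original source): DBUF_STAT_MAX() is a
 * lock-free running maximum. It re-reads the current value and retries the
 * atomic_cas_64() until either the stored maximum is already >= v or the
 * CAS succeeds, so a concurrent updater can never overwrite a larger
 * observation with a smaller one. Illustrative use, mirroring
 * dbuf_hash_insert() below:
 *
 *	DBUF_STAT_MAX(hash_chain_max, chain_length);
 */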
177
178
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
179
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
180
181
/*
182
* Global data structures and functions for the dbuf cache.
183
*/
184
static kmem_cache_t *dbuf_kmem_cache;
185
kmem_cache_t *dbuf_dirty_kmem_cache;
186
static taskq_t *dbu_evict_taskq;
187
188
static kthread_t *dbuf_cache_evict_thread;
189
static kmutex_t dbuf_evict_lock;
190
static kcondvar_t dbuf_evict_cv;
191
static boolean_t dbuf_evict_thread_exit;
192
193
/*
194
* There are two dbuf caches; each dbuf can only be in one of them at a time.
195
*
196
* 1. Cache of metadata dbufs, to help make read-heavy administrative commands
197
* from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
198
* that represent the metadata that describes filesystems/snapshots/
199
* bookmarks/properties/etc. We only evict from this cache when we export a
200
* pool, to short-circuit as much I/O as possible for all administrative
201
* commands that need the metadata. There is no eviction policy for this
202
* cache, because we try to only include types in it which would occupy a
203
* very small amount of space per object but create a large impact on the
204
* performance of these commands. Instead, after it reaches a maximum size
205
* (which should only happen on very small memory systems with a very large
206
* number of filesystem objects), we stop taking new dbufs into the
207
* metadata cache, instead putting them in the normal dbuf cache.
208
*
209
* 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
210
* are not currently held but have been recently released. These dbufs
211
* are not eligible for arc eviction until they are aged out of the cache.
212
* Dbufs that are aged out of the cache will be immediately destroyed and
213
* become eligible for arc eviction.
214
*
215
* Dbufs are added to these caches once the last hold is released. If a dbuf is
216
* later accessed and still exists in the dbuf cache, then it will be removed
217
* from the cache and later re-added to the head of the cache.
218
*
219
* If a given dbuf meets the requirements for the metadata cache, it will go
220
* there, otherwise it will be considered for the generic LRU dbuf cache. The
221
* caches and the refcounts tracking their sizes are stored in an array indexed
222
* by those caches' matching enum values (from dbuf_cached_state_t).
223
*/
224
typedef struct dbuf_cache {
225
multilist_t cache;
226
zfs_refcount_t size ____cacheline_aligned;
227
} dbuf_cache_t;
228
dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
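/*
 * Illustrative sketch (editorial, not part of the original source): when the
 * last hold on a dbuf is released, placement into one of the two caches
 * follows roughly this shape, with the actual logic living in
 * dbuf_rele_and_unlock():
 *
 *	dbuf_cached_state_t dcs = dbuf_include_in_metadata_cache(db) ?
 *	    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
 *	db->db_caching_status = dcs;
 *	multilist_insert(&dbuf_caches[dcs].cache, db);
 *	(void) zfs_refcount_add_many(&dbuf_caches[dcs].size,
 *	    db->db.db_size, db);
 */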
229
230
/* Size limits for the caches */
231
static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
232
static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
233
234
/* Set the default sizes of the caches to log2 fraction of arc size */
235
static uint_t dbuf_cache_shift = 5;
236
static uint_t dbuf_metadata_cache_shift = 6;
237
238
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
239
static uint_t dbuf_mutex_cache_shift = 0;
240
241
static unsigned long dbuf_cache_target_bytes(void);
242
static unsigned long dbuf_metadata_cache_target_bytes(void);
243
244
/*
245
* The LRU dbuf cache uses a three-stage eviction policy:
246
* - A low water marker designates when the dbuf eviction thread
247
* should stop evicting from the dbuf cache.
248
* - When we reach the maximum size (aka mid water mark), we
249
* signal the eviction thread to run.
250
* - The high water mark indicates when the eviction thread
251
* is unable to keep up with the incoming load and eviction must
252
* happen in the context of the calling thread.
253
*
254
* The dbuf cache:
255
* (max size)
256
* low water mid water hi water
257
* +----------------------------------------+----------+----------+
258
* | | | |
259
* | | | |
260
* | | | |
261
* | | | |
262
* +----------------------------------------+----------+----------+
263
* stop signal evict
264
* evicting eviction directly
265
* thread
266
*
267
* The high and low water marks indicate the operating range for the eviction
268
* thread. The low water mark is, by default, 90% of the total size of the
269
* cache and the high water mark is at 110% (both of these percentages can be
270
* changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
271
* respectively). The eviction thread will try to ensure that the cache remains
272
* within this range by waking up every second and checking if the cache is
273
* above the low water mark. The thread can also be woken up by callers adding
274
* elements into the cache if the cache is larger than the mid water (i.e max
275
* cache size). Once the eviction thread is woken up and eviction is required,
276
* it will continue evicting buffers until it's able to reduce the cache size
277
* to the low water mark. If the cache size continues to grow and hits the high
278
* water mark, then callers adding elements to the cache will begin to evict
279
* directly from the cache until the cache is no longer above the high water
280
* mark.
281
*/
282
283
/*
284
* The percentage above and below the maximum cache size.
285
*/
286
static uint_t dbuf_cache_hiwater_pct = 10;
287
static uint_t dbuf_cache_lowater_pct = 10;
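/*
 * Editorial worked example (illustrative numbers, not measured defaults):
 * with arc_target_bytes() at 8 GiB and dbuf_cache_shift = 5,
 * dbuf_cache_target_bytes() is 8 GiB / 2^5 = 256 MiB. With the 10%
 * tunables above, dbuf_cache_lowater_bytes() is 256 MiB - 25.6 MiB
 * (about 230 MiB, where the eviction thread stops) and
 * dbuf_cache_hiwater_bytes() is 256 MiB + 25.6 MiB (about 282 MiB, where
 * callers start evicting directly).
 */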
288
289
static int
290
dbuf_cons(void *vdb, void *unused, int kmflag)
291
{
292
(void) unused, (void) kmflag;
293
dmu_buf_impl_t *db = vdb;
294
memset(db, 0, sizeof (dmu_buf_impl_t));
295
296
mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
297
rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL);
298
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
299
multilist_link_init(&db->db_cache_link);
300
zfs_refcount_create(&db->db_holds);
301
302
return (0);
303
}
304
305
static void
306
dbuf_dest(void *vdb, void *unused)
307
{
308
(void) unused;
309
dmu_buf_impl_t *db = vdb;
310
mutex_destroy(&db->db_mtx);
311
rw_destroy(&db->db_rwlock);
312
cv_destroy(&db->db_changed);
313
ASSERT(!multilist_link_active(&db->db_cache_link));
314
zfs_refcount_destroy(&db->db_holds);
315
}
316
317
/*
318
* dbuf hash table routines
319
*/
320
static dbuf_hash_table_t dbuf_hash_table;
321
322
/*
323
* We use Cityhash for this. It's fast, and has good hash properties without
324
* requiring any large static buffers.
325
*/
326
static uint64_t
327
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
328
{
329
return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
330
}
331
332
#define DTRACE_SET_STATE(db, why) \
333
DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \
334
const char *, why)
335
336
#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
337
((dbuf)->db.db_object == (obj) && \
338
(dbuf)->db_objset == (os) && \
339
(dbuf)->db_level == (level) && \
340
(dbuf)->db_blkid == (blkid))
341
342
dmu_buf_impl_t *
343
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
344
uint64_t *hash_out)
345
{
346
dbuf_hash_table_t *h = &dbuf_hash_table;
347
uint64_t hv;
348
uint64_t idx;
349
dmu_buf_impl_t *db;
350
351
hv = dbuf_hash(os, obj, level, blkid);
352
idx = hv & h->hash_table_mask;
353
354
mutex_enter(DBUF_HASH_MUTEX(h, idx));
355
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
356
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
357
mutex_enter(&db->db_mtx);
358
if (db->db_state != DB_EVICTING) {
359
mutex_exit(DBUF_HASH_MUTEX(h, idx));
360
return (db);
361
}
362
mutex_exit(&db->db_mtx);
363
}
364
}
365
mutex_exit(DBUF_HASH_MUTEX(h, idx));
366
if (hash_out != NULL)
367
*hash_out = hv;
368
return (NULL);
369
}
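/*
 * Editorial note (not part of the original source): on a hit, dbuf_find()
 * returns with the dbuf's db_mtx held (the hash bucket mutex has already
 * been dropped), and the caller is responsible for releasing db_mtx. On a
 * miss it returns NULL and, if requested, passes the computed hash back
 * through hash_out so the caller can reuse it (e.g. for a later
 * dbuf_hash_insert()).
 */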
370
371
static dmu_buf_impl_t *
372
dbuf_find_bonus(objset_t *os, uint64_t object)
373
{
374
dnode_t *dn;
375
dmu_buf_impl_t *db = NULL;
376
377
if (dnode_hold(os, object, FTAG, &dn) == 0) {
378
rw_enter(&dn->dn_struct_rwlock, RW_READER);
379
if (dn->dn_bonus != NULL) {
380
db = dn->dn_bonus;
381
mutex_enter(&db->db_mtx);
382
}
383
rw_exit(&dn->dn_struct_rwlock);
384
dnode_rele(dn, FTAG);
385
}
386
return (db);
387
}
388
389
/*
390
* Insert an entry into the hash table. If there is already an element
391
* equal to elem in the hash table, then the already existing element
392
* will be returned and the new element will not be inserted.
393
* Otherwise returns NULL.
394
*/
395
static dmu_buf_impl_t *
396
dbuf_hash_insert(dmu_buf_impl_t *db)
397
{
398
dbuf_hash_table_t *h = &dbuf_hash_table;
399
objset_t *os = db->db_objset;
400
uint64_t obj = db->db.db_object;
401
int level = db->db_level;
402
uint64_t blkid, idx;
403
dmu_buf_impl_t *dbf;
404
uint32_t i;
405
406
blkid = db->db_blkid;
407
ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
408
idx = db->db_hash & h->hash_table_mask;
409
410
mutex_enter(DBUF_HASH_MUTEX(h, idx));
411
for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
412
dbf = dbf->db_hash_next, i++) {
413
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
414
mutex_enter(&dbf->db_mtx);
415
if (dbf->db_state != DB_EVICTING) {
416
mutex_exit(DBUF_HASH_MUTEX(h, idx));
417
return (dbf);
418
}
419
mutex_exit(&dbf->db_mtx);
420
}
421
}
422
423
if (i > 0) {
424
DBUF_STAT_BUMP(hash_collisions);
425
if (i == 1)
426
DBUF_STAT_BUMP(hash_chains);
427
428
DBUF_STAT_MAX(hash_chain_max, i);
429
}
430
431
mutex_enter(&db->db_mtx);
432
db->db_hash_next = h->hash_table[idx];
433
h->hash_table[idx] = db;
434
mutex_exit(DBUF_HASH_MUTEX(h, idx));
435
DBUF_STAT_BUMP(hash_elements);
436
437
return (NULL);
438
}
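/*
 * Illustrative caller pattern (editorial sketch; the real logic is in
 * dbuf_create()): a non-NULL return means another thread won the race to
 * insert an equal dbuf, so the caller keeps the returned dbuf and discards
 * its own copy:
 *
 *	dmu_buf_impl_t *odb;
 *	if ((odb = dbuf_hash_insert(db)) != NULL) {
 *		DBUF_STAT_BUMP(hash_insert_race);
 *		return (odb);
 *	}
 */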
439
440
/*
441
* This returns whether this dbuf should be stored in the metadata cache, which
442
* is based on whether it's from one of the dnode types that store data related
443
* to traversing dataset hierarchies.
444
*/
445
static boolean_t
446
dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
447
{
448
DB_DNODE_ENTER(db);
449
dnode_t *dn = DB_DNODE(db);
450
dmu_object_type_t type = dn->dn_storage_type;
451
if (type == DMU_OT_NONE)
452
type = dn->dn_type;
453
DB_DNODE_EXIT(db);
454
455
/* Check if this dbuf is one of the types we care about */
456
if (DMU_OT_IS_METADATA_CACHED(type)) {
457
/* If we hit this, then we set something up wrong in dmu_ot */
458
ASSERT(DMU_OT_IS_METADATA(type));
459
460
/*
461
* Sanity check for small-memory systems: don't allocate too
462
* much memory for this purpose.
463
*/
464
if (zfs_refcount_count(
465
&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
466
dbuf_metadata_cache_target_bytes()) {
467
DBUF_STAT_BUMP(metadata_cache_overflow);
468
return (B_FALSE);
469
}
470
471
return (B_TRUE);
472
}
473
474
return (B_FALSE);
475
}
476
477
/*
478
* Remove an entry from the hash table. It must be in the EVICTING state.
479
*/
480
static void
481
dbuf_hash_remove(dmu_buf_impl_t *db)
482
{
483
dbuf_hash_table_t *h = &dbuf_hash_table;
484
uint64_t idx;
485
dmu_buf_impl_t *dbf, **dbp;
486
487
ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
488
db->db_blkid), ==, db->db_hash);
489
idx = db->db_hash & h->hash_table_mask;
490
491
/*
492
* We mustn't hold db_mtx to maintain lock ordering:
493
* DBUF_HASH_MUTEX > db_mtx.
494
*/
495
ASSERT(zfs_refcount_is_zero(&db->db_holds));
496
ASSERT(db->db_state == DB_EVICTING);
497
ASSERT(!MUTEX_HELD(&db->db_mtx));
498
499
mutex_enter(DBUF_HASH_MUTEX(h, idx));
500
dbp = &h->hash_table[idx];
501
while ((dbf = *dbp) != db) {
502
dbp = &dbf->db_hash_next;
503
ASSERT(dbf != NULL);
504
}
505
*dbp = db->db_hash_next;
506
db->db_hash_next = NULL;
507
if (h->hash_table[idx] &&
508
h->hash_table[idx]->db_hash_next == NULL)
509
DBUF_STAT_BUMPDOWN(hash_chains);
510
mutex_exit(DBUF_HASH_MUTEX(h, idx));
511
DBUF_STAT_BUMPDOWN(hash_elements);
512
}
513
514
typedef enum {
515
DBVU_EVICTING,
516
DBVU_NOT_EVICTING
517
} dbvu_verify_type_t;
518
519
static void
520
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
521
{
522
#ifdef ZFS_DEBUG
523
int64_t holds;
524
525
if (db->db_user == NULL)
526
return;
527
528
/* Only data blocks support the attachment of user data. */
529
ASSERT0(db->db_level);
530
531
/* Clients must resolve a dbuf before attaching user data. */
532
ASSERT(db->db.db_data != NULL);
533
ASSERT3U(db->db_state, ==, DB_CACHED);
534
535
holds = zfs_refcount_count(&db->db_holds);
536
if (verify_type == DBVU_EVICTING) {
537
/*
538
* Immediate eviction occurs when holds == dirtycnt.
539
* For normal eviction buffers, holds is zero on
540
* eviction, except when dbuf_fix_old_data() calls
541
* dbuf_clear_data(). However, the hold count can grow
542
* during eviction even though db_mtx is held (see
543
* dmu_bonus_hold() for an example), so we can only
544
* test the generic invariant that holds >= dirtycnt.
545
*/
546
ASSERT3U(holds, >=, db->db_dirtycnt);
547
} else {
548
if (db->db_user_immediate_evict == TRUE)
549
ASSERT3U(holds, >=, db->db_dirtycnt);
550
else
551
ASSERT3U(holds, >, 0);
552
}
553
#endif
554
}
555
556
static void
557
dbuf_evict_user(dmu_buf_impl_t *db)
558
{
559
dmu_buf_user_t *dbu = db->db_user;
560
561
ASSERT(MUTEX_HELD(&db->db_mtx));
562
563
if (dbu == NULL)
564
return;
565
566
dbuf_verify_user(db, DBVU_EVICTING);
567
db->db_user = NULL;
568
569
#ifdef ZFS_DEBUG
570
if (dbu->dbu_clear_on_evict_dbufp != NULL)
571
*dbu->dbu_clear_on_evict_dbufp = NULL;
572
#endif
573
574
if (db->db_caching_status != DB_NO_CACHE) {
575
/*
576
* This is a cached dbuf, so the size of the user data is
577
* included in its cached amount. We adjust it here because the
578
* user data has already been detached from the dbuf, and the
579
* sync functions are not supposed to touch it (the dbuf might
580
* not exist anymore by the time the sync functions run).
581
*/
582
uint64_t size = dbu->dbu_size;
583
(void) zfs_refcount_remove_many(
584
&dbuf_caches[db->db_caching_status].size, size, dbu);
585
if (db->db_caching_status == DB_DBUF_CACHE)
586
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
587
}
588
589
/*
590
* There are two eviction callbacks - one that we call synchronously
591
* and one that we invoke via a taskq. The async one is useful for
592
* avoiding lock order reversals and limiting stack depth.
593
*
594
* Note that if we have a sync callback but no async callback,
595
* it's likely that the sync callback will free the structure
596
* containing the dbu. In that case we need to take care to not
597
* dereference dbu after calling the sync evict func.
598
*/
599
boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
600
601
if (dbu->dbu_evict_func_sync != NULL)
602
dbu->dbu_evict_func_sync(dbu);
603
604
if (has_async) {
605
taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
606
dbu, 0, &dbu->dbu_tqent);
607
}
608
}
609
610
boolean_t
611
dbuf_is_metadata(dmu_buf_impl_t *db)
612
{
613
/*
614
* Consider indirect blocks and spill blocks to be meta data.
615
*/
616
if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
617
return (B_TRUE);
618
} else {
619
boolean_t is_metadata;
620
621
DB_DNODE_ENTER(db);
622
is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
623
DB_DNODE_EXIT(db);
624
625
return (is_metadata);
626
}
627
}
628
629
/*
630
* We want to exclude buffers that are on a special allocation class from
631
* L2ARC.
632
*/
633
boolean_t
634
dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)
635
{
636
if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
637
(db->db_objset->os_secondary_cache ==
638
ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
639
if (l2arc_exclude_special == 0)
640
return (B_TRUE);
641
642
/*
643
* bp must be checked in the event it was passed from
644
* dbuf_read_impl() as the result of the BP being set from
645
* a Direct I/O write in dbuf_read(). See comments in
646
* dbuf_read().
647
*/
648
blkptr_t *db_bp = bp == NULL ? db->db_blkptr : bp;
649
650
if (db_bp == NULL || BP_IS_HOLE(db_bp))
651
return (B_FALSE);
652
uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);
653
vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
654
vdev_t *vd = NULL;
655
656
if (vdev < rvd->vdev_children)
657
vd = rvd->vdev_child[vdev];
658
659
if (vd == NULL)
660
return (B_TRUE);
661
662
if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
663
vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
664
return (B_TRUE);
665
}
666
return (B_FALSE);
667
}
668
669
static inline boolean_t
670
dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
671
{
672
if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
673
(dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
674
(level > 0 ||
675
DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
676
if (l2arc_exclude_special == 0)
677
return (B_TRUE);
678
679
if (bp == NULL || BP_IS_HOLE(bp))
680
return (B_FALSE);
681
uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
682
vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
683
vdev_t *vd = NULL;
684
685
if (vdev < rvd->vdev_children)
686
vd = rvd->vdev_child[vdev];
687
688
if (vd == NULL)
689
return (B_TRUE);
690
691
if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
692
vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
693
return (B_TRUE);
694
}
695
return (B_FALSE);
696
}
697
698
699
/*
700
* This function *must* return indices evenly distributed between all
701
* sublists of the multilist. This is needed due to how the dbuf eviction
702
* code is laid out; dbuf_evict_thread() assumes dbufs are evenly
703
* distributed between all sublists and uses this assumption when
704
* deciding which sublist to evict from and how much to evict from it.
705
*/
706
static unsigned int
707
dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
708
{
709
dmu_buf_impl_t *db = obj;
710
711
/*
712
* The assumption here is that the hash value for a given
713
* dmu_buf_impl_t will remain constant throughout its lifetime
714
* (i.e. its objset, object, level and blkid fields don't change).
715
* Thus, we don't need to store the dbuf's sublist index
716
* on insertion, as this index can be recalculated on removal.
717
*
718
* Also, the low order bits of the hash value are thought to be
719
* distributed evenly. Otherwise, in the case that the multilist
720
* has a power of two number of sublists, each sublist's usage
721
* would not be evenly distributed. In this context full 64bit
722
* division would be a waste of time, so limit it to 32 bits.
723
*/
724
return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,
725
db->db_level, db->db_blkid) %
726
multilist_get_num_sublists(ml));
727
}
728
729
/*
730
* The target size of the dbuf cache can grow with the ARC target,
731
* unless limited by the tunable dbuf_cache_max_bytes.
732
*/
733
static inline unsigned long
734
dbuf_cache_target_bytes(void)
735
{
736
return (MIN(dbuf_cache_max_bytes,
737
arc_target_bytes() >> dbuf_cache_shift));
738
}
739
740
/*
741
* The target size of the dbuf metadata cache can grow with the ARC target,
742
* unless limited by the tunable dbuf_metadata_cache_max_bytes.
743
*/
744
static inline unsigned long
745
dbuf_metadata_cache_target_bytes(void)
746
{
747
return (MIN(dbuf_metadata_cache_max_bytes,
748
arc_target_bytes() >> dbuf_metadata_cache_shift));
749
}
750
751
static inline uint64_t
752
dbuf_cache_hiwater_bytes(void)
753
{
754
uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
755
return (dbuf_cache_target +
756
(dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
757
}
758
759
static inline uint64_t
760
dbuf_cache_lowater_bytes(void)
761
{
762
uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
763
return (dbuf_cache_target -
764
(dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
765
}
766
767
static inline boolean_t
768
dbuf_cache_above_lowater(void)
769
{
770
return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
771
dbuf_cache_lowater_bytes());
772
}
773
774
/*
775
* Evict the oldest eligible dbuf from the dbuf cache.
776
*/
777
static void
778
dbuf_evict_one(void)
779
{
780
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
781
multilist_sublist_t *mls = multilist_sublist_lock_idx(
782
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
783
784
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
785
786
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
787
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
788
db = multilist_sublist_prev(mls, db);
789
}
790
791
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
792
multilist_sublist_t *, mls);
793
794
if (db != NULL) {
795
multilist_sublist_remove(mls, db);
796
multilist_sublist_unlock(mls);
797
uint64_t size = db->db.db_size;
798
uint64_t usize = dmu_buf_user_size(&db->db);
799
(void) zfs_refcount_remove_many(
800
&dbuf_caches[DB_DBUF_CACHE].size, size, db);
801
(void) zfs_refcount_remove_many(
802
&dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user);
803
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
804
DBUF_STAT_BUMPDOWN(cache_count);
805
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize);
806
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
807
db->db_caching_status = DB_NO_CACHE;
808
dbuf_destroy(db);
809
DBUF_STAT_BUMP(cache_total_evicts);
810
} else {
811
multilist_sublist_unlock(mls);
812
}
813
}
814
815
/*
816
* The dbuf evict thread is responsible for aging out dbufs from the
817
* cache. Once the cache has reached its maximum size, dbufs are removed
818
* and destroyed. The eviction thread will continue running until the size
819
* of the dbuf cache is at or below the maximum size. Once the dbuf is aged
820
* out of the cache it is destroyed and becomes eligible for arc eviction.
821
*/
822
static __attribute__((noreturn)) void
823
dbuf_evict_thread(void *unused)
824
{
825
(void) unused;
826
callb_cpr_t cpr;
827
828
CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
829
830
mutex_enter(&dbuf_evict_lock);
831
while (!dbuf_evict_thread_exit) {
832
while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
833
CALLB_CPR_SAFE_BEGIN(&cpr);
834
(void) cv_timedwait_idle_hires(&dbuf_evict_cv,
835
&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
836
CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
837
}
838
mutex_exit(&dbuf_evict_lock);
839
840
/*
841
* Keep evicting as long as we're above the low water mark
842
* for the cache. We do this without holding the locks to
843
* minimize lock contention.
844
*/
845
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
846
dbuf_evict_one();
847
}
848
849
mutex_enter(&dbuf_evict_lock);
850
}
851
852
dbuf_evict_thread_exit = B_FALSE;
853
cv_broadcast(&dbuf_evict_cv);
854
CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
855
thread_exit();
856
}
857
858
/*
859
* Wake up the dbuf eviction thread if the dbuf cache is at its max size.
860
* If the dbuf cache is at its high water mark, then evict a dbuf from the
861
* dbuf cache using the caller's context.
862
*/
863
static void
864
dbuf_evict_notify(uint64_t size)
865
{
866
/*
867
* We check if we should evict without holding the dbuf_evict_lock,
868
* because it's OK to occasionally make the wrong decision here,
869
* and grabbing the lock results in massive lock contention.
870
*/
871
if (size > dbuf_cache_target_bytes()) {
872
/*
873
* Avoid calling dbuf_evict_one() from memory reclaim context
874
* (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.
875
* Memory reclaim threads can get stuck waiting for the dbuf
876
* hash lock.
877
*/
878
if (size > dbuf_cache_hiwater_bytes() &&
879
!current_is_reclaim_thread()) {
880
dbuf_evict_one();
881
}
882
cv_signal(&dbuf_evict_cv);
883
}
884
}
885
886
/*
887
* Since dbuf cache size is a fraction of target ARC size, ARC calls this when
888
* its target size is reduced due to memory pressure.
889
*/
890
void
891
dbuf_cache_reduce_target_size(void)
892
{
893
uint64_t size = zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
894
895
if (size > dbuf_cache_target_bytes())
896
cv_signal(&dbuf_evict_cv);
897
}
898
899
static int
900
dbuf_kstat_update(kstat_t *ksp, int rw)
901
{
902
dbuf_stats_t *ds = ksp->ks_data;
903
dbuf_hash_table_t *h = &dbuf_hash_table;
904
905
if (rw == KSTAT_WRITE)
906
return (SET_ERROR(EACCES));
907
908
ds->cache_count.value.ui64 =
909
wmsum_value(&dbuf_sums.cache_count);
910
ds->cache_size_bytes.value.ui64 =
911
zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
912
ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
913
ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
914
ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
915
ds->cache_total_evicts.value.ui64 =
916
wmsum_value(&dbuf_sums.cache_total_evicts);
917
for (int i = 0; i < DN_MAX_LEVELS; i++) {
918
ds->cache_levels[i].value.ui64 =
919
wmsum_value(&dbuf_sums.cache_levels[i]);
920
ds->cache_levels_bytes[i].value.ui64 =
921
wmsum_value(&dbuf_sums.cache_levels_bytes[i]);
922
}
923
ds->hash_hits.value.ui64 =
924
wmsum_value(&dbuf_sums.hash_hits);
925
ds->hash_misses.value.ui64 =
926
wmsum_value(&dbuf_sums.hash_misses);
927
ds->hash_collisions.value.ui64 =
928
wmsum_value(&dbuf_sums.hash_collisions);
929
ds->hash_elements.value.ui64 =
930
wmsum_value(&dbuf_sums.hash_elements);
931
ds->hash_chains.value.ui64 =
932
wmsum_value(&dbuf_sums.hash_chains);
933
ds->hash_insert_race.value.ui64 =
934
wmsum_value(&dbuf_sums.hash_insert_race);
935
ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
936
ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
937
ds->metadata_cache_count.value.ui64 =
938
wmsum_value(&dbuf_sums.metadata_cache_count);
939
ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
940
&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
941
ds->metadata_cache_overflow.value.ui64 =
942
wmsum_value(&dbuf_sums.metadata_cache_overflow);
943
return (0);
944
}
945
946
void
947
dbuf_init(void)
948
{
949
uint64_t hmsize, hsize = 1ULL << 16;
950
dbuf_hash_table_t *h = &dbuf_hash_table;
951
952
/*
953
* The hash table is big enough to fill one eighth of physical memory
954
* with an average block size of zfs_arc_average_blocksize (default 8K).
955
* By default, the table will take up
956
* totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
957
*/
958
while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
959
hsize <<= 1;
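/*
 * Editorial worked example for the loop above (illustrative numbers): with
 * 64 GiB of physical memory and the default 8 KiB zfs_arc_average_blocksize,
 * the loop stops once hsize * 8 KiB >= 64 GiB / 8 = 8 GiB, i.e. at
 * hsize = 2^20 buckets, which is an 8 MiB array of pointers.
 */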
960
961
h->hash_table = NULL;
962
while (h->hash_table == NULL) {
963
h->hash_table_mask = hsize - 1;
964
965
h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
966
if (h->hash_table == NULL)
967
hsize >>= 1;
968
969
ASSERT3U(hsize, >=, 1ULL << 10);
970
}
971
972
/*
973
* The hash table buckets are protected by an array of mutexes where
974
* each mutex is responsible for protecting 128 buckets. A minimum
975
* array size of 8192 is targeted to avoid contention.
976
*/
977
if (dbuf_mutex_cache_shift == 0)
978
hmsize = MAX(hsize >> 7, 1ULL << 13);
979
else
980
hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
981
982
h->hash_mutexes = NULL;
983
while (h->hash_mutexes == NULL) {
984
h->hash_mutex_mask = hmsize - 1;
985
986
h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
987
KM_SLEEP);
988
if (h->hash_mutexes == NULL)
989
hmsize >>= 1;
990
}
991
992
dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
993
sizeof (dmu_buf_impl_t),
994
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
995
dbuf_dirty_kmem_cache = kmem_cache_create("dbuf_dirty_record_t",
996
sizeof (dbuf_dirty_record_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
997
998
for (int i = 0; i < hmsize; i++)
999
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL);
1000
1001
dbuf_stats_init(h);
1002
1003
/*
1004
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
1005
* configuration is not required.
1006
*/
1007
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
1008
1009
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
1010
multilist_create(&dbuf_caches[dcs].cache,
1011
sizeof (dmu_buf_impl_t),
1012
offsetof(dmu_buf_impl_t, db_cache_link),
1013
dbuf_cache_multilist_index_func);
1014
zfs_refcount_create(&dbuf_caches[dcs].size);
1015
}
1016
1017
dbuf_evict_thread_exit = B_FALSE;
1018
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1019
cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
1020
dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
1021
NULL, 0, &p0, TS_RUN, minclsyspri);
1022
1023
wmsum_init(&dbuf_sums.cache_count, 0);
1024
wmsum_init(&dbuf_sums.cache_total_evicts, 0);
1025
for (int i = 0; i < DN_MAX_LEVELS; i++) {
1026
wmsum_init(&dbuf_sums.cache_levels[i], 0);
1027
wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
1028
}
1029
wmsum_init(&dbuf_sums.hash_hits, 0);
1030
wmsum_init(&dbuf_sums.hash_misses, 0);
1031
wmsum_init(&dbuf_sums.hash_collisions, 0);
1032
wmsum_init(&dbuf_sums.hash_elements, 0);
1033
wmsum_init(&dbuf_sums.hash_chains, 0);
1034
wmsum_init(&dbuf_sums.hash_insert_race, 0);
1035
wmsum_init(&dbuf_sums.metadata_cache_count, 0);
1036
wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);
1037
1038
dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
1039
KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
1040
KSTAT_FLAG_VIRTUAL);
1041
if (dbuf_ksp != NULL) {
1042
for (int i = 0; i < DN_MAX_LEVELS; i++) {
1043
snprintf(dbuf_stats.cache_levels[i].name,
1044
KSTAT_STRLEN, "cache_level_%d", i);
1045
dbuf_stats.cache_levels[i].data_type =
1046
KSTAT_DATA_UINT64;
1047
snprintf(dbuf_stats.cache_levels_bytes[i].name,
1048
KSTAT_STRLEN, "cache_level_%d_bytes", i);
1049
dbuf_stats.cache_levels_bytes[i].data_type =
1050
KSTAT_DATA_UINT64;
1051
}
1052
dbuf_ksp->ks_data = &dbuf_stats;
1053
dbuf_ksp->ks_update = dbuf_kstat_update;
1054
kstat_install(dbuf_ksp);
1055
}
1056
}
1057
1058
void
1059
dbuf_fini(void)
1060
{
1061
dbuf_hash_table_t *h = &dbuf_hash_table;
1062
1063
dbuf_stats_destroy();
1064
1065
for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
1066
mutex_destroy(&h->hash_mutexes[i]);
1067
1068
vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
1069
vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
1070
sizeof (kmutex_t));
1071
1072
kmem_cache_destroy(dbuf_kmem_cache);
1073
kmem_cache_destroy(dbuf_dirty_kmem_cache);
1074
taskq_destroy(dbu_evict_taskq);
1075
1076
mutex_enter(&dbuf_evict_lock);
1077
dbuf_evict_thread_exit = B_TRUE;
1078
while (dbuf_evict_thread_exit) {
1079
cv_signal(&dbuf_evict_cv);
1080
cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
1081
}
1082
mutex_exit(&dbuf_evict_lock);
1083
1084
mutex_destroy(&dbuf_evict_lock);
1085
cv_destroy(&dbuf_evict_cv);
1086
1087
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
1088
zfs_refcount_destroy(&dbuf_caches[dcs].size);
1089
multilist_destroy(&dbuf_caches[dcs].cache);
1090
}
1091
1092
if (dbuf_ksp != NULL) {
1093
kstat_delete(dbuf_ksp);
1094
dbuf_ksp = NULL;
1095
}
1096
1097
wmsum_fini(&dbuf_sums.cache_count);
1098
wmsum_fini(&dbuf_sums.cache_total_evicts);
1099
for (int i = 0; i < DN_MAX_LEVELS; i++) {
1100
wmsum_fini(&dbuf_sums.cache_levels[i]);
1101
wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
1102
}
1103
wmsum_fini(&dbuf_sums.hash_hits);
1104
wmsum_fini(&dbuf_sums.hash_misses);
1105
wmsum_fini(&dbuf_sums.hash_collisions);
1106
wmsum_fini(&dbuf_sums.hash_elements);
1107
wmsum_fini(&dbuf_sums.hash_chains);
1108
wmsum_fini(&dbuf_sums.hash_insert_race);
1109
wmsum_fini(&dbuf_sums.metadata_cache_count);
1110
wmsum_fini(&dbuf_sums.metadata_cache_overflow);
1111
}
1112
1113
/*
1114
* Other stuff.
1115
*/
1116
1117
#ifdef ZFS_DEBUG
1118
static void
1119
dbuf_verify(dmu_buf_impl_t *db)
1120
{
1121
dnode_t *dn;
1122
dbuf_dirty_record_t *dr;
1123
uint32_t txg_prev;
1124
1125
ASSERT(MUTEX_HELD(&db->db_mtx));
1126
1127
if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
1128
return;
1129
1130
ASSERT(db->db_objset != NULL);
1131
DB_DNODE_ENTER(db);
1132
dn = DB_DNODE(db);
1133
if (dn == NULL) {
1134
ASSERT0P(db->db_parent);
1135
ASSERT0P(db->db_blkptr);
1136
} else {
1137
ASSERT3U(db->db.db_object, ==, dn->dn_object);
1138
ASSERT3P(db->db_objset, ==, dn->dn_objset);
1139
ASSERT3U(db->db_level, <, dn->dn_nlevels);
1140
ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
1141
db->db_blkid == DMU_SPILL_BLKID ||
1142
!avl_is_empty(&dn->dn_dbufs));
1143
}
1144
if (db->db_blkid == DMU_BONUS_BLKID) {
1145
ASSERT(dn != NULL);
1146
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1147
ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
1148
} else if (db->db_blkid == DMU_SPILL_BLKID) {
1149
ASSERT(dn != NULL);
1150
ASSERT0(db->db.db_offset);
1151
} else {
1152
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
1153
}
1154
1155
if ((dr = list_head(&db->db_dirty_records)) != NULL) {
1156
ASSERT(dr->dr_dbuf == db);
1157
txg_prev = dr->dr_txg;
1158
for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
1159
dr = list_next(&db->db_dirty_records, dr)) {
1160
ASSERT(dr->dr_dbuf == db);
1161
ASSERT(txg_prev > dr->dr_txg);
1162
txg_prev = dr->dr_txg;
1163
}
1164
}
1165
1166
/*
1167
* We can't assert that db_size matches dn_datablksz because it
1168
* can be momentarily different when another thread is doing
1169
* dnode_set_blksz().
1170
*/
1171
if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
1172
dr = db->db_data_pending;
1173
/*
1174
* It should only be modified in syncing context, so
1175
* make sure we only have one copy of the data.
1176
*/
1177
ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
1178
}
1179
1180
/* verify db->db_blkptr */
1181
if (db->db_blkptr) {
1182
if (db->db_parent == dn->dn_dbuf) {
1183
/* db is pointed to by the dnode */
1184
/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
1185
if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
1186
ASSERT0P(db->db_parent);
1187
else
1188
ASSERT(db->db_parent != NULL);
1189
if (db->db_blkid != DMU_SPILL_BLKID)
1190
ASSERT3P(db->db_blkptr, ==,
1191
&dn->dn_phys->dn_blkptr[db->db_blkid]);
1192
} else {
1193
/* db is pointed to by an indirect block */
1194
int epb __maybe_unused = db->db_parent->db.db_size >>
1195
SPA_BLKPTRSHIFT;
1196
ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
1197
ASSERT3U(db->db_parent->db.db_object, ==,
1198
db->db.db_object);
1199
ASSERT3P(db->db_blkptr, ==,
1200
((blkptr_t *)db->db_parent->db.db_data +
1201
db->db_blkid % epb));
1202
}
1203
}
1204
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
1205
(db->db_buf == NULL || db->db_buf->b_data) &&
1206
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
1207
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
1208
/*
1209
* If the blkptr isn't set but they have nonzero data,
1210
* it had better be dirty, otherwise we'll lose that
1211
* data when we evict this buffer.
1212
*
1213
* There is an exception to this rule for indirect blocks; in
1214
* this case, if the indirect block is a hole, we fill in a few
1215
* fields on each of the child blocks (importantly, birth time)
1216
* to prevent hole birth times from being lost when you
1217
* partially fill in a hole.
1218
*/
1219
if (db->db_dirtycnt == 0) {
1220
if (db->db_level == 0) {
1221
uint64_t *buf = db->db.db_data;
1222
int i;
1223
1224
for (i = 0; i < db->db.db_size >> 3; i++) {
1225
ASSERT0(buf[i]);
1226
}
1227
} else {
1228
blkptr_t *bps = db->db.db_data;
1229
ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
1230
db->db.db_size);
1231
/*
1232
* We want to verify that all the blkptrs in the
1233
* indirect block are holes, but we may have
1234
* automatically set up a few fields for them.
1235
* We iterate through each blkptr and verify
1236
* they only have those fields set.
1237
*/
1238
for (int i = 0;
1239
i < db->db.db_size / sizeof (blkptr_t);
1240
i++) {
1241
blkptr_t *bp = &bps[i];
1242
ASSERT(ZIO_CHECKSUM_IS_ZERO(
1243
&bp->blk_cksum));
1244
ASSERT(
1245
DVA_IS_EMPTY(&bp->blk_dva[0]) &&
1246
DVA_IS_EMPTY(&bp->blk_dva[1]) &&
1247
DVA_IS_EMPTY(&bp->blk_dva[2]));
1248
ASSERT0(bp->blk_fill);
1249
ASSERT(!BP_IS_EMBEDDED(bp));
1250
ASSERT(BP_IS_HOLE(bp));
1251
ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
1252
}
1253
}
1254
}
1255
}
1256
DB_DNODE_EXIT(db);
1257
}
1258
#endif
1259
1260
static void
1261
dbuf_clear_data(dmu_buf_impl_t *db)
1262
{
1263
ASSERT(MUTEX_HELD(&db->db_mtx));
1264
dbuf_evict_user(db);
1265
ASSERT0P(db->db_buf);
1266
db->db.db_data = NULL;
1267
if (db->db_state != DB_NOFILL) {
1268
db->db_state = DB_UNCACHED;
1269
DTRACE_SET_STATE(db, "clear data");
1270
}
1271
}
1272
1273
static void
1274
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
1275
{
1276
ASSERT(MUTEX_HELD(&db->db_mtx));
1277
ASSERT(buf != NULL);
1278
1279
db->db_buf = buf;
1280
ASSERT(buf->b_data != NULL);
1281
db->db.db_data = buf->b_data;
1282
}
1283
1284
static arc_buf_t *
1285
dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
1286
{
1287
spa_t *spa = db->db_objset->os_spa;
1288
1289
return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));
1290
}
1291
1292
/*
1293
* Calculate which level n block references the data at the level 0 offset
1294
* provided.
1295
*/
1296
uint64_t
1297
dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
1298
{
1299
if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
1300
/*
1301
* The level n blkid is equal to the level 0 blkid divided by
1302
* the number of level 0s in a level n block.
1303
*
1304
* The level 0 blkid is offset >> datablkshift =
1305
* offset / 2^datablkshift.
1306
*
1307
* The number of level 0s in a level n is the number of block
1308
* pointers in an indirect block, raised to the power of level.
1309
* This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
1310
* 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
1311
*
1312
* Thus, the level n blkid is: offset /
1313
* ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
1314
* = offset / 2^(datablkshift + level *
1315
* (indblkshift - SPA_BLKPTRSHIFT))
1316
* = offset >> (datablkshift + level *
1317
* (indblkshift - SPA_BLKPTRSHIFT))
1318
*/
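/*
 * Editorial worked example (illustrative values, not part of the original
 * source): with datablkshift = 17 (128K data blocks), indblkshift = 17 and
 * SPA_BLKPTRSHIFT = 7 (so 1024 block pointers per indirect block),
 * level = 1 and offset = 3 GiB give
 * exp = 17 + 1 * (17 - 7) = 27, so the level 1 blkid is 3 GiB >> 27 = 24.
 */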
1319
1320
const unsigned exp = dn->dn_datablkshift +
1321
level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
1322
1323
if (exp >= 8 * sizeof (offset)) {
1324
/* This only happens on the highest indirection level */
1325
ASSERT3U(level, ==, dn->dn_nlevels - 1);
1326
return (0);
1327
}
1328
1329
ASSERT3U(exp, <, 8 * sizeof (offset));
1330
1331
return (offset >> exp);
1332
} else {
1333
ASSERT3U(offset, <, dn->dn_datablksz);
1334
return (0);
1335
}
1336
}
1337
1338
/*
1339
* This function is used to lock the parent of the provided dbuf. This should be
1340
* used when modifying or reading db_blkptr.
1341
*/
1342
db_lock_type_t
1343
dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
1344
{
1345
enum db_lock_type ret = DLT_NONE;
1346
if (db->db_parent != NULL) {
1347
rw_enter(&db->db_parent->db_rwlock, rw);
1348
ret = DLT_PARENT;
1349
} else if (dmu_objset_ds(db->db_objset) != NULL) {
1350
rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
1351
tag);
1352
ret = DLT_OBJSET;
1353
}
1354
/*
1355
* We only return a DLT_NONE lock when it's the top-most indirect block
1356
* of the meta-dnode of the MOS.
1357
*/
1358
return (ret);
1359
}
1360
1361
/*
1362
* We need to pass the lock type in because it's possible that the block will
1363
* move from being the topmost indirect block in a dnode (and thus, have no
1364
* parent) to not the top-most via an indirection increase. This would cause a
1365
* panic if we didn't pass the lock type in.
1366
*/
1367
void
1368
dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
1369
{
1370
if (type == DLT_PARENT)
1371
rw_exit(&db->db_parent->db_rwlock);
1372
else if (type == DLT_OBJSET)
1373
rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
1374
}
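/*
 * Illustrative usage (editorial sketch of the pattern used throughout this
 * file): take the parent lock before touching db_blkptr and keep the
 * returned lock type so the matching unlock can be issued later:
 *
 *	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 *	blkptr_t *bp = db->db_blkptr;
 *	...
 *	dmu_buf_unlock_parent(db, dblt, FTAG);
 */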
1375
1376
static void
1377
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
1378
arc_buf_t *buf, void *vdb)
1379
{
1380
(void) zb, (void) bp;
1381
dmu_buf_impl_t *db = vdb;
1382
1383
mutex_enter(&db->db_mtx);
1384
ASSERT3U(db->db_state, ==, DB_READ);
1385
1386
/*
1387
* All reads are synchronous, so we must have a hold on the dbuf
1388
*/
1389
ASSERT(zfs_refcount_count(&db->db_holds) > 0);
1390
ASSERT0P(db->db_buf);
1391
ASSERT0P(db->db.db_data);
1392
if (buf == NULL) {
1393
/* i/o error */
1394
ASSERT(zio == NULL || zio->io_error != 0);
1395
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1396
ASSERT0P(db->db_buf);
1397
db->db_state = DB_UNCACHED;
1398
DTRACE_SET_STATE(db, "i/o error");
1399
} else if (db->db_level == 0 && db->db_freed_in_flight) {
1400
/* freed in flight */
1401
ASSERT(zio == NULL || zio->io_error == 0);
1402
arc_release(buf, db);
1403
memset(buf->b_data, 0, db->db.db_size);
1404
arc_buf_freeze(buf);
1405
db->db_freed_in_flight = FALSE;
1406
dbuf_set_data(db, buf);
1407
db->db_state = DB_CACHED;
1408
DTRACE_SET_STATE(db, "freed in flight");
1409
} else {
1410
/* success */
1411
ASSERT(zio == NULL || zio->io_error == 0);
1412
dbuf_set_data(db, buf);
1413
db->db_state = DB_CACHED;
1414
DTRACE_SET_STATE(db, "successful read");
1415
}
1416
cv_broadcast(&db->db_changed);
1417
dbuf_rele_and_unlock(db, NULL, B_FALSE);
1418
}
1419
1420
/*
1421
* Shortcut for performing reads on bonus dbufs. Returns
1422
* an error if we fail to verify the dnode associated with
1423
* a decrypted block. Otherwise success.
1424
*/
1425
static int
1426
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
1427
{
1428
void *db_data;
1429
int bonuslen, max_bonuslen;
1430
1431
bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
1432
max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
1433
ASSERT(MUTEX_HELD(&db->db_mtx));
1434
ASSERT(DB_DNODE_HELD(db));
1435
ASSERT3U(bonuslen, <=, db->db.db_size);
1436
db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
1437
arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
1438
if (bonuslen < max_bonuslen)
1439
memset(db_data, 0, max_bonuslen);
1440
if (bonuslen)
1441
memcpy(db_data, DN_BONUS(dn->dn_phys), bonuslen);
1442
db->db.db_data = db_data;
1443
db->db_state = DB_CACHED;
1444
DTRACE_SET_STATE(db, "bonus buffer filled");
1445
return (0);
1446
}
1447
1448
static void
1449
dbuf_handle_indirect_hole(void *data, dnode_t *dn, blkptr_t *dbbp)
1450
{
1451
blkptr_t *bps = data;
1452
uint32_t indbs = 1ULL << dn->dn_indblkshift;
1453
int n_bps = indbs >> SPA_BLKPTRSHIFT;
1454
1455
for (int i = 0; i < n_bps; i++) {
1456
blkptr_t *bp = &bps[i];
1457
1458
ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
1459
BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
1460
dn->dn_datablksz : BP_GET_LSIZE(dbbp));
1461
BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
1462
BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
1463
BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
1464
}
1465
}
1466
1467
/*
1468
* Handle reads on dbufs that are holes, if necessary. This function
1469
* requires that the dbuf's mutex is held. Returns success (0) if action
1470
* was taken, ENOENT if no action was taken.
1471
*/
1472
static int
1473
dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
1474
{
1475
ASSERT(MUTEX_HELD(&db->db_mtx));
1476
arc_buf_t *db_data;
1477
1478
int is_hole = bp == NULL || BP_IS_HOLE(bp);
1479
/*
1480
* For level 0 blocks only, if the above check fails:
1481
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
1482
* processes the delete record and clears the bp while we are waiting
1483
* for the dn_mtx (resulting in a "no" from block_freed).
1484
*/
1485
if (!is_hole && db->db_level == 0)
1486
is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
1487
1488
if (is_hole) {
1489
db_data = dbuf_alloc_arcbuf(db);
1490
memset(db_data->b_data, 0, db->db.db_size);
1491
1492
if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
1493
BP_GET_LOGICAL_BIRTH(bp) != 0) {
1494
dbuf_handle_indirect_hole(db_data->b_data, dn, bp);
1495
}
1496
dbuf_set_data(db, db_data);
1497
db->db_state = DB_CACHED;
1498
DTRACE_SET_STATE(db, "hole read satisfied");
1499
return (0);
1500
}
1501
return (ENOENT);
1502
}
1503
1504
/*
1505
* This function ensures that, when doing a decrypting read of a block,
1506
* we make sure we have decrypted the dnode associated with it. We must do
1507
* this so that we ensure we are fully authenticating the checksum-of-MACs
1508
* tree from the root of the objset down to this block. Indirect blocks are
1509
* always verified against their secure checksum-of-MACs assuming that the
1510
* dnode containing them is correct. Now that we are doing a decrypting read,
1511
* we can be sure that the key is loaded and verify that assumption. This is
1512
* especially important considering that we always read encrypted dnode
1513
* blocks as raw data (without verifying their MACs) to start, and
1514
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
1515
*/
1516
static int
1517
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,
1518
dmu_flags_t flags)
1519
{
1520
objset_t *os = db->db_objset;
1521
dmu_buf_impl_t *dndb;
1522
arc_buf_t *dnbuf;
1523
zbookmark_phys_t zb;
1524
int err;
1525
1526
if ((flags & DMU_READ_NO_DECRYPT) != 0 ||
1527
!os->os_encrypted || os->os_raw_receive ||
1528
(dndb = dn->dn_dbuf) == NULL)
1529
return (0);
1530
1531
dnbuf = dndb->db_buf;
1532
if (!arc_is_encrypted(dnbuf))
1533
return (0);
1534
1535
mutex_enter(&dndb->db_mtx);
1536
1537
/*
1538
* Since the dnode buffer is modified by the sync process, there can be only
1539
* one copy of it. This means we cannot modify (decrypt) it while it
1540
* is being written. I don't see how this may happen now, since
1541
* encrypted dnode writes by receive should be completed before any
1542
* plain-text reads due to txg wait, but better be safe than sorry.
1543
*/
1544
while (1) {
1545
if (!arc_is_encrypted(dnbuf)) {
1546
mutex_exit(&dndb->db_mtx);
1547
return (0);
1548
}
1549
dbuf_dirty_record_t *dr = dndb->db_data_pending;
1550
if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
1551
break;
1552
cv_wait(&dndb->db_changed, &dndb->db_mtx);
1553
}
1554
1555
SET_BOOKMARK(&zb, dmu_objset_id(os),
1556
DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
1557
err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
1558
1559
/*
1560
* An error code of EACCES tells us that the key is still not
1561
* available. This is ok if we are only reading authenticated
1562
* (and therefore non-encrypted) blocks.
1563
*/
1564
if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
1565
!DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
1566
(db->db_blkid == DMU_BONUS_BLKID &&
1567
!DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
1568
err = 0;
1569
1570
mutex_exit(&dndb->db_mtx);
1571
1572
return (err);
1573
}
1574
1575
/*
1576
* Drops db_mtx and the parent lock specified by dblt and tag before
1577
* returning.
1578
*/
1579
static int
1580
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
1581
db_lock_type_t dblt, blkptr_t *bp, const void *tag)
1582
{
1583
zbookmark_phys_t zb;
1584
uint32_t aflags = ARC_FLAG_NOWAIT;
1585
int err, zio_flags;
1586
1587
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1588
ASSERT(MUTEX_HELD(&db->db_mtx));
1589
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1590
ASSERT0P(db->db_buf);
1591
ASSERT(db->db_parent == NULL ||
1592
RW_LOCK_HELD(&db->db_parent->db_rwlock));
1593
1594
if (db->db_blkid == DMU_BONUS_BLKID) {
1595
err = dbuf_read_bonus(db, dn);
1596
goto early_unlock;
1597
}
1598
1599
err = dbuf_read_hole(db, dn, bp);
1600
if (err == 0)
1601
goto early_unlock;
1602
1603
ASSERT(bp != NULL);
1604
1605
/*
1606
* Any attempt to read a redacted block should result in an error. This
1607
* will never happen under normal conditions, but can be useful for
1608
* debugging purposes.
1609
*/
1610
if (BP_IS_REDACTED(bp)) {
1611
ASSERT(dsl_dataset_feature_is_active(
1612
db->db_objset->os_dsl_dataset,
1613
SPA_FEATURE_REDACTED_DATASETS));
1614
err = SET_ERROR(EIO);
1615
goto early_unlock;
1616
}
1617
1618
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
1619
db->db.db_object, db->db_level, db->db_blkid);
1620
1621
/*
1622
* All bps of an encrypted os should have the encryption bit set.
1623
* If this is not true it indicates tampering and we report an error.
1624
*/
1625
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
1626
spa_log_error(db->db_objset->os_spa, &zb,
1627
BP_GET_PHYSICAL_BIRTH(bp));
1628
err = SET_ERROR(EIO);
1629
goto early_unlock;
1630
}
1631
1632
db->db_state = DB_READ;
1633
DTRACE_SET_STATE(db, "read issued");
1634
mutex_exit(&db->db_mtx);
1635
1636
if (!DBUF_IS_CACHEABLE(db))
1637
aflags |= ARC_FLAG_UNCACHED;
1638
else if (dbuf_is_l2cacheable(db, bp))
1639
aflags |= ARC_FLAG_L2CACHE;
1640
1641
dbuf_add_ref(db, NULL);
1642
1643
zio_flags = (flags & DB_RF_CANFAIL) ?
1644
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
1645
1646
if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))
1647
zio_flags |= ZIO_FLAG_RAW;
1648
1649
/*
1650
* The zio layer will copy the provided blkptr later, but we need to
1651
* do this now so that we can release the parent's rwlock. We have to
1652
* do that now so that if dbuf_read_done is called synchronously (on
1653
* an l1 cache hit) we don't acquire the db_mtx while holding the
1654
* parent's rwlock, which would be a lock ordering violation.
1655
*/
1656
blkptr_t copy = *bp;
1657
dmu_buf_unlock_parent(db, dblt, tag);
1658
return (arc_read(zio, db->db_objset->os_spa, &copy,
1659
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
1660
&aflags, &zb));
1661
1662
early_unlock:
1663
mutex_exit(&db->db_mtx);
1664
dmu_buf_unlock_parent(db, dblt, tag);
1665
return (err);
1666
}
1667
1668
/*
1669
* This is our just-in-time copy function. It makes a copy of buffers that
1670
* have been modified in a previous transaction group before we access them in
1671
* the current active group.
1672
*
1673
* This function is used in three places: when we are dirtying a buffer for the
1674
* first time in a txg, when we are freeing a range in a dnode that includes
1675
* this buffer, and when we are accessing a buffer which was received compressed
1676
* and later referenced in a WRITE_BYREF record.
1677
*
1678
* Note that when we are called from dbuf_free_range() we do not put a hold on
1679
* the buffer, we just traverse the active dbuf list for the dnode.
1680
*/
1681
static void
1682
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
1683
{
1684
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
1685
1686
ASSERT(MUTEX_HELD(&db->db_mtx));
1687
ASSERT(db->db.db_data != NULL);
1688
ASSERT0(db->db_level);
1689
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
1690
1691
if (dr == NULL ||
1692
(dr->dt.dl.dr_data !=
1693
((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
1694
return;
1695
1696
/*
1697
* If the last dirty record for this dbuf has not yet synced
1698
* and it is referencing the dbuf data, either:
1699
* reset the reference to point to a new copy,
1700
* or (if there are no active holders)
1701
* just null out the current db_data pointer.
1702
*/
1703
ASSERT3U(dr->dr_txg, >=, txg - 2);
1704
if (db->db_blkid == DMU_BONUS_BLKID) {
1705
dnode_t *dn = DB_DNODE(db);
1706
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
1707
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
1708
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
1709
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
1710
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
1711
dnode_t *dn = DB_DNODE(db);
1712
int size = arc_buf_size(db->db_buf);
1713
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1714
spa_t *spa = db->db_objset->os_spa;
1715
enum zio_compress compress_type =
1716
arc_get_compression(db->db_buf);
1717
uint8_t complevel = arc_get_complevel(db->db_buf);
1718
1719
if (arc_is_encrypted(db->db_buf)) {
1720
boolean_t byteorder;
1721
uint8_t salt[ZIO_DATA_SALT_LEN];
1722
uint8_t iv[ZIO_DATA_IV_LEN];
1723
uint8_t mac[ZIO_DATA_MAC_LEN];
1724
1725
arc_get_raw_params(db->db_buf, &byteorder, salt,
1726
iv, mac);
1727
dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
1728
dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
1729
mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
1730
compress_type, complevel);
1731
} else if (compress_type != ZIO_COMPRESS_OFF) {
1732
ASSERT3U(type, ==, ARC_BUFC_DATA);
1733
dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
1734
size, arc_buf_lsize(db->db_buf), compress_type,
1735
complevel);
1736
} else {
1737
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
1738
}
1739
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
1740
} else {
1741
db->db_buf = NULL;
1742
dbuf_clear_data(db);
1743
}
1744
}
1745
1746
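/*
* Read the contents of this dbuf. In-flight reads or fills are waited
* for (unless DB_RF_NEVERWAIT), cached encrypted/compressed buffers are
* untransformed as required by the flags, and otherwise an ARC read is
* issued via dbuf_read_impl(). If the caller did not supply a parent
* zio, a root zio is created here and waited on before returning.
* Returns 0 on success or an errno; with a caller-supplied zio an error
* is also propagated through a null child zio.
*/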
int
1747
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)
1748
{
1749
dnode_t *dn;
1750
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
1751
int err;
1752
1753
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1754
1755
DB_DNODE_ENTER(db);
1756
dn = DB_DNODE(db);
1757
1758
/*
1759
* Ensure that this block's dnode has been decrypted if the caller
1760
* has requested decrypted data.
1761
*/
1762
err = dbuf_read_verify_dnode_crypt(db, dn, flags);
1763
if (err != 0)
1764
goto done;
1765
1766
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1767
(flags & DMU_READ_NO_PREFETCH) == 0;
1768
1769
mutex_enter(&db->db_mtx);
1770
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
1771
db->db_pending_evict = B_FALSE;
1772
if (flags & DMU_PARTIAL_FIRST)
1773
db->db_partial_read = B_TRUE;
1774
else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))
1775
db->db_partial_read = B_FALSE;
1776
miss = (db->db_state != DB_CACHED);
1777
1778
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
1779
/*
* Another reader came in while the dbuf was in flight between
* UNCACHED and CACHED. Either a writer will finish filling
* the buffer, sending the dbuf to CACHED, or the first reader's
* request will reach the read_done callback and send the dbuf
* to CACHED. Otherwise, a failure occurred and the dbuf will
* be sent to UNCACHED.
*/
1787
if (flags & DB_RF_NEVERWAIT) {
1788
mutex_exit(&db->db_mtx);
1789
DB_DNODE_EXIT(db);
1790
goto done;
1791
}
1792
do {
1793
ASSERT(db->db_state == DB_READ ||
1794
(flags & DB_RF_HAVESTRUCT) == 0);
1795
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
1796
zio_t *, pio);
1797
cv_wait(&db->db_changed, &db->db_mtx);
1798
} while (db->db_state == DB_READ || db->db_state == DB_FILL);
1799
if (db->db_state == DB_UNCACHED) {
1800
err = SET_ERROR(EIO);
1801
mutex_exit(&db->db_mtx);
1802
DB_DNODE_EXIT(db);
1803
goto done;
1804
}
1805
}
1806
1807
if (db->db_state == DB_CACHED) {
1808
/*
1809
* If the arc buf is compressed or encrypted and the caller
1810
* requested uncompressed data, we need to untransform it
1811
* before returning. We also call arc_untransform() on any
1812
* unauthenticated blocks, which will verify their MAC if
1813
* the key is now available.
1814
*/
1815
if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&
1816
(arc_is_encrypted(db->db_buf) ||
1817
arc_is_unauthenticated(db->db_buf) ||
1818
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
1819
spa_t *spa = dn->dn_objset->os_spa;
1820
zbookmark_phys_t zb;
1821
1822
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
1823
db->db.db_object, db->db_level, db->db_blkid);
1824
dbuf_fix_old_data(db, spa_syncing_txg(spa));
1825
err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
1826
dbuf_set_data(db, db->db_buf);
1827
}
1828
mutex_exit(&db->db_mtx);
1829
} else {
1830
ASSERT(db->db_state == DB_UNCACHED ||
1831
db->db_state == DB_NOFILL);
1832
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
1833
blkptr_t *bp;
1834
1835
/*
* If a block clone or Direct I/O write has occurred we will
* get the dirty record's overridden BP so that we read the most
* recent data.
*/
1840
err = dmu_buf_get_bp_from_dbuf(db, &bp);
1841
1842
if (!err) {
1843
if (pio == NULL && (db->db_state == DB_NOFILL ||
1844
(bp != NULL && !BP_IS_HOLE(bp)))) {
1845
spa_t *spa = dn->dn_objset->os_spa;
1846
pio =
1847
zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
1848
need_wait = B_TRUE;
1849
}
1850
1851
err =
1852
dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
1853
} else {
1854
mutex_exit(&db->db_mtx);
1855
dmu_buf_unlock_parent(db, dblt, FTAG);
1856
}
1857
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
1858
miss = (db->db_state != DB_CACHED);
1859
}
1860
1861
if (err == 0 && prefetch) {
1862
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
1863
flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||
1864
db->db_pending_evict);
1865
}
1866
DB_DNODE_EXIT(db);
1867
1868
/*
1869
* If we created a zio we must execute it to avoid leaking it, even if
1870
* it isn't attached to any work due to an error in dbuf_read_impl().
1871
*/
1872
if (need_wait) {
1873
if (err == 0)
1874
err = zio_wait(pio);
1875
else
1876
(void) zio_wait(pio);
1877
pio = NULL;
1878
}
1879
1880
done:
1881
if (miss)
1882
DBUF_STAT_BUMP(hash_misses);
1883
else
1884
DBUF_STAT_BUMP(hash_hits);
1885
if (pio && err != 0) {
1886
zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
1887
ZIO_FLAG_CANFAIL);
1888
zio->io_error = err;
1889
zio_nowait(zio);
1890
}
1891
1892
return (err);
1893
}
1894
1895
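/*
* Prepare this dbuf to be completely overwritten without reading the old
* contents from disk: wait out any in-flight READ or FILL, then attach a
* fresh ARC buffer and move to DB_FILL (or just clear the data pointers
* for a NOFILL dbuf). A dbuf that is already cached is left alone.
*/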
static void
1896
dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)
1897
{
1898
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1899
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1900
mutex_enter(&db->db_mtx);
1901
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
1902
db->db_pending_evict = B_FALSE;
1903
db->db_partial_read = B_FALSE;
1904
while (db->db_state == DB_READ || db->db_state == DB_FILL)
1905
cv_wait(&db->db_changed, &db->db_mtx);
1906
if (db->db_state == DB_UNCACHED) {
1907
ASSERT0P(db->db_buf);
1908
ASSERT0P(db->db.db_data);
1909
dbuf_set_data(db, dbuf_alloc_arcbuf(db));
1910
db->db_state = DB_FILL;
1911
DTRACE_SET_STATE(db, "assigning filled buffer");
1912
} else if (db->db_state == DB_NOFILL) {
1913
dbuf_clear_data(db);
1914
} else {
1915
ASSERT3U(db->db_state, ==, DB_CACHED);
1916
}
1917
mutex_exit(&db->db_mtx);
1918
}
1919
1920
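/*
* Undo an override (dmu_sync(), block clone, or Direct I/O write) recorded
* on a level-0 dirty record: free the already-written block unless it was a
* nopwrite, clear the override state, and re-release the ARC buffer so the
* caller can modify it again.
*/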
void
1921
dbuf_unoverride(dbuf_dirty_record_t *dr)
1922
{
1923
dmu_buf_impl_t *db = dr->dr_dbuf;
1924
blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
1925
uint64_t txg = dr->dr_txg;
1926
1927
ASSERT(MUTEX_HELD(&db->db_mtx));
1928
1929
/*
1930
* This assert is valid because dmu_sync() expects to be called by
1931
* a zilog's get_data while holding a range lock. This call only
1932
* comes from dbuf_dirty() callers who must also hold a range lock.
1933
*/
1934
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
1935
ASSERT0(db->db_level);
1936
1937
if (db->db_blkid == DMU_BONUS_BLKID ||
1938
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
1939
return;
1940
1941
ASSERT(db->db_data_pending != dr);
1942
1943
/* free this block */
1944
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
1945
zio_free(db->db_objset->os_spa, txg, bp);
1946
1947
if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
1948
ASSERT0P(dr->dt.dl.dr_data);
1949
dr->dt.dl.dr_data = db->db_buf;
1950
}
1951
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1952
dr->dt.dl.dr_nopwrite = B_FALSE;
1953
dr->dt.dl.dr_brtwrite = B_FALSE;
1954
dr->dt.dl.dr_diowrite = B_FALSE;
1955
dr->dt.dl.dr_has_raw_params = B_FALSE;
1956
1957
/*
1958
* In the event that Direct I/O was used, we do not
1959
* need to release the buffer from the ARC.
1960
*
1961
* Release the already-written buffer, so we leave it in
1962
* a consistent dirty state. Note that all callers are
1963
* modifying the buffer, so they will immediately do
1964
* another (redundant) arc_release(). Therefore, leave
1965
* the buf thawed to save the effort of freezing &
1966
* immediately re-thawing it.
1967
*/
1968
if (dr->dt.dl.dr_data)
1969
arc_release(dr->dt.dl.dr_data, db);
1970
}
1971
1972
/*
* Evict (if it's unreferenced) or clear (if it's referenced) any level-0
* data blocks in the free range, so that any future readers will find
* empty blocks.
*/
1977
void
1978
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
1979
dmu_tx_t *tx)
1980
{
1981
dmu_buf_impl_t *db_search;
1982
dmu_buf_impl_t *db, *db_next;
1983
uint64_t txg = tx->tx_txg;
1984
avl_index_t where;
1985
dbuf_dirty_record_t *dr;
1986
1987
if (end_blkid > dn->dn_maxblkid &&
1988
!(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
1989
end_blkid = dn->dn_maxblkid;
1990
dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,
1991
(u_longlong_t)end_blkid);
1992
1993
db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
1994
db_search->db_level = 0;
1995
db_search->db_blkid = start_blkid;
1996
db_search->db_state = DB_SEARCH;
1997
1998
mutex_enter(&dn->dn_dbufs_mtx);
1999
db = avl_find(&dn->dn_dbufs, db_search, &where);
2000
ASSERT0P(db);
2001
2002
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
2003
2004
for (; db != NULL; db = db_next) {
2005
db_next = AVL_NEXT(&dn->dn_dbufs, db);
2006
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2007
2008
if (db->db_level != 0 || db->db_blkid > end_blkid) {
2009
break;
2010
}
2011
ASSERT3U(db->db_blkid, >=, start_blkid);
2012
2013
/* found a level 0 buffer in the range */
2014
mutex_enter(&db->db_mtx);
2015
if (dbuf_undirty(db, tx)) {
2016
/* mutex has been dropped and dbuf destroyed */
2017
continue;
2018
}
2019
2020
if (db->db_state == DB_UNCACHED ||
2021
db->db_state == DB_NOFILL ||
2022
db->db_state == DB_EVICTING) {
2023
ASSERT0P(db->db.db_data);
2024
mutex_exit(&db->db_mtx);
2025
continue;
2026
}
2027
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
2028
/* will be handled in dbuf_read_done or dbuf_rele */
2029
db->db_freed_in_flight = TRUE;
2030
mutex_exit(&db->db_mtx);
2031
continue;
2032
}
2033
if (zfs_refcount_count(&db->db_holds) == 0) {
2034
ASSERT(db->db_buf);
2035
dbuf_destroy(db);
2036
continue;
2037
}
2038
/* The dbuf is referenced */
2039
2040
dr = list_head(&db->db_dirty_records);
2041
if (dr != NULL) {
2042
if (dr->dr_txg == txg) {
2043
/*
2044
* This buffer is "in-use", re-adjust the file
2045
* size to reflect that this buffer may
2046
* contain new data when we sync.
2047
*/
2048
if (db->db_blkid != DMU_SPILL_BLKID &&
2049
db->db_blkid > dn->dn_maxblkid)
2050
dn->dn_maxblkid = db->db_blkid;
2051
dbuf_unoverride(dr);
2052
} else {
2053
/*
* This dbuf is not dirty in the open context.
* Either uncache it (if it's not referenced in
* the open context) or reset its contents to
* empty.
*/
2059
dbuf_fix_old_data(db, txg);
2060
}
2061
}
2062
/* clear the contents if it's cached */
2063
if (db->db_state == DB_CACHED) {
2064
ASSERT(db->db.db_data != NULL);
2065
arc_release(db->db_buf, db);
2066
rw_enter(&db->db_rwlock, RW_WRITER);
2067
memset(db->db.db_data, 0, db->db.db_size);
2068
rw_exit(&db->db_rwlock);
2069
arc_buf_freeze(db->db_buf);
2070
}
2071
2072
mutex_exit(&db->db_mtx);
2073
}
2074
2075
mutex_exit(&dn->dn_dbufs_mtx);
2076
kmem_free(db_search, sizeof (dmu_buf_impl_t));
2077
}
2078
2079
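/*
* Change the size of this dbuf (not applicable to the bonus buffer) in the
* given transaction: allocate a new ARC buffer of the requested size, copy
* over the old contents, zero any newly added space, and update the dirty
* record's space accounting.
*/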
void
2080
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
2081
{
2082
arc_buf_t *buf, *old_buf;
2083
dbuf_dirty_record_t *dr;
2084
int osize = db->db.db_size;
2085
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2086
dnode_t *dn;
2087
2088
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2089
2090
DB_DNODE_ENTER(db);
2091
dn = DB_DNODE(db);
2092
2093
/*
2094
* XXX we should be doing a dbuf_read, checking the return
2095
* value and returning that up to our callers
2096
*/
2097
dmu_buf_will_dirty(&db->db, tx);
2098
2099
VERIFY3P(db->db_buf, !=, NULL);
2100
2101
/* create the data buffer for the new block */
2102
buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
2103
2104
/* copy old block data to the new block */
2105
old_buf = db->db_buf;
2106
memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));
2107
/* zero the remainder */
2108
if (size > osize)
2109
memset((uint8_t *)buf->b_data + osize, 0, size - osize);
2110
2111
mutex_enter(&db->db_mtx);
2112
dbuf_set_data(db, buf);
2113
arc_buf_destroy(old_buf, db);
2114
db->db.db_size = size;
2115
2116
dr = list_head(&db->db_dirty_records);
2117
/* dirty record added by dmu_buf_will_dirty() */
2118
VERIFY(dr != NULL);
2119
if (db->db_level == 0)
2120
dr->dt.dl.dr_data = buf;
2121
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2122
ASSERT3U(dr->dr_accounted, ==, osize);
2123
dr->dr_accounted = size;
2124
mutex_exit(&db->db_mtx);
2125
2126
dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
2127
DB_DNODE_EXIT(db);
2128
}
2129
2130
void
2131
dbuf_release_bp(dmu_buf_impl_t *db)
2132
{
2133
objset_t *os __maybe_unused = db->db_objset;
2134
2135
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
2136
ASSERT(arc_released(os->os_phys_buf) ||
2137
list_link_active(&os->os_dsl_dataset->ds_synced_link));
2138
ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
2139
2140
(void) arc_release(db->db_buf, db);
2141
}
2142
2143
/*
2144
* We already have a dirty record for this TXG, and we are being
2145
* dirtied again.
2146
*/
2147
static void
2148
dbuf_redirty(dbuf_dirty_record_t *dr)
2149
{
2150
dmu_buf_impl_t *db = dr->dr_dbuf;
2151
2152
ASSERT(MUTEX_HELD(&db->db_mtx));
2153
2154
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
2155
/*
2156
* If this buffer has already been written out,
2157
* we now need to reset its state.
2158
*/
2159
dbuf_unoverride(dr);
2160
if (db->db.db_object != DMU_META_DNODE_OBJECT &&
2161
db->db_state != DB_NOFILL) {
2162
/* Already released on initial dirty, so just thaw. */
2163
ASSERT(arc_released(db->db_buf));
2164
arc_buf_thaw(db->db_buf);
2165
}
2166
2167
/*
2168
* Clear the rewrite flag since this is now a logical
2169
* modification.
2170
*/
2171
dr->dt.dl.dr_rewrite = B_FALSE;
2172
}
2173
}
2174
2175
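/*
* Dirty a level-0 block by blkid without instantiating a dbuf for it
* (a "lightweight" dirty record). The record is linked directly onto the
* dnode's dirty list when the dnode has a single level, or under the parent
* indirect block's dirty record otherwise. Returns NULL if the parent
* indirect block could not be held or read.
*/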
dbuf_dirty_record_t *
2176
dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
2177
{
2178
rw_enter(&dn->dn_struct_rwlock, RW_READER);
2179
IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
2180
dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
2181
ASSERT(dn->dn_maxblkid >= blkid);
2182
2183
dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
2184
list_link_init(&dr->dr_dirty_node);
2185
list_link_init(&dr->dr_dbuf_node);
2186
dr->dr_dnode = dn;
2187
dr->dr_txg = tx->tx_txg;
2188
dr->dt.dll.dr_blkid = blkid;
2189
dr->dr_accounted = dn->dn_datablksz;
2190
2191
/*
2192
* There should not be any dbuf for the block that we're dirtying.
2193
* Otherwise the buffer contents could be inconsistent between the
2194
* dbuf and the lightweight dirty record.
2195
*/
2196
ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
2197
NULL));
2198
2199
mutex_enter(&dn->dn_mtx);
2200
int txgoff = tx->tx_txg & TXG_MASK;
2201
if (dn->dn_free_ranges[txgoff] != NULL) {
2202
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
2203
}
2204
2205
if (dn->dn_nlevels == 1) {
2206
ASSERT3U(blkid, <, dn->dn_nblkptr);
2207
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2208
mutex_exit(&dn->dn_mtx);
2209
rw_exit(&dn->dn_struct_rwlock);
2210
dnode_setdirty(dn, tx);
2211
} else {
2212
mutex_exit(&dn->dn_mtx);
2213
2214
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2215
dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
2216
1, blkid >> epbs, FTAG);
2217
rw_exit(&dn->dn_struct_rwlock);
2218
if (parent_db == NULL) {
2219
kmem_free(dr, sizeof (*dr));
2220
return (NULL);
2221
}
2222
int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |
2223
DMU_READ_NO_PREFETCH);
2224
if (err != 0) {
2225
dbuf_rele(parent_db, FTAG);
2226
kmem_free(dr, sizeof (*dr));
2227
return (NULL);
2228
}
2229
2230
dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
2231
dbuf_rele(parent_db, FTAG);
2232
mutex_enter(&parent_dr->dt.di.dr_mtx);
2233
ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
2234
list_insert_tail(&parent_dr->dt.di.dr_children, dr);
2235
mutex_exit(&parent_dr->dt.di.dr_mtx);
2236
dr->dr_parent = parent_dr;
2237
}
2238
2239
dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
2240
2241
return (dr);
2242
}
2243
2244
dbuf_dirty_record_t *
2245
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
2246
{
2247
dnode_t *dn;
2248
objset_t *os;
2249
dbuf_dirty_record_t *dr, *dr_next, *dr_head;
2250
int txgoff = tx->tx_txg & TXG_MASK;
2251
boolean_t drop_struct_rwlock = B_FALSE;
2252
2253
ASSERT(tx->tx_txg != 0);
2254
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2255
DMU_TX_DIRTY_BUF(tx, db);
2256
2257
DB_DNODE_ENTER(db);
2258
dn = DB_DNODE(db);
2259
/*
2260
* Shouldn't dirty a regular buffer in syncing context. Private
2261
* objects may be dirtied in syncing context, but only if they
2262
* were already pre-dirtied in open context.
2263
*/
2264
#ifdef ZFS_DEBUG
2265
if (dn->dn_objset->os_dsl_dataset != NULL) {
2266
rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
2267
RW_READER, FTAG);
2268
}
2269
ASSERT(!dmu_tx_is_syncing(tx) ||
2270
BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
2271
DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
2272
dn->dn_objset->os_dsl_dataset == NULL);
2273
if (dn->dn_objset->os_dsl_dataset != NULL)
2274
rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
2275
#endif
2276
2277
mutex_enter(&db->db_mtx);
2278
/*
2279
* XXX make this true for indirects too? The problem is that
2280
* transactions created with dmu_tx_create_assigned() from
2281
* syncing context don't bother holding ahead.
2282
*/
2283
ASSERT(db->db_level != 0 ||
2284
db->db_state == DB_CACHED || db->db_state == DB_FILL ||
2285
db->db_state == DB_NOFILL);
2286
2287
if (db->db_blkid == DMU_SPILL_BLKID)
2288
dn->dn_have_spill = B_TRUE;
2289
2290
/*
2291
* If this buffer is already dirty, we're done.
2292
*/
2293
dr_head = list_head(&db->db_dirty_records);
2294
ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
2295
db->db.db_object == DMU_META_DNODE_OBJECT);
2296
dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
2297
if (dr_next && dr_next->dr_txg == tx->tx_txg) {
2298
DB_DNODE_EXIT(db);
2299
2300
dbuf_redirty(dr_next);
2301
mutex_exit(&db->db_mtx);
2302
return (dr_next);
2303
}
2304
2305
ASSERT3U(dn->dn_nlevels, >, db->db_level);
2306
2307
/*
2308
* We should only be dirtying in syncing context if it's the
2309
* mos or we're initializing the os or it's a special object.
2310
* However, we are allowed to dirty in syncing context provided
2311
* we already dirtied it in open context. Hence we must make
2312
* this assertion only if we're not already dirty.
2313
*/
2314
os = dn->dn_objset;
2315
VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
2316
#ifdef ZFS_DEBUG
2317
if (dn->dn_objset->os_dsl_dataset != NULL)
2318
rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
2319
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
2320
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
2321
if (dn->dn_objset->os_dsl_dataset != NULL)
2322
rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
2323
#endif
2324
ASSERT(db->db.db_size != 0);
2325
2326
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
2327
2328
if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
2329
dmu_objset_willuse_space(os, db->db.db_size, tx);
2330
}
2331
2332
/*
2333
* If this buffer is dirty in an old transaction group we need
2334
* to make a copy of it so that the changes we make in this
2335
* transaction group won't leak out when we sync the older txg.
2336
*/
2337
dr = kmem_cache_alloc(dbuf_dirty_kmem_cache, KM_SLEEP);
2338
memset(dr, 0, sizeof (*dr));
2339
list_link_init(&dr->dr_dirty_node);
2340
list_link_init(&dr->dr_dbuf_node);
2341
dr->dr_dnode = dn;
2342
if (db->db_level == 0) {
2343
void *data_old = db->db_buf;
2344
2345
if (db->db_state != DB_NOFILL) {
2346
if (db->db_blkid == DMU_BONUS_BLKID) {
2347
dbuf_fix_old_data(db, tx->tx_txg);
2348
data_old = db->db.db_data;
2349
} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
2350
/*
2351
* Release the data buffer from the cache so
2352
* that we can modify it without impacting
2353
* possible other users of this cached data
2354
* block. Note that indirect blocks and
2355
* private objects are not released until the
2356
* syncing state (since they are only modified
2357
* then).
2358
*/
2359
arc_release(db->db_buf, db);
2360
dbuf_fix_old_data(db, tx->tx_txg);
2361
data_old = db->db_buf;
2362
}
2363
ASSERT(data_old != NULL);
2364
}
2365
dr->dt.dl.dr_data = data_old;
2366
} else {
2367
mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
2368
list_create(&dr->dt.di.dr_children,
2369
sizeof (dbuf_dirty_record_t),
2370
offsetof(dbuf_dirty_record_t, dr_dirty_node));
2371
}
2372
if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
2373
dr->dr_accounted = db->db.db_size;
2374
}
2375
dr->dr_dbuf = db;
2376
dr->dr_txg = tx->tx_txg;
2377
list_insert_before(&db->db_dirty_records, dr_next, dr);
2378
2379
/*
2380
* We could have been freed_in_flight between the dbuf_noread
2381
* and dbuf_dirty. We win, as though the dbuf_noread() had
2382
* happened after the free.
2383
*/
2384
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2385
db->db_blkid != DMU_SPILL_BLKID) {
2386
mutex_enter(&dn->dn_mtx);
2387
if (dn->dn_free_ranges[txgoff] != NULL) {
2388
zfs_range_tree_clear(dn->dn_free_ranges[txgoff],
2389
db->db_blkid, 1);
2390
}
2391
mutex_exit(&dn->dn_mtx);
2392
db->db_freed_in_flight = FALSE;
2393
}
2394
2395
/*
2396
* This buffer is now part of this txg
2397
*/
2398
dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
2399
db->db_dirtycnt += 1;
2400
ASSERT3U(db->db_dirtycnt, <=, 3);
2401
2402
mutex_exit(&db->db_mtx);
2403
2404
if (db->db_blkid == DMU_BONUS_BLKID ||
2405
db->db_blkid == DMU_SPILL_BLKID) {
2406
mutex_enter(&dn->dn_mtx);
2407
ASSERT(!list_link_active(&dr->dr_dirty_node));
2408
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2409
mutex_exit(&dn->dn_mtx);
2410
dnode_setdirty(dn, tx);
2411
DB_DNODE_EXIT(db);
2412
return (dr);
2413
}
2414
2415
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
2416
rw_enter(&dn->dn_struct_rwlock, RW_READER);
2417
drop_struct_rwlock = B_TRUE;
2418
}
2419
2420
/*
2421
* If we are overwriting a dedup BP, then unless it is snapshotted,
2422
* when we get to syncing context we will need to decrement its
2423
* refcount in the DDT. Prefetch the relevant DDT block so that
2424
* syncing context won't have to wait for the i/o.
2425
*/
2426
if (db->db_blkptr != NULL) {
2427
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
2428
ddt_prefetch(os->os_spa, db->db_blkptr);
2429
dmu_buf_unlock_parent(db, dblt, FTAG);
2430
}
2431
2432
/*
2433
* We need to hold the dn_struct_rwlock to make this assertion,
2434
* because it protects dn_phys / dn_next_nlevels from changing.
2435
*/
2436
ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
2437
dn->dn_phys->dn_nlevels > db->db_level ||
2438
dn->dn_next_nlevels[txgoff] > db->db_level ||
2439
dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
2440
dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
2441
2442
2443
if (db->db_level == 0) {
2444
ASSERT(!db->db_objset->os_raw_receive ||
2445
dn->dn_maxblkid >= db->db_blkid);
2446
dnode_new_blkid(dn, db->db_blkid, tx,
2447
drop_struct_rwlock, B_FALSE);
2448
ASSERT(dn->dn_maxblkid >= db->db_blkid);
2449
}
2450
2451
if (db->db_level+1 < dn->dn_nlevels) {
2452
dmu_buf_impl_t *parent = db->db_parent;
2453
dbuf_dirty_record_t *di;
2454
int parent_held = FALSE;
2455
2456
if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
2457
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2458
parent = dbuf_hold_level(dn, db->db_level + 1,
2459
db->db_blkid >> epbs, FTAG);
2460
ASSERT(parent != NULL);
2461
parent_held = TRUE;
2462
}
2463
if (drop_struct_rwlock)
2464
rw_exit(&dn->dn_struct_rwlock);
2465
ASSERT3U(db->db_level + 1, ==, parent->db_level);
2466
di = dbuf_dirty(parent, tx);
2467
if (parent_held)
2468
dbuf_rele(parent, FTAG);
2469
2470
mutex_enter(&db->db_mtx);
2471
/*
2472
* Since we've dropped the mutex, it's possible that
2473
* dbuf_undirty() might have changed this out from under us.
2474
*/
2475
if (list_head(&db->db_dirty_records) == dr ||
2476
dn->dn_object == DMU_META_DNODE_OBJECT) {
2477
mutex_enter(&di->dt.di.dr_mtx);
2478
ASSERT3U(di->dr_txg, ==, tx->tx_txg);
2479
ASSERT(!list_link_active(&dr->dr_dirty_node));
2480
list_insert_tail(&di->dt.di.dr_children, dr);
2481
mutex_exit(&di->dt.di.dr_mtx);
2482
dr->dr_parent = di;
2483
}
2484
mutex_exit(&db->db_mtx);
2485
} else {
2486
ASSERT(db->db_level + 1 == dn->dn_nlevels);
2487
ASSERT(db->db_blkid < dn->dn_nblkptr);
2488
ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
2489
mutex_enter(&dn->dn_mtx);
2490
ASSERT(!list_link_active(&dr->dr_dirty_node));
2491
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2492
mutex_exit(&dn->dn_mtx);
2493
if (drop_struct_rwlock)
2494
rw_exit(&dn->dn_struct_rwlock);
2495
}
2496
2497
dnode_setdirty(dn, tx);
2498
DB_DNODE_EXIT(db);
2499
return (dr);
2500
}
2501
2502
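/*
* Dispose of a bonus buffer's dirty record once it has been synced: free the
* private copy of the bonus data (if one was made), unlink and free the dirty
* record, and drop the dbuf's dirty count.
*/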
static void
2503
dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
2504
{
2505
dmu_buf_impl_t *db = dr->dr_dbuf;
2506
2507
ASSERT(MUTEX_HELD(&db->db_mtx));
2508
if (dr->dt.dl.dr_data != db->db.db_data) {
2509
struct dnode *dn = dr->dr_dnode;
2510
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
2511
2512
kmem_free(dr->dt.dl.dr_data, max_bonuslen);
2513
arc_space_return(max_bonuslen, ARC_SPACE_BONUS);
2514
}
2515
db->db_data_pending = NULL;
2516
ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
2517
list_remove(&db->db_dirty_records, dr);
2518
if (dr->dr_dbuf->db_level != 0) {
2519
mutex_destroy(&dr->dt.di.dr_mtx);
2520
list_destroy(&dr->dt.di.dr_children);
2521
}
2522
kmem_cache_free(dbuf_dirty_kmem_cache, dr);
2523
ASSERT3U(db->db_dirtycnt, >, 0);
2524
db->db_dirtycnt -= 1;
2525
}
2526
2527
/*
2528
* Undirty a buffer in the transaction group referenced by the given
2529
* transaction. Return whether this evicted the dbuf.
2530
*/
2531
boolean_t
2532
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
2533
{
2534
uint64_t txg = tx->tx_txg;
2535
boolean_t brtwrite;
2536
boolean_t diowrite;
2537
2538
ASSERT(txg != 0);
2539
2540
/*
2541
* Due to our use of dn_nlevels below, this can only be called
2542
* in open context, unless we are operating on the MOS or it's
2543
* a special object. From syncing context, dn_nlevels may be
2544
* different from the dn_nlevels used when dbuf was dirtied.
2545
*/
2546
ASSERT(db->db_objset ==
2547
dmu_objset_pool(db->db_objset)->dp_meta_objset ||
2548
DMU_OBJECT_IS_SPECIAL(db->db.db_object) ||
2549
txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
2550
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2551
ASSERT0(db->db_level);
2552
ASSERT(MUTEX_HELD(&db->db_mtx));
2553
2554
/*
2555
* If this buffer is not dirty, we're done.
2556
*/
2557
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
2558
if (dr == NULL)
2559
return (B_FALSE);
2560
ASSERT(dr->dr_dbuf == db);
2561
2562
brtwrite = dr->dt.dl.dr_brtwrite;
2563
diowrite = dr->dt.dl.dr_diowrite;
2564
if (brtwrite) {
2565
ASSERT3B(diowrite, ==, B_FALSE);
2566
/*
2567
* We are freeing a block that we cloned in the same
2568
* transaction group.
2569
*/
2570
blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
2571
if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
2572
brt_pending_remove(dmu_objset_spa(db->db_objset),
2573
bp, tx);
2574
}
2575
}
2576
2577
dnode_t *dn = dr->dr_dnode;
2578
2579
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
2580
2581
ASSERT(db->db.db_size != 0);
2582
2583
dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
2584
dr->dr_accounted, txg);
2585
2586
list_remove(&db->db_dirty_records, dr);
2587
2588
/*
2589
* Note that there are three places in dbuf_dirty()
2590
* where this dirty record may be put on a list.
2591
* Make sure to do a list_remove corresponding to
2592
* every one of those list_insert calls.
2593
*/
2594
if (dr->dr_parent) {
2595
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
2596
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
2597
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
2598
} else if (db->db_blkid == DMU_SPILL_BLKID ||
2599
db->db_level + 1 == dn->dn_nlevels) {
2600
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
2601
mutex_enter(&dn->dn_mtx);
2602
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
2603
mutex_exit(&dn->dn_mtx);
2604
}
2605
2606
if (db->db_state != DB_NOFILL && !brtwrite) {
2607
dbuf_unoverride(dr);
2608
2609
if (dr->dt.dl.dr_data != db->db_buf) {
2610
ASSERT(db->db_buf != NULL);
2611
ASSERT(dr->dt.dl.dr_data != NULL);
2612
arc_buf_destroy(dr->dt.dl.dr_data, db);
2613
}
2614
}
2615
2616
kmem_cache_free(dbuf_dirty_kmem_cache, dr);
2617
2618
ASSERT(db->db_dirtycnt > 0);
2619
db->db_dirtycnt -= 1;
2620
2621
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
2622
ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
2623
arc_released(db->db_buf));
2624
dbuf_destroy(db);
2625
return (B_TRUE);
2626
}
2627
2628
return (B_FALSE);
2629
}
2630
2631
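/*
* Make sure this dbuf's contents are cached and mark it dirty in the given
* transaction. A dbuf that was block-cloned in this txg is fully undirtied
* first (after the read), since a cloned dirty record carries no data that
* could simply be redirtied.
*/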
void
2632
dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)
2633
{
2634
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2635
boolean_t undirty = B_FALSE;
2636
2637
ASSERT(tx->tx_txg != 0);
2638
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2639
2640
/*
2641
* Quick check for dirtiness to improve performance for some workloads
2642
* (e.g. file deletion with indirect blocks cached).
2643
*/
2644
mutex_enter(&db->db_mtx);
2645
if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
2646
/*
2647
* It's possible that the dbuf is already dirty but not cached,
2648
* because there are some calls to dbuf_dirty() that don't
2649
* go through dmu_buf_will_dirty().
2650
*/
2651
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2652
if (dr != NULL) {
2653
if (db->db_level == 0 &&
2654
dr->dt.dl.dr_brtwrite) {
2655
/*
2656
* Block cloning: If we are dirtying a cloned
2657
* level 0 block, we cannot simply redirty it,
2658
* because this dr has no associated data.
2659
* We will go through a full undirtying below,
2660
* before dirtying it again.
2661
*/
2662
undirty = B_TRUE;
2663
} else {
2664
/* This dbuf is already dirty and cached. */
2665
dbuf_redirty(dr);
2666
mutex_exit(&db->db_mtx);
2667
return;
2668
}
2669
}
2670
}
2671
mutex_exit(&db->db_mtx);
2672
2673
DB_DNODE_ENTER(db);
2674
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
2675
flags |= DB_RF_HAVESTRUCT;
2676
DB_DNODE_EXIT(db);
2677
2678
/*
2679
* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
2680
* want to make sure dbuf_read() will read the pending cloned block and
2681
* not the underlying block that is being replaced. dbuf_undirty() will
2682
* do brt_pending_remove() before removing the dirty record.
2683
*/
2684
(void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);
2685
if (undirty) {
2686
mutex_enter(&db->db_mtx);
2687
VERIFY(!dbuf_undirty(db, tx));
2688
mutex_exit(&db->db_mtx);
2689
}
2690
(void) dbuf_dirty(db, tx);
2691
}
2692
2693
void
2694
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2695
{
2696
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
2697
}
2698
2699
void
2700
dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx)
2701
{
2702
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2703
2704
ASSERT(tx->tx_txg != 0);
2705
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2706
2707
/*
2708
* If the dbuf is already dirty in this txg, it will be written
2709
* anyway, so there's nothing to do.
2710
*/
2711
mutex_enter(&db->db_mtx);
2712
if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
2713
mutex_exit(&db->db_mtx);
2714
return;
2715
}
2716
mutex_exit(&db->db_mtx);
2717
2718
/*
2719
* The dbuf is not dirty, so we need to make it dirty and
2720
* mark it for rewrite (preserve logical birth time).
2721
*/
2722
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
2723
2724
mutex_enter(&db->db_mtx);
2725
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2726
if (dr != NULL && db->db_level == 0)
2727
dr->dt.dl.dr_rewrite = B_TRUE;
2728
mutex_exit(&db->db_mtx);
2729
}
2730
2731
boolean_t
2732
dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2733
{
2734
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2735
dbuf_dirty_record_t *dr;
2736
2737
mutex_enter(&db->db_mtx);
2738
dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2739
mutex_exit(&db->db_mtx);
2740
return (dr != NULL);
2741
}
2742
2743
/*
2744
* Normally the db_blkptr points to the most recent on-disk content for the
2745
* dbuf (and anything newer will be cached in the dbuf). However, a pending
2746
* block clone or not yet synced Direct I/O write will have a dirty record BP
2747
* pointing to the most recent data.
2748
*/
2749
int
2750
dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
2751
{
2752
ASSERT(MUTEX_HELD(&db->db_mtx));
2753
int error = 0;
2754
2755
if (db->db_level != 0) {
2756
*bp = db->db_blkptr;
2757
return (0);
2758
}
2759
2760
*bp = db->db_blkptr;
2761
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
2762
if (dr && db->db_state == DB_NOFILL) {
2763
/* Block clone */
2764
if (!dr->dt.dl.dr_brtwrite)
2765
error = EIO;
2766
else
2767
*bp = &dr->dt.dl.dr_overridden_by;
2768
} else if (dr && db->db_state == DB_UNCACHED) {
2769
/* Direct I/O write */
2770
if (dr->dt.dl.dr_diowrite)
2771
*bp = &dr->dt.dl.dr_overridden_by;
2772
}
2773
2774
return (error);
2775
}
2776
2777
/*
2778
* Direct I/O reads can read directly from the ARC, but the data has
2779
* to be untransformed in order to copy it over into user pages.
2780
*/
2781
int
2782
dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)
2783
{
2784
int err = 0;
2785
DB_DNODE_ENTER(db);
2786
dnode_t *dn = DB_DNODE(db);
2787
2788
ASSERT3S(db->db_state, ==, DB_CACHED);
2789
ASSERT(MUTEX_HELD(&db->db_mtx));
2790
2791
/*
2792
* Ensure that this block's dnode has been decrypted if
2793
* the caller has requested decrypted data.
2794
*/
2795
err = dbuf_read_verify_dnode_crypt(db, dn, 0);
2796
2797
/*
2798
* If the arc buf is compressed or encrypted and the caller
2799
* requested uncompressed data, we need to untransform it
2800
* before returning. We also call arc_untransform() on any
2801
* unauthenticated blocks, which will verify their MAC if
2802
* the key is now available.
2803
*/
2804
if (err == 0 && db->db_buf != NULL &&
2805
(arc_is_encrypted(db->db_buf) ||
2806
arc_is_unauthenticated(db->db_buf) ||
2807
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
2808
zbookmark_phys_t zb;
2809
2810
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
2811
db->db.db_object, db->db_level, db->db_blkid);
2812
dbuf_fix_old_data(db, spa_syncing_txg(spa));
2813
err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
2814
dbuf_set_data(db, db->db_buf);
2815
}
2816
DB_DNODE_EXIT(db);
2817
DBUF_STAT_BUMP(hash_hits);
2818
2819
return (err);
2820
}
2821
2822
void
2823
dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
2824
{
2825
/*
2826
* Block clones and Direct I/O writes always happen in open-context.
2827
*/
2828
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2829
ASSERT0(db->db_level);
ASSERT(!dmu_tx_is_syncing(tx));
2832
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2833
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
2834
2835
mutex_enter(&db->db_mtx);
2836
DBUF_VERIFY(db);
2837
2838
/*
* We are going to clone or issue a Direct I/O write on this block, so
* undirty any modifications made to it so far in this txg. This
* includes writes and clones into this block.
*
* If there is a dirty record associated with this txg from a previous
* Direct I/O write, then space accounting cleanup takes place. It is
* important to go ahead and free up the space accounting through
* dbuf_undirty() -> dbuf_unoverride() -> zio_free(). Space accounting
* for determining whether a write can occur in zfs_write() happens
* through dmu_tx_assign(). This can cause an issue with Direct I/O
* writes in the case of overwriting the same block, because all DVA
* allocations are being done in open-context. Constantly allowing
* Direct I/O overwrites to the same block can exhaust the pool's
* available space, leading to ENOSPC errors at the DVA allocation part
* of the ZIO pipeline, which will eventually suspend the pool. By
* cleaning up space accounting now, the ENOSPC error can be avoided.
*
* Since we are undirtying the record in open-context, we must have a
* hold on the db, so it should never be evicted after calling
* dbuf_undirty().
*/
2860
VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);
2861
ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
2862
2863
if (db->db_buf != NULL) {
2864
/*
2865
* If there is an associated ARC buffer with this dbuf we can
2866
* only destroy it if the previous dirty record does not
2867
* reference it.
2868
*/
2869
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
2870
if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
2871
arc_buf_destroy(db->db_buf, db);
2872
2873
/*
2874
* Setting the dbuf's data pointers to NULL will force all
2875
* future reads down to the devices to get the most up to date
2876
* version of the data after a Direct I/O write has completed.
2877
*/
2878
db->db_buf = NULL;
2879
dbuf_clear_data(db);
2880
}
2881
2882
ASSERT0P(db->db_buf);
2883
ASSERT0P(db->db.db_data);
2884
2885
db->db_state = DB_NOFILL;
2886
DTRACE_SET_STATE(db,
2887
"allocating NOFILL buffer for clone or direct I/O write");
2888
2889
DBUF_VERIFY(db);
2890
mutex_exit(&db->db_mtx);
2891
2892
dbuf_noread(db, DMU_KEEP_CACHING);
2893
(void) dbuf_dirty(db, tx);
2894
}
2895
2896
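/*
* Mark this dbuf as one whose contents will not be filled through the ARC
* (DB_NOFILL) and dirty it in the given transaction.
*/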
void
2897
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2898
{
2899
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2900
2901
mutex_enter(&db->db_mtx);
2902
db->db_state = DB_NOFILL;
2903
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
2904
mutex_exit(&db->db_mtx);
2905
2906
dbuf_noread(db, DMU_KEEP_CACHING);
2907
(void) dbuf_dirty(db, tx);
2908
}
2909
2910
void
2911
dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,
2912
dmu_flags_t flags)
2913
{
2914
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2915
2916
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2917
ASSERT(tx->tx_txg != 0);
2918
ASSERT0(db->db_level);
2919
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2920
2921
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
2922
dmu_tx_private_ok(tx));
2923
2924
mutex_enter(&db->db_mtx);
2925
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2926
if (db->db_state == DB_NOFILL ||
2927
(db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
2928
/*
* If the fill can fail we should have a way to fall back to
* the cloned or Direct I/O write data.
*/
2932
if (canfail && dr) {
2933
mutex_exit(&db->db_mtx);
2934
dmu_buf_will_dirty_flags(db_fake, tx, flags);
2935
return;
2936
}
2937
/*
2938
* Block cloning: We will be completely overwriting a block
2939
* cloned in this transaction group, so let's undirty the
2940
* pending clone and mark the block as uncached. This will be
2941
* as if the clone was never done.
2942
*/
2943
if (db->db_state == DB_NOFILL) {
2944
VERIFY(!dbuf_undirty(db, tx));
2945
db->db_state = DB_UNCACHED;
2946
}
2947
}
2948
mutex_exit(&db->db_mtx);
2949
2950
dbuf_noread(db, flags);
2951
(void) dbuf_dirty(db, tx);
2952
}
2953
2954
void
2955
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
2956
{
2957
dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);
2958
}
2959
2960
/*
2961
* This function is effectively the same as dmu_buf_will_dirty(), but
2962
* indicates the caller expects raw encrypted data in the db, and provides
2963
* the crypt params (byteorder, salt, iv, mac) which should be stored in the
2964
* blkptr_t when this dbuf is written. This is only used for blocks of
2965
* dnodes, during raw receive.
2966
*/
2967
void
2968
dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
2969
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
2970
{
2971
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2972
dbuf_dirty_record_t *dr;
2973
2974
/*
2975
* dr_has_raw_params is only processed for blocks of dnodes
2976
* (see dbuf_sync_dnode_leaf_crypt()).
2977
*/
2978
ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
2979
ASSERT0(db->db_level);
2980
ASSERT(db->db_objset->os_raw_receive);
2981
2982
dmu_buf_will_dirty_flags(db_fake, tx,
2983
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
2984
2985
dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2986
2987
ASSERT3P(dr, !=, NULL);
2988
ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
2989
2990
dr->dt.dl.dr_has_raw_params = B_TRUE;
2991
dr->dt.dl.dr_byteorder = byteorder;
2992
memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);
2993
memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);
2994
memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);
2995
}
2996
2997
static void
2998
dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
2999
{
3000
struct dirty_leaf *dl;
3001
dbuf_dirty_record_t *dr;
3002
3003
ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);
3004
ASSERT0(db->db_level);
3005
3006
dr = list_head(&db->db_dirty_records);
3007
ASSERT3P(dr, !=, NULL);
3008
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
3009
dl = &dr->dt.dl;
3010
ASSERT0(dl->dr_has_raw_params);
3011
dl->dr_overridden_by = *bp;
3012
dl->dr_override_state = DR_OVERRIDDEN;
3013
BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
3014
}
3015
3016
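/*
* Complete a fill started by dmu_buf_will_fill(): move the dbuf from DB_FILL
* to DB_CACHED, zero-filling the buffer if the range was freed while the
* fill was in flight, or unwind the dirty record and data if the fill
* failed. Returns whether the caller still has to treat the fill as failed.
*/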
boolean_t
3017
dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
3018
{
3019
(void) tx;
3020
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
3021
mutex_enter(&db->db_mtx);
3022
DBUF_VERIFY(db);
3023
3024
if (db->db_state == DB_FILL) {
3025
if (db->db_level == 0 && db->db_freed_in_flight) {
3026
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
3027
/* we were freed while filling */
3028
/* XXX dbuf_undirty? */
3029
memset(db->db.db_data, 0, db->db.db_size);
3030
db->db_freed_in_flight = FALSE;
3031
db->db_state = DB_CACHED;
3032
DTRACE_SET_STATE(db,
3033
"fill done handling freed in flight");
3034
failed = B_FALSE;
3035
} else if (failed) {
3036
VERIFY(!dbuf_undirty(db, tx));
3037
arc_buf_destroy(db->db_buf, db);
3038
db->db_buf = NULL;
3039
dbuf_clear_data(db);
3040
DTRACE_SET_STATE(db, "fill failed");
3041
} else {
3042
db->db_state = DB_CACHED;
3043
DTRACE_SET_STATE(db, "fill done");
3044
}
3045
cv_broadcast(&db->db_changed);
3046
} else {
3047
db->db_state = DB_CACHED;
3048
failed = B_FALSE;
3049
}
3050
mutex_exit(&db->db_mtx);
3051
return (failed);
3052
}
3053
3054
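/*
* Record the provided (already compressed) data as an embedded block pointer
* on this dbuf's dirty record, so no separate data block will be allocated
* at sync time. For BP_EMBEDDED_TYPE_DATA the embedded_data feature must be
* active.
*/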
void
3055
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
3056
bp_embedded_type_t etype, enum zio_compress comp,
3057
int uncompressed_size, int compressed_size, int byteorder,
3058
dmu_tx_t *tx)
3059
{
3060
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
3061
struct dirty_leaf *dl;
3062
dmu_object_type_t type;
3063
dbuf_dirty_record_t *dr;
3064
3065
if (etype == BP_EMBEDDED_TYPE_DATA) {
3066
ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
3067
SPA_FEATURE_EMBEDDED_DATA));
3068
}
3069
3070
DB_DNODE_ENTER(db);
3071
type = DB_DNODE(db)->dn_type;
3072
DB_DNODE_EXIT(db);
3073
3074
ASSERT0(db->db_level);
3075
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
3076
3077
dmu_buf_will_not_fill(dbuf, tx);
3078
3079
dr = list_head(&db->db_dirty_records);
3080
ASSERT3P(dr, !=, NULL);
3081
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
3082
dl = &dr->dt.dl;
3083
ASSERT0(dl->dr_has_raw_params);
3084
encode_embedded_bp_compressed(&dl->dr_overridden_by,
3085
data, comp, uncompressed_size, compressed_size);
3086
BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
3087
BP_SET_TYPE(&dl->dr_overridden_by, type);
3088
BP_SET_LEVEL(&dl->dr_overridden_by, 0);
3089
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
3090
3091
dl->dr_override_state = DR_OVERRIDDEN;
3092
BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
3093
}
3094
3095
void
3096
dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
3097
{
3098
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
3099
dmu_object_type_t type;
3100
ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
3101
SPA_FEATURE_REDACTED_DATASETS));
3102
3103
DB_DNODE_ENTER(db);
3104
type = DB_DNODE(db)->dn_type;
3105
DB_DNODE_EXIT(db);
3106
3107
ASSERT0(db->db_level);
3108
dmu_buf_will_not_fill(dbuf, tx);
3109
3110
blkptr_t bp = { { { {0} } } };
3111
BP_SET_TYPE(&bp, type);
3112
BP_SET_LEVEL(&bp, 0);
3113
BP_SET_BIRTH(&bp, tx->tx_txg, 0);
3114
BP_SET_REDACTED(&bp);
3115
BPE_SET_LSIZE(&bp, dbuf->db_size);
3116
3117
dbuf_override_impl(db, &bp, tx);
3118
}
3119
3120
/*
3121
* Directly assign a provided arc buf to a given dbuf if it's not referenced
3122
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
3123
*/
3124
void
3125
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
3126
dmu_flags_t flags)
3127
{
3128
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
3129
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
3130
ASSERT0(db->db_level);
3131
ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
3132
ASSERT(buf != NULL);
3133
ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
3134
ASSERT(tx->tx_txg != 0);
3135
3136
arc_return_buf(buf, db);
3137
ASSERT(arc_released(buf));
3138
3139
mutex_enter(&db->db_mtx);
3140
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
3141
db->db_pending_evict = B_FALSE;
3142
db->db_partial_read = B_FALSE;
3143
3144
while (db->db_state == DB_READ || db->db_state == DB_FILL)
3145
cv_wait(&db->db_changed, &db->db_mtx);
3146
3147
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
3148
db->db_state == DB_NOFILL);
3149
3150
if (db->db_state == DB_CACHED &&
3151
zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
3152
/*
3153
* In practice, we will never have a case where we have an
3154
* encrypted arc buffer while additional holds exist on the
3155
* dbuf. We don't handle this here so we simply assert that
3156
* fact instead.
3157
*/
3158
ASSERT(!arc_is_encrypted(buf));
3159
mutex_exit(&db->db_mtx);
3160
(void) dbuf_dirty(db, tx);
3161
memcpy(db->db.db_data, buf->b_data, db->db.db_size);
3162
arc_buf_destroy(buf, db);
3163
return;
3164
}
3165
3166
if (db->db_state == DB_CACHED) {
3167
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
3168
3169
ASSERT(db->db_buf != NULL);
3170
if (dr != NULL && dr->dr_txg == tx->tx_txg) {
3171
ASSERT(dr->dt.dl.dr_data == db->db_buf);
3172
3173
if (!arc_released(db->db_buf)) {
3174
ASSERT(dr->dt.dl.dr_override_state ==
3175
DR_OVERRIDDEN);
3176
arc_release(db->db_buf, db);
3177
}
3178
dr->dt.dl.dr_data = buf;
3179
arc_buf_destroy(db->db_buf, db);
3180
} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
3181
arc_release(db->db_buf, db);
3182
arc_buf_destroy(db->db_buf, db);
3183
}
3184
db->db_buf = NULL;
3185
} else if (db->db_state == DB_NOFILL) {
3186
/*
3187
* We will be completely replacing the cloned block. In case
3188
* it was cloned in this transaction group, let's undirty the
3189
* pending clone and mark the block as uncached. This will be
3190
* as if the clone was never done.
3191
*/
3192
VERIFY(!dbuf_undirty(db, tx));
3193
db->db_state = DB_UNCACHED;
3194
}
3195
ASSERT0P(db->db_buf);
3196
dbuf_set_data(db, buf);
3197
db->db_state = DB_FILL;
3198
DTRACE_SET_STATE(db, "filling assigned arcbuf");
3199
mutex_exit(&db->db_mtx);
3200
(void) dbuf_dirty(db, tx);
3201
dmu_buf_fill_done(&db->db, tx, B_FALSE);
3202
}
3203
3204
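/*
* Tear down a dbuf that has no remaining holds: destroy its ARC buffer or
* bonus copy, remove it from the dbuf cache, the hash table and the dnode's
* dbuf tree, drop the references it held on its dnode and parent, and free
* the structure.
*/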
void
3205
dbuf_destroy(dmu_buf_impl_t *db)
3206
{
3207
dnode_t *dn;
3208
dmu_buf_impl_t *parent = db->db_parent;
3209
dmu_buf_impl_t *dndb;
3210
3211
ASSERT(MUTEX_HELD(&db->db_mtx));
3212
ASSERT(zfs_refcount_is_zero(&db->db_holds));
3213
3214
if (db->db_buf != NULL) {
3215
arc_buf_destroy(db->db_buf, db);
3216
db->db_buf = NULL;
3217
}
3218
3219
if (db->db_blkid == DMU_BONUS_BLKID) {
3220
int slots = DB_DNODE(db)->dn_num_slots;
3221
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
3222
if (db->db.db_data != NULL) {
3223
kmem_free(db->db.db_data, bonuslen);
3224
arc_space_return(bonuslen, ARC_SPACE_BONUS);
3225
db->db_state = DB_UNCACHED;
3226
DTRACE_SET_STATE(db, "buffer cleared");
3227
}
3228
}
3229
3230
dbuf_clear_data(db);
3231
3232
if (multilist_link_active(&db->db_cache_link)) {
3233
ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
3234
db->db_caching_status == DB_DBUF_METADATA_CACHE);
3235
3236
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
3237
3238
ASSERT0(dmu_buf_user_size(&db->db));
3239
(void) zfs_refcount_remove_many(
3240
&dbuf_caches[db->db_caching_status].size,
3241
db->db.db_size, db);
3242
3243
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
3244
DBUF_STAT_BUMPDOWN(metadata_cache_count);
3245
} else {
3246
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
3247
DBUF_STAT_BUMPDOWN(cache_count);
3248
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
3249
db->db.db_size);
3250
}
3251
db->db_caching_status = DB_NO_CACHE;
3252
}
3253
3254
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
3255
ASSERT0P(db->db_data_pending);
3256
ASSERT(list_is_empty(&db->db_dirty_records));
3257
3258
db->db_state = DB_EVICTING;
3259
DTRACE_SET_STATE(db, "buffer eviction started");
3260
db->db_blkptr = NULL;
3261
3262
/*
3263
* Now that db_state is DB_EVICTING, nobody else can find this via
3264
* the hash table. We can now drop db_mtx, which allows us to
3265
* acquire the dn_dbufs_mtx.
3266
*/
3267
mutex_exit(&db->db_mtx);
3268
3269
DB_DNODE_ENTER(db);
3270
dn = DB_DNODE(db);
3271
dndb = dn->dn_dbuf;
3272
if (db->db_blkid != DMU_BONUS_BLKID) {
3273
boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
3274
if (needlock)
3275
mutex_enter_nested(&dn->dn_dbufs_mtx,
3276
NESTED_SINGLE);
3277
avl_remove(&dn->dn_dbufs, db);
3278
membar_producer();
3279
DB_DNODE_EXIT(db);
3280
if (needlock)
3281
mutex_exit(&dn->dn_dbufs_mtx);
3282
/*
3283
* Decrementing the dbuf count means that the hold corresponding
3284
* to the removed dbuf is no longer discounted in dnode_move(),
3285
* so the dnode cannot be moved until after we release the hold.
3286
* The membar_producer() ensures visibility of the decremented
3287
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually
3288
* release any lock.
3289
*/
3290
mutex_enter(&dn->dn_mtx);
3291
dnode_rele_and_unlock(dn, db, B_TRUE);
3292
#ifdef USE_DNODE_HANDLE
3293
db->db_dnode_handle = NULL;
3294
#else
3295
db->db_dnode = NULL;
3296
#endif
3297
3298
dbuf_hash_remove(db);
3299
} else {
3300
DB_DNODE_EXIT(db);
3301
}
3302
3303
ASSERT(zfs_refcount_is_zero(&db->db_holds));
3304
3305
db->db_parent = NULL;
3306
3307
ASSERT0P(db->db_buf);
3308
ASSERT0P(db->db.db_data);
3309
ASSERT0P(db->db_hash_next);
3310
ASSERT0P(db->db_blkptr);
3311
ASSERT0P(db->db_data_pending);
3312
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
3313
ASSERT(!multilist_link_active(&db->db_cache_link));
3314
3315
/*
3316
* If this dbuf is referenced from an indirect dbuf,
3317
* decrement the ref count on the indirect dbuf.
3318
*/
3319
if (parent && parent != dndb) {
3320
mutex_enter(&parent->db_mtx);
3321
dbuf_rele_and_unlock(parent, db, B_TRUE);
3322
}
3323
3324
kmem_cache_free(dbuf_kmem_cache, db);
3325
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
3326
}
3327
3328
/*
3329
* Note: While bpp will always be updated if the function returns success,
3330
* parentp will not be updated if the dnode does not have dn_dbuf filled in;
3331
* this happens when the dnode is the meta-dnode, or {user|group|project}used
3332
* object.
3333
*/
3334
__attribute__((always_inline))
3335
static inline int
3336
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
3337
dmu_buf_impl_t **parentp, blkptr_t **bpp)
3338
{
3339
*parentp = NULL;
3340
*bpp = NULL;
3341
3342
ASSERT(blkid != DMU_BONUS_BLKID);
3343
3344
if (blkid == DMU_SPILL_BLKID) {
3345
mutex_enter(&dn->dn_mtx);
3346
if (dn->dn_have_spill &&
3347
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
3348
*bpp = DN_SPILL_BLKPTR(dn->dn_phys);
3349
else
3350
*bpp = NULL;
3351
dbuf_add_ref(dn->dn_dbuf, NULL);
3352
*parentp = dn->dn_dbuf;
3353
mutex_exit(&dn->dn_mtx);
3354
return (0);
3355
}
3356
3357
int nlevels =
3358
(dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
3359
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
3360
3361
ASSERT3U(level * epbs, <, 64);
3362
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3363
/*
3364
* This assertion shouldn't trip as long as the max indirect block size
3365
* is less than 1M. The reason for this is that up to that point,
3366
* the number of levels required to address an entire object with blocks
3367
* of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
3368
* other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
3369
* (i.e. we can address the entire object), objects will all use at most
3370
* N-1 levels and the assertion won't overflow. However, once epbs is
3371
* 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
3372
* enough to address an entire object, so objects will have 5 levels,
3373
* but then this assertion will overflow.
3374
*
3375
* All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
3376
* need to redo this logic to handle overflows.
3377
*/
3378
ASSERT(level >= nlevels ||
3379
((nlevels - level - 1) * epbs) +
3380
highbit64(dn->dn_phys->dn_nblkptr) <= 64);
3381
if (level >= nlevels ||
3382
blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
3383
((nlevels - level - 1) * epbs)) ||
3384
(fail_sparse &&
3385
blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
3386
/* the buffer has no parent yet */
3387
return (SET_ERROR(ENOENT));
3388
} else if (level < nlevels-1) {
3389
/* this block is referenced from an indirect block */
3390
int err;
3391
3392
err = dbuf_hold_impl(dn, level + 1,
3393
blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
3394
3395
if (err)
3396
return (err);
3397
err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |
3398
DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);
3399
if (err) {
3400
dbuf_rele(*parentp, NULL);
3401
*parentp = NULL;
3402
return (err);
3403
}
3404
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
3405
(blkid & ((1ULL << epbs) - 1));
3406
return (0);
3407
} else {
3408
/* the block is referenced from the dnode */
3409
ASSERT3U(level, ==, nlevels-1);
3410
ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
3411
blkid < dn->dn_phys->dn_nblkptr);
3412
if (dn->dn_dbuf) {
3413
dbuf_add_ref(dn->dn_dbuf, NULL);
3414
*parentp = dn->dn_dbuf;
3415
}
3416
*bpp = &dn->dn_phys->dn_blkptr[blkid];
3417
return (0);
3418
}
3419
}
3420
3421
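/*
* Allocate and initialize a dbuf for the given dnode, level and block id,
* and insert it into the dbuf hash table and the dnode's dn_dbufs tree. If
* another thread won the race and inserted a dbuf for the same block first,
* free the new one and return the existing dbuf instead.
*/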
static dmu_buf_impl_t *
3422
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
3423
dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
3424
{
3425
objset_t *os = dn->dn_objset;
3426
dmu_buf_impl_t *db, *odb;
3427
3428
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3429
ASSERT(dn->dn_type != DMU_OT_NONE);
3430
3431
db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
3432
3433
list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
3434
offsetof(dbuf_dirty_record_t, dr_dbuf_node));
3435
3436
db->db_objset = os;
3437
db->db.db_object = dn->dn_object;
3438
db->db_level = level;
3439
db->db_blkid = blkid;
3440
db->db_dirtycnt = 0;
3441
#ifdef USE_DNODE_HANDLE
3442
db->db_dnode_handle = dn->dn_handle;
3443
#else
3444
db->db_dnode = dn;
3445
#endif
3446
db->db_parent = parent;
3447
db->db_blkptr = blkptr;
3448
db->db_hash = hash;
3449
3450
db->db_user = NULL;
3451
db->db_user_immediate_evict = FALSE;
3452
db->db_freed_in_flight = FALSE;
3453
db->db_pending_evict = TRUE;
3454
db->db_partial_read = FALSE;
3455
3456
if (blkid == DMU_BONUS_BLKID) {
3457
ASSERT3P(parent, ==, dn->dn_dbuf);
3458
db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
3459
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
3460
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
3461
db->db.db_offset = DMU_BONUS_BLKID;
3462
db->db_state = DB_UNCACHED;
3463
DTRACE_SET_STATE(db, "bonus buffer created");
3464
db->db_caching_status = DB_NO_CACHE;
3465
/* the bonus dbuf is not placed in the hash table */
3466
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
3467
return (db);
3468
} else if (blkid == DMU_SPILL_BLKID) {
3469
db->db.db_size = (blkptr != NULL) ?
3470
BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
3471
db->db.db_offset = 0;
3472
} else {
3473
int blocksize =
3474
db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
3475
db->db.db_size = blocksize;
3476
db->db.db_offset = db->db_blkid * blocksize;
3477
}
3478
3479
/*
3480
* Hold the dn_dbufs_mtx while we get the new dbuf
3481
* in the hash table *and* added to the dbufs list.
3482
* This prevents a possible deadlock with someone
3483
* trying to look up this dbuf before it's added to the
3484
* dn_dbufs list.
3485
*/
3486
mutex_enter(&dn->dn_dbufs_mtx);
3487
db->db_state = DB_EVICTING; /* not worth logging this state change */
3488
if ((odb = dbuf_hash_insert(db)) != NULL) {
3489
/* someone else inserted it first */
3490
mutex_exit(&dn->dn_dbufs_mtx);
3491
kmem_cache_free(dbuf_kmem_cache, db);
3492
DBUF_STAT_BUMP(hash_insert_race);
3493
return (odb);
3494
}
3495
avl_add(&dn->dn_dbufs, db);
3496
3497
db->db_state = DB_UNCACHED;
3498
DTRACE_SET_STATE(db, "regular buffer created");
3499
db->db_caching_status = DB_NO_CACHE;
3500
mutex_exit(&dn->dn_dbufs_mtx);
3501
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
3502
3503
if (parent && parent != dn->dn_dbuf)
3504
dbuf_add_ref(parent, db);
3505
3506
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
3507
zfs_refcount_count(&dn->dn_holds) > 0);
3508
(void) zfs_refcount_add(&dn->dn_holds, db);
3509
3510
dprintf_dbuf(db, "db=%p\n", db);
3511
3512
return (db);
3513
}
3514
3515
/*
3516
* This function returns a block pointer and information about the object,
3517
* given a dnode and a block. This is a publicly accessible version of
3518
* dbuf_findbp that only returns some information, rather than the
3519
* dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock
3520
* should be locked as (at least) a reader.
3521
*/
3522
int
3523
dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
3524
blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
3525
{
3526
dmu_buf_impl_t *dbp = NULL;
3527
blkptr_t *bp2;
3528
int err = 0;
3529
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3530
3531
err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
3532
if (err == 0) {
3533
ASSERT3P(bp2, !=, NULL);
3534
*bp = *bp2;
3535
if (dbp != NULL)
3536
dbuf_rele(dbp, NULL);
3537
if (datablkszsec != NULL)
3538
*datablkszsec = dn->dn_phys->dn_datablkszsec;
3539
if (indblkshift != NULL)
3540
*indblkshift = dn->dn_phys->dn_indblkshift;
3541
}
3542
3543
return (err);
3544
}
3545
3546
typedef struct dbuf_prefetch_arg {
3547
spa_t *dpa_spa; /* The spa to issue the prefetch in. */
3548
zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
3549
int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
3550
int dpa_curlevel; /* The current level that we're reading */
3551
dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
3552
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
3553
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
3554
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
3555
dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
3556
void *dpa_arg; /* prefetch completion arg */
3557
} dbuf_prefetch_arg_t;
3558
3559
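/*
* Tear down a prefetch: invoke the completion callback, if one was
* registered, and free the dbuf_prefetch_arg_t.
*/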
static void
3560
dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
3561
{
3562
if (dpa->dpa_cb != NULL) {
3563
dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,
3564
dpa->dpa_zb.zb_blkid, io_done);
3565
}
3566
kmem_free(dpa, sizeof (*dpa));
3567
}
3568
3569
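/*
* arc_read() completion callback for the final (target-level) prefetch
* I/O: release the ARC buffer, if any, and finish the prefetch.
*/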
static void
3570
dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
3571
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
3572
{
3573
(void) zio, (void) zb, (void) iobp;
3574
dbuf_prefetch_arg_t *dpa = private;
3575
3576
if (abuf != NULL)
3577
arc_buf_destroy(abuf, private);
3578
3579
dbuf_prefetch_fini(dpa, B_TRUE);
3580
}
3581
3582
/*
3583
* Actually issue the prefetch read for the block given.
3584
*/
3585
static void
3586
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
3587
{
3588
ASSERT(!BP_IS_HOLE(bp));
3589
ASSERT(!BP_IS_REDACTED(bp));
3590
if (BP_IS_EMBEDDED(bp))
3591
return (dbuf_prefetch_fini(dpa, B_FALSE));
3592
3593
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
3594
arc_flags_t aflags =
3595
dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
3596
ARC_FLAG_NO_BUF;
3597
3598
/* dnodes are always read as raw and then converted later */
3599
if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
3600
dpa->dpa_curlevel == 0)
3601
zio_flags |= ZIO_FLAG_RAW;
3602
3603
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
3604
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
3605
ASSERT(dpa->dpa_zio != NULL);
3606
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
3607
dbuf_issue_final_prefetch_done, dpa,
3608
dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
3609
}
3610
3611
/*
3612
* Called when an indirect block above our prefetch target is read in. This
3613
* will either read in the next indirect block down the tree or issue the actual
3614
* prefetch if the next block down is our target.
3615
*/
3616
static void
3617
dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
3618
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
3619
{
3620
(void) zb, (void) iobp;
3621
dbuf_prefetch_arg_t *dpa = private;
3622
3623
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
3624
ASSERT3S(dpa->dpa_curlevel, >, 0);
3625
3626
if (abuf == NULL) {
3627
ASSERT(zio == NULL || zio->io_error != 0);
3628
dbuf_prefetch_fini(dpa, B_TRUE);
3629
return;
3630
}
3631
ASSERT(zio == NULL || zio->io_error == 0);
3632
3633
/*
3634
* The dpa_dnode is only valid if we are called with a NULL
3635
* zio. This indicates that the arc_read() returned without
3636
* first calling zio_read() to issue a physical read. Once
3637
* a physical read is made the dpa_dnode must be invalidated
3638
* as the locks guarding it may have been dropped. If the
3639
* dpa_dnode is still valid, then we want to add it to the dbuf
3640
* cache. To do so, we must hold the dbuf associated with the block
3641
* we just prefetched, read its contents so that we associate it
3642
* with an arc_buf_t, and then release it.
3643
*/
3644
if (zio != NULL) {
3645
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
3646
if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
3647
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
3648
} else {
3649
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
3650
}
3651
ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
3652
3653
dpa->dpa_dnode = NULL;
3654
} else if (dpa->dpa_dnode != NULL) {
3655
uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
3656
(dpa->dpa_epbs * (dpa->dpa_curlevel -
3657
dpa->dpa_zb.zb_level));
3658
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
3659
dpa->dpa_curlevel, curblkid, FTAG);
3660
if (db == NULL) {
3661
arc_buf_destroy(abuf, private);
3662
dbuf_prefetch_fini(dpa, B_TRUE);
3663
return;
3664
}
3665
(void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
3666
DMU_READ_NO_PREFETCH);
3667
dbuf_rele(db, FTAG);
3668
}
3669
3670
dpa->dpa_curlevel--;
3671
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
3672
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
3673
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
3674
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
3675
3676
ASSERT(!BP_IS_REDACTED(bp) || dpa->dpa_dnode == NULL ||
3677
dsl_dataset_feature_is_active(
3678
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
3679
SPA_FEATURE_REDACTED_DATASETS));
3680
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
3681
arc_buf_destroy(abuf, private);
3682
dbuf_prefetch_fini(dpa, B_TRUE);
3683
return;
3684
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
3685
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
3686
dbuf_issue_final_prefetch(dpa, bp);
3687
} else {
3688
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
3689
zbookmark_phys_t zb;
3690
3691
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
3692
if (dpa->dpa_dnode) {
3693
if (dnode_level_is_l2cacheable(bp, dpa->dpa_dnode,
3694
dpa->dpa_curlevel))
3695
iter_aflags |= ARC_FLAG_L2CACHE;
3696
} else {
3697
if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
3698
iter_aflags |= ARC_FLAG_L2CACHE;
3699
}
3700
3701
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
3702
3703
SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
3704
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
3705
3706
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
3707
bp, dbuf_prefetch_indirect_done, dpa,
3708
ZIO_PRIORITY_SYNC_READ,
3709
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3710
&iter_aflags, &zb);
3711
}
3712
3713
arc_buf_destroy(abuf, private);
3714
}
3715
3716
/*
3717
* Issue prefetch reads for the given block on the given level. If the indirect
3718
* blocks above that block are not in memory, we will read them in
3719
* asynchronously. As a result, this call never blocks waiting for a read to
3720
* complete. Note that the prefetch might fail if the dataset is encrypted and
3721
* the encryption key is unmapped before the IO completes.
3722
*/
3723
int
3724
dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
3725
zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
3726
void *arg)
3727
{
3728
blkptr_t bp;
3729
int epbs, nlevels, curlevel;
3730
uint64_t curblkid;
3731
3732
ASSERT(blkid != DMU_BONUS_BLKID);
3733
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3734
3735
if (blkid > dn->dn_maxblkid)
3736
goto no_issue;
3737
3738
if (level == 0 && dnode_block_freed(dn, blkid))
3739
goto no_issue;
3740
3741
/*
3742
* This dnode hasn't been written to disk yet, so there's nothing to
3743
* prefetch.
3744
*/
3745
nlevels = dn->dn_phys->dn_nlevels;
3746
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
3747
goto no_issue;
3748
3749
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3750
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
3751
goto no_issue;
3752
3753
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
3754
level, blkid, NULL);
3755
if (db != NULL) {
3756
mutex_exit(&db->db_mtx);
3757
/*
3758
* This dbuf already exists. It is either CACHED, or
3759
* (we assume) about to be read or filled.
3760
*/
3761
goto no_issue;
3762
}
3763
3764
/*
3765
* Find the closest ancestor (indirect block) of the target block
3766
* that is present in the cache. In this indirect block, we will
3767
* find the bp that is at curlevel, curblkid.
3768
*/
3769
curlevel = level;
3770
curblkid = blkid;
3771
while (curlevel < nlevels - 1) {
3772
int parent_level = curlevel + 1;
3773
uint64_t parent_blkid = curblkid >> epbs;
3774
dmu_buf_impl_t *db;
3775
3776
if (dbuf_hold_impl(dn, parent_level, parent_blkid,
3777
FALSE, TRUE, FTAG, &db) == 0) {
3778
blkptr_t *bpp = db->db_buf->b_data;
3779
bp = bpp[P2PHASE(curblkid, 1 << epbs)];
3780
dbuf_rele(db, FTAG);
3781
break;
3782
}
3783
3784
curlevel = parent_level;
3785
curblkid = parent_blkid;
3786
}
3787
3788
if (curlevel == nlevels - 1) {
3789
/* No cached indirect blocks found. */
3790
ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
3791
bp = dn->dn_phys->dn_blkptr[curblkid];
3792
}
3793
ASSERT(!BP_IS_REDACTED(&bp) ||
3794
dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
3795
SPA_FEATURE_REDACTED_DATASETS));
3796
if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
3797
goto no_issue;
3798
3799
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
3800
3801
zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
3802
ZIO_FLAG_CANFAIL);
3803
3804
dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
3805
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
3806
SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
3807
dn->dn_object, level, blkid);
3808
dpa->dpa_curlevel = curlevel;
3809
dpa->dpa_prio = prio;
3810
dpa->dpa_aflags = aflags;
3811
dpa->dpa_spa = dn->dn_objset->os_spa;
3812
dpa->dpa_dnode = dn;
3813
dpa->dpa_epbs = epbs;
3814
dpa->dpa_zio = pio;
3815
dpa->dpa_cb = cb;
3816
dpa->dpa_arg = arg;
3817
3818
if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
3819
dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
3820
else if (dnode_level_is_l2cacheable(&bp, dn, level))
3821
dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
3822
3823
/*
3824
* If we have the indirect just above us, no need to do the asynchronous
3825
* prefetch chain; we'll just run the last step ourselves. If we're at
3826
* a higher level, though, we want to issue the prefetches for all the
3827
* indirect blocks asynchronously, so we can go on with whatever we were
3828
* doing.
3829
*/
3830
if (curlevel == level) {
3831
ASSERT3U(curblkid, ==, blkid);
3832
dbuf_issue_final_prefetch(dpa, &bp);
3833
} else {
3834
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
3835
zbookmark_phys_t zb;
3836
3837
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
3838
if (dnode_level_is_l2cacheable(&bp, dn, curlevel))
3839
iter_aflags |= ARC_FLAG_L2CACHE;
3840
3841
SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
3842
dn->dn_object, curlevel, curblkid);
3843
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
3844
&bp, dbuf_prefetch_indirect_done, dpa,
3845
ZIO_PRIORITY_SYNC_READ,
3846
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3847
&iter_aflags, &zb);
3848
}
3849
/*
3850
* We use pio here instead of dpa_zio since it's possible that
3851
* dpa may have already been freed.
3852
*/
3853
zio_nowait(pio);
3854
return (1);
3855
no_issue:
3856
if (cb != NULL)
3857
cb(arg, level, blkid, B_FALSE);
3858
return (0);
3859
}
3860
3861
int
3862
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
3863
arc_flags_t aflags)
3864
{
3865
3866
return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
3867
}
3868
3869
/*
* Helper function for dbuf_hold_impl() to copy a buffer. Handles
* the case of encrypted, compressed and uncompressed buffers by
* allocating the new buffer, respectively, with arc_alloc_raw_buf(),
* arc_alloc_compressed_buf() or arc_alloc_buf().
*
* NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
*/
3877
noinline static void
3878
dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
3879
{
3880
dbuf_dirty_record_t *dr = db->db_data_pending;
3881
arc_buf_t *data = dr->dt.dl.dr_data;
3882
arc_buf_t *db_data;
3883
enum zio_compress compress_type = arc_get_compression(data);
3884
uint8_t complevel = arc_get_complevel(data);
3885
3886
if (arc_is_encrypted(data)) {
3887
boolean_t byteorder;
3888
uint8_t salt[ZIO_DATA_SALT_LEN];
3889
uint8_t iv[ZIO_DATA_IV_LEN];
3890
uint8_t mac[ZIO_DATA_MAC_LEN];
3891
3892
arc_get_raw_params(data, &byteorder, salt, iv, mac);
3893
db_data = arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
3894
dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
3895
dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
3896
compress_type, complevel);
3897
} else if (compress_type != ZIO_COMPRESS_OFF) {
3898
db_data = arc_alloc_compressed_buf(
3899
dn->dn_objset->os_spa, db, arc_buf_size(data),
3900
arc_buf_lsize(data), compress_type, complevel);
3901
} else {
3902
db_data = arc_alloc_buf(dn->dn_objset->os_spa, db,
3903
DBUF_GET_BUFC_TYPE(db), db->db.db_size);
3904
}
3905
memcpy(db_data->b_data, data->b_data, arc_buf_size(data));
3906
3907
dbuf_set_data(db, db_data);
3908
}
3909
3910
/*
3911
* Returns with db_holds incremented, and db_mtx not held.
3912
* Note: dn_struct_rwlock must be held.
3913
*/
3914
int
3915
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
3916
boolean_t fail_sparse, boolean_t fail_uncached,
3917
const void *tag, dmu_buf_impl_t **dbp)
3918
{
3919
dmu_buf_impl_t *db, *parent = NULL;
3920
uint64_t hv;
3921
3922
/* If the pool has been created, verify the tx_sync_lock is not held */
3923
spa_t *spa = dn->dn_objset->os_spa;
3924
dsl_pool_t *dp = spa->spa_dsl_pool;
3925
if (dp != NULL) {
3926
ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
3927
}
3928
3929
ASSERT(blkid != DMU_BONUS_BLKID);
3930
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3931
if (!fail_sparse)
3932
ASSERT3U(dn->dn_nlevels, >, level);
3933
3934
*dbp = NULL;
3935
3936
/* dbuf_find() returns with db_mtx held */
3937
db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
3938
3939
if (db == NULL) {
3940
blkptr_t *bp = NULL;
3941
int err;
3942
3943
if (fail_uncached)
3944
return (SET_ERROR(ENOENT));
3945
3946
ASSERT0P(parent);
3947
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
3948
if (fail_sparse) {
3949
if (err == 0 && bp && BP_IS_HOLE(bp))
3950
err = SET_ERROR(ENOENT);
3951
if (err) {
3952
if (parent)
3953
dbuf_rele(parent, NULL);
3954
return (err);
3955
}
3956
}
3957
if (err && err != ENOENT)
3958
return (err);
3959
db = dbuf_create(dn, level, blkid, parent, bp, hv);
3960
}
3961
3962
if (fail_uncached && db->db_state != DB_CACHED) {
3963
mutex_exit(&db->db_mtx);
3964
return (SET_ERROR(ENOENT));
3965
}
3966
3967
if (db->db_buf != NULL) {
3968
arc_buf_access(db->db_buf);
3969
ASSERT(MUTEX_HELD(&db->db_mtx));
3970
ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
3971
}
3972
3973
ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
3974
3975
/*
3976
* If this buffer is currently syncing out, and we are
3977
* still referencing it from db_data, we need to make a copy
3978
* of it in case we decide we want to dirty it again in this txg.
3979
*/
3980
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
3981
dn->dn_object != DMU_META_DNODE_OBJECT &&
3982
db->db_state == DB_CACHED && db->db_data_pending) {
3983
dbuf_dirty_record_t *dr = db->db_data_pending;
3984
if (dr->dt.dl.dr_data == db->db_buf) {
3985
ASSERT3P(db->db_buf, !=, NULL);
3986
dbuf_hold_copy(dn, db);
3987
}
3988
}
3989
3990
if (multilist_link_active(&db->db_cache_link)) {
3991
ASSERT(zfs_refcount_is_zero(&db->db_holds));
3992
ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
3993
db->db_caching_status == DB_DBUF_METADATA_CACHE);
3994
3995
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
3996
3997
uint64_t size = db->db.db_size;
3998
uint64_t usize = dmu_buf_user_size(&db->db);
3999
(void) zfs_refcount_remove_many(
4000
&dbuf_caches[db->db_caching_status].size, size, db);
4001
(void) zfs_refcount_remove_many(
4002
&dbuf_caches[db->db_caching_status].size, usize,
4003
db->db_user);
4004
4005
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
4006
DBUF_STAT_BUMPDOWN(metadata_cache_count);
4007
} else {
4008
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
4009
DBUF_STAT_BUMPDOWN(cache_count);
4010
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
4011
size + usize);
4012
}
4013
db->db_caching_status = DB_NO_CACHE;
4014
}
4015
(void) zfs_refcount_add(&db->db_holds, tag);
4016
DBUF_VERIFY(db);
4017
mutex_exit(&db->db_mtx);
4018
4019
/* NOTE: we can't rele the parent until after we drop the db_mtx */
4020
if (parent)
4021
dbuf_rele(parent, NULL);
4022
4023
ASSERT3P(DB_DNODE(db), ==, dn);
4024
ASSERT3U(db->db_blkid, ==, blkid);
4025
ASSERT3U(db->db_level, ==, level);
4026
*dbp = db;
4027
4028
return (0);
4029
}
4030
4031
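/*
* Convenience wrappers around dbuf_hold_impl(): hold the dbuf for the
* given block at level 0 (dbuf_hold()) or at an arbitrary level
* (dbuf_hold_level()). Both return NULL if the dbuf could not be held.
*/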
dmu_buf_impl_t *
4032
dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)
4033
{
4034
return (dbuf_hold_level(dn, 0, blkid, tag));
4035
}
4036
4037
dmu_buf_impl_t *
4038
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)
4039
{
4040
dmu_buf_impl_t *db;
4041
int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
4042
return (err ? NULL : db);
4043
}
4044
4045
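/*
* Create the bonus dbuf for the dnode. The caller must hold
* dn_struct_rwlock as writer, and the dnode must not already
* have a bonus dbuf.
*/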
void
4046
dbuf_create_bonus(dnode_t *dn)
4047
{
4048
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
4049
4050
ASSERT0P(dn->dn_bonus);
4051
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
4052
dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
4053
dn->dn_bonus->db_pending_evict = FALSE;
4054
}
4055
4056
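/*
* Set the block size of the dnode's spill block. A size of zero means
* SPA_MINBLOCKSIZE; otherwise the size is rounded up to a multiple of
* SPA_MINBLOCKSIZE. Returns ENOTSUP if the dbuf is not a spill block.
*/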
int
4057
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
4058
{
4059
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4060
4061
if (db->db_blkid != DMU_SPILL_BLKID)
4062
return (SET_ERROR(ENOTSUP));
4063
if (blksz == 0)
4064
blksz = SPA_MINBLOCKSIZE;
4065
ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
4066
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
4067
4068
dbuf_new_size(db, blksz, tx);
4069
4070
return (0);
4071
}
4072
4073
void
4074
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
4075
{
4076
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
4077
}
4078
4079
#pragma weak dmu_buf_add_ref = dbuf_add_ref
4080
void
4081
dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)
4082
{
4083
int64_t holds = zfs_refcount_add(&db->db_holds, tag);
4084
VERIFY3S(holds, >, 1);
4085
}
4086
4087
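/*
* Add a hold to the dbuf, but only if it is still the dbuf currently
* associated with <os, obj, blkid> and it has holds beyond those of its
* dirty records. Returns B_TRUE if the hold was added.
*/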
#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
4088
boolean_t
4089
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
4090
const void *tag)
4091
{
4092
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4093
dmu_buf_impl_t *found_db;
4094
boolean_t result = B_FALSE;
4095
4096
if (blkid == DMU_BONUS_BLKID)
4097
found_db = dbuf_find_bonus(os, obj);
4098
else
4099
found_db = dbuf_find(os, obj, 0, blkid, NULL);
4100
4101
if (found_db != NULL) {
4102
if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
4103
(void) zfs_refcount_add(&db->db_holds, tag);
4104
result = B_TRUE;
4105
}
4106
mutex_exit(&found_db->db_mtx);
4107
}
4108
return (result);
4109
}
4110
4111
/*
4112
* If you call dbuf_rele() you had better not be referencing the dnode handle
4113
* unless you have some other direct or indirect hold on the dnode. (An indirect
4114
* hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
4115
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
4116
* dnode's parent dbuf evicting its dnode handles.
4117
*/
4118
void
4119
dbuf_rele(dmu_buf_impl_t *db, const void *tag)
4120
{
4121
mutex_enter(&db->db_mtx);
4122
dbuf_rele_and_unlock(db, tag, B_FALSE);
4123
}
4124
4125
void
4126
dmu_buf_rele(dmu_buf_t *db, const void *tag)
4127
{
4128
dbuf_rele((dmu_buf_impl_t *)db, tag);
4129
}
4130
4131
/*
4132
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
4133
* db_dirtycnt and db_holds to be updated atomically. The 'evicting'
4134
* argument should be set if we are already in the dbuf-evicting code
4135
* path, in which case we don't want to recursively evict. This allows us to
4136
* avoid deeply nested stacks that would have a call flow similar to this:
4137
*
4138
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
4139
* ^ |
4140
* | |
4141
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
4142
*
4143
*/
4144
void
4145
dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
4146
{
4147
int64_t holds;
4148
uint64_t size;
4149
4150
ASSERT(MUTEX_HELD(&db->db_mtx));
4151
DBUF_VERIFY(db);
4152
4153
/*
4154
* Remove the reference to the dbuf before removing its hold on the
4155
* dnode so we can guarantee in dnode_move() that a referenced bonus
4156
* buffer has a corresponding dnode hold.
4157
*/
4158
holds = zfs_refcount_remove(&db->db_holds, tag);
4159
ASSERT(holds >= 0);
4160
4161
/*
4162
* We can't freeze indirects if there is a possibility that they
4163
* may be modified in the current syncing context.
4164
*/
4165
if (db->db_buf != NULL &&
4166
holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
4167
arc_buf_freeze(db->db_buf);
4168
}
4169
4170
if (holds == db->db_dirtycnt &&
4171
db->db_level == 0 && db->db_user_immediate_evict)
4172
dbuf_evict_user(db);
4173
4174
if (holds == 0) {
4175
if (db->db_blkid == DMU_BONUS_BLKID) {
4176
dnode_t *dn;
4177
boolean_t evict_dbuf = db->db_pending_evict;
4178
4179
/*
4180
* If the dnode moves here, we cannot cross this
4181
* barrier until the move completes.
4182
*/
4183
DB_DNODE_ENTER(db);
4184
4185
dn = DB_DNODE(db);
4186
atomic_dec_32(&dn->dn_dbufs_count);
4187
4188
/*
4189
* Decrementing the dbuf count means that the bonus
4190
* buffer's dnode hold is no longer discounted in
4191
* dnode_move(). The dnode cannot move until after
4192
* the dnode_rele() below.
4193
*/
4194
DB_DNODE_EXIT(db);
4195
4196
/*
4197
* Do not reference db after its lock is dropped.
4198
* Another thread may evict it.
4199
*/
4200
mutex_exit(&db->db_mtx);
4201
4202
if (evict_dbuf)
4203
dnode_evict_bonus(dn);
4204
4205
dnode_rele(dn, db);
4206
} else if (db->db_buf == NULL) {
4207
/*
4208
* This is a special case: we never associated this
4209
* dbuf with any data allocated from the ARC.
4210
*/
4211
ASSERT(db->db_state == DB_UNCACHED ||
4212
db->db_state == DB_NOFILL);
4213
dbuf_destroy(db);
4214
} else if (arc_released(db->db_buf)) {
4215
/*
4216
* This dbuf has anonymous data associated with it.
4217
*/
4218
dbuf_destroy(db);
4219
} else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {
4220
/*
4221
* We don't expect more accesses to the dbuf, and it
4222
* is either not cacheable or was marked for eviction.
4223
*/
4224
dbuf_destroy(db);
4225
} else if (!multilist_link_active(&db->db_cache_link)) {
4226
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
4227
4228
dbuf_cached_state_t dcs =
4229
dbuf_include_in_metadata_cache(db) ?
4230
DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
4231
db->db_caching_status = dcs;
4232
4233
multilist_insert(&dbuf_caches[dcs].cache, db);
4234
uint64_t db_size = db->db.db_size;
4235
uint64_t dbu_size = dmu_buf_user_size(&db->db);
4236
(void) zfs_refcount_add_many(
4237
&dbuf_caches[dcs].size, db_size, db);
4238
size = zfs_refcount_add_many(
4239
&dbuf_caches[dcs].size, dbu_size, db->db_user);
4240
uint8_t db_level = db->db_level;
4241
mutex_exit(&db->db_mtx);
4242
4243
if (dcs == DB_DBUF_METADATA_CACHE) {
4244
DBUF_STAT_BUMP(metadata_cache_count);
4245
DBUF_STAT_MAX(metadata_cache_size_bytes_max,
4246
size);
4247
} else {
4248
DBUF_STAT_BUMP(cache_count);
4249
DBUF_STAT_MAX(cache_size_bytes_max, size);
4250
DBUF_STAT_BUMP(cache_levels[db_level]);
4251
DBUF_STAT_INCR(cache_levels_bytes[db_level],
4252
db_size + dbu_size);
4253
}
4254
4255
if (dcs == DB_DBUF_CACHE && !evicting)
4256
dbuf_evict_notify(size);
4257
}
4258
} else {
4259
mutex_exit(&db->db_mtx);
4260
}
4261
}
4262
4263
#pragma weak dmu_buf_refcount = dbuf_refcount
4264
uint64_t
4265
dbuf_refcount(dmu_buf_impl_t *db)
4266
{
4267
return (zfs_refcount_count(&db->db_holds));
4268
}
4269
4270
uint64_t
4271
dmu_buf_user_refcount(dmu_buf_t *db_fake)
4272
{
4273
uint64_t holds;
4274
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4275
4276
mutex_enter(&db->db_mtx);
4277
ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
4278
holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
4279
mutex_exit(&db->db_mtx);
4280
4281
return (holds);
4282
}
4283
4284
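/*
* Replace the dbuf's user with new_user, but only if the currently
* attached user is old_user. Returns the user that was attached when
* the call was made (old_user if the replacement succeeded).
*/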
void *
4285
dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
4286
dmu_buf_user_t *new_user)
4287
{
4288
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4289
4290
mutex_enter(&db->db_mtx);
4291
dbuf_verify_user(db, DBVU_NOT_EVICTING);
4292
if (db->db_user == old_user)
4293
db->db_user = new_user;
4294
else
4295
old_user = db->db_user;
4296
dbuf_verify_user(db, DBVU_NOT_EVICTING);
4297
mutex_exit(&db->db_mtx);
4298
4299
return (old_user);
4300
}
4301
4302
void *
4303
dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
4304
{
4305
return (dmu_buf_replace_user(db_fake, NULL, user));
4306
}
4307
4308
void *
4309
dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
4310
{
4311
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4312
4313
db->db_user_immediate_evict = TRUE;
4314
return (dmu_buf_set_user(db_fake, user));
4315
}
4316
4317
void *
4318
dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
4319
{
4320
return (dmu_buf_replace_user(db_fake, user, NULL));
4321
}
4322
4323
void *
4324
dmu_buf_get_user(dmu_buf_t *db_fake)
4325
{
4326
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4327
4328
dbuf_verify_user(db, DBVU_NOT_EVICTING);
4329
return (db->db_user);
4330
}
4331
4332
uint64_t
4333
dmu_buf_user_size(dmu_buf_t *db_fake)
4334
{
4335
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4336
if (db->db_user == NULL)
4337
return (0);
4338
return (atomic_load_64(&db->db_user->dbu_size));
4339
}
4340
4341
void
4342
dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
4343
{
4344
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4345
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
4346
ASSERT3P(db->db_user, !=, NULL);
4347
ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
4348
atomic_add_64(&db->db_user->dbu_size, nadd);
4349
}
4350
4351
void
4352
dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
4353
{
4354
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4355
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
4356
ASSERT3P(db->db_user, !=, NULL);
4357
ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
4358
atomic_sub_64(&db->db_user->dbu_size, nsub);
4359
}
4360
4361
void
4362
dmu_buf_user_evict_wait(void)
4363
{
4364
taskq_wait(dbu_evict_taskq);
4365
}
4366
4367
blkptr_t *
4368
dmu_buf_get_blkptr(dmu_buf_t *db)
4369
{
4370
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
4371
return (dbi->db_blkptr);
4372
}
4373
4374
objset_t *
4375
dmu_buf_get_objset(dmu_buf_t *db)
4376
{
4377
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
4378
return (dbi->db_objset);
4379
}
4380
4381
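/*
* If db_blkptr has not been set yet, point it at the proper location:
* the dnode's spill pointer, one of the dnode's direct block pointers,
* or the matching slot in the parent indirect block (holding the parent
* dbuf first if it was not already attached).
*/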
static void
4382
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
4383
{
4384
/* ASSERT(dmu_tx_is_syncing(tx) */
4385
ASSERT(MUTEX_HELD(&db->db_mtx));
4386
4387
if (db->db_blkptr != NULL)
4388
return;
4389
4390
if (db->db_blkid == DMU_SPILL_BLKID) {
4391
db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
4392
BP_ZERO(db->db_blkptr);
4393
return;
4394
}
4395
if (db->db_level == dn->dn_phys->dn_nlevels-1) {
4396
/*
4397
* This buffer was allocated at a time when there were
4398
* no available blkptrs from the dnode, or it was
4399
* inappropriate to hook it in (i.e., nlevels mismatch).
4400
*/
4401
ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
4402
ASSERT0P(db->db_parent);
4403
db->db_parent = dn->dn_dbuf;
4404
db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
4405
DBUF_VERIFY(db);
4406
} else {
4407
dmu_buf_impl_t *parent = db->db_parent;
4408
int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
4409
4410
ASSERT(dn->dn_phys->dn_nlevels > 1);
4411
if (parent == NULL) {
4412
mutex_exit(&db->db_mtx);
4413
rw_enter(&dn->dn_struct_rwlock, RW_READER);
4414
parent = dbuf_hold_level(dn, db->db_level + 1,
4415
db->db_blkid >> epbs, db);
4416
rw_exit(&dn->dn_struct_rwlock);
4417
mutex_enter(&db->db_mtx);
4418
db->db_parent = parent;
4419
}
4420
db->db_blkptr = (blkptr_t *)parent->db.db_data +
4421
(db->db_blkid & ((1ULL << epbs) - 1));
4422
DBUF_VERIFY(db);
4423
}
4424
}
4425
4426
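/*
* Sync a dirty bonus buffer: copy its contents into the dnode phys (the
* data reaches disk when the dnode itself is synced), undirty the record
* and drop the dirty hold on the dbuf.
*/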
static void
4427
dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4428
{
4429
dmu_buf_impl_t *db = dr->dr_dbuf;
4430
void *data = dr->dt.dl.dr_data;
4431
4432
ASSERT0(db->db_level);
4433
ASSERT(MUTEX_HELD(&db->db_mtx));
4434
ASSERT(db->db_blkid == DMU_BONUS_BLKID);
4435
ASSERT(data != NULL);
4436
4437
dnode_t *dn = dr->dr_dnode;
4438
ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
4439
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
4440
memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));
4441
4442
dbuf_sync_leaf_verify_bonus_dnode(dr);
4443
4444
dbuf_undirty_bonus(dr);
4445
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
4446
}
4447
4448
/*
* When syncing out a block of dnodes, adjust the block to deal with
* encryption. Normally, we make sure the block is decrypted before writing
* it. If we have crypt params, then we are writing a raw (encrypted) block
* from a raw receive. In this case, set the ARC buf's crypt params so
* that the BP will be filled with the correct byteorder, salt, iv, and mac.
*/
4455
static void
4456
dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
4457
{
4458
int err;
4459
dmu_buf_impl_t *db = dr->dr_dbuf;
4460
4461
ASSERT(MUTEX_HELD(&db->db_mtx));
4462
ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
4463
ASSERT0(db->db_level);
4464
4465
if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
4466
zbookmark_phys_t zb;
4467
4468
/*
4469
* Unfortunately, there is currently no mechanism for
4470
* syncing context to handle decryption errors. An error
4471
* here is only possible if an attacker maliciously
4472
* changed a dnode block and updated the associated
4473
* checksums going up the block tree.
4474
*/
4475
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
4476
db->db.db_object, db->db_level, db->db_blkid);
4477
err = arc_untransform(db->db_buf, db->db_objset->os_spa,
4478
&zb, B_TRUE);
4479
if (err)
4480
panic("Invalid dnode block MAC");
4481
} else if (dr->dt.dl.dr_has_raw_params) {
4482
(void) arc_release(dr->dt.dl.dr_data, db);
4483
arc_convert_to_raw(dr->dt.dl.dr_data,
4484
dmu_objset_id(db->db_objset),
4485
dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
4486
dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
4487
}
4488
}
4489
4490
/*
* dbuf_sync_indirect() is called recursively from dbuf_sync_list(), so it
* is critical that we not allow the compiler to inline this function into
* dbuf_sync_list(), thereby drastically bloating the stack usage.
*/
4495
noinline static void
4496
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4497
{
4498
dmu_buf_impl_t *db = dr->dr_dbuf;
4499
dnode_t *dn = dr->dr_dnode;
4500
4501
ASSERT(dmu_tx_is_syncing(tx));
4502
4503
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
4504
4505
mutex_enter(&db->db_mtx);
4506
4507
ASSERT(db->db_level > 0);
4508
DBUF_VERIFY(db);
4509
4510
/* Read the block if it hasn't been read yet. */
4511
if (db->db_buf == NULL) {
4512
mutex_exit(&db->db_mtx);
4513
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
4514
mutex_enter(&db->db_mtx);
4515
}
4516
ASSERT3U(db->db_state, ==, DB_CACHED);
4517
ASSERT(db->db_buf != NULL);
4518
4519
/* Indirect block size must match what the dnode thinks it is. */
4520
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
4521
dbuf_check_blkptr(dn, db);
4522
4523
/* Provide the pending dirty record to child dbufs */
4524
db->db_data_pending = dr;
4525
4526
mutex_exit(&db->db_mtx);
4527
4528
dbuf_write(dr, db->db_buf, tx);
4529
4530
zio_t *zio = dr->dr_zio;
4531
mutex_enter(&dr->dt.di.dr_mtx);
4532
dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
4533
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
4534
mutex_exit(&dr->dt.di.dr_mtx);
4535
zio_nowait(zio);
4536
}
4537
4538
/*
* Verify that the size of the data in our bonus buffer does not exceed
* its recorded size.
*
* The purpose of this verification is to catch any cases in development
* where the size of a phys structure (e.g. space_map_phys_t) grows and,
* due to incorrect feature management, older pools expect to read more
* data even though they didn't actually write it to begin with.
*
* For example, this would catch an error in the feature logic where we
* open an older pool and we expect to write the space map histogram of
* a space map with size SPACE_MAP_SIZE_V0.
*/
4551
static void
4552
dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
4553
{
4554
#ifdef ZFS_DEBUG
4555
dnode_t *dn = dr->dr_dnode;
4556
4557
/*
4558
* Encrypted bonus buffers can have data past their bonuslen.
4559
* Skip the verification of these blocks.
4560
*/
4561
if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
4562
return;
4563
4564
uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
4565
uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
4566
ASSERT3U(bonuslen, <=, maxbonuslen);
4567
4568
arc_buf_t *datap = dr->dt.dl.dr_data;
4569
char *datap_end = ((char *)datap) + bonuslen;
4570
char *datap_max = ((char *)datap) + maxbonuslen;
4571
4572
/* ensure that everything is zero after our data */
4573
for (; datap_end < datap_max; datap_end++)
4574
ASSERT0(*datap_end);
4575
#endif
4576
}
4577
4578
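/*
* Return a pointer to the block pointer that a lightweight (dbuf-less)
* dirty record will overwrite: either one of the dnode's direct block
* pointers or a slot in the level-1 parent indirect block.
*/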
static blkptr_t *
4579
dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
4580
{
4581
/* This must be a lightweight dirty record. */
4582
ASSERT0P(dr->dr_dbuf);
4583
dnode_t *dn = dr->dr_dnode;
4584
4585
if (dn->dn_phys->dn_nlevels == 1) {
4586
VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
4587
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
4588
} else {
4589
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
4590
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
4591
VERIFY3U(parent_db->db_level, ==, 1);
4592
VERIFY3P(DB_DNODE(parent_db), ==, dn);
4593
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
4594
blkptr_t *bp = parent_db->db.db_data;
4595
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
4596
}
4597
}
4598
4599
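/*
* zio ready callback for a lightweight write: charge the space delta to
* the dnode, raise dn_maxblkid if needed, set the block's fill count and
* copy the new block pointer into the parent under its rwlock.
*/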
static void
4600
dbuf_lightweight_ready(zio_t *zio)
4601
{
4602
dbuf_dirty_record_t *dr = zio->io_private;
4603
blkptr_t *bp = zio->io_bp;
4604
4605
if (zio->io_error != 0)
4606
return;
4607
4608
dnode_t *dn = dr->dr_dnode;
4609
4610
blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
4611
spa_t *spa = dmu_objset_spa(dn->dn_objset);
4612
int64_t delta = bp_get_dsize_sync(spa, bp) -
4613
bp_get_dsize_sync(spa, bp_orig);
4614
dnode_diduse_space(dn, delta);
4615
4616
uint64_t blkid = dr->dt.dll.dr_blkid;
4617
mutex_enter(&dn->dn_mtx);
4618
if (blkid > dn->dn_phys->dn_maxblkid) {
4619
ASSERT0(dn->dn_objset->os_raw_receive);
4620
dn->dn_phys->dn_maxblkid = blkid;
4621
}
4622
mutex_exit(&dn->dn_mtx);
4623
4624
if (!BP_IS_EMBEDDED(bp)) {
4625
uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
4626
BP_SET_FILL(bp, fill);
4627
}
4628
4629
dmu_buf_impl_t *parent_db;
4630
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4631
if (dr->dr_parent == NULL) {
4632
parent_db = dn->dn_dbuf;
4633
} else {
4634
parent_db = dr->dr_parent->dr_dbuf;
4635
}
4636
rw_enter(&parent_db->db_rwlock, RW_WRITER);
4637
*bp_orig = *bp;
4638
rw_exit(&parent_db->db_rwlock);
4639
}
4640
4641
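/*
* zio done callback for a lightweight write: update the dataset's block
* accounting (unless this was a rewrite or nopwrite), undirty the
* accounted space and free the ABD and the dirty record.
*/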
static void
4642
dbuf_lightweight_done(zio_t *zio)
4643
{
4644
dbuf_dirty_record_t *dr = zio->io_private;
4645
4646
VERIFY0(zio->io_error);
4647
4648
objset_t *os = dr->dr_dnode->dn_objset;
4649
dmu_tx_t *tx = os->os_synctx;
4650
4651
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
4652
ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
4653
} else {
4654
dsl_dataset_t *ds = os->os_dsl_dataset;
4655
(void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
4656
dsl_dataset_block_born(ds, zio->io_bp, tx);
4657
}
4658
4659
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
4660
zio->io_txg);
4661
4662
abd_free(dr->dt.dll.dr_abd);
4663
kmem_free(dr, sizeof (*dr));
4664
}
4665
4666
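/*
* Issue the write for a lightweight (dbuf-less) dirty record, parented
* to either the dnode's zio or the parent dirty record's zio.
*/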
noinline static void
4667
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4668
{
4669
dnode_t *dn = dr->dr_dnode;
4670
zio_t *pio;
4671
if (dn->dn_phys->dn_nlevels == 1) {
4672
pio = dn->dn_zio;
4673
} else {
4674
pio = dr->dr_parent->dr_zio;
4675
}
4676
4677
zbookmark_phys_t zb = {
4678
.zb_objset = dmu_objset_id(dn->dn_objset),
4679
.zb_object = dn->dn_object,
4680
.zb_level = 0,
4681
.zb_blkid = dr->dt.dll.dr_blkid,
4682
};
4683
4684
/*
4685
* See comment in dbuf_write(). This is so that zio->io_bp_orig
4686
* will have the old BP in dbuf_lightweight_done().
4687
*/
4688
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
4689
4690
dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
4691
dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
4692
dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
4693
&dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
4694
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
4695
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
4696
4697
zio_nowait(dr->dr_zio);
4698
}
4699
4700
/*
* dbuf_sync_leaf() is called recursively from dbuf_sync_list(), so it is
* critical that we not allow the compiler to inline this function into
* dbuf_sync_list(), thereby drastically bloating the stack usage.
*/
4705
noinline static void
4706
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4707
{
4708
arc_buf_t **datap = &dr->dt.dl.dr_data;
4709
dmu_buf_impl_t *db = dr->dr_dbuf;
4710
dnode_t *dn = dr->dr_dnode;
4711
objset_t *os;
4712
uint64_t txg = tx->tx_txg;
4713
4714
ASSERT(dmu_tx_is_syncing(tx));
4715
4716
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
4717
4718
mutex_enter(&db->db_mtx);
4719
/*
4720
* To be synced, we must be dirtied. But we might have been freed
4721
* after the dirty.
4722
*/
4723
if (db->db_state == DB_UNCACHED) {
4724
/* This buffer has been freed since it was dirtied */
4725
ASSERT0P(db->db.db_data);
4726
} else if (db->db_state == DB_FILL) {
4727
/* This buffer was freed and is now being re-filled */
4728
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
4729
} else if (db->db_state == DB_READ) {
4730
/*
4731
* This buffer was either cloned or had a Direct I/O write
4732
* occur and has an in-flight read on the BP. It is safe to
4733
* issue the write here, because the read has already been
4734
* issued and the contents won't change.
4735
*
4736
* We can verify the case of both the clone and Direct I/O
4737
* write by making sure the first dirty record for the dbuf
4738
* has no ARC buffer associated with it.
4739
*/
4740
dbuf_dirty_record_t *dr_head =
4741
list_head(&db->db_dirty_records);
4742
ASSERT0P(db->db_buf);
4743
ASSERT0P(db->db.db_data);
4744
ASSERT0P(dr_head->dt.dl.dr_data);
4745
ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
4746
} else {
4747
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
4748
}
4749
DBUF_VERIFY(db);
4750
4751
if (db->db_blkid == DMU_SPILL_BLKID) {
4752
mutex_enter(&dn->dn_mtx);
4753
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
4754
/*
4755
* In the previous transaction group, the bonus buffer
4756
* was entirely used to store the attributes for the
4757
* dnode which overrode the dn_spill field. However,
4758
* when adding more attributes to the file a spill
4759
* block was required to hold the extra attributes.
4760
*
4761
* Make sure to clear the garbage left in the dn_spill
4762
* field from the previous attributes in the bonus
4763
* buffer. Otherwise, after writing out the spill
4764
* block to the new allocated dva, it will free
4765
* the old block pointed to by the invalid dn_spill.
4766
*/
4767
db->db_blkptr = NULL;
4768
}
4769
dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
4770
mutex_exit(&dn->dn_mtx);
4771
}
4772
4773
/*
4774
* If this is a bonus buffer, simply copy the bonus data into the
4775
* dnode. It will be written out when the dnode is synced (and it
4776
* will be synced, since it must have been dirty for dbuf_sync to
4777
* be called).
4778
*/
4779
if (db->db_blkid == DMU_BONUS_BLKID) {
4780
ASSERT(dr->dr_dbuf == db);
4781
dbuf_sync_bonus(dr, tx);
4782
return;
4783
}
4784
4785
os = dn->dn_objset;
4786
4787
/*
4788
* This function may have dropped the db_mtx lock allowing a dmu_sync
4789
* operation to sneak in. As a result, we need to ensure that we
4790
* don't check the dr_override_state until we have returned from
4791
* dbuf_check_blkptr.
4792
*/
4793
dbuf_check_blkptr(dn, db);
4794
4795
/*
4796
* If this buffer is in the middle of an immediate write, wait for the
4797
* synchronous IO to complete.
4798
*
4799
* This is also valid even with Direct I/O writes setting a dirty
4800
* record's override state to DR_IN_DMU_SYNC, because all
4801
* Direct I/O writes happen in open-context.
4802
*/
4803
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
4804
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
4805
cv_wait(&db->db_changed, &db->db_mtx);
4806
}
4807
4808
/*
4809
* If this is a dnode block, ensure it is appropriately encrypted
4810
* or decrypted, depending on what we are writing to it this txg.
4811
*/
4812
if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
4813
dbuf_prepare_encrypted_dnode_leaf(dr);
4814
4815
if (*datap != NULL && *datap == db->db_buf &&
4816
dn->dn_object != DMU_META_DNODE_OBJECT &&
4817
zfs_refcount_count(&db->db_holds) > 1) {
4818
/*
4819
* If this buffer is currently "in use" (i.e., there
4820
* are active holds and db_data still references it),
4821
* then make a copy before we start the write so that
4822
* any modifications from the open txg will not leak
4823
* into this write.
4824
*
4825
* NOTE: this copy does not need to be made for
4826
* objects only modified in the syncing context (e.g.
4827
* meta-dnode blocks).
4828
*/
4829
int psize = arc_buf_size(*datap);
4830
int lsize = arc_buf_lsize(*datap);
4831
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
4832
enum zio_compress compress_type = arc_get_compression(*datap);
4833
uint8_t complevel = arc_get_complevel(*datap);
4834
4835
if (arc_is_encrypted(*datap)) {
4836
boolean_t byteorder;
4837
uint8_t salt[ZIO_DATA_SALT_LEN];
4838
uint8_t iv[ZIO_DATA_IV_LEN];
4839
uint8_t mac[ZIO_DATA_MAC_LEN];
4840
4841
arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
4842
*datap = arc_alloc_raw_buf(os->os_spa, db,
4843
dmu_objset_id(os), byteorder, salt, iv, mac,
4844
dn->dn_type, psize, lsize, compress_type,
4845
complevel);
4846
} else if (compress_type != ZIO_COMPRESS_OFF) {
4847
ASSERT3U(type, ==, ARC_BUFC_DATA);
4848
*datap = arc_alloc_compressed_buf(os->os_spa, db,
4849
psize, lsize, compress_type, complevel);
4850
} else {
4851
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
4852
}
4853
memcpy((*datap)->b_data, db->db.db_data, psize);
4854
}
4855
db->db_data_pending = dr;
4856
4857
mutex_exit(&db->db_mtx);
4858
4859
dbuf_write(dr, *datap, tx);
4860
4861
ASSERT(!list_link_active(&dr->dr_dirty_node));
4862
if (dn->dn_object == DMU_META_DNODE_OBJECT) {
4863
list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
4864
} else {
4865
zio_nowait(dr->dr_zio);
4866
}
4867
}
4868
4869
/*
4870
* Syncs out a range of dirty records for indirect or leaf dbufs. May be
4871
* called recursively from dbuf_sync_indirect().
4872
*/
4873
void
4874
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
4875
{
4876
dbuf_dirty_record_t *dr;
4877
4878
while ((dr = list_head(list))) {
4879
if (dr->dr_zio != NULL) {
4880
/*
4881
* If we find an already initialized zio then we
4882
* are processing the meta-dnode, and we have finished.
4883
* The dbufs for all dnodes are put back on the list
4884
* during processing, so that we can zio_wait()
4885
* these IOs after initiating all child IOs.
4886
*/
4887
ASSERT3U(dr->dr_dbuf->db.db_object, ==,
4888
DMU_META_DNODE_OBJECT);
4889
break;
4890
}
4891
list_remove(list, dr);
4892
if (dr->dr_dbuf == NULL) {
4893
dbuf_sync_lightweight(dr, tx);
4894
} else {
4895
if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
4896
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
4897
VERIFY3U(dr->dr_dbuf->db_level, ==, level);
4898
}
4899
if (dr->dr_dbuf->db_level > 0)
4900
dbuf_sync_indirect(dr, tx);
4901
else
4902
dbuf_sync_leaf(dr, tx);
4903
}
4904
}
4905
}
4906
4907
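/*
* zio ready callback for a dbuf write: charge the space delta to the
* dnode, raise dn_maxblkid for level-0 blocks, compute the block's fill
* count (the number of allocated dnodes or non-hole block pointers below
* it) and copy the new block pointer into the parent.
*/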
static void
4908
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
4909
{
4910
(void) buf;
4911
dmu_buf_impl_t *db = vdb;
4912
dnode_t *dn;
4913
blkptr_t *bp = zio->io_bp;
4914
blkptr_t *bp_orig = &zio->io_bp_orig;
4915
spa_t *spa = zio->io_spa;
4916
int64_t delta;
4917
uint64_t fill = 0;
4918
int i;
4919
4920
ASSERT3P(db->db_blkptr, !=, NULL);
4921
ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
4922
4923
DB_DNODE_ENTER(db);
4924
dn = DB_DNODE(db);
4925
delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
4926
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
4927
zio->io_prev_space_delta = delta;
4928
4929
if (BP_GET_BIRTH(bp) != 0) {
4930
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
4931
BP_GET_TYPE(bp) == dn->dn_type) ||
4932
(db->db_blkid == DMU_SPILL_BLKID &&
4933
BP_GET_TYPE(bp) == dn->dn_bonustype) ||
4934
BP_IS_EMBEDDED(bp));
4935
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
4936
}
4937
4938
mutex_enter(&db->db_mtx);
4939
4940
#ifdef ZFS_DEBUG
4941
if (db->db_blkid == DMU_SPILL_BLKID) {
4942
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
4943
ASSERT(!(BP_IS_HOLE(bp)) &&
4944
db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
4945
}
4946
#endif
4947
4948
if (db->db_level == 0) {
4949
mutex_enter(&dn->dn_mtx);
4950
if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
4951
db->db_blkid != DMU_SPILL_BLKID) {
4952
ASSERT0(db->db_objset->os_raw_receive);
4953
dn->dn_phys->dn_maxblkid = db->db_blkid;
4954
}
4955
mutex_exit(&dn->dn_mtx);
4956
4957
if (dn->dn_type == DMU_OT_DNODE) {
4958
i = 0;
4959
while (i < db->db.db_size) {
4960
dnode_phys_t *dnp =
4961
(void *)(((char *)db->db.db_data) + i);
4962
4963
i += DNODE_MIN_SIZE;
4964
if (dnp->dn_type != DMU_OT_NONE) {
4965
fill++;
4966
for (int j = 0; j < dnp->dn_nblkptr;
4967
j++) {
4968
(void) zfs_blkptr_verify(spa,
4969
&dnp->dn_blkptr[j],
4970
BLK_CONFIG_SKIP,
4971
BLK_VERIFY_HALT);
4972
}
4973
if (dnp->dn_flags &
4974
DNODE_FLAG_SPILL_BLKPTR) {
4975
(void) zfs_blkptr_verify(spa,
4976
DN_SPILL_BLKPTR(dnp),
4977
BLK_CONFIG_SKIP,
4978
BLK_VERIFY_HALT);
4979
}
4980
i += dnp->dn_extra_slots *
4981
DNODE_MIN_SIZE;
4982
}
4983
}
4984
} else {
4985
if (BP_IS_HOLE(bp)) {
4986
fill = 0;
4987
} else {
4988
fill = 1;
4989
}
4990
}
4991
} else {
4992
blkptr_t *ibp = db->db.db_data;
4993
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
4994
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
4995
if (BP_IS_HOLE(ibp))
4996
continue;
4997
(void) zfs_blkptr_verify(spa, ibp,
4998
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
4999
fill += BP_GET_FILL(ibp);
5000
}
5001
}
5002
DB_DNODE_EXIT(db);
5003
5004
if (!BP_IS_EMBEDDED(bp))
5005
BP_SET_FILL(bp, fill);
5006
5007
mutex_exit(&db->db_mtx);
5008
5009
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
5010
*db->db_blkptr = *bp;
5011
dmu_buf_unlock_parent(db, dblt, FTAG);
5012
}
5013
5014
/*
* This function gets called just prior to running through the compression
* stage of the zio pipeline. If we're an indirect block comprised of only
* holes, then we want this indirect to be compressed away to a hole. In
* order to do that we must zero out any information about the holes that
* this indirect points to before we try to compress it.
*/
5021
static void
5022
dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
5023
{
5024
(void) zio, (void) buf;
5025
dmu_buf_impl_t *db = vdb;
5026
blkptr_t *bp;
5027
unsigned int epbs, i;
5028
5029
ASSERT3U(db->db_level, >, 0);
5030
DB_DNODE_ENTER(db);
5031
epbs = DB_DNODE(db)->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
5032
DB_DNODE_EXIT(db);
5033
ASSERT3U(epbs, <, 31);
5034
5035
/* Determine if all our children are holes */
5036
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
5037
if (!BP_IS_HOLE(bp))
5038
break;
5039
}
5040
5041
/*
5042
* If all the children are holes, then zero them all out so that
5043
* we may get compressed away.
5044
*/
5045
if (i == 1ULL << epbs) {
5046
/*
5047
* We only found holes. Grab the rwlock to prevent
5048
* anybody from reading the blocks we're about to
5049
* zero out.
5050
*/
5051
rw_enter(&db->db_rwlock, RW_WRITER);
5052
memset(db->db.db_data, 0, db->db.db_size);
5053
rw_exit(&db->db_rwlock);
5054
}
5055
}
5056
5057
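/*
* zio done callback for a dbuf write: update the dataset's block
* accounting (unless this was a rewrite or nopwrite), free any private
* copy of the data, tear down the completed dirty record and drop the
* dirty hold on the dbuf.
*/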
static void
5058
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
5059
{
5060
(void) buf;
5061
dmu_buf_impl_t *db = vdb;
5062
blkptr_t *bp_orig = &zio->io_bp_orig;
5063
blkptr_t *bp = db->db_blkptr;
5064
objset_t *os = db->db_objset;
5065
dmu_tx_t *tx = os->os_synctx;
5066
5067
ASSERT0(zio->io_error);
5068
ASSERT(db->db_blkptr == bp);
5069
5070
/*
5071
* For nopwrites and rewrites we ensure that the bp matches our
5072
* original and bypass all the accounting.
5073
*/
5074
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
5075
ASSERT(BP_EQUAL(bp, bp_orig));
5076
} else {
5077
dsl_dataset_t *ds = os->os_dsl_dataset;
5078
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
5079
dsl_dataset_block_born(ds, bp, tx);
5080
}
5081
5082
mutex_enter(&db->db_mtx);
5083
5084
DBUF_VERIFY(db);
5085
5086
dbuf_dirty_record_t *dr = db->db_data_pending;
5087
dnode_t *dn = dr->dr_dnode;
5088
ASSERT(!list_link_active(&dr->dr_dirty_node));
5089
ASSERT(dr->dr_dbuf == db);
5090
ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
5091
list_remove(&db->db_dirty_records, dr);
5092
5093
#ifdef ZFS_DEBUG
5094
if (db->db_blkid == DMU_SPILL_BLKID) {
5095
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
5096
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
5097
db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
5098
}
5099
#endif
5100
5101
if (db->db_level == 0) {
5102
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
5103
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
5104
5105
/* no dr_data if this is a NO_FILL or Direct I/O */
5106
if (dr->dt.dl.dr_data != NULL &&
5107
dr->dt.dl.dr_data != db->db_buf) {
5108
ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
5109
ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
5110
arc_buf_destroy(dr->dt.dl.dr_data, db);
5111
}
5112
} else {
5113
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
5114
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
5115
if (!BP_IS_HOLE(db->db_blkptr)) {
5116
int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
5117
SPA_BLKPTRSHIFT;
5118
ASSERT3U(db->db_blkid, <=,
5119
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
5120
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
5121
db->db.db_size);
5122
}
5123
mutex_destroy(&dr->dt.di.dr_mtx);
5124
list_destroy(&dr->dt.di.dr_children);
5125
}
5126
5127
cv_broadcast(&db->db_changed);
5128
ASSERT(db->db_dirtycnt > 0);
5129
db->db_dirtycnt -= 1;
5130
db->db_data_pending = NULL;
5131
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
5132
5133
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
5134
zio->io_txg);
5135
5136
kmem_cache_free(dbuf_dirty_kmem_cache, dr);
5137
}
5138
5139
static void
5140
dbuf_write_nofill_ready(zio_t *zio)
5141
{
5142
dbuf_write_ready(zio, NULL, zio->io_private);
5143
}
5144
5145
static void
5146
dbuf_write_nofill_done(zio_t *zio)
5147
{
5148
dbuf_write_done(zio, NULL, zio->io_private);
5149
}
5150
5151
static void
5152
dbuf_write_override_ready(zio_t *zio)
5153
{
5154
dbuf_dirty_record_t *dr = zio->io_private;
5155
dmu_buf_impl_t *db = dr->dr_dbuf;
5156
5157
dbuf_write_ready(zio, NULL, db);
5158
}
5159
5160
static void
5161
dbuf_write_override_done(zio_t *zio)
5162
{
5163
dbuf_dirty_record_t *dr = zio->io_private;
5164
dmu_buf_impl_t *db = dr->dr_dbuf;
5165
blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
5166
5167
mutex_enter(&db->db_mtx);
5168
if (!BP_EQUAL(zio->io_bp, obp)) {
5169
if (!BP_IS_HOLE(obp))
5170
dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
5171
arc_release(dr->dt.dl.dr_data, db);
5172
}
5173
mutex_exit(&db->db_mtx);
5174
5175
dbuf_write_done(zio, NULL, db);
5176
5177
if (zio->io_abd != NULL)
5178
abd_free(zio->io_abd);
5179
}
5180
5181
typedef struct dbuf_remap_impl_callback_arg {
5182
objset_t *drica_os;
5183
uint64_t drica_blk_birth;
5184
dmu_tx_t *drica_tx;
5185
} dbuf_remap_impl_callback_arg_t;
5186
5187
static void
5188
dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
5189
void *arg)
5190
{
5191
dbuf_remap_impl_callback_arg_t *drica = arg;
5192
objset_t *os = drica->drica_os;
5193
spa_t *spa = dmu_objset_spa(os);
5194
dmu_tx_t *tx = drica->drica_tx;
5195
5196
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
5197
5198
if (os == spa_meta_objset(spa)) {
5199
spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
5200
} else {
5201
dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
5202
size, drica->drica_blk_birth, tx);
5203
}
5204
}

static void
dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
{
	blkptr_t bp_copy = *bp;
	spa_t *spa = dmu_objset_spa(dn->dn_objset);
	dbuf_remap_impl_callback_arg_t drica;

	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));

	drica.drica_os = dn->dn_objset;
	drica.drica_blk_birth = BP_GET_BIRTH(bp);
	drica.drica_tx = tx;
	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
	    &drica)) {
		/*
		 * If the blkptr being remapped is tracked by a livelist,
		 * then we need to make sure the livelist reflects the update.
		 * First, cancel out the old blkptr by appending a 'FREE'
		 * entry. Next, add an 'ALLOC' to track the new version. This
		 * way we avoid trying to free an inaccurate blkptr at delete.
		 * Note that embedded blkptrs are not tracked in livelists.
		 */
		if (dn->dn_objset != spa_meta_objset(spa)) {
			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
			    BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
				ASSERT(!BP_IS_EMBEDDED(bp));
				ASSERT(dsl_dir_is_clone(ds->ds_dir));
				ASSERT(spa_feature_is_enabled(spa,
				    SPA_FEATURE_LIVELIST));
				bplist_append(&ds->ds_dir->dd_pending_frees,
				    bp);
				bplist_append(&ds->ds_dir->dd_pending_allocs,
				    &bp_copy);
			}
		}

		/*
		 * The db_rwlock prevents dbuf_read_impl() from
		 * dereferencing the BP while we are changing it. To
		 * avoid lock contention, only grab it when we are actually
		 * changing the BP.
		 */
		if (rw != NULL)
			rw_enter(rw, RW_WRITER);
		*bp = bp_copy;
		if (rw != NULL)
			rw_exit(rw);
	}
}
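
/*
 * Note that the remap above is performed on a private copy (bp_copy); the
 * in-place BP is only overwritten once spa_remap_blkptr() reports a change,
 * and that final store is the only step that requires the parent's
 * db_rwlock.  Callers pass rw == NULL when there is no parent dbuf guarding
 * the BP.
 */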

/*
 * Remap any existing BP's to concrete vdevs, if possible.
 */
static void
dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(db->db_objset);
	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));

	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
		return;

	if (db->db_level > 0) {
		blkptr_t *bp = db->db.db_data;
		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
			dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
		}
	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		dnode_phys_t *dnp = db->db.db_data;
		ASSERT3U(dn->dn_type, ==, DMU_OT_DNODE);
		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
		    i += dnp[i].dn_extra_slots + 1) {
			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
				krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
				    &dn->dn_dbuf->db_rwlock);
				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
				    tx);
			}
		}
	}
}
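
/*
 * For a sense of scale (values come from the on-disk format, not tunables):
 * a block pointer is 1 << SPA_BLKPTRSHIFT = 128 bytes and a dnode slot is
 * 1 << DNODE_SHIFT = 512 bytes, so a 128K indirect block is walked as
 * 128K / 128 = 1024 BPs, and a (typically 16K) meta-dnode block as
 * 16K / 512 = 32 dnode slots, each contributing dn_nblkptr BPs (large
 * dnodes advance the loop by dn_extra_slots + 1 so their extra slots are
 * skipped).
 */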

/*
 * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
 * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = dr->dr_dnode;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *pio; /* parent I/O */
	int wp_flag = 0;

	ASSERT(dmu_tx_is_syncing(tx));

	os = dn->dn_objset;

	if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
		/*
		 * Private object buffers are released here rather than in
		 * dbuf_dirty() since they are only modified in the syncing
		 * context and we don't want the overhead of making multiple
		 * copies of the data.
		 */
		if (BP_IS_HOLE(db->db_blkptr))
			arc_buf_thaw(data);
		else
			dbuf_release_bp(db);
		dbuf_remap(dn, db, tx);
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		pio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		pio = dn->dn_zio;
	}
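
	/*
	 * Either way, the write built below is issued as a child of "pio":
	 * the zio of the dirty parent indirect block, or the dnode's own
	 * sync zio (dn_zio).  This keeps the parent from being written out
	 * before this block's new BP has been filled in.
	 */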

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg);
	ASSERT(pio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (data == NULL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);

	/*
	 * Set rewrite properties for zfs_rewrite() operations.
	 */
	if (db->db_level == 0 && dr->dt.dl.dr_rewrite) {
		zp.zp_rewrite = B_TRUE;

		/*
		 * Mark physical rewrite feature for activation.
		 * This will be activated automatically during dataset sync.
		 */
		dsl_dataset_t *ds = os->os_dsl_dataset;
		if (!dsl_dataset_feature_is_active(ds,
		    SPA_FEATURE_PHYSICAL_REWRITE)) {
			ds->ds_feature_activation[
			    SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE;
		}
	}
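
	/*
	 * dr_rewrite is set in open context for zfs_rewrite() requests
	 * (dmu_buf_will_rewrite() is the expected entry point); zp_rewrite
	 * marks the write as a physical rewrite of logically unchanged
	 * data, and the dataset is flagged above so that
	 * SPA_FEATURE_PHYSICAL_REWRITE gets activated during this sync.
	 */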

	/*
	 * We copy the blkptr now (rather than when we instantiate the dirty
	 * record), because its value can change between open context and
	 * syncing context. We do not need to hold dn_struct_rwlock to read
	 * db_blkptr because we are in syncing context.
	 */
	dr->dr_bp_copy = *db->db_blkptr;

	if (db->db_level == 0 &&
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * The BP for this block has been provided by open context
		 * (by dmu_sync(), dmu_write_direct(),
		 * or dmu_buf_write_embedded()).
		 */
		abd_t *contents = (data != NULL) ?
		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;

		dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
		    contents, db->db.db_size, db->db.db_size, &zp,
		    dbuf_write_override_ready, NULL,
		    dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
		    dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
		mutex_exit(&db->db_mtx);
	} else if (data == NULL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(pio, os->os_spa, txg,
		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL,
		    dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));

		/*
		 * For indirect blocks, we want to setup the children
		 * ready callback so that we can properly handle an indirect
		 * block that only contains holes.
		 */
		arc_write_done_func_t *children_ready_cb = NULL;
		if (db->db_level != 0)
			children_ready_cb = dbuf_write_children_ready;

		dr->dr_zio = arc_write(pio, os->os_spa, txg,
		    &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
		    dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,
		    children_ready_cb, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
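
/*
 * dbuf_write() only constructs dr->dr_zio; issuing it is left to the caller.
 * The sync path looks roughly like this (a sketch; see dbuf_sync_leaf() and
 * dbuf_sync_indirect() for the real sequencing):
 *
 *	dbuf_write(dr, data, tx);
 *	(for indirect blocks: sync any child dirty records under dr->dr_zio)
 *	zio_nowait(dr->dr_zio);
 *
 * Completion then runs dbuf_write_ready() to fill in the block pointer and
 * dbuf_write_done() to clear the dirty state and undirty the accounted
 * space (dsl_pool_undirty_space()).
 */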

EXPORT_SYMBOL(dbuf_find);
EXPORT_SYMBOL(dbuf_is_metadata);
EXPORT_SYMBOL(dbuf_destroy);
EXPORT_SYMBOL(dbuf_whichblock);
EXPORT_SYMBOL(dbuf_read);
EXPORT_SYMBOL(dbuf_unoverride);
EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_rewrite);
EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
EXPORT_SYMBOL(dmu_buf_rele);
EXPORT_SYMBOL(dbuf_assign_arcbuf);
EXPORT_SYMBOL(dbuf_prefetch);
EXPORT_SYMBOL(dbuf_hold_impl);
EXPORT_SYMBOL(dbuf_hold);
EXPORT_SYMBOL(dbuf_hold_level);
EXPORT_SYMBOL(dbuf_create_bonus);
EXPORT_SYMBOL(dbuf_spill_set_blksz);
EXPORT_SYMBOL(dbuf_rm_spill);
EXPORT_SYMBOL(dbuf_add_ref);
EXPORT_SYMBOL(dbuf_rele);
EXPORT_SYMBOL(dbuf_rele_and_unlock);
EXPORT_SYMBOL(dbuf_refcount);
EXPORT_SYMBOL(dbuf_sync_list);
EXPORT_SYMBOL(dmu_buf_set_user);
EXPORT_SYMBOL(dmu_buf_set_user_ie);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_get_blkptr);

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
	"Maximum size in bytes of the dbuf cache.");

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
	"Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");

ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
	"Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
	"Maximum size in bytes of dbuf metadata cache.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
	"Set size of dbuf cache to log2 fraction of arc size.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
	"Set size of dbuf metadata cache to log2 fraction of arc size.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
	"Set size of dbuf cache mutex array as log2 shift.");