Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zfs/dmu_direct.c
48383 views
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#include <sys/dmu.h>
25
#include <sys/dmu_impl.h>
26
#include <sys/dbuf.h>
27
#include <sys/dnode.h>
28
#include <sys/zfs_context.h>
29
#include <sys/zfs_racct.h>
30
#include <sys/dsl_dataset.h>
31
#include <sys/dmu_objset.h>
32
33
/*
 * Build an ABD that covers this dbuf's entire block using the caller's
 * data ABD, which spans [offset, offset + size) in object space.  Any
 * part of the block that falls outside the caller's range is covered by
 * freshly allocated throwaway I/O buffers; when padding is needed the
 * pieces are stitched together into a gang ABD.
 *
 * Caller must hold db->db_mtx.  Returns a newly created ABD which the
 * caller is responsible for freeing (freeing the gang frees the padding
 * buffers and releases the offset view of "data").
 */
static abd_t *
make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
    uint64_t size)
{
	size_t buf_size = db->db.db_size;
	abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
	size_t buf_off = 0;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (offset > db->db.db_offset) {
		/* Block begins before the caller's range: pad the front. */
		size_t pre_size = offset - db->db.db_offset;
		pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
		buf_size -= pre_size;
		buf_off = 0;
	} else {
		/* Caller's range begins first: skip into "data" instead. */
		buf_off = db->db.db_offset - offset;
		size -= buf_off;
	}

	if (size < buf_size) {
		/* Caller's range ends before the block does: pad the tail. */
		size_t post_size = buf_size - size;
		post_buf = abd_alloc_for_io(post_size, B_TRUE);
		buf_size -= post_size;
	}

	/* The caller's range must overlap the block by at least one byte. */
	ASSERT3U(buf_size, >, 0);
	abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);

	if (pre_buf || post_buf) {
		/* Combine padding and caller data into a single gang ABD. */
		mbuf = abd_alloc_gang();
		if (pre_buf)
			abd_gang_add(mbuf, pre_buf, B_TRUE);
		abd_gang_add(mbuf, buf, B_TRUE);
		if (post_buf)
			abd_gang_add(mbuf, post_buf, B_TRUE);
	} else {
		mbuf = buf;
	}

	return (mbuf);
}
75
76
static void
77
dmu_read_abd_done(zio_t *zio)
78
{
79
abd_free(zio->io_abd);
80
}
81
82
static void
83
dmu_write_direct_ready(zio_t *zio)
84
{
85
dmu_sync_ready(zio, NULL, zio->io_private);
86
}
87
88
/*
 * Completion callback for a Direct I/O write zio.  Frees the write ABD,
 * returns the dbuf to DB_UNCACHED, finishes the dmu_sync protocol, and —
 * on error — undirties the open-context dirty record created by
 * dmu_write_direct().  The bp allocated in dmu_write_direct() is freed
 * here as well.
 */
static void
dmu_write_direct_done(zio_t *zio)
{
	dmu_sync_arg_t *dsa = zio->io_private;
	dbuf_dirty_record_t *dr = dsa->dsa_dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	abd_free(zio->io_abd);

	mutex_enter(&db->db_mtx);
	/* A Direct I/O write never attaches any data to the dbuf. */
	ASSERT0P(db->db_buf);
	ASSERT0P(dr->dt.dl.dr_data);
	ASSERT0P(db->db.db_data);
	db->db_state = DB_UNCACHED;
	mutex_exit(&db->db_mtx);

	dmu_sync_done(zio, NULL, zio->io_private);

	if (zio->io_error != 0) {
		/* A Direct I/O checksum error is always surfaced as EIO. */
		if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
			ASSERT3U(zio->io_error, ==, EIO);

		/*
		 * In the event of an I/O error this block has been freed in
		 * zio_done() through zio_dva_unallocate(). Calling
		 * dmu_sync_done() above set dr_override_state to
		 * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
		 * dbuf_unoverride(), it will skip doing zio_free() to free
		 * this block as that was already taken care of.
		 *
		 * Since we are undirtying the record in open-context, we must
		 * have a hold on the db, so it should never be evicted after
		 * calling dbuf_undirty().
		 */
		mutex_enter(&db->db_mtx);
		VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
		mutex_exit(&db->db_mtx);
	}

	kmem_free(zio->io_bp, sizeof (blkptr_t));
	zio->io_bp = NULL;
}
130
131
/*
 * Issue a Direct I/O write of the caller's ABD for the single block
 * backing "db".  The dbuf is dirtied without data attached and the write
 * is staged through the dmu_sync override machinery (DR_IN_DMU_SYNC) so
 * that syncing context adopts the block pointer produced here.
 *
 * If pio is NULL the write is waited on and its error returned;
 * otherwise the child zio is issued asynchronously under pio and 0 is
 * returned (errors are then handled in dmu_write_direct_done()).
 */
int
dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
{
	objset_t *os = db->db_objset;
	dsl_dataset_t *ds = dmu_objset_ds(os);
	zbookmark_phys_t zb;
	dbuf_dirty_record_t *dr_head;

	SET_BOOKMARK(&zb, ds->ds_object,
	    db->db.db_object, db->db_level, db->db_blkid);

	DB_DNODE_ENTER(db);
	zio_prop_t zp;
	dmu_write_policy(os, DB_DNODE(db), db->db_level,
	    WP_DMU_SYNC | WP_DIRECT_WR, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * Dirty this dbuf with DB_NOFILL since we will not have any data
	 * associated with the dbuf.
	 */
	dmu_buf_will_clone_or_dio(&db->db, tx);

	mutex_enter(&db->db_mtx);

	/* The open-context txg must not already be syncing or synced. */
	uint64_t txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
	ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));

	dr_head = list_head(&db->db_dirty_records);
	ASSERT3U(dr_head->dr_txg, ==, txg);
	dr_head->dt.dl.dr_diowrite = B_TRUE;
	dr_head->dr_accounted = db->db.db_size;

	/* Freed in dmu_write_direct_done() after the zio completes. */
	blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
	if (db->db_blkptr != NULL) {
		/*
		 * Fill in bp with the current block pointer so that
		 * the nopwrite code can check if we're writing the same
		 * data that's already on disk.
		 */
		*bp = *db->db_blkptr;
	} else {
		memset(bp, 0, sizeof (blkptr_t));
	}

	/*
	 * Disable nopwrite if the current block pointer could change
	 * before this TXG syncs.
	 */
	if (list_next(&db->db_dirty_records, dr_head) != NULL)
		zp.zp_nopwrite = B_FALSE;

	ASSERT0(dr_head->dt.dl.dr_has_raw_params);
	ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
	dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;

	mutex_exit(&db->db_mtx);

	dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);

	/* dsa is consumed by the ready/done callbacks. */
	dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = dr_head;
	dsa->dsa_tx = tx;

	zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
	    db->db.db_size, db->db.db_size, &zp,
	    dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);

	if (pio == NULL)
		return (zio_wait(zio));

	zio_nowait(zio);

	return (0);
}
208
209
/*
 * Perform a Direct I/O write of [offset, offset + size) from "data".
 * Holds every dbuf covering the range, issues one dmu_write_direct()
 * per block under a shared root zio, and waits for all of them to
 * complete before releasing the dbufs.
 *
 * NOTE(review): each block's slice of "data" starts at
 * (db_offset - offset) and is dn_datablksz long, so offset/size appear
 * to be assumed block-aligned here — confirm against callers.
 */
int
dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
    abd_t *data, dmu_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	spa_t *spa = dn->dn_objset->os_spa;
	int numbufs, err;

	ASSERT(flags & DMU_DIRECTIO);

	err = dmu_buf_hold_array_by_dnode(dn, offset,
	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
	if (err)
		return (err);

	zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	/* Stop issuing new writes as soon as one block fails to stage. */
	for (int i = 0; i < numbufs && err == 0; i++) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];

		/* abd is freed by dmu_write_direct_done(). */
		abd_t *abd = abd_get_offset_size(data,
		    db->db.db_offset - offset, dn->dn_datablksz);

		zfs_racct_write(spa, db->db.db_size, 1, flags);
		err = dmu_write_direct(pio, db, abd, tx);
		ASSERT0(err);
	}

	err = zio_wait(pio);

	/*
	 * The dbuf must be held until the Direct I/O write has completed in
	 * the event there was any errors and dbuf_undirty() was called.
	 */
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}
247
248
/*
 * Perform a Direct I/O read of [offset, offset + size) into "data".
 * Holes are zero-filled and already-cached blocks are copied out of the
 * dbuf; every other block is read from disk directly into the caller's
 * ABD under a shared root zio.  Returns 0, or the first error from
 * either BP lookup or the issued reads.
 */
int
dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
    abd_t *data, dmu_flags_t flags)
{
	objset_t *os = dn->dn_objset;
	spa_t *spa = os->os_spa;
	dmu_buf_t **dbp;
	int numbufs, err;

	ASSERT(flags & DMU_DIRECTIO);

	err = dmu_buf_hold_array_by_dnode(dn, offset,
	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
	if (err)
		return (err);

	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	for (int i = 0; i < numbufs; i++) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
		abd_t *mbuf;
		zbookmark_phys_t zb;
		blkptr_t *bp;

		mutex_enter(&db->db_mtx);

		SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
		    db->db.db_object, db->db_level, db->db_blkid);

		/*
		 * If there is another read for this dbuf, we will wait for
		 * that to complete first before checking the db_state below.
		 */
		while (db->db_state == DB_READ)
			cv_wait(&db->db_changed, &db->db_mtx);

		err = dmu_buf_get_bp_from_dbuf(db, &bp);
		if (err) {
			mutex_exit(&db->db_mtx);
			goto error;
		}

		/*
		 * There is no need to read if this is a hole or the data is
		 * cached. This will not be considered a direct read for IO
		 * accounting in the same way that an ARC hit is not counted.
		 */
		if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
			/* Overlap of this block with the caller's range. */
			size_t aoff = offset < db->db.db_offset ?
			    db->db.db_offset - offset : 0;
			size_t boff = offset > db->db.db_offset ?
			    offset - db->db.db_offset : 0;
			size_t len = MIN(size - aoff, db->db.db_size - boff);

			if (db->db_state == DB_CACHED) {
				/*
				 * We need to untransform the ARC buf data
				 * before we copy it over.
				 */
				err = dmu_buf_untransform_direct(db, spa);
				ASSERT0(err);
				abd_copy_from_buf_off(data,
				    (char *)db->db.db_data + boff, aoff, len);
			} else {
				/* Hole: the range reads back as zeros. */
				abd_zero_off(data, aoff, len);
			}

			mutex_exit(&db->db_mtx);
			continue;
		}

		/* mbuf is freed by dmu_read_abd_done() when the zio ends. */
		mbuf = make_abd_for_dbuf(db, data, offset, size);
		ASSERT3P(mbuf, !=, NULL);

		/*
		 * The dbuf mutex (db_mtx) must be held when creating the ZIO
		 * for the read. The BP returned from
		 * dmu_buf_get_bp_from_dbuf() could be from a pending block
		 * clone or a yet to be synced Direct I/O write that is in the
		 * dbuf's dirty record. When zio_read() is called, zio_create()
		 * will make a copy of the BP. However, if zio_read() is called
		 * without the mutex being held then the dirty record from the
		 * dbuf could be freed in dbuf_write_done() resulting in garbage
		 * being set for the zio BP.
		 */
		zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
		    dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);
		mutex_exit(&db->db_mtx);

		zfs_racct_read(spa, db->db.db_size, 1, flags);
		zio_nowait(cio);
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (zio_wait(rio));

error:
	/* Drain any reads already issued before reporting the error. */
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	(void) zio_wait(rio);
	return (err);
}
351
352
#ifdef _KERNEL
353
int
354
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
355
dmu_flags_t flags)
356
{
357
offset_t offset = zfs_uio_offset(uio);
358
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
359
int err;
360
361
ASSERT(uio->uio_extflg & UIO_DIRECT);
362
ASSERT3U(page_index, <, uio->uio_dio.npages);
363
364
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
365
offset & (PAGESIZE - 1), size);
366
err = dmu_read_abd(dn, offset, size, data, flags);
367
abd_free(data);
368
369
if (err == 0)
370
zfs_uioskip(uio, size);
371
372
return (err);
373
}
374
375
int
376
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
377
dmu_flags_t flags, dmu_tx_t *tx)
378
{
379
offset_t offset = zfs_uio_offset(uio);
380
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
381
int err;
382
383
ASSERT(uio->uio_extflg & UIO_DIRECT);
384
ASSERT3U(page_index, <, uio->uio_dio.npages);
385
386
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
387
offset & (PAGESIZE - 1), size);
388
err = dmu_write_abd(dn, offset, size, data, flags, tx);
389
abd_free(data);
390
391
if (err == 0)
392
zfs_uioskip(uio, size);
393
394
return (err);
395
}
396
#endif /* _KERNEL */
397
398
EXPORT_SYMBOL(dmu_read_abd);
399
EXPORT_SYMBOL(dmu_write_abd);
400
401