Path: blob/main/sys/contrib/openzfs/module/zfs/dmu_direct.c
48383 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/212223#include <sys/dmu.h>24#include <sys/dmu_impl.h>25#include <sys/dbuf.h>26#include <sys/dnode.h>27#include <sys/zfs_context.h>28#include <sys/zfs_racct.h>29#include <sys/dsl_dataset.h>30#include <sys/dmu_objset.h>3132static abd_t *33make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,34uint64_t size)35{36size_t buf_size = db->db.db_size;37abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;38size_t buf_off = 0;3940ASSERT(MUTEX_HELD(&db->db_mtx));4142if (offset > db->db.db_offset) {43size_t pre_size = offset - db->db.db_offset;44pre_buf = abd_alloc_for_io(pre_size, B_TRUE);45buf_size -= pre_size;46buf_off = 0;47} else {48buf_off = db->db.db_offset - offset;49size -= buf_off;50}5152if (size < buf_size) {53size_t post_size = buf_size - size;54post_buf = abd_alloc_for_io(post_size, B_TRUE);55buf_size -= post_size;56}5758ASSERT3U(buf_size, >, 0);59abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);6061if (pre_buf || post_buf) {62mbuf = abd_alloc_gang();63if (pre_buf)64abd_gang_add(mbuf, pre_buf, B_TRUE);65abd_gang_add(mbuf, buf, B_TRUE);66if (post_buf)67abd_gang_add(mbuf, post_buf, B_TRUE);68} else {69mbuf = buf;70}7172return (mbuf);73}7475static void76dmu_read_abd_done(zio_t *zio)77{78abd_free(zio->io_abd);79}8081static void82dmu_write_direct_ready(zio_t *zio)83{84dmu_sync_ready(zio, NULL, zio->io_private);85}8687static void88dmu_write_direct_done(zio_t *zio)89{90dmu_sync_arg_t *dsa = zio->io_private;91dbuf_dirty_record_t *dr = dsa->dsa_dr;92dmu_buf_impl_t *db = dr->dr_dbuf;9394abd_free(zio->io_abd);9596mutex_enter(&db->db_mtx);97ASSERT0P(db->db_buf);98ASSERT0P(dr->dt.dl.dr_data);99ASSERT0P(db->db.db_data);100db->db_state = DB_UNCACHED;101mutex_exit(&db->db_mtx);102103dmu_sync_done(zio, NULL, zio->io_private);104105if (zio->io_error != 0) {106if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)107ASSERT3U(zio->io_error, ==, EIO);108109/*110* In the event of an I/O error this block has been freed in111* zio_done() through zio_dva_unallocate(). Calling112* dmu_sync_done() above set dr_override_state to113* DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls114* dbuf_unoverride(), it will skip doing zio_free() to free115* this block as that was already taken care of.116*117* Since we are undirtying the record in open-context, we must118* have a hold on the db, so it should never be evicted after119* calling dbuf_undirty().120*/121mutex_enter(&db->db_mtx);122VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);123mutex_exit(&db->db_mtx);124}125126kmem_free(zio->io_bp, sizeof (blkptr_t));127zio->io_bp = NULL;128}129130int131dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)132{133objset_t *os = db->db_objset;134dsl_dataset_t *ds = dmu_objset_ds(os);135zbookmark_phys_t zb;136dbuf_dirty_record_t *dr_head;137138SET_BOOKMARK(&zb, ds->ds_object,139db->db.db_object, db->db_level, db->db_blkid);140141DB_DNODE_ENTER(db);142zio_prop_t zp;143dmu_write_policy(os, DB_DNODE(db), db->db_level,144WP_DMU_SYNC | WP_DIRECT_WR, &zp);145DB_DNODE_EXIT(db);146147/*148* Dirty this dbuf with DB_NOFILL since we will not have any data149* associated with the dbuf.150*/151dmu_buf_will_clone_or_dio(&db->db, tx);152153mutex_enter(&db->db_mtx);154155uint64_t txg = dmu_tx_get_txg(tx);156ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));157ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));158159dr_head = list_head(&db->db_dirty_records);160ASSERT3U(dr_head->dr_txg, ==, txg);161dr_head->dt.dl.dr_diowrite = B_TRUE;162dr_head->dr_accounted = db->db.db_size;163164blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);165if (db->db_blkptr != NULL) {166/*167* Fill in bp with the current block pointer so that168* the nopwrite code can check if we're writing the same169* data that's already on disk.170*/171*bp = *db->db_blkptr;172} else {173memset(bp, 0, sizeof (blkptr_t));174}175176/*177* Disable nopwrite if the current block pointer could change178* before this TXG syncs.179*/180if (list_next(&db->db_dirty_records, dr_head) != NULL)181zp.zp_nopwrite = B_FALSE;182183ASSERT0(dr_head->dt.dl.dr_has_raw_params);184ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);185dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;186187mutex_exit(&db->db_mtx);188189dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);190191dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);192dsa->dsa_dr = dr_head;193dsa->dsa_tx = tx;194195zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,196db->db.db_size, db->db.db_size, &zp,197dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,198ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);199200if (pio == NULL)201return (zio_wait(zio));202203zio_nowait(zio);204205return (0);206}207208int209dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,210abd_t *data, dmu_flags_t flags, dmu_tx_t *tx)211{212dmu_buf_t **dbp;213spa_t *spa = dn->dn_objset->os_spa;214int numbufs, err;215216ASSERT(flags & DMU_DIRECTIO);217218err = dmu_buf_hold_array_by_dnode(dn, offset,219size, B_FALSE, FTAG, &numbufs, &dbp, flags);220if (err)221return (err);222223zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);224225for (int i = 0; i < numbufs && err == 0; i++) {226dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];227228abd_t *abd = abd_get_offset_size(data,229db->db.db_offset - offset, dn->dn_datablksz);230231zfs_racct_write(spa, db->db.db_size, 1, flags);232err = dmu_write_direct(pio, db, abd, tx);233ASSERT0(err);234}235236err = zio_wait(pio);237238/*239* The dbuf must be held until the Direct I/O write has completed in240* the event there was any errors and dbuf_undirty() was called.241*/242dmu_buf_rele_array(dbp, numbufs, FTAG);243244return (err);245}246247int248dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,249abd_t *data, dmu_flags_t flags)250{251objset_t *os = dn->dn_objset;252spa_t *spa = os->os_spa;253dmu_buf_t **dbp;254int numbufs, err;255256ASSERT(flags & DMU_DIRECTIO);257258err = dmu_buf_hold_array_by_dnode(dn, offset,259size, B_FALSE, FTAG, &numbufs, &dbp, flags);260if (err)261return (err);262263zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);264265for (int i = 0; i < numbufs; i++) {266dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];267abd_t *mbuf;268zbookmark_phys_t zb;269blkptr_t *bp;270271mutex_enter(&db->db_mtx);272273SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,274db->db.db_object, db->db_level, db->db_blkid);275276/*277* If there is another read for this dbuf, we will wait for278* that to complete first before checking the db_state below.279*/280while (db->db_state == DB_READ)281cv_wait(&db->db_changed, &db->db_mtx);282283err = dmu_buf_get_bp_from_dbuf(db, &bp);284if (err) {285mutex_exit(&db->db_mtx);286goto error;287}288289/*290* There is no need to read if this is a hole or the data is291* cached. This will not be considered a direct read for IO292* accounting in the same way that an ARC hit is not counted.293*/294if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {295size_t aoff = offset < db->db.db_offset ?296db->db.db_offset - offset : 0;297size_t boff = offset > db->db.db_offset ?298offset - db->db.db_offset : 0;299size_t len = MIN(size - aoff, db->db.db_size - boff);300301if (db->db_state == DB_CACHED) {302/*303* We need to untransformed the ARC buf data304* before we copy it over.305*/306err = dmu_buf_untransform_direct(db, spa);307ASSERT0(err);308abd_copy_from_buf_off(data,309(char *)db->db.db_data + boff, aoff, len);310} else {311abd_zero_off(data, aoff, len);312}313314mutex_exit(&db->db_mtx);315continue;316}317318mbuf = make_abd_for_dbuf(db, data, offset, size);319ASSERT3P(mbuf, !=, NULL);320321/*322* The dbuf mutex (db_mtx) must be held when creating the ZIO323* for the read. The BP returned from324* dmu_buf_get_bp_from_dbuf() could be from a pending block325* clone or a yet to be synced Direct I/O write that is in the326* dbuf's dirty record. When zio_read() is called, zio_create()327* will make a copy of the BP. However, if zio_read() is called328* without the mutex being held then the dirty record from the329* dbuf could be freed in dbuf_write_done() resulting in garbage330* being set for the zio BP.331*/332zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,333dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,334ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);335mutex_exit(&db->db_mtx);336337zfs_racct_read(spa, db->db.db_size, 1, flags);338zio_nowait(cio);339}340341dmu_buf_rele_array(dbp, numbufs, FTAG);342343return (zio_wait(rio));344345error:346dmu_buf_rele_array(dbp, numbufs, FTAG);347(void) zio_wait(rio);348return (err);349}350351#ifdef _KERNEL352int353dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,354dmu_flags_t flags)355{356offset_t offset = zfs_uio_offset(uio);357offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;358int err;359360ASSERT(uio->uio_extflg & UIO_DIRECT);361ASSERT3U(page_index, <, uio->uio_dio.npages);362363abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],364offset & (PAGESIZE - 1), size);365err = dmu_read_abd(dn, offset, size, data, flags);366abd_free(data);367368if (err == 0)369zfs_uioskip(uio, size);370371return (err);372}373374int375dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,376dmu_flags_t flags, dmu_tx_t *tx)377{378offset_t offset = zfs_uio_offset(uio);379offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;380int err;381382ASSERT(uio->uio_extflg & UIO_DIRECT);383ASSERT3U(page_index, <, uio->uio_dio.npages);384385abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],386offset & (PAGESIZE - 1), size);387err = dmu_write_abd(dn, offset, size, data, flags, tx);388abd_free(data);389390if (err == 0)391zfs_uioskip(uio, size);392393return (err);394}395#endif /* _KERNEL */396397EXPORT_SYMBOL(dmu_read_abd);398EXPORT_SYMBOL(dmu_write_abd);399400401