Path: blob/main/sys/contrib/openzfs/module/zfs/abd.c
48383 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright (c) 2014 by Chunwei Chen. All rights reserved.23* Copyright (c) 2019 by Delphix. All rights reserved.24*/2526/*27* ARC buffer data (ABD).28*29* ABDs are an abstract data structure for the ARC which can use two30* different ways of storing the underlying data:31*32* (a) Linear buffer. In this case, all the data in the ABD is stored in one33* contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).34*35* +-------------------+36* | ABD (linear) |37* | abd_flags = ... |38* | abd_size = ... | +--------------------------------+39* | abd_buf ------------->| raw buffer of size abd_size |40* +-------------------+ +--------------------------------+41* no abd_chunks42*43* (b) Scattered buffer. In this case, the data in the ABD is split into44* equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers45* to the chunks recorded in an array at the end of the ABD structure.46*47* +-------------------+48* | ABD (scattered) |49* | abd_flags = ... |50* | abd_size = ... 
|51* | abd_offset = 0 | +-----------+52* | abd_chunks[0] ----------------------------->| chunk 0 |53* | abd_chunks[1] ---------------------+ +-----------+54* | ... | | +-----------+55* | abd_chunks[N-1] ---------+ +------->| chunk 1 |56* +-------------------+ | +-----------+57* | ...58* | +-----------+59* +----------------->| chunk N-1 |60* +-----------+61*62* In addition to directly allocating a linear or scattered ABD, it is also63* possible to create an ABD by requesting the "sub-ABD" starting at an offset64* within an existing ABD. In linear buffers this is simple (set abd_buf of65* the new ABD to the starting point within the original raw buffer), but66* scattered ABDs are a little more complex. The new ABD makes a copy of the67* relevant abd_chunks pointers (but not the underlying data). However, to68* provide arbitrary rather than only chunk-aligned starting offsets, it also69* tracks an abd_offset field which represents the starting point of the data70* within the first chunk in abd_chunks. For both linear and scattered ABDs,71* creating an offset ABD marks the original ABD as the offset's parent, and the72* original ABD's abd_children refcount is incremented. This data allows us to73* ensure the root ABD isn't deleted before its children.74*75* Most consumers should never need to know what type of ABD they're using --76* the ABD public API ensures that it's possible to transparently switch from77* using a linear ABD to a scattered one when doing so would be beneficial.78*79* If you need to use the data within an ABD directly, if you know it's linear80* (because you allocated it) you can use abd_to_buf() to access the underlying81* raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions82* which will allocate a raw buffer if necessary. 
Use the abd_return_buf*
 * functions to return any raw buffers that are no longer necessary when you're
 * done using them.
 *
 * There are a variety of ABD APIs that implement basic buffer operations:
 * compare, copy, read, write, and fill with zeroes. If you need a custom
 * function which progressively accesses the whole ABD, use the abd_iterate_*
 * functions.
 *
 * As an additional feature, linear and scatter ABD's can be stitched together
 * by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs
 * to be viewed as a singular ABD.
 *
 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
 * B_FALSE.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

/* see block comment above for description */
int zfs_abd_scatter_enabled = B_TRUE;

/*
 * Debug-only sanity checks on an ABD's size and flags, recursing into gang
 * children. Compiles to nothing unless ZFS_DEBUG is defined.
 */
void
abd_verify(abd_t *abd)
{
#ifdef ZFS_DEBUG
	if (abd_is_from_pages(abd)) {
		ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS);
	} else {
		ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
	}
	/* Only known flag bits may be set. */
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
	    ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {
		ASSERT3U(abd->abd_size, >, 0);
		ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
	} else if (abd_is_gang(abd)) {
		/* A gang ABD's size must equal the sum of its children's. */
		uint_t child_sizes = 0;
		for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
		    cabd != NULL;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT(list_link_active(&cabd->abd_gang_link));
			child_sizes += cabd->abd_size;
			abd_verify(cabd);
		}
		ASSERT3U(abd->abd_size, ==, child_sizes);
	} else {
		ASSERT3U(abd->abd_size, >, 0);
		abd_verify_scatter(abd);
	}
#endif
}

/*
 * Initialize the common fields of a (possibly caller-provided) abd_t.
 */
void
abd_init_struct(abd_t *abd)
{
	list_link_init(&abd->abd_gang_link);
	mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
	abd->abd_flags = 0;
#ifdef ZFS_DEBUG
	zfs_refcount_create(&abd->abd_children);
	abd->abd_parent = NULL;
#endif
	abd->abd_size = 0;
}

/*
 * Tear down the fields set up by abd_init_struct(). The ABD must no longer
 * be linked into any gang chain.
 */
static void
abd_fini_struct(abd_t *abd)
{
	mutex_destroy(&abd->abd_mtx);
	ASSERT(!list_link_active(&abd->abd_gang_link));
#ifdef ZFS_DEBUG
	zfs_refcount_destroy(&abd->abd_children);
#endif
}

/*
 * Allocate and initialize an abd_t struct (not its data). ABD_FLAG_ALLOCD
 * records that abd_free() must also free the struct itself.
 */
abd_t *
abd_alloc_struct(size_t size)
{
	abd_t *abd = abd_alloc_struct_impl(size);
	abd_init_struct(abd);
	abd->abd_flags |= ABD_FLAG_ALLOCD;
	return (abd);
}

/* Counterpart of abd_alloc_struct(): finalize and free the struct only. */
void
abd_free_struct(abd_t *abd)
{
	abd_fini_struct(abd);
	abd_free_struct_impl(abd);
}

/*
 * Allocate an ABD, along with its own underlying data buffers. Use this if you
 * don't care whether the ABD is linear or not.
 */
abd_t *
abd_alloc(size_t size, boolean_t is_metadata)
{
	/* Small sizes (or scatter disabled) go to the linear path. */
	if (abd_size_alloc_linear(size))
		return (abd_alloc_linear(size, is_metadata));

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd_t *abd = abd_alloc_struct(size);
	abd->abd_flags |= ABD_FLAG_OWNER;
	abd->abd_u.abd_scatter.abd_offset = 0;
	abd_alloc_chunks(abd, size);

	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;

	abd_update_scatter_stats(abd, ABDSTAT_INCR);

	return (abd);
}

/*
 * Allocate an ABD that must be linear, along with its own underlying data
 * buffer. 
Only use this when it would be very annoying to write your ABD
 * consumer with a scattered ABD.
 */
abd_t *
abd_alloc_linear(size_t size, boolean_t is_metadata)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;

	/* Metadata and data buffers come from separate kmem cache families. */
	if (is_metadata) {
		ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
	} else {
		ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size);
	}

	abd_update_linear_stats(abd, ABDSTAT_INCR);

	return (abd);
}

/*
 * Free the linear buffer owned by this ABD and update stats. Page-backed
 * linear ABDs are handed to the platform-specific free path instead.
 */
static void
abd_free_linear(abd_t *abd)
{
	if (abd_is_linear_page(abd)) {
		abd_free_linear_page(abd);
		return;
	}

	if (abd->abd_flags & ABD_FLAG_META) {
		zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
	} else {
		zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
	}

	abd_update_linear_stats(abd, ABDSTAT_DECR);
}

/*
 * Unlink every child from a gang ABD, freeing children the gang owns
 * (ABD_FLAG_GANG_FREE), then destroy the chain list itself.
 */
static void
abd_free_gang(abd_t *abd)
{
	ASSERT(abd_is_gang(abd));
	abd_t *cabd;

	while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) {
		/*
		 * We must acquire the child ABDs mutex to ensure that if it
		 * is being added to another gang ABD we will set the link
		 * as inactive when removing it from this gang ABD and before
		 * adding it to the other gang ABD.
		 */
		mutex_enter(&cabd->abd_mtx);
		ASSERT(list_link_active(&cabd->abd_gang_link));
		list_remove(&ABD_GANG(abd).abd_gang_chain, cabd);
		mutex_exit(&cabd->abd_mtx);
		if (cabd->abd_flags & ABD_FLAG_GANG_FREE)
			abd_free(cabd);
	}
	list_destroy(&ABD_GANG(abd).abd_gang_chain);
}

/* Free the chunks of a scatter ABD and update the scatter stats. */
static void
abd_free_scatter(abd_t *abd)
{
	abd_free_chunks(abd);
	abd_update_scatter_stats(abd, ABDSTAT_DECR);
}

/*
 * Free an ABD. Use with any kind of abd: those created with abd_alloc_*()
 * and abd_get_*(), including abd_get_offset_struct().
 *
 * If the ABD was created with abd_alloc_*(), the underlying data
 * (scatterlist or linear buffer) will also be freed. (Subject to ownership
 * changes via abd_*_ownership_of_buf().)
 *
 * Unless the ABD was created with abd_get_offset_struct(), the abd_t will
 * also be freed.
 */
void
abd_free(abd_t *abd)
{
	if (abd == NULL)
		return;

	abd_verify(abd);
#ifdef ZFS_DEBUG
	IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL);
#endif

	/* Only owners free the underlying data; gangs free their chain. */
	if (abd_is_gang(abd)) {
		abd_free_gang(abd);
	} else if (abd_is_linear(abd)) {
		if (abd->abd_flags & ABD_FLAG_OWNER)
			abd_free_linear(abd);
	} else {
		if (abd->abd_flags & ABD_FLAG_OWNER)
			abd_free_scatter(abd);
	}

#ifdef ZFS_DEBUG
	/* Drop this child's contribution to its parent's refcount. */
	if (abd->abd_parent != NULL) {
		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
		    abd->abd_size, abd);
	}
#endif

	abd_fini_struct(abd);
	if (abd->abd_flags & ABD_FLAG_ALLOCD)
		abd_free_struct_impl(abd);
}

/*
 * Allocate an ABD of the same format (same metadata flag, same scatterize
 * setting) as another ABD.
 */
abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
	if (abd_is_linear(sabd) &&
	    !abd_is_linear_page(sabd)) {
		return (abd_alloc_linear(size, is_metadata));
	} else {
		return (abd_alloc(size, is_metadata));
	}
}

/*
 * Create gang ABD that will be the head of a list of ABD's. This is used
 * to "chain" scatter/gather lists together when constructing aggregated
 * IO's. 
To free this abd, abd_free() must be called.
 */
abd_t *
abd_alloc_gang(void)
{
	abd_t *abd = abd_alloc_struct(0);
	abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER;
	list_create(&ABD_GANG(abd).abd_gang_chain,
	    sizeof (abd_t), offsetof(abd_t, abd_gang_link));
	return (abd);
}

/*
 * Add a child gang ABD to a parent gang ABDs chained list.
 */
static void
abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
{
	ASSERT(abd_is_gang(pabd));
	ASSERT(abd_is_gang(cabd));

	if (free_on_free) {
		/*
		 * If the parent is responsible for freeing the child gang
		 * ABD we will just splice the child's children ABD list to
		 * the parent's list and immediately free the child gang ABD
		 * struct. The parent gang ABDs children from the child gang
		 * will retain all the free_on_free settings after being
		 * added to the parents list.
		 */
#ifdef ZFS_DEBUG
		/*
		 * If cabd had abd_parent, we have to drop it here. We can't
		 * transfer it to pabd, nor we can clear abd_size leaving it.
		 */
		if (cabd->abd_parent != NULL) {
			(void) zfs_refcount_remove_many(
			    &cabd->abd_parent->abd_children,
			    cabd->abd_size, cabd);
			cabd->abd_parent = NULL;
		}
#endif
		pabd->abd_size += cabd->abd_size;
		cabd->abd_size = 0;
		list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
		    &ABD_GANG(cabd).abd_gang_chain);
		ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
		abd_verify(pabd);
		abd_free(cabd);
	} else {
		for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain);
		    child != NULL;
		    child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) {
			/*
			 * We always pass B_FALSE for free_on_free as it is the
			 * original child gang ABDs responsibility to determine
			 * if any of its child ABDs should be free'd on the call
			 * to abd_free().
			 */
			abd_gang_add(pabd, child, B_FALSE);
		}
		abd_verify(pabd);
	}
}

/*
 * Add a child ABD to a gang ABD's chained list.
 */
void
abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
{
	ASSERT(abd_is_gang(pabd));
	abd_t *child_abd = NULL;

	/*
	 * If the child being added is a gang ABD, we will add the
	 * child's ABDs to the parent gang ABD. This allows us to account
	 * for the offset correctly in the parent gang ABD.
	 */
	if (abd_is_gang(cabd)) {
		ASSERT(!list_link_active(&cabd->abd_gang_link));
		return (abd_gang_add_gang(pabd, cabd, free_on_free));
	}
	ASSERT(!abd_is_gang(cabd));

	/*
	 * In order to verify that an ABD is not already part of
	 * another gang ABD, we must lock the child ABD's abd_mtx
	 * to check its abd_gang_link status. We unlock the abd_mtx
	 * only after it has been added to a gang ABD, which
	 * will update the abd_gang_link's status. See comment below
	 * for how an ABD can be in multiple gang ABD's simultaneously.
	 */
	mutex_enter(&cabd->abd_mtx);
	if (list_link_active(&cabd->abd_gang_link)) {
		/*
		 * If the child ABD is already part of another
		 * gang ABD then we must allocate a new
		 * ABD to use a separate link. We mark the newly
		 * allocated ABD with ABD_FLAG_GANG_FREE, before
		 * adding it to the gang ABD's list, to make the
		 * gang ABD aware that it is responsible to call
		 * abd_free(). We use abd_get_offset() in order
		 * to just allocate a new ABD but avoid copying the
		 * data over into the newly allocated ABD.
		 *
		 * An ABD may become part of multiple gang ABD's. For
		 * example, when writing ditto blocks, the same ABD
		 * is used to write 2 or 3 locations with 2 or 3
		 * zio_t's. Each of the zio's may be aggregated with
		 * different adjacent zio's. zio aggregation uses gang
		 * zio's, so the single ABD can become part of multiple
		 * gang zio's.
		 *
		 * The ASSERT below is to make sure that if
		 * free_on_free is passed as B_TRUE, the ABD can
		 * not be in multiple gang ABD's. The gang ABD
		 * can not be responsible for cleaning up the child
		 * ABD memory allocation if the ABD can be in
		 * multiple gang ABD's at one time.
		 */
		ASSERT3B(free_on_free, ==, B_FALSE);
		child_abd = abd_get_offset(cabd, 0);
		child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
	} else {
		child_abd = cabd;
		if (free_on_free)
			child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
	}
	ASSERT3P(child_abd, !=, NULL);

	list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd);
	mutex_exit(&cabd->abd_mtx);
	pabd->abd_size += child_abd->abd_size;
}

/*
 * Locate the ABD for the supplied offset in the gang ABD.
 * Return a new offset relative to the returned ABD.
 */
abd_t *
abd_gang_get_offset(abd_t *abd, size_t *off)
{
	abd_t *cabd;

	ASSERT(abd_is_gang(abd));
	ASSERT3U(*off, <, abd->abd_size);
	for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		if (*off >= cabd->abd_size)
			*off -= cabd->abd_size;
		else
			return (cabd);
	}
	VERIFY3P(cabd, !=, NULL);
	return (cabd);
}

/*
 * Allocate a new ABD, using the provided struct (if non-NULL, and if
 * circumstances allow - otherwise allocate the struct). The returned ABD will
 * point to offset off of sabd. It shares the underlying buffer data with sabd.
 * Use abd_free() to free. sabd must not be freed while any derived ABDs exist.
 */
static abd_t *
abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
{
	abd_verify(sabd);
	ASSERT3U(off + size, <=, sabd->abd_size);

	if (abd_is_linear(sabd)) {
		if (abd == NULL)
			abd = abd_alloc_struct(0);
		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;

		/*
		 * User pages from Direct I/O requests may be in a single page
		 * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag
		 * that here for abd. This is required because we have to be
		 * careful when borrowing the buffer from the ABD because we
		 * can not place user pages under write protection on Linux.
		 * See the comments in abd_os.c for abd_borrow_buf(),
		 * abd_borrow_buf_copy(), abd_return_buf() and
		 * abd_return_buf_copy().
		 */
		if (abd_is_from_pages(sabd)) {
			abd->abd_flags |= ABD_FLAG_FROM_PAGES |
			    ABD_FLAG_LINEAR_PAGE;
		}

		ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
	} else if (abd_is_gang(sabd)) {
		size_t left = size;
		if (abd == NULL) {
			abd = abd_alloc_gang();
		} else {
			abd->abd_flags |= ABD_FLAG_GANG;
			list_create(&ABD_GANG(abd).abd_gang_chain,
			    sizeof (abd_t), offsetof(abd_t, abd_gang_link));
		}

		abd->abd_flags &= ~ABD_FLAG_OWNER;
		/* Take offset children of each gang child covering the span. */
		for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
		    cabd != NULL && left > 0;
		    cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
			int csize = MIN(left, cabd->abd_size - off);

			abd_t *nabd = abd_get_offset_size(cabd, off, csize);
			abd_gang_add(abd, nabd, B_TRUE);
			left -= csize;
			off = 0;
		}
		ASSERT0(left);
	} else {
		abd = abd_get_offset_scatter(abd, sabd, off, size);
	}

	ASSERT3P(abd, !=, NULL);
	abd->abd_size = size;
#ifdef ZFS_DEBUG
	abd->abd_parent = sabd;
	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
#endif
	return (abd);
}

/*
 * Like abd_get_offset_size(), but memory for the abd_t is provided by the
 * caller. 
Using this routine can improve performance by avoiding the cost
 * of allocating memory for the abd_t struct, and updating the abd stats.
 * Usually, the provided abd is returned, but in some circumstances (FreeBSD,
 * if sabd is scatter and size is more than 2 pages) a new abd_t may need to
 * be allocated. Therefore callers should be careful to use the returned
 * abd_t*.
 */
abd_t *
abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size)
{
	abd_t *result;
	abd_init_struct(abd);
	result = abd_get_offset_impl(abd, sabd, off, size);
	/* If the impl allocated a different struct, release the caller's. */
	if (result != abd)
		abd_fini_struct(abd);
	return (result);
}

/* Get an offset ABD covering sabd from off to the end of sabd. */
abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
	size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
	VERIFY3U(size, >, 0);
	return (abd_get_offset_impl(NULL, sabd, off, size));
}

/* Get an offset ABD covering size bytes of sabd starting at off. */
abd_t *
abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
{
	ASSERT3U(off + size, <=, sabd->abd_size);
	return (abd_get_offset_impl(NULL, sabd, off, size));
}

/*
 * Return a size scatter ABD containing only zeros.
 */
abd_t *
abd_get_zeros(size_t size)
{
	ASSERT3P(abd_zero_scatter, !=, NULL);
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	return (abd_get_offset_size(abd_zero_scatter, 0, size));
}

/*
 * Create a linear ABD for an existing buf.
 */
static abd_t *
abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
{
	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*
	 * Even if this buf is filesystem metadata, we only track that if we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd->abd_flags |= ABD_FLAG_LINEAR;
	abd->abd_size = size;

	ABD_LINEAR_BUF(abd) = buf;

	return (abd);
}

abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);
	return (abd_get_from_buf_impl(abd, buf, size));
}

/* Like abd_get_from_buf(), but the abd_t memory is provided by the caller. */
abd_t *
abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
{
	abd_init_struct(abd);
	return (abd_get_from_buf_impl(abd, buf, size));
}

/*
 * Get the raw buffer associated with a linear ABD.
 */
void *
abd_to_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	abd_verify(abd);
	return (ABD_LINEAR_BUF(abd));
}

void
abd_release_ownership_of_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);

	/*
	 * abd_free() needs to handle LINEAR_PAGE ABD's specially.
	 * Since that flag does not survive the
	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
	 * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
	 */
	ASSERT(!abd_is_linear_page(abd));

	abd_verify(abd);

	abd->abd_flags &= ~ABD_FLAG_OWNER;
	/* Disable this flag since we no longer own the data buffer */
	abd->abd_flags &= ~ABD_FLAG_META;

	abd_update_linear_stats(abd, ABDSTAT_DECR);
}


/*
 * Give this ABD ownership of the buffer that it's storing. 
Can only be used on
 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
 * with abd_alloc_linear() which subsequently released ownership of their buf
 * with abd_release_ownership_of_buf().
 */
void
abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
	abd_verify(abd);

	abd->abd_flags |= ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}

	abd_update_linear_stats(abd, ABDSTAT_INCR);
}

/*
 * Initializes an abd_iter based on whether the abd is a gang ABD
 * or just a single ABD.
 */
static inline abd_t *
abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off)
{
	abd_t *cabd = NULL;

	if (abd_is_gang(abd)) {
		/* For a gang, start iterating in the child containing off. */
		cabd = abd_gang_get_offset(abd, &off);
		if (cabd) {
			abd_iter_init(aiter, cabd);
			abd_iter_advance(aiter, off);
		}
	} else {
		abd_iter_init(aiter, abd);
		abd_iter_advance(aiter, off);
	}
	return (cabd);
}

/*
 * Advances an abd_iter. 
We have to be careful with gang ABD as
 * advancing could mean that we are at the end of a particular ABD and
 * must grab the ABD in the gang ABD's list.
 */
static inline abd_t *
abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter,
    size_t len)
{
	abd_iter_advance(aiter, len);
	if (abd_is_gang(abd) && abd_iter_at_end(aiter)) {
		ASSERT3P(cabd, !=, NULL);
		cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd);
		if (cabd) {
			abd_iter_init(aiter, cabd);
			abd_iter_advance(aiter, 0);
		}
	}
	return (cabd);
}

/*
 * Call func(mapped_buf, len, private) over each mapped segment of abd in
 * [off, off + size). Stops early and returns func's value if it is nonzero.
 */
int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
    abd_iter_func_t *func, void *private)
{
	struct abd_iter aiter;
	int ret = 0;

	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);

	while (size > 0) {
		IMPLY(abd_is_gang(abd), c_abd != NULL);

		abd_iter_map(&aiter);

		size_t len = MIN(aiter.iter_mapsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_mapaddr, len, private);

		abd_iter_unmap(&aiter);

		if (ret != 0)
			break;

		size -= len;
		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
	}

	return (ret);
}

#if defined(__linux__) && defined(_KERNEL)
/*
 * Like abd_iterate_func(), but func receives the backing page plus the
 * offset/length of the data within it, rather than a mapped address.
 */
int
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
    abd_iter_page_func_t *func, void *private)
{
	struct abd_iter aiter;
	int ret = 0;

	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);

	while (size > 0) {
		IMPLY(abd_is_gang(abd), c_abd != NULL);

		abd_iter_page(&aiter);

		size_t len = MIN(aiter.iter_page_dsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_page, aiter.iter_page_doff,
		    len, private);

		/* Clear the page cursor before the next abd_iter_page(). */
		aiter.iter_page = NULL;
		aiter.iter_page_doff = 0;
		aiter.iter_page_dsize = 0;

		if (ret != 0)
			break;

		size -= len;
		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
	}

	return (ret);
}
#endif

/* Cursor argument shared by the buf copy/compare iterator callbacks. */
struct buf_arg {
	void *arg_buf;
};

static int
abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(ba_ptr->arg_buf, buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy abd to buf. (off is the offset in abd.)
 */
void
abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
	    &ba_ptr);
}

static int
abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
{
	int ret;
	struct buf_arg *ba_ptr = private;

	ret = memcmp(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (ret);
}

/*
 * Compare the contents of abd to buf. (off is the offset in abd.)
 */
int
abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
}

static int
abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy from buf to abd. 
(off is the offset in abd.)
 */
void
abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
	    &ba_ptr);
}

static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) private;
	(void) memset(buf, 0, size);
	return (0);
}

/*
 * Zero out the abd from a particular offset to the end.
 */
void
abd_zero_off(abd_t *abd, size_t off, size_t size)
{
	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
}

/*
 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
 * equal-sized chunks (passed to func as raw buffers). func could be called many
 * times during this iteration.
 */
int
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
    size_t size, abd_iter_func2_t *func, void *private)
{
	int ret = 0;
	struct abd_iter daiter, saiter;
	abd_t *c_dabd, *c_sabd;

	if (size == 0)
		return (0);

	abd_verify(dabd);
	abd_verify(sabd);

	ASSERT3U(doff + size, <=, dabd->abd_size);
	ASSERT3U(soff + size, <=, sabd->abd_size);

	c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
	c_sabd = abd_init_abd_iter(sabd, &saiter, soff);

	while (size > 0) {
		IMPLY(abd_is_gang(dabd), c_dabd != NULL);
		IMPLY(abd_is_gang(sabd), c_sabd != NULL);

		abd_iter_map(&daiter);
		abd_iter_map(&saiter);

		/* Process the overlap of the two current mappings. */
		size_t dlen = MIN(daiter.iter_mapsize, size);
		size_t slen = MIN(saiter.iter_mapsize, size);
		size_t len = MIN(dlen, slen);
		ASSERT(dlen > 0 || slen > 0);

		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
		    private);

		abd_iter_unmap(&saiter);
		abd_iter_unmap(&daiter);

		if (ret != 0)
			break;

		size -= len;
		c_dabd =
		    abd_advance_abd_iter(dabd, c_dabd, &daiter, len);
		c_sabd =
		    abd_advance_abd_iter(sabd, c_sabd, &saiter, len);
	}

	return (ret);
}

static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	(void) memcpy(dbuf, sbuf, size);
	return (0);
}

/*
 * Copy from sabd to dabd starting from soff and doff.
 */
void
abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
{
	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
	    abd_copy_off_cb, NULL);
}

static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	(void) private;
	return (memcmp(bufa, bufb, size));
}

/*
 * Compares the contents of two ABDs.
 */
int
abd_cmp(abd_t *dabd, abd_t *sabd)
{
	ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
	return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
	    abd_cmp_cb, NULL));
}

/*
 * Check if ABD content is all-zeroes.
 */
static int
abd_cmp_zero_off_cb(void *data, size_t len, void *private)
{
	(void) private;

	/* This function can only check whole uint64s. Enforce that. */
	ASSERT0(P2PHASE(len, 8));

	uint64_t *end = (uint64_t *)((char *)data + len);
	for (uint64_t *word = (uint64_t *)data; word < end; word++)
		if (*word != 0)
			return (1);

	return (0);
}

/* Returns nonzero if any byte of abd[off, off + size) is nonzero. */
int
abd_cmp_zero_off(abd_t *abd, size_t off, size_t size)
{
	return (abd_iterate_func(abd, off, size, abd_cmp_zero_off_cb, NULL));
}

/*
 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
 *
 * @cabds parity ABDs, must have equal size
 * @dabd data ABD. 
Can be NULL (in this case @dsize = 0)
 * @func_raidz_gen should be implemented so that its behaviour
 * is the same when taking linear and when taking scatter
 */
void
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,
    size_t csize, size_t dsize, const unsigned parity,
    void (*func_raidz_gen)(void **, const void *, size_t, size_t))
{
	int i;
	size_t len, dlen;
	struct abd_iter caiters[3];
	struct abd_iter daiter;
	void *caddrs[3], *daddr;
	unsigned long flags __maybe_unused = 0;
	abd_t *c_cabds[3];
	abd_t *c_dabd = NULL;

	ASSERT3U(parity, <=, 3);
	for (i = 0; i < parity; i++) {
		abd_verify(cabds[i]);
		ASSERT3U(off + csize, <=, cabds[i]->abd_size);
		c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off);
	}

	if (dsize > 0) {
		ASSERT(dabd);
		abd_verify(dabd);
		ASSERT3U(off + dsize, <=, dabd->abd_size);
		c_dabd = abd_init_abd_iter(dabd, &daiter, off);
	}

	abd_enter_critical(flags);
	while (csize > 0) {
		/* len is the largest span mapped in every column at once. */
		len = csize;
		for (i = 0; i < parity; i++) {
			IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
			abd_iter_map(&caiters[i]);
			caddrs[i] = caiters[i].iter_mapaddr;
			len = MIN(caiters[i].iter_mapsize, len);
		}

		if (dsize > 0) {
			IMPLY(abd_is_gang(dabd), c_dabd != NULL);
			abd_iter_map(&daiter);
			daddr = daiter.iter_mapaddr;
			len = MIN(daiter.iter_mapsize, len);
			dlen = len;
		} else {
			daddr = NULL;
			dlen = 0;
		}

		/* must be progressive */
		ASSERT3U(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not multiple of 512 (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_gen(caddrs, daddr, len, dlen);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&caiters[i]);
			c_cabds[i] =
			    abd_advance_abd_iter(cabds[i], c_cabds[i],
			    &caiters[i], len);
		}

		if (dsize > 0) {
			abd_iter_unmap(&daiter);
			c_dabd =
			    abd_advance_abd_iter(dabd, c_dabd, &daiter,
			    dlen);
			dsize -= dlen;
		}

		csize -= len;
	}
	abd_exit_critical(flags);
}

/*
 * Iterate over code ABDs and data reconstruction target ABDs and call
 * @func_raidz_rec. Function maps at most 6 pages atomically.
 *
 * @cabds parity ABDs, must have equal size
 * @tabds rec target ABDs, at most 3
 * @tsize size of data target columns
 * @func_raidz_rec expects syndrome data in target columns. Function
 * reconstructs data and overwrites target columns.
 */
void
abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
    size_t tsize, const unsigned parity,
    void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
    const unsigned *mul),
    const unsigned *mul)
{
	int i;
	size_t len;
	struct abd_iter citers[3];
	struct abd_iter xiters[3];
	void *caddrs[3], *xaddrs[3];
	unsigned long flags __maybe_unused = 0;
	abd_t *c_cabds[3];
	abd_t *c_tabds[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++) {
		abd_verify(cabds[i]);
		abd_verify(tabds[i]);
		ASSERT3U(tsize, <=, cabds[i]->abd_size);
		ASSERT3U(tsize, <=, tabds[i]->abd_size);
		c_cabds[i] =
		    abd_init_abd_iter(cabds[i], &citers[i], 0);
		c_tabds[i] =
		    abd_init_abd_iter(tabds[i], &xiters[i], 0);
	}

	abd_enter_critical(flags);
	while (tsize > 0) {
		/* len is the largest span mapped in every column at once. */
		len = tsize;
		for (i = 0; i < parity; i++) {
			IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
			IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL);
			abd_iter_map(&citers[i]);
			abd_iter_map(&xiters[i]);
			caddrs[i] = citers[i].iter_mapaddr;
			xaddrs[i] = xiters[i].iter_mapaddr;
			len = MIN(citers[i].iter_mapsize, len);
			len = MIN(xiters[i].iter_mapsize, len);
		}

		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not multiple of 512 (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_rec(xaddrs, len, caddrs, mul);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&xiters[i]);
			abd_iter_unmap(&citers[i]);
			c_tabds[i] =
			    abd_advance_abd_iter(tabds[i], c_tabds[i],
			    &xiters[i], len);
			c_cabds[i] =
			    abd_advance_abd_iter(cabds[i], c_cabds[i],
			    &citers[i], len);
		}

		tsize -= len;
		ASSERT3S(tsize, >=, 0);
	}
	abd_exit_critical(flags);
}

EXPORT_SYMBOL(abd_free);