// SPDX-License-Identifier: GPL-2.0

#include "backref.h"
#include "btrfs_inode.h"
#include "fiemap.h"
#include "file.h"
#include "file-item.h"

struct btrfs_fiemap_entry {
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
};

/*
 * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
 * file range from the inode's io tree, unlock the subvolume tree search path,
 * flush the fiemap cache and then relock the file range and re-search the
 * subvolume tree. The value here is something negative that can't be confused
 * with a valid errno value, and different from 1 because that's also a return
 * value from fiemap_fill_next_extent() and is often used to mean a btree
 * search did not find a key. So make it some distinct negative value.
 */
#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))

/*
 * Used to:
 *
 * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 *   merge extents that are contiguous and can be grouped as a single one;
 *
 * - Store extents ready to be written to the fiemap buffer in an intermediary
 *   buffer. This intermediary buffer is to ensure that in case the fiemap
 *   buffer is memory mapped to the fiemap target file, we don't deadlock
 *   during btrfs_page_mkwrite(). This is because during fiemap we are locking
 *   an extent range in order to prevent races with delalloc flushing and
 *   ordered extent completion, which is needed in order to reliably detect
 *   delalloc in holes and prealloc extents. And this can lead to a deadlock
 *   if the fiemap buffer is memory mapped to the file we are running fiemap
 *   against (a silly, useless in practice scenario, but possible) because
 *   btrfs_page_mkwrite() will try to lock the same extent range.
 */
struct fiemap_cache {
	/* An array of ready fiemap entries. */
	struct btrfs_fiemap_entry *entries;
	/* Number of entries in the entries array. */
	int entries_size;
	/* Index of the next entry in the entries array to write to. */
	int entries_pos;
	/*
	 * Once the entries array is full, this indicates the offset of the
	 * next file extent item we must search for in the inode's subvolume
	 * tree after unlocking the extent range in the inode's io tree and
	 * releasing the search path.
	 */
	u64 next_search_offset;
	/*
	 * This matches struct fiemap_extent_info::fi_mapped_extents. We use
	 * it to count emitted extents ourselves and stop, instead of relying
	 * on fiemap_fill_next_extent(), because we buffer ready fiemap
	 * entries in the @entries array and we want to stop as soon as we hit
	 * the max amount of extents to map, not just to save time but also to
	 * make the logic at extent_fiemap() simpler.
	 */
	unsigned int extents_mapped;
	/* Fields for the cached (unsubmitted, not yet ready) extent. */
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
	bool cached;
};

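/*
 * Copy the entries buffered in the cache's @entries array to the fiemap
 * buffer and reset the array's position. Returns 0 on success or a negative
 * errno returned by fiemap_fill_next_extent().
 */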
static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache)
{
	for (int i = 0; i < cache->entries_pos; i++) {
		struct btrfs_fiemap_entry *entry = &cache->entries[i];
		int ret;

		ret = fiemap_fill_next_extent(fieinfo, entry->offset,
					      entry->phys, entry->len,
					      entry->flags);
		/*
		 * Ignore 1 (reached max entries) because we keep track of that
		 * ourselves in emit_fiemap_extent().
		 */
		if (ret < 0)
			return ret;
	}
	cache->entries_pos = 0;

	return 0;
}

/*
 * Helper to submit a fiemap extent.
 *
 * Tries to merge the current fiemap extent, specified by @offset, @phys,
 * @len and @flags, with the cached one. Only when the merge fails is the
 * cached extent submitted as a fiemap extent.
 *
 * Return value is the same as fiemap_fill_next_extent().
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache,
			      u64 offset, u64 phys, u64 len, u32 flags)
{
	struct btrfs_fiemap_entry *entry;
	u64 cache_end;

	/* Set at the end of extent_fiemap(). */
	ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);

	if (!cache->cached)
		goto assign;

	/*
	 * When iterating the extents of the inode, at extent_fiemap(), we may
	 * find an extent that starts at an offset behind the end offset of the
	 * previous extent we processed. This happens if fiemap is called
	 * without FIEMAP_FLAG_SYNC and there are ordered extents completing
	 * after we had to unlock the file range, release the search path, emit
	 * the fiemap extents stored in the buffer (cache->entries array) and
	 * then lock the remainder of the range and re-search the btree.
	 *
	 * For example we are in leaf X processing its last item, which is the
	 * file extent item for file range [512K, 1M[, and after
	 * btrfs_next_leaf() releases the path, there's an ordered extent that
	 * completes for the file range [768K, 2M[, and that results in trimming
	 * the file extent item so that it now corresponds to the file range
	 * [512K, 768K[ and a new file extent item is inserted for the file
	 * range [768K, 2M[, which may end up as the last item of leaf X or as
	 * the first item of the next leaf - in either case btrfs_next_leaf()
	 * will leave us with a path pointing to the new extent item, for the
	 * file range [768K, 2M[, since that's the first key that follows the
	 * last one we processed. So in order not to report overlapping extents
	 * to user space, we trim the length of the previously cached extent
	 * and emit it.
	 *
	 * Upon calling btrfs_next_leaf() we may also find an extent with an
	 * offset smaller than or equal to cache->offset, and this happens
	 * when we had a hole or prealloc extent with several delalloc ranges
	 * in it, but after btrfs_next_leaf() released the path, delalloc was
	 * flushed and the resulting ordered extents were completed, so we can
	 * now have found a file extent item for an offset that is smaller
	 * than or equal to what we have in cache->offset. We deal with this
	 * as described below.
	 */
	cache_end = cache->offset + cache->len;
	if (cache_end > offset) {
		if (offset == cache->offset) {
			/*
			 * We cached a delalloc range (found in the io tree)
			 * for a hole or prealloc extent and we have now found
			 * a file extent item for the same offset. What we have
			 * now is more recent and up to date, so discard what
			 * we had in the cache and use what we have just found.
			 */
			goto assign;
		} else if (offset > cache->offset) {
			/*
			 * The extent range we previously found ends after the
			 * offset of the file extent item we found and that
			 * offset falls somewhere in the middle of that previous
			 * extent range. So adjust the range we previously found
			 * to end at the offset of the file extent item we have
			 * just found, since this extent is more up to date.
			 * Emit that adjusted range and cache the file extent
			 * item we have just found. This corresponds to the case
			 * where a previously found file extent item was split
			 * due to an ordered extent completing.
			 */
			cache->len = offset - cache->offset;
			goto emit;
		} else {
			const u64 range_end = offset + len;

			/*
			 * The offset of the file extent item we have just found
			 * is behind the cached offset. This means we were
			 * processing a hole or prealloc extent for which we
			 * have found delalloc ranges (in the io tree), so what
			 * we have in the cache is the last delalloc range we
			 * found while the file extent item we found can be
			 * either for a whole delalloc range we previously
			 * emitted or only a part of that range.
			 *
			 * We have two cases here:
			 *
			 * 1) The file extent item's range ends at or behind the
			 *    cached extent's end. In this case just ignore the
			 *    current file extent item because we don't want to
			 *    overlap with previous ranges that may have been
			 *    emitted already;
			 *
			 * 2) The file extent item starts behind the currently
			 *    cached extent but its end offset goes beyond the
			 *    end offset of the cached extent. We don't want to
			 *    overlap with a previous range that may have been
			 *    emitted already, so we emit the currently cached
			 *    extent and then partially store the current file
			 *    extent item's range in the cache, for the subrange
			 *    going from the cached extent's end to the end of
			 *    the file extent item.
			 */
			if (range_end <= cache_end)
				return 0;

			if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
				phys += cache_end - offset;

			offset = cache_end;
			len = range_end - cache_end;
			goto emit;
		}
	}

	/*
	 * Only merge fiemap extents if:
	 *
	 * 1) Their logical addresses are contiguous;
	 *
	 * 2) Their physical addresses are contiguous, so truly compressed
	 *    (physical size smaller than logical size) extents won't get
	 *    merged with each other;
	 *
	 * 3) They share the same flags.
	 */
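	/*
	 * For example, assuming a 4K sector size: a cached extent for file
	 * range [0, 8K[ at physical address 1M, with no special flags, merges
	 * with a new extent for file range [8K, 12K[ at physical address
	 * 1M + 8K and the same flags, growing the cached extent to cover
	 * [0, 12K[. If instead the new extent started at physical address 2M,
	 * the cached extent would be emitted and the new one cached.
	 */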
	if (cache->offset + cache->len == offset &&
	    cache->phys + cache->len == phys &&
	    cache->flags == flags) {
		cache->len += len;
		return 0;
	}

emit:
	/* Not mergeable, need to submit the cached one. */

	if (cache->entries_pos == cache->entries_size) {
		/*
		 * We will need to re-search starting from the end offset of
		 * the last stored extent, and not from the current offset,
		 * because after unlocking the range and releasing the path,
		 * if there's a hole between that end offset and this current
		 * offset, a new extent may have been inserted due to a new
		 * write, so we don't want to miss it.
		 */
		entry = &cache->entries[cache->entries_size - 1];
		cache->next_search_offset = entry->offset + entry->len;
		cache->cached = false;

		return BTRFS_FIEMAP_FLUSH_CACHE;
	}

	entry = &cache->entries[cache->entries_pos];
	entry->offset = cache->offset;
	entry->phys = cache->phys;
	entry->len = cache->len;
	entry->flags = cache->flags;
	cache->entries_pos++;
	cache->extents_mapped++;

	if (cache->extents_mapped == fieinfo->fi_extents_max) {
		cache->cached = false;
		return 1;
	}
assign:
	cache->cached = true;
	cache->offset = offset;
	cache->phys = phys;
	cache->len = len;
	cache->flags = flags;

	return 0;
}

/*
 * Emit the last fiemap cache.
 *
 * The last fiemap cache may still be cached in the following case:
 * 0		      4k		    8k
 * |<- Fiemap range ->|
 * |<------------  First extent ----------->|
 *
 * In this case, the first extent range will be cached but not emitted.
 * So we must emit it before ending extent_fiemap().
 */
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
				  struct fiemap_cache *cache)
{
	int ret;

	if (!cache->cached)
		return 0;

	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				      cache->len, cache->flags);
	cache->cached = false;
	if (ret > 0)
		ret = 0;
	return ret;
}

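/*
 * Move the path to the next file extent item of the inode, moving to the next
 * leaf (reusing the cloned extent buffer, see below) if we were at the last
 * slot. Returns 0 on success, 1 if there are no more file extent items for
 * the inode, or a negative errno on failure.
 */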
static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
	struct extent_buffer *clone = path->nodes[0];
	struct btrfs_key key;
	int slot;
	int ret;

	path->slots[0]++;
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
		return 0;

	/*
	 * Add a temporary extra ref to an already cloned extent buffer to
	 * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
	 * the cost of allocating a new one.
	 */
	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
	refcount_inc(&clone->refs);

	ret = btrfs_next_leaf(inode->root, path);
	if (ret != 0)
		goto out;

	/*
	 * Don't bother with cloning if there are no more file extent items for
	 * our inode.
	 */
	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = 1;
		goto out;
	}

	/*
	 * Important to preserve the start field, for the optimizations when
	 * checking if extents are shared (see extent_fiemap()).
	 *
	 * We must set ->start before calling copy_extent_buffer_full(). If we
	 * are on sub-pagesize blocksize, we use ->start to determine the
	 * offset into the folio where our eb exists, and if we update ->start
	 * after the fact then any subsequent reads of the eb may read from a
	 * different offset in the folio than where we originally copied into.
	 */
	clone->start = path->nodes[0]->start;
	/* See the comment at fiemap_search_slot() about why we clone. */
	copy_extent_buffer_full(clone, path->nodes[0]);

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;
out:
	if (ret)
		free_extent_buffer(clone);

	return ret;
}

/*
 * Search for the first file extent item that starts at a given file offset or
 * the one that starts immediately before that offset.
 * Returns: 0 on success, < 0 on error, 1 if not found.
 */
static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
			      u64 file_offset)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *clone;
	struct btrfs_key key;
	int slot;
	int ret;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = file_offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret != 0)
			return ret;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			return 1;
	}

	/*
	 * We clone the leaf and use it during fiemap. This is because while
	 * using the leaf we do expensive things like checking if an extent is
	 * shared, which can take a long time. In order to prevent blocking
	 * other tasks for too long, we use a clone of the leaf. We have locked
	 * the file range in the inode's io tree, so we know none of our file
	 * extent items can change. This way we avoid blocking other tasks that
	 * want to insert items for other inodes in the same leaf or b+tree
	 * rebalance operations (triggered for example when someone is trying
	 * to push items into this leaf when trying to insert an item in a
	 * neighbour leaf).
	 * We also need the private clone because holding a read lock on an
	 * extent buffer of the subvolume's b+tree will make lockdep unhappy
	 * when we check if extents are shared, as backref walking may need to
	 * lock the same leaf we are processing.
	 */
	clone = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!clone)
		return -ENOMEM;

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;

	return 0;
}

/*
 * Process a range which is a hole or a prealloc extent in the inode's subvolume
 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
 * extent. The end offset (@end) is inclusive.
 */
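/*
 * For example, a prealloc extent for the file range [0, 1M[ with a single
 * delalloc range at [256K, 512K[ is reported as three fiemap extents: an
 * unwritten extent for [0, 256K[, a delalloc extent for [256K, 512K[ and an
 * unwritten extent for [512K, 1M[.
 */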
static int fiemap_process_hole(struct btrfs_inode *inode,
			       struct fiemap_extent_info *fieinfo,
			       struct fiemap_cache *cache,
			       struct extent_state **delalloc_cached_state,
			       struct btrfs_backref_share_check_ctx *backref_ctx,
			       u64 disk_bytenr, u64 extent_offset,
			       u64 extent_gen,
			       u64 start, u64 end)
{
	const u64 i_size = i_size_read(&inode->vfs_inode);
	u64 cur_offset = start;
	u64 last_delalloc_end = 0;
	u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
	bool checked_extent_shared = false;
	int ret;

	/*
	 * There can be no delalloc past i_size, so don't waste time looking
	 * for it beyond i_size.
	 */
	while (cur_offset < end && cur_offset < i_size) {
		u64 delalloc_start;
		u64 delalloc_end;
		u64 prealloc_start;
		u64 prealloc_len = 0;
		bool delalloc;

		delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
							delalloc_cached_state,
							&delalloc_start,
							&delalloc_end);
		if (!delalloc)
			break;

		/*
		 * If this is a prealloc extent we have to report every section
		 * of it that has no delalloc.
		 */
		if (disk_bytenr != 0) {
			if (last_delalloc_end == 0) {
				prealloc_start = start;
				prealloc_len = delalloc_start - start;
			} else {
				prealloc_start = last_delalloc_end + 1;
				prealloc_len = delalloc_start - prealloc_start;
			}
		}

		if (prealloc_len > 0) {
			if (!checked_extent_shared && fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					return ret;
				else if (ret > 0)
					prealloc_flags |= FIEMAP_EXTENT_SHARED;

				checked_extent_shared = true;
			}
			ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
						 disk_bytenr + extent_offset,
						 prealloc_len, prealloc_flags);
			if (ret)
				return ret;
			extent_offset += prealloc_len;
		}

		ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
					 delalloc_end + 1 - delalloc_start,
					 FIEMAP_EXTENT_DELALLOC |
					 FIEMAP_EXTENT_UNKNOWN);
		if (ret)
			return ret;

		last_delalloc_end = delalloc_end;
		cur_offset = delalloc_end + 1;
		extent_offset += cur_offset - delalloc_start;
		cond_resched();
	}

	/*
	 * Either we found no delalloc for the whole prealloc extent or we have
	 * a prealloc extent that spans i_size or starts at or after i_size.
	 */
	if (disk_bytenr != 0 && last_delalloc_end < end) {
		u64 prealloc_start;
		u64 prealloc_len;

		if (last_delalloc_end == 0) {
			prealloc_start = start;
			prealloc_len = end + 1 - start;
		} else {
			prealloc_start = last_delalloc_end + 1;
			prealloc_len = end + 1 - prealloc_start;
		}

		if (!checked_extent_shared && fieinfo->fi_extents_max) {
			ret = btrfs_is_data_extent_shared(inode,
							  disk_bytenr,
							  extent_gen,
							  backref_ctx);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				prealloc_flags |= FIEMAP_EXTENT_SHARED;
		}
		ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
					 disk_bytenr + extent_offset,
					 prealloc_len, prealloc_flags);
		if (ret)
			return ret;
	}

	return 0;
}

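/*
 * Find the offset where the extent coverage of the inode ends: store in
 * @last_extent_end_ret the end offset of the last file extent item found in
 * the inode's subvolume tree that is not a hole, or 0 if there is no such
 * item. Returns 0 on success and a negative errno on failure.
 */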
static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
					  struct btrfs_path *path,
					  u64 *last_extent_end_ret)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *ei;
	struct btrfs_key key;
	u64 disk_bytenr;
	int ret;

	/*
	 * Lookup the last file extent. We're not using i_size here because
	 * there might be preallocation past i_size.
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
	/* There can't be a file extent item at offset (u64)-1 */
	ASSERT(ret != 0);
	if (ret < 0)
		return ret;

	/*
	 * For a non-existing key, btrfs_search_slot() always leaves us at a
	 * slot > 0, except if the btree is empty, which is impossible because
	 * at least it has the inode item for this inode and all the items for
	 * the root inode 256.
	 */
	ASSERT(path->slots[0] > 0);
	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
		/* No file extent items in the subvolume tree. */
		*last_extent_end_ret = 0;
		return 0;
	}

	/*
	 * For an inline extent, the disk_bytenr is where the inline data
	 * starts, so first check if we have an inline extent item before
	 * checking if we have an implicit hole (disk_bytenr == 0).
	 */
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
		*last_extent_end_ret = btrfs_file_extent_end(path);
		return 0;
	}

	/*
	 * Find the last file extent item that is not a hole (when NO_HOLES is
	 * not enabled). This should take at most 2 iterations in the worst
	 * case: we have one hole file extent item at slot 0 of a leaf and
	 * another hole file extent item as the last item in the previous leaf.
	 * This is because we merge file extent items that represent holes.
	 */
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	while (disk_bytenr == 0) {
		ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
		if (ret < 0) {
			return ret;
		} else if (ret > 0) {
			/* No file extent items that are not holes. */
			*last_extent_end_ret = 0;
			return 0;
		}
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	}

	*last_extent_end_ret = btrfs_file_extent_end(path);
	return 0;
}

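/*
 * Do the fiemap work for the file range [start, start + len[ (rounded to the
 * sector size): lock the range in the inode's io tree, iterate the inode's
 * file extent items (through a cloned leaf, see fiemap_search_slot()) and
 * feed them to the fiemap cache, processing holes and prealloc extents to
 * look for delalloc. If the cache's entries array fills up, unlock
 * everything, flush the cache to the fiemap buffer and restart the search
 * from where we left off.
 */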
static int extent_fiemap(struct btrfs_inode *inode,
			 struct fiemap_extent_info *fieinfo,
			 u64 start, u64 len)
{
	const u64 ino = btrfs_ino(inode);
	struct extent_state *cached_state = NULL;
	struct extent_state *delalloc_cached_state = NULL;
	BTRFS_PATH_AUTO_FREE(path);
	struct fiemap_cache cache = { 0 };
	struct btrfs_backref_share_check_ctx *backref_ctx;
	u64 last_extent_end = 0;
	u64 prev_extent_end;
	u64 range_start;
	u64 range_end;
	const u64 sectorsize = inode->root->fs_info->sectorsize;
	bool stopped = false;
	int ret;

	cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
	cache.entries = kmalloc_array(cache.entries_size,
				      sizeof(struct btrfs_fiemap_entry),
				      GFP_KERNEL);
	backref_ctx = btrfs_alloc_backref_share_check_ctx();
	path = btrfs_alloc_path();
	if (!cache.entries || !backref_ctx || !path) {
		ret = -ENOMEM;
		goto out;
	}

restart:
	range_start = round_down(start, sectorsize);
	range_end = round_up(start + len, sectorsize);
	prev_extent_end = range_start;

	btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
	if (ret < 0)
		goto out_unlock;
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	ret = fiemap_search_slot(inode, path, range_start);
	if (ret < 0) {
		goto out_unlock;
	} else if (ret > 0) {
		/*
		 * No file extent item found, but we may have delalloc between
		 * the current offset and i_size. So check for that.
		 */
		ret = 0;
		goto check_eof_delalloc;
	}

	while (prev_extent_end < range_end) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *ei;
		struct btrfs_key key;
		u64 extent_end;
		u64 extent_len;
		u64 extent_offset = 0;
		u64 extent_gen;
		u64 disk_bytenr = 0;
		u64 flags = 0;
		int extent_type;
		u8 compression;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * The first iteration can leave us at an extent item that ends
		 * before our range's start. Move to the next item.
		 */
		if (extent_end <= range_start)
			goto next_item;

		backref_ctx->curr_leaf_bytenr = leaf->start;

		/* We have an implicit hole (NO_HOLES feature enabled). */
		if (prev_extent_end < key.offset) {
			const u64 hole_end = min(key.offset, range_end) - 1;

			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  prev_extent_end, hole_end);
			if (ret < 0) {
				goto out_unlock;
			} else if (ret > 0) {
				/* fiemap_fill_next_extent() told us to stop. */
				stopped = true;
				break;
			}

			/* We've reached the end of the fiemap range, stop. */
			if (key.offset >= range_end) {
				stopped = true;
				break;
			}
		}

		extent_len = extent_end - key.offset;
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		compression = btrfs_file_extent_compression(leaf, ei);
		extent_type = btrfs_file_extent_type(leaf, ei);
		extent_gen = btrfs_file_extent_generation(leaf, ei);

		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
			if (compression == BTRFS_COMPRESS_NONE)
				extent_offset = btrfs_file_extent_offset(leaf, ei);
		}

		if (compression != BTRFS_COMPRESS_NONE)
			flags |= FIEMAP_EXTENT_ENCODED;

		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			flags |= FIEMAP_EXTENT_DATA_INLINE;
			flags |= FIEMAP_EXTENT_NOT_ALIGNED;
			ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
						 extent_len, flags);
		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx,
						  disk_bytenr, extent_offset,
						  extent_gen, key.offset,
						  extent_end - 1);
		} else if (disk_bytenr == 0) {
			/* We have an explicit hole. */
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  key.offset, extent_end - 1);
		} else {
			/* We have a regular extent. */
			if (fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					goto out_unlock;
				else if (ret > 0)
					flags |= FIEMAP_EXTENT_SHARED;
			}

			ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
						 disk_bytenr + extent_offset,
						 extent_len, flags);
		}

		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* emit_fiemap_extent() told us to stop. */
			stopped = true;
			break;
		}

		prev_extent_end = extent_end;
next_item:
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out_unlock;
		}

		ret = fiemap_next_leaf_item(inode, path);
		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* No more file extent items for this inode. */
			break;
		}
		cond_resched();
	}

check_eof_delalloc:
	if (!stopped && prev_extent_end < range_end) {
		ret = fiemap_process_hole(inode, fieinfo, &cache,
					  &delalloc_cached_state, backref_ctx,
					  0, 0, 0, prev_extent_end, range_end - 1);
		if (ret < 0)
			goto out_unlock;
		prev_extent_end = range_end;
	}

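	/*
	 * If the cached extent ends at or beyond the end of the last extent
	 * of the file, check if there's any delalloc between its end and
	 * i_size. If there is none, the cached extent is the last one and
	 * must get the FIEMAP_EXTENT_LAST flag, which emit_fiemap_extent()
	 * never sets on its own.
	 */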
	if (cache.cached && cache.offset + cache.len >= last_extent_end) {
		const u64 i_size = i_size_read(&inode->vfs_inode);

		if (prev_extent_end < i_size) {
			u64 delalloc_start;
			u64 delalloc_end;
			bool delalloc;

			delalloc = btrfs_find_delalloc_in_range(inode,
								prev_extent_end,
								i_size - 1,
								&delalloc_cached_state,
								&delalloc_start,
								&delalloc_end);
			if (!delalloc)
				cache.flags |= FIEMAP_EXTENT_LAST;
		} else {
			cache.flags |= FIEMAP_EXTENT_LAST;
		}
	}

out_unlock:
	btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
		btrfs_release_path(path);
		ret = flush_fiemap_cache(fieinfo, &cache);
		if (ret)
			goto out;
		len -= cache.next_search_offset - start;
		start = cache.next_search_offset;
		goto restart;
	} else if (ret < 0) {
		goto out;
	}

	/*
	 * Must free the path before emitting to the fiemap buffer because we
	 * may have a non-cloned leaf and if the fiemap buffer is memory mapped
	 * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
	 * waiting for an ordered extent that in order to complete needs to
	 * modify that leaf, therefore leading to a deadlock.
	 */
	btrfs_free_path(path);
	path = NULL;

	ret = flush_fiemap_cache(fieinfo, &cache);
	if (ret)
		goto out;

	ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
	btrfs_free_extent_state(delalloc_cached_state);
	kfree(cache.entries);
	btrfs_free_backref_share_ctx(backref_ctx);
	return ret;
}

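/*
 * Entry point for the FIEMAP ioctl on btrfs inodes (presumably wired up as
 * the inode's ->fiemap operation). Flushes and waits for ordered extents when
 * FIEMAP_FLAG_SYNC is given, takes the inode lock in shared mode and then
 * delegates to extent_fiemap().
 */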
int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 u64 start, u64 len)
{
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	int ret;

	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
	if (ret)
		return ret;

	/*
	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
	 * file range (0 to LLONG_MAX), but that is not enough if we have
	 * compression enabled. The first filemap_fdatawrite_range() only kicks
	 * off the compression of the data (in an async thread) and will return
	 * before the compression is done and writeback is started. A second
	 * filemap_fdatawrite_range() is needed to wait for the compression to
	 * complete and writeback to start. We also need to wait for ordered
	 * extents to complete, because our fiemap implementation uses mainly
	 * file extent items to list the extents, searching for extent maps
	 * only for file ranges with holes or prealloc extents to figure out
	 * if we have delalloc in those ranges.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret)
			return ret;
	}

	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);

	/*
	 * We did an initial flush to avoid holding the inode's lock while
	 * triggering writeback and waiting for the completion of IO and
	 * ordered extents. Now after we locked the inode we do it again,
	 * because it's possible a new write may have happened in between
	 * those two steps.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret) {
			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
			return ret;
		}
	}

	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);

	return ret;
}