// SPDX-License-Identifier: GPL-2.012#include "messages.h"3#include "ctree.h"4#include "delalloc-space.h"5#include "block-rsv.h"6#include "btrfs_inode.h"7#include "space-info.h"8#include "qgroup.h"9#include "fs.h"1011/*12* HOW DOES THIS WORK13*14* There are two stages to data reservations, one for data and one for metadata15* to handle the new extents and checksums generated by writing data.16*17*18* DATA RESERVATION19* The general flow of the data reservation is as follows20*21* -> Reserve22* We call into btrfs_reserve_data_bytes() for the user request bytes that23* they wish to write. We make this reservation and add it to24* space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree25* for the range and carry on if this is buffered, or follow up trying to26* make a real allocation if we are pre-allocating or doing O_DIRECT.27*28* -> Use29* At writepages()/prealloc/O_DIRECT time we will call into30* btrfs_reserve_extent() for some part or all of this range of bytes. We31* will make the allocation and subtract space_info->bytes_may_use by the32* original requested length and increase the space_info->bytes_reserved by33* the allocated length. This distinction is important because compression34* may allocate a smaller on disk extent than we previously reserved.35*36* -> Allocation37* finish_ordered_io() will insert the new file extent item for this range,38* and then add a delayed ref update for the extent tree. Once that delayed39* ref is written the extent size is subtracted from40* space_info->bytes_reserved and added to space_info->bytes_used.41*42* Error handling43*44* -> By the reservation maker45* This is the simplest case, we haven't completed our operation and we know46* how much we reserved, we can simply call47* btrfs_free_reserved_data_space*() and it will be removed from48* space_info->bytes_may_use.49*50* -> After the reservation has been made, but before cow_file_range()51* This is specifically for the delalloc case. You must clear52* EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will53* be subtracted from space_info->bytes_may_use.54*55* METADATA RESERVATION56* The general metadata reservation lifetimes are discussed elsewhere, this57* will just focus on how it is used for delalloc space.58*59* We keep track of two things on a per inode bases60*61* ->outstanding_extents62* This is the number of file extent items we'll need to handle all of the63* outstanding DELALLOC space we have in this inode. We limit the maximum64* size of an extent, so a large contiguous dirty area may require more than65* one outstanding_extent, which is why count_max_extents() is used to66* determine how many outstanding_extents get added.67*68* ->csum_bytes69* This is essentially how many dirty bytes we have for this inode, so we70* can calculate the number of checksum items we would have to add in order71* to checksum our outstanding data.72*73* We keep a per-inode block_rsv in order to make it easier to keep track of74* our reservation. We use btrfs_calculate_inode_block_rsv_size() to75* calculate the current theoretical maximum reservation we would need for the76* metadata for this inode. We call this and then adjust our reservation as77* necessary, either by attempting to reserve more space, or freeing up excess78* space.79*80* OUTSTANDING_EXTENTS HANDLING81*82* ->outstanding_extents is used for keeping track of how many extents we will83* need to use for this inode, and it will fluctuate depending on where you are84* in the life cycle of the dirty data. Consider the following normal case for85* a completely clean inode, with a num_bytes < our maximum allowed extent size86*87* -> reserve88* ->outstanding_extents += 1 (current value is 1)89*90* -> set_delalloc91* ->outstanding_extents += 1 (current value is 2)92*93* -> btrfs_delalloc_release_extents()94* ->outstanding_extents -= 1 (current value is 1)95*96* We must call this once we are done, as we hold our reservation for the97* duration of our operation, and then assume set_delalloc will update the98* counter appropriately.99*100* -> add ordered extent101* ->outstanding_extents += 1 (current value is 2)102*103* -> btrfs_clear_delalloc_extent104* ->outstanding_extents -= 1 (current value is 1)105*106* -> finish_ordered_io/btrfs_remove_ordered_extent107* ->outstanding_extents -= 1 (current value is 0)108*109* Each stage is responsible for their own accounting of the extent, thus110* making error handling and cleanup easier.111*/112113static inline struct btrfs_space_info *data_sinfo_for_inode(const struct btrfs_inode *inode)114{115struct btrfs_fs_info *fs_info = inode->root->fs_info;116117if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(inode->root)) {118ASSERT(fs_info->data_sinfo->sub_group[0]->subgroup_id ==119BTRFS_SUB_GROUP_DATA_RELOC);120return fs_info->data_sinfo->sub_group[0];121}122return fs_info->data_sinfo;123}124125int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)126{127struct btrfs_root *root = inode->root;128struct btrfs_fs_info *fs_info = root->fs_info;129enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;130131/* Make sure bytes are sectorsize aligned */132bytes = ALIGN(bytes, fs_info->sectorsize);133134if (btrfs_is_free_space_inode(inode))135flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;136137return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush);138}139140int btrfs_check_data_free_space(struct btrfs_inode *inode,141struct extent_changeset **reserved, u64 start,142u64 len, bool noflush)143{144struct btrfs_fs_info *fs_info = inode->root->fs_info;145enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;146int ret;147148/* align the range */149len = round_up(start + len, fs_info->sectorsize) -150round_down(start, fs_info->sectorsize);151start = round_down(start, fs_info->sectorsize);152153if (noflush)154flush = BTRFS_RESERVE_NO_FLUSH;155else if (btrfs_is_free_space_inode(inode))156flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;157158ret = btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), len, flush);159if (ret < 0)160return ret;161162/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */163ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);164if (ret < 0) {165btrfs_free_reserved_data_space_noquota(inode, len);166extent_changeset_free(*reserved);167*reserved = NULL;168} else {169ret = 0;170}171return ret;172}173174/*175* Called if we need to clear a data reservation for this inode176* Normally in a error case.177*178* This one will *NOT* use accurate qgroup reserved space API, just for case179* which we can't sleep and is sure it won't affect qgroup reserved space.180* Like clear_bit_hook().181*/182void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len)183{184struct btrfs_fs_info *fs_info = inode->root->fs_info;185186ASSERT(IS_ALIGNED(len, fs_info->sectorsize));187188btrfs_space_info_free_bytes_may_use(data_sinfo_for_inode(inode), len);189}190191/*192* Called if we need to clear a data reservation for this inode193* Normally in a error case.194*195* This one will handle the per-inode data rsv map for accurate reserved196* space framework.197*/198void btrfs_free_reserved_data_space(struct btrfs_inode *inode,199struct extent_changeset *reserved, u64 start, u64 len)200{201struct btrfs_fs_info *fs_info = inode->root->fs_info;202203/* Make sure the range is aligned to sectorsize */204len = round_up(start + len, fs_info->sectorsize) -205round_down(start, fs_info->sectorsize);206start = round_down(start, fs_info->sectorsize);207208btrfs_free_reserved_data_space_noquota(inode, len);209btrfs_qgroup_free_data(inode, reserved, start, len, NULL);210}211212/*213* Release any excessive reservations for an inode.214*215* @inode: the inode we need to release from216* @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup217* meta reservation needs to know if we are freeing qgroup218* reservation or just converting it into per-trans. Normally219* @qgroup_free is true for error handling, and false for normal220* release.221*222* This is the same as btrfs_block_rsv_release, except that it handles the223* tracepoint for the reservation.224*/225static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)226{227struct btrfs_fs_info *fs_info = inode->root->fs_info;228struct btrfs_block_rsv *block_rsv = &inode->block_rsv;229u64 released = 0;230u64 qgroup_to_release = 0;231232/*233* Since we statically set the block_rsv->size we just want to say we234* are releasing 0 bytes, and then we'll just get the reservation over235* the size free'd.236*/237released = btrfs_block_rsv_release(fs_info, block_rsv, 0,238&qgroup_to_release);239if (released > 0)240trace_btrfs_space_reservation(fs_info, "delalloc",241btrfs_ino(inode), released, 0);242if (qgroup_free)243btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);244else245btrfs_qgroup_convert_reserved_meta(inode->root,246qgroup_to_release);247}248249static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,250struct btrfs_inode *inode)251{252struct btrfs_block_rsv *block_rsv = &inode->block_rsv;253u64 reserve_size = 0;254u64 qgroup_rsv_size = 0;255unsigned outstanding_extents;256257lockdep_assert_held(&inode->lock);258outstanding_extents = inode->outstanding_extents;259260/*261* Insert size for the number of outstanding extents, 1 normal size for262* updating the inode.263*/264if (outstanding_extents) {265reserve_size = btrfs_calc_insert_metadata_size(fs_info,266outstanding_extents);267reserve_size += btrfs_calc_metadata_size(fs_info, 1);268}269if (!(inode->flags & BTRFS_INODE_NODATASUM)) {270u64 csum_leaves;271272csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);273reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);274}275/*276* For qgroup rsv, the calculation is very simple:277* account one nodesize for each outstanding extent278*279* This is overestimating in most cases.280*/281qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;282283spin_lock(&block_rsv->lock);284block_rsv->size = reserve_size;285block_rsv->qgroup_rsv_size = qgroup_rsv_size;286spin_unlock(&block_rsv->lock);287}288289static void calc_inode_reservations(struct btrfs_inode *inode,290u64 num_bytes, u64 disk_num_bytes,291u64 *meta_reserve, u64 *qgroup_reserve)292{293struct btrfs_fs_info *fs_info = inode->root->fs_info;294u64 nr_extents = count_max_extents(fs_info, num_bytes);295u64 csum_leaves;296u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);297298if (inode->flags & BTRFS_INODE_NODATASUM)299csum_leaves = 0;300else301csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);302303*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,304nr_extents + csum_leaves);305306/*307* finish_ordered_io has to update the inode, so add the space required308* for an inode update.309*/310*meta_reserve += inode_update;311*qgroup_reserve = nr_extents * fs_info->nodesize;312}313314int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,315u64 disk_num_bytes, bool noflush)316{317struct btrfs_root *root = inode->root;318struct btrfs_fs_info *fs_info = root->fs_info;319struct btrfs_block_rsv *block_rsv = &inode->block_rsv;320u64 meta_reserve, qgroup_reserve;321unsigned nr_extents;322enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;323int ret = 0;324325/*326* If we are a free space inode we need to not flush since we will be in327* the middle of a transaction commit. We also don't need the delalloc328* mutex since we won't race with anybody. We need this mostly to make329* lockdep shut its filthy mouth.330*331* If we have a transaction open (can happen if we call truncate_block332* from truncate), then we need FLUSH_LIMIT so we don't deadlock.333*/334if (noflush || btrfs_is_free_space_inode(inode)) {335flush = BTRFS_RESERVE_NO_FLUSH;336} else {337if (current->journal_info)338flush = BTRFS_RESERVE_FLUSH_LIMIT;339}340341num_bytes = ALIGN(num_bytes, fs_info->sectorsize);342disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);343344/*345* We always want to do it this way, every other way is wrong and ends346* in tears. Pre-reserving the amount we are going to add will always347* be the right way, because otherwise if we have enough parallelism we348* could end up with thousands of inodes all holding little bits of349* reservations they were able to make previously and the only way to350* reclaim that space is to ENOSPC out the operations and clear351* everything out and try again, which is bad. This way we just352* over-reserve slightly, and clean up the mess when we are done.353*/354calc_inode_reservations(inode, num_bytes, disk_num_bytes,355&meta_reserve, &qgroup_reserve);356ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,357noflush);358if (ret)359return ret;360ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,361meta_reserve, flush);362if (ret) {363btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);364return ret;365}366367/*368* Now we need to update our outstanding extents and csum bytes _first_369* and then add the reservation to the block_rsv. This keeps us from370* racing with an ordered completion or some such that would think it371* needs to free the reservation we just made.372*/373nr_extents = count_max_extents(fs_info, num_bytes);374spin_lock(&inode->lock);375btrfs_mod_outstanding_extents(inode, nr_extents);376if (!(inode->flags & BTRFS_INODE_NODATASUM))377inode->csum_bytes += disk_num_bytes;378btrfs_calculate_inode_block_rsv_size(fs_info, inode);379spin_unlock(&inode->lock);380381/* Now we can safely add our space to our block rsv */382btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);383trace_btrfs_space_reservation(root->fs_info, "delalloc",384btrfs_ino(inode), meta_reserve, 1);385386spin_lock(&block_rsv->lock);387block_rsv->qgroup_rsv_reserved += qgroup_reserve;388spin_unlock(&block_rsv->lock);389390return 0;391}392393/*394* Release a metadata reservation for an inode.395*396* @inode: the inode to release the reservation for.397* @num_bytes: the number of bytes we are releasing.398* @qgroup_free: free qgroup reservation or convert it to per-trans reservation399*400* This will release the metadata reservation for an inode. This can be called401* once we complete IO for a given set of bytes to release their metadata402* reservations, or on error for the same reason.403*/404void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,405bool qgroup_free)406{407struct btrfs_fs_info *fs_info = inode->root->fs_info;408409num_bytes = ALIGN(num_bytes, fs_info->sectorsize);410spin_lock(&inode->lock);411if (!(inode->flags & BTRFS_INODE_NODATASUM))412inode->csum_bytes -= num_bytes;413btrfs_calculate_inode_block_rsv_size(fs_info, inode);414spin_unlock(&inode->lock);415416if (btrfs_is_testing(fs_info))417return;418419btrfs_inode_rsv_release(inode, qgroup_free);420}421422/*423* Release our outstanding_extents for an inode.424*425* @inode: the inode to balance the reservation for.426* @num_bytes: the number of bytes we originally reserved with427*428* When we reserve space we increase outstanding_extents for the extents we may429* add. Once we've set the range as delalloc or created our ordered extents we430* have outstanding_extents to track the real usage, so we use this to free our431* temporarily tracked outstanding_extents. This _must_ be used in conjunction432* with btrfs_delalloc_reserve_metadata.433*/434void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)435{436struct btrfs_fs_info *fs_info = inode->root->fs_info;437unsigned num_extents;438439spin_lock(&inode->lock);440num_extents = count_max_extents(fs_info, num_bytes);441btrfs_mod_outstanding_extents(inode, -num_extents);442btrfs_calculate_inode_block_rsv_size(fs_info, inode);443spin_unlock(&inode->lock);444445if (btrfs_is_testing(fs_info))446return;447448btrfs_inode_rsv_release(inode, true);449}450451/* Shrink a previously reserved extent to a new length. */452void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len)453{454struct btrfs_fs_info *fs_info = inode->root->fs_info;455const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len);456const u32 new_num_extents = count_max_extents(fs_info, new_len);457const int diff_num_extents = new_num_extents - reserved_num_extents;458459ASSERT(new_len <= reserved_len);460if (new_num_extents == reserved_num_extents)461return;462463spin_lock(&inode->lock);464btrfs_mod_outstanding_extents(inode, diff_num_extents);465btrfs_calculate_inode_block_rsv_size(fs_info, inode);466spin_unlock(&inode->lock);467468if (btrfs_is_testing(fs_info))469return;470471btrfs_inode_rsv_release(inode, true);472}473474/*475* Reserve data and metadata space for delalloc476*477* @inode: inode we're writing to478* @start: start range we are writing to479* @len: how long the range we are writing to480* @reserved: mandatory parameter, record actually reserved qgroup ranges of481* current reservation.482*483* This will do the following things484*485* - reserve space in data space info for num bytes and reserve precious486* corresponding qgroup space487* (Done in check_data_free_space)488*489* - reserve space for metadata space, based on the number of outstanding490* extents and how much csums will be needed also reserve metadata space in a491* per root over-reserve method.492* - add to the inodes->delalloc_bytes493* - add it to the fs_info's delalloc inodes list.494* (Above 3 all done in delalloc_reserve_metadata)495*496* Return 0 for success497* Return <0 for error(-ENOSPC or -EDQUOT)498*/499int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,500struct extent_changeset **reserved, u64 start, u64 len)501{502int ret;503504ret = btrfs_check_data_free_space(inode, reserved, start, len, false);505if (ret < 0)506return ret;507ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);508if (ret < 0) {509btrfs_free_reserved_data_space(inode, *reserved, start, len);510extent_changeset_free(*reserved);511*reserved = NULL;512}513return ret;514}515516/*517* Release data and metadata space for delalloc518*519* @inode: inode we're releasing space for520* @reserved: list of changed/reserved ranges521* @start: start position of the space already reserved522* @len: length of the space already reserved523* @qgroup_free: should qgroup reserved-space also be freed524*525* Release the metadata space that was not used and will decrement526* ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if527* there are no delalloc bytes left. Also it will handle the qgroup reserved528* space.529*/530void btrfs_delalloc_release_space(struct btrfs_inode *inode,531struct extent_changeset *reserved,532u64 start, u64 len, bool qgroup_free)533{534btrfs_delalloc_release_metadata(inode, len, qgroup_free);535btrfs_free_reserved_data_space(inode, reserved, start, len);536}537538539