Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/block-rsv.c
26285 views
1
// SPDX-License-Identifier: GPL-2.0
2
3
#include "misc.h"
4
#include "ctree.h"
5
#include "block-rsv.h"
6
#include "space-info.h"
7
#include "transaction.h"
8
#include "block-group.h"
9
#include "fs.h"
10
#include "accessors.h"
11
12
/*
13
* HOW DO BLOCK RESERVES WORK
14
*
15
* Think of block_rsv's as buckets for logically grouped metadata
16
* reservations. Each block_rsv has a ->size and a ->reserved. ->size is
17
* how large we want our block rsv to be, ->reserved is how much space is
18
* currently reserved for this block reserve.
19
*
20
* ->failfast exists for the truncate case, and is described below.
21
*
22
* NORMAL OPERATION
23
*
24
* -> Reserve
25
* Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
26
*
27
* We call into btrfs_reserve_metadata_bytes() with our bytes, which is
28
* accounted for in space_info->bytes_may_use, and then add the bytes to
29
* ->reserved, and ->size in the case of btrfs_block_rsv_add.
30
*
31
* ->size is an over-estimation of how much we may use for a particular
32
* operation.
33
*
34
* -> Use
35
* Entrance: btrfs_use_block_rsv
36
*
37
* When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
38
* to determine the appropriate block_rsv to use, and then verify that
39
* ->reserved has enough space for our tree block allocation. Once
40
* successful we subtract fs_info->nodesize from ->reserved.
41
*
42
* -> Finish
43
* Entrance: btrfs_block_rsv_release
44
*
45
* We are finished with our operation, subtract our individual reservation
46
* from ->size, and then subtract ->size from ->reserved and free up the
47
* excess if there is any.
48
*
49
* There is some logic here to refill the delayed refs rsv or the global rsv
50
* as needed, otherwise the excess is subtracted from
51
* space_info->bytes_may_use.
52
*
53
* TYPES OF BLOCK RESERVES
54
*
55
* BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
56
* These behave normally, as described above, just within the confines of the
57
* lifetime of their particular operation (transaction for the whole trans
58
* handle lifetime, for example).
59
*
60
* BLOCK_RSV_GLOBAL
61
* It is impossible to properly account for all the space that may be required
62
* to make our extent tree updates. This block reserve acts as an overflow
63
* buffer in case our delayed refs reserve does not reserve enough space to
64
* update the extent tree.
65
*
66
* We can steal from this in some cases as well, notably on evict() or
67
* truncate() in order to help users recover from ENOSPC conditions.
68
*
69
* BLOCK_RSV_DELALLOC
70
* The individual item sizes are determined by the per-inode size
71
* calculations, which are described with the delalloc code. This is pretty
72
* straightforward, it's just the calculation of ->size encodes a lot of
73
* different items, and thus it gets used when updating inodes, inserting file
74
* extents, and inserting checksums.
75
*
76
* BLOCK_RSV_DELREFS
77
* We keep a running tally of how many delayed refs we have on the system.
78
* We assume each one of these delayed refs is going to use a full
79
* reservation. We use the transaction items and pre-reserve space for every
80
* operation, and use this reservation to refill any gap between ->size and
81
* ->reserved that may exist.
82
*
83
* From there it's straightforward, removing a delayed ref means we remove its
84
* count from ->size and free up reservations as necessary. Since this is
85
* the most dynamic block reserve in the system, we will try to refill this
86
* block reserve first with any excess returned by any other block reserve.
87
*
88
* BLOCK_RSV_EMPTY
89
* This is the fallback block reserve to make us try to reserve space if we
90
* don't have a specific bucket for this allocation. It is mostly used for
91
* updating the device tree and such, since that is a separate pool we're
92
* content to just reserve space from the space_info on demand.
93
*
94
* BLOCK_RSV_TEMP
95
* This is used by things like truncate and iput. We will temporarily
96
* allocate a block reserve, set it to some size, and then truncate bytes
97
* until we have no space left. With ->failfast set we'll simply return
98
* ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
99
* to make a new reservation. This is because these operations are
100
* unbounded, so we want to do as much work as we can, and then back off and
101
* re-reserve.
102
*/
103
104
/*
 * Shrink a block reserve and give back whatever it holds beyond its new size.
 *
 * @fs_info:               filesystem the reserve belongs to (not referenced
 *                         in the body)
 * @block_rsv:             reserve to shrink
 * @dest:                  optional reserve that is topped up with the excess
 *                         before anything goes back to the space_info; may be
 *                         NULL
 * @num_bytes:             amount to subtract from ->size, or (u64)-1 to
 *                         subtract all of ->size
 * @qgroup_to_release_ret: if not NULL, receives the amount by which
 *                         ->qgroup_rsv_reserved exceeded ->qgroup_rsv_size
 *                         (0 if it did not); the qgroup counters are left
 *                         untouched when this is NULL
 *
 * Returns the number of bytes by which ->reserved exceeded the reduced
 * ->size, regardless of how much of that ended up in @dest.
 */
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_block_rsv *block_rsv,
				    struct btrfs_block_rsv *dest, u64 num_bytes,
				    u64 *qgroup_to_release_ret)
{
	struct btrfs_space_info *space_info = block_rsv->space_info;
	u64 qgroup_excess = 0;
	u64 excess = 0;
	u64 leftover;

	spin_lock(&block_rsv->lock);

	/* (u64)-1 is the "drop the whole reservation" sentinel. */
	if (num_bytes == (u64)-1)
		num_bytes = block_rsv->size;
	block_rsv->size -= num_bytes;

	/* Clamp ->reserved to the new ->size and remember the surplus. */
	if (block_rsv->reserved >= block_rsv->size) {
		excess = block_rsv->reserved - block_rsv->size;
		block_rsv->reserved = block_rsv->size;
		block_rsv->full = true;
	}

	/* Same clamping for the qgroup side, only if the caller asked for it. */
	if (qgroup_to_release_ret &&
	    block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
		qgroup_excess = block_rsv->qgroup_rsv_reserved -
				block_rsv->qgroup_rsv_size;
		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
	}

	spin_unlock(&block_rsv->lock);

	/* First offer the surplus to @dest, up to the gap it still has. */
	leftover = excess;
	if (dest && leftover > 0) {
		spin_lock(&dest->lock);
		if (!dest->full) {
			u64 moved = min(leftover, dest->size - dest->reserved);

			dest->reserved += moved;
			if (dest->reserved >= dest->size)
				dest->full = true;
			leftover -= moved;
		}
		spin_unlock(&dest->lock);
	}

	/* Whatever nobody wanted is dropped from bytes_may_use. */
	if (leftover)
		btrfs_space_info_free_bytes_may_use(space_info, leftover);

	if (qgroup_to_release_ret)
		*qgroup_to_release_ret = qgroup_excess;

	return excess;
}
159
160
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
161
struct btrfs_block_rsv *dst, u64 num_bytes,
162
bool update_size)
163
{
164
int ret;
165
166
ret = btrfs_block_rsv_use_bytes(src, num_bytes);
167
if (ret)
168
return ret;
169
170
btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
171
return 0;
172
}
173
174
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
175
{
176
memset(rsv, 0, sizeof(*rsv));
177
spin_lock_init(&rsv->lock);
178
rsv->type = type;
179
}
180
181
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
182
struct btrfs_block_rsv *rsv,
183
enum btrfs_rsv_type type)
184
{
185
btrfs_init_block_rsv(rsv, type);
186
rsv->space_info = btrfs_find_space_info(fs_info,
187
BTRFS_BLOCK_GROUP_METADATA);
188
}
189
190
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
191
enum btrfs_rsv_type type)
192
{
193
struct btrfs_block_rsv *block_rsv;
194
195
block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
196
if (!block_rsv)
197
return NULL;
198
199
btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
200
return block_rsv;
201
}
202
203
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
204
struct btrfs_block_rsv *rsv)
205
{
206
if (!rsv)
207
return;
208
btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
209
kfree(rsv);
210
}
211
212
/*
 * Reserve @num_bytes of metadata space and add them to @block_rsv, growing
 * both ->reserved and ->size.
 *
 * @flush is handed to btrfs_reserve_metadata_bytes() unchanged.
 *
 * Returns 0 on success (including the trivial @num_bytes == 0 case, which
 * does nothing), or the error from btrfs_reserve_metadata_bytes().
 */
int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
			enum btrfs_reserve_flush_enum flush)
{
	int err;

	if (!num_bytes)
		return 0;

	err = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
					   num_bytes, flush);
	if (err)
		return err;

	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
	return 0;
}
228
229
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
230
{
231
u64 num_bytes = 0;
232
int ret = -ENOSPC;
233
234
spin_lock(&block_rsv->lock);
235
num_bytes = mult_perc(block_rsv->size, min_percent);
236
if (block_rsv->reserved >= num_bytes)
237
ret = 0;
238
spin_unlock(&block_rsv->lock);
239
240
return ret;
241
}
242
243
/*
 * Make sure @block_rsv has at least @num_bytes reserved, reserving only the
 * missing part. ->size is not changed.
 *
 * @flush is handed to btrfs_reserve_metadata_bytes() unchanged.
 *
 * Returns 0 if @block_rsv is NULL, if enough was already reserved, or if
 * the shortfall could be reserved; otherwise the error from
 * btrfs_reserve_metadata_bytes().
 */
int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
			   struct btrfs_block_rsv *block_rsv, u64 num_bytes,
			   enum btrfs_reserve_flush_enum flush)
{
	u64 shortfall = 0;
	int err;

	if (!block_rsv)
		return 0;

	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < num_bytes)
		shortfall = num_bytes - block_rsv->reserved;
	spin_unlock(&block_rsv->lock);

	/* Already holding enough, nothing to reserve. */
	if (shortfall == 0)
		return 0;

	err = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
					   shortfall, flush);
	if (err)
		return err;

	btrfs_block_rsv_add_bytes(block_rsv, shortfall, false);
	return 0;
}
271
272
/*
 * Release @num_bytes from @block_rsv ((u64)-1 releases everything), letting
 * another reserve absorb the excess where appropriate.
 *
 * Picking the reserve that gets first call on the freed bytes:
 *  - a BTRFS_BLOCK_RSV_DELOPS reserve feeds the global reserve;
 *  - any other reserve except the global one feeds the delayed refs reserve,
 *    provided that one is not already full;
 *  - in every case the recipient is dropped if it sits on a different
 *    space_info than @block_rsv.
 *
 * @qgroup_to_release is forwarded to block_rsv_release_bytes().
 *
 * Returns whatever block_rsv_release_bytes() returns.
 */
u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
			    struct btrfs_block_rsv *block_rsv, u64 num_bytes,
			    u64 *qgroup_to_release)
{
	struct btrfs_block_rsv *const global_rsv = &fs_info->global_block_rsv;
	struct btrfs_block_rsv *const delayed_refs = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *target;

	if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS)
		target = global_rsv;
	else if (block_rsv == global_rsv || btrfs_block_rsv_full(delayed_refs))
		target = NULL;
	else
		target = delayed_refs;

	/* Bytes may only move between reserves of the same space_info. */
	if (target && target->space_info != block_rsv->space_info)
		target = NULL;

	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
				       qgroup_to_release);
}
295
296
/*
 * Consume @num_bytes from @block_rsv->reserved.
 *
 * On success ->full is cleared if ->reserved dropped below ->size.
 *
 * Returns 0 on success, or -ENOSPC (leaving the reserve untouched) if fewer
 * than @num_bytes are reserved.
 */
int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
{
	int ret = 0;

	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < num_bytes) {
		ret = -ENOSPC;
	} else {
		block_rsv->reserved -= num_bytes;
		if (block_rsv->reserved < block_rsv->size)
			block_rsv->full = false;
	}
	spin_unlock(&block_rsv->lock);

	return ret;
}
310
311
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
312
u64 num_bytes, bool update_size)
313
{
314
spin_lock(&block_rsv->lock);
315
block_rsv->reserved += num_bytes;
316
if (update_size)
317
block_rsv->size += num_bytes;
318
else if (block_rsv->reserved >= block_rsv->size)
319
block_rsv->full = true;
320
spin_unlock(&block_rsv->lock);
321
}
322
323
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
324
{
325
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
326
struct btrfs_space_info *sinfo = block_rsv->space_info;
327
struct btrfs_root *root, *tmp;
328
u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
329
unsigned int min_items = 1;
330
331
/*
332
* The global block rsv is based on the size of the extent tree, the
333
* checksum tree and the root tree. If the fs is empty we want to set
334
* it to a minimal amount for safety.
335
*
336
* We also are going to need to modify the minimum of the tree root and
337
* any global roots we could touch.
338
*/
339
read_lock(&fs_info->global_root_lock);
340
rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
341
rb_node) {
342
if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID ||
343
btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
344
btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) {
345
num_bytes += btrfs_root_used(&root->root_item);
346
min_items++;
347
}
348
}
349
read_unlock(&fs_info->global_root_lock);
350
351
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
352
num_bytes += btrfs_root_used(&fs_info->block_group_root->root_item);
353
min_items++;
354
}
355
356
if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
357
num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item);
358
min_items++;
359
}
360
361
/*
362
* But we also want to reserve enough space so we can do the fallback
363
* global reserve for an unlink, which is an additional
364
* BTRFS_UNLINK_METADATA_UNITS items.
365
*
366
* But we also need space for the delayed ref updates from the unlink,
367
* so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for
368
* each unlink metadata item.
369
*/
370
min_items += BTRFS_UNLINK_METADATA_UNITS;
371
372
num_bytes = max_t(u64, num_bytes,
373
btrfs_calc_insert_metadata_size(fs_info, min_items) +
374
btrfs_calc_delayed_ref_bytes(fs_info,
375
BTRFS_UNLINK_METADATA_UNITS));
376
377
spin_lock(&sinfo->lock);
378
spin_lock(&block_rsv->lock);
379
380
block_rsv->size = min_t(u64, num_bytes, SZ_512M);
381
382
if (block_rsv->reserved < block_rsv->size) {
383
num_bytes = block_rsv->size - block_rsv->reserved;
384
btrfs_space_info_update_bytes_may_use(sinfo, num_bytes);
385
block_rsv->reserved = block_rsv->size;
386
} else if (block_rsv->reserved > block_rsv->size) {
387
num_bytes = block_rsv->reserved - block_rsv->size;
388
btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);
389
block_rsv->reserved = block_rsv->size;
390
btrfs_try_granting_tickets(fs_info, sinfo);
391
}
392
393
block_rsv->full = (block_rsv->reserved == block_rsv->size);
394
395
if (block_rsv->size >= sinfo->total_bytes)
396
sinfo->force_alloc = CHUNK_ALLOC_FORCE;
397
spin_unlock(&block_rsv->lock);
398
spin_unlock(&sinfo->lock);
399
}
400
401
void btrfs_init_root_block_rsv(struct btrfs_root *root)
402
{
403
struct btrfs_fs_info *fs_info = root->fs_info;
404
405
switch (btrfs_root_id(root)) {
406
case BTRFS_CSUM_TREE_OBJECTID:
407
case BTRFS_EXTENT_TREE_OBJECTID:
408
case BTRFS_FREE_SPACE_TREE_OBJECTID:
409
case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
410
case BTRFS_RAID_STRIPE_TREE_OBJECTID:
411
root->block_rsv = &fs_info->delayed_refs_rsv;
412
break;
413
case BTRFS_ROOT_TREE_OBJECTID:
414
case BTRFS_DEV_TREE_OBJECTID:
415
case BTRFS_QUOTA_TREE_OBJECTID:
416
root->block_rsv = &fs_info->global_block_rsv;
417
break;
418
case BTRFS_CHUNK_TREE_OBJECTID:
419
root->block_rsv = &fs_info->chunk_block_rsv;
420
break;
421
case BTRFS_TREE_LOG_OBJECTID:
422
root->block_rsv = &fs_info->treelog_rsv;
423
break;
424
default:
425
root->block_rsv = NULL;
426
break;
427
}
428
}
429
430
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
431
{
432
struct btrfs_space_info *space_info;
433
434
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
435
fs_info->chunk_block_rsv.space_info = space_info;
436
437
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
438
fs_info->global_block_rsv.space_info = space_info;
439
fs_info->trans_block_rsv.space_info = space_info;
440
fs_info->empty_block_rsv.space_info = space_info;
441
fs_info->delayed_block_rsv.space_info = space_info;
442
fs_info->delayed_refs_rsv.space_info = space_info;
443
444
/* The treelog_rsv uses a dedicated space_info on the zoned mode. */
445
if (!btrfs_is_zoned(fs_info)) {
446
fs_info->treelog_rsv.space_info = space_info;
447
} else {
448
ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
449
fs_info->treelog_rsv.space_info = space_info->sub_group[0];
450
}
451
452
btrfs_update_global_block_rsv(fs_info);
453
}
454
455
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
456
{
457
btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
458
NULL);
459
WARN_ON(fs_info->trans_block_rsv.size > 0);
460
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
461
WARN_ON(fs_info->chunk_block_rsv.size > 0);
462
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
463
WARN_ON(fs_info->delayed_block_rsv.size > 0);
464
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
465
WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
466
WARN_ON(fs_info->delayed_refs_rsv.size > 0);
467
}
468
469
static struct btrfs_block_rsv *get_block_rsv(
470
const struct btrfs_trans_handle *trans,
471
const struct btrfs_root *root)
472
{
473
struct btrfs_fs_info *fs_info = root->fs_info;
474
struct btrfs_block_rsv *block_rsv = NULL;
475
476
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
477
(root == fs_info->uuid_root) ||
478
(trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID))
479
block_rsv = trans->block_rsv;
480
481
if (!block_rsv)
482
block_rsv = root->block_rsv;
483
484
if (!block_rsv)
485
block_rsv = &fs_info->empty_block_rsv;
486
487
return block_rsv;
488
}
489
490
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
491
struct btrfs_root *root,
492
u32 blocksize)
493
{
494
struct btrfs_fs_info *fs_info = root->fs_info;
495
struct btrfs_block_rsv *block_rsv;
496
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
497
int ret;
498
bool global_updated = false;
499
500
block_rsv = get_block_rsv(trans, root);
501
502
if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
503
goto try_reserve;
504
again:
505
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
506
if (!ret)
507
return block_rsv;
508
509
if (block_rsv->failfast)
510
return ERR_PTR(ret);
511
512
if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
513
global_updated = true;
514
btrfs_update_global_block_rsv(fs_info);
515
goto again;
516
}
517
518
/*
519
* The global reserve still exists to save us from ourselves, so don't
520
* warn_on if we are short on our delayed refs reserve.
521
*/
522
if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
523
btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
524
static DEFINE_RATELIMIT_STATE(_rs,
525
DEFAULT_RATELIMIT_INTERVAL * 10,
526
/*DEFAULT_RATELIMIT_BURST*/ 1);
527
if (__ratelimit(&_rs))
528
WARN(1, KERN_DEBUG
529
"BTRFS: block rsv %d returned %d\n",
530
block_rsv->type, ret);
531
}
532
try_reserve:
533
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
534
blocksize, BTRFS_RESERVE_NO_FLUSH);
535
if (!ret)
536
return block_rsv;
537
/*
538
* If we couldn't reserve metadata bytes try and use some from
539
* the global reserve if its space type is the same as the global
540
* reservation.
541
*/
542
if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
543
block_rsv->space_info == global_rsv->space_info) {
544
ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
545
if (!ret)
546
return global_rsv;
547
}
548
549
/*
550
* All hope is lost, but of course our reservations are overly
551
* pessimistic, so instead of possibly having an ENOSPC abort here, try
552
* one last time to force a reservation if there's enough actual space
553
* on disk to make the reservation.
554
*/
555
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
556
BTRFS_RESERVE_FLUSH_EMERGENCY);
557
if (!ret)
558
return block_rsv;
559
560
return ERR_PTR(ret);
561
}
562
563
int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info,
564
struct btrfs_block_rsv *rsv)
565
{
566
u64 needed_bytes;
567
int ret;
568
569
/* 1 for slack space, 1 for updating the inode */
570
needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
571
btrfs_calc_metadata_size(fs_info, 1);
572
573
spin_lock(&rsv->lock);
574
if (rsv->reserved < needed_bytes)
575
ret = -ENOSPC;
576
else
577
ret = 0;
578
spin_unlock(&rsv->lock);
579
return ret;
580
}
581
582