GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/block-group.c
1
// SPDX-License-Identifier: GPL-2.0
2
3
#include <linux/sizes.h>
4
#include <linux/list_sort.h>
5
#include "misc.h"
6
#include "ctree.h"
7
#include "block-group.h"
8
#include "space-info.h"
9
#include "disk-io.h"
10
#include "free-space-cache.h"
11
#include "free-space-tree.h"
12
#include "volumes.h"
13
#include "transaction.h"
14
#include "ref-verify.h"
15
#include "sysfs.h"
16
#include "tree-log.h"
17
#include "delalloc-space.h"
18
#include "discard.h"
19
#include "raid56.h"
20
#include "zoned.h"
21
#include "fs.h"
22
#include "accessors.h"
23
#include "extent-tree.h"
24
25
#ifdef CONFIG_BTRFS_DEBUG
26
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
27
{
28
struct btrfs_fs_info *fs_info = block_group->fs_info;
29
30
return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
31
block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
32
(btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
33
block_group->flags & BTRFS_BLOCK_GROUP_DATA);
34
}
35
#endif
36
37
static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
38
{
39
/* The meta_write_pointer is available only on the zoned setup. */
40
if (!btrfs_is_zoned(block_group->fs_info))
41
return false;
42
43
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
44
return false;
45
46
return block_group->start + block_group->alloc_offset >
47
block_group->meta_write_pointer;
48
}
49
50
/*
51
* Return target flags in extended format or 0 if restripe for this chunk_type
52
* is not in progress
53
*
54
* Should be called with balance_lock held
55
*/
56
static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags)
57
{
58
const struct btrfs_balance_control *bctl = fs_info->balance_ctl;
59
u64 target = 0;
60
61
if (!bctl)
62
return 0;
63
64
if (flags & BTRFS_BLOCK_GROUP_DATA &&
65
bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
66
target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
67
} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
68
bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
69
target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
70
} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
71
bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
72
target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
73
}
74
75
return target;
76
}
77
78
/*
79
* @flags: available profiles in extended format (see ctree.h)
80
*
81
* Return reduced profile in chunk format. If profile changing is in progress
82
* (either running or paused) picks the target profile (if it's already
83
* available), otherwise falls back to plain reducing.
84
*/
85
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
86
{
87
u64 num_devices = fs_info->fs_devices->rw_devices;
88
u64 target;
89
u64 raid_type;
90
u64 allowed = 0;
91
92
/*
93
* See if restripe for this chunk_type is in progress, if so try to
94
* reduce to the target profile
95
*/
96
spin_lock(&fs_info->balance_lock);
97
target = get_restripe_target(fs_info, flags);
98
if (target) {
99
spin_unlock(&fs_info->balance_lock);
100
return extended_to_chunk(target);
101
}
102
spin_unlock(&fs_info->balance_lock);
103
104
/* First, mask out the RAID levels which aren't possible */
105
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
106
if (num_devices >= btrfs_raid_array[raid_type].devs_min)
107
allowed |= btrfs_raid_array[raid_type].bg_flag;
108
}
109
allowed &= flags;
110
111
/* Select the highest-redundancy RAID level. */
112
if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
113
allowed = BTRFS_BLOCK_GROUP_RAID1C4;
114
else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
115
allowed = BTRFS_BLOCK_GROUP_RAID6;
116
else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
117
allowed = BTRFS_BLOCK_GROUP_RAID1C3;
118
else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
119
allowed = BTRFS_BLOCK_GROUP_RAID5;
120
else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
121
allowed = BTRFS_BLOCK_GROUP_RAID10;
122
else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
123
allowed = BTRFS_BLOCK_GROUP_RAID1;
124
else if (allowed & BTRFS_BLOCK_GROUP_DUP)
125
allowed = BTRFS_BLOCK_GROUP_DUP;
126
else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
127
allowed = BTRFS_BLOCK_GROUP_RAID0;
128
129
flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
130
131
return extended_to_chunk(flags | allowed);
132
}
133
134
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
135
{
136
unsigned seq;
137
u64 flags;
138
139
do {
140
flags = orig_flags;
141
seq = read_seqbegin(&fs_info->profiles_lock);
142
143
if (flags & BTRFS_BLOCK_GROUP_DATA)
144
flags |= fs_info->avail_data_alloc_bits;
145
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
146
flags |= fs_info->avail_system_alloc_bits;
147
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
148
flags |= fs_info->avail_metadata_alloc_bits;
149
} while (read_seqretry(&fs_info->profiles_lock, seq));
150
151
return btrfs_reduce_alloc_profile(fs_info, flags);
152
}
153
154
void btrfs_get_block_group(struct btrfs_block_group *cache)
155
{
156
refcount_inc(&cache->refs);
157
}
158
159
void btrfs_put_block_group(struct btrfs_block_group *cache)
160
{
161
if (refcount_dec_and_test(&cache->refs)) {
162
WARN_ON(cache->pinned > 0);
163
/*
164
* If there was a failure to clean up a log tree, very likely due
165
* to an IO failure on a writeback attempt of one or more of its
166
* extent buffers, we could not do proper (and cheap) unaccounting
167
* of their reserved space, so don't warn on reserved > 0 in that
168
* case.
169
*/
170
if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
171
!BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
172
WARN_ON(cache->reserved > 0);
173
174
/*
175
* A block_group shouldn't be on the discard_list anymore.
176
* Remove the block_group from the discard_list to prevent us
177
* from causing a panic due to a NULL pointer dereference.
178
*/
179
if (WARN_ON(!list_empty(&cache->discard_list)))
180
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
181
cache);
182
183
kfree(cache->free_space_ctl);
184
btrfs_free_chunk_map(cache->physical_map);
185
kfree(cache);
186
}
187
}
188
189
static int btrfs_bg_start_cmp(const struct rb_node *new,
190
const struct rb_node *exist)
191
{
192
const struct btrfs_block_group *new_bg =
193
rb_entry(new, struct btrfs_block_group, cache_node);
194
const struct btrfs_block_group *exist_bg =
195
rb_entry(exist, struct btrfs_block_group, cache_node);
196
197
if (new_bg->start < exist_bg->start)
198
return -1;
199
if (new_bg->start > exist_bg->start)
200
return 1;
201
return 0;
202
}
203
204
/*
205
* This adds the block group to the fs_info rb tree for the block group cache
206
*/
207
static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
208
{
209
struct btrfs_fs_info *fs_info = block_group->fs_info;
210
struct rb_node *exist;
211
int ret = 0;
212
213
ASSERT(block_group->length != 0);
214
215
write_lock(&fs_info->block_group_cache_lock);
216
217
exist = rb_find_add_cached(&block_group->cache_node,
218
&fs_info->block_group_cache_tree, btrfs_bg_start_cmp);
219
if (exist)
220
ret = -EEXIST;
221
write_unlock(&fs_info->block_group_cache_lock);
222
223
return ret;
224
}
225
226
/*
227
* This will return the block group at or after bytenr if contains is 0, else
228
* it will return the block group that contains the bytenr
229
*/
230
static struct btrfs_block_group *block_group_cache_tree_search(
231
struct btrfs_fs_info *info, u64 bytenr, int contains)
232
{
233
struct btrfs_block_group *cache, *ret = NULL;
234
struct rb_node *n;
235
u64 end, start;
236
237
read_lock(&info->block_group_cache_lock);
238
n = info->block_group_cache_tree.rb_root.rb_node;
239
240
while (n) {
241
cache = rb_entry(n, struct btrfs_block_group, cache_node);
242
end = cache->start + cache->length - 1;
243
start = cache->start;
244
245
if (bytenr < start) {
246
if (!contains && (!ret || start < ret->start))
247
ret = cache;
248
n = n->rb_left;
249
} else if (bytenr > start) {
250
if (contains && bytenr <= end) {
251
ret = cache;
252
break;
253
}
254
n = n->rb_right;
255
} else {
256
ret = cache;
257
break;
258
}
259
}
260
if (ret)
261
btrfs_get_block_group(ret);
262
read_unlock(&info->block_group_cache_lock);
263
264
return ret;
265
}
266
267
/*
268
* Return the block group that starts at or after bytenr
269
*/
270
struct btrfs_block_group *btrfs_lookup_first_block_group(
271
struct btrfs_fs_info *info, u64 bytenr)
272
{
273
return block_group_cache_tree_search(info, bytenr, 0);
274
}
275
276
/*
277
* Return the block group that contains the given bytenr
278
*/
279
struct btrfs_block_group *btrfs_lookup_block_group(
280
struct btrfs_fs_info *info, u64 bytenr)
281
{
282
return block_group_cache_tree_search(info, bytenr, 1);
283
}
284
285
struct btrfs_block_group *btrfs_next_block_group(
286
struct btrfs_block_group *cache)
287
{
288
struct btrfs_fs_info *fs_info = cache->fs_info;
289
struct rb_node *node;
290
291
read_lock(&fs_info->block_group_cache_lock);
292
293
/* If our block group was removed, we need a full search. */
294
if (RB_EMPTY_NODE(&cache->cache_node)) {
295
const u64 next_bytenr = cache->start + cache->length;
296
297
read_unlock(&fs_info->block_group_cache_lock);
298
btrfs_put_block_group(cache);
299
return btrfs_lookup_first_block_group(fs_info, next_bytenr);
300
}
301
node = rb_next(&cache->cache_node);
302
btrfs_put_block_group(cache);
303
if (node) {
304
cache = rb_entry(node, struct btrfs_block_group, cache_node);
305
btrfs_get_block_group(cache);
306
} else
307
cache = NULL;
308
read_unlock(&fs_info->block_group_cache_lock);
309
return cache;
310
}
311
312
/*
313
* Check if we can do a NOCOW write for a given extent.
314
*
315
* @fs_info: The filesystem information object.
316
* @bytenr: Logical start address of the extent.
317
*
318
* Check if we can do a NOCOW write for the given extent, and increment the
319
* number of NOCOW writers in the block group that contains the extent, as long
320
* as the block group exists and it's currently not in read-only mode.
321
*
322
* Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
323
* is responsible for calling btrfs_dec_nocow_writers() later.
324
*
325
* Or NULL if we cannot do a NOCOW write.
326
*/
327
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
328
u64 bytenr)
329
{
330
struct btrfs_block_group *bg;
331
bool can_nocow = true;
332
333
bg = btrfs_lookup_block_group(fs_info, bytenr);
334
if (!bg)
335
return NULL;
336
337
spin_lock(&bg->lock);
338
if (bg->ro)
339
can_nocow = false;
340
else
341
atomic_inc(&bg->nocow_writers);
342
spin_unlock(&bg->lock);
343
344
if (!can_nocow) {
345
btrfs_put_block_group(bg);
346
return NULL;
347
}
348
349
/* No put on block group, done by btrfs_dec_nocow_writers(). */
350
return bg;
351
}
352
353
/*
354
* Decrement the number of NOCOW writers in a block group.
355
*
356
* This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
357
* and on the block group returned by that call. Typically this is called after
358
* creating an ordered extent for a NOCOW write, to prevent races with scrub and
359
* relocation.
360
*
361
* After this call, the caller should not use the block group anymore. If it wants
362
* to use it, then it should get a reference on it before calling this function.
363
*/
364
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
365
{
366
if (atomic_dec_and_test(&bg->nocow_writers))
367
wake_up_var(&bg->nocow_writers);
368
369
/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
370
btrfs_put_block_group(bg);
371
}
372
373
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
374
{
375
wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
376
}
377
378
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
379
const u64 start)
380
{
381
struct btrfs_block_group *bg;
382
383
bg = btrfs_lookup_block_group(fs_info, start);
384
ASSERT(bg);
385
if (atomic_dec_and_test(&bg->reservations))
386
wake_up_var(&bg->reservations);
387
btrfs_put_block_group(bg);
388
}
389
390
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
391
{
392
struct btrfs_space_info *space_info = bg->space_info;
393
394
ASSERT(bg->ro);
395
396
if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
397
return;
398
399
/*
400
* Our block group is read only but before we set it to read only,
401
* some task might have allocated an extent from it already, but it
402
* has not yet created a respective ordered extent (and added it to a
403
* root's list of ordered extents).
404
* Therefore wait for any task currently allocating extents, since the
405
* block group's reservations counter is incremented while a read lock
406
* on the groups' semaphore is held and decremented after releasing
407
* the read access on that semaphore and creating the ordered extent.
408
*/
409
down_write(&space_info->groups_sem);
410
up_write(&space_info->groups_sem);
411
412
wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
413
}
414
415
struct btrfs_caching_control *btrfs_get_caching_control(
416
struct btrfs_block_group *cache)
417
{
418
struct btrfs_caching_control *ctl;
419
420
spin_lock(&cache->lock);
421
if (!cache->caching_ctl) {
422
spin_unlock(&cache->lock);
423
return NULL;
424
}
425
426
ctl = cache->caching_ctl;
427
refcount_inc(&ctl->count);
428
spin_unlock(&cache->lock);
429
return ctl;
430
}
431
432
static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
433
{
434
if (refcount_dec_and_test(&ctl->count))
435
kfree(ctl);
436
}
437
438
/*
439
* When we wait for progress in the block group caching, it's because our
440
* allocation attempt failed at least once. So, we must sleep and let some
441
* progress happen before we try again.
442
*
443
* This function will sleep at least once waiting for new free space to show
444
* up, and then it will check the block group free space numbers for our min
445
* num_bytes. Another option is to have it go ahead and look in the rbtree for
446
* a free extent of a given size, but this is a good start.
447
*
448
* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
449
* any of the information in this block group.
450
*/
451
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
452
u64 num_bytes)
453
{
454
struct btrfs_caching_control *caching_ctl;
455
int progress;
456
457
caching_ctl = btrfs_get_caching_control(cache);
458
if (!caching_ctl)
459
return;
460
461
/*
462
* We've already failed to allocate from this block group, so even if
463
* there's enough space in the block group it isn't contiguous enough to
464
* allow for an allocation, so wait for at least the next wakeup tick,
465
* or for the thing to be done.
466
*/
467
progress = atomic_read(&caching_ctl->progress);
468
469
wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
470
(progress != atomic_read(&caching_ctl->progress) &&
471
(cache->free_space_ctl->free_space >= num_bytes)));
472
473
btrfs_put_caching_control(caching_ctl);
474
}
475
476
static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
477
struct btrfs_caching_control *caching_ctl)
478
{
479
wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
480
return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
481
}
482
483
static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
484
{
485
struct btrfs_caching_control *caching_ctl;
486
int ret;
487
488
caching_ctl = btrfs_get_caching_control(cache);
489
if (!caching_ctl)
490
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
491
ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
492
btrfs_put_caching_control(caching_ctl);
493
return ret;
494
}
495
496
#ifdef CONFIG_BTRFS_DEBUG
497
static void fragment_free_space(struct btrfs_block_group *block_group)
498
{
499
struct btrfs_fs_info *fs_info = block_group->fs_info;
500
u64 start = block_group->start;
501
u64 len = block_group->length;
502
u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
503
fs_info->nodesize : fs_info->sectorsize;
504
u64 step = chunk << 1;
505
506
while (len > chunk) {
507
btrfs_remove_free_space(block_group, start, chunk);
508
start += step;
509
if (len < step)
510
len = 0;
511
else
512
len -= step;
513
}
514
}
515
#endif
516
517
/*
518
* Add a free space range to the in memory free space cache of a block group.
519
* This checks if the range contains super block locations; any such
520
* locations are not added to the free space cache.
521
*
522
* @block_group: The target block group.
523
* @start: Start offset of the range.
524
* @end: End offset of the range (exclusive).
525
* @total_added_ret: Optional pointer to return the total amount of space
526
* added to the block group's free space cache.
527
*
528
* Returns 0 on success or < 0 on error.
529
*/
530
int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
531
u64 end, u64 *total_added_ret)
532
{
533
struct btrfs_fs_info *info = block_group->fs_info;
534
u64 extent_start, extent_end, size;
535
int ret;
536
537
if (total_added_ret)
538
*total_added_ret = 0;
539
540
while (start < end) {
541
if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
542
&extent_start, &extent_end,
543
EXTENT_DIRTY, NULL))
544
break;
545
546
if (extent_start <= start) {
547
start = extent_end + 1;
548
} else if (extent_start > start && extent_start < end) {
549
size = extent_start - start;
550
ret = btrfs_add_free_space_async_trimmed(block_group,
551
start, size);
552
if (ret)
553
return ret;
554
if (total_added_ret)
555
*total_added_ret += size;
556
start = extent_end + 1;
557
} else {
558
break;
559
}
560
}
561
562
if (start < end) {
563
size = end - start;
564
ret = btrfs_add_free_space_async_trimmed(block_group, start,
565
size);
566
if (ret)
567
return ret;
568
if (total_added_ret)
569
*total_added_ret += size;
570
}
571
572
return 0;
573
}
574
575
/*
576
* Get an arbitrary extent item index / max_index through the block group
577
*
578
* @block_group: the block group to sample from
579
* @index: the integral step through the block group to grab from
580
* @max_index: the granularity of the sampling
581
* @key: return value parameter for the item we find
582
*
583
* Pre-conditions on indices:
584
* 0 <= index <= max_index
585
* 0 < max_index
586
*
587
* Returns: 0 on success, 1 if the search didn't yield a useful item, negative
588
* error code on error.
589
*/
590
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
591
struct btrfs_block_group *block_group,
592
int index, int max_index,
593
struct btrfs_key *found_key)
594
{
595
struct btrfs_fs_info *fs_info = block_group->fs_info;
596
struct btrfs_root *extent_root;
597
u64 search_offset;
598
u64 search_end = block_group->start + block_group->length;
599
BTRFS_PATH_AUTO_FREE(path);
600
struct btrfs_key search_key;
601
int ret = 0;
602
603
ASSERT(index >= 0);
604
ASSERT(index <= max_index);
605
ASSERT(max_index > 0);
606
lockdep_assert_held(&caching_ctl->mutex);
607
lockdep_assert_held_read(&fs_info->commit_root_sem);
608
609
path = btrfs_alloc_path();
610
if (!path)
611
return -ENOMEM;
612
613
extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
614
BTRFS_SUPER_INFO_OFFSET));
615
616
path->skip_locking = true;
617
path->search_commit_root = true;
618
path->reada = READA_FORWARD;
619
620
search_offset = index * div_u64(block_group->length, max_index);
621
search_key.objectid = block_group->start + search_offset;
622
search_key.type = BTRFS_EXTENT_ITEM_KEY;
623
search_key.offset = 0;
624
625
btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
626
/* Success; sampled an extent item in the block group */
627
if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
628
found_key->objectid >= block_group->start &&
629
found_key->objectid + found_key->offset <= search_end)
630
break;
631
632
/* We can't possibly find a valid extent item anymore */
633
if (found_key->objectid >= search_end) {
634
ret = 1;
635
break;
636
}
637
}
638
639
lockdep_assert_held(&caching_ctl->mutex);
640
lockdep_assert_held_read(&fs_info->commit_root_sem);
641
return ret;
642
}
643
644
/*
645
* Best effort attempt to compute a block group's size class while caching it.
646
*
647
* @block_group: the block group we are caching
648
*
649
* We cannot infer the size class while adding free space extents, because that
650
* logic doesn't care about contiguous file extents (it doesn't differentiate
651
* between a 100M extent and 100 contiguous 1M extents). So we need to read the
652
* file extent items. Reading all of them is quite wasteful, because usually
653
* only a handful are enough to give a good answer. Therefore, we just grab 5 of
654
* them at even steps through the block group and pick the smallest size class
655
* we see. Since size class is best effort, and not guaranteed in general,
656
* inaccuracy is acceptable.
657
*
658
* To be more explicit about why this algorithm makes sense:
659
*
660
* If we are caching in a block group from disk, then there are three major cases
661
* to consider:
662
* 1. the block group is well behaved and all extents in it are the same size
663
* class.
664
* 2. the block group is mostly one size class with rare exceptions for last
665
* ditch allocations
666
* 3. the block group was populated before size classes and can have a totally
667
* arbitrary mix of size classes.
668
*
669
* In case 1, looking at any extent in the block group will yield the correct
670
* result. For the mixed cases, taking the minimum size class seems like a good
671
* approximation, since gaps from frees will be usable to the size class. For
672
* 2., a small handful of file extents is likely to yield the right answer. For
673
* 3, we can either read every file extent, or admit that this is best effort
674
* anyway and try to stay fast.
675
*
676
* Returns: 0 on success, negative error code on error.
677
*/
678
static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
679
struct btrfs_block_group *block_group)
680
{
681
struct btrfs_fs_info *fs_info = block_group->fs_info;
682
struct btrfs_key key;
683
int i;
684
u64 min_size = block_group->length;
685
enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
686
int ret;
687
688
if (!btrfs_block_group_should_use_size_class(block_group))
689
return 0;
690
691
lockdep_assert_held(&caching_ctl->mutex);
692
lockdep_assert_held_read(&fs_info->commit_root_sem);
693
for (i = 0; i < 5; ++i) {
694
ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
695
if (ret < 0)
696
goto out;
697
if (ret > 0)
698
continue;
699
min_size = min_t(u64, min_size, key.offset);
700
size_class = btrfs_calc_block_group_size_class(min_size);
701
}
702
if (size_class != BTRFS_BG_SZ_NONE) {
703
spin_lock(&block_group->lock);
704
block_group->size_class = size_class;
705
spin_unlock(&block_group->lock);
706
}
707
out:
708
return ret;
709
}
710
711
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
712
{
713
struct btrfs_block_group *block_group = caching_ctl->block_group;
714
struct btrfs_fs_info *fs_info = block_group->fs_info;
715
struct btrfs_root *extent_root;
716
BTRFS_PATH_AUTO_FREE(path);
717
struct extent_buffer *leaf;
718
struct btrfs_key key;
719
u64 total_found = 0;
720
u64 last = 0;
721
u32 nritems;
722
int ret;
723
bool wakeup = true;
724
725
path = btrfs_alloc_path();
726
if (!path)
727
return -ENOMEM;
728
729
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
730
extent_root = btrfs_extent_root(fs_info, last);
731
732
#ifdef CONFIG_BTRFS_DEBUG
733
/*
734
* If we're fragmenting we don't want to make anybody think we can
735
* allocate from this block group until we've had a chance to fragment
736
* the free space.
737
*/
738
if (btrfs_should_fragment_free_space(block_group))
739
wakeup = false;
740
#endif
741
/*
742
* We don't want to deadlock with somebody trying to allocate a new
743
* extent for the extent root while also trying to search the extent
744
* root to add free space. So we skip locking and search the commit
745
* root, since it's read-only.
746
*/
747
path->skip_locking = true;
748
path->search_commit_root = true;
749
path->reada = READA_FORWARD;
750
751
key.objectid = last;
752
key.type = BTRFS_EXTENT_ITEM_KEY;
753
key.offset = 0;
754
755
next:
756
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
757
if (ret < 0)
758
goto out;
759
760
leaf = path->nodes[0];
761
nritems = btrfs_header_nritems(leaf);
762
763
while (1) {
764
if (btrfs_fs_closing(fs_info) > 1) {
765
last = (u64)-1;
766
break;
767
}
768
769
if (path->slots[0] < nritems) {
770
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
771
} else {
772
ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
773
if (ret)
774
break;
775
776
if (need_resched() ||
777
rwsem_is_contended(&fs_info->commit_root_sem)) {
778
btrfs_release_path(path);
779
up_read(&fs_info->commit_root_sem);
780
mutex_unlock(&caching_ctl->mutex);
781
cond_resched();
782
mutex_lock(&caching_ctl->mutex);
783
down_read(&fs_info->commit_root_sem);
784
goto next;
785
}
786
787
ret = btrfs_next_leaf(extent_root, path);
788
if (ret < 0)
789
goto out;
790
if (ret)
791
break;
792
leaf = path->nodes[0];
793
nritems = btrfs_header_nritems(leaf);
794
continue;
795
}
796
797
if (key.objectid < last) {
798
key.objectid = last;
799
key.type = BTRFS_EXTENT_ITEM_KEY;
800
key.offset = 0;
801
btrfs_release_path(path);
802
goto next;
803
}
804
805
if (key.objectid < block_group->start) {
806
path->slots[0]++;
807
continue;
808
}
809
810
if (key.objectid >= block_group->start + block_group->length)
811
break;
812
813
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
814
key.type == BTRFS_METADATA_ITEM_KEY) {
815
u64 space_added;
816
817
ret = btrfs_add_new_free_space(block_group, last,
818
key.objectid, &space_added);
819
if (ret)
820
goto out;
821
total_found += space_added;
822
if (key.type == BTRFS_METADATA_ITEM_KEY)
823
last = key.objectid +
824
fs_info->nodesize;
825
else
826
last = key.objectid + key.offset;
827
828
if (total_found > CACHING_CTL_WAKE_UP) {
829
total_found = 0;
830
if (wakeup) {
831
atomic_inc(&caching_ctl->progress);
832
wake_up(&caching_ctl->wait);
833
}
834
}
835
}
836
path->slots[0]++;
837
}
838
839
ret = btrfs_add_new_free_space(block_group, last,
840
block_group->start + block_group->length,
841
NULL);
842
out:
843
return ret;
844
}
845
846
static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
847
{
848
btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
849
bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
850
}
851
852
static noinline void caching_thread(struct btrfs_work *work)
853
{
854
struct btrfs_block_group *block_group;
855
struct btrfs_fs_info *fs_info;
856
struct btrfs_caching_control *caching_ctl;
857
int ret;
858
859
caching_ctl = container_of(work, struct btrfs_caching_control, work);
860
block_group = caching_ctl->block_group;
861
fs_info = block_group->fs_info;
862
863
mutex_lock(&caching_ctl->mutex);
864
down_read(&fs_info->commit_root_sem);
865
866
load_block_group_size_class(caching_ctl, block_group);
867
if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
868
ret = load_free_space_cache(block_group);
869
if (ret == 1) {
870
ret = 0;
871
goto done;
872
}
873
874
/*
875
* We failed to load the space cache, set ourselves to
876
* CACHE_STARTED and carry on.
877
*/
878
spin_lock(&block_group->lock);
879
block_group->cached = BTRFS_CACHE_STARTED;
880
spin_unlock(&block_group->lock);
881
wake_up(&caching_ctl->wait);
882
}
883
884
/*
885
* If we are in the transaction that populated the free space tree we
886
* can't actually cache from the free space tree as our commit root and
887
* real root are the same, so we could change the contents of the blocks
888
* while caching. Instead do the slow caching in this case, and after
889
* the transaction has committed we will be safe.
890
*/
891
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
892
!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
893
ret = btrfs_load_free_space_tree(caching_ctl);
894
else
895
ret = load_extent_tree_free(caching_ctl);
896
done:
897
spin_lock(&block_group->lock);
898
block_group->caching_ctl = NULL;
899
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
900
spin_unlock(&block_group->lock);
901
902
#ifdef CONFIG_BTRFS_DEBUG
903
if (btrfs_should_fragment_free_space(block_group)) {
904
u64 bytes_used;
905
906
spin_lock(&block_group->space_info->lock);
907
spin_lock(&block_group->lock);
908
bytes_used = block_group->length - block_group->used;
909
block_group->space_info->bytes_used += bytes_used >> 1;
910
spin_unlock(&block_group->lock);
911
spin_unlock(&block_group->space_info->lock);
912
fragment_free_space(block_group);
913
}
914
#endif
915
916
up_read(&fs_info->commit_root_sem);
917
btrfs_free_excluded_extents(block_group);
918
mutex_unlock(&caching_ctl->mutex);
919
920
wake_up(&caching_ctl->wait);
921
922
btrfs_put_caching_control(caching_ctl);
923
btrfs_put_block_group(block_group);
924
}
925
926
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
927
{
928
struct btrfs_fs_info *fs_info = cache->fs_info;
929
struct btrfs_caching_control *caching_ctl = NULL;
930
int ret = 0;
931
932
/* Allocator for zoned filesystems does not use the cache at all */
933
if (btrfs_is_zoned(fs_info))
934
return 0;
935
936
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
937
if (!caching_ctl)
938
return -ENOMEM;
939
940
INIT_LIST_HEAD(&caching_ctl->list);
941
mutex_init(&caching_ctl->mutex);
942
init_waitqueue_head(&caching_ctl->wait);
943
caching_ctl->block_group = cache;
944
refcount_set(&caching_ctl->count, 2);
945
atomic_set(&caching_ctl->progress, 0);
946
btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
947
948
spin_lock(&cache->lock);
949
if (cache->cached != BTRFS_CACHE_NO) {
950
kfree(caching_ctl);
951
952
caching_ctl = cache->caching_ctl;
953
if (caching_ctl)
954
refcount_inc(&caching_ctl->count);
955
spin_unlock(&cache->lock);
956
goto out;
957
}
958
WARN_ON(cache->caching_ctl);
959
cache->caching_ctl = caching_ctl;
960
cache->cached = BTRFS_CACHE_STARTED;
961
spin_unlock(&cache->lock);
962
963
write_lock(&fs_info->block_group_cache_lock);
964
refcount_inc(&caching_ctl->count);
965
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
966
write_unlock(&fs_info->block_group_cache_lock);
967
968
btrfs_get_block_group(cache);
969
970
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
971
out:
972
if (wait && caching_ctl)
973
ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
974
if (caching_ctl)
975
btrfs_put_caching_control(caching_ctl);
976
977
return ret;
978
}
979
980
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
981
{
982
u64 extra_flags = chunk_to_extended(flags) &
983
BTRFS_EXTENDED_PROFILE_MASK;
984
985
write_seqlock(&fs_info->profiles_lock);
986
if (flags & BTRFS_BLOCK_GROUP_DATA)
987
fs_info->avail_data_alloc_bits &= ~extra_flags;
988
if (flags & BTRFS_BLOCK_GROUP_METADATA)
989
fs_info->avail_metadata_alloc_bits &= ~extra_flags;
990
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
991
fs_info->avail_system_alloc_bits &= ~extra_flags;
992
write_sequnlock(&fs_info->profiles_lock);
993
}
994
995
/*
996
* Clear incompat bits for the following feature(s):
997
*
998
* - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
999
* in the whole filesystem
1000
*
1001
* - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
1002
*/
1003
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
1004
{
1005
bool found_raid56 = false;
1006
bool found_raid1c34 = false;
1007
1008
if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
1009
(flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
1010
(flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
1011
struct list_head *head = &fs_info->space_info;
1012
struct btrfs_space_info *sinfo;
1013
1014
list_for_each_entry_rcu(sinfo, head, list) {
1015
down_read(&sinfo->groups_sem);
1016
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
1017
found_raid56 = true;
1018
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
1019
found_raid56 = true;
1020
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
1021
found_raid1c34 = true;
1022
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
1023
found_raid1c34 = true;
1024
up_read(&sinfo->groups_sem);
1025
}
1026
if (!found_raid56)
1027
btrfs_clear_fs_incompat(fs_info, RAID56);
1028
if (!found_raid1c34)
1029
btrfs_clear_fs_incompat(fs_info, RAID1C34);
1030
}
1031
}
1032
1033
static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
1034
{
1035
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
1036
return fs_info->block_group_root;
1037
return btrfs_extent_root(fs_info, 0);
1038
}
1039
1040
static int remove_block_group_item(struct btrfs_trans_handle *trans,
1041
struct btrfs_path *path,
1042
struct btrfs_block_group *block_group)
1043
{
1044
struct btrfs_fs_info *fs_info = trans->fs_info;
1045
struct btrfs_root *root;
1046
struct btrfs_key key;
1047
int ret;
1048
1049
root = btrfs_block_group_root(fs_info);
1050
key.objectid = block_group->start;
1051
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1052
key.offset = block_group->length;
1053
1054
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1055
if (ret > 0)
1056
ret = -ENOENT;
1057
if (ret < 0)
1058
return ret;
1059
1060
ret = btrfs_del_item(trans, root, path);
1061
return ret;
1062
}
1063
1064
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1065
struct btrfs_chunk_map *map)
1066
{
1067
struct btrfs_fs_info *fs_info = trans->fs_info;
1068
BTRFS_PATH_AUTO_FREE(path);
1069
struct btrfs_block_group *block_group;
1070
struct btrfs_free_cluster *cluster;
1071
struct inode *inode;
1072
struct kobject *kobj = NULL;
1073
int ret;
1074
int index;
1075
int factor;
1076
struct btrfs_caching_control *caching_ctl = NULL;
1077
bool remove_map;
1078
bool remove_rsv = false;
1079
1080
block_group = btrfs_lookup_block_group(fs_info, map->start);
1081
if (!block_group)
1082
return -ENOENT;
1083
1084
BUG_ON(!block_group->ro);
1085
1086
trace_btrfs_remove_block_group(block_group);
1087
/*
1088
* Free the reserved super bytes from this block group before
1089
* removing it.
1090
*/
1091
btrfs_free_excluded_extents(block_group);
1092
btrfs_free_ref_tree_range(fs_info, block_group->start,
1093
block_group->length);
1094
1095
index = btrfs_bg_flags_to_raid_index(block_group->flags);
1096
factor = btrfs_bg_type_to_factor(block_group->flags);
1097
1098
/* make sure this block group isn't part of an allocation cluster */
1099
cluster = &fs_info->data_alloc_cluster;
1100
spin_lock(&cluster->refill_lock);
1101
btrfs_return_cluster_to_free_space(block_group, cluster);
1102
spin_unlock(&cluster->refill_lock);
1103
1104
/*
1105
* make sure this block group isn't part of a metadata
1106
* allocation cluster
1107
*/
1108
cluster = &fs_info->meta_alloc_cluster;
1109
spin_lock(&cluster->refill_lock);
1110
btrfs_return_cluster_to_free_space(block_group, cluster);
1111
spin_unlock(&cluster->refill_lock);
1112
1113
btrfs_clear_treelog_bg(block_group);
1114
btrfs_clear_data_reloc_bg(block_group);
1115
1116
path = btrfs_alloc_path();
1117
if (!path) {
1118
ret = -ENOMEM;
1119
goto out;
1120
}
1121
1122
/*
1123
* get the inode first so any iput calls done for the io_list
1124
* aren't the final iput (no unlinks allowed now)
1125
*/
1126
inode = lookup_free_space_inode(block_group, path);
1127
1128
mutex_lock(&trans->transaction->cache_write_mutex);
1129
/*
1130
* Make sure our free space cache IO is done before removing the
1131
* free space inode
1132
*/
1133
spin_lock(&trans->transaction->dirty_bgs_lock);
1134
if (!list_empty(&block_group->io_list)) {
1135
list_del_init(&block_group->io_list);
1136
1137
WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
1138
1139
spin_unlock(&trans->transaction->dirty_bgs_lock);
1140
btrfs_wait_cache_io(trans, block_group, path);
1141
btrfs_put_block_group(block_group);
1142
spin_lock(&trans->transaction->dirty_bgs_lock);
1143
}
1144
1145
if (!list_empty(&block_group->dirty_list)) {
1146
list_del_init(&block_group->dirty_list);
1147
remove_rsv = true;
1148
btrfs_put_block_group(block_group);
1149
}
1150
spin_unlock(&trans->transaction->dirty_bgs_lock);
1151
mutex_unlock(&trans->transaction->cache_write_mutex);
1152
1153
ret = btrfs_remove_free_space_inode(trans, inode, block_group);
1154
if (ret)
1155
goto out;
1156
1157
write_lock(&fs_info->block_group_cache_lock);
1158
rb_erase_cached(&block_group->cache_node,
1159
&fs_info->block_group_cache_tree);
1160
RB_CLEAR_NODE(&block_group->cache_node);
1161
1162
/* Once for the block groups rbtree */
1163
btrfs_put_block_group(block_group);
1164
1165
write_unlock(&fs_info->block_group_cache_lock);
1166
1167
down_write(&block_group->space_info->groups_sem);
1168
/*
1169
* we must use list_del_init so people can check to see if they
1170
* are still on the list after taking the semaphore
1171
*/
1172
list_del_init(&block_group->list);
1173
if (list_empty(&block_group->space_info->block_groups[index])) {
1174
kobj = block_group->space_info->block_group_kobjs[index];
1175
block_group->space_info->block_group_kobjs[index] = NULL;
1176
clear_avail_alloc_bits(fs_info, block_group->flags);
1177
}
1178
up_write(&block_group->space_info->groups_sem);
1179
clear_incompat_bg_bits(fs_info, block_group->flags);
1180
if (kobj) {
1181
kobject_del(kobj);
1182
kobject_put(kobj);
1183
}
1184
1185
if (block_group->cached == BTRFS_CACHE_STARTED)
1186
btrfs_wait_block_group_cache_done(block_group);
1187
1188
write_lock(&fs_info->block_group_cache_lock);
1189
caching_ctl = btrfs_get_caching_control(block_group);
1190
if (!caching_ctl) {
1191
struct btrfs_caching_control *ctl;
1192
1193
list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
1194
if (ctl->block_group == block_group) {
1195
caching_ctl = ctl;
1196
refcount_inc(&caching_ctl->count);
1197
break;
1198
}
1199
}
1200
}
1201
if (caching_ctl)
1202
list_del_init(&caching_ctl->list);
1203
write_unlock(&fs_info->block_group_cache_lock);
1204
1205
if (caching_ctl) {
1206
/* Once for the caching bgs list and once for us. */
1207
btrfs_put_caching_control(caching_ctl);
1208
btrfs_put_caching_control(caching_ctl);
1209
}
1210
1211
spin_lock(&trans->transaction->dirty_bgs_lock);
1212
WARN_ON(!list_empty(&block_group->dirty_list));
1213
WARN_ON(!list_empty(&block_group->io_list));
1214
spin_unlock(&trans->transaction->dirty_bgs_lock);
1215
1216
btrfs_remove_free_space_cache(block_group);
1217
1218
spin_lock(&block_group->space_info->lock);
1219
list_del_init(&block_group->ro_list);
1220
1221
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1222
WARN_ON(block_group->space_info->total_bytes
1223
< block_group->length);
1224
WARN_ON(block_group->space_info->bytes_readonly
1225
< block_group->length - block_group->zone_unusable);
1226
WARN_ON(block_group->space_info->bytes_zone_unusable
1227
< block_group->zone_unusable);
1228
WARN_ON(block_group->space_info->disk_total
1229
< block_group->length * factor);
1230
}
1231
block_group->space_info->total_bytes -= block_group->length;
1232
block_group->space_info->bytes_readonly -=
1233
(block_group->length - block_group->zone_unusable);
1234
btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
1235
-block_group->zone_unusable);
1236
block_group->space_info->disk_total -= block_group->length * factor;
1237
1238
spin_unlock(&block_group->space_info->lock);
1239
1240
/*
1241
* Remove the free space for the block group from the free space tree
1242
* and the block group's item from the extent tree before marking the
1243
* block group as removed. This is to prevent races with tasks that
1244
* freeze and unfreeze a block group, this task and another task
1245
* allocating a new block group - the unfreeze task ends up removing
1246
* the block group's extent map before the task calling this function
1247
* deletes the block group item from the extent tree, allowing for
1248
* another task to attempt to create another block group with the same
1249
* item key (and failing with -EEXIST and a transaction abort).
1250
*/
1251
ret = btrfs_remove_block_group_free_space(trans, block_group);
1252
if (ret)
1253
goto out;
1254
1255
ret = remove_block_group_item(trans, path, block_group);
1256
if (ret < 0)
1257
goto out;
1258
1259
spin_lock(&block_group->lock);
1260
/*
1261
* Hitting this WARN means we removed a block group with an unwritten
1262
* region. It will cause "unable to find chunk map for logical" errors.
1263
*/
1264
if (WARN_ON(has_unwritten_metadata(block_group)))
1265
btrfs_warn(fs_info,
1266
"block group %llu is removed before metadata write out",
1267
block_group->start);
1268
1269
set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
1270
1271
/*
1272
* At this point trimming or scrub can't start on this block group,
1273
* because we removed the block group from the rbtree
1274
* fs_info->block_group_cache_tree so no one can find it anymore and
1275
* even if someone already got this block group before we removed it
1276
* from the rbtree, they have already incremented block_group->frozen -
1277
* if they didn't, for the trimming case they won't find any free space
1278
* entries because we already removed them all when we called
1279
* btrfs_remove_free_space_cache().
1280
*
1281
* And we must not remove the chunk map from the fs_info->mapping_tree
1282
* to prevent the same logical address range and physical device space
1283
* ranges from being reused for a new block group. This is needed to
1284
* avoid races with trimming and scrub.
1285
*
1286
* An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1287
* completely transactionless, so while it is trimming a range the
1288
* currently running transaction might finish and a new one start,
1289
* allowing for new block groups to be created that can reuse the same
1290
* physical device locations unless we take this special care.
1291
*
1292
* There may also be an implicit trim operation if the file system
1293
* is mounted with -odiscard. The same protections must remain
1294
* in place until the extents have been discarded completely when
1295
* the transaction commit has completed.
1296
*/
1297
remove_map = (atomic_read(&block_group->frozen) == 0);
1298
spin_unlock(&block_group->lock);
1299
1300
if (remove_map)
1301
btrfs_remove_chunk_map(fs_info, map);
1302
1303
out:
1304
/* Once for the lookup reference */
1305
btrfs_put_block_group(block_group);
1306
if (remove_rsv)
1307
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
1308
return ret;
1309
}
1310
1311
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1312
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1313
{
1314
struct btrfs_root *root = btrfs_block_group_root(fs_info);
1315
struct btrfs_chunk_map *map;
1316
unsigned int num_items;
1317
1318
map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
1319
ASSERT(map != NULL);
1320
ASSERT(map->start == chunk_offset);
1321
1322
/*
1323
* We need to reserve 3 + N units from the metadata space info in order
1324
* to remove a block group (done at btrfs_remove_chunk() and at
1325
* btrfs_remove_block_group()), which are used for:
1326
*
1327
* 1 unit for adding the free space inode's orphan (located in the tree
1328
* of tree roots).
1329
* 1 unit for deleting the block group item (located in the extent
1330
* tree).
1331
* 1 unit for deleting the free space item (located in tree of tree
1332
* roots).
1333
* N units for deleting N device extent items corresponding to each
1334
* stripe (located in the device tree).
1335
*
1336
* In order to remove a block group we also need to reserve units in the
1337
* system space info in order to update the chunk tree (update one or
1338
* more device items and remove one chunk item), but this is done at
1339
* btrfs_remove_chunk() through a call to check_system_chunk().
1340
*/
1341
num_items = 3 + map->num_stripes;
1342
btrfs_free_chunk_map(map);
1343
1344
return btrfs_start_transaction_fallback_global_rsv(root, num_items);
1345
}
1346
1347
/*
1348
* Mark block group @cache read-only, so later write won't happen to block
1349
* group @cache.
1350
*
1351
* If @force is not set, this function will only mark the block group readonly
1352
* if we have enough free space (1M) in other metadata/system block groups.
1353
* If @force is set, this function will mark the block group readonly
1354
* without checking free space.
1355
*
1356
* NOTE: This function doesn't care if other block groups can contain all the
1357
* data in this block group. That check should be done by relocation routine,
1358
* not this function.
1359
*/
1360
static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
1361
{
1362
struct btrfs_space_info *sinfo = cache->space_info;
1363
u64 num_bytes;
1364
int ret = -ENOSPC;
1365
1366
spin_lock(&sinfo->lock);
1367
spin_lock(&cache->lock);
1368
1369
if (cache->swap_extents) {
1370
ret = -ETXTBSY;
1371
goto out;
1372
}
1373
1374
if (cache->ro) {
1375
cache->ro++;
1376
ret = 0;
1377
goto out;
1378
}
1379
1380
num_bytes = cache->length - cache->reserved - cache->pinned -
1381
cache->bytes_super - cache->zone_unusable - cache->used;
1382
1383
/*
1384
* Data never overcommits, even in mixed mode, so do just the straight
1385
* check of left over space in how much we have allocated.
1386
*/
1387
if (force) {
1388
ret = 0;
1389
} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1390
u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1391
1392
/*
1393
* Here we make sure if we mark this bg RO, we still have enough
1394
* free space as a buffer.
1395
*/
1396
if (sinfo_used + num_bytes <= sinfo->total_bytes)
1397
ret = 0;
1398
} else {
1399
/*
1400
* We overcommit metadata, so we need to do the
1401
* btrfs_can_overcommit check here, and we need to pass in
1402
* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1403
* leeway to allow us to mark this block group as read only.
1404
*/
1405
if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH))
1406
ret = 0;
1407
}
1408
1409
if (!ret) {
1410
sinfo->bytes_readonly += num_bytes;
1411
if (btrfs_is_zoned(cache->fs_info)) {
1412
/* Migrate zone_unusable bytes to readonly */
1413
sinfo->bytes_readonly += cache->zone_unusable;
1414
btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
1415
cache->zone_unusable = 0;
1416
}
1417
cache->ro++;
1418
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1419
}
1420
out:
1421
spin_unlock(&cache->lock);
1422
spin_unlock(&sinfo->lock);
1423
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1424
btrfs_info(cache->fs_info,
1425
"unable to make block group %llu ro", cache->start);
1426
btrfs_dump_space_info(cache->space_info, 0, false);
1427
}
1428
return ret;
1429
}
1430
1431
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1432
const struct btrfs_block_group *bg)
1433
{
1434
struct btrfs_fs_info *fs_info = trans->fs_info;
1435
struct btrfs_transaction *prev_trans = NULL;
1436
const u64 start = bg->start;
1437
const u64 end = start + bg->length - 1;
1438
int ret;
1439
1440
spin_lock(&fs_info->trans_lock);
1441
if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) {
1442
prev_trans = list_prev_entry(trans->transaction, list);
1443
refcount_inc(&prev_trans->use_count);
1444
}
1445
spin_unlock(&fs_info->trans_lock);
1446
1447
/*
1448
* Hold the unused_bg_unpin_mutex lock to avoid racing with
1449
* btrfs_finish_extent_commit(). If we are at transaction N, another
1450
* task might be running finish_extent_commit() for the previous
1451
* transaction N - 1, and have seen a range belonging to the block
1452
* group in pinned_extents before we were able to clear the whole block
1453
* group range from pinned_extents. This means that task can look up
1454
* the block group after we unpinned it from pinned_extents and removed
1455
* it, leading to an error at unpin_extent_range().
1456
*/
1457
mutex_lock(&fs_info->unused_bg_unpin_mutex);
1458
if (prev_trans) {
1459
ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
1460
EXTENT_DIRTY, NULL);
1461
if (ret)
1462
goto out;
1463
}
1464
1465
ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
1466
EXTENT_DIRTY, NULL);
1467
out:
1468
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1469
if (prev_trans)
1470
btrfs_put_transaction(prev_trans);
1471
1472
return ret == 0;
1473
}
1474
1475
/*
1476
* Link the block_group to a list via bg_list.
1477
*
1478
* @bg: The block_group to link to the list.
1479
* @list: The list to link it to.
1480
*
1481
* Use this rather than list_add_tail() directly to ensure proper respect
1482
* to locking and refcounting.
1483
*
1484
* Returns: true if the bg was linked with a refcount bump and false otherwise.
1485
*/
1486
static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
1487
{
1488
struct btrfs_fs_info *fs_info = bg->fs_info;
1489
bool added = false;
1490
1491
spin_lock(&fs_info->unused_bgs_lock);
1492
if (list_empty(&bg->bg_list)) {
1493
btrfs_get_block_group(bg);
1494
list_add_tail(&bg->bg_list, list);
1495
added = true;
1496
}
1497
spin_unlock(&fs_info->unused_bgs_lock);
1498
return added;
1499
}
1500
1501
/*
1502
* Process the unused_bgs list and remove any that don't have any allocated
1503
* space inside of them.
1504
*/
1505
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1506
{
1507
LIST_HEAD(retry_list);
1508
struct btrfs_block_group *block_group;
1509
struct btrfs_space_info *space_info;
1510
struct btrfs_trans_handle *trans;
1511
const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1512
int ret = 0;
1513
1514
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1515
return;
1516
1517
if (btrfs_fs_closing(fs_info))
1518
return;
1519
1520
/*
1521
* Long running balances can keep us blocked here for eternity, so
1522
* simply skip deletion if we're unable to get the mutex.
1523
*/
1524
if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
1525
return;
1526
1527
spin_lock(&fs_info->unused_bgs_lock);
1528
while (!list_empty(&fs_info->unused_bgs)) {
1529
u64 used;
1530
int trimming;
1531
1532
block_group = list_first_entry(&fs_info->unused_bgs,
1533
struct btrfs_block_group,
1534
bg_list);
1535
list_del_init(&block_group->bg_list);
1536
1537
space_info = block_group->space_info;
1538
1539
if (ret || btrfs_mixed_space_info(space_info)) {
1540
btrfs_put_block_group(block_group);
1541
continue;
1542
}
1543
spin_unlock(&fs_info->unused_bgs_lock);
1544
1545
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1546
1547
/* Don't want to race with allocators so take the groups_sem */
1548
down_write(&space_info->groups_sem);
1549
1550
/*
1551
* Async discard moves the final block group discard to be prior
1552
* to the unused_bgs code path. Therefore, if it's not fully
1553
* trimmed, punt it back to the async discard lists.
1554
*/
1555
if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1556
!btrfs_is_free_space_trimmed(block_group)) {
1557
trace_btrfs_skip_unused_block_group(block_group);
1558
up_write(&space_info->groups_sem);
1559
/* Requeue if we failed because of async discard */
1560
btrfs_discard_queue_work(&fs_info->discard_ctl,
1561
block_group);
1562
goto next;
1563
}
1564
1565
spin_lock(&space_info->lock);
1566
spin_lock(&block_group->lock);
1567
if (btrfs_is_block_group_used(block_group) || block_group->ro ||
1568
list_is_singular(&block_group->list)) {
1569
/*
1570
* We want to bail if we made new allocations or have
1571
* outstanding allocations in this block group. We do
1572
* the ro check in case balance is currently acting on
1573
* this block group.
1574
*
1575
* Also bail out if this is the only block group for its
1576
* type, because otherwise we would lose profile
1577
* information from fs_info->avail_*_alloc_bits and the
1578
* next block group of this type would be created with a
1579
* "single" profile (even if we're in a raid fs) because
1580
* fs_info->avail_*_alloc_bits would be 0.
1581
*/
1582
trace_btrfs_skip_unused_block_group(block_group);
1583
spin_unlock(&block_group->lock);
1584
spin_unlock(&space_info->lock);
1585
up_write(&space_info->groups_sem);
1586
goto next;
1587
}
1588
1589
/*
1590
* The block group may be unused but there may be space reserved
1591
* accounting with the existence of that block group, that is,
1592
* space_info->bytes_may_use was incremented by a task but no
1593
* space was yet allocated from the block group by the task.
1594
* That space may or may not be allocated, as we are generally
1595
* pessimistic about space reservation for metadata as well as
1596
* for data when using compression (as we reserve space based on
1597
* the worst case, when data can't be compressed, and before
1598
* actually attempting compression, before starting writeback).
1599
*
1600
* So check if the total space of the space_info minus the size
1601
* of this block group is less than the used space of the
1602
* space_info - if that's the case, then it means we have tasks
1603
* that might be relying on the block group in order to allocate
1604
* extents, and add back the block group to the unused list when
1605
* we finish, so that we retry later in case no tasks ended up
1606
* needing to allocate extents from the block group.
1607
*/
1608
used = btrfs_space_info_used(space_info, true);
1609
if ((space_info->total_bytes - block_group->length < used &&
1610
block_group->zone_unusable < block_group->length) ||
1611
has_unwritten_metadata(block_group)) {
1612
/*
1613
* Add a reference for the list, compensate for the ref
1614
* drop under the "next" label for the
1615
* fs_info->unused_bgs list.
1616
*/
1617
btrfs_link_bg_list(block_group, &retry_list);
1618
1619
trace_btrfs_skip_unused_block_group(block_group);
1620
spin_unlock(&block_group->lock);
1621
spin_unlock(&space_info->lock);
1622
up_write(&space_info->groups_sem);
1623
goto next;
1624
}
1625
1626
spin_unlock(&block_group->lock);
1627
spin_unlock(&space_info->lock);
1628
1629
/* We don't want to force the issue, only flip if it's ok. */
1630
ret = inc_block_group_ro(block_group, 0);
1631
up_write(&space_info->groups_sem);
1632
if (ret < 0) {
1633
ret = 0;
1634
goto next;
1635
}
1636
1637
ret = btrfs_zone_finish(block_group);
1638
if (ret < 0) {
1639
btrfs_dec_block_group_ro(block_group);
1640
if (ret == -EAGAIN) {
1641
btrfs_link_bg_list(block_group, &retry_list);
1642
ret = 0;
1643
}
1644
goto next;
1645
}
1646
1647
/*
1648
* Want to do this before we do anything else so we can recover
1649
* properly if we fail to join the transaction.
1650
*/
1651
trans = btrfs_start_trans_remove_block_group(fs_info,
1652
block_group->start);
1653
if (IS_ERR(trans)) {
1654
btrfs_dec_block_group_ro(block_group);
1655
ret = PTR_ERR(trans);
1656
goto next;
1657
}
1658
1659
/*
1660
* We could have pending pinned extents for this block group,
1661
* just delete them, we don't care about them anymore.
1662
*/
1663
if (!clean_pinned_extents(trans, block_group)) {
1664
btrfs_dec_block_group_ro(block_group);
1665
goto end_trans;
1666
}
1667
1668
/*
1669
* At this point, the block_group is read only and should fail
1670
* new allocations. However, btrfs_finish_extent_commit() can
1671
* cause this block_group to be placed back on the discard
1672
* lists because now the block_group isn't fully discarded.
1673
* Bail here and try again later after discarding everything.
1674
*/
1675
spin_lock(&fs_info->discard_ctl.lock);
1676
if (!list_empty(&block_group->discard_list)) {
1677
spin_unlock(&fs_info->discard_ctl.lock);
1678
btrfs_dec_block_group_ro(block_group);
1679
btrfs_discard_queue_work(&fs_info->discard_ctl,
1680
block_group);
1681
goto end_trans;
1682
}
1683
spin_unlock(&fs_info->discard_ctl.lock);
1684
1685
/* Reset pinned so btrfs_put_block_group doesn't complain */
1686
spin_lock(&space_info->lock);
1687
spin_lock(&block_group->lock);
1688
1689
btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);
1690
space_info->bytes_readonly += block_group->pinned;
1691
block_group->pinned = 0;
1692
1693
spin_unlock(&block_group->lock);
1694
spin_unlock(&space_info->lock);
1695
1696
/*
1697
* Normally an unused block group is passed here, and
1698
* then trimming is handled in the transaction commit path.
1699
* Async discard interposes before this to do the trimming
1700
* before coming down the unused block group path as trimming
1701
* will no longer be done later in the transaction commit path.
1702
*/
1703
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1704
goto flip_async;
1705
1706
/*
1707
* DISCARD can flip during remount. On zoned filesystems, we
1708
* need to reset sequential-required zones.
1709
*/
1710
trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
1711
btrfs_is_zoned(fs_info);
1712
1713
/* Implicit trim during transaction commit. */
1714
if (trimming)
1715
btrfs_freeze_block_group(block_group);
1716
1717
/*
1718
* btrfs_remove_chunk() will abort the transaction if things go
1719
* horribly wrong.
1720
*/
1721
ret = btrfs_remove_chunk(trans, block_group->start);
1722
1723
if (ret) {
1724
if (trimming)
1725
btrfs_unfreeze_block_group(block_group);
1726
goto end_trans;
1727
}
1728
1729
/*
1730
* If we're not mounted with -odiscard, we can just forget
1731
* about this block group. Otherwise we'll need to wait
1732
* until transaction commit to do the actual discard.
1733
*/
1734
if (trimming) {
1735
spin_lock(&fs_info->unused_bgs_lock);
1736
/*
1737
* A concurrent scrub might have added us to the list
1738
* fs_info->unused_bgs, so use a list_move operation
1739
* to add the block group to the deleted_bgs list.
1740
*/
1741
list_move(&block_group->bg_list,
1742
&trans->transaction->deleted_bgs);
1743
spin_unlock(&fs_info->unused_bgs_lock);
1744
btrfs_get_block_group(block_group);
1745
}
1746
end_trans:
1747
btrfs_end_transaction(trans);
1748
next:
1749
btrfs_put_block_group(block_group);
1750
spin_lock(&fs_info->unused_bgs_lock);
1751
}
1752
list_splice_tail(&retry_list, &fs_info->unused_bgs);
1753
spin_unlock(&fs_info->unused_bgs_lock);
1754
mutex_unlock(&fs_info->reclaim_bgs_lock);
1755
return;
1756
1757
flip_async:
1758
btrfs_end_transaction(trans);
1759
spin_lock(&fs_info->unused_bgs_lock);
1760
list_splice_tail(&retry_list, &fs_info->unused_bgs);
1761
spin_unlock(&fs_info->unused_bgs_lock);
1762
mutex_unlock(&fs_info->reclaim_bgs_lock);
1763
btrfs_put_block_group(block_group);
1764
btrfs_discard_punt_unused_bgs_list(fs_info);
1765
}
1766
1767
void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1768
{
1769
struct btrfs_fs_info *fs_info = bg->fs_info;
1770
1771
spin_lock(&fs_info->unused_bgs_lock);
1772
if (list_empty(&bg->bg_list)) {
1773
btrfs_get_block_group(bg);
1774
trace_btrfs_add_unused_block_group(bg);
1775
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1776
} else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
1777
/* Pull out the block group from the reclaim_bgs list. */
1778
trace_btrfs_add_unused_block_group(bg);
1779
list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
1780
}
1781
spin_unlock(&fs_info->unused_bgs_lock);
1782
}
1783
1784
/*
1785
* We want block groups with a low number of used bytes to be at the beginning
1786
* of the list, so they will get reclaimed first.
1787
*/
1788
static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1789
const struct list_head *b)
1790
{
1791
const struct btrfs_block_group *bg1, *bg2;
1792
1793
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1794
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1795
1796
/*
1797
* Some other task may be updating the ->used field concurrently, but it
1798
* is not serious if we get a stale value or load/store tearing issues,
1799
* as sorting the list of block groups to reclaim is not critical and an
1800
* occasional imperfect order is ok. So silence KCSAN and avoid the
1801
* overhead of locking or any other synchronization.
1802
*/
1803
return data_race(bg1->used > bg2->used);
1804
}
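/*
 * Illustrative note (hypothetical numbers): list_sort() treats a positive
 * return value from the comparator as "a sorts after b", so returning
 * (bg1->used > bg2->used) orders the list by ascending ->used. Block groups
 * with 20 MiB, 5 MiB and 10 MiB used end up ordered as 5 MiB, 10 MiB,
 * 20 MiB, so the emptiest group is considered for reclaim first.
 */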
1805
1806
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
1807
{
1808
if (btrfs_is_zoned(fs_info))
1809
return btrfs_zoned_should_reclaim(fs_info);
1810
return true;
1811
}
1812
1813
static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed)
1814
{
1815
const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
1816
u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
1817
const u64 new_val = bg->used;
1818
const u64 old_val = new_val + bytes_freed;
1819
1820
if (thresh_bytes == 0)
1821
return false;
1822
1823
/*
1824
* If we were below the threshold before don't reclaim, we are likely a
1825
* brand new block group and we don't want to relocate new block groups.
1826
*/
1827
if (old_val < thresh_bytes)
1828
return false;
1829
if (new_val >= thresh_bytes)
1830
return false;
1831
return true;
1832
}
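/*
 * Worked example with hypothetical numbers: for a 1 GiB block group and a
 * reclaim threshold of 75%, thresh_bytes is 768 MiB. Freeing 100 MiB from a
 * group that had 800 MiB used (old_val = 800 MiB >= thresh, new_val =
 * 700 MiB < thresh) makes it a reclaim candidate, while a group that was
 * already below 768 MiB before the free, or one still at or above the
 * threshold afterwards, is left alone.
 */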
1833
1834
void btrfs_reclaim_bgs_work(struct work_struct *work)
1835
{
1836
struct btrfs_fs_info *fs_info =
1837
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1838
struct btrfs_block_group *bg;
1839
struct btrfs_space_info *space_info;
1840
LIST_HEAD(retry_list);
1841
1842
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1843
return;
1844
1845
if (btrfs_fs_closing(fs_info))
1846
return;
1847
1848
if (!btrfs_should_reclaim(fs_info))
1849
return;
1850
1851
guard(super_write)(fs_info->sb);
1852
1853
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
1854
return;
1855
1856
/*
1857
* Long-running balances can keep us blocked here for eternity, so
1858
* simply skip reclaim if we're unable to get the mutex.
1859
*/
1860
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1861
btrfs_exclop_finish(fs_info);
1862
return;
1863
}
1864
1865
spin_lock(&fs_info->unused_bgs_lock);
1866
/*
1867
* Sort happens under lock because we can't simply splice it and sort.
1868
* The block groups might still be in use and reachable via bg_list,
1869
* and their presence in the reclaim_bgs list must be preserved.
1870
*/
1871
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
1872
while (!list_empty(&fs_info->reclaim_bgs)) {
1873
u64 used;
1874
u64 reserved;
1875
int ret = 0;
1876
1877
bg = list_first_entry(&fs_info->reclaim_bgs,
1878
struct btrfs_block_group,
1879
bg_list);
1880
list_del_init(&bg->bg_list);
1881
1882
space_info = bg->space_info;
1883
spin_unlock(&fs_info->unused_bgs_lock);
1884
1885
/* Don't race with allocators so take the groups_sem */
1886
down_write(&space_info->groups_sem);
1887
1888
spin_lock(&space_info->lock);
1889
spin_lock(&bg->lock);
1890
if (bg->reserved || bg->pinned || bg->ro) {
1891
/*
1892
* We want to bail if we made new allocations or have
1893
* outstanding allocations in this block group. We do
1894
* the ro check in case balance is currently acting on
1895
* this block group.
1896
*/
1897
spin_unlock(&bg->lock);
1898
spin_unlock(&space_info->lock);
1899
up_write(&space_info->groups_sem);
1900
goto next;
1901
}
1902
if (bg->used == 0) {
1903
/*
1904
* It is possible that we trigger relocation on a block
1905
* group as its extents are deleted and it first goes
1906
* below the threshold, then shortly after goes empty.
1907
*
1908
* In this case, relocating it does delete it, but has
1909
* some overhead in relocation specific metadata, looking
1910
* for the non-existent extents and running some extra
1911
* transactions, which we can avoid by using one of the
1912
* other mechanisms for dealing with empty block groups.
1913
*/
1914
if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
1915
btrfs_mark_bg_unused(bg);
1916
spin_unlock(&bg->lock);
1917
spin_unlock(&space_info->lock);
1918
up_write(&space_info->groups_sem);
1919
goto next;
1920
1921
}
1922
/*
1923
* The block group might no longer meet the reclaim condition by
1924
* the time we get around to reclaiming it, so to avoid
1925
* reclaiming overly full block_groups, skip reclaiming them.
1926
*
1927
* Since the decision making process also depends on the amount
1928
* being freed, pass in a fake giant value to skip that extra
1929
* check, which is more meaningful when adding to the list in
1930
* the first place.
1931
*/
1932
if (!should_reclaim_block_group(bg, bg->length)) {
1933
spin_unlock(&bg->lock);
1934
spin_unlock(&space_info->lock);
1935
up_write(&space_info->groups_sem);
1936
goto next;
1937
}
1938
1939
spin_unlock(&bg->lock);
1940
spin_unlock(&space_info->lock);
1941
1942
/*
1943
* Get out fast, in case we're read-only or unmounting the
1944
* filesystem. It is OK to drop block groups from the list even
1945
* for the read-only case. As we did take the super write lock,
1946
* "mount -o remount,ro" won't happen and read-only filesystem
1947
* means it is forced read-only due to a fatal error. So, it
1948
* never gets back to read-write to let us reclaim again.
1949
*/
1950
if (btrfs_need_cleaner_sleep(fs_info)) {
1951
up_write(&space_info->groups_sem);
1952
goto next;
1953
}
1954
1955
ret = inc_block_group_ro(bg, 0);
1956
up_write(&space_info->groups_sem);
1957
if (ret < 0)
1958
goto next;
1959
1960
/*
1961
* The amount of bytes reclaimed corresponds to the sum of the
1962
* "used" and "reserved" counters. We have set the block group
1963
* to RO above, which prevents reservations from happening but
1964
* we may have existing reservations for which allocation has
1965
* not yet been done - btrfs_update_block_group() was not yet
1966
* called, which is where we will transfer a reserved extent's
1967
* size from the "reserved" counter to the "used" counter - this
1968
* happens when running delayed references. When we relocate the
1969
* chunk below, relocation first flushes delalloc, waits for
1970
* ordered extent completion (which is where we create delayed
1971
* references for data extents) and commits the current
1972
* transaction (which runs delayed references), and only after
1973
* it does the actual work to move extents out of the block
1974
* group. So the reported amount of reclaimed bytes is
1975
* effectively the sum of the 'used' and 'reserved' counters.
1976
*/
1977
spin_lock(&bg->lock);
1978
used = bg->used;
1979
reserved = bg->reserved;
1980
spin_unlock(&bg->lock);
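/*
 * Illustrative example: if the group has 500 MiB used and another 50 MiB
 * still reserved at this point, a successful relocation below is accounted
 * as 550 MiB reclaimed, matching the explanation above.
 */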
1981
1982
trace_btrfs_reclaim_block_group(bg);
1983
ret = btrfs_relocate_chunk(fs_info, bg->start, false);
1984
if (ret) {
1985
btrfs_dec_block_group_ro(bg);
1986
btrfs_err(fs_info, "error relocating chunk %llu",
1987
bg->start);
1988
used = 0;
1989
reserved = 0;
1990
spin_lock(&space_info->lock);
1991
space_info->reclaim_errors++;
1992
if (READ_ONCE(space_info->periodic_reclaim))
1993
space_info->periodic_reclaim_ready = false;
1994
spin_unlock(&space_info->lock);
1995
}
1996
spin_lock(&space_info->lock);
1997
space_info->reclaim_count++;
1998
space_info->reclaim_bytes += used;
1999
space_info->reclaim_bytes += reserved;
2000
spin_unlock(&space_info->lock);
2001
2002
next:
2003
if (ret && !READ_ONCE(space_info->periodic_reclaim))
2004
btrfs_link_bg_list(bg, &retry_list);
2005
btrfs_put_block_group(bg);
2006
2007
mutex_unlock(&fs_info->reclaim_bgs_lock);
2008
/*
2009
* Reclaiming all the block groups in the list can take really
2010
* long. Prioritize cleaning up unused block groups.
2011
*/
2012
btrfs_delete_unused_bgs(fs_info);
2013
/*
2014
* If we are interrupted by a balance, we can just bail out. The
2015
* cleaner thread will restart it again if necessary.
2016
*/
2017
if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
2018
goto end;
2019
spin_lock(&fs_info->unused_bgs_lock);
2020
}
2021
spin_unlock(&fs_info->unused_bgs_lock);
2022
mutex_unlock(&fs_info->reclaim_bgs_lock);
2023
end:
2024
spin_lock(&fs_info->unused_bgs_lock);
2025
list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
2026
spin_unlock(&fs_info->unused_bgs_lock);
2027
btrfs_exclop_finish(fs_info);
2028
}
2029
2030
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
2031
{
2032
btrfs_reclaim_sweep(fs_info);
2033
spin_lock(&fs_info->unused_bgs_lock);
2034
if (!list_empty(&fs_info->reclaim_bgs))
2035
queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
2036
spin_unlock(&fs_info->unused_bgs_lock);
2037
}
2038
2039
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
2040
{
2041
struct btrfs_fs_info *fs_info = bg->fs_info;
2042
2043
if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
2044
trace_btrfs_add_reclaim_block_group(bg);
2045
}
2046
2047
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
2048
const struct btrfs_path *path)
2049
{
2050
struct btrfs_chunk_map *map;
2051
struct btrfs_block_group_item bg;
2052
struct extent_buffer *leaf;
2053
int slot;
2054
u64 flags;
2055
int ret = 0;
2056
2057
slot = path->slots[0];
2058
leaf = path->nodes[0];
2059
2060
map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
2061
if (!map) {
2062
btrfs_err(fs_info,
2063
"logical %llu len %llu found bg but no related chunk",
2064
key->objectid, key->offset);
2065
return -ENOENT;
2066
}
2067
2068
if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
2069
btrfs_err(fs_info,
2070
"block group %llu len %llu mismatch with chunk %llu len %llu",
2071
key->objectid, key->offset, map->start, map->chunk_len);
2072
ret = -EUCLEAN;
2073
goto out_free_map;
2074
}
2075
2076
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
2077
sizeof(bg));
2078
flags = btrfs_stack_block_group_flags(&bg) &
2079
BTRFS_BLOCK_GROUP_TYPE_MASK;
2080
2081
if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
2082
btrfs_err(fs_info,
2083
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
2084
key->objectid, key->offset, flags,
2085
(BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
2086
ret = -EUCLEAN;
2087
}
2088
2089
out_free_map:
2090
btrfs_free_chunk_map(map);
2091
return ret;
2092
}
2093
2094
static int find_first_block_group(struct btrfs_fs_info *fs_info,
2095
struct btrfs_path *path,
2096
const struct btrfs_key *key)
2097
{
2098
struct btrfs_root *root = btrfs_block_group_root(fs_info);
2099
int ret;
2100
struct btrfs_key found_key;
2101
2102
btrfs_for_each_slot(root, key, &found_key, path, ret) {
2103
if (found_key.objectid >= key->objectid &&
2104
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
2105
return read_bg_from_eb(fs_info, &found_key, path);
2106
}
2107
}
2108
return ret;
2109
}
2110
2111
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2112
{
2113
u64 extra_flags = chunk_to_extended(flags) &
2114
BTRFS_EXTENDED_PROFILE_MASK;
2115
2116
write_seqlock(&fs_info->profiles_lock);
2117
if (flags & BTRFS_BLOCK_GROUP_DATA)
2118
fs_info->avail_data_alloc_bits |= extra_flags;
2119
if (flags & BTRFS_BLOCK_GROUP_METADATA)
2120
fs_info->avail_metadata_alloc_bits |= extra_flags;
2121
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2122
fs_info->avail_system_alloc_bits |= extra_flags;
2123
write_sequnlock(&fs_info->profiles_lock);
2124
}
2125
2126
/*
2127
* Map a physical disk address to a list of logical addresses.
2128
*
2129
* @fs_info: the filesystem
2130
* @chunk_start: logical address of block group
2131
* @physical: physical address to map to logical addresses
2132
* @logical: return array of logical addresses which map to @physical
2133
* @naddrs: length of @logical
2134
* @stripe_len: size of IO stripe for the given block group
2135
*
2136
* Maps a particular @physical disk address to a list of @logical addresses.
2137
* Used primarily to exclude those portions of a block group that contain super
2138
* block copies.
2139
*/
2140
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
2141
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
2142
{
2143
struct btrfs_chunk_map *map;
2144
u64 *buf;
2145
u64 bytenr;
2146
u64 data_stripe_length;
2147
u64 io_stripe_size;
2148
int i, nr = 0;
2149
int ret = 0;
2150
2151
map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
2152
if (IS_ERR(map))
2153
return -EIO;
2154
2155
data_stripe_length = map->stripe_size;
2156
io_stripe_size = BTRFS_STRIPE_LEN;
2157
chunk_start = map->start;
2158
2159
/* For RAID5/6 adjust to a full IO stripe length */
2160
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2161
io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
2162
2163
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
2164
if (!buf) {
2165
ret = -ENOMEM;
2166
goto out;
2167
}
2168
2169
for (i = 0; i < map->num_stripes; i++) {
2170
bool already_inserted = false;
2171
u32 stripe_nr;
2172
u32 offset;
2173
int j;
2174
2175
if (!in_range(physical, map->stripes[i].physical,
2176
data_stripe_length))
2177
continue;
2178
2179
stripe_nr = (physical - map->stripes[i].physical) >>
2180
BTRFS_STRIPE_LEN_SHIFT;
2181
offset = (physical - map->stripes[i].physical) &
2182
BTRFS_STRIPE_LEN_MASK;
2183
2184
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2185
BTRFS_BLOCK_GROUP_RAID10))
2186
stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
2187
map->sub_stripes);
2188
/*
2189
* The remaining case would be for RAID56, multiply by
2190
* nr_data_stripes(). Alternatively, just use rmap_len below
2191
* instead of map->stripe_len
2192
*/
2193
bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
2194
2195
/* Ensure we don't add duplicate addresses */
2196
for (j = 0; j < nr; j++) {
2197
if (buf[j] == bytenr) {
2198
already_inserted = true;
2199
break;
2200
}
2201
}
2202
2203
if (!already_inserted)
2204
buf[nr++] = bytenr;
2205
}
2206
2207
*logical = buf;
2208
*naddrs = nr;
2209
*stripe_len = io_stripe_size;
2210
out:
2211
btrfs_free_chunk_map(map);
2212
return ret;
2213
}
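/*
 * Worked example with hypothetical numbers, for a non-RAID0/10/56 chunk:
 * with BTRFS_STRIPE_LEN of 64 KiB, a @physical address 1 MiB past
 * map->stripes[i].physical gives stripe_nr = 16 and offset = 0, so the
 * reported logical address is chunk_start + 16 * 64 KiB + 0. Super block
 * copies resolved this way are then excluded in exclude_super_stripes().
 */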
2214
2215
static int exclude_super_stripes(struct btrfs_block_group *cache)
2216
{
2217
struct btrfs_fs_info *fs_info = cache->fs_info;
2218
const bool zoned = btrfs_is_zoned(fs_info);
2219
u64 bytenr;
2220
u64 *logical;
2221
int stripe_len;
2222
int i, nr, ret;
2223
2224
if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
2225
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
2226
cache->bytes_super += stripe_len;
2227
ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start,
2228
cache->start + stripe_len - 1,
2229
EXTENT_DIRTY, NULL);
2230
if (ret)
2231
return ret;
2232
}
2233
2234
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2235
bytenr = btrfs_sb_offset(i);
2236
ret = btrfs_rmap_block(fs_info, cache->start,
2237
bytenr, &logical, &nr, &stripe_len);
2238
if (ret)
2239
return ret;
2240
2241
/* Shouldn't have super stripes in sequential zones */
2242
if (unlikely(zoned && nr)) {
2243
kfree(logical);
2244
btrfs_err(fs_info,
2245
"zoned: block group %llu must not contain super block",
2246
cache->start);
2247
return -EUCLEAN;
2248
}
2249
2250
while (nr--) {
2251
u64 len = min_t(u64, stripe_len,
2252
cache->start + cache->length - logical[nr]);
2253
2254
cache->bytes_super += len;
2255
ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
2256
logical[nr], logical[nr] + len - 1,
2257
EXTENT_DIRTY, NULL);
2258
if (ret) {
2259
kfree(logical);
2260
return ret;
2261
}
2262
}
2263
2264
kfree(logical);
2265
}
2266
return 0;
2267
}
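/*
 * Illustrative example: the primary super block copy lives at
 * BTRFS_SUPER_INFO_OFFSET (64 KiB), so a block group starting at logical 0
 * accounts the first 64 KiB as bytes_super and marks it EXTENT_DIRTY in the
 * excluded_extents tree; mirror copies found via btrfs_rmap_block() are
 * excluded the same way.
 */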
2268
2269
static struct btrfs_block_group *btrfs_create_block_group_cache(
2270
struct btrfs_fs_info *fs_info, u64 start)
2271
{
2272
struct btrfs_block_group *cache;
2273
2274
cache = kzalloc(sizeof(*cache), GFP_NOFS);
2275
if (!cache)
2276
return NULL;
2277
2278
cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
2279
GFP_NOFS);
2280
if (!cache->free_space_ctl) {
2281
kfree(cache);
2282
return NULL;
2283
}
2284
2285
cache->start = start;
2286
2287
cache->fs_info = fs_info;
2288
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
2289
2290
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
2291
2292
refcount_set(&cache->refs, 1);
2293
spin_lock_init(&cache->lock);
2294
init_rwsem(&cache->data_rwsem);
2295
INIT_LIST_HEAD(&cache->list);
2296
INIT_LIST_HEAD(&cache->cluster_list);
2297
INIT_LIST_HEAD(&cache->bg_list);
2298
INIT_LIST_HEAD(&cache->ro_list);
2299
INIT_LIST_HEAD(&cache->discard_list);
2300
INIT_LIST_HEAD(&cache->dirty_list);
2301
INIT_LIST_HEAD(&cache->io_list);
2302
INIT_LIST_HEAD(&cache->active_bg_list);
2303
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
2304
atomic_set(&cache->frozen, 0);
2305
mutex_init(&cache->free_space_lock);
2306
2307
return cache;
2308
}
2309
2310
/*
2311
* Iterate all chunks and verify that each of them has the corresponding block
2312
* group
2313
*/
2314
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
2315
{
2316
u64 start = 0;
2317
int ret = 0;
2318
2319
while (1) {
2320
struct btrfs_chunk_map *map;
2321
struct btrfs_block_group *bg;
2322
2323
/*
2324
* btrfs_find_chunk_map() will return the first chunk map
2325
* intersecting the range, so setting @length to 1 is enough to
2326
* get the first chunk.
2327
*/
2328
map = btrfs_find_chunk_map(fs_info, start, 1);
2329
if (!map)
2330
break;
2331
2332
bg = btrfs_lookup_block_group(fs_info, map->start);
2333
if (unlikely(!bg)) {
2334
btrfs_err(fs_info,
2335
"chunk start=%llu len=%llu doesn't have corresponding block group",
2336
map->start, map->chunk_len);
2337
ret = -EUCLEAN;
2338
btrfs_free_chunk_map(map);
2339
break;
2340
}
2341
if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
2342
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
2343
(map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
2344
btrfs_err(fs_info,
2345
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
2346
map->start, map->chunk_len,
2347
map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
2348
bg->start, bg->length,
2349
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
2350
ret = -EUCLEAN;
2351
btrfs_free_chunk_map(map);
2352
btrfs_put_block_group(bg);
2353
break;
2354
}
2355
start = map->start + map->chunk_len;
2356
btrfs_free_chunk_map(map);
2357
btrfs_put_block_group(bg);
2358
}
2359
return ret;
2360
}
2361
2362
static int read_one_block_group(struct btrfs_fs_info *info,
2363
struct btrfs_block_group_item *bgi,
2364
const struct btrfs_key *key,
2365
int need_clear)
2366
{
2367
struct btrfs_block_group *cache;
2368
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
2369
int ret;
2370
2371
ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
2372
2373
cache = btrfs_create_block_group_cache(info, key->objectid);
2374
if (!cache)
2375
return -ENOMEM;
2376
2377
cache->length = key->offset;
2378
cache->used = btrfs_stack_block_group_used(bgi);
2379
cache->commit_used = cache->used;
2380
cache->flags = btrfs_stack_block_group_flags(bgi);
2381
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
2382
cache->space_info = btrfs_find_space_info(info, cache->flags);
2383
2384
btrfs_set_free_space_tree_thresholds(cache);
2385
2386
if (need_clear) {
2387
/*
2388
* When we mount with old space cache, we need to
2389
* set BTRFS_DC_CLEAR and set dirty flag.
2390
*
2391
* a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2392
* truncate the old free space cache inode and
2393
* setup a new one.
2394
* b) Setting 'dirty flag' makes sure that we flush
2395
* the new space cache info onto disk.
2396
*/
2397
if (btrfs_test_opt(info, SPACE_CACHE))
2398
cache->disk_cache_state = BTRFS_DC_CLEAR;
2399
}
2400
if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2401
(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2402
btrfs_err(info,
2403
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2404
cache->start);
2405
ret = -EINVAL;
2406
goto error;
2407
}
2408
2409
ret = btrfs_load_block_group_zone_info(cache, false);
2410
if (ret) {
2411
btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2412
cache->start);
2413
goto error;
2414
}
2415
2416
/*
2417
* We need to exclude the super stripes now so that the space info has
2418
* super bytes accounted for, otherwise we'll think we have more space
2419
* than we actually do.
2420
*/
2421
ret = exclude_super_stripes(cache);
2422
if (ret) {
2423
/* We may have excluded something, so call this just in case. */
2424
btrfs_free_excluded_extents(cache);
2425
goto error;
2426
}
2427
2428
/*
2429
* For zoned filesystem, space after the allocation offset is the only
2430
* free space for a block group. So, we don't need any caching work.
2431
* btrfs_calc_zone_unusable() will set the amount of free space and
2432
* zone_unusable space.
2433
*
2434
* For regular filesystem, check for two cases, either we are full, and
2435
* therefore don't need to bother with the caching work since we won't
2436
* find any space, or we are empty, and we can just add all the space
2437
* in and be done with it. This saves us _a_lot_ of time, particularly
2438
* in the full case.
2439
*/
2440
if (btrfs_is_zoned(info)) {
2441
btrfs_calc_zone_unusable(cache);
2442
/* Should not have any excluded extents. Just in case, though. */
2443
btrfs_free_excluded_extents(cache);
2444
} else if (cache->length == cache->used) {
2445
cache->cached = BTRFS_CACHE_FINISHED;
2446
btrfs_free_excluded_extents(cache);
2447
} else if (cache->used == 0) {
2448
cache->cached = BTRFS_CACHE_FINISHED;
2449
ret = btrfs_add_new_free_space(cache, cache->start,
2450
cache->start + cache->length, NULL);
2451
btrfs_free_excluded_extents(cache);
2452
if (ret)
2453
goto error;
2454
}
2455
2456
ret = btrfs_add_block_group_cache(cache);
2457
if (ret) {
2458
btrfs_remove_free_space_cache(cache);
2459
goto error;
2460
}
2461
2462
trace_btrfs_add_block_group(info, cache, 0);
2463
btrfs_add_bg_to_space_info(info, cache);
2464
2465
set_avail_alloc_bits(info, cache->flags);
2466
if (btrfs_chunk_writeable(info, cache->start)) {
2467
if (cache->used == 0) {
2468
ASSERT(list_empty(&cache->bg_list));
2469
if (btrfs_test_opt(info, DISCARD_ASYNC))
2470
btrfs_discard_queue_work(&info->discard_ctl, cache);
2471
else
2472
btrfs_mark_bg_unused(cache);
2473
}
2474
} else {
2475
inc_block_group_ro(cache, 1);
2476
}
2477
2478
return 0;
2479
error:
2480
btrfs_put_block_group(cache);
2481
return ret;
2482
}
2483
2484
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2485
{
2486
struct rb_node *node;
2487
int ret = 0;
2488
2489
for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
2490
struct btrfs_chunk_map *map;
2491
struct btrfs_block_group *bg;
2492
2493
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
2494
bg = btrfs_create_block_group_cache(fs_info, map->start);
2495
if (!bg) {
2496
ret = -ENOMEM;
2497
break;
2498
}
2499
2500
/* Fill dummy cache as FULL */
2501
bg->length = map->chunk_len;
2502
bg->flags = map->type;
2503
bg->cached = BTRFS_CACHE_FINISHED;
2504
bg->used = map->chunk_len;
2505
bg->flags = map->type;
2506
bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
2507
ret = btrfs_add_block_group_cache(bg);
2508
/*
2509
* We may have some valid block group cache added already, in
2510
* that case we skip to the next one.
2511
*/
2512
if (ret == -EEXIST) {
2513
ret = 0;
2514
btrfs_put_block_group(bg);
2515
continue;
2516
}
2517
2518
if (ret) {
2519
btrfs_remove_free_space_cache(bg);
2520
btrfs_put_block_group(bg);
2521
break;
2522
}
2523
2524
btrfs_add_bg_to_space_info(fs_info, bg);
2525
2526
set_avail_alloc_bits(fs_info, bg->flags);
2527
}
2528
if (!ret)
2529
btrfs_init_global_block_rsv(fs_info);
2530
return ret;
2531
}
2532
2533
int btrfs_read_block_groups(struct btrfs_fs_info *info)
2534
{
2535
struct btrfs_root *root = btrfs_block_group_root(info);
2536
struct btrfs_path *path;
2537
int ret;
2538
struct btrfs_block_group *cache;
2539
struct btrfs_space_info *space_info;
2540
struct btrfs_key key;
2541
int need_clear = 0;
2542
u64 cache_gen;
2543
2544
/*
2545
* Either no extent root (with ibadroots rescue option) or we have
2546
* unsupported RO options. The fs can never be mounted read-write, so no
2547
* need to waste time searching block group items.
2548
*
2549
* This also allows new extent tree related changes to be RO compat,
2550
* no need for a full incompat flag.
2551
*/
2552
if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
2553
~BTRFS_FEATURE_COMPAT_RO_SUPP))
2554
return fill_dummy_bgs(info);
2555
2556
key.objectid = 0;
2557
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2558
key.offset = 0;
2559
path = btrfs_alloc_path();
2560
if (!path)
2561
return -ENOMEM;
2562
2563
cache_gen = btrfs_super_cache_generation(info->super_copy);
2564
if (btrfs_test_opt(info, SPACE_CACHE) &&
2565
btrfs_super_generation(info->super_copy) != cache_gen)
2566
need_clear = 1;
2567
if (btrfs_test_opt(info, CLEAR_CACHE))
2568
need_clear = 1;
2569
2570
while (1) {
2571
struct btrfs_block_group_item bgi;
2572
struct extent_buffer *leaf;
2573
int slot;
2574
2575
ret = find_first_block_group(info, path, &key);
2576
if (ret > 0)
2577
break;
2578
if (ret != 0)
2579
goto error;
2580
2581
leaf = path->nodes[0];
2582
slot = path->slots[0];
2583
2584
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2585
sizeof(bgi));
2586
2587
btrfs_item_key_to_cpu(leaf, &key, slot);
2588
btrfs_release_path(path);
2589
ret = read_one_block_group(info, &bgi, &key, need_clear);
2590
if (ret < 0)
2591
goto error;
2592
key.objectid += key.offset;
2593
key.offset = 0;
2594
}
2595
btrfs_release_path(path);
2596
2597
list_for_each_entry(space_info, &info->space_info, list) {
2598
int i;
2599
2600
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2601
if (list_empty(&space_info->block_groups[i]))
2602
continue;
2603
cache = list_first_entry(&space_info->block_groups[i],
2604
struct btrfs_block_group,
2605
list);
2606
btrfs_sysfs_add_block_group_type(cache);
2607
}
2608
2609
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2610
(BTRFS_BLOCK_GROUP_RAID10 |
2611
BTRFS_BLOCK_GROUP_RAID1_MASK |
2612
BTRFS_BLOCK_GROUP_RAID56_MASK |
2613
BTRFS_BLOCK_GROUP_DUP)))
2614
continue;
2615
/*
2616
* Avoid allocating from un-mirrored block groups if there are
2617
* mirrored block groups.
2618
*/
2619
list_for_each_entry(cache,
2620
&space_info->block_groups[BTRFS_RAID_RAID0],
2621
list)
2622
inc_block_group_ro(cache, 1);
2623
list_for_each_entry(cache,
2624
&space_info->block_groups[BTRFS_RAID_SINGLE],
2625
list)
2626
inc_block_group_ro(cache, 1);
2627
}
2628
2629
btrfs_init_global_block_rsv(info);
2630
ret = check_chunk_block_group_mappings(info);
2631
error:
2632
btrfs_free_path(path);
2633
/*
2634
* We've hit some error while reading the extent tree, and have
2635
* rescue=ibadroots mount option.
2636
* Try to fill the tree using dummy block groups so that the user can
2637
* continue to mount and grab their data.
2638
*/
2639
if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2640
ret = fill_dummy_bgs(info);
2641
return ret;
2642
}
2643
2644
/*
2645
* This function, insert_block_group_item(), belongs to the phase 2 of chunk
2646
* allocation.
2647
*
2648
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2649
* phases.
2650
*/
2651
static int insert_block_group_item(struct btrfs_trans_handle *trans,
2652
struct btrfs_block_group *block_group)
2653
{
2654
struct btrfs_fs_info *fs_info = trans->fs_info;
2655
struct btrfs_block_group_item bgi;
2656
struct btrfs_root *root = btrfs_block_group_root(fs_info);
2657
struct btrfs_key key;
2658
u64 old_commit_used;
2659
int ret;
2660
2661
spin_lock(&block_group->lock);
2662
btrfs_set_stack_block_group_used(&bgi, block_group->used);
2663
btrfs_set_stack_block_group_chunk_objectid(&bgi,
2664
block_group->global_root_id);
2665
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2666
old_commit_used = block_group->commit_used;
2667
block_group->commit_used = block_group->used;
2668
key.objectid = block_group->start;
2669
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2670
key.offset = block_group->length;
2671
spin_unlock(&block_group->lock);
2672
2673
ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2674
if (ret < 0) {
2675
spin_lock(&block_group->lock);
2676
block_group->commit_used = old_commit_used;
2677
spin_unlock(&block_group->lock);
2678
}
2679
2680
return ret;
2681
}
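/*
 * Illustrative example: the item is keyed as (block_group->start,
 * BTRFS_BLOCK_GROUP_ITEM_KEY, block_group->length), so a 1 GiB block group
 * starting at logical 2 GiB is inserted as (2147483648, BLOCK_GROUP_ITEM,
 * 1073741824), with the used/chunk_objectid/flags snapshot taken under the
 * block group lock above.
 */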
2682
2683
static int insert_dev_extent(struct btrfs_trans_handle *trans,
2684
const struct btrfs_device *device, u64 chunk_offset,
2685
u64 start, u64 num_bytes)
2686
{
2687
struct btrfs_fs_info *fs_info = device->fs_info;
2688
struct btrfs_root *root = fs_info->dev_root;
2689
BTRFS_PATH_AUTO_FREE(path);
2690
struct btrfs_dev_extent *extent;
2691
struct extent_buffer *leaf;
2692
struct btrfs_key key;
2693
int ret;
2694
2695
WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2696
WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2697
path = btrfs_alloc_path();
2698
if (!path)
2699
return -ENOMEM;
2700
2701
key.objectid = device->devid;
2702
key.type = BTRFS_DEV_EXTENT_KEY;
2703
key.offset = start;
2704
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2705
if (ret)
2706
return ret;
2707
2708
leaf = path->nodes[0];
2709
extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2710
btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2711
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2712
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2713
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2714
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2715
2716
return ret;
2717
}
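/*
 * Illustrative example with hypothetical numbers: for a stripe of a SINGLE
 * chunk placed at physical offset 1 MiB on devid 1, this creates the item
 * (1, BTRFS_DEV_EXTENT_KEY, 1048576) in the device tree, with the extent
 * length set to the stripe size passed in by the caller and chunk_offset
 * pointing back at the chunk item in the chunk tree.
 */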
2718
2719
/*
2720
* This function belongs to phase 2.
2721
*
2722
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2723
* phases.
2724
*/
2725
static int insert_dev_extents(struct btrfs_trans_handle *trans,
2726
u64 chunk_offset, u64 chunk_size)
2727
{
2728
struct btrfs_fs_info *fs_info = trans->fs_info;
2729
struct btrfs_device *device;
2730
struct btrfs_chunk_map *map;
2731
u64 dev_offset;
2732
int i;
2733
int ret = 0;
2734
2735
map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2736
if (IS_ERR(map))
2737
return PTR_ERR(map);
2738
2739
/*
2740
* Take the device list mutex to prevent races with the final phase of
2741
* a device replace operation that replaces the device object associated
2742
* with the map's stripes, because the device object's id can change
2743
* at any time during that final phase of the device replace operation
2744
* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2745
* replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2746
* resulting in persisting a device extent item with such ID.
2747
*/
2748
mutex_lock(&fs_info->fs_devices->device_list_mutex);
2749
for (i = 0; i < map->num_stripes; i++) {
2750
device = map->stripes[i].dev;
2751
dev_offset = map->stripes[i].physical;
2752
2753
ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2754
map->stripe_size);
2755
if (ret)
2756
break;
2757
}
2758
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2759
2760
btrfs_free_chunk_map(map);
2761
return ret;
2762
}
2763
2764
/*
2765
* This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2766
* chunk allocation.
2767
*
2768
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2769
* phases.
2770
*/
2771
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2772
{
2773
struct btrfs_fs_info *fs_info = trans->fs_info;
2774
struct btrfs_block_group *block_group;
2775
int ret = 0;
2776
2777
while (!list_empty(&trans->new_bgs)) {
2778
int index;
2779
2780
block_group = list_first_entry(&trans->new_bgs,
2781
struct btrfs_block_group,
2782
bg_list);
2783
if (ret)
2784
goto next;
2785
2786
index = btrfs_bg_flags_to_raid_index(block_group->flags);
2787
2788
ret = insert_block_group_item(trans, block_group);
2789
if (ret)
2790
btrfs_abort_transaction(trans, ret);
2791
if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
2792
&block_group->runtime_flags)) {
2793
mutex_lock(&fs_info->chunk_mutex);
2794
ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2795
mutex_unlock(&fs_info->chunk_mutex);
2796
if (ret)
2797
btrfs_abort_transaction(trans, ret);
2798
}
2799
ret = insert_dev_extents(trans, block_group->start,
2800
block_group->length);
2801
if (ret)
2802
btrfs_abort_transaction(trans, ret);
2803
btrfs_add_block_group_free_space(trans, block_group);
2804
2805
/*
2806
* If we restriped during balance, we may have added a new raid
2807
* type, so now add the sysfs entries when it is safe to do so.
2808
* We don't have to worry about locking here as it's handled in
2809
* btrfs_sysfs_add_block_group_type.
2810
*/
2811
if (block_group->space_info->block_group_kobjs[index] == NULL)
2812
btrfs_sysfs_add_block_group_type(block_group);
2813
2814
/* Already aborted the transaction if it failed. */
2815
next:
2816
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
2817
2818
spin_lock(&fs_info->unused_bgs_lock);
2819
list_del_init(&block_group->bg_list);
2820
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
2821
btrfs_put_block_group(block_group);
2822
spin_unlock(&fs_info->unused_bgs_lock);
2823
2824
/*
2825
* If the block group is still unused, add it to the list of
2826
* unused block groups. The block group may have been created in
2827
* order to satisfy a space reservation, in which case the
2828
* extent allocation only happens later. But often we don't
2829
* actually need to allocate space that we previously reserved,
2830
* so the block group may become unused for a long time. For
2831
* example for metadata we generally reserve space for a worst
2832
* possible scenario, but then don't end up allocating all that
2833
* space or none at all (due to no need to COW, extent buffers
2834
* were already COWed in the current transaction and still
2835
* unwritten, tree heights lower than the maximum possible
2836
* height, etc). For data we generally reserve the exact amount
2837
* of space we are going to allocate later, the exception is
2838
* when using compression, as we must reserve space based on the
2839
* uncompressed data size, because the compression is only done
2840
* when writeback triggered and we don't know how much space we
2841
* are actually going to need, so we reserve the uncompressed
2842
* size because the data may be incompressible in the worst case.
2843
*/
2844
if (ret == 0) {
2845
bool used;
2846
2847
spin_lock(&block_group->lock);
2848
used = btrfs_is_block_group_used(block_group);
2849
spin_unlock(&block_group->lock);
2850
2851
if (!used)
2852
btrfs_mark_bg_unused(block_group);
2853
}
2854
}
2855
btrfs_trans_release_chunk_metadata(trans);
2856
}
2857
2858
/*
2859
* For extent tree v2 we use the block_group_item->chunk_offset to point at our
2860
* global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2861
*/
2862
static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset)
2863
{
2864
u64 div = SZ_1G;
2865
u64 index;
2866
2867
if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2868
return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2869
2870
/* If we have a smaller fs index based on 128MiB. */
2871
if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2872
div = SZ_128M;
2873
2874
offset = div64_u64(offset, div);
2875
div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2876
return index;
2877
}
2878
2879
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2880
struct btrfs_space_info *space_info,
2881
u64 type, u64 chunk_offset, u64 size)
2882
{
2883
struct btrfs_fs_info *fs_info = trans->fs_info;
2884
struct btrfs_block_group *cache;
2885
int ret;
2886
2887
btrfs_set_log_full_commit(trans);
2888
2889
cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2890
if (!cache)
2891
return ERR_PTR(-ENOMEM);
2892
2893
/*
2894
* Mark it as new before adding it to the rbtree of block groups or any
2895
* list, so that no other task finds it and calls btrfs_mark_bg_unused()
2896
* before the new flag is set.
2897
*/
2898
set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
2899
2900
cache->length = size;
2901
btrfs_set_free_space_tree_thresholds(cache);
2902
cache->flags = type;
2903
cache->cached = BTRFS_CACHE_FINISHED;
2904
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2905
2906
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2907
set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
2908
2909
ret = btrfs_load_block_group_zone_info(cache, true);
2910
if (ret) {
2911
btrfs_put_block_group(cache);
2912
return ERR_PTR(ret);
2913
}
2914
2915
ret = exclude_super_stripes(cache);
2916
if (ret) {
2917
/* We may have excluded something, so call this just in case */
2918
btrfs_free_excluded_extents(cache);
2919
btrfs_put_block_group(cache);
2920
return ERR_PTR(ret);
2921
}
2922
2923
ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
2924
btrfs_free_excluded_extents(cache);
2925
if (ret) {
2926
btrfs_put_block_group(cache);
2927
return ERR_PTR(ret);
2928
}
2929
2930
/*
2931
* Ensure the corresponding space_info object is created and
2932
* assigned to our block group. We want our bg to be added to the rbtree
2933
* with its ->space_info set.
2934
*/
2935
cache->space_info = space_info;
2936
ASSERT(cache->space_info);
2937
2938
ret = btrfs_add_block_group_cache(cache);
2939
if (ret) {
2940
btrfs_remove_free_space_cache(cache);
2941
btrfs_put_block_group(cache);
2942
return ERR_PTR(ret);
2943
}
2944
2945
/*
2946
* Now that our block group has its ->space_info set and is inserted in
2947
* the rbtree, update the space info's counters.
2948
*/
2949
trace_btrfs_add_block_group(fs_info, cache, 1);
2950
btrfs_add_bg_to_space_info(fs_info, cache);
2951
btrfs_update_global_block_rsv(fs_info);
2952
2953
#ifdef CONFIG_BTRFS_DEBUG
2954
if (btrfs_should_fragment_free_space(cache)) {
2955
cache->space_info->bytes_used += size >> 1;
2956
fragment_free_space(cache);
2957
}
2958
#endif
2959
2960
btrfs_link_bg_list(cache, &trans->new_bgs);
2961
btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
2962
2963
set_avail_alloc_bits(fs_info, type);
2964
return cache;
2965
}
2966
2967
/*
2968
* Mark one block group RO, can be called several times for the same block
2969
* group.
2970
*
2971
* @cache: the destination block group
2972
* @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
2973
* ensure we still have some free space after marking this
2974
* block group RO.
2975
*/
2976
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2977
bool do_chunk_alloc)
2978
{
2979
struct btrfs_fs_info *fs_info = cache->fs_info;
2980
struct btrfs_space_info *space_info = cache->space_info;
2981
struct btrfs_trans_handle *trans;
2982
struct btrfs_root *root = btrfs_block_group_root(fs_info);
2983
u64 alloc_flags;
2984
int ret;
2985
bool dirty_bg_running;
2986
2987
/*
2988
* This can only happen when we are doing read-only scrub on read-only
2989
* mount.
2990
* In that case we should not start a new transaction on read-only fs.
2991
* Thus here we skip all chunk allocations.
2992
*/
2993
if (sb_rdonly(fs_info->sb)) {
2994
mutex_lock(&fs_info->ro_block_group_mutex);
2995
ret = inc_block_group_ro(cache, 0);
2996
mutex_unlock(&fs_info->ro_block_group_mutex);
2997
return ret;
2998
}
2999
3000
do {
3001
trans = btrfs_join_transaction(root);
3002
if (IS_ERR(trans))
3003
return PTR_ERR(trans);
3004
3005
dirty_bg_running = false;
3006
3007
/*
3008
* We're not allowed to set block groups readonly after the dirty
3009
* block group cache has started writing. If it already started,
3010
* back off and let this transaction commit.
3011
*/
3012
mutex_lock(&fs_info->ro_block_group_mutex);
3013
if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
3014
u64 transid = trans->transid;
3015
3016
mutex_unlock(&fs_info->ro_block_group_mutex);
3017
btrfs_end_transaction(trans);
3018
3019
ret = btrfs_wait_for_commit(fs_info, transid);
3020
if (ret)
3021
return ret;
3022
dirty_bg_running = true;
3023
}
3024
} while (dirty_bg_running);
3025
3026
if (do_chunk_alloc) {
3027
/*
3028
* If we are changing raid levels, try to allocate a
3029
* corresponding block group with the new raid level.
3030
*/
3031
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
3032
if (alloc_flags != cache->flags) {
3033
ret = btrfs_chunk_alloc(trans, space_info, alloc_flags,
3034
CHUNK_ALLOC_FORCE);
3035
/*
3036
* ENOSPC is allowed here, we may have enough space
3037
* already allocated at the new raid level to carry on
3038
*/
3039
if (ret == -ENOSPC)
3040
ret = 0;
3041
if (ret < 0)
3042
goto out;
3043
}
3044
}
3045
3046
ret = inc_block_group_ro(cache, 0);
3047
if (!ret)
3048
goto out;
3049
if (ret == -ETXTBSY)
3050
goto unlock_out;
3051
3052
/*
3053
* Skip chunk allocation if the bg is SYSTEM, this is to avoid system
3054
* chunk allocation storm to exhaust the system chunk array. Otherwise
3055
* we still want to try our best to mark the block group read-only.
3056
*/
3057
if (!do_chunk_alloc && ret == -ENOSPC &&
3058
(cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
3059
goto unlock_out;
3060
3061
alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
3062
ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
3063
if (ret < 0)
3064
goto out;
3065
/*
3066
* We have allocated a new chunk. We also need to activate that chunk to
3067
* grant metadata tickets for zoned filesystem.
3068
*/
3069
ret = btrfs_zoned_activate_one_bg(space_info, true);
3070
if (ret < 0)
3071
goto out;
3072
3073
ret = inc_block_group_ro(cache, 0);
3074
if (ret == -ETXTBSY)
3075
goto unlock_out;
3076
out:
3077
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3078
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
3079
mutex_lock(&fs_info->chunk_mutex);
3080
check_system_chunk(trans, alloc_flags);
3081
mutex_unlock(&fs_info->chunk_mutex);
3082
}
3083
unlock_out:
3084
mutex_unlock(&fs_info->ro_block_group_mutex);
3085
3086
btrfs_end_transaction(trans);
3087
return ret;
3088
}
3089
3090
void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
3091
{
3092
struct btrfs_space_info *sinfo = cache->space_info;
3093
u64 num_bytes;
3094
3095
BUG_ON(!cache->ro);
3096
3097
spin_lock(&sinfo->lock);
3098
spin_lock(&cache->lock);
3099
if (!--cache->ro) {
3100
if (btrfs_is_zoned(cache->fs_info)) {
3101
/* Migrate zone_unusable bytes back */
3102
cache->zone_unusable =
3103
(cache->alloc_offset - cache->used - cache->pinned -
3104
cache->reserved) +
3105
(cache->length - cache->zone_capacity);
3106
btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
3107
sinfo->bytes_readonly -= cache->zone_unusable;
3108
}
3109
num_bytes = cache->length - cache->reserved -
3110
cache->pinned - cache->bytes_super -
3111
cache->zone_unusable - cache->used;
3112
sinfo->bytes_readonly -= num_bytes;
3113
list_del_init(&cache->ro_list);
3114
}
3115
spin_unlock(&cache->lock);
3116
spin_unlock(&sinfo->lock);
3117
}
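/*
 * Worked example with hypothetical numbers, zoned case: for a 256 MiB block
 * group whose zone_capacity equals its length, with alloc_offset = 100 MiB,
 * used = 80 MiB and nothing pinned or reserved, 20 MiB of previously
 * allocated but now freed space becomes zone_unusable again and is moved
 * out of bytes_readonly once the last RO reference is dropped.
 */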
3118
3119
static int update_block_group_item(struct btrfs_trans_handle *trans,
3120
struct btrfs_path *path,
3121
struct btrfs_block_group *cache)
3122
{
3123
struct btrfs_fs_info *fs_info = trans->fs_info;
3124
int ret;
3125
struct btrfs_root *root = btrfs_block_group_root(fs_info);
3126
unsigned long bi;
3127
struct extent_buffer *leaf;
3128
struct btrfs_block_group_item bgi;
3129
struct btrfs_key key;
3130
u64 old_commit_used;
3131
u64 used;
3132
3133
/*
3134
* Block group item updates can be triggered outside the transaction commit
3135
* critical section, thus we need a consistent view of used bytes.
3136
* We cannot use cache->used directly outside of the spin lock, as it
3137
* may be changed.
3138
*/
3139
spin_lock(&cache->lock);
3140
old_commit_used = cache->commit_used;
3141
used = cache->used;
3142
/* No change in used bytes, can safely skip it. */
3143
if (cache->commit_used == used) {
3144
spin_unlock(&cache->lock);
3145
return 0;
3146
}
3147
cache->commit_used = used;
3148
spin_unlock(&cache->lock);
3149
3150
key.objectid = cache->start;
3151
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
3152
key.offset = cache->length;
3153
3154
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3155
if (ret) {
3156
if (ret > 0)
3157
ret = -ENOENT;
3158
goto fail;
3159
}
3160
3161
leaf = path->nodes[0];
3162
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3163
btrfs_set_stack_block_group_used(&bgi, used);
3164
btrfs_set_stack_block_group_chunk_objectid(&bgi,
3165
cache->global_root_id);
3166
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
3167
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
3168
fail:
3169
btrfs_release_path(path);
3170
/*
3171
* We didn't update the block group item, need to revert commit_used
3172
* unless the block group item didn't exist yet - this is to prevent a
3173
* race with a concurrent insertion of the block group item, with
3174
* insert_block_group_item(), that happened just after we attempted to
3175
* update. In that case we would reset commit_used to 0 just after the
3176
* insertion set it to a value greater than 0 - if the block group later
3177
* ends up with 0 used bytes, we would incorrectly skip its update.
3178
*/
3179
if (ret < 0 && ret != -ENOENT) {
3180
spin_lock(&cache->lock);
3181
cache->commit_used = old_commit_used;
3182
spin_unlock(&cache->lock);
3183
}
3184
return ret;
3185
3186
}
3187
3188
static int cache_save_setup(struct btrfs_block_group *block_group,
3189
struct btrfs_trans_handle *trans,
3190
struct btrfs_path *path)
3191
{
3192
struct btrfs_fs_info *fs_info = block_group->fs_info;
3193
struct inode *inode = NULL;
3194
struct extent_changeset *data_reserved = NULL;
3195
u64 alloc_hint = 0;
3196
int dcs = BTRFS_DC_ERROR;
3197
u64 cache_size = 0;
3198
int retries = 0;
3199
int ret = 0;
3200
3201
if (!btrfs_test_opt(fs_info, SPACE_CACHE))
3202
return 0;
3203
3204
/*
3205
* If this block group is smaller than 100 megs don't bother caching the
3206
* block group.
3207
*/
3208
if (block_group->length < (100 * SZ_1M)) {
3209
spin_lock(&block_group->lock);
3210
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3211
spin_unlock(&block_group->lock);
3212
return 0;
3213
}
3214
3215
if (TRANS_ABORTED(trans))
3216
return 0;
3217
again:
3218
inode = lookup_free_space_inode(block_group, path);
3219
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3220
ret = PTR_ERR(inode);
3221
btrfs_release_path(path);
3222
goto out;
3223
}
3224
3225
if (IS_ERR(inode)) {
3226
BUG_ON(retries);
3227
retries++;
3228
3229
if (block_group->ro)
3230
goto out_free;
3231
3232
ret = create_free_space_inode(trans, block_group, path);
3233
if (ret)
3234
goto out_free;
3235
goto again;
3236
}
3237
3238
/*
3239
* We want to set the generation to 0, that way if anything goes wrong
3240
* from here on out we know not to trust this cache when we load up next
3241
* time.
3242
*/
3243
BTRFS_I(inode)->generation = 0;
3244
ret = btrfs_update_inode(trans, BTRFS_I(inode));
3245
if (unlikely(ret)) {
3246
/*
3247
* So theoretically we could recover from this, simply set the
3248
* super cache generation to 0 so we know to invalidate the
3249
* cache, but then we'd have to keep track of the block groups
3250
* that fail this way so we know we _have_ to reset this cache
3251
* before the next commit or risk reading stale cache. So to
3252
* limit our exposure to horrible edge cases, let's just abort the
3253
* transaction, this only happens in really bad situations
3254
* anyway.
3255
*/
3256
btrfs_abort_transaction(trans, ret);
3257
goto out_put;
3258
}
3259
WARN_ON(ret);
3260
3261
/* We've already setup this transaction, go ahead and exit */
3262
if (block_group->cache_generation == trans->transid &&
3263
i_size_read(inode)) {
3264
dcs = BTRFS_DC_SETUP;
3265
goto out_put;
3266
}
3267
3268
if (i_size_read(inode) > 0) {
3269
ret = btrfs_check_trunc_cache_free_space(fs_info,
3270
&fs_info->global_block_rsv);
3271
if (ret)
3272
goto out_put;
3273
3274
ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3275
if (ret)
3276
goto out_put;
3277
}
3278
3279
spin_lock(&block_group->lock);
3280
if (block_group->cached != BTRFS_CACHE_FINISHED ||
3281
!btrfs_test_opt(fs_info, SPACE_CACHE)) {
3282
/*
3283
* don't bother trying to write stuff out _if_
3284
* a) we're not cached,
3285
* b) we're mounted with the nospace_cache option,
3286
* c) we're using the v2 space cache (FREE_SPACE_TREE).
3287
*/
3288
dcs = BTRFS_DC_WRITTEN;
3289
spin_unlock(&block_group->lock);
3290
goto out_put;
3291
}
3292
spin_unlock(&block_group->lock);
3293
3294
/*
3295
* We hit an ENOSPC when setting up the cache in this transaction, just
3296
* skip doing the setup, we've already cleared the cache so we're safe.
3297
*/
3298
if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3299
ret = -ENOSPC;
3300
goto out_put;
3301
}
3302
3303
/*
3304
* Try to preallocate enough space based on how big the block group is.
3305
* Keep in mind this has to include any pinned space which could end up
3306
* taking up quite a bit since it's not folded into the other space
3307
* cache.
3308
*/
3309
cache_size = div_u64(block_group->length, SZ_256M);
3310
if (!cache_size)
3311
cache_size = 1;
3312
3313
cache_size *= 16;
3314
cache_size *= fs_info->sectorsize;
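/*
 * Worked example with hypothetical numbers: a 1 GiB block group with a
 * 4 KiB sectorsize preallocates div_u64(1 GiB, 256 MiB) * 16 * 4096 =
 * 4 * 16 * 4096 bytes = 256 KiB for the free space cache file.
 */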
3315
3316
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
3317
cache_size, false);
3318
if (ret)
3319
goto out_put;
3320
3321
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
3322
cache_size, cache_size,
3323
&alloc_hint);
3324
/*
3325
* Our cache requires contiguous chunks so that we don't modify a bunch
3326
* of metadata or split extents when writing the cache out, which means
3327
* we can hit ENOSPC if we are heavily fragmented in addition to just normal
3328
* out of space conditions. So if we hit this just skip setting up any
3329
* other block groups for this transaction, maybe we'll unpin enough
3330
* space the next time around.
3331
*/
3332
if (!ret)
3333
dcs = BTRFS_DC_SETUP;
3334
else if (ret == -ENOSPC)
3335
set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3336
3337
out_put:
3338
iput(inode);
3339
out_free:
3340
btrfs_release_path(path);
3341
out:
3342
spin_lock(&block_group->lock);
3343
if (!ret && dcs == BTRFS_DC_SETUP)
3344
block_group->cache_generation = trans->transid;
3345
block_group->disk_cache_state = dcs;
3346
spin_unlock(&block_group->lock);
3347
3348
extent_changeset_free(data_reserved);
3349
return ret;
3350
}
3351
3352
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3353
{
3354
struct btrfs_fs_info *fs_info = trans->fs_info;
3355
struct btrfs_block_group *cache, *tmp;
3356
struct btrfs_transaction *cur_trans = trans->transaction;
3357
BTRFS_PATH_AUTO_FREE(path);
3358
3359
if (list_empty(&cur_trans->dirty_bgs) ||
3360
!btrfs_test_opt(fs_info, SPACE_CACHE))
3361
return 0;
3362
3363
path = btrfs_alloc_path();
3364
if (!path)
3365
return -ENOMEM;
3366
3367
/* Could add new block groups, use _safe just in case */
3368
list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3369
dirty_list) {
3370
if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3371
cache_save_setup(cache, trans, path);
3372
}
3373
3374
return 0;
3375
}
3376
3377
/*
3378
* Transaction commit does final block group cache writeback during a critical
3379
* section where nothing is allowed to change the FS. This is required in
3380
* order for the cache to actually match the block group, but can introduce a
3381
* lot of latency into the commit.
3382
*
3383
* So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
3384
* There's a chance we'll have to redo some of it if the block group changes
3385
* again during the commit, but it greatly reduces the commit latency by
3386
* getting rid of the easy block groups while we're still allowing others to
3387
* join the commit.
3388
*/
3389
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3390
{
3391
struct btrfs_fs_info *fs_info = trans->fs_info;
3392
struct btrfs_block_group *cache;
3393
struct btrfs_transaction *cur_trans = trans->transaction;
3394
int ret = 0;
3395
int should_put;
3396
BTRFS_PATH_AUTO_FREE(path);
3397
LIST_HEAD(dirty);
3398
struct list_head *io = &cur_trans->io_bgs;
3399
int loops = 0;
3400
3401
spin_lock(&cur_trans->dirty_bgs_lock);
3402
if (list_empty(&cur_trans->dirty_bgs)) {
3403
spin_unlock(&cur_trans->dirty_bgs_lock);
3404
return 0;
3405
}
3406
list_splice_init(&cur_trans->dirty_bgs, &dirty);
3407
spin_unlock(&cur_trans->dirty_bgs_lock);
3408
3409
again:
3410
/* Make sure all the block groups on our dirty list actually exist */
3411
btrfs_create_pending_block_groups(trans);
3412
3413
if (!path) {
3414
path = btrfs_alloc_path();
3415
if (!path) {
3416
ret = -ENOMEM;
3417
goto out;
3418
}
3419
}
3420
3421
/*
3422
* cache_write_mutex is here only to save us from balance or automatic
3423
* removal of empty block groups deleting this block group while we are
3424
* writing out the cache
3425
*/
3426
mutex_lock(&trans->transaction->cache_write_mutex);
3427
while (!list_empty(&dirty)) {
3428
bool drop_reserve = true;
3429
3430
cache = list_first_entry(&dirty, struct btrfs_block_group,
3431
dirty_list);
3432
/*
3433
* This can happen if something re-dirties a block group that
3434
* is already under IO. Just wait for it to finish and then do
3435
* it all again
3436
*/
3437
if (!list_empty(&cache->io_list)) {
3438
list_del_init(&cache->io_list);
3439
btrfs_wait_cache_io(trans, cache, path);
3440
btrfs_put_block_group(cache);
3441
}
3442
3443
3444
/*
3445
* btrfs_wait_cache_io uses the cache->dirty_list to decide if
3446
* it should update the cache_state. Don't delete until after
3447
* we wait.
3448
*
3449
* Since we're not running in the commit critical section
3450
* we need the dirty_bgs_lock to protect from update_block_group
3451
*/
3452
spin_lock(&cur_trans->dirty_bgs_lock);
3453
list_del_init(&cache->dirty_list);
3454
spin_unlock(&cur_trans->dirty_bgs_lock);
3455
3456
should_put = 1;
3457
3458
cache_save_setup(cache, trans, path);
3459
3460
if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3461
cache->io_ctl.inode = NULL;
3462
ret = btrfs_write_out_cache(trans, cache, path);
3463
if (ret == 0 && cache->io_ctl.inode) {
3464
should_put = 0;
3465
3466
/*
3467
* The cache_write_mutex is protecting the
3468
* io_list, also refer to the definition of
3469
* btrfs_transaction::io_bgs for more details
3470
*/
3471
list_add_tail(&cache->io_list, io);
3472
} else {
3473
/*
3474
* If we failed to write the cache, the
3475
* generation will be bad and life goes on
3476
*/
3477
ret = 0;
3478
}
3479
}
3480
if (!ret) {
3481
ret = update_block_group_item(trans, path, cache);
3482
/*
3483
* Our block group might still be attached to the list
3484
* of new block groups in the transaction handle of some
3485
* other task (struct btrfs_trans_handle->new_bgs). This
3486
* means its block group item isn't yet in the extent
3487
* tree. If this happens ignore the error, as we will
3488
* try again later in the critical section of the
3489
* transaction commit.
3490
*/
3491
if (ret == -ENOENT) {
3492
ret = 0;
3493
spin_lock(&cur_trans->dirty_bgs_lock);
3494
if (list_empty(&cache->dirty_list)) {
3495
list_add_tail(&cache->dirty_list,
3496
&cur_trans->dirty_bgs);
3497
btrfs_get_block_group(cache);
3498
drop_reserve = false;
3499
}
3500
spin_unlock(&cur_trans->dirty_bgs_lock);
3501
} else if (ret) {
3502
btrfs_abort_transaction(trans, ret);
3503
}
3504
}
3505
3506
/* If it's not on the io list, we need to put the block group */
3507
if (should_put)
3508
btrfs_put_block_group(cache);
3509
if (drop_reserve)
3510
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3511
/*
3512
* Avoid blocking other tasks for too long. It might even save
3513
* us from writing caches for block groups that are going to be
3514
* removed.
3515
*/
3516
mutex_unlock(&trans->transaction->cache_write_mutex);
3517
if (ret)
3518
goto out;
3519
mutex_lock(&trans->transaction->cache_write_mutex);
3520
}
3521
mutex_unlock(&trans->transaction->cache_write_mutex);
3522
3523
/*
3524
* Go through delayed refs for all the stuff we've just kicked off
3525
* and then loop back (just once)
3526
*/
3527
if (!ret)
3528
ret = btrfs_run_delayed_refs(trans, 0);
3529
if (!ret && loops == 0) {
3530
loops++;
3531
spin_lock(&cur_trans->dirty_bgs_lock);
3532
list_splice_init(&cur_trans->dirty_bgs, &dirty);
3533
/*
3534
* dirty_bgs_lock protects us from concurrent block group
3535
* deletes too (not just cache_write_mutex).
3536
*/
3537
if (!list_empty(&dirty)) {
3538
spin_unlock(&cur_trans->dirty_bgs_lock);
3539
goto again;
3540
}
3541
spin_unlock(&cur_trans->dirty_bgs_lock);
3542
}
3543
out:
3544
if (ret < 0) {
3545
spin_lock(&cur_trans->dirty_bgs_lock);
3546
list_splice_init(&dirty, &cur_trans->dirty_bgs);
3547
spin_unlock(&cur_trans->dirty_bgs_lock);
3548
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3549
}
3550
3551
return ret;
3552
}
3553
3554
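/*
* Write out the remaining dirty block groups during the critical section of
* the transaction commit: update their block group items, write out their
* free space caches and wait for the cache IO to complete. Unlike
* btrfs_start_dirty_block_groups(), this loops until the dirty list is empty.
*/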
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3555
{
3556
struct btrfs_fs_info *fs_info = trans->fs_info;
3557
struct btrfs_block_group *cache;
3558
struct btrfs_transaction *cur_trans = trans->transaction;
3559
int ret = 0;
3560
int should_put;
3561
BTRFS_PATH_AUTO_FREE(path);
3562
struct list_head *io = &cur_trans->io_bgs;
3563
3564
path = btrfs_alloc_path();
3565
if (!path)
3566
return -ENOMEM;
3567
3568
/*
3569
* Even though we are in the critical section of the transaction commit,
3570
* we can still have concurrent tasks adding elements to this
3571
* transaction's list of dirty block groups. These tasks correspond to
3572
* endio free space workers started when writeback finishes for a
3573
* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3574
* allocate new block groups as a result of COWing nodes of the root
3575
* tree when updating the free space inode. The writeback for the space
3576
* caches is triggered by an earlier call to
3577
* btrfs_start_dirty_block_groups() and iterations of the following
3578
* loop.
3579
* Also we want to do the cache_save_setup first and then run the
3580
* delayed refs to make sure we have the best chance at doing this all
3581
* in one shot.
3582
*/
3583
spin_lock(&cur_trans->dirty_bgs_lock);
3584
while (!list_empty(&cur_trans->dirty_bgs)) {
3585
cache = list_first_entry(&cur_trans->dirty_bgs,
3586
struct btrfs_block_group,
3587
dirty_list);
3588
3589
/*
3590
* This can happen if cache_save_setup re-dirties a block group
3591
* that is already under IO. Just wait for it to finish and
3592
* then do it all again
3593
*/
3594
if (!list_empty(&cache->io_list)) {
3595
spin_unlock(&cur_trans->dirty_bgs_lock);
3596
list_del_init(&cache->io_list);
3597
btrfs_wait_cache_io(trans, cache, path);
3598
btrfs_put_block_group(cache);
3599
spin_lock(&cur_trans->dirty_bgs_lock);
3600
}
3601
3602
/*
3603
* Don't remove from the dirty list until after we've waited on
3604
* any pending IO
3605
*/
3606
list_del_init(&cache->dirty_list);
3607
spin_unlock(&cur_trans->dirty_bgs_lock);
3608
should_put = 1;
3609
3610
cache_save_setup(cache, trans, path);
3611
3612
if (!ret)
3613
ret = btrfs_run_delayed_refs(trans, U64_MAX);
3614
3615
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3616
cache->io_ctl.inode = NULL;
3617
ret = btrfs_write_out_cache(trans, cache, path);
3618
if (ret == 0 && cache->io_ctl.inode) {
3619
should_put = 0;
3620
list_add_tail(&cache->io_list, io);
3621
} else {
3622
/*
3623
* If we failed to write the cache, the
3624
* generation will be bad and life goes on
3625
*/
3626
ret = 0;
3627
}
3628
}
3629
if (!ret) {
3630
ret = update_block_group_item(trans, path, cache);
3631
/*
3632
* One of the free space endio workers might have
3633
* created a new block group while updating a free space
3634
* cache's inode (at inode.c:btrfs_finish_ordered_io())
3635
* and hasn't released its transaction handle yet, in
3636
* which case the new block group is still attached to
3637
* its transaction handle and its creation has not
3638
* finished yet (no block group item in the extent tree
3639
* yet, etc). If this is the case, wait for all free
3640
* space endio workers to finish and retry. This is a
3641
* very rare case so no need for a more efficient and
3642
* complex approach.
3643
*/
3644
if (ret == -ENOENT) {
3645
wait_event(cur_trans->writer_wait,
3646
atomic_read(&cur_trans->num_writers) == 1);
3647
ret = update_block_group_item(trans, path, cache);
3648
if (ret)
3649
btrfs_abort_transaction(trans, ret);
3650
} else if (ret) {
3651
btrfs_abort_transaction(trans, ret);
3652
}
3653
}
3654
3655
/* If it's not on the io list, we need to put the block group */
3656
if (should_put)
3657
btrfs_put_block_group(cache);
3658
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3659
spin_lock(&cur_trans->dirty_bgs_lock);
3660
}
3661
spin_unlock(&cur_trans->dirty_bgs_lock);
3662
3663
/*
3664
* Refer to the definition of io_bgs member for details why it's safe
3665
* to use it without any locking
3666
*/
3667
while (!list_empty(io)) {
3668
cache = list_first_entry(io, struct btrfs_block_group,
3669
io_list);
3670
list_del_init(&cache->io_list);
3671
btrfs_wait_cache_io(trans, cache, path);
3672
btrfs_put_block_group(cache);
3673
}
3674
3675
return ret;
3676
}
3677
3678
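/*
* Update the used bytes accounting when an extent of @num_bytes at @bytenr is
* allocated (@alloc is true) or freed: adjust the superblock, block group and
* space_info counters, pin freed space until the transaction commits, mark the
* block group dirty and, if it became empty or is worth reclaiming, mark it as
* unused or queue it for reclaim.
*/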
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
3679
u64 bytenr, u64 num_bytes, bool alloc)
3680
{
3681
struct btrfs_fs_info *info = trans->fs_info;
3682
struct btrfs_space_info *space_info;
3683
struct btrfs_block_group *cache;
3684
u64 old_val;
3685
bool reclaim = false;
3686
bool bg_already_dirty = true;
3687
int factor;
3688
3689
/* Block accounting for super block */
3690
spin_lock(&info->delalloc_root_lock);
3691
old_val = btrfs_super_bytes_used(info->super_copy);
3692
if (alloc)
3693
old_val += num_bytes;
3694
else
3695
old_val -= num_bytes;
3696
btrfs_set_super_bytes_used(info->super_copy, old_val);
3697
spin_unlock(&info->delalloc_root_lock);
3698
3699
cache = btrfs_lookup_block_group(info, bytenr);
3700
if (!cache)
3701
return -ENOENT;
3702
3703
/* An extent can not span multiple block groups. */
3704
ASSERT(bytenr + num_bytes <= cache->start + cache->length);
3705
3706
space_info = cache->space_info;
3707
factor = btrfs_bg_type_to_factor(cache->flags);
3708
3709
/*
3710
* If this block group has free space cache written out, we need to make
3711
* sure to load it if we are removing space. This is because we need
3712
* the unpinning stage to actually add the space back to the block group,
3713
* otherwise we will leak space.
3714
*/
3715
if (!alloc && !btrfs_block_group_done(cache))
3716
btrfs_cache_block_group(cache, true);
3717
3718
spin_lock(&space_info->lock);
3719
spin_lock(&cache->lock);
3720
3721
if (btrfs_test_opt(info, SPACE_CACHE) &&
3722
cache->disk_cache_state < BTRFS_DC_CLEAR)
3723
cache->disk_cache_state = BTRFS_DC_CLEAR;
3724
3725
old_val = cache->used;
3726
if (alloc) {
3727
old_val += num_bytes;
3728
cache->used = old_val;
3729
cache->reserved -= num_bytes;
3730
cache->reclaim_mark = 0;
3731
space_info->bytes_reserved -= num_bytes;
3732
space_info->bytes_used += num_bytes;
3733
space_info->disk_used += num_bytes * factor;
3734
if (READ_ONCE(space_info->periodic_reclaim))
3735
btrfs_space_info_update_reclaimable(space_info, -num_bytes);
3736
spin_unlock(&cache->lock);
3737
spin_unlock(&space_info->lock);
3738
} else {
3739
old_val -= num_bytes;
3740
cache->used = old_val;
3741
cache->pinned += num_bytes;
3742
btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
3743
space_info->bytes_used -= num_bytes;
3744
space_info->disk_used -= num_bytes * factor;
3745
if (READ_ONCE(space_info->periodic_reclaim))
3746
btrfs_space_info_update_reclaimable(space_info, num_bytes);
3747
else
3748
reclaim = should_reclaim_block_group(cache, num_bytes);
3749
3750
spin_unlock(&cache->lock);
3751
spin_unlock(&space_info->lock);
3752
3753
btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
3754
bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
3755
}
3756
3757
spin_lock(&trans->transaction->dirty_bgs_lock);
3758
if (list_empty(&cache->dirty_list)) {
3759
list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
3760
bg_already_dirty = false;
3761
btrfs_get_block_group(cache);
3762
}
3763
spin_unlock(&trans->transaction->dirty_bgs_lock);
3764
3765
/*
3766
* No longer have used bytes in this block group, queue it for deletion.
3767
* We do this after adding the block group to the dirty list to avoid
3768
* races between cleaner kthread and space cache writeout.
3769
*/
3770
if (!alloc && old_val == 0) {
3771
if (!btrfs_test_opt(info, DISCARD_ASYNC))
3772
btrfs_mark_bg_unused(cache);
3773
} else if (!alloc && reclaim) {
3774
btrfs_mark_bg_to_reclaim(cache);
3775
}
3776
3777
btrfs_put_block_group(cache);
3778
3779
/* Modified block groups are accounted for in the delayed_refs_rsv. */
3780
if (!bg_already_dirty)
3781
btrfs_inc_delayed_refs_rsv_bg_updates(info);
3782
3783
return 0;
3784
}
3785
3786
/*
3787
* Update the block_group and space info counters.
3788
*
3789
* @cache: The cache we are manipulating
3790
* @ram_bytes: The number of bytes of file content, and will be the same as
3791
* @num_bytes except for the compression path.
3792
* @num_bytes: The number of bytes in question
3793
* @delalloc: The blocks are allocated for the delalloc write
3794
*
3795
* This is called by the allocator when it reserves space. If this is a
3796
* reservation and the block group has become read only we cannot make the
3797
* reservation and return -EAGAIN, otherwise this function always succeeds.
3798
*/
3799
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3800
u64 ram_bytes, u64 num_bytes, bool delalloc,
3801
bool force_wrong_size_class)
3802
{
3803
struct btrfs_space_info *space_info = cache->space_info;
3804
enum btrfs_block_group_size_class size_class;
3805
int ret = 0;
3806
3807
spin_lock(&space_info->lock);
3808
spin_lock(&cache->lock);
3809
if (cache->ro) {
3810
ret = -EAGAIN;
3811
goto out_error;
3812
}
3813
3814
if (btrfs_block_group_should_use_size_class(cache)) {
3815
size_class = btrfs_calc_block_group_size_class(num_bytes);
3816
ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
3817
if (ret)
3818
goto out_error;
3819
}
3820
3821
cache->reserved += num_bytes;
3822
if (delalloc)
3823
cache->delalloc_bytes += num_bytes;
3824
3825
trace_btrfs_space_reservation(cache->fs_info, "space_info",
3826
space_info->flags, num_bytes, 1);
3827
spin_unlock(&cache->lock);
3828
3829
space_info->bytes_reserved += num_bytes;
3830
btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
3831
3832
/*
3833
* Compression can use less space than we reserved, so wake tickets if
3834
* that happens.
3835
*/
3836
if (num_bytes < ram_bytes)
3837
btrfs_try_granting_tickets(space_info);
3838
spin_unlock(&space_info->lock);
3839
3840
return 0;
3841
3842
out_error:
3843
spin_unlock(&cache->lock);
3844
spin_unlock(&space_info->lock);
3845
return ret;
3846
}
3847
3848
/*
3849
* Update the block_group and space info counters.
3850
*
3851
* @cache: The cache we are manipulating.
3852
* @num_bytes: The number of bytes in question.
3853
* @is_delalloc: Whether the blocks are allocated for a delalloc write.
3854
*
3855
* This is called by somebody who is freeing space that was never actually used
3856
* on disk. For example if you reserve some space for a new leaf in transaction
3857
* A and before transaction A commits you free that leaf, you call this
3858
* function to clear the reservation.
3859
*/
3860
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
3861
bool is_delalloc)
3862
{
3863
struct btrfs_space_info *space_info = cache->space_info;
3864
bool bg_ro;
3865
3866
spin_lock(&space_info->lock);
3867
spin_lock(&cache->lock);
3868
bg_ro = cache->ro;
3869
cache->reserved -= num_bytes;
3870
if (is_delalloc)
3871
cache->delalloc_bytes -= num_bytes;
3872
spin_unlock(&cache->lock);
3873
3874
if (bg_ro)
3875
space_info->bytes_readonly += num_bytes;
3876
else if (btrfs_is_zoned(cache->fs_info))
3877
space_info->bytes_zone_unusable += num_bytes;
3878
3879
space_info->bytes_reserved -= num_bytes;
3880
space_info->max_extent_size = 0;
3881
3882
btrfs_try_granting_tickets(space_info);
3883
spin_unlock(&space_info->lock);
3884
}
3885
3886
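/*
* Make the next allocation attempt for every metadata space_info bypass the
* usage heuristics by raising its force_alloc level to CHUNK_ALLOC_FORCE.
*/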
static void force_metadata_allocation(struct btrfs_fs_info *info)
3887
{
3888
struct list_head *head = &info->space_info;
3889
struct btrfs_space_info *found;
3890
3891
list_for_each_entry(found, head, list) {
3892
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3893
found->force_alloc = CHUNK_ALLOC_FORCE;
3894
}
3895
}
3896
3897
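/*
* Decide if a new chunk should be allocated for @sinfo:
*
* - CHUNK_ALLOC_FORCE: always allocate.
* - CHUNK_ALLOC_LIMITED: allocate if free space is below ~1% of the
*   filesystem size (but at least 64M).
* - Otherwise only allocate once roughly 80% of the space is used.
*/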
static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info,
3898
const struct btrfs_space_info *sinfo, int force)
3899
{
3900
u64 bytes_used = btrfs_space_info_used(sinfo, false);
3901
u64 thresh;
3902
3903
if (force == CHUNK_ALLOC_FORCE)
3904
return true;
3905
3906
/*
3907
* in limited mode, we want to have some free space up to
3908
* about 1% of the FS size.
3909
*/
3910
if (force == CHUNK_ALLOC_LIMITED) {
3911
thresh = btrfs_super_total_bytes(fs_info->super_copy);
3912
thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
3913
3914
if (sinfo->total_bytes - bytes_used < thresh)
3915
return true;
3916
}
3917
3918
if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
3919
return false;
3920
return true;
3921
}
3922
3923
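/*
* Force the allocation of a new chunk for the space_info matching @type,
* bypassing the usage heuristics of should_alloc_chunk().
*/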
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3924
{
3925
u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3926
struct btrfs_space_info *space_info;
3927
3928
space_info = btrfs_find_space_info(trans->fs_info, type);
3929
if (!space_info) {
3930
DEBUG_WARN();
3931
return -EINVAL;
3932
}
3933
3934
return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
3935
}
3936
3937
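/*
* Phase 1 worker for chunk allocation: reserve system space, create the chunk
* for @space_info with the given @flags and insert its chunk item into the
* chunk btree. Returns the new block group (with a reference held for the
* caller) or an ERR_PTR() on failure.
*/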
static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
3938
struct btrfs_space_info *space_info,
3939
u64 flags)
3940
{
3941
struct btrfs_block_group *bg;
3942
int ret;
3943
3944
/*
3945
* Check if we have enough space in the system space info because we
3946
* will need to update device items in the chunk btree and insert a new
3947
* chunk item in the chunk btree as well. This will allocate a new
3948
* system block group if needed.
3949
*/
3950
check_system_chunk(trans, flags);
3951
3952
bg = btrfs_create_chunk(trans, space_info, flags);
3953
if (IS_ERR(bg)) {
3954
ret = PTR_ERR(bg);
3955
goto out;
3956
}
3957
3958
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3959
/*
3960
* Normally we are not expected to fail with -ENOSPC here, since we have
3961
* previously reserved space in the system space_info and allocated one
3962
* new system chunk if necessary. However there are three exceptions:
3963
*
3964
* 1) We may have enough free space in the system space_info but all the
3965
* existing system block groups have a profile which can not be used
3966
* for extent allocation.
3967
*
3968
* This happens when mounting in degraded mode. For example we have a
3969
* RAID1 filesystem with 2 devices, lose one device and mount the fs
3970
* using the other device in degraded mode. If we then allocate a chunk,
3971
* we may have enough free space in the existing system space_info, but
3972
* none of the block groups can be used for extent allocation since they
3973
* have a RAID1 profile, and because we are in degraded mode with a
3974
* single device, we are forced to allocate a new system chunk with a
3975
* SINGLE profile. Making check_system_chunk() iterate over all system
3976
* block groups and check if they have a usable profile and enough space
3977
* can be slow on very large filesystems, so we tolerate the -ENOSPC and
3978
* try again after forcing allocation of a new system chunk. Like this
3979
* we avoid paying the cost of that search in normal circumstances, when
3980
* we were not mounted in degraded mode;
3981
*
3982
* 2) We had enough free space in the system space_info, and one suitable
3983
* block group to allocate from when we called check_system_chunk()
3984
* above. However right after we called it, the only system block group
3985
* with enough free space got turned into RO mode by a running scrub,
3986
* and in this case we have to allocate a new one and retry. We only
3987
* need to do this allocation and retry once, since we have a transaction
3988
* handle and scrub uses the commit root to search for block groups;
3989
*
3990
* 3) We had one system block group with enough free space when we called
3991
* check_system_chunk(), but after that, right before we tried to
3992
* allocate the last extent buffer we needed, a discard operation came
3993
* in and it temporarily removed the last free space entry from the
3994
* block group (discard removes a free space entry, discards it, and
3995
* then adds back the entry to the block group cache).
3996
*/
3997
if (ret == -ENOSPC) {
3998
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3999
struct btrfs_block_group *sys_bg;
4000
struct btrfs_space_info *sys_space_info;
4001
4002
sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
4003
if (unlikely(!sys_space_info)) {
4004
ret = -EINVAL;
4005
btrfs_abort_transaction(trans, ret);
4006
goto out;
4007
}
4008
4009
sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags);
4010
if (IS_ERR(sys_bg)) {
4011
ret = PTR_ERR(sys_bg);
4012
btrfs_abort_transaction(trans, ret);
4013
goto out;
4014
}
4015
4016
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
4017
if (unlikely(ret)) {
4018
btrfs_abort_transaction(trans, ret);
4019
goto out;
4020
}
4021
4022
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
4023
if (unlikely(ret)) {
4024
btrfs_abort_transaction(trans, ret);
4025
goto out;
4026
}
4027
} else if (unlikely(ret)) {
4028
btrfs_abort_transaction(trans, ret);
4029
goto out;
4030
}
4031
out:
4032
btrfs_trans_release_chunk_metadata(trans);
4033
4034
if (ret)
4035
return ERR_PTR(ret);
4036
4037
btrfs_get_block_group(bg);
4038
return bg;
4039
}
4040
4041
/*
4042
* Chunk allocation is done in 2 phases:
4043
*
4044
* 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
4045
* the chunk, the chunk mapping, create its block group and add the items
4046
* that belong in the chunk btree to it - more specifically, we need to
4047
* update device items in the chunk btree and add a new chunk item to it.
4048
*
4049
* 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
4050
* group item to the extent btree and the device extent items to the devices
4051
* btree.
4052
*
4053
* This is done to prevent deadlocks. For example when COWing a node from the
4054
* extent btree we are holding a write lock on the node's parent and if we
4055
* trigger chunk allocation and attempt to insert the new block group item
4056
* in the extent btree right away, we could deadlock because the path for the
4057
* insertion can include that parent node. At first glance it seems impossible
4058
* to trigger chunk allocation after starting a transaction since tasks should
4059
* reserve enough transaction units (metadata space). However, while that is true
4060
* most of the time, chunk allocation may still be triggered for several reasons:
4061
*
4062
* 1) When reserving metadata, we check if there is enough free space in the
4063
* metadata space_info and therefore don't trigger allocation of a new chunk.
4064
* However later when the task actually tries to COW an extent buffer from
4065
* the extent btree or from the device btree for example, it is forced to
4066
* allocate a new block group (chunk) because the only one that had enough
4067
* free space was just turned to RO mode by a running scrub for example (or
4068
* device replace, block group reclaim thread, etc), so we can not use it
4069
* for allocating an extent and end up being forced to allocate a new one;
4070
*
4071
* 2) Because we only check that the metadata space_info has enough free bytes,
4072
* we end up not allocating a new metadata chunk in that case. However if
4073
* the filesystem was mounted in degraded mode, none of the existing block
4074
* groups might be suitable for extent allocation due to their incompatible
4075
* profile (for e.g. mounting a 2 devices filesystem, where all block groups
4076
* use a RAID1 profile, in degraded mode using a single device). In this case
4077
* when the task attempts to COW some extent buffer of the extent btree for
4078
* example, it will trigger allocation of a new metadata block group with a
4079
* suitable profile (SINGLE profile in the example of the degraded mount of
4080
* the RAID1 filesystem);
4081
*
4082
* 3) The task has reserved enough transaction units / metadata space, but when
4083
* it attempts to COW an extent buffer from the extent or device btree for
4084
* example, it does not find any free extent in any metadata block group,
4085
* therefore forced to try to allocate a new metadata block group.
4086
* This is because some other task allocated all available extents in the
4087
* meanwhile - this typically happens with tasks that don't reserve space
4088
* properly, either intentionally or as a bug. One example where this is
4089
* done intentionally is fsync, as it does not reserve any transaction units
4090
* and ends up allocating a variable number of metadata extents for log
4091
* tree extent buffers;
4092
*
4093
* 4) The task has reserved enough transaction units / metadata space, but right
4094
* before it tries to allocate the last extent buffer it needs, a discard
4095
* operation comes in and, temporarily, removes the last free space entry from
4096
* the only metadata block group that had free space (discard starts by
4097
* removing a free space entry from a block group, then does the discard
4098
* operation and, once it's done, it adds back the free space entry to the
4099
* block group).
4100
*
4101
* We also need this two-phase setup when adding a device to a filesystem with
4102
* a seed device - we must create new metadata and system chunks without adding
4103
* any of the block group items to the chunk, extent and device btrees. If we
4104
* did not do it this way, we would get ENOSPC when attempting to update those
4105
* btrees, since all the chunks from the seed device are read-only.
4106
*
4107
* Phase 1 does the updates and insertions to the chunk btree because if we had
4108
* it done in phase 2 and have a thundering herd of tasks allocating chunks in
4109
* parallel, we risk having too many system chunks allocated by many tasks if
4110
* many tasks reach phase 1 without the previous ones completing phase 2. In the
4111
* extreme case this leads to exhaustion of the system chunk array in the
4112
* superblock. This is easier to trigger if using a btree node/leaf size of 64K
4113
* and with RAID filesystems (so we have more device items in the chunk btree).
4114
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
4115
* the system chunk array due to concurrent allocations") provides more details.
4116
*
4117
* Allocation of system chunks does not happen through this function. A task that
4118
* needs to update the chunk btree (the only btree that uses system chunks), must
4119
* preallocate chunk space by calling either check_system_chunk() or
4120
* btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
4121
* metadata chunk or when removing a chunk, while the latter is used before doing
4122
* a modification to the chunk btree - use cases for the latter are adding,
4123
* removing and resizing a device as well as relocation of a system chunk.
4124
* See the comment below for more details.
4125
*
4126
* The reservation of system space, done through check_system_chunk(), as well
4127
* as all the updates and insertions into the chunk btree must be done while
4128
* holding fs_info->chunk_mutex. This is important to guarantee that while COWing
4129
* an extent buffer from the chunk btree we never trigger allocation of a new
4130
* system chunk, which would result in a deadlock (trying to lock twice an
4131
* extent buffer of the chunk btree, first time before triggering the chunk
4132
* allocation and the second time during chunk allocation while attempting to
4133
* update the chunk btree). The system chunk array is also updated while holding
4134
* that mutex. The same logic applies to removing chunks - we must reserve system
4135
* space, update the chunk btree and the system chunk array in the superblock
4136
* while holding fs_info->chunk_mutex.
4137
*
4138
* This function, btrfs_chunk_alloc(), belongs to phase 1.
4139
*
4140
* @space_info: specify which space_info the new chunk should belong to.
4141
*
4142
* If @force is CHUNK_ALLOC_FORCE:
4143
* - return 1 if it successfully allocates a chunk,
4144
* - return errors including -ENOSPC otherwise.
4145
* If @force is NOT CHUNK_ALLOC_FORCE:
4146
* - return 0 if it doesn't need to allocate a new chunk,
4147
* - return 1 if it successfully allocates a chunk,
4148
* - return errors including -ENOSPC otherwise.
4149
*/
4150
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
4151
struct btrfs_space_info *space_info, u64 flags,
4152
enum btrfs_chunk_alloc_enum force)
4153
{
4154
struct btrfs_fs_info *fs_info = trans->fs_info;
4155
struct btrfs_block_group *ret_bg;
4156
bool wait_for_alloc = false;
4157
bool should_alloc = false;
4158
bool from_extent_allocation = false;
4159
int ret = 0;
4160
4161
if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
4162
from_extent_allocation = true;
4163
force = CHUNK_ALLOC_FORCE;
4164
}
4165
4166
/* Don't re-enter if we're already allocating a chunk */
4167
if (trans->allocating_chunk)
4168
return -ENOSPC;
4169
/*
4170
* Allocation of system chunks can not happen through this path, as we
4171
* could end up in a deadlock if we are allocating a data or metadata
4172
* chunk and there is another task modifying the chunk btree.
4173
*
4174
* This is because while we are holding the chunk mutex, we will attempt
4175
* to add the new chunk item to the chunk btree or update an existing
4176
* device item in the chunk btree, while the other task that is modifying
4177
* the chunk btree is attempting to COW an extent buffer while holding a
4178
* lock on it and on its parent - if the COW operation triggers a system
4179
* chunk allocation, then we can deadlock because we are holding the
4180
* chunk mutex and we may need to access that extent buffer or its parent
4181
* in order to add the chunk item or update a device item.
4182
*
4183
* Tasks that want to modify the chunk tree should reserve system space
4184
* before updating the chunk btree, by calling either
4185
* btrfs_reserve_chunk_metadata() or check_system_chunk().
4186
* It's possible that after a task reserves the space, it still ends up
4187
* here - this happens in the cases described above at do_chunk_alloc().
4188
* The task will have to either retry or fail.
4189
*/
4190
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4191
return -ENOSPC;
4192
4193
do {
4194
spin_lock(&space_info->lock);
4195
if (force < space_info->force_alloc)
4196
force = space_info->force_alloc;
4197
should_alloc = should_alloc_chunk(fs_info, space_info, force);
4198
if (space_info->full) {
4199
/* No more free physical space */
4200
spin_unlock(&space_info->lock);
4201
if (should_alloc)
4202
ret = -ENOSPC;
4203
else
4204
ret = 0;
4205
return ret;
4206
} else if (!should_alloc) {
4207
spin_unlock(&space_info->lock);
4208
return 0;
4209
} else if (space_info->chunk_alloc) {
4210
/*
4211
* Someone is already allocating, so we need to block
4212
* until this someone is finished and then loop to
4213
* recheck if we should continue with our allocation
4214
* attempt.
4215
*/
4216
spin_unlock(&space_info->lock);
4217
wait_for_alloc = true;
4218
force = CHUNK_ALLOC_NO_FORCE;
4219
mutex_lock(&fs_info->chunk_mutex);
4220
mutex_unlock(&fs_info->chunk_mutex);
4221
} else {
4222
/* Proceed with allocation */
4223
space_info->chunk_alloc = true;
4224
spin_unlock(&space_info->lock);
4225
wait_for_alloc = false;
4226
}
4227
4228
cond_resched();
4229
} while (wait_for_alloc);
4230
4231
mutex_lock(&fs_info->chunk_mutex);
4232
trans->allocating_chunk = true;
4233
4234
/*
4235
* If we have mixed data/metadata chunks we want to make sure we keep
4236
* allocating mixed chunks instead of individual chunks.
4237
*/
4238
if (btrfs_mixed_space_info(space_info))
4239
flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4240
4241
/*
4242
* if we're doing a data chunk, go ahead and make sure that
4243
* we keep a reasonable number of metadata chunks allocated in the
4244
* FS as well.
4245
*/
4246
if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4247
fs_info->data_chunk_allocations++;
4248
if (!(fs_info->data_chunk_allocations %
4249
fs_info->metadata_ratio))
4250
force_metadata_allocation(fs_info);
4251
}
4252
4253
ret_bg = do_chunk_alloc(trans, space_info, flags);
4254
trans->allocating_chunk = false;
4255
4256
if (IS_ERR(ret_bg)) {
4257
ret = PTR_ERR(ret_bg);
4258
} else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
4259
/*
4260
* New block group is likely to be used soon. Try to activate
4261
* it now. Failure is OK for now.
4262
*/
4263
btrfs_zone_activate(ret_bg);
4264
}
4265
4266
if (!ret)
4267
btrfs_put_block_group(ret_bg);
4268
4269
spin_lock(&space_info->lock);
4270
if (ret < 0) {
4271
if (ret == -ENOSPC)
4272
space_info->full = true;
4273
else
4274
goto out;
4275
} else {
4276
ret = 1;
4277
space_info->max_extent_size = 0;
4278
}
4279
4280
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4281
out:
4282
space_info->chunk_alloc = false;
4283
spin_unlock(&space_info->lock);
4284
mutex_unlock(&fs_info->chunk_mutex);
4285
4286
return ret;
4287
}
4288
4289
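/*
* Return the number of device items that may have to be updated when
* allocating or removing a chunk of the given @type: the profile's devs_max,
* or the number of rw devices if the profile has no fixed maximum.
*/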
static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type)
4290
{
4291
u64 num_dev;
4292
4293
num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4294
if (!num_dev)
4295
num_dev = fs_info->fs_devices->rw_devices;
4296
4297
return num_dev;
4298
}
4299
4300
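/*
* Reserve @bytes in the chunk block reserve for an upcoming modification of
* the chunk btree, allocating a new system chunk first if the SYSTEM
* space_info does not have enough free space. Must be called with
* fs_info->chunk_mutex held.
*/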
static void reserve_chunk_space(struct btrfs_trans_handle *trans,
4301
u64 bytes,
4302
u64 type)
4303
{
4304
struct btrfs_fs_info *fs_info = trans->fs_info;
4305
struct btrfs_space_info *info;
4306
u64 left;
4307
int ret = 0;
4308
4309
/*
4310
* Needed because we can end up allocating a system chunk and we need an
4311
* atomic and race-free space reservation in the chunk block reserve.
4312
*/
4313
lockdep_assert_held(&fs_info->chunk_mutex);
4314
4315
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4316
spin_lock(&info->lock);
4317
left = info->total_bytes - btrfs_space_info_used(info, true);
4318
spin_unlock(&info->lock);
4319
4320
if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4321
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4322
left, bytes, type);
4323
btrfs_dump_space_info(info, 0, false);
4324
}
4325
4326
if (left < bytes) {
4327
u64 flags = btrfs_system_alloc_profile(fs_info);
4328
struct btrfs_block_group *bg;
4329
struct btrfs_space_info *space_info;
4330
4331
space_info = btrfs_find_space_info(fs_info, flags);
4332
ASSERT(space_info);
4333
4334
/*
4335
* Ignore failure to create system chunk. We might end up not
4336
* needing it, as we might not need to COW all nodes/leaves from
4337
* the paths we visit in the chunk tree (they were already COWed
4338
* or created in the current transaction for example).
4339
*/
4340
bg = btrfs_create_chunk(trans, space_info, flags);
4341
if (IS_ERR(bg)) {
4342
ret = PTR_ERR(bg);
4343
} else {
4344
/*
4345
* We have a new chunk. We also need to activate it for
4346
* zoned filesystem.
4347
*/
4348
ret = btrfs_zoned_activate_one_bg(info, true);
4349
if (ret < 0)
4350
return;
4351
4352
/*
4353
* If we fail to add the chunk item here, we end up
4354
* trying again at phase 2 of chunk allocation, at
4355
* btrfs_create_pending_block_groups(). So ignore
4356
* any error here. An ENOSPC here could happen, due to
4357
* the cases described at do_chunk_alloc() - the system
4358
* block group we just created was just turned into RO
4359
* mode by a scrub for example, or a running discard
4360
* temporarily removed its free space entries, etc.
4361
*/
4362
btrfs_chunk_alloc_add_chunk_item(trans, bg);
4363
}
4364
}
4365
4366
if (!ret) {
4367
ret = btrfs_block_rsv_add(fs_info,
4368
&fs_info->chunk_block_rsv,
4369
bytes, BTRFS_RESERVE_NO_FLUSH);
4370
if (!ret)
4371
trans->chunk_bytes_reserved += bytes;
4372
}
4373
}
4374
4375
/*
4376
* Reserve space in the system space for allocating or removing a chunk.
4377
* The caller must be holding fs_info->chunk_mutex.
4378
*/
4379
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4380
{
4381
struct btrfs_fs_info *fs_info = trans->fs_info;
4382
const u64 num_devs = get_profile_num_devs(fs_info, type);
4383
u64 bytes;
4384
4385
/* num_devs device items to update and 1 chunk item to add or remove. */
4386
bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
4387
btrfs_calc_insert_metadata_size(fs_info, 1);
4388
4389
reserve_chunk_space(trans, bytes, type);
4390
}
4391
4392
/*
4393
* Reserve space in the system space, if needed, for doing a modification to the
4394
* chunk btree.
4395
*
4396
* @trans: A transaction handle.
4397
* @is_item_insertion: Indicate if the modification is for inserting a new item
4398
* in the chunk btree or if it's for the deletion or update
4399
* of an existing item.
4400
*
4401
* This is used in a context where we need to update the chunk btree outside
4402
* block group allocation and removal, to avoid a deadlock with a concurrent
4403
* task that is allocating a metadata or data block group and therefore needs to
4404
* update the chunk btree while holding the chunk mutex. After the update to the
4405
* chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
4406
4407
*/
4408
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
4409
bool is_item_insertion)
4410
{
4411
struct btrfs_fs_info *fs_info = trans->fs_info;
4412
u64 bytes;
4413
4414
if (is_item_insertion)
4415
bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
4416
else
4417
bytes = btrfs_calc_metadata_size(fs_info, 1);
4418
4419
mutex_lock(&fs_info->chunk_mutex);
4420
reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
4421
mutex_unlock(&fs_info->chunk_mutex);
4422
}
4423
4424
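/*
* Walk all block groups, wait for any in-progress caching to finish and drop
* the inode reference (BLOCK_GROUP_FLAG_IREF) held for each free space cache
* inode.
*/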
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
4425
{
4426
struct btrfs_block_group *block_group;
4427
4428
block_group = btrfs_lookup_first_block_group(info, 0);
4429
while (block_group) {
4430
btrfs_wait_block_group_cache_done(block_group);
4431
spin_lock(&block_group->lock);
4432
if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
4433
&block_group->runtime_flags)) {
4434
struct btrfs_inode *inode = block_group->inode;
4435
4436
block_group->inode = NULL;
4437
spin_unlock(&block_group->lock);
4438
4439
ASSERT(block_group->io_ctl.inode == NULL);
4440
iput(&inode->vfs_inode);
4441
} else {
4442
spin_unlock(&block_group->lock);
4443
}
4444
block_group = btrfs_next_block_group(block_group);
4445
}
4446
}
4447
4448
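/*
* Sanity check a space_info that is about to be freed: process and free its
* sub-groups first, then warn if any bytes are still accounted as pinned,
* may_use, reserved or pending reclaim.
*/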
static void check_removing_space_info(struct btrfs_space_info *space_info)
4449
{
4450
struct btrfs_fs_info *info = space_info->fs_info;
4451
4452
if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) {
4453
/* This is a top space_info, proceed with its children first. */
4454
for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
4455
if (space_info->sub_group[i]) {
4456
check_removing_space_info(space_info->sub_group[i]);
4457
kfree(space_info->sub_group[i]);
4458
space_info->sub_group[i] = NULL;
4459
}
4460
}
4461
}
4462
4463
/*
4464
* Do not hide this behind enospc_debug, this is actually important and
4465
* indicates a real bug if this happens.
4466
*/
4467
if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
4468
btrfs_dump_space_info(space_info, 0, false);
4469
4470
/*
4471
* If there was a failure to cleanup a log tree, very likely due to an
4472
* IO failure on a writeback attempt of one or more of its extent
4473
* buffers, we could not do proper (and cheap) unaccounting of their
4474
* reserved space, so don't warn on bytes_reserved > 0 in that case.
4475
*/
4476
if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
4477
!BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4478
if (WARN_ON(space_info->bytes_reserved > 0))
4479
btrfs_dump_space_info(space_info, 0, false);
4480
}
4481
4482
WARN_ON(space_info->reclaim_size > 0);
4483
}
4484
4485
/*
4486
* Must be called only after stopping all workers, since we could have block
4487
* group caching kthreads running, and therefore they could race with us if we
4488
* freed the block groups before stopping them.
4489
*/
4490
int btrfs_free_block_groups(struct btrfs_fs_info *info)
4491
{
4492
struct btrfs_block_group *block_group;
4493
struct btrfs_space_info *space_info;
4494
struct btrfs_caching_control *caching_ctl;
4495
struct rb_node *n;
4496
4497
if (btrfs_is_zoned(info)) {
4498
if (info->active_meta_bg) {
4499
btrfs_put_block_group(info->active_meta_bg);
4500
info->active_meta_bg = NULL;
4501
}
4502
if (info->active_system_bg) {
4503
btrfs_put_block_group(info->active_system_bg);
4504
info->active_system_bg = NULL;
4505
}
4506
}
4507
4508
write_lock(&info->block_group_cache_lock);
4509
while (!list_empty(&info->caching_block_groups)) {
4510
caching_ctl = list_first_entry(&info->caching_block_groups,
4511
struct btrfs_caching_control, list);
4512
list_del(&caching_ctl->list);
4513
btrfs_put_caching_control(caching_ctl);
4514
}
4515
write_unlock(&info->block_group_cache_lock);
4516
4517
spin_lock(&info->unused_bgs_lock);
4518
while (!list_empty(&info->unused_bgs)) {
4519
block_group = list_first_entry(&info->unused_bgs,
4520
struct btrfs_block_group,
4521
bg_list);
4522
list_del_init(&block_group->bg_list);
4523
btrfs_put_block_group(block_group);
4524
}
4525
4526
while (!list_empty(&info->reclaim_bgs)) {
4527
block_group = list_first_entry(&info->reclaim_bgs,
4528
struct btrfs_block_group,
4529
bg_list);
4530
list_del_init(&block_group->bg_list);
4531
btrfs_put_block_group(block_group);
4532
}
4533
spin_unlock(&info->unused_bgs_lock);
4534
4535
spin_lock(&info->zone_active_bgs_lock);
4536
while (!list_empty(&info->zone_active_bgs)) {
4537
block_group = list_first_entry(&info->zone_active_bgs,
4538
struct btrfs_block_group,
4539
active_bg_list);
4540
list_del_init(&block_group->active_bg_list);
4541
btrfs_put_block_group(block_group);
4542
}
4543
spin_unlock(&info->zone_active_bgs_lock);
4544
4545
write_lock(&info->block_group_cache_lock);
4546
while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
4547
block_group = rb_entry(n, struct btrfs_block_group,
4548
cache_node);
4549
rb_erase_cached(&block_group->cache_node,
4550
&info->block_group_cache_tree);
4551
RB_CLEAR_NODE(&block_group->cache_node);
4552
write_unlock(&info->block_group_cache_lock);
4553
4554
down_write(&block_group->space_info->groups_sem);
4555
list_del(&block_group->list);
4556
up_write(&block_group->space_info->groups_sem);
4557
4558
/*
4559
* We haven't cached this block group, which means we could
4560
* possibly have excluded extents on this block group.
4561
*/
4562
if (block_group->cached == BTRFS_CACHE_NO ||
4563
block_group->cached == BTRFS_CACHE_ERROR)
4564
btrfs_free_excluded_extents(block_group);
4565
4566
btrfs_remove_free_space_cache(block_group);
4567
ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4568
ASSERT(list_empty(&block_group->dirty_list));
4569
ASSERT(list_empty(&block_group->io_list));
4570
ASSERT(list_empty(&block_group->bg_list));
4571
ASSERT(refcount_read(&block_group->refs) == 1);
4572
ASSERT(block_group->swap_extents == 0);
4573
btrfs_put_block_group(block_group);
4574
4575
write_lock(&info->block_group_cache_lock);
4576
}
4577
write_unlock(&info->block_group_cache_lock);
4578
4579
btrfs_release_global_block_rsv(info);
4580
4581
while (!list_empty(&info->space_info)) {
4582
space_info = list_first_entry(&info->space_info,
4583
struct btrfs_space_info, list);
4584
4585
check_removing_space_info(space_info);
4586
list_del(&space_info->list);
4587
btrfs_sysfs_remove_space_info(space_info);
4588
}
4589
return 0;
4590
}
4591
4592
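/*
* Freezing a block group delays the cleanup done after its removal (dropping
* the chunk mapping and the free space cache entries) until the last
* btrfs_unfreeze_block_group() call, see btrfs_unfreeze_block_group() below.
*/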
void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4593
{
4594
atomic_inc(&cache->frozen);
4595
}
4596
4597
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4598
{
4599
struct btrfs_fs_info *fs_info = block_group->fs_info;
4600
bool cleanup;
4601
4602
spin_lock(&block_group->lock);
4603
cleanup = (atomic_dec_and_test(&block_group->frozen) &&
4604
test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
4605
spin_unlock(&block_group->lock);
4606
4607
if (cleanup) {
4608
struct btrfs_chunk_map *map;
4609
4610
map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
4611
/* Logic error, can't happen. */
4612
ASSERT(map);
4613
4614
btrfs_remove_chunk_map(fs_info, map);
4615
4616
/* Once for our lookup reference. */
4617
btrfs_free_chunk_map(map);
4618
4619
/*
4620
* We may have left one free space entry and other tasks trimming
4621
* this block group may have left one entry each.
4622
* Free them if any.
4623
*/
4624
btrfs_remove_free_space_cache(block_group);
4625
}
4626
}
4627
4628
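/*
* Account an extent of a swap file in the block group. Returns false, without
* accounting anything, if the block group is read-only.
*/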
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4629
{
4630
bool ret = true;
4631
4632
spin_lock(&bg->lock);
4633
if (bg->ro)
4634
ret = false;
4635
else
4636
bg->swap_extents++;
4637
spin_unlock(&bg->lock);
4638
4639
return ret;
4640
}
4641
4642
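/* Undo the accounting of @amount swap file extents in the block group. */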
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4643
{
4644
spin_lock(&bg->lock);
4645
ASSERT(!bg->ro);
4646
ASSERT(bg->swap_extents >= amount);
4647
bg->swap_extents -= amount;
4648
spin_unlock(&bg->lock);
4649
}
4650
4651
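/*
* Map an allocation size to a block group size class: up to 128K is small, up
* to 8M is medium and anything larger is large.
*/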
enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
4652
{
4653
if (size <= SZ_128K)
4654
return BTRFS_BG_SZ_SMALL;
4655
if (size <= SZ_8M)
4656
return BTRFS_BG_SZ_MEDIUM;
4657
return BTRFS_BG_SZ_LARGE;
4658
}
4659
4660
/*
4661
* Handle a block group allocating an extent in a size class
4662
*
4663
* @bg: The block group we allocated in.
4664
* @size_class: The size class of the allocation.
4665
* @force_wrong_size_class: Whether we are desperate enough to allow
4666
* mismatched size classes.
4667
*
4668
* Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
4669
* case of a race that leads to the wrong size class without
4670
* force_wrong_size_class set.
4671
*
4672
* find_free_extent will skip block groups with a mismatched size class until
4673
* it really needs to avoid ENOSPC. In that case it will set
4674
* force_wrong_size_class. However, if a block group is newly allocated and
4675
* doesn't yet have a size class, then it is possible for two allocations of
4676
* different sizes to race and both try to use it. The loser is caught here and
4677
* has to retry.
4678
*/
4679
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
4680
enum btrfs_block_group_size_class size_class,
4681
bool force_wrong_size_class)
4682
{
4683
ASSERT(size_class != BTRFS_BG_SZ_NONE);
4684
4685
/* The new allocation is in the right size class, do nothing */
4686
if (bg->size_class == size_class)
4687
return 0;
4688
/*
4689
* The new allocation is in a mismatched size class.
4690
* This means one of two things:
4691
*
4692
* 1. Two tasks in find_free_extent for different size_classes raced
4693
* and hit the same empty block_group. Make the loser try again.
4694
* 2. A call to find_free_extent got desperate enough to set
4695
* 'force_wrong_size_class'. Don't change the size_class, but allow the
4696
* allocation.
4697
*/
4698
if (bg->size_class != BTRFS_BG_SZ_NONE) {
4699
if (force_wrong_size_class)
4700
return 0;
4701
return -EAGAIN;
4702
}
4703
/*
4704
* The happy new block group case: the new allocation is the first
4705
* one in the block_group so we set size_class.
4706
*/
4707
bg->size_class = size_class;
4708
4709
return 0;
4710
}
4711
4712
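/* Size classes are only used for data block groups on non-zoned filesystems. */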
bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
4713
{
4714
if (btrfs_is_zoned(bg->fs_info))
4715
return false;
4716
if (!btrfs_is_block_group_data_only(bg))
4717
return false;
4718
return true;
4719
}
4720
4721