GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/disk-io.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <linux/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "uuid-tree.h"
#include "relocation.h"
#include "scrub.h"
#include "super.h"

#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)

static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);

static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
	if (fs_info->csum_shash)
		crypto_free_shash(fs_info->csum_shash);
}

/*
 * Compute the csum of a btree block and store the result to provided buffer.
 */
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	int num_pages;
	u32 first_page_part;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	char *kaddr;
	int i;

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);

	if (buf->addr) {
		/* Pages are contiguous, handle them as a big one. */
		kaddr = buf->addr;
		first_page_part = fs_info->nodesize;
		num_pages = 1;
	} else {
		kaddr = folio_address(buf->folios[0]);
		first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
		num_pages = num_extent_pages(buf);
	}

	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
			    first_page_part - BTRFS_CSUM_SIZE);

	/*
	 * Multiple single-page folios case would reach here.
	 *
	 * nodesize <= PAGE_SIZE and large folio all handled by above
	 * crypto_shash_update() already.
	 */
	for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
		kaddr = folio_address(buf->folios[i]);
		crypto_shash_update(shash, kaddr, PAGE_SIZE);
	}
	memset(result, 0, BTRFS_CSUM_SIZE);
	crypto_shash_final(shash, result);
}
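
/*
 * Note (added commentary, not part of the upstream file): the checksum above
 * deliberately skips the first BTRFS_CSUM_SIZE bytes of the block, because
 * that is where the checksum itself lives in struct btrfs_header.  A rough
 * sketch of the on-disk layout this relies on:
 *
 *	offset 0                 BTRFS_CSUM_SIZE                     nodesize
 *	| csum (up to 32 bytes)  | rest of btrfs_header + tree items ....... |
 *	  ^ filled in by callers   ^ range covered by crypto_shash_update()
 */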

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer. This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
{
	if (!extent_buffer_uptodate(eb))
		return 0;

	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 1;

	if (atomic)
		return -EAGAIN;

	if (!extent_buffer_uptodate(eb) ||
	    btrfs_header_generation(eb) != parent_transid) {
		btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
			eb->start, eb->read_mirror,
			parent_transid, btrfs_header_generation(eb));
		clear_extent_buffer_uptodate(eb);
		return 0;
	}
	return 1;
}

static bool btrfs_supported_super_csum(u16 csum_type)
{
	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
	case BTRFS_CSUM_TYPE_XXHASH:
	case BTRFS_CSUM_TYPE_SHA256:
	case BTRFS_CSUM_TYPE_BLAKE2:
		return true;
	default:
		return false;
	}
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
			   const struct btrfs_super_block *disk_sb)
{
	char result[BTRFS_CSUM_SIZE];
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	shash->tfm = fs_info->csum_shash;

	/*
	 * The super_block structure does not span the whole
	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
	 * filled with zeros and is included in the checksum.
	 */
	crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

	if (memcmp(disk_sb->csum, result, fs_info->csum_size))
		return 1;

	return 0;
}
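
/*
 * Illustrative sketch (added commentary, not part of the upstream file):
 * a mount-time caller is expected to verify the superblock roughly like this
 * after reading BTRFS_SUPER_INFO_SIZE raw bytes from disk and before trusting
 * any field in it.  "disk_super" is a hypothetical buffer name used only for
 * the example:
 *
 *	if (btrfs_check_super_csum(fs_info, disk_super)) {
 *		btrfs_err(fs_info, "superblock checksum mismatch");
 *		return -EINVAL;
 *	}
 */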

static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
				      int mirror_num)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int ret = 0;

	if (sb_rdonly(fs_info->sb))
		return -EROFS;

	for (int i = 0; i < num_extent_folios(eb); i++) {
		struct folio *folio = eb->folios[i];
		u64 start = max_t(u64, eb->start, folio_pos(folio));
		u64 end = min_t(u64, eb->start + eb->len,
				folio_pos(folio) + eb->folio_size);
		u32 len = end - start;
		phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) +
				    offset_in_folio(folio, start);

		ret = btrfs_repair_io_failure(fs_info, 0, start, len, start,
					      paddr, mirror_num);
		if (ret)
			break;
	}

	return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @check:	expected tree parentness check, see the comments of the
 *		structure for details.
 */
int btrfs_read_extent_buffer(struct extent_buffer *eb,
			     const struct btrfs_tree_parent_check *check)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int failed = 0;
	int ret;
	int num_copies = 0;
	int mirror_num = 0;
	int failed_mirror = 0;

	ASSERT(check);

	while (1) {
		ret = read_extent_buffer_pages(eb, mirror_num, check);
		if (!ret)
			break;

		num_copies = btrfs_num_copies(fs_info,
					      eb->start, eb->len);
		if (num_copies == 1)
			break;

		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

		mirror_num++;
		if (mirror_num == failed_mirror)
			mirror_num++;

		if (mirror_num > num_copies)
			break;
	}

	if (failed && !ret && failed_mirror)
		btrfs_repair_eb_io_failure(eb, failed_mirror);

	return ret;
}
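
/*
 * Worked example (added commentary, not part of the upstream file): on a
 * RAID1 filesystem btrfs_num_copies() returns 2.  If the first read comes
 * back from mirror 1 with a bad checksum, the loop records failed_mirror = 1,
 * skips over it when advancing mirror_num, and retries mirror 2.  If that
 * copy reads cleanly, btrfs_repair_eb_io_failure() then rewrites the bad copy
 * on mirror 1 from the good data.  With a single copy (num_copies == 1) the
 * loop gives up immediately and the read error is returned to the caller.
 */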

/*
 * Checksum a dirty tree block before IO.
 */
int btree_csum_one_bio(struct btrfs_bio *bbio)
{
	struct extent_buffer *eb = bbio->private;
	struct btrfs_fs_info *fs_info = eb->fs_info;
	u64 found_start = btrfs_header_bytenr(eb);
	u64 last_trans;
	u8 result[BTRFS_CSUM_SIZE];
	int ret;

	/* Btree blocks are always contiguous on disk. */
	if (WARN_ON_ONCE(bbio->file_offset != eb->start))
		return -EIO;
	if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
		return -EIO;

	/*
	 * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
	 * checksum it but zero-out its content. This is done to preserve
	 * ordering of I/O without unnecessarily writing out data.
	 */
	if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
		memzero_extent_buffer(eb, 0, eb->len);
		return 0;
	}

	if (WARN_ON_ONCE(found_start != eb->start))
		return -EIO;
	if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
		return -EIO;

	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
				    offsetof(struct btrfs_header, fsid),
				    BTRFS_FSID_SIZE) == 0);
	csum_tree_block(eb, result);

	if (btrfs_header_level(eb))
		ret = btrfs_check_node(eb);
	else
		ret = btrfs_check_leaf(eb);

	if (ret < 0)
		goto error;

	/*
	 * Also check the generation, the eb reached here must be newer than
	 * last committed. Or something seriously wrong happened.
	 */
	last_trans = btrfs_get_last_trans_committed(fs_info);
	if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
			"block=%llu bad generation, have %llu expect > %llu",
			  eb->start, btrfs_header_generation(eb), last_trans);
		goto error;
	}
	write_extent_buffer(eb, result, 0, fs_info->csum_size);
	return 0;

error:
	btrfs_print_tree(eb, 0);
	btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
		  eb->start);
	/*
	 * Be noisy if this is an extent buffer from a log tree. We don't abort
	 * a transaction in case there's a bad log tree extent buffer, we just
	 * fall back to a transaction commit. Still we want to know when there
	 * is a bad log tree extent buffer, as that may signal a bug somewhere.
	 */
	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
		btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
	return ret;
}

static bool check_tree_block_fsid(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	u8 fsid[BTRFS_FSID_SIZE];

	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
			   BTRFS_FSID_SIZE);

	/*
	 * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
	 * This is then overwritten by metadata_uuid if it is present in the
	 * device_list_add(). The same is true for a seed device as well. So
	 * use of fs_devices::metadata_uuid is appropriate here.
	 */
	if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
		return false;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
		if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
			return false;

	return true;
}

/* Do basic extent buffer checks at read time */
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
				 const struct btrfs_tree_parent_check *check)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	u64 found_start;
	const u32 csum_size = fs_info->csum_size;
	u8 found_level;
	u8 result[BTRFS_CSUM_SIZE];
	const u8 *header_csum;
	int ret = 0;
	const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS);

	ASSERT(check);

	found_start = btrfs_header_bytenr(eb);
	if (found_start != eb->start) {
		btrfs_err_rl(fs_info,
			"bad tree block start, mirror %u want %llu have %llu",
			     eb->read_mirror, eb->start, found_start);
		ret = -EIO;
		goto out;
	}
	if (check_tree_block_fsid(eb)) {
		btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
			     eb->start, eb->read_mirror);
		ret = -EIO;
		goto out;
	}
	found_level = btrfs_header_level(eb);
	if (found_level >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info,
			"bad tree block level, mirror %u level %d on logical %llu",
			  eb->read_mirror, btrfs_header_level(eb), eb->start);
		ret = -EIO;
		goto out;
	}

	csum_tree_block(eb, result);
	header_csum = folio_address(eb->folios[0]) +
		get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));

	if (memcmp(result, header_csum, csum_size) != 0) {
		btrfs_warn_rl(fs_info,
"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s",
			      eb->start, eb->read_mirror,
			      CSUM_FMT_VALUE(csum_size, header_csum),
			      CSUM_FMT_VALUE(csum_size, result),
			      btrfs_header_level(eb),
			      ignore_csum ? ", ignored" : "");
		if (!ignore_csum) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	if (found_level != check->level) {
		btrfs_err(fs_info,
			"level verify failed on logical %llu mirror %u wanted %u found %u",
			  eb->start, eb->read_mirror, check->level, found_level);
		ret = -EIO;
		goto out;
	}
	if (unlikely(check->transid &&
		     btrfs_header_generation(eb) != check->transid)) {
		btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
			     eb->start, eb->read_mirror, check->transid,
			     btrfs_header_generation(eb));
		ret = -EIO;
		goto out;
	}
	if (check->has_first_key) {
		const struct btrfs_key *expect_key = &check->first_key;
		struct btrfs_key found_key;

		if (found_level)
			btrfs_node_key_to_cpu(eb, &found_key, 0);
		else
			btrfs_item_key_to_cpu(eb, &found_key, 0);
		if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
			btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
				  eb->start, check->transid,
				  expect_key->objectid,
				  expect_key->type, expect_key->offset,
				  found_key.objectid, found_key.type,
				  found_key.offset);
			ret = -EUCLEAN;
			goto out;
		}
	}
	if (check->owner_root) {
		ret = btrfs_check_eb_owner(eb, check->owner_root);
		if (ret < 0)
			goto out;
	}

	/* If this is a leaf block and it is corrupt, just return -EIO. */
	if (found_level == 0 && btrfs_check_leaf(eb))
		ret = -EIO;

	if (found_level > 0 && btrfs_check_node(eb))
		ret = -EIO;

	if (ret)
		btrfs_err(fs_info,
		"read time tree block corruption detected on logical %llu mirror %u",
			  eb->start, eb->read_mirror);
out:
	return ret;
}
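
/*
 * Summary (added commentary, not part of the upstream file): read-time
 * validation above proceeds from the cheapest to the most expensive check:
 * bytenr stored in the header, fsid, level bounds, checksum, then the
 * optional parent-supplied expectations (level, transid, first key, owner
 * root) and finally the full tree-checker pass on the leaf or node contents.
 */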

#ifdef CONFIG_MIGRATION
static int btree_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
	if (folio_test_dirty(src))
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (folio_get_private(src) &&
	    !filemap_release_folio(src, GFP_KERNEL))
		return -EAGAIN;
	return migrate_folio(mapping, dst, src, mode);
}
#else
#define btree_migrate_folio NULL
#endif

static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	int ret;

	if (wbc->sync_mode == WB_SYNC_NONE) {
		struct btrfs_fs_info *fs_info;

		if (wbc->for_kupdate)
			return 0;

		fs_info = inode_to_fs_info(mapping->host);
		/* this is a bit racy, but that's ok */
		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					       BTRFS_DIRTY_METADATA_THRESH,
					       fs_info->dirty_metadata_batch);
		if (ret < 0)
			return 0;
	}
	return btree_write_cache_pages(mapping, wbc);
}

static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return false;

	return try_release_extent_buffer(folio);
}

static void btree_invalidate_folio(struct folio *folio, size_t offset,
				   size_t length)
{
	struct extent_io_tree *tree;

	tree = &folio_to_inode(folio)->io_tree;
	extent_invalidate_folio(tree, folio, offset);
	btree_release_folio(folio, GFP_NOFS);
	if (folio_get_private(folio)) {
		btrfs_warn(folio_to_fs_info(folio),
			   "folio private not zero on folio %llu",
			   (unsigned long long)folio_pos(folio));
		folio_detach_private(folio);
	}
}

#ifdef DEBUG
static bool btree_dirty_folio(struct address_space *mapping,
			      struct folio *folio)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
	struct btrfs_subpage_info *spi = fs_info->subpage_info;
	struct btrfs_subpage *subpage;
	struct extent_buffer *eb;
	int cur_bit = 0;
	u64 page_start = folio_pos(folio);

	if (fs_info->sectorsize == PAGE_SIZE) {
		eb = folio_get_private(folio);
		BUG_ON(!eb);
		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		BUG_ON(!atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		return filemap_dirty_folio(mapping, folio);
	}

	ASSERT(spi);
	subpage = folio_get_private(folio);

	for (cur_bit = spi->dirty_offset;
	     cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
	     cur_bit++) {
		unsigned long flags;
		u64 cur;

		spin_lock_irqsave(&subpage->lock, flags);
		if (!test_bit(cur_bit, subpage->bitmaps)) {
			spin_unlock_irqrestore(&subpage->lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&subpage->lock, flags);
		cur = page_start + cur_bit * fs_info->sectorsize;

		eb = find_extent_buffer(fs_info, cur);
		ASSERT(eb);
		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		ASSERT(atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		free_extent_buffer(eb);

		cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
	}
	return filemap_dirty_folio(mapping, folio);
}
#else
#define btree_dirty_folio filemap_dirty_folio
#endif

static const struct address_space_operations btree_aops = {
	.writepages	= btree_writepages,
	.release_folio	= btree_release_folio,
	.invalidate_folio = btree_invalidate_folio,
	.migrate_folio	= btree_migrate_folio,
	.dirty_folio	= btree_dirty_folio,
};

struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
						u64 bytenr, u64 owner_root,
						int level)
{
	if (btrfs_is_testing(fs_info))
		return alloc_test_extent_buffer(fs_info, bytenr);
	return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}

/*
 * Read tree block at logical address @bytenr and do variant basic but critical
 * verification.
 *
 * @check:	expected tree parentness check, see comments of the
 *		structure for details.
 */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
				      struct btrfs_tree_parent_check *check)
{
	struct extent_buffer *buf = NULL;
	int ret;

	ASSERT(check);

	buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
					   check->level);
	if (IS_ERR(buf))
		return buf;

	ret = btrfs_read_extent_buffer(buf, check);
	if (ret) {
		free_extent_buffer_stale(buf);
		return ERR_PTR(ret);
	}
	return buf;
}
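
/*
 * Illustrative sketch (added commentary, not part of the upstream file): a
 * typical caller fills in btrfs_tree_parent_check from the parent node (or
 * from the root item, as read_tree_root_path() below does) before calling
 * read_tree_block().  The names parent_transid and parent_level are
 * placeholders used only for this example:
 *
 *	struct btrfs_tree_parent_check check = { 0 };
 *
 *	check.owner_root = btrfs_root_id(root);
 *	check.transid = parent_transid;
 *	check.level = parent_level - 1;
 *	buf = read_tree_block(fs_info, bytenr, &check);
 *	if (IS_ERR(buf))
 *		return PTR_ERR(buf);
 */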

static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
					   u64 objectid, gfp_t flags)
{
	struct btrfs_root *root;
	bool dummy = btrfs_is_testing(fs_info);

	root = kzalloc(sizeof(*root), flags);
	if (!root)
		return NULL;

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	root->fs_info = fs_info;
	root->root_key.objectid = objectid;
	root->node = NULL;
	root->commit_root = NULL;
	root->state = 0;
	RB_CLEAR_NODE(&root->rb_node);

	btrfs_set_root_last_trans(root, 0);
	root->free_objectid = 0;
	root->nr_delalloc_inodes = 0;
	root->nr_ordered_extents = 0;
	xa_init(&root->inodes);
	xa_init(&root->delayed_nodes);

	btrfs_init_root_block_rsv(root);

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->root_list);
	INIT_LIST_HEAD(&root->delalloc_inodes);
	INIT_LIST_HEAD(&root->delalloc_root);
	INIT_LIST_HEAD(&root->ordered_extents);
	INIT_LIST_HEAD(&root->ordered_root);
	INIT_LIST_HEAD(&root->reloc_dirty_list);
	spin_lock_init(&root->delalloc_lock);
	spin_lock_init(&root->ordered_extent_lock);
	spin_lock_init(&root->accounting_lock);
	spin_lock_init(&root->qgroup_meta_rsv_lock);
	mutex_init(&root->objectid_mutex);
	mutex_init(&root->log_mutex);
	mutex_init(&root->ordered_extent_mutex);
	mutex_init(&root->delalloc_mutex);
	init_waitqueue_head(&root->qgroup_flush_wait);
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
	INIT_LIST_HEAD(&root->log_ctxs[0]);
	INIT_LIST_HEAD(&root->log_ctxs[1]);
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
	atomic_set(&root->log_batch, 0);
	refcount_set(&root->refs, 1);
	atomic_set(&root->snapshot_force_cow, 0);
	atomic_set(&root->nr_swapfiles, 0);
	btrfs_set_root_log_transid(root, 0);
	root->log_transid_committed = -1;
	btrfs_set_root_last_log_commit(root, 0);
	root->anon_dev = 0;
	if (!dummy) {
		btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
					  IO_TREE_ROOT_DIRTY_LOG_PAGES);
		btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
					  IO_TREE_LOG_CSUM_RANGE);
	}

	spin_lock_init(&root->root_item_lock);
	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&root->leak_list);
	spin_lock(&fs_info->fs_roots_radix_lock);
	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
	spin_unlock(&fs_info->fs_roots_radix_lock);
#endif

	return root;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;

	if (!fs_info)
		return ERR_PTR(-EINVAL);

	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	/* We don't use the stripesize in selftest, set it as sectorsize */
	root->alloc_bytenr = 0;

	return root;
}
#endif

static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
{
	const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
	const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);

	return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
}

static int global_root_key_cmp(const void *k, const struct rb_node *node)
{
	const struct btrfs_key *key = k;
	const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);

	return btrfs_comp_cpu_keys(key, &root->root_key);
}

int btrfs_global_root_insert(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *tmp;
	int ret = 0;

	write_lock(&fs_info->global_root_lock);
	tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
	write_unlock(&fs_info->global_root_lock);

	if (tmp) {
		ret = -EEXIST;
		btrfs_warn(fs_info, "global root %llu %llu already exists",
			   btrfs_root_id(root), root->root_key.offset);
	}
	return ret;
}

void btrfs_global_root_delete(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	write_lock(&fs_info->global_root_lock);
	rb_erase(&root->rb_node, &fs_info->global_root_tree);
	write_unlock(&fs_info->global_root_lock);
}

struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
				     struct btrfs_key *key)
{
	struct rb_node *node;
	struct btrfs_root *root = NULL;

	read_lock(&fs_info->global_root_lock);
	node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
	if (node)
		root = container_of(node, struct btrfs_root, rb_node);
	read_unlock(&fs_info->global_root_lock);

	return root;
}

static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *block_group;
	u64 ret;

	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
		return 0;

	if (bytenr)
		block_group = btrfs_lookup_block_group(fs_info, bytenr);
	else
		block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
	ASSERT(block_group);
	if (!block_group)
		return 0;
	ret = block_group->global_root_id;
	btrfs_put_block_group(block_group);

	return ret;
}

struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_key key = {
		.objectid = BTRFS_CSUM_TREE_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = btrfs_global_root_id(fs_info, bytenr),
	};

	return btrfs_global_root(fs_info, &key);
}

struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_key key = {
		.objectid = BTRFS_EXTENT_TREE_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = btrfs_global_root_id(fs_info, bytenr),
	};

	return btrfs_global_root(fs_info, &key);
}

struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
				     u64 objectid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key key;
	unsigned int nofs_flag;
	int ret = 0;

	/*
	 * We're holding a transaction handle, so use a NOFS memory allocation
	 * context to avoid deadlock if reclaim happens.
	 */
	nofs_flag = memalloc_nofs_save();
	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = objectid;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = 0;

	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
				      0, BTRFS_NESTING_NORMAL);
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
		leaf = NULL;
		goto fail;
	}

	root->node = leaf;
	btrfs_mark_buffer_dirty(trans, leaf);

	root->commit_root = btrfs_root_node(root);
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);

	btrfs_set_root_flags(&root->root_item, 0);
	btrfs_set_root_limit(&root->root_item, 0);
	btrfs_set_root_bytenr(&root->root_item, leaf->start);
	btrfs_set_root_generation(&root->root_item, trans->transid);
	btrfs_set_root_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 1);
	btrfs_set_root_used(&root->root_item, leaf->len);
	btrfs_set_root_last_snapshot(&root->root_item, 0);
	btrfs_set_root_dirid(&root->root_item, 0);
	if (btrfs_is_fstree(objectid))
		generate_random_guid(root->root_item.uuid);
	else
		export_guid(root->root_item.uuid, &guid_null);
	btrfs_set_root_drop_level(&root->root_item, 0);

	btrfs_tree_unlock(leaf);

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
	if (ret)
		goto fail;

	return root;

fail:
	btrfs_put_root(root);

	return ERR_PTR(ret);
}

static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;

	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

	return root;
}

int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	struct extent_buffer *leaf;

	/*
	 * DON'T set SHAREABLE bit for log trees.
	 *
	 * Log trees are not exposed to user space thus can't be snapshotted,
	 * and they go away before a real commit is actually done.
	 *
	 * They do store pointers to file data extents, and those reference
	 * counts still get updated (along with back refs to the log tree).
	 */

	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
			NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
	if (IS_ERR(leaf))
		return PTR_ERR(leaf);

	root->node = leaf;

	btrfs_mark_buffer_dirty(trans, root->node);
	btrfs_tree_unlock(root->node);

	return 0;
}

int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *log_root;

	log_root = alloc_log_tree(fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);

	if (!btrfs_is_zoned(fs_info)) {
		int ret = btrfs_alloc_log_tree_node(trans, log_root);

		if (ret) {
			btrfs_put_root(log_root);
			return ret;
		}
	}

	WARN_ON(fs_info->log_root_tree);
	fs_info->log_root_tree = log_root;
	return 0;
}

int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log_root;
	struct btrfs_inode_item *inode_item;
	int ret;

	log_root = alloc_log_tree(fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);

	ret = btrfs_alloc_log_tree_node(trans, log_root);
	if (ret) {
		btrfs_put_root(log_root);
		return ret;
	}

	btrfs_set_root_last_trans(log_root, trans->transid);
	log_root->root_key.offset = btrfs_root_id(root);

	inode_item = &log_root->root_item.inode;
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
	btrfs_set_stack_inode_nbytes(inode_item,
				     fs_info->nodesize);
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

	btrfs_set_root_node(&log_root->root_item, log_root->node);

	WARN_ON(root->log_root);
	root->log_root = log_root;
	btrfs_set_root_log_transid(root, 0);
	root->log_transid_committed = -1;
	btrfs_set_root_last_log_commit(root, 0);
	return 0;
}

static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
					      struct btrfs_path *path,
					      const struct btrfs_key *key)
{
	struct btrfs_root *root;
	struct btrfs_tree_parent_check check = { 0 };
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
	u64 generation;
	int ret;
	int level;

	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);

	ret = btrfs_find_root(tree_root, key, path,
			      &root->root_item, &root->root_key);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto fail;
	}

	generation = btrfs_root_generation(&root->root_item);
	level = btrfs_root_level(&root->root_item);
	check.level = level;
	check.transid = generation;
	check.owner_root = key->objectid;
	root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
				     &check);
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
		root->node = NULL;
		goto fail;
	}
	if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
		ret = -EIO;
		goto fail;
	}

	/*
	 * For real fs, and not log/reloc trees, root owner must
	 * match its root node owner
	 */
	if (!btrfs_is_testing(fs_info) &&
	    btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
	    btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
	    btrfs_root_id(root) != btrfs_header_owner(root->node)) {
		btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
			   btrfs_root_id(root), root->node->start,
			   btrfs_header_owner(root->node),
			   btrfs_root_id(root));
		ret = -EUCLEAN;
		goto fail;
	}
	root->commit_root = btrfs_root_node(root);
	return root;
fail:
	btrfs_put_root(root);
	return ERR_PTR(ret);
}

struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
					const struct btrfs_key *key)
{
	struct btrfs_root *root;
	BTRFS_PATH_AUTO_FREE(path);

	path = btrfs_alloc_path();
	if (!path)
		return ERR_PTR(-ENOMEM);
	root = read_tree_root_path(tree_root, path, key);

	return root;
}

/*
 * Initialize subvolume root in-memory structure.
 *
 * @anon_dev:	anonymous device to attach to the root, if zero, allocate new
 *
 * In case of failure the caller is responsible to call btrfs_free_fs_root()
 */
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
	int ret;

	btrfs_drew_lock_init(&root->snapshot_lock);

	if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
	    !btrfs_is_data_reloc_root(root) &&
	    btrfs_is_fstree(btrfs_root_id(root))) {
		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
		btrfs_check_and_init_root_item(&root->root_item);
	}

	/*
	 * Don't assign anonymous block device to roots that are not exposed to
	 * userspace, the id pool is limited to 1M
	 */
	if (btrfs_is_fstree(btrfs_root_id(root)) &&
	    btrfs_root_refs(&root->root_item) > 0) {
		if (!anon_dev) {
			ret = get_anon_bdev(&root->anon_dev);
			if (ret)
				return ret;
		} else {
			root->anon_dev = anon_dev;
		}
	}

	mutex_lock(&root->objectid_mutex);
	ret = btrfs_init_root_free_objectid(root);
	if (ret) {
		mutex_unlock(&root->objectid_mutex);
		return ret;
	}

	ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);

	mutex_unlock(&root->objectid_mutex);

	return 0;
}

static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
					       u64 root_id)
{
	struct btrfs_root *root;

	spin_lock(&fs_info->fs_roots_radix_lock);
	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				 (unsigned long)root_id);
	root = btrfs_grab_root(root);
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return root;
}

static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
						u64 objectid)
{
	struct btrfs_key key = {
		.objectid = objectid,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = 0,
	};

	switch (objectid) {
	case BTRFS_ROOT_TREE_OBJECTID:
		return btrfs_grab_root(fs_info->tree_root);
	case BTRFS_EXTENT_TREE_OBJECTID:
		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
	case BTRFS_CHUNK_TREE_OBJECTID:
		return btrfs_grab_root(fs_info->chunk_root);
	case BTRFS_DEV_TREE_OBJECTID:
		return btrfs_grab_root(fs_info->dev_root);
	case BTRFS_CSUM_TREE_OBJECTID:
		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
	case BTRFS_QUOTA_TREE_OBJECTID:
		return btrfs_grab_root(fs_info->quota_root);
	case BTRFS_UUID_TREE_OBJECTID:
		return btrfs_grab_root(fs_info->uuid_root);
	case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
		return btrfs_grab_root(fs_info->block_group_root);
	case BTRFS_FREE_SPACE_TREE_OBJECTID:
		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
	case BTRFS_RAID_STRIPE_TREE_OBJECTID:
		return btrfs_grab_root(fs_info->stripe_root);
	default:
		return NULL;
	}
}

int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *root)
{
	int ret;

	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		return ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)btrfs_root_id(root),
				root);
	if (ret == 0) {
		btrfs_grab_root(root);
		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();

	return ret;
}

void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
	struct btrfs_root *root;

	while (!list_empty(&fs_info->allocated_roots)) {
		char buf[BTRFS_ROOT_NAME_BUF_LEN];

		root = list_first_entry(&fs_info->allocated_roots,
					struct btrfs_root, leak_list);
		btrfs_err(fs_info, "leaked root %s refcount %d",
			  btrfs_root_name(&root->root_key, buf),
			  refcount_read(&root->refs));
		WARN_ON_ONCE(1);
		while (refcount_read(&root->refs) > 1)
			btrfs_put_root(root);
		btrfs_put_root(root);
	}
#endif
}

static void free_global_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct rb_node *node;

	while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
		root = rb_entry(node, struct btrfs_root, rb_node);
		rb_erase(&root->rb_node, &fs_info->global_root_tree);
		btrfs_put_root(root);
	}
}

void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
	struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;

	if (fs_info->fs_devices)
		btrfs_close_devices(fs_info->fs_devices);
	percpu_counter_destroy(&fs_info->stats_read_blocks);
	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
	percpu_counter_destroy(&fs_info->delalloc_bytes);
	percpu_counter_destroy(&fs_info->ordered_bytes);
	if (percpu_counter_initialized(em_counter))
		ASSERT(percpu_counter_sum_positive(em_counter) == 0);
	percpu_counter_destroy(em_counter);
	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
	btrfs_free_csum_hash(fs_info);
	btrfs_free_stripe_hash_table(fs_info);
	btrfs_free_ref_cache(fs_info);
	kfree(fs_info->balance_ctl);
	kfree(fs_info->delayed_root);
	free_global_roots(fs_info);
	btrfs_put_root(fs_info->tree_root);
	btrfs_put_root(fs_info->chunk_root);
	btrfs_put_root(fs_info->dev_root);
	btrfs_put_root(fs_info->quota_root);
	btrfs_put_root(fs_info->uuid_root);
	btrfs_put_root(fs_info->fs_root);
	btrfs_put_root(fs_info->data_reloc_root);
	btrfs_put_root(fs_info->block_group_root);
	btrfs_put_root(fs_info->stripe_root);
	btrfs_check_leaked_roots(fs_info);
	btrfs_extent_buffer_leak_debug_check(fs_info);
	kfree(fs_info->super_copy);
	kfree(fs_info->super_for_commit);
	kvfree(fs_info);
}


/*
 * Get an in-memory reference of a root structure.
 *
 * For essential trees like root/extent tree, we grab it from fs_info directly.
 * For subvolume trees, we check the cached filesystem roots first. If not
 * found, then read it from disk and add it to cached fs roots.
 *
 * Caller should release the root by calling btrfs_put_root() after the usage.
 *
 * NOTE: Reloc and log trees can't be read by this function as they share the
 *	 same root objectid.
 *
 * @objectid:	root id
 * @anon_dev:	preallocated anonymous block device number for new roots,
 *		pass NULL for a new allocation.
 * @check_ref:	whether to check root item references; if true, return -ENOENT
 *		for orphan roots
 */
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
					     u64 objectid, dev_t *anon_dev,
					     bool check_ref)
{
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	root = btrfs_get_global_root(fs_info, objectid);
	if (root)
		return root;

	/*
	 * If we're called for non-subvolume trees, and above function didn't
	 * find one, do not try to read it from disk.
	 *
	 * This is namely for free-space-tree and quota tree, which can change
	 * at runtime and should only be grabbed from fs_info.
	 */
	if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
		return ERR_PTR(-ENOENT);
again:
	root = btrfs_lookup_fs_root(fs_info, objectid);
	if (root) {
		/*
		 * Some other caller may have read out the newly inserted
		 * subvolume already (for things like backref walk etc). Not
		 * that common but still possible. In that case, we just need
		 * to free the anon_dev.
		 */
		if (unlikely(anon_dev && *anon_dev)) {
			free_anon_bdev(*anon_dev);
			*anon_dev = 0;
		}

		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
			btrfs_put_root(root);
			return ERR_PTR(-ENOENT);
		}
		return root;
	}

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = btrfs_read_tree_root(fs_info->tree_root, &key);
	if (IS_ERR(root))
		return root;

	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
		ret = -ENOENT;
		goto fail;
	}

	ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
	if (ret)
		goto fail;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto fail;
	}
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	btrfs_free_path(path);
	if (ret < 0)
		goto fail;
	if (ret == 0)
		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);

	ret = btrfs_insert_fs_root(fs_info, root);
	if (ret) {
		if (ret == -EEXIST) {
			btrfs_put_root(root);
			goto again;
		}
		goto fail;
	}
	return root;
fail:
	/*
	 * If our caller provided us an anonymous device, then it's his
	 * responsibility to free it in case we fail. So we have to set our
	 * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
	 * and once again by our caller.
	 */
	if (anon_dev && *anon_dev)
		root->anon_dev = 0;
	btrfs_put_root(root);
	return ERR_PTR(ret);
}

/*
 * Get in-memory reference of a root structure
 *
 * @objectid:	tree objectid
 * @check_ref:	if set, verify that the tree exists and the item has at least
 *		one reference
 */
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
				     u64 objectid, bool check_ref)
{
	return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
}

/*
 * Get in-memory reference of a root structure, created as new, optionally pass
 * the anonymous block device id
 *
 * @objectid:	tree objectid
 * @anon_dev:	if NULL, allocate a new anonymous block device or use the
 *		parameter value if not NULL
 */
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
					 u64 objectid, dev_t *anon_dev)
{
	return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}

/*
 * Return a root for the given objectid.
 *
 * @fs_info:	the fs_info
 * @objectid:	the objectid we need to lookup
 *
 * This is exclusively used for backref walking, and exists specifically because
 * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
 * creation time, which means we may have to read the tree_root in order to look
 * up a fs root that is not in memory. If the root is not in memory we will
 * read the tree root commit root and look up the fs root from there. This is a
 * temporary root, it will not be inserted into the radix tree as it doesn't
 * have the most uptodate information, it'll simply be discarded once the
 * backref code is finished using the root.
 */
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
						 struct btrfs_path *path,
						 u64 objectid)
{
	struct btrfs_root *root;
	struct btrfs_key key;

	ASSERT(path->search_commit_root && path->skip_locking);

	/*
	 * This can return -ENOENT if we ask for a root that doesn't exist, but
	 * since this is called via the backref walking code we won't be looking
	 * up a root that doesn't exist, unless there's corruption. So if root
	 * != NULL just return it.
	 */
	root = btrfs_get_global_root(fs_info, objectid);
	if (root)
		return root;

	root = btrfs_lookup_fs_root(fs_info, objectid);
	if (root)
		return root;

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = read_tree_root_path(fs_info->tree_root, path, &key);
	btrfs_release_path(path);

	return root;
}

static int cleaner_kthread(void *arg)
{
	struct btrfs_fs_info *fs_info = arg;
	int again;

	while (1) {
		again = 0;

		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

		/* Make the cleaner go to sleep early. */
		if (btrfs_need_cleaner_sleep(fs_info))
			goto sleep;

		/*
		 * Do not do anything if we might cause open_ctree() to block
		 * before we have finished mounting the filesystem.
		 */
		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
			goto sleep;

		if (!mutex_trylock(&fs_info->cleaner_mutex))
			goto sleep;

		/*
		 * Avoid the problem that we change the status of the fs
		 * during the above check and trylock.
		 */
		if (btrfs_need_cleaner_sleep(fs_info)) {
			mutex_unlock(&fs_info->cleaner_mutex);
			goto sleep;
		}

		if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
			btrfs_sysfs_feature_update(fs_info);

		btrfs_run_delayed_iputs(fs_info);

		again = btrfs_clean_one_deleted_snapshot(fs_info);
		mutex_unlock(&fs_info->cleaner_mutex);

		/*
		 * The defragger has dealt with the R/O remount and umount,
		 * needn't do anything special here.
		 */
		btrfs_run_defrag_inodes(fs_info);

		/*
		 * Acquires fs_info->reclaim_bgs_lock to avoid racing
		 * with relocation (btrfs_relocate_chunk) and relocation
		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
		 * after acquiring fs_info->reclaim_bgs_lock. So we
		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
		 * unused block groups.
		 */
		btrfs_delete_unused_bgs(fs_info);

		/*
		 * Reclaim block groups in the reclaim_bgs list after we deleted
		 * all unused block_groups. This possibly gives us some more free
		 * space.
		 */
		btrfs_reclaim_bgs(fs_info);
sleep:
		clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
		if (kthread_should_park())
			kthread_parkme();
		if (kthread_should_stop())
			return 0;
		if (!again) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
}

static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
	u64 transid;
	time64_t delta;
	unsigned long delay;
	bool cannot_commit;

	do {
		cannot_commit = false;
		delay = secs_to_jiffies(fs_info->commit_interval);
		mutex_lock(&fs_info->transaction_kthread_mutex);

		spin_lock(&fs_info->trans_lock);
		cur = fs_info->running_transaction;
		if (!cur) {
			spin_unlock(&fs_info->trans_lock);
			goto sleep;
		}

		delta = ktime_get_seconds() - cur->start_time;
		if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
		    cur->state < TRANS_STATE_COMMIT_PREP &&
		    delta < fs_info->commit_interval) {
			spin_unlock(&fs_info->trans_lock);
			delay -= secs_to_jiffies(delta - 1);
			delay = min(delay,
				    secs_to_jiffies(fs_info->commit_interval));
			goto sleep;
		}
		transid = cur->transid;
		spin_unlock(&fs_info->trans_lock);

		/* If the file system is aborted, this will always fail. */
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) != -ENOENT)
				cannot_commit = true;
			goto sleep;
		}
		if (transid == trans->transid) {
			btrfs_commit_transaction(trans);
		} else {
			btrfs_end_transaction(trans);
		}
sleep:
		wake_up_process(fs_info->cleaner_kthread);
		mutex_unlock(&fs_info->transaction_kthread_mutex);

		if (BTRFS_FS_ERROR(fs_info))
			btrfs_cleanup_transaction(fs_info);
		if (!kthread_should_stop() &&
				(!btrfs_transaction_blocked(fs_info) ||
				 cannot_commit))
			schedule_timeout_interruptible(delay);
	} while (!kthread_should_stop());
	return 0;
}

/*
 * This will find the highest generation in the array of root backups. The
 * index of the highest array is returned, or -EINVAL if we can't find
 * anything.
 *
 * We check to make sure the array is valid by comparing the
 * generation of the latest root in the array with the generation
 * in the super block. If they don't match we pitch it.
 */
static int find_newest_super_backup(struct btrfs_fs_info *info)
{
	const u64 newest_gen = btrfs_super_generation(info->super_copy);
	u64 cur;
	struct btrfs_root_backup *root_backup;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		root_backup = info->super_copy->super_roots + i;
		cur = btrfs_backup_tree_root_gen(root_backup);
		if (cur == newest_gen)
			return i;
	}

	return -EINVAL;
}

/*
 * copy all the root pointers into the super backup array.
 * this will bump the backup pointer by one when it is
 * done
 */
static void backup_super_roots(struct btrfs_fs_info *info)
{
	const int next_backup = info->backup_root_index;
	struct btrfs_root_backup *root_backup;

	root_backup = info->super_for_commit->super_roots + next_backup;

	/*
	 * make sure all of our padding and empty slots get zero filled
	 * regardless of which ones we use today
	 */
	memset(root_backup, 0, sizeof(*root_backup));

	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;

	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
	btrfs_set_backup_tree_root_gen(root_backup,
			       btrfs_header_generation(info->tree_root->node));

	btrfs_set_backup_tree_root_level(root_backup,
			       btrfs_header_level(info->tree_root->node));

	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
	btrfs_set_backup_chunk_root_gen(root_backup,
			       btrfs_header_generation(info->chunk_root->node));
	btrfs_set_backup_chunk_root_level(root_backup,
			       btrfs_header_level(info->chunk_root->node));

	if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
		struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
		struct btrfs_root *csum_root = btrfs_csum_root(info, 0);

		btrfs_set_backup_extent_root(root_backup,
					     extent_root->node->start);
		btrfs_set_backup_extent_root_gen(root_backup,
				btrfs_header_generation(extent_root->node));
		btrfs_set_backup_extent_root_level(root_backup,
					btrfs_header_level(extent_root->node));

		btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
		btrfs_set_backup_csum_root_gen(root_backup,
					btrfs_header_generation(csum_root->node));
		btrfs_set_backup_csum_root_level(root_backup,
					btrfs_header_level(csum_root->node));
	}

	/*
	 * we might commit during log recovery, which happens before we set
	 * the fs_root. Make sure it is valid before we fill it in.
	 */
	if (info->fs_root && info->fs_root->node) {
		btrfs_set_backup_fs_root(root_backup,
					 info->fs_root->node->start);
		btrfs_set_backup_fs_root_gen(root_backup,
			       btrfs_header_generation(info->fs_root->node));
		btrfs_set_backup_fs_root_level(root_backup,
			       btrfs_header_level(info->fs_root->node));
	}

	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
	btrfs_set_backup_dev_root_gen(root_backup,
			       btrfs_header_generation(info->dev_root->node));
	btrfs_set_backup_dev_root_level(root_backup,
				       btrfs_header_level(info->dev_root->node));

	btrfs_set_backup_total_bytes(root_backup,
			     btrfs_super_total_bytes(info->super_copy));
	btrfs_set_backup_bytes_used(root_backup,
			     btrfs_super_bytes_used(info->super_copy));
	btrfs_set_backup_num_devices(root_backup,
			     btrfs_super_num_devices(info->super_copy));

	/*
	 * if we don't copy this out to the super_copy, it won't get remembered
	 * for the next commit
	 */
	memcpy(&info->super_copy->super_roots,
	       &info->super_for_commit->super_roots,
	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
}

/*
 * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
 * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
 *
 * @fs_info:	filesystem whose backup roots need to be read
 * @priority:	priority of backup root required
 *
 * Returns backup root index on success and -EINVAL otherwise.
 */
static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *super = fs_info->super_copy;
	struct btrfs_root_backup *root_backup;

	if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
		if (priority == 0)
			return backup_index;

		backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
		backup_index %= BTRFS_NUM_BACKUP_ROOTS;
	} else {
		return -EINVAL;
	}

	root_backup = super->super_roots + backup_index;

	btrfs_set_super_generation(super,
				   btrfs_backup_tree_root_gen(root_backup));
	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
	btrfs_set_super_root_level(super,
				   btrfs_backup_tree_root_level(root_backup));
	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));

	/*
	 * Fixme: the total bytes and num_devices need to match or we should
	 * need a fsck
	 */
	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));

	return backup_index;
}
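
/*
 * Worked example (added commentary, not part of the upstream file): with
 * BTRFS_NUM_BACKUP_ROOTS == 4 and the newest backup slot at index 2,
 * priority 1 selects (2 + 4 - 1) % 4 == 1, priority 2 selects index 0 and
 * priority 3 wraps around to index 3, i.e. each priority step walks one slot
 * further back in commit history.
 */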
1765
1766
/* helper to cleanup workers */
1767
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1768
{
1769
btrfs_destroy_workqueue(fs_info->fixup_workers);
1770
btrfs_destroy_workqueue(fs_info->delalloc_workers);
1771
btrfs_destroy_workqueue(fs_info->workers);
1772
if (fs_info->endio_workers)
1773
destroy_workqueue(fs_info->endio_workers);
1774
if (fs_info->rmw_workers)
1775
destroy_workqueue(fs_info->rmw_workers);
1776
if (fs_info->compressed_write_workers)
1777
destroy_workqueue(fs_info->compressed_write_workers);
1778
btrfs_destroy_workqueue(fs_info->endio_write_workers);
1779
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
1780
btrfs_destroy_workqueue(fs_info->delayed_workers);
1781
btrfs_destroy_workqueue(fs_info->caching_workers);
1782
btrfs_destroy_workqueue(fs_info->flush_workers);
1783
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
1784
if (fs_info->discard_ctl.discard_workers)
1785
destroy_workqueue(fs_info->discard_ctl.discard_workers);
1786
/*
1787
* Now that all other work queues are destroyed, we can safely destroy
1788
* the queues used for metadata I/O, since tasks from those other work
1789
* queues can do metadata I/O operations.
1790
*/
1791
if (fs_info->endio_meta_workers)
1792
destroy_workqueue(fs_info->endio_meta_workers);
1793
}
1794
1795
static void free_root_extent_buffers(struct btrfs_root *root)
1796
{
1797
if (root) {
1798
free_extent_buffer(root->node);
1799
free_extent_buffer(root->commit_root);
1800
root->node = NULL;
1801
root->commit_root = NULL;
1802
}
1803
}
1804
1805
static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
1806
{
1807
struct btrfs_root *root, *tmp;
1808
1809
rbtree_postorder_for_each_entry_safe(root, tmp,
1810
&fs_info->global_root_tree,
1811
rb_node)
1812
free_root_extent_buffers(root);
1813
}
1814
1815
/* helper to cleanup tree roots */
1816
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
1817
{
1818
free_root_extent_buffers(info->tree_root);
1819
1820
free_global_root_pointers(info);
1821
free_root_extent_buffers(info->dev_root);
1822
free_root_extent_buffers(info->quota_root);
1823
free_root_extent_buffers(info->uuid_root);
1824
free_root_extent_buffers(info->fs_root);
1825
free_root_extent_buffers(info->data_reloc_root);
1826
free_root_extent_buffers(info->block_group_root);
1827
free_root_extent_buffers(info->stripe_root);
1828
if (free_chunk_root)
1829
free_root_extent_buffers(info->chunk_root);
1830
}
1831
1832
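/*
 * Drop a reference on a root. On the last put, warn about any leftover
 * inodes or delayed nodes, release the anonymous bdev and the root's extent
 * buffers, and free the root structure itself.
 */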
void btrfs_put_root(struct btrfs_root *root)
1833
{
1834
if (!root)
1835
return;
1836
1837
if (refcount_dec_and_test(&root->refs)) {
1838
if (WARN_ON(!xa_empty(&root->inodes)))
1839
xa_destroy(&root->inodes);
1840
if (WARN_ON(!xa_empty(&root->delayed_nodes)))
1841
xa_destroy(&root->delayed_nodes);
1842
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
1843
if (root->anon_dev)
1844
free_anon_bdev(root->anon_dev);
1845
free_root_extent_buffers(root);
1846
#ifdef CONFIG_BTRFS_DEBUG
1847
spin_lock(&root->fs_info->fs_roots_radix_lock);
1848
list_del_init(&root->leak_list);
1849
spin_unlock(&root->fs_info->fs_roots_radix_lock);
1850
#endif
1851
kfree(root);
1852
}
1853
}
1854
1855
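/*
 * Release every root still tracked by the filesystem: first the roots on the
 * dead_roots list, then whatever remains in the fs_roots_radix tree.
 */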
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
1856
{
1857
int ret;
1858
struct btrfs_root *gang[8];
1859
int i;
1860
1861
while (!list_empty(&fs_info->dead_roots)) {
1862
gang[0] = list_first_entry(&fs_info->dead_roots,
1863
struct btrfs_root, root_list);
1864
list_del(&gang[0]->root_list);
1865
1866
if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
1867
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
1868
btrfs_put_root(gang[0]);
1869
}
1870
1871
while (1) {
1872
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1873
(void **)gang, 0,
1874
ARRAY_SIZE(gang));
1875
if (!ret)
1876
break;
1877
for (i = 0; i < ret; i++)
1878
btrfs_drop_and_free_fs_root(fs_info, gang[i]);
1879
}
1880
}
1881
1882
static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
1883
{
1884
mutex_init(&fs_info->scrub_lock);
1885
atomic_set(&fs_info->scrubs_running, 0);
1886
atomic_set(&fs_info->scrub_pause_req, 0);
1887
atomic_set(&fs_info->scrubs_paused, 0);
1888
atomic_set(&fs_info->scrub_cancel_req, 0);
1889
init_waitqueue_head(&fs_info->scrub_pause_wait);
1890
refcount_set(&fs_info->scrub_workers_refcnt, 0);
1891
}
1892
1893
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
1894
{
1895
spin_lock_init(&fs_info->balance_lock);
1896
mutex_init(&fs_info->balance_mutex);
1897
atomic_set(&fs_info->balance_pause_req, 0);
1898
atomic_set(&fs_info->balance_cancel_req, 0);
1899
fs_info->balance_ctl = NULL;
1900
init_waitqueue_head(&fs_info->balance_wait_q);
1901
atomic_set(&fs_info->reloc_cancel_req, 0);
1902
}
1903
1904
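/*
 * Allocate and set up the dummy btree inode that backs all metadata I/O for
 * this filesystem, then hash it into the inode cache.
 */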
static int btrfs_init_btree_inode(struct super_block *sb)
1905
{
1906
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1907
unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
1908
fs_info->tree_root);
1909
struct inode *inode;
1910
1911
inode = new_inode(sb);
1912
if (!inode)
1913
return -ENOMEM;
1914
1915
btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID);
1916
set_nlink(inode, 1);
1917
/*
1918
* We set i_size on the btree inode to the maximum possible offset
1919
* (OFFSET_MAX); the real end of the address space is determined by
1920
* all of the devices in the system.
1921
*/
1922
inode->i_size = OFFSET_MAX;
1923
inode->i_mapping->a_ops = &btree_aops;
1924
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
1925
1926
btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
1927
IO_TREE_BTREE_INODE_IO);
1928
btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
1929
1930
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
1931
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
1932
__insert_inode_hash(inode, hash);
1933
fs_info->btree_inode = inode;
1934
1935
return 0;
1936
}
1937
1938
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
1939
{
1940
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
1941
init_rwsem(&fs_info->dev_replace.rwsem);
1942
init_waitqueue_head(&fs_info->dev_replace.replace_wait);
1943
}
1944
1945
static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
1946
{
1947
spin_lock_init(&fs_info->qgroup_lock);
1948
mutex_init(&fs_info->qgroup_ioctl_lock);
1949
fs_info->qgroup_tree = RB_ROOT;
1950
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
1951
fs_info->qgroup_seq = 1;
1952
fs_info->qgroup_rescan_running = false;
1953
fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
1954
mutex_init(&fs_info->qgroup_rescan_lock);
1955
}
1956
1957
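/*
 * Allocate all the work queues used by the filesystem. Returns -ENOMEM if
 * any of the allocations failed.
 */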
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
1958
{
1959
u32 max_active = fs_info->thread_pool_size;
1960
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
1961
unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
1962
1963
fs_info->workers =
1964
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
1965
1966
fs_info->delalloc_workers =
1967
btrfs_alloc_workqueue(fs_info, "delalloc",
1968
flags, max_active, 2);
1969
1970
fs_info->flush_workers =
1971
btrfs_alloc_workqueue(fs_info, "flush_delalloc",
1972
flags, max_active, 0);
1973
1974
fs_info->caching_workers =
1975
btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
1976
1977
fs_info->fixup_workers =
1978
btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
1979
1980
fs_info->endio_workers =
1981
alloc_workqueue("btrfs-endio", flags, max_active);
1982
fs_info->endio_meta_workers =
1983
alloc_workqueue("btrfs-endio-meta", flags, max_active);
1984
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
1985
fs_info->endio_write_workers =
1986
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
1987
max_active, 2);
1988
fs_info->compressed_write_workers =
1989
alloc_workqueue("btrfs-compressed-write", flags, max_active);
1990
fs_info->endio_freespace_worker =
1991
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
1992
max_active, 0);
1993
fs_info->delayed_workers =
1994
btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
1995
max_active, 0);
1996
fs_info->qgroup_rescan_workers =
1997
btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
1998
ordered_flags);
1999
fs_info->discard_ctl.discard_workers =
2000
alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE);
2001
2002
if (!(fs_info->workers &&
2003
fs_info->delalloc_workers && fs_info->flush_workers &&
2004
fs_info->endio_workers && fs_info->endio_meta_workers &&
2005
fs_info->compressed_write_workers &&
2006
fs_info->endio_write_workers &&
2007
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2008
fs_info->caching_workers && fs_info->fixup_workers &&
2009
fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
2010
fs_info->discard_ctl.discard_workers)) {
2011
return -ENOMEM;
2012
}
2013
2014
return 0;
2015
}
2016
2017
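/*
 * Allocate the crypto shash for the checksum type recorded in the super
 * block and set BTRFS_FS_CSUM_IMPL_FAST if the implementation is an
 * accelerated one.
 */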
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2018
{
2019
struct crypto_shash *csum_shash;
2020
const char *csum_driver = btrfs_super_csum_driver(csum_type);
2021
2022
csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
2023
2024
if (IS_ERR(csum_shash)) {
2025
btrfs_err(fs_info, "error allocating %s hash for checksum",
2026
csum_driver);
2027
return PTR_ERR(csum_shash);
2028
}
2029
2030
fs_info->csum_shash = csum_shash;
2031
2032
/* Check if the checksum implementation is a fast accelerated one. */
2033
switch (csum_type) {
2034
case BTRFS_CSUM_TYPE_CRC32:
2035
if (crc32_optimizations() & CRC32C_OPTIMIZATION)
2036
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
2037
break;
2038
case BTRFS_CSUM_TYPE_XXHASH:
2039
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
2040
break;
2041
default:
2042
break;
2043
}
2044
2045
btrfs_info(fs_info, "using %s (%s) checksum algorithm",
2046
btrfs_super_csum_name(csum_type),
2047
crypto_shash_driver_name(csum_shash));
2048
return 0;
2049
}
2050
2051
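/*
 * Read the log tree root recorded in the super block and replay it. Requires
 * at least one writable device; on a read-only mount the super block is
 * committed after a successful replay.
 */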
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2052
struct btrfs_fs_devices *fs_devices)
2053
{
2054
int ret;
2055
struct btrfs_tree_parent_check check = { 0 };
2056
struct btrfs_root *log_tree_root;
2057
struct btrfs_super_block *disk_super = fs_info->super_copy;
2058
u64 bytenr = btrfs_super_log_root(disk_super);
2059
int level = btrfs_super_log_root_level(disk_super);
2060
2061
if (fs_devices->rw_devices == 0) {
2062
btrfs_warn(fs_info, "log replay required on RO media");
2063
return -EIO;
2064
}
2065
2066
log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2067
GFP_KERNEL);
2068
if (!log_tree_root)
2069
return -ENOMEM;
2070
2071
check.level = level;
2072
check.transid = fs_info->generation + 1;
2073
check.owner_root = BTRFS_TREE_LOG_OBJECTID;
2074
log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
2075
if (IS_ERR(log_tree_root->node)) {
2076
btrfs_warn(fs_info, "failed to read log tree");
2077
ret = PTR_ERR(log_tree_root->node);
2078
log_tree_root->node = NULL;
2079
btrfs_put_root(log_tree_root);
2080
return ret;
2081
}
2082
if (!extent_buffer_uptodate(log_tree_root->node)) {
2083
btrfs_err(fs_info, "failed to read log tree");
2084
btrfs_put_root(log_tree_root);
2085
return -EIO;
2086
}
2087
2088
/* returns with log_tree_root freed on success */
2089
ret = btrfs_recover_log_trees(log_tree_root);
2090
if (ret) {
2091
btrfs_handle_fs_error(fs_info, ret,
2092
"Failed to recover log tree");
2093
btrfs_put_root(log_tree_root);
2094
return ret;
2095
}
2096
2097
if (sb_rdonly(fs_info->sb)) {
2098
ret = btrfs_commit_super(fs_info);
2099
if (ret)
2100
return ret;
2101
}
2102
2103
return 0;
2104
}
2105
2106
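/*
 * Load every root item with the given objectid from the tree root and insert
 * the resulting roots into the global root tree. A missing root is only
 * fatal without the IGNOREBADROOTS mount option.
 */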
static int load_global_roots_objectid(struct btrfs_root *tree_root,
2107
struct btrfs_path *path, u64 objectid,
2108
const char *name)
2109
{
2110
struct btrfs_fs_info *fs_info = tree_root->fs_info;
2111
struct btrfs_root *root;
2112
u64 max_global_id = 0;
2113
int ret;
2114
struct btrfs_key key = {
2115
.objectid = objectid,
2116
.type = BTRFS_ROOT_ITEM_KEY,
2117
.offset = 0,
2118
};
2119
bool found = false;
2120
2121
/* If we have IGNOREDATACSUMS skip loading these roots. */
2122
if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
2123
btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2124
set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
2125
return 0;
2126
}
2127
2128
while (1) {
2129
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2130
if (ret < 0)
2131
break;
2132
2133
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2134
ret = btrfs_next_leaf(tree_root, path);
2135
if (ret) {
2136
if (ret > 0)
2137
ret = 0;
2138
break;
2139
}
2140
}
2141
ret = 0;
2142
2143
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2144
if (key.objectid != objectid)
2145
break;
2146
btrfs_release_path(path);
2147
2148
/*
2149
* Just worry about this for extent tree, it'll be the same for
2150
* everybody.
2151
*/
2152
if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2153
max_global_id = max(max_global_id, key.offset);
2154
2155
found = true;
2156
root = read_tree_root_path(tree_root, path, &key);
2157
if (IS_ERR(root)) {
2158
ret = PTR_ERR(root);
2159
break;
2160
}
2161
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2162
ret = btrfs_global_root_insert(root);
2163
if (ret) {
2164
btrfs_put_root(root);
2165
break;
2166
}
2167
key.offset++;
2168
}
2169
btrfs_release_path(path);
2170
2171
if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2172
fs_info->nr_global_roots = max_global_id + 1;
2173
2174
if (!found || ret) {
2175
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
2176
set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
2177
2178
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2179
ret = ret ? ret : -ENOENT;
2180
else
2181
ret = 0;
2182
btrfs_err(fs_info, "failed to load root %s", name);
2183
}
2184
return ret;
2185
}
2186
2187
static int load_global_roots(struct btrfs_root *tree_root)
2188
{
2189
BTRFS_PATH_AUTO_FREE(path);
2190
int ret;
2191
2192
path = btrfs_alloc_path();
2193
if (!path)
2194
return -ENOMEM;
2195
2196
ret = load_global_roots_objectid(tree_root, path,
2197
BTRFS_EXTENT_TREE_OBJECTID, "extent");
2198
if (ret)
2199
return ret;
2200
ret = load_global_roots_objectid(tree_root, path,
2201
BTRFS_CSUM_TREE_OBJECTID, "csum");
2202
if (ret)
2203
return ret;
2204
if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
2205
return ret;
2206
ret = load_global_roots_objectid(tree_root, path,
2207
BTRFS_FREE_SPACE_TREE_OBJECTID,
2208
"free space");
2209
2210
return ret;
2211
}
2212
2213
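/*
 * Load the global roots and then read the other trees referenced from the
 * tree root: block group, dev, data reloc, quota, uuid and raid stripe
 * trees, as applicable.
 */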
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2214
{
2215
struct btrfs_root *tree_root = fs_info->tree_root;
2216
struct btrfs_root *root;
2217
struct btrfs_key location;
2218
int ret;
2219
2220
ASSERT(fs_info->tree_root);
2221
2222
ret = load_global_roots(tree_root);
2223
if (ret)
2224
return ret;
2225
2226
location.type = BTRFS_ROOT_ITEM_KEY;
2227
location.offset = 0;
2228
2229
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
2230
location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
2231
root = btrfs_read_tree_root(tree_root, &location);
2232
if (IS_ERR(root)) {
2233
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2234
ret = PTR_ERR(root);
2235
goto out;
2236
}
2237
} else {
2238
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2239
fs_info->block_group_root = root;
2240
}
2241
}
2242
2243
location.objectid = BTRFS_DEV_TREE_OBJECTID;
2244
root = btrfs_read_tree_root(tree_root, &location);
2245
if (IS_ERR(root)) {
2246
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2247
ret = PTR_ERR(root);
2248
goto out;
2249
}
2250
} else {
2251
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2252
fs_info->dev_root = root;
2253
}
2254
/* Initialize fs_info for all devices in any case */
2255
ret = btrfs_init_devices_late(fs_info);
2256
if (ret)
2257
goto out;
2258
2259
/*
2260
* This tree can share blocks with some other fs tree during relocation
2261
* and we need a proper setup by btrfs_get_fs_root
2262
*/
2263
root = btrfs_get_fs_root(tree_root->fs_info,
2264
BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2265
if (IS_ERR(root)) {
2266
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2267
ret = PTR_ERR(root);
2268
goto out;
2269
}
2270
} else {
2271
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2272
fs_info->data_reloc_root = root;
2273
}
2274
2275
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2276
root = btrfs_read_tree_root(tree_root, &location);
2277
if (!IS_ERR(root)) {
2278
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2279
fs_info->quota_root = root;
2280
}
2281
2282
location.objectid = BTRFS_UUID_TREE_OBJECTID;
2283
root = btrfs_read_tree_root(tree_root, &location);
2284
if (IS_ERR(root)) {
2285
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2286
ret = PTR_ERR(root);
2287
if (ret != -ENOENT)
2288
goto out;
2289
}
2290
} else {
2291
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2292
fs_info->uuid_root = root;
2293
}
2294
2295
if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
2296
location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
2297
root = btrfs_read_tree_root(tree_root, &location);
2298
if (IS_ERR(root)) {
2299
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2300
ret = PTR_ERR(root);
2301
goto out;
2302
}
2303
} else {
2304
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2305
fs_info->stripe_root = root;
2306
}
2307
}
2308
2309
return 0;
2310
out:
2311
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2312
location.objectid, ret);
2313
return ret;
2314
}
2315
2316
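/*
 * Sanity check the super block's sys_chunk_array: every entry must be a
 * CHUNK_ITEM key followed by a valid SYSTEM chunk and nothing may extend
 * past the recorded array size.
 */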
static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
2317
const struct btrfs_super_block *sb)
2318
{
2319
unsigned int cur = 0; /* Offset inside the sys chunk array */
2320
/*
2321
* At sb read time, fs_info is not fully initialized. Thus we have
2322
* to use super block sectorsize, which should have been validated.
2323
*/
2324
const u32 sectorsize = btrfs_super_sectorsize(sb);
2325
u32 sys_array_size = btrfs_super_sys_array_size(sb);
2326
2327
if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2328
btrfs_err(fs_info, "system chunk array too big %u > %u",
2329
sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2330
return -EUCLEAN;
2331
}
2332
2333
while (cur < sys_array_size) {
2334
struct btrfs_disk_key *disk_key;
2335
struct btrfs_chunk *chunk;
2336
struct btrfs_key key;
2337
u64 type;
2338
u16 num_stripes;
2339
u32 len;
2340
int ret;
2341
2342
disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
2343
len = sizeof(*disk_key);
2344
2345
if (cur + len > sys_array_size)
2346
goto short_read;
2347
cur += len;
2348
2349
btrfs_disk_key_to_cpu(&key, disk_key);
2350
if (key.type != BTRFS_CHUNK_ITEM_KEY) {
2351
btrfs_err(fs_info,
2352
"unexpected item type %u in sys_array at offset %u",
2353
key.type, cur);
2354
return -EUCLEAN;
2355
}
2356
chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
2357
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2358
if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)
2359
goto short_read;
2360
type = btrfs_stack_chunk_type(chunk);
2361
if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
2362
btrfs_err(fs_info,
2363
"invalid chunk type %llu in sys_array at offset %u",
2364
type, cur);
2365
return -EUCLEAN;
2366
}
2367
ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset,
2368
sectorsize);
2369
if (ret < 0)
2370
return ret;
2371
cur += btrfs_chunk_item_size(num_stripes);
2372
}
2373
return 0;
2374
short_read:
2375
btrfs_err(fs_info,
2376
"super block sys chunk array short read, cur=%u sys_array_size=%u",
2377
cur, sys_array_size);
2378
return -EUCLEAN;
2379
}
2380
2381
/*
2382
* Real super block validation
2383
* NOTE: super csum type and incompat features will not be checked here.
2384
*
2385
* @sb: super block to check
2386
* @mirror_num: the super block copy whose bytenr should be checked:
2387
* 0 the primary (1st) sb
2388
* 1, 2 2nd and 3rd backup copy
2389
* -1 skip bytenr check
2390
*/
2391
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
2392
const struct btrfs_super_block *sb, int mirror_num)
2393
{
2394
u64 nodesize = btrfs_super_nodesize(sb);
2395
u64 sectorsize = btrfs_super_sectorsize(sb);
2396
int ret = 0;
2397
const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS);
2398
2399
if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2400
btrfs_err(fs_info, "no valid FS found");
2401
ret = -EINVAL;
2402
}
2403
if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) {
2404
if (!ignore_flags) {
2405
btrfs_err(fs_info,
2406
"unrecognized or unsupported super flag 0x%llx",
2407
btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2408
ret = -EINVAL;
2409
} else {
2410
btrfs_info(fs_info,
2411
"unrecognized or unsupported super flags: 0x%llx, ignored",
2412
btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2413
}
2414
}
2415
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2416
btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2417
btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2418
ret = -EINVAL;
2419
}
2420
if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2421
btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2422
btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2423
ret = -EINVAL;
2424
}
2425
if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2426
btrfs_err(fs_info, "log_root level too big: %d >= %d",
2427
btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2428
ret = -EINVAL;
2429
}
2430
2431
/*
2432
* Check sectorsize and nodesize first, other checks will need them.
2433
* Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
2434
*/
2435
if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
2436
sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2437
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2438
ret = -EINVAL;
2439
}
2440
2441
/*
2442
* We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
2443
*
2444
* For 4K page sized systems with non-debug builds, all 3 match (4K).
2445
* For 4K page sized systems with debug builds, two block sizes are
2446
* supported (4K and 2K).
2447
*
2448
* We can support 16K sectorsize with 64K page size without problem,
2449
* but such sectorsize/pagesize combination doesn't make much sense.
2450
* 4K will be our future standard, PAGE_SIZE is supported from the very
2451
* beginning.
2452
*/
2453
if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
2454
sectorsize != PAGE_SIZE &&
2455
sectorsize != BTRFS_MIN_BLOCKSIZE)) {
2456
btrfs_err(fs_info,
2457
"sectorsize %llu not yet supported for page size %lu",
2458
sectorsize, PAGE_SIZE);
2459
ret = -EINVAL;
2460
}
2461
2462
if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2463
nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2464
btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2465
ret = -EINVAL;
2466
}
2467
if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2468
btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2469
le32_to_cpu(sb->__unused_leafsize), nodesize);
2470
ret = -EINVAL;
2471
}
2472
2473
/* Root alignment check */
2474
if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2475
btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2476
btrfs_super_root(sb));
2477
ret = -EINVAL;
2478
}
2479
if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2480
btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2481
btrfs_super_chunk_root(sb));
2482
ret = -EINVAL;
2483
}
2484
if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2485
btrfs_warn(fs_info, "log_root block unaligned: %llu",
2486
btrfs_super_log_root(sb));
2487
ret = -EINVAL;
2488
}
2489
2490
if (!fs_info->fs_devices->temp_fsid &&
2491
memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
2492
btrfs_err(fs_info,
2493
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2494
sb->fsid, fs_info->fs_devices->fsid);
2495
ret = -EINVAL;
2496
}
2497
2498
if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
2499
BTRFS_FSID_SIZE) != 0) {
2500
btrfs_err(fs_info,
2501
"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2502
btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
2503
ret = -EINVAL;
2504
}
2505
2506
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2507
BTRFS_FSID_SIZE) != 0) {
2508
btrfs_err(fs_info,
2509
"dev_item UUID does not match metadata fsid: %pU != %pU",
2510
fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2511
ret = -EINVAL;
2512
}
2513
2514
/*
2515
* Artificial requirement for block-group-tree to force newer features
2516
* (free-space-tree, no-holes) so the test matrix is smaller.
2517
*/
2518
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
2519
(!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
2520
!btrfs_fs_incompat(fs_info, NO_HOLES))) {
2521
btrfs_err(fs_info,
2522
"block-group-tree feature requires free-space-tree and no-holes");
2523
ret = -EINVAL;
2524
}
2525
2526
/*
2527
* Hint to catch really bogus numbers, bitflips or so; more exact checks
2528
* are done later.
2529
*/
2530
if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2531
btrfs_err(fs_info, "bytes_used is too small %llu",
2532
btrfs_super_bytes_used(sb));
2533
ret = -EINVAL;
2534
}
2535
if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2536
btrfs_err(fs_info, "invalid stripesize %u",
2537
btrfs_super_stripesize(sb));
2538
ret = -EINVAL;
2539
}
2540
if (btrfs_super_num_devices(sb) > (1UL << 31))
2541
btrfs_warn(fs_info, "suspicious number of devices: %llu",
2542
btrfs_super_num_devices(sb));
2543
if (btrfs_super_num_devices(sb) == 0) {
2544
btrfs_err(fs_info, "number of devices is 0");
2545
ret = -EINVAL;
2546
}
2547
2548
if (mirror_num >= 0 &&
2549
btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2550
btrfs_err(fs_info, "super offset mismatch %llu != %u",
2551
btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2552
ret = -EINVAL;
2553
}
2554
2555
if (ret)
2556
return ret;
2557
2558
ret = validate_sys_chunk_array(fs_info, sb);
2559
2560
/*
2561
* Obvious sys_chunk_array corruptions: it must hold at least one key
2562
* and one chunk.
2563
*/
2564
if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2565
btrfs_err(fs_info, "system chunk array too big %u > %u",
2566
btrfs_super_sys_array_size(sb),
2567
BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2568
ret = -EINVAL;
2569
}
2570
if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2571
+ sizeof(struct btrfs_chunk)) {
2572
btrfs_err(fs_info, "system chunk array too small %u < %zu",
2573
btrfs_super_sys_array_size(sb),
2574
sizeof(struct btrfs_disk_key)
2575
+ sizeof(struct btrfs_chunk));
2576
ret = -EINVAL;
2577
}
2578
2579
/*
2580
* The generation is a global counter, we'll trust it more than the others
2581
* but it's still possible that it's the one that's wrong.
2582
*/
2583
if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2584
btrfs_warn(fs_info,
2585
"suspicious: generation < chunk_root_generation: %llu < %llu",
2586
btrfs_super_generation(sb),
2587
btrfs_super_chunk_root_generation(sb));
2588
if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2589
&& btrfs_super_cache_generation(sb) != (u64)-1)
2590
btrfs_warn(fs_info,
2591
"suspicious: generation < cache_generation: %llu < %llu",
2592
btrfs_super_generation(sb),
2593
btrfs_super_cache_generation(sb));
2594
2595
return ret;
2596
}
2597
2598
/*
2599
* Validation of super block at mount time.
2600
* Some checks already done early at mount time, like csum type and incompat
2601
* flags will be skipped.
2602
*/
2603
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2604
{
2605
return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
2606
}
2607
2608
/*
2609
* Validation of super block at write time.
2610
* Some checks like bytenr check will be skipped as their values will be
2611
* overwritten soon.
2612
* Extra checks like csum type and incompat flags will be done here.
2613
*/
2614
static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2615
struct btrfs_super_block *sb)
2616
{
2617
int ret;
2618
2619
ret = btrfs_validate_super(fs_info, sb, -1);
2620
if (ret < 0)
2621
goto out;
2622
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2623
ret = -EUCLEAN;
2624
btrfs_err(fs_info, "invalid csum type, has %u want %u",
2625
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2626
goto out;
2627
}
2628
if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2629
ret = -EUCLEAN;
2630
btrfs_err(fs_info,
2631
"invalid incompat flags, has 0x%llx valid mask 0x%llx",
2632
btrfs_super_incompat_flags(sb),
2633
(unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2634
goto out;
2635
}
2636
out:
2637
if (ret < 0)
2638
btrfs_err(fs_info,
2639
"super block corruption detected before writing it to disk");
2640
return ret;
2641
}
2642
2643
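/*
 * Read a root's tree node at the given bytenr, verifying the expected
 * generation and level, and hook it up as both the current and commit root.
 */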
static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
2644
{
2645
struct btrfs_tree_parent_check check = {
2646
.level = level,
2647
.transid = gen,
2648
.owner_root = btrfs_root_id(root)
2649
};
2650
int ret = 0;
2651
2652
root->node = read_tree_block(root->fs_info, bytenr, &check);
2653
if (IS_ERR(root->node)) {
2654
ret = PTR_ERR(root->node);
2655
root->node = NULL;
2656
return ret;
2657
}
2658
if (!extent_buffer_uptodate(root->node)) {
2659
free_extent_buffer(root->node);
2660
root->node = NULL;
2661
return -EIO;
2662
}
2663
2664
btrfs_set_root_node(&root->root_item, root->node);
2665
root->commit_root = btrfs_root_node(root);
2666
btrfs_set_root_refs(&root->root_item, 1);
2667
return ret;
2668
}
2669
2670
static int load_important_roots(struct btrfs_fs_info *fs_info)
2671
{
2672
struct btrfs_super_block *sb = fs_info->super_copy;
2673
u64 gen, bytenr;
2674
int level, ret;
2675
2676
bytenr = btrfs_super_root(sb);
2677
gen = btrfs_super_generation(sb);
2678
level = btrfs_super_root_level(sb);
2679
ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
2680
if (ret) {
2681
btrfs_warn(fs_info, "couldn't read tree root");
2682
return ret;
2683
}
2684
return 0;
2685
}
2686
2687
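/*
 * Read the tree root pointed to by the super block and the roots hanging off
 * it. If that fails and the usebackuproot option is set, retry with each of
 * the backup root slots in turn.
 */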
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2688
{
2689
int backup_index = find_newest_super_backup(fs_info);
2690
struct btrfs_super_block *sb = fs_info->super_copy;
2691
struct btrfs_root *tree_root = fs_info->tree_root;
2692
bool handle_error = false;
2693
int ret = 0;
2694
int i;
2695
2696
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2697
if (handle_error) {
2698
if (!IS_ERR(tree_root->node))
2699
free_extent_buffer(tree_root->node);
2700
tree_root->node = NULL;
2701
2702
if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2703
break;
2704
2705
free_root_pointers(fs_info, 0);
2706
2707
/*
2708
* Don't use the log in recovery mode, it won't be
2709
* valid
2710
*/
2711
btrfs_set_super_log_root(sb, 0);
2712
2713
btrfs_warn(fs_info, "try to load backup roots slot %d", i);
2714
ret = read_backup_root(fs_info, i);
2715
backup_index = ret;
2716
if (ret < 0)
2717
return ret;
2718
}
2719
2720
ret = load_important_roots(fs_info);
2721
if (ret) {
2722
handle_error = true;
2723
continue;
2724
}
2725
2726
/*
2727
* No need to hold btrfs_root::objectid_mutex since the fs
2728
* hasn't been fully initialised and we are the only user
2729
*/
2730
ret = btrfs_init_root_free_objectid(tree_root);
2731
if (ret < 0) {
2732
handle_error = true;
2733
continue;
2734
}
2735
2736
ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
2737
2738
ret = btrfs_read_roots(fs_info);
2739
if (ret < 0) {
2740
handle_error = true;
2741
continue;
2742
}
2743
2744
/* All successful */
2745
fs_info->generation = btrfs_header_generation(tree_root->node);
2746
btrfs_set_last_trans_committed(fs_info, fs_info->generation);
2747
fs_info->last_reloc_trans = 0;
2748
2749
/* Always begin writing backup roots after the one being used */
2750
if (backup_index < 0) {
2751
fs_info->backup_root_index = 0;
2752
} else {
2753
fs_info->backup_root_index = backup_index + 1;
2754
fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
2755
}
2756
break;
2757
}
2758
2759
return ret;
2760
}
2761
2762
/*
2763
* Lockdep gets confused between our buffer_tree which requires IRQ locking because
2764
* we modify marks in the IRQ context, and our delayed inode xarray which doesn't
2765
* have these requirements. Use a class key so lockdep doesn't get them mixed up.
2766
*/
2767
static struct lock_class_key buffer_xa_class;
2768
2769
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
2770
{
2771
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2772
2773
/* Use the same flags as mapping->i_pages. */
2774
xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
2775
lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
2776
2777
INIT_LIST_HEAD(&fs_info->trans_list);
2778
INIT_LIST_HEAD(&fs_info->dead_roots);
2779
INIT_LIST_HEAD(&fs_info->delayed_iputs);
2780
INIT_LIST_HEAD(&fs_info->delalloc_roots);
2781
INIT_LIST_HEAD(&fs_info->caching_block_groups);
2782
spin_lock_init(&fs_info->delalloc_root_lock);
2783
spin_lock_init(&fs_info->trans_lock);
2784
spin_lock_init(&fs_info->fs_roots_radix_lock);
2785
spin_lock_init(&fs_info->delayed_iput_lock);
2786
spin_lock_init(&fs_info->defrag_inodes_lock);
2787
spin_lock_init(&fs_info->super_lock);
2788
spin_lock_init(&fs_info->unused_bgs_lock);
2789
spin_lock_init(&fs_info->treelog_bg_lock);
2790
spin_lock_init(&fs_info->zone_active_bgs_lock);
2791
spin_lock_init(&fs_info->relocation_bg_lock);
2792
rwlock_init(&fs_info->tree_mod_log_lock);
2793
rwlock_init(&fs_info->global_root_lock);
2794
mutex_init(&fs_info->unused_bg_unpin_mutex);
2795
mutex_init(&fs_info->reclaim_bgs_lock);
2796
mutex_init(&fs_info->reloc_mutex);
2797
mutex_init(&fs_info->delalloc_root_mutex);
2798
mutex_init(&fs_info->zoned_meta_io_lock);
2799
mutex_init(&fs_info->zoned_data_reloc_io_lock);
2800
seqlock_init(&fs_info->profiles_lock);
2801
2802
btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
2803
btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
2804
btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
2805
btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
2806
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep,
2807
BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2808
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
2809
BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2810
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
2811
BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2812
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
2813
BTRFS_LOCKDEP_TRANS_COMPLETED);
2814
2815
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2816
INIT_LIST_HEAD(&fs_info->space_info);
2817
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2818
INIT_LIST_HEAD(&fs_info->unused_bgs);
2819
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
2820
INIT_LIST_HEAD(&fs_info->zone_active_bgs);
2821
#ifdef CONFIG_BTRFS_DEBUG
2822
INIT_LIST_HEAD(&fs_info->allocated_roots);
2823
INIT_LIST_HEAD(&fs_info->allocated_ebs);
2824
spin_lock_init(&fs_info->eb_leak_lock);
2825
#endif
2826
fs_info->mapping_tree = RB_ROOT_CACHED;
2827
rwlock_init(&fs_info->mapping_tree_lock);
2828
btrfs_init_block_rsv(&fs_info->global_block_rsv,
2829
BTRFS_BLOCK_RSV_GLOBAL);
2830
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2831
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2832
btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
2833
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2834
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2835
BTRFS_BLOCK_RSV_DELOPS);
2836
btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
2837
BTRFS_BLOCK_RSV_DELREFS);
2838
2839
atomic_set(&fs_info->async_delalloc_pages, 0);
2840
atomic_set(&fs_info->defrag_running, 0);
2841
atomic_set(&fs_info->nr_delayed_iputs, 0);
2842
atomic64_set(&fs_info->tree_mod_seq, 0);
2843
fs_info->global_root_tree = RB_ROOT;
2844
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2845
fs_info->metadata_ratio = 0;
2846
fs_info->defrag_inodes = RB_ROOT;
2847
atomic64_set(&fs_info->free_chunk_space, 0);
2848
fs_info->tree_mod_log = RB_ROOT;
2849
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2850
btrfs_init_ref_verify(fs_info);
2851
2852
fs_info->thread_pool_size = min_t(unsigned long,
2853
num_online_cpus() + 2, 8);
2854
2855
INIT_LIST_HEAD(&fs_info->ordered_roots);
2856
spin_lock_init(&fs_info->ordered_root_lock);
2857
2858
btrfs_init_scrub(fs_info);
2859
btrfs_init_balance(fs_info);
2860
btrfs_init_async_reclaim_work(fs_info);
2861
btrfs_init_extent_map_shrinker_work(fs_info);
2862
2863
rwlock_init(&fs_info->block_group_cache_lock);
2864
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
2865
2866
btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents,
2867
IO_TREE_FS_EXCLUDED_EXTENTS);
2868
2869
mutex_init(&fs_info->ordered_operations_mutex);
2870
mutex_init(&fs_info->tree_log_mutex);
2871
mutex_init(&fs_info->chunk_mutex);
2872
mutex_init(&fs_info->transaction_kthread_mutex);
2873
mutex_init(&fs_info->cleaner_mutex);
2874
mutex_init(&fs_info->ro_block_group_mutex);
2875
init_rwsem(&fs_info->commit_root_sem);
2876
init_rwsem(&fs_info->cleanup_work_sem);
2877
init_rwsem(&fs_info->subvol_sem);
2878
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2879
2880
btrfs_init_dev_replace_locks(fs_info);
2881
btrfs_init_qgroup(fs_info);
2882
btrfs_discard_init(fs_info);
2883
2884
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2885
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2886
2887
init_waitqueue_head(&fs_info->transaction_throttle);
2888
init_waitqueue_head(&fs_info->transaction_wait);
2889
init_waitqueue_head(&fs_info->transaction_blocked_wait);
2890
init_waitqueue_head(&fs_info->async_submit_wait);
2891
init_waitqueue_head(&fs_info->delayed_iputs_wait);
2892
2893
/* Usable values until the real ones are cached from the superblock */
2894
fs_info->nodesize = 4096;
2895
fs_info->sectorsize = 4096;
2896
fs_info->sectorsize_bits = ilog2(4096);
2897
fs_info->stripesize = 4096;
2898
2899
/* Default compress algorithm when user does -o compress */
2900
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2901
2902
fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
2903
2904
spin_lock_init(&fs_info->swapfile_pins_lock);
2905
fs_info->swapfile_pins = RB_ROOT;
2906
2907
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
2908
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
2909
}
2910
2911
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
2912
{
2913
int ret;
2914
2915
fs_info->sb = sb;
2916
/* Temporary fixed values for block size until we read the superblock. */
2917
sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
2918
sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
2919
2920
ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
2921
if (ret)
2922
return ret;
2923
2924
ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL);
2925
if (ret)
2926
return ret;
2927
2928
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2929
if (ret)
2930
return ret;
2931
2932
ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL);
2933
if (ret)
2934
return ret;
2935
2936
fs_info->dirty_metadata_batch = PAGE_SIZE *
2937
(1 + ilog2(nr_cpu_ids));
2938
2939
ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2940
if (ret)
2941
return ret;
2942
2943
ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
2944
GFP_KERNEL);
2945
if (ret)
2946
return ret;
2947
2948
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2949
GFP_KERNEL);
2950
if (!fs_info->delayed_root)
2951
return -ENOMEM;
2952
btrfs_init_delayed_root(fs_info->delayed_root);
2953
2954
if (sb_rdonly(sb))
2955
set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
2956
if (btrfs_test_opt(fs_info, IGNOREMETACSUMS))
2957
set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state);
2958
2959
return btrfs_alloc_stripe_hash_table(fs_info);
2960
}
2961
2962
static int btrfs_uuid_rescan_kthread(void *data)
2963
{
2964
struct btrfs_fs_info *fs_info = data;
2965
int ret;
2966
2967
/*
2968
* 1st step is to iterate through the existing UUID tree and
2969
* to delete all entries that contain outdated data.
2970
* 2nd step is to add all missing entries to the UUID tree.
2971
*/
2972
ret = btrfs_uuid_tree_iterate(fs_info);
2973
if (ret < 0) {
2974
if (ret != -EINTR)
2975
btrfs_warn(fs_info, "iterating uuid_tree failed %d",
2976
ret);
2977
up(&fs_info->uuid_tree_rescan_sem);
2978
return ret;
2979
}
2980
return btrfs_uuid_scan_kthread(data);
2981
}
2982
2983
static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
2984
{
2985
struct task_struct *task;
2986
2987
down(&fs_info->uuid_tree_rescan_sem);
2988
task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
2989
if (IS_ERR(task)) {
2990
/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
2991
btrfs_warn(fs_info, "failed to start uuid_rescan task");
2992
up(&fs_info->uuid_tree_rescan_sem);
2993
return PTR_ERR(task);
2994
}
2995
2996
return 0;
2997
}
2998
2999
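/*
 * Run orphan cleanup on every fs root tracked in the fs_roots_radix tree,
 * skipping roots whose root item refcount is already zero (dead roots).
 */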
static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3000
{
3001
u64 root_objectid = 0;
3002
struct btrfs_root *gang[8];
3003
int ret = 0;
3004
3005
while (1) {
3006
unsigned int found;
3007
3008
spin_lock(&fs_info->fs_roots_radix_lock);
3009
found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
3010
(void **)gang, root_objectid,
3011
ARRAY_SIZE(gang));
3012
if (!found) {
3013
spin_unlock(&fs_info->fs_roots_radix_lock);
3014
break;
3015
}
3016
root_objectid = btrfs_root_id(gang[found - 1]) + 1;
3017
3018
for (int i = 0; i < found; i++) {
3019
/* Avoid grabbing roots in dead_roots. */
3020
if (btrfs_root_refs(&gang[i]->root_item) == 0) {
3021
gang[i] = NULL;
3022
continue;
3023
}
3024
/* Grab all the search results for later use. */
3025
gang[i] = btrfs_grab_root(gang[i]);
3026
}
3027
spin_unlock(&fs_info->fs_roots_radix_lock);
3028
3029
for (int i = 0; i < found; i++) {
3030
if (!gang[i])
3031
continue;
3032
root_objectid = btrfs_root_id(gang[i]);
3033
/*
3034
* Continue to release the remaining roots after the first
3035
* error without cleanup and preserve the first error
3036
* for the return.
3037
*/
3038
if (!ret)
3039
ret = btrfs_orphan_cleanup(gang[i]);
3040
btrfs_put_root(gang[i]);
3041
}
3042
if (ret)
3043
break;
3044
3045
root_objectid++;
3046
}
3047
return ret;
3048
}
3049
3050
/*
3051
* Mounting logic specific to read-write file systems. Shared by open_ctree
3052
* and btrfs_remount when remounting from read-only to read-write.
3053
*/
3054
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
3055
{
3056
int ret;
3057
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
3058
bool rebuild_free_space_tree = false;
3059
3060
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3061
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3062
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
3063
btrfs_warn(fs_info,
3064
"'clear_cache' option is ignored with extent tree v2");
3065
else
3066
rebuild_free_space_tree = true;
3067
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3068
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3069
btrfs_warn(fs_info, "free space tree is invalid");
3070
rebuild_free_space_tree = true;
3071
}
3072
3073
if (rebuild_free_space_tree) {
3074
btrfs_info(fs_info, "rebuilding free space tree");
3075
ret = btrfs_rebuild_free_space_tree(fs_info);
3076
if (ret) {
3077
btrfs_warn(fs_info,
3078
"failed to rebuild free space tree: %d", ret);
3079
goto out;
3080
}
3081
}
3082
3083
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3084
!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
3085
btrfs_info(fs_info, "disabling free space tree");
3086
ret = btrfs_delete_free_space_tree(fs_info);
3087
if (ret) {
3088
btrfs_warn(fs_info,
3089
"failed to disable free space tree: %d", ret);
3090
goto out;
3091
}
3092
}
3093
3094
/*
3095
* btrfs_find_orphan_roots() is responsible for finding all the dead
3096
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
3097
* them into the fs_info->fs_roots_radix tree. This must be done before
3098
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
3099
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
3100
* item before the root's tree is deleted - this means that if we unmount
3101
* or crash before the deletion completes, on the next mount we will not
3102
* delete what remains of the tree because the orphan item does not
3103
* exist anymore, which is what tells us we have a pending deletion.
3104
*/
3105
ret = btrfs_find_orphan_roots(fs_info);
3106
if (ret)
3107
goto out;
3108
3109
ret = btrfs_cleanup_fs_roots(fs_info);
3110
if (ret)
3111
goto out;
3112
3113
down_read(&fs_info->cleanup_work_sem);
3114
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3115
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3116
up_read(&fs_info->cleanup_work_sem);
3117
goto out;
3118
}
3119
up_read(&fs_info->cleanup_work_sem);
3120
3121
mutex_lock(&fs_info->cleaner_mutex);
3122
ret = btrfs_recover_relocation(fs_info);
3123
mutex_unlock(&fs_info->cleaner_mutex);
3124
if (ret < 0) {
3125
btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
3126
goto out;
3127
}
3128
3129
if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3130
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3131
btrfs_info(fs_info, "creating free space tree");
3132
ret = btrfs_create_free_space_tree(fs_info);
3133
if (ret) {
3134
btrfs_warn(fs_info,
3135
"failed to create free space tree: %d", ret);
3136
goto out;
3137
}
3138
}
3139
3140
if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
3141
ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
3142
if (ret)
3143
goto out;
3144
}
3145
3146
ret = btrfs_resume_balance_async(fs_info);
3147
if (ret)
3148
goto out;
3149
3150
ret = btrfs_resume_dev_replace_async(fs_info);
3151
if (ret) {
3152
btrfs_warn(fs_info, "failed to resume dev_replace");
3153
goto out;
3154
}
3155
3156
btrfs_qgroup_rescan_resume(fs_info);
3157
3158
if (!fs_info->uuid_root) {
3159
btrfs_info(fs_info, "creating UUID tree");
3160
ret = btrfs_create_uuid_tree(fs_info);
3161
if (ret) {
3162
btrfs_warn(fs_info,
3163
"failed to create the UUID tree %d", ret);
3164
goto out;
3165
}
3166
}
3167
3168
out:
3169
return ret;
3170
}
3171
3172
/*
3173
* Do various sanity and dependency checks of different features.
3174
*
3175
* @is_rw_mount: If the mount is read-write.
3176
*
3177
* This is the place for less strict checks (like for subpage or artificial
3178
* feature dependencies).
3179
*
3180
* For strict checks or possible corruption detection, see
3181
* btrfs_validate_super().
3182
*
3183
* This should be called after btrfs_parse_options(), as some mount options
3184
* (space cache related) can modify on-disk format like free space tree and
3185
* screw up certain feature dependencies.
3186
*/
3187
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
3188
{
3189
struct btrfs_super_block *disk_super = fs_info->super_copy;
3190
u64 incompat = btrfs_super_incompat_flags(disk_super);
3191
const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
3192
const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
3193
3194
if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
3195
btrfs_err(fs_info,
3196
"cannot mount because of unknown incompat features (0x%llx)",
3197
incompat);
3198
return -EINVAL;
3199
}
3200
3201
/* Runtime limitation for mixed block groups. */
3202
if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3203
(fs_info->sectorsize != fs_info->nodesize)) {
3204
btrfs_err(fs_info,
3205
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3206
fs_info->nodesize, fs_info->sectorsize);
3207
return -EINVAL;
3208
}
3209
3210
/* Mixed backref is an always-enabled feature. */
3211
incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3212
3213
/* Set compression related flags just in case. */
3214
if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3215
incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3216
else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3217
incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3218
3219
/*
3220
* An ancient flag, which should really be marked deprecated.
3221
* Such a runtime limitation doesn't really need an incompat flag.
3222
*/
3223
if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
3224
incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3225
3226
if (compat_ro_unsupp && is_rw_mount) {
3227
btrfs_err(fs_info,
3228
"cannot mount read-write because of unknown compat_ro features (0x%llx)",
3229
compat_ro);
3230
return -EINVAL;
3231
}
3232
3233
/*
3234
* We have unsupported RO compat features; even though we are mounted
3235
* read-only, we should not cause any metadata writes, including log
3236
* replay, or we could screw up whatever the new feature requires.
3237
*/
3238
if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
3239
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3240
btrfs_err(fs_info,
3241
"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
3242
compat_ro);
3243
return -EINVAL;
3244
}
3245
3246
/*
3247
* Artificial limitations for block group tree, to force
3248
* block-group-tree to rely on no-holes and free-space-tree.
3249
*/
3250
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
3251
(!btrfs_fs_incompat(fs_info, NO_HOLES) ||
3252
!btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
3253
btrfs_err(fs_info,
3254
"block-group-tree feature requires no-holes and free-space-tree features");
3255
return -EINVAL;
3256
}
3257
3258
/*
3259
* Subpage runtime limitation on v1 cache.
3260
*
3261
* V1 space cache still has some hard coded PAGE_SIZE usage, while
3262
* we're already defaulting to v2 cache, no need to bother v1 as it's
3263
* going to be deprecated anyway.
3264
*/
3265
if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
3266
btrfs_warn(fs_info,
3267
"v1 space cache is not supported for page size %lu with sectorsize %u",
3268
PAGE_SIZE, fs_info->sectorsize);
3269
return -EINVAL;
3270
}
3271
3272
/* This can be called by remount, so we need to protect the super block. */
3273
spin_lock(&fs_info->super_lock);
3274
btrfs_set_super_incompat_flags(disk_super, incompat);
3275
spin_unlock(&fs_info->super_lock);
3276
3277
return 0;
3278
}
3279
3280
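/*
 * Main entry point for opening a filesystem: read and validate the super
 * block, set up the btree inode, work queues and chunk/tree roots, replay
 * the log if needed and start the cleaner and transaction kthreads.
 */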
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
3281
{
3282
u32 sectorsize;
3283
u32 nodesize;
3284
u32 stripesize;
3285
u64 generation;
3286
u16 csum_type;
3287
struct btrfs_super_block *disk_super;
3288
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3289
struct btrfs_root *tree_root;
3290
struct btrfs_root *chunk_root;
3291
int ret;
3292
int level;
3293
3294
ret = init_mount_fs_info(fs_info, sb);
3295
if (ret)
3296
goto fail;
3297
3298
/* These need to be init'ed before we start creating inodes and such. */
3299
tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3300
GFP_KERNEL);
3301
fs_info->tree_root = tree_root;
3302
chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3303
GFP_KERNEL);
3304
fs_info->chunk_root = chunk_root;
3305
if (!tree_root || !chunk_root) {
3306
ret = -ENOMEM;
3307
goto fail;
3308
}
3309
3310
ret = btrfs_init_btree_inode(sb);
3311
if (ret)
3312
goto fail;
3313
3314
invalidate_bdev(fs_devices->latest_dev->bdev);
3315
3316
/*
3317
* Read super block and check the signature bytes only
3318
*/
3319
disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false);
3320
if (IS_ERR(disk_super)) {
3321
ret = PTR_ERR(disk_super);
3322
goto fail_alloc;
3323
}
3324
3325
btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
3326
/*
3327
* Verify the type first, if that or the checksum value are
3328
* corrupted, we'll find out
3329
*/
3330
csum_type = btrfs_super_csum_type(disk_super);
3331
if (!btrfs_supported_super_csum(csum_type)) {
3332
btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3333
csum_type);
3334
ret = -EINVAL;
3335
btrfs_release_disk_super(disk_super);
3336
goto fail_alloc;
3337
}
3338
3339
fs_info->csum_size = btrfs_super_csum_size(disk_super);
3340
3341
ret = btrfs_init_csum_hash(fs_info, csum_type);
3342
if (ret) {
3343
btrfs_release_disk_super(disk_super);
3344
goto fail_alloc;
3345
}
3346
3347
/*
3348
* We want to check superblock checksum, the type is stored inside.
3349
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3350
*/
3351
if (btrfs_check_super_csum(fs_info, disk_super)) {
3352
btrfs_err(fs_info, "superblock checksum mismatch");
3353
ret = -EINVAL;
3354
btrfs_release_disk_super(disk_super);
3355
goto fail_alloc;
3356
}
3357
3358
/*
3359
* super_copy is zeroed at allocation time and we never touch the
3360
* following bytes up to INFO_SIZE, the checksum is calculated from
3361
* the whole block of INFO_SIZE
3362
*/
3363
memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3364
btrfs_release_disk_super(disk_super);
3365
3366
disk_super = fs_info->super_copy;
3367
3368
memcpy(fs_info->super_for_commit, fs_info->super_copy,
3369
sizeof(*fs_info->super_for_commit));
3370
3371
ret = btrfs_validate_mount_super(fs_info);
3372
if (ret) {
3373
btrfs_err(fs_info, "superblock contains fatal errors");
3374
ret = -EINVAL;
3375
goto fail_alloc;
3376
}
3377
3378
if (!btrfs_super_root(disk_super)) {
3379
btrfs_err(fs_info, "invalid superblock tree root bytenr");
3380
ret = -EINVAL;
3381
goto fail_alloc;
3382
}
3383
3384
/* check FS state, whether FS is broken. */
3385
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3386
WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
3387
3388
/* Set up fs_info before parsing mount options */
3389
nodesize = btrfs_super_nodesize(disk_super);
3390
sectorsize = btrfs_super_sectorsize(disk_super);
3391
stripesize = sectorsize;
3392
fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3393
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3394
3395
fs_info->nodesize = nodesize;
3396
fs_info->nodesize_bits = ilog2(nodesize);
3397
fs_info->sectorsize = sectorsize;
3398
fs_info->sectorsize_bits = ilog2(sectorsize);
3399
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3400
fs_info->stripesize = stripesize;
3401
fs_info->fs_devices->fs_info = fs_info;
3402
3403
/*
3404
* Handle the space caching options appropriately now that we have the
3405
* super block loaded and validated.
3406
*/
3407
btrfs_set_free_space_cache_settings(fs_info);
3408
3409
if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
3410
ret = -EINVAL;
3411
goto fail_alloc;
3412
}
3413
3414
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
3415
if (ret < 0)
3416
goto fail_alloc;
3417
3418
/*
3419
* At this point our mount options are validated, if we set ->max_inline
3420
* to something non-standard make sure we truncate it to sectorsize.
3421
*/
3422
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
3423
3424
ret = btrfs_init_workqueues(fs_info);
3425
if (ret)
3426
goto fail_sb_buffer;
3427
3428
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3429
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3430
3431
/* Update the values for the current filesystem. */
3432
sb->s_blocksize = sectorsize;
3433
sb->s_blocksize_bits = blksize_bits(sectorsize);
3434
memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3435
3436
mutex_lock(&fs_info->chunk_mutex);
3437
ret = btrfs_read_sys_array(fs_info);
3438
mutex_unlock(&fs_info->chunk_mutex);
3439
if (ret) {
3440
btrfs_err(fs_info, "failed to read the system array: %d", ret);
3441
goto fail_sb_buffer;
3442
}
3443
3444
generation = btrfs_super_chunk_root_generation(disk_super);
3445
level = btrfs_super_chunk_root_level(disk_super);
3446
ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
3447
generation, level);
3448
if (ret) {
3449
btrfs_err(fs_info, "failed to read chunk root");
3450
goto fail_tree_roots;
3451
}
3452
3453
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3454
offsetof(struct btrfs_header, chunk_tree_uuid),
3455
BTRFS_UUID_SIZE);
3456
3457
ret = btrfs_read_chunk_tree(fs_info);
3458
if (ret) {
3459
btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3460
goto fail_tree_roots;
3461
}
3462
3463
/*
3464
* At this point we know all the devices that make this filesystem,
3465
* including the seed devices but we don't know yet if the replace
3466
* target is required. So free devices that are not part of this
3467
* filesystem but skip the replace target device which is checked
3468
* below in btrfs_init_dev_replace().
3469
*/
3470
btrfs_free_extra_devids(fs_devices);
3471
if (!fs_devices->latest_dev->bdev) {
3472
btrfs_err(fs_info, "failed to read devices");
3473
ret = -EIO;
3474
goto fail_tree_roots;
3475
}
3476
3477
ret = init_tree_roots(fs_info);
3478
if (ret)
3479
goto fail_tree_roots;
3480
3481
/*
3482
* Get zone type information of zoned block devices. This will also
3483
* handle emulation of a zoned filesystem if a regular device has the
3484
* zoned incompat feature flag set.
3485
*/
3486
ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3487
if (ret) {
3488
btrfs_err(fs_info,
3489
"zoned: failed to read device zone info: %d", ret);
3490
goto fail_block_groups;
3491
}
3492
3493
/*
3494
* If we have a uuid root and we're not being told to rescan we need to
3495
* check the generation here so we can set the
3496
* BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
3497
* transaction during a balance or the log replay without updating the
3498
* uuid generation, and then if we crash we would rescan the uuid tree,
3499
* even though it was perfectly fine.
3500
*/
3501
if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3502
fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3503
set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3504
3505
ret = btrfs_verify_dev_extents(fs_info);
3506
if (ret) {
3507
btrfs_err(fs_info,
3508
"failed to verify dev extents against chunks: %d",
3509
ret);
3510
goto fail_block_groups;
3511
}
3512
ret = btrfs_recover_balance(fs_info);
3513
if (ret) {
3514
btrfs_err(fs_info, "failed to recover balance: %d", ret);
3515
goto fail_block_groups;
3516
}
3517
3518
ret = btrfs_init_dev_stats(fs_info);
3519
if (ret) {
3520
btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3521
goto fail_block_groups;
3522
}
3523
3524
ret = btrfs_init_dev_replace(fs_info);
3525
if (ret) {
3526
btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3527
goto fail_block_groups;
3528
}
3529
3530
ret = btrfs_check_zoned_mode(fs_info);
3531
if (ret) {
3532
btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3533
ret);
3534
goto fail_block_groups;
3535
}
3536
3537
ret = btrfs_sysfs_add_fsid(fs_devices);
3538
if (ret) {
3539
btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3540
ret);
3541
goto fail_block_groups;
3542
}
3543
3544
ret = btrfs_sysfs_add_mounted(fs_info);
3545
if (ret) {
3546
btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3547
goto fail_fsdev_sysfs;
3548
}
3549
3550
ret = btrfs_init_space_info(fs_info);
3551
if (ret) {
3552
btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3553
goto fail_sysfs;
3554
}
3555
3556
ret = btrfs_read_block_groups(fs_info);
3557
if (ret) {
3558
btrfs_err(fs_info, "failed to read block groups: %d", ret);
3559
goto fail_sysfs;
3560
}
3561
3562
btrfs_zoned_reserve_data_reloc_bg(fs_info);
3563
btrfs_free_zone_cache(fs_info);
3564
3565
btrfs_check_active_zone_reservation(fs_info);
3566
3567
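/*
 * With devices missing, only allow a writable mount if the raid
 * profiles in use can still tolerate the absent devices.
 */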
if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
3568
!btrfs_check_rw_degradable(fs_info, NULL)) {
3569
btrfs_warn(fs_info,
3570
"writable mount is not allowed due to too many missing devices");
3571
ret = -EINVAL;
3572
goto fail_sysfs;
3573
}
3574
3575
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
3576
"btrfs-cleaner");
3577
if (IS_ERR(fs_info->cleaner_kthread)) {
3578
ret = PTR_ERR(fs_info->cleaner_kthread);
3579
goto fail_sysfs;
3580
}
3581
3582
fs_info->transaction_kthread = kthread_run(transaction_kthread,
3583
tree_root,
3584
"btrfs-transaction");
3585
if (IS_ERR(fs_info->transaction_kthread)) {
3586
ret = PTR_ERR(fs_info->transaction_kthread);
3587
goto fail_cleaner;
3588
}
3589
3590
ret = btrfs_read_qgroup_config(fs_info);
3591
if (ret)
3592
goto fail_trans_kthread;
3593
3594
if (btrfs_build_ref_tree(fs_info))
3595
btrfs_err(fs_info, "couldn't build ref tree");
3596
3597
/* do not make disk changes in broken FS or nologreplay is given */
3598
if (btrfs_super_log_root(disk_super) != 0 &&
3599
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3600
btrfs_info(fs_info, "start tree-log replay");
3601
ret = btrfs_replay_log(fs_info, fs_devices);
3602
if (ret)
3603
goto fail_qgroup;
3604
}
3605
3606
fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3607
if (IS_ERR(fs_info->fs_root)) {
3608
ret = PTR_ERR(fs_info->fs_root);
3609
btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
3610
fs_info->fs_root = NULL;
3611
goto fail_qgroup;
3612
}
3613
3614
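/* For read-only mounts we are done; everything below is rw-only setup. */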
if (sb_rdonly(sb))
3615
return 0;
3616
3617
ret = btrfs_start_pre_rw_mount(fs_info);
3618
if (ret) {
3619
close_ctree(fs_info);
3620
return ret;
3621
}
3622
btrfs_discard_resume(fs_info);
3623
3624
if (fs_info->uuid_root &&
3625
(btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3626
fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
3627
btrfs_info(fs_info, "checking UUID tree");
3628
ret = btrfs_check_uuid_tree(fs_info);
3629
if (ret) {
3630
btrfs_warn(fs_info,
3631
"failed to check the UUID tree: %d", ret);
3632
close_ctree(fs_info);
3633
return ret;
3634
}
3635
}
3636
3637
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3638
3639
/* Kick the cleaner thread so it'll start deleting snapshots. */
3640
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3641
wake_up_process(fs_info->cleaner_kthread);
3642
3643
return 0;
3644
3645
fail_qgroup:
3646
btrfs_free_qgroup_config(fs_info);
3647
fail_trans_kthread:
3648
kthread_stop(fs_info->transaction_kthread);
3649
btrfs_cleanup_transaction(fs_info);
3650
btrfs_free_fs_roots(fs_info);
3651
fail_cleaner:
3652
kthread_stop(fs_info->cleaner_kthread);
3653
3654
/*
3655
* make sure we're done with the btree inode before we stop our
3656
* kthreads
3657
*/
3658
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3659
3660
fail_sysfs:
3661
btrfs_sysfs_remove_mounted(fs_info);
3662
3663
fail_fsdev_sysfs:
3664
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3665
3666
fail_block_groups:
3667
btrfs_put_block_group_cache(fs_info);
3668
3669
fail_tree_roots:
3670
if (fs_info->data_reloc_root)
3671
btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3672
free_root_pointers(fs_info, true);
3673
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3674
3675
fail_sb_buffer:
3676
btrfs_stop_all_workers(fs_info);
3677
btrfs_free_block_groups(fs_info);
3678
fail_alloc:
3679
btrfs_mapping_tree_free(fs_info);
3680
3681
iput(fs_info->btree_inode);
3682
fail:
3683
ASSERT(ret < 0);
3684
return ret;
3685
}
3686
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3687
3688
static void btrfs_end_super_write(struct bio *bio)
3689
{
3690
struct btrfs_device *device = bio->bi_private;
3691
struct folio_iter fi;
3692
3693
bio_for_each_folio_all(fi, bio) {
3694
if (bio->bi_status) {
3695
btrfs_warn_rl(device->fs_info,
3696
"lost super block write due to IO error on %s (%d)",
3697
btrfs_dev_name(device),
3698
blk_status_to_errno(bio->bi_status));
3699
btrfs_dev_stat_inc_and_print(device,
3700
BTRFS_DEV_STAT_WRITE_ERRS);
3701
/* Ensure failure if the primary sb fails. */
3702
if (bio->bi_opf & REQ_FUA)
3703
atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
3704
&device->sb_write_errors);
3705
else
3706
atomic_inc(&device->sb_write_errors);
3707
}
3708
folio_unlock(fi.folio);
3709
folio_put(fi.folio);
3710
}
3711
3712
bio_put(bio);
3713
}
3714
3715
/*
3716
* Write superblock @sb to the @device. Do not wait for completion, all the
3717
* folios we use for writing are locked.
3718
*
3719
* Write @max_mirrors copies of the superblock, where 0 means default that fit
3720
* the expected device size at commit time. Note that max_mirrors must be
3721
* the same for the write and wait phases.
3722
*
3723
* Return 0 if at least one copy was submitted without a recorded error, -1 otherwise.
3724
*/
3725
static int write_dev_supers(struct btrfs_device *device,
3726
struct btrfs_super_block *sb, int max_mirrors)
3727
{
3728
struct btrfs_fs_info *fs_info = device->fs_info;
3729
struct address_space *mapping = device->bdev->bd_mapping;
3730
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3731
int i;
3732
int ret;
3733
u64 bytenr, bytenr_orig;
3734
3735
atomic_set(&device->sb_write_errors, 0);
3736
3737
if (max_mirrors == 0)
3738
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3739
3740
shash->tfm = fs_info->csum_shash;
3741
3742
for (i = 0; i < max_mirrors; i++) {
3743
struct folio *folio;
3744
struct bio *bio;
3745
struct btrfs_super_block *disk_super;
3746
size_t offset;
3747
3748
bytenr_orig = btrfs_sb_offset(i);
3749
ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
3750
if (ret == -ENOENT) {
3751
continue;
3752
} else if (ret < 0) {
3753
btrfs_err(device->fs_info,
3754
"couldn't get super block location for mirror %d error %d",
3755
i, ret);
3756
atomic_inc(&device->sb_write_errors);
3757
continue;
3758
}
3759
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3760
device->commit_total_bytes)
3761
break;
3762
3763
btrfs_set_super_bytenr(sb, bytenr_orig);
3764
3765
crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
3766
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
3767
sb->csum);
3768
3769
folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
3770
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
3771
GFP_NOFS);
3772
if (IS_ERR(folio)) {
3773
btrfs_err(device->fs_info,
3774
"couldn't get super block page for bytenr %llu error %ld",
3775
bytenr, PTR_ERR(folio));
3776
atomic_inc(&device->sb_write_errors);
3777
continue;
3778
}
3779
3780
offset = offset_in_folio(folio, bytenr);
3781
disk_super = folio_address(folio) + offset;
3782
memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
3783
3784
/*
3785
* Directly use bios here instead of relying on the page cache
3786
* to do I/O, so we don't lose the ability to do integrity
3787
* checking.
3788
*/
3789
bio = bio_alloc(device->bdev, 1,
3790
REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
3791
GFP_NOFS);
3792
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
3793
bio->bi_private = device;
3794
bio->bi_end_io = btrfs_end_super_write;
3795
bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);
3796
3797
/*
3798
* We FUA only the first super block. The others we allow to
3799
* go down lazily and there's a short window where the on-disk
3800
* copies might still contain the older version.
3801
*/
3802
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
3803
bio->bi_opf |= REQ_FUA;
3804
submit_bio(bio);
3805
3806
if (btrfs_advance_sb_log(device, i))
3807
atomic_inc(&device->sb_write_errors);
3808
}
3809
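/*
 * Success as long as at least one of the attempted copies was
 * submitted without a recorded error.
 */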
return atomic_read(&device->sb_write_errors) < i ? 0 : -1;
3810
}
3811
3812
/*
3813
* Wait for write completion of superblocks done by write_dev_supers,
3814
* @max_mirrors same for write and wait phases.
3815
*
3816
* Return -1 if primary super block write failed or when there were no super block
3817
* copies written. Otherwise 0.
3818
*/
3819
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
3820
{
3821
int i;
3822
int errors = 0;
3823
bool primary_failed = false;
3824
int ret;
3825
u64 bytenr;
3826
3827
if (max_mirrors == 0)
3828
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3829
3830
for (i = 0; i < max_mirrors; i++) {
3831
struct folio *folio;
3832
3833
ret = btrfs_sb_log_location(device, i, READ, &bytenr);
3834
if (ret == -ENOENT) {
3835
break;
3836
} else if (ret < 0) {
3837
errors++;
3838
if (i == 0)
3839
primary_failed = true;
3840
continue;
3841
}
3842
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3843
device->commit_total_bytes)
3844
break;
3845
3846
folio = filemap_get_folio(device->bdev->bd_mapping,
3847
bytenr >> PAGE_SHIFT);
3848
/* If the folio has been removed, then we know it completed. */
3849
if (IS_ERR(folio))
3850
continue;
3851
3852
/* Folio will be unlocked once the write completes. */
3853
folio_wait_locked(folio);
3854
folio_put(folio);
3855
}
3856
3857
errors += atomic_read(&device->sb_write_errors);
3858
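/*
 * btrfs_end_super_write() accounts a failed FUA (primary) write as
 * BTRFS_SUPER_PRIMARY_WRITE_ERROR, so crossing that threshold means
 * the first copy did not make it to disk.
 */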
if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)
3859
primary_failed = true;
3860
if (primary_failed) {
3861
btrfs_err(device->fs_info, "error writing primary super block to device %llu",
3862
device->devid);
3863
return -1;
3864
}
3865
3866
return errors < i ? 0 : -1;
3867
}
3868
3869
/*
3870
* endio for the write_dev_flush, this will wake anyone waiting
3871
* for the barrier when it is done
3872
*/
3873
static void btrfs_end_empty_barrier(struct bio *bio)
3874
{
3875
bio_uninit(bio);
3876
complete(bio->bi_private);
3877
}
3878
3879
/*
3880
* Submit a flush request to the device if it supports it. Error handling is
3881
* done in the waiting counterpart.
3882
*/
3883
static void write_dev_flush(struct btrfs_device *device)
3884
{
3885
struct bio *bio = &device->flush_bio;
3886
3887
device->last_flush_error = BLK_STS_OK;
3888
3889
bio_init(bio, device->bdev, NULL, 0,
3890
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
3891
bio->bi_end_io = btrfs_end_empty_barrier;
3892
init_completion(&device->flush_wait);
3893
bio->bi_private = &device->flush_wait;
3894
submit_bio(bio);
3895
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
3896
}
3897
3898
/*
3899
* If the flush bio has been submitted by write_dev_flush, wait for it.
3900
* Return true for any error, and false otherwise.
3901
*/
3902
static bool wait_dev_flush(struct btrfs_device *device)
3903
{
3904
struct bio *bio = &device->flush_bio;
3905
3906
if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
3907
return false;
3908
3909
wait_for_completion_io(&device->flush_wait);
3910
3911
if (bio->bi_status) {
3912
device->last_flush_error = bio->bi_status;
3913
btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
3914
return true;
3915
}
3916
3917
return false;
3918
}
3919
3920
/*
3921
* send an empty flush down to each device in parallel,
3922
* then wait for them
3923
*/
3924
static int barrier_all_devices(struct btrfs_fs_info *info)
3925
{
3926
struct list_head *head;
3927
struct btrfs_device *dev;
3928
int errors_wait = 0;
3929
3930
lockdep_assert_held(&info->fs_devices->device_list_mutex);
3931
/* send down all the barriers */
3932
head = &info->fs_devices->devices;
3933
list_for_each_entry(dev, head, dev_list) {
3934
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3935
continue;
3936
if (!dev->bdev)
3937
continue;
3938
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3939
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3940
continue;
3941
3942
write_dev_flush(dev);
3943
}
3944
3945
/* wait for all the barriers */
3946
list_for_each_entry(dev, head, dev_list) {
3947
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3948
continue;
3949
if (!dev->bdev) {
3950
errors_wait++;
3951
continue;
3952
}
3953
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3954
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3955
continue;
3956
3957
if (wait_dev_flush(dev))
3958
errors_wait++;
3959
}
3960
3961
/*
3962
* Checks last_flush_error of disks in order to determine the device
3963
* state.
3964
*/
3965
if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
3966
return -EIO;
3967
3968
return 0;
3969
}
3970
3971
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3972
{
3973
int raid_type;
3974
int min_tolerated = INT_MAX;
3975
3976
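/*
 * Take the minimum tolerated_failures over all profiles present in
 * @flags: e.g. RAID1 tolerates one missing device while SINGLE, DUP
 * and RAID0 tolerate none, so mixing them yields 0.
 */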
if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3977
(flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3978
min_tolerated = min_t(int, min_tolerated,
3979
btrfs_raid_array[BTRFS_RAID_SINGLE].
3980
tolerated_failures);
3981
3982
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3983
if (raid_type == BTRFS_RAID_SINGLE)
3984
continue;
3985
if (!(flags & btrfs_raid_array[raid_type].bg_flag))
3986
continue;
3987
min_tolerated = min_t(int, min_tolerated,
3988
btrfs_raid_array[raid_type].
3989
tolerated_failures);
3990
}
3991
3992
if (min_tolerated == INT_MAX) {
3993
btrfs_warn(NULL, "unknown raid flag: %llu", flags);
3994
min_tolerated = 0;
3995
}
3996
3997
return min_tolerated;
3998
}
3999
4000
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
4001
{
4002
struct list_head *head;
4003
struct btrfs_device *dev;
4004
struct btrfs_super_block *sb;
4005
struct btrfs_dev_item *dev_item;
4006
int ret;
4007
int do_barriers;
4008
int max_errors;
4009
int total_errors = 0;
4010
u64 flags;
4011
4012
do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
4013
4014
/*
4015
* max_mirrors == 0 indicates we're from commit_transaction,
4016
* not from fsync where the tree roots in fs_info have not
4017
* been consistent on disk.
4018
*/
4019
if (max_mirrors == 0)
4020
backup_super_roots(fs_info);
4021
4022
sb = fs_info->super_for_commit;
4023
dev_item = &sb->dev_item;
4024
4025
mutex_lock(&fs_info->fs_devices->device_list_mutex);
4026
head = &fs_info->fs_devices->devices;
4027
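/*
 * Super block write failures are tolerated as long as at least one
 * device ends up with a good copy, hence num_devices - 1.
 */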
max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
4028
4029
if (do_barriers) {
4030
ret = barrier_all_devices(fs_info);
4031
if (ret) {
4032
mutex_unlock(
4033
&fs_info->fs_devices->device_list_mutex);
4034
btrfs_handle_fs_error(fs_info, ret,
4035
"errors while submitting device barriers.");
4036
return ret;
4037
}
4038
}
4039
4040
list_for_each_entry(dev, head, dev_list) {
4041
if (!dev->bdev) {
4042
total_errors++;
4043
continue;
4044
}
4045
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4046
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4047
continue;
4048
4049
btrfs_set_stack_device_generation(dev_item, 0);
4050
btrfs_set_stack_device_type(dev_item, dev->type);
4051
btrfs_set_stack_device_id(dev_item, dev->devid);
4052
btrfs_set_stack_device_total_bytes(dev_item,
4053
dev->commit_total_bytes);
4054
btrfs_set_stack_device_bytes_used(dev_item,
4055
dev->commit_bytes_used);
4056
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
4057
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
4058
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
4059
memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4060
memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4061
BTRFS_FSID_SIZE);
4062
4063
flags = btrfs_super_flags(sb);
4064
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
4065
4066
ret = btrfs_validate_write_super(fs_info, sb);
4067
if (ret < 0) {
4068
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4069
btrfs_handle_fs_error(fs_info, -EUCLEAN,
4070
"unexpected superblock corruption detected");
4071
return -EUCLEAN;
4072
}
4073
4074
ret = write_dev_supers(dev, sb, max_mirrors);
4075
if (ret)
4076
total_errors++;
4077
}
4078
if (total_errors > max_errors) {
4079
btrfs_err(fs_info, "%d errors while writing supers",
4080
total_errors);
4081
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4082
4083
/* FUA is masked off if unsupported and can't be the reason */
4084
btrfs_handle_fs_error(fs_info, -EIO,
4085
"%d errors while writing supers",
4086
total_errors);
4087
return -EIO;
4088
}
4089
4090
total_errors = 0;
4091
list_for_each_entry(dev, head, dev_list) {
4092
if (!dev->bdev)
4093
continue;
4094
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4095
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4096
continue;
4097
4098
ret = wait_dev_supers(dev, max_mirrors);
4099
if (ret)
4100
total_errors++;
4101
}
4102
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4103
if (total_errors > max_errors) {
4104
btrfs_handle_fs_error(fs_info, -EIO,
4105
"%d errors while writing supers",
4106
total_errors);
4107
return -EIO;
4108
}
4109
return 0;
4110
}
4111
4112
/* Drop a fs root from the radix tree and free it. */
4113
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4114
struct btrfs_root *root)
4115
{
4116
bool drop_ref = false;
4117
4118
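/*
 * Removing the root from the radix tree also means dropping the
 * reference the tree held on it while BTRFS_ROOT_IN_RADIX was set.
 */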
spin_lock(&fs_info->fs_roots_radix_lock);
4119
radix_tree_delete(&fs_info->fs_roots_radix,
4120
(unsigned long)btrfs_root_id(root));
4121
if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
4122
drop_ref = true;
4123
spin_unlock(&fs_info->fs_roots_radix_lock);
4124
4125
if (BTRFS_FS_ERROR(fs_info)) {
4126
ASSERT(root->log_root == NULL);
4127
if (root->reloc_root) {
4128
btrfs_put_root(root->reloc_root);
4129
root->reloc_root = NULL;
4130
}
4131
}
4132
4133
if (drop_ref)
4134
btrfs_put_root(root);
4135
}
4136
4137
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4138
{
4139
mutex_lock(&fs_info->cleaner_mutex);
4140
btrfs_run_delayed_iputs(fs_info);
4141
mutex_unlock(&fs_info->cleaner_mutex);
4142
wake_up_process(fs_info->cleaner_kthread);
4143
4144
/* wait until ongoing cleanup work done */
4145
down_write(&fs_info->cleanup_work_sem);
4146
up_write(&fs_info->cleanup_work_sem);
4147
4148
return btrfs_commit_current_transaction(fs_info->tree_root);
4149
}
4150
4151
static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
4152
{
4153
struct btrfs_transaction *trans;
4154
struct btrfs_transaction *tmp;
4155
bool found = false;
4156
4157
/*
4158
* This function is only called at the very end of close_ctree(),
4159
* thus no other running transaction, no need to take trans_lock.
4160
*/
4161
ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
4162
list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
4163
struct extent_state *cached = NULL;
4164
u64 dirty_bytes = 0;
4165
u64 cur = 0;
4166
u64 found_start;
4167
u64 found_end;
4168
4169
found = true;
4170
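/*
 * Sum up the EXTENT_DIRTY ranges still tracked in the transaction's
 * dirty_pages tree to report how much metadata never reached disk.
 */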
while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur,
4171
&found_start, &found_end,
4172
EXTENT_DIRTY, &cached)) {
4173
dirty_bytes += found_end + 1 - found_start;
4174
cur = found_end + 1;
4175
}
4176
btrfs_warn(fs_info,
4177
"transaction %llu (with %llu dirty metadata bytes) is not committed",
4178
trans->transid, dirty_bytes);
4179
btrfs_cleanup_one_transaction(trans);
4180
4181
if (trans == fs_info->running_transaction)
4182
fs_info->running_transaction = NULL;
4183
list_del_init(&trans->list);
4184
4185
btrfs_put_transaction(trans);
4186
trace_btrfs_transaction_commit(fs_info);
4187
}
4188
ASSERT(!found);
4189
}
4190
4191
void __cold close_ctree(struct btrfs_fs_info *fs_info)
4192
{
4193
int ret;
4194
4195
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
4196
4197
/*
4198
* If we had UNFINISHED_DROPS we could still be processing them, so
4199
* clear that bit and wake up relocation so it can stop.
4200
* We must do this before stopping the block group reclaim task, because
4201
* at btrfs_relocate_block_group() we wait for this bit, and after the
4202
* wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
4203
* have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
4204
* return 1.
4205
*/
4206
btrfs_wake_unfinished_drop(fs_info);
4207
4208
/*
4209
* We may have the reclaim task running and relocating a data block group,
4210
* in which case it may create delayed iputs. So stop it before we park
4211
* the cleaner kthread otherwise we can get new delayed iputs after
4212
* parking the cleaner, and that can make the async reclaim task to hang
4213
* if it's waiting for delayed iputs to complete, since the cleaner is
4214
* parked and can not run delayed iputs - this will make us hang when
4215
* trying to stop the async reclaim task.
4216
*/
4217
cancel_work_sync(&fs_info->reclaim_bgs_work);
4218
/*
4219
* We don't want the cleaner to start new transactions, add more delayed
4220
* iputs, etc. while we're closing. We can't use kthread_stop() yet
4221
* because that frees the task_struct, and the transaction kthread might
4222
* still try to wake up the cleaner.
4223
*/
4224
kthread_park(fs_info->cleaner_kthread);
4225
4226
/* wait for the qgroup rescan worker to stop */
4227
btrfs_qgroup_wait_for_completion(fs_info, false);
4228
4229
/* wait for the uuid_scan task to finish */
4230
down(&fs_info->uuid_tree_rescan_sem);
4231
/* avoid complaints from lockdep et al., set sem back to initial state */
4232
up(&fs_info->uuid_tree_rescan_sem);
4233
4234
/* pause restriper - we want to resume on mount */
4235
btrfs_pause_balance(fs_info);
4236
4237
btrfs_dev_replace_suspend_for_unmount(fs_info);
4238
4239
btrfs_scrub_cancel(fs_info);
4240
4241
/* wait for any defraggers to finish */
4242
wait_event(fs_info->transaction_wait,
4243
(atomic_read(&fs_info->defrag_running) == 0));
4244
4245
/* clear out the rbtree of defraggable inodes */
4246
btrfs_cleanup_defrag_inodes(fs_info);
4247
4248
/*
4249
* Handle the error fs first, as it will flush and wait for all ordered
4250
* extents. This will generate delayed iputs, thus we want to handle
4251
* it first.
4252
*/
4253
if (unlikely(BTRFS_FS_ERROR(fs_info)))
4254
btrfs_error_commit_super(fs_info);
4255
4256
/*
4257
* Wait for any fixup workers to complete.
4258
* If we don't wait for them here and they are still running by the time
4259
* we call kthread_stop() against the cleaner kthread further below, we
4260
* get a use-after-free on the cleaner because the fixup worker adds an
4261
* inode to the list of delayed iputs and then attempts to wakeup the
4262
* cleaner kthread, which was already stopped and destroyed. We already
4263
* parked the cleaner, but below we run all pending delayed iputs.
4264
*/
4265
btrfs_flush_workqueue(fs_info->fixup_workers);
4266
/*
4267
* Similar case here, we have to wait for delalloc workers before we
4268
* proceed below and stop the cleaner kthread, otherwise we trigger a
4269
* use-after-free on the cleaner kthread task_struct when a delalloc
4270
* worker running submit_compressed_extents() adds a delayed iput, which
4271
* does a wake up on the cleaner kthread, which was already freed below
4272
* when we call kthread_stop().
4273
*/
4274
btrfs_flush_workqueue(fs_info->delalloc_workers);
4275
4276
/*
4277
* We can have ordered extents getting their last reference dropped from
4278
* the fs_info->workers queue because for async writes for data bios we
4279
* queue a work for that queue, at btrfs_wq_submit_bio(), that runs
4280
* run_one_async_done() which calls btrfs_bio_end_io() in case the bio
4281
* has an error, and that later function can do the final
4282
* btrfs_put_ordered_extent() on the ordered extent attached to the bio,
4283
* which adds a delayed iput for the inode. So we must flush the queue
4284
* so that we don't have delayed iputs after committing the current
4285
* transaction below and stopping the cleaner and transaction kthreads.
4286
*/
4287
btrfs_flush_workqueue(fs_info->workers);
4288
4289
/*
4290
* When finishing a compressed write bio we schedule a work queue item
4291
* to finish an ordered extent - btrfs_finish_compressed_write_work()
4292
* calls btrfs_finish_ordered_extent() which in turns does a call to
4293
* btrfs_queue_ordered_fn(), and that queues the ordered extent
4294
* completion either in the endio_write_workers work queue or in the
4295
* fs_info->endio_freespace_worker work queue. We flush those queues
4296
* below, so before we flush them we must flush this queue for the
4297
* workers of compressed writes.
4298
*/
4299
flush_workqueue(fs_info->compressed_write_workers);
4300
4301
/*
4302
* After we parked the cleaner kthread, ordered extents may have
4303
* completed and created new delayed iputs. If one of the async reclaim
4304
* tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
4305
* can hang forever trying to stop it, because if a delayed iput is
4306
* added after it ran btrfs_run_delayed_iputs() and before it called
4307
* btrfs_wait_on_delayed_iputs(), it will hang forever since there is
4308
* no one else to run iputs.
4309
*
4310
* So wait for all ongoing ordered extents to complete and then run
4311
* delayed iputs. This works because once we reach this point no one
4312
* can create new ordered extents, but delayed iputs can still be added
4313
* by a reclaim worker (see comments further below).
4314
*
4315
* Also note that btrfs_wait_ordered_roots() is not safe here, because
4316
* it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
4317
* but the delayed iput for the respective inode is made only when doing
4318
* the final btrfs_put_ordered_extent() (which must happen at
4319
* btrfs_finish_ordered_io() when we are unmounting).
4320
*/
4321
btrfs_flush_workqueue(fs_info->endio_write_workers);
4322
/* Ordered extents for free space inodes. */
4323
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
4324
/*
4325
* Run delayed iputs in case an async reclaim worker is waiting for them
4326
* to be run as mentioned above.
4327
*/
4328
btrfs_run_delayed_iputs(fs_info);
4329
4330
cancel_work_sync(&fs_info->async_reclaim_work);
4331
cancel_work_sync(&fs_info->async_data_reclaim_work);
4332
cancel_work_sync(&fs_info->preempt_reclaim_work);
4333
cancel_work_sync(&fs_info->em_shrinker_work);
4334
4335
/*
4336
* Run delayed iputs again because an async reclaim worker may have
4337
* added new ones if it was flushing delalloc:
4338
*
4339
* shrink_delalloc() -> btrfs_start_delalloc_roots() ->
4340
* start_delalloc_inodes() -> btrfs_add_delayed_iput()
4341
*/
4342
btrfs_run_delayed_iputs(fs_info);
4343
4344
/* There should be no more workload to generate new delayed iputs. */
4345
set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
4346
4347
/* Cancel or finish ongoing discard work */
4348
btrfs_discard_cleanup(fs_info);
4349
4350
if (!sb_rdonly(fs_info->sb)) {
4351
/*
4352
* The cleaner kthread is stopped, so do one final pass over
4353
* unused block groups.
4354
*/
4355
btrfs_delete_unused_bgs(fs_info);
4356
4357
/*
4358
* There might be existing delayed inode workers still running
4359
* and holding an empty delayed inode item. We must wait for
4360
* them to complete first because they can create a transaction.
4361
* This happens when someone calls btrfs_balance_delayed_items()
4362
* and then a transaction commit runs the same delayed nodes
4363
* before any delayed worker has done something with the nodes.
4364
* We must wait for any worker here and not at transaction
4365
* commit time since that could cause a deadlock.
4366
* This is a very rare case.
4367
*/
4368
btrfs_flush_workqueue(fs_info->delayed_workers);
4369
4370
ret = btrfs_commit_super(fs_info);
4371
if (ret)
4372
btrfs_err(fs_info, "commit super ret %d", ret);
4373
}
4374
4375
kthread_stop(fs_info->transaction_kthread);
4376
kthread_stop(fs_info->cleaner_kthread);
4377
4378
ASSERT(list_empty(&fs_info->delayed_iputs));
4379
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4380
4381
if (btrfs_check_quota_leak(fs_info)) {
4382
DEBUG_WARN("qgroup reserved space leaked");
4383
btrfs_err(fs_info, "qgroup reserved space leaked");
4384
}
4385
4386
btrfs_free_qgroup_config(fs_info);
4387
ASSERT(list_empty(&fs_info->delalloc_roots));
4388
4389
if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
4390
btrfs_info(fs_info, "at unmount delalloc count %lld",
4391
percpu_counter_sum(&fs_info->delalloc_bytes));
4392
}
4393
4394
if (percpu_counter_sum(&fs_info->ordered_bytes))
4395
btrfs_info(fs_info, "at unmount dio bytes count %lld",
4396
percpu_counter_sum(&fs_info->ordered_bytes));
4397
4398
btrfs_sysfs_remove_mounted(fs_info);
4399
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4400
4401
btrfs_put_block_group_cache(fs_info);
4402
4403
/*
4404
* we must make sure there is no read request left to
4405
* submit after we stop all workers.
4406
*/
4407
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
4408
btrfs_stop_all_workers(fs_info);
4409
4410
/* We shouldn't have any transaction open at this point */
4411
warn_about_uncommitted_trans(fs_info);
4412
4413
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
4414
free_root_pointers(fs_info, true);
4415
btrfs_free_fs_roots(fs_info);
4416
4417
/*
4418
* We must free the block groups after dropping the fs_roots as we could
4419
* have had an IO error and have left over tree log blocks that aren't
4420
* cleaned up until the fs roots are freed. This makes the block group
4421
* accounting appear to be wrong because there's pending reserved bytes,
4422
* so make sure we do the block group cleanup afterwards.
4423
*/
4424
btrfs_free_block_groups(fs_info);
4425
4426
iput(fs_info->btree_inode);
4427
4428
btrfs_mapping_tree_free(fs_info);
4429
}
4430
4431
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
4432
struct extent_buffer *buf)
4433
{
4434
struct btrfs_fs_info *fs_info = buf->fs_info;
4435
u64 transid = btrfs_header_generation(buf);
4436
4437
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4438
/*
4439
* This is a fast path so only do this check if we have sanity tests
4440
* enabled. Normal people shouldn't be using unmapped buffers as dirty
4441
* outside of the sanity tests.
4442
*/
4443
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4444
return;
4445
#endif
4446
/* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
4447
ASSERT(trans->transid == fs_info->generation);
4448
btrfs_assert_tree_write_locked(buf);
4449
if (unlikely(transid != fs_info->generation)) {
4450
btrfs_abort_transaction(trans, -EUCLEAN);
4451
btrfs_crit(fs_info,
4452
"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
4453
buf->start, transid, fs_info->generation);
4454
}
4455
set_extent_buffer_dirty(buf);
4456
}
4457
4458
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4459
int flush_delayed)
4460
{
4461
/*
4462
* looks as though older kernels can get into trouble with
4463
* this code; they end up stuck in balance_dirty_pages forever
4464
*/
4465
int ret;
4466
4467
if (current->flags & PF_MEMALLOC)
4468
return;
4469
4470
if (flush_delayed)
4471
btrfs_balance_delayed_items(fs_info);
4472
4473
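/*
 * Only throttle the caller once dirty btree metadata exceeds
 * BTRFS_DIRTY_METADATA_THRESH; the batch value lets the comparison use
 * the cheap approximate per-cpu count when it is far from the threshold.
 */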
ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4474
BTRFS_DIRTY_METADATA_THRESH,
4475
fs_info->dirty_metadata_batch);
4476
if (ret > 0) {
4477
balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4478
}
4479
}
4480
4481
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
4482
{
4483
__btrfs_btree_balance_dirty(fs_info, 1);
4484
}
4485
4486
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
4487
{
4488
__btrfs_btree_balance_dirty(fs_info, 0);
4489
}
4490
4491
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4492
{
4493
/* cleanup FS via transaction */
4494
btrfs_cleanup_transaction(fs_info);
4495
4496
down_write(&fs_info->cleanup_work_sem);
4497
up_write(&fs_info->cleanup_work_sem);
4498
}
4499
4500
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4501
{
4502
struct btrfs_root *gang[8];
4503
u64 root_objectid = 0;
4504
int ret;
4505
4506
spin_lock(&fs_info->fs_roots_radix_lock);
4507
while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4508
(void **)gang, root_objectid,
4509
ARRAY_SIZE(gang))) != 0) {
4510
int i;
4511
4512
for (i = 0; i < ret; i++)
4513
gang[i] = btrfs_grab_root(gang[i]);
4514
spin_unlock(&fs_info->fs_roots_radix_lock);
4515
4516
for (i = 0; i < ret; i++) {
4517
if (!gang[i])
4518
continue;
4519
root_objectid = btrfs_root_id(gang[i]);
4520
btrfs_free_log(NULL, gang[i]);
4521
btrfs_put_root(gang[i]);
4522
}
4523
root_objectid++;
4524
spin_lock(&fs_info->fs_roots_radix_lock);
4525
}
4526
spin_unlock(&fs_info->fs_roots_radix_lock);
4527
btrfs_free_log_root_tree(NULL, fs_info);
4528
}
4529
4530
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4531
{
4532
struct btrfs_ordered_extent *ordered;
4533
4534
spin_lock(&root->ordered_extent_lock);
4535
/*
4536
* This will just short circuit the ordered completion stuff which will
4537
* make sure the ordered extent gets properly cleaned up.
4538
*/
4539
list_for_each_entry(ordered, &root->ordered_extents,
4540
root_extent_list)
4541
set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4542
spin_unlock(&root->ordered_extent_lock);
4543
}
4544
4545
static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4546
{
4547
struct btrfs_root *root;
4548
LIST_HEAD(splice);
4549
4550
spin_lock(&fs_info->ordered_root_lock);
4551
list_splice_init(&fs_info->ordered_roots, &splice);
4552
while (!list_empty(&splice)) {
4553
root = list_first_entry(&splice, struct btrfs_root,
4554
ordered_root);
4555
list_move_tail(&root->ordered_root,
4556
&fs_info->ordered_roots);
4557
4558
spin_unlock(&fs_info->ordered_root_lock);
4559
btrfs_destroy_ordered_extents(root);
4560
4561
cond_resched();
4562
spin_lock(&fs_info->ordered_root_lock);
4563
}
4564
spin_unlock(&fs_info->ordered_root_lock);
4565
4566
/*
4567
* We need this here because if we've been flipped read-only we won't
4568
* get sync() from the umount, so we need to make sure any ordered
4569
* extents that haven't had their dirty pages IO start writeout yet
4570
* actually get run and error out properly.
4571
*/
4572
btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
4573
}
4574
4575
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4576
{
4577
struct btrfs_inode *btrfs_inode;
4578
LIST_HEAD(splice);
4579
4580
spin_lock(&root->delalloc_lock);
4581
list_splice_init(&root->delalloc_inodes, &splice);
4582
4583
while (!list_empty(&splice)) {
4584
struct inode *inode = NULL;
4585
btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4586
delalloc_inodes);
4587
btrfs_del_delalloc_inode(btrfs_inode);
4588
spin_unlock(&root->delalloc_lock);
4589
4590
/*
4591
* Make sure we get a live inode and that it'll not disappear
4592
* meanwhile.
4593
*/
4594
inode = igrab(&btrfs_inode->vfs_inode);
4595
if (inode) {
4596
unsigned int nofs_flag;
4597
4598
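/*
 * Stay in NOFS context across the invalidation so any allocation it
 * triggers cannot recurse back into the filesystem while unmounting.
 */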
nofs_flag = memalloc_nofs_save();
4599
invalidate_inode_pages2(inode->i_mapping);
4600
memalloc_nofs_restore(nofs_flag);
4601
iput(inode);
4602
}
4603
spin_lock(&root->delalloc_lock);
4604
}
4605
spin_unlock(&root->delalloc_lock);
4606
}
4607
4608
static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4609
{
4610
struct btrfs_root *root;
4611
LIST_HEAD(splice);
4612
4613
spin_lock(&fs_info->delalloc_root_lock);
4614
list_splice_init(&fs_info->delalloc_roots, &splice);
4615
while (!list_empty(&splice)) {
4616
root = list_first_entry(&splice, struct btrfs_root,
4617
delalloc_root);
4618
root = btrfs_grab_root(root);
4619
BUG_ON(!root);
4620
spin_unlock(&fs_info->delalloc_root_lock);
4621
4622
btrfs_destroy_delalloc_inodes(root);
4623
btrfs_put_root(root);
4624
4625
spin_lock(&fs_info->delalloc_root_lock);
4626
}
4627
spin_unlock(&fs_info->delalloc_root_lock);
4628
}
4629
4630
static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
4631
struct extent_io_tree *dirty_pages,
4632
int mark)
4633
{
4634
struct extent_buffer *eb;
4635
u64 start = 0;
4636
u64 end;
4637
4638
while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
4639
mark, NULL)) {
4640
btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL);
4641
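/*
 * Walk the range one nodesize at a time and clean up any extent
 * buffer still cached for that tree block.
 */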
while (start <= end) {
4642
eb = find_extent_buffer(fs_info, start);
4643
start += fs_info->nodesize;
4644
if (!eb)
4645
continue;
4646
4647
btrfs_tree_lock(eb);
4648
wait_on_extent_buffer_writeback(eb);
4649
btrfs_clear_buffer_dirty(NULL, eb);
4650
btrfs_tree_unlock(eb);
4651
4652
free_extent_buffer_stale(eb);
4653
}
4654
}
4655
}
4656
4657
static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
4658
struct extent_io_tree *unpin)
4659
{
4660
u64 start;
4661
u64 end;
4662
4663
while (1) {
4664
struct extent_state *cached_state = NULL;
4665
4666
/*
4667
* The btrfs_finish_extent_commit() may get the same range as
4668
* ours between find_first_extent_bit and clear_extent_dirty.
4669
* Hence, hold the unused_bg_unpin_mutex to avoid double unpin
4670
* the same extent range.
4671
*/
4672
mutex_lock(&fs_info->unused_bg_unpin_mutex);
4673
if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end,
4674
EXTENT_DIRTY, &cached_state)) {
4675
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4676
break;
4677
}
4678
4679
btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
4680
btrfs_free_extent_state(cached_state);
4681
btrfs_error_unpin_extent_range(fs_info, start, end);
4682
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4683
cond_resched();
4684
}
4685
}
4686
4687
static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
4688
{
4689
struct inode *inode;
4690
4691
inode = cache->io_ctl.inode;
4692
if (inode) {
4693
unsigned int nofs_flag;
4694
4695
nofs_flag = memalloc_nofs_save();
4696
invalidate_inode_pages2(inode->i_mapping);
4697
memalloc_nofs_restore(nofs_flag);
4698
4699
BTRFS_I(inode)->generation = 0;
4700
cache->io_ctl.inode = NULL;
4701
iput(inode);
4702
}
4703
ASSERT(cache->io_ctl.pages == NULL);
4704
btrfs_put_block_group(cache);
4705
}
4706
4707
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
4708
struct btrfs_fs_info *fs_info)
4709
{
4710
struct btrfs_block_group *cache;
4711
4712
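/*
 * Each block group on the dirty list holds a reference and a delayed
 * refs rsv reservation for its pending item update; release both while
 * marking the free space cache as broken.
 */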
spin_lock(&cur_trans->dirty_bgs_lock);
4713
while (!list_empty(&cur_trans->dirty_bgs)) {
4714
cache = list_first_entry(&cur_trans->dirty_bgs,
4715
struct btrfs_block_group,
4716
dirty_list);
4717
4718
if (!list_empty(&cache->io_list)) {
4719
spin_unlock(&cur_trans->dirty_bgs_lock);
4720
list_del_init(&cache->io_list);
4721
btrfs_cleanup_bg_io(cache);
4722
spin_lock(&cur_trans->dirty_bgs_lock);
4723
}
4724
4725
list_del_init(&cache->dirty_list);
4726
spin_lock(&cache->lock);
4727
cache->disk_cache_state = BTRFS_DC_ERROR;
4728
spin_unlock(&cache->lock);
4729
4730
spin_unlock(&cur_trans->dirty_bgs_lock);
4731
btrfs_put_block_group(cache);
4732
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
4733
spin_lock(&cur_trans->dirty_bgs_lock);
4734
}
4735
spin_unlock(&cur_trans->dirty_bgs_lock);
4736
4737
/*
4738
* Refer to the definition of the io_bgs member for details on why it's
4739
* safe to use it without any locking.
4740
*/
4741
while (!list_empty(&cur_trans->io_bgs)) {
4742
cache = list_first_entry(&cur_trans->io_bgs,
4743
struct btrfs_block_group,
4744
io_list);
4745
4746
list_del_init(&cache->io_list);
4747
spin_lock(&cache->lock);
4748
cache->disk_cache_state = BTRFS_DC_ERROR;
4749
spin_unlock(&cache->lock);
4750
btrfs_cleanup_bg_io(cache);
4751
}
4752
}
4753
4754
static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
4755
{
4756
struct btrfs_root *gang[8];
4757
int i;
4758
int ret;
4759
4760
spin_lock(&fs_info->fs_roots_radix_lock);
4761
while (1) {
4762
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
4763
(void **)gang, 0,
4764
ARRAY_SIZE(gang),
4765
BTRFS_ROOT_TRANS_TAG);
4766
if (ret == 0)
4767
break;
4768
for (i = 0; i < ret; i++) {
4769
struct btrfs_root *root = gang[i];
4770
4771
btrfs_qgroup_free_meta_all_pertrans(root);
4772
radix_tree_tag_clear(&fs_info->fs_roots_radix,
4773
(unsigned long)btrfs_root_id(root),
4774
BTRFS_ROOT_TRANS_TAG);
4775
}
4776
}
4777
spin_unlock(&fs_info->fs_roots_radix_lock);
4778
}
4779
4780
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
4781
{
4782
struct btrfs_fs_info *fs_info = cur_trans->fs_info;
4783
struct btrfs_device *dev, *tmp;
4784
4785
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
4786
ASSERT(list_empty(&cur_trans->dirty_bgs));
4787
ASSERT(list_empty(&cur_trans->io_bgs));
4788
4789
list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
4790
post_commit_list) {
4791
list_del_init(&dev->post_commit_list);
4792
}
4793
4794
btrfs_destroy_delayed_refs(cur_trans);
4795
4796
cur_trans->state = TRANS_STATE_COMMIT_START;
4797
wake_up(&fs_info->transaction_blocked_wait);
4798
4799
cur_trans->state = TRANS_STATE_UNBLOCKED;
4800
wake_up(&fs_info->transaction_wait);
4801
4802
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
4803
EXTENT_DIRTY);
4804
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
4805
4806
cur_trans->state = TRANS_STATE_COMPLETED;
4807
wake_up(&cur_trans->commit_wait);
4808
}
4809
4810
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
4811
{
4812
struct btrfs_transaction *t;
4813
4814
mutex_lock(&fs_info->transaction_kthread_mutex);
4815
4816
spin_lock(&fs_info->trans_lock);
4817
while (!list_empty(&fs_info->trans_list)) {
4818
t = list_first_entry(&fs_info->trans_list,
4819
struct btrfs_transaction, list);
4820
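/*
 * A transaction that already started committing is waited on instead
 * of being torn down here.
 */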
if (t->state >= TRANS_STATE_COMMIT_PREP) {
4821
refcount_inc(&t->use_count);
4822
spin_unlock(&fs_info->trans_lock);
4823
btrfs_wait_for_commit(fs_info, t->transid);
4824
btrfs_put_transaction(t);
4825
spin_lock(&fs_info->trans_lock);
4826
continue;
4827
}
4828
if (t == fs_info->running_transaction) {
4829
t->state = TRANS_STATE_COMMIT_DOING;
4830
spin_unlock(&fs_info->trans_lock);
4831
/*
4832
* We wait for 0 num_writers since we don't hold a trans
4833
* handle open currently for this transaction.
4834
*/
4835
wait_event(t->writer_wait,
4836
atomic_read(&t->num_writers) == 0);
4837
} else {
4838
spin_unlock(&fs_info->trans_lock);
4839
}
4840
btrfs_cleanup_one_transaction(t);
4841
4842
spin_lock(&fs_info->trans_lock);
4843
if (t == fs_info->running_transaction)
4844
fs_info->running_transaction = NULL;
4845
list_del_init(&t->list);
4846
spin_unlock(&fs_info->trans_lock);
4847
4848
btrfs_put_transaction(t);
4849
trace_btrfs_transaction_commit(fs_info);
4850
spin_lock(&fs_info->trans_lock);
4851
}
4852
spin_unlock(&fs_info->trans_lock);
4853
btrfs_destroy_all_ordered_extents(fs_info);
4854
btrfs_destroy_delayed_inodes(fs_info);
4855
btrfs_assert_delayed_root_empty(fs_info);
4856
btrfs_destroy_all_delalloc_inodes(fs_info);
4857
btrfs_drop_all_logs(fs_info);
4858
btrfs_free_all_qgroup_pertrans(fs_info);
4859
mutex_unlock(&fs_info->transaction_kthread_mutex);
4860
4861
return 0;
4862
}
4863
4864
int btrfs_init_root_free_objectid(struct btrfs_root *root)
4865
{
4866
BTRFS_PATH_AUTO_FREE(path);
4867
int ret;
4868
struct extent_buffer *l;
4869
struct btrfs_key search_key;
4870
struct btrfs_key found_key;
4871
int slot;
4872
4873
path = btrfs_alloc_path();
4874
if (!path)
4875
return -ENOMEM;
4876
4877
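/*
 * Search for the largest possible key; the slot just before the
 * resulting insertion point then holds the highest objectid in use.
 */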
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
4878
search_key.type = -1;
4879
search_key.offset = (u64)-1;
4880
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
4881
if (ret < 0)
4882
return ret;
4883
if (ret == 0) {
4884
/*
4885
* Key with offset -1 found, there would have to exist a root
4886
* with such id, but this is out of valid range.
4887
*/
4888
return -EUCLEAN;
4889
}
4890
if (path->slots[0] > 0) {
4891
slot = path->slots[0] - 1;
4892
l = path->nodes[0];
4893
btrfs_item_key_to_cpu(l, &found_key, slot);
4894
root->free_objectid = max_t(u64, found_key.objectid + 1,
4895
BTRFS_FIRST_FREE_OBJECTID);
4896
} else {
4897
root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
4898
}
4899
4900
return 0;
4901
}
4902
4903
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
4904
{
4905
int ret;
4906
mutex_lock(&root->objectid_mutex);
4907
4908
if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
4909
btrfs_warn(root->fs_info,
4910
"the objectid of root %llu reaches its highest value",
4911
btrfs_root_id(root));
4912
ret = -ENOSPC;
4913
goto out;
4914
}
4915
4916
*objectid = root->free_objectid++;
4917
ret = 0;
4918
out:
4919
mutex_unlock(&root->objectid_mutex);
4920
return ret;
4921
}
4922
4923