Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/disk-io.c
49452 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2007 Oracle. All rights reserved.
4
*/
5
6
#include <linux/fs.h>
7
#include <linux/blkdev.h>
8
#include <linux/radix-tree.h>
9
#include <linux/writeback.h>
10
#include <linux/workqueue.h>
11
#include <linux/kthread.h>
12
#include <linux/slab.h>
13
#include <linux/migrate.h>
14
#include <linux/ratelimit.h>
15
#include <linux/uuid.h>
16
#include <linux/semaphore.h>
17
#include <linux/error-injection.h>
18
#include <linux/crc32c.h>
19
#include <linux/sched/mm.h>
20
#include <linux/unaligned.h>
21
#include <crypto/hash.h>
22
#include "ctree.h"
23
#include "disk-io.h"
24
#include "transaction.h"
25
#include "btrfs_inode.h"
26
#include "bio.h"
27
#include "print-tree.h"
28
#include "locking.h"
29
#include "tree-log.h"
30
#include "free-space-cache.h"
31
#include "free-space-tree.h"
32
#include "dev-replace.h"
33
#include "raid56.h"
34
#include "sysfs.h"
35
#include "qgroup.h"
36
#include "compression.h"
37
#include "tree-checker.h"
38
#include "ref-verify.h"
39
#include "block-group.h"
40
#include "discard.h"
41
#include "space-info.h"
42
#include "zoned.h"
43
#include "subpage.h"
44
#include "fs.h"
45
#include "accessors.h"
46
#include "extent-tree.h"
47
#include "root-tree.h"
48
#include "defrag.h"
49
#include "uuid-tree.h"
50
#include "relocation.h"
51
#include "scrub.h"
52
#include "super.h"
53
#include "delayed-inode.h"
54
55
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
56
BTRFS_HEADER_FLAG_RELOC |\
57
BTRFS_SUPER_FLAG_ERROR |\
58
BTRFS_SUPER_FLAG_SEEDING |\
59
BTRFS_SUPER_FLAG_METADUMP |\
60
BTRFS_SUPER_FLAG_METADUMP_V2)
61
62
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
63
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
64
65
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
66
{
67
if (fs_info->csum_shash)
68
crypto_free_shash(fs_info->csum_shash);
69
}
70
71
/*
72
* Compute the csum of a btree block and store the result to provided buffer.
73
*/
74
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
75
{
76
struct btrfs_fs_info *fs_info = buf->fs_info;
77
int num_pages;
78
u32 first_page_part;
79
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
80
char *kaddr;
81
int i;
82
83
shash->tfm = fs_info->csum_shash;
84
crypto_shash_init(shash);
85
86
if (buf->addr) {
87
/* Pages are contiguous, handle them as a big one. */
88
kaddr = buf->addr;
89
first_page_part = fs_info->nodesize;
90
num_pages = 1;
91
} else {
92
kaddr = folio_address(buf->folios[0]);
93
first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
94
num_pages = num_extent_pages(buf);
95
}
96
97
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
98
first_page_part - BTRFS_CSUM_SIZE);
99
100
/*
101
* Multiple single-page folios case would reach here.
102
*
103
* nodesize <= PAGE_SIZE and large folio all handled by above
104
* crypto_shash_update() already.
105
*/
106
for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
107
kaddr = folio_address(buf->folios[i]);
108
crypto_shash_update(shash, kaddr, PAGE_SIZE);
109
}
110
memset(result, 0, BTRFS_CSUM_SIZE);
111
crypto_shash_final(shash, result);
112
}
113
114
/*
115
* we can't consider a given block up to date unless the transid of the
116
* block matches the transid in the parent node's pointer. This is how we
117
* detect blocks that either didn't get written at all or got written
118
* in the wrong place.
119
*/
120
int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic)
121
{
122
if (!extent_buffer_uptodate(eb))
123
return 0;
124
125
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
126
return 1;
127
128
if (atomic)
129
return -EAGAIN;
130
131
if (!extent_buffer_uptodate(eb) ||
132
btrfs_header_generation(eb) != parent_transid) {
133
btrfs_err_rl(eb->fs_info,
134
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
135
eb->start, eb->read_mirror,
136
parent_transid, btrfs_header_generation(eb));
137
clear_extent_buffer_uptodate(eb);
138
return 0;
139
}
140
return 1;
141
}
142
143
static bool btrfs_supported_super_csum(u16 csum_type)
144
{
145
switch (csum_type) {
146
case BTRFS_CSUM_TYPE_CRC32:
147
case BTRFS_CSUM_TYPE_XXHASH:
148
case BTRFS_CSUM_TYPE_SHA256:
149
case BTRFS_CSUM_TYPE_BLAKE2:
150
return true;
151
default:
152
return false;
153
}
154
}
155
156
/*
157
* Return 0 if the superblock checksum type matches the checksum value of that
158
* algorithm. Pass the raw disk superblock data.
159
*/
160
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
161
const struct btrfs_super_block *disk_sb)
162
{
163
char result[BTRFS_CSUM_SIZE];
164
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
165
166
shash->tfm = fs_info->csum_shash;
167
168
/*
169
* The super_block structure does not span the whole
170
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
171
* filled with zeros and is included in the checksum.
172
*/
173
crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
174
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
175
176
if (memcmp(disk_sb->csum, result, fs_info->csum_size))
177
return 1;
178
179
return 0;
180
}
181
182
static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
183
int mirror_num)
184
{
185
struct btrfs_fs_info *fs_info = eb->fs_info;
186
const u32 step = min(fs_info->nodesize, PAGE_SIZE);
187
const u32 nr_steps = eb->len / step;
188
phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
189
int ret = 0;
190
191
if (sb_rdonly(fs_info->sb))
192
return -EROFS;
193
194
for (int i = 0; i < num_extent_pages(eb); i++) {
195
struct folio *folio = eb->folios[i];
196
197
/* No large folio support yet. */
198
ASSERT(folio_order(folio) == 0);
199
ASSERT(i < nr_steps);
200
201
/*
202
* For nodesize < page size, there is just one paddr, with some
203
* offset inside the page.
204
*
205
* For nodesize >= page size, it's one or more paddrs, and eb->start
206
* must be aligned to page boundary.
207
*/
208
paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start);
209
}
210
211
ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start,
212
paddrs, step, mirror_num);
213
return ret;
214
}
215
216
/*
217
* helper to read a given tree block, doing retries as required when
218
* the checksums don't match and we have alternate mirrors to try.
219
*
220
* @check: expected tree parentness check, see the comments of the
221
* structure for details.
222
*/
223
int btrfs_read_extent_buffer(struct extent_buffer *eb,
224
const struct btrfs_tree_parent_check *check)
225
{
226
struct btrfs_fs_info *fs_info = eb->fs_info;
227
int failed = 0;
228
int ret;
229
int num_copies = 0;
230
int mirror_num = 0;
231
int failed_mirror = 0;
232
233
ASSERT(check);
234
235
while (1) {
236
ret = read_extent_buffer_pages(eb, mirror_num, check);
237
if (!ret)
238
break;
239
240
num_copies = btrfs_num_copies(fs_info,
241
eb->start, eb->len);
242
if (num_copies == 1)
243
break;
244
245
if (!failed_mirror) {
246
failed = 1;
247
failed_mirror = eb->read_mirror;
248
}
249
250
mirror_num++;
251
if (mirror_num == failed_mirror)
252
mirror_num++;
253
254
if (mirror_num > num_copies)
255
break;
256
}
257
258
if (failed && !ret && failed_mirror)
259
btrfs_repair_eb_io_failure(eb, failed_mirror);
260
261
return ret;
262
}
263
264
/*
265
* Checksum a dirty tree block before IO.
266
*/
267
int btree_csum_one_bio(struct btrfs_bio *bbio)
268
{
269
struct extent_buffer *eb = bbio->private;
270
struct btrfs_fs_info *fs_info = eb->fs_info;
271
u64 found_start = btrfs_header_bytenr(eb);
272
u64 last_trans;
273
u8 result[BTRFS_CSUM_SIZE];
274
int ret;
275
276
/* Btree blocks are always contiguous on disk. */
277
if (WARN_ON_ONCE(bbio->file_offset != eb->start))
278
return -EIO;
279
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
280
return -EIO;
281
282
/*
283
* If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
284
* checksum it but zero-out its content. This is done to preserve
285
* ordering of I/O without unnecessarily writing out data.
286
*/
287
if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
288
memzero_extent_buffer(eb, 0, eb->len);
289
return 0;
290
}
291
292
if (WARN_ON_ONCE(found_start != eb->start))
293
return -EIO;
294
if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
295
return -EIO;
296
297
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
298
offsetof(struct btrfs_header, fsid),
299
BTRFS_FSID_SIZE) == 0);
300
csum_tree_block(eb, result);
301
302
if (btrfs_header_level(eb))
303
ret = btrfs_check_node(eb);
304
else
305
ret = btrfs_check_leaf(eb);
306
307
if (ret < 0)
308
goto error;
309
310
/*
311
* Also check the generation, the eb reached here must be newer than
312
* last committed. Or something seriously wrong happened.
313
*/
314
last_trans = btrfs_get_last_trans_committed(fs_info);
315
if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
316
ret = -EUCLEAN;
317
btrfs_err(fs_info,
318
"block=%llu bad generation, have %llu expect > %llu",
319
eb->start, btrfs_header_generation(eb), last_trans);
320
goto error;
321
}
322
write_extent_buffer(eb, result, 0, fs_info->csum_size);
323
return 0;
324
325
error:
326
btrfs_print_tree(eb, 0);
327
btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
328
eb->start);
329
/*
330
* Be noisy if this is an extent buffer from a log tree. We don't abort
331
* a transaction in case there's a bad log tree extent buffer, we just
332
* fallback to a transaction commit. Still we want to know when there is
333
* a bad log tree extent buffer, as that may signal a bug somewhere.
334
*/
335
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
336
btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
337
return ret;
338
}
339
340
static bool check_tree_block_fsid(struct extent_buffer *eb)
341
{
342
struct btrfs_fs_info *fs_info = eb->fs_info;
343
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
344
u8 fsid[BTRFS_FSID_SIZE];
345
346
read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
347
BTRFS_FSID_SIZE);
348
349
/*
350
* alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
351
* This is then overwritten by metadata_uuid if it is present in the
352
* device_list_add(). The same true for a seed device as well. So use of
353
* fs_devices::metadata_uuid is appropriate here.
354
*/
355
if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
356
return false;
357
358
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
359
if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
360
return false;
361
362
return true;
363
}
364
365
/* Do basic extent buffer checks at read time */
366
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
367
const struct btrfs_tree_parent_check *check)
368
{
369
struct btrfs_fs_info *fs_info = eb->fs_info;
370
u64 found_start;
371
const u32 csum_size = fs_info->csum_size;
372
u8 found_level;
373
u8 result[BTRFS_CSUM_SIZE];
374
const u8 *header_csum;
375
int ret = 0;
376
const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS);
377
378
ASSERT(check);
379
380
found_start = btrfs_header_bytenr(eb);
381
if (unlikely(found_start != eb->start)) {
382
btrfs_err_rl(fs_info,
383
"bad tree block start, mirror %u want %llu have %llu",
384
eb->read_mirror, eb->start, found_start);
385
ret = -EIO;
386
goto out;
387
}
388
if (unlikely(check_tree_block_fsid(eb))) {
389
btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
390
eb->start, eb->read_mirror);
391
ret = -EIO;
392
goto out;
393
}
394
found_level = btrfs_header_level(eb);
395
if (unlikely(found_level >= BTRFS_MAX_LEVEL)) {
396
btrfs_err(fs_info,
397
"bad tree block level, mirror %u level %d on logical %llu",
398
eb->read_mirror, btrfs_header_level(eb), eb->start);
399
ret = -EIO;
400
goto out;
401
}
402
403
csum_tree_block(eb, result);
404
header_csum = folio_address(eb->folios[0]) +
405
get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
406
407
if (memcmp(result, header_csum, csum_size) != 0) {
408
btrfs_warn_rl(fs_info,
409
"checksum verify failed on logical %llu mirror %u wanted " BTRFS_CSUM_FMT " found " BTRFS_CSUM_FMT " level %d%s",
410
eb->start, eb->read_mirror,
411
BTRFS_CSUM_FMT_VALUE(csum_size, header_csum),
412
BTRFS_CSUM_FMT_VALUE(csum_size, result),
413
btrfs_header_level(eb),
414
ignore_csum ? ", ignored" : "");
415
if (unlikely(!ignore_csum)) {
416
ret = -EUCLEAN;
417
goto out;
418
}
419
}
420
421
if (unlikely(found_level != check->level)) {
422
btrfs_err(fs_info,
423
"level verify failed on logical %llu mirror %u wanted %u found %u",
424
eb->start, eb->read_mirror, check->level, found_level);
425
ret = -EIO;
426
goto out;
427
}
428
if (unlikely(check->transid &&
429
btrfs_header_generation(eb) != check->transid)) {
430
btrfs_err_rl(eb->fs_info,
431
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
432
eb->start, eb->read_mirror, check->transid,
433
btrfs_header_generation(eb));
434
ret = -EIO;
435
goto out;
436
}
437
if (check->has_first_key) {
438
const struct btrfs_key *expect_key = &check->first_key;
439
struct btrfs_key found_key;
440
441
if (found_level)
442
btrfs_node_key_to_cpu(eb, &found_key, 0);
443
else
444
btrfs_item_key_to_cpu(eb, &found_key, 0);
445
if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
446
btrfs_err(fs_info,
447
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
448
eb->start, check->transid,
449
expect_key->objectid,
450
expect_key->type, expect_key->offset,
451
found_key.objectid, found_key.type,
452
found_key.offset);
453
ret = -EUCLEAN;
454
goto out;
455
}
456
}
457
if (check->owner_root) {
458
ret = btrfs_check_eb_owner(eb, check->owner_root);
459
if (ret < 0)
460
goto out;
461
}
462
463
/* If this is a leaf block and it is corrupt, just return -EIO. */
464
if (found_level == 0 && btrfs_check_leaf(eb))
465
ret = -EIO;
466
467
if (found_level > 0 && btrfs_check_node(eb))
468
ret = -EIO;
469
470
if (ret)
471
btrfs_err(fs_info,
472
"read time tree block corruption detected on logical %llu mirror %u",
473
eb->start, eb->read_mirror);
474
out:
475
return ret;
476
}
477
478
#ifdef CONFIG_MIGRATION
479
static int btree_migrate_folio(struct address_space *mapping,
480
struct folio *dst, struct folio *src, enum migrate_mode mode)
481
{
482
/*
483
* we can't safely write a btree page from here,
484
* we haven't done the locking hook
485
*/
486
if (folio_test_dirty(src))
487
return -EAGAIN;
488
/*
489
* Buffers may be managed in a filesystem specific way.
490
* We must have no buffers or drop them.
491
*/
492
if (folio_get_private(src) &&
493
!filemap_release_folio(src, GFP_KERNEL))
494
return -EAGAIN;
495
return migrate_folio(mapping, dst, src, mode);
496
}
497
#else
498
#define btree_migrate_folio NULL
499
#endif
500
501
static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
502
{
503
if (folio_test_writeback(folio) || folio_test_dirty(folio))
504
return false;
505
506
return try_release_extent_buffer(folio);
507
}
508
509
static void btree_invalidate_folio(struct folio *folio, size_t offset,
510
size_t length)
511
{
512
struct extent_io_tree *tree;
513
514
tree = &folio_to_inode(folio)->io_tree;
515
extent_invalidate_folio(tree, folio, offset);
516
btree_release_folio(folio, GFP_NOFS);
517
if (folio_get_private(folio)) {
518
btrfs_warn(folio_to_fs_info(folio),
519
"folio private not zero on folio %llu",
520
(unsigned long long)folio_pos(folio));
521
folio_detach_private(folio);
522
}
523
}
524
525
#ifdef DEBUG
526
static bool btree_dirty_folio(struct address_space *mapping,
527
struct folio *folio)
528
{
529
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
530
struct btrfs_subpage_info *spi = fs_info->subpage_info;
531
struct btrfs_subpage *subpage;
532
struct extent_buffer *eb;
533
int cur_bit = 0;
534
u64 page_start = folio_pos(folio);
535
536
if (fs_info->sectorsize == PAGE_SIZE) {
537
eb = folio_get_private(folio);
538
BUG_ON(!eb);
539
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
540
BUG_ON(!atomic_read(&eb->refs));
541
btrfs_assert_tree_write_locked(eb);
542
return filemap_dirty_folio(mapping, folio);
543
}
544
545
ASSERT(spi);
546
subpage = folio_get_private(folio);
547
548
for (cur_bit = spi->dirty_offset;
549
cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
550
cur_bit++) {
551
unsigned long flags;
552
u64 cur;
553
554
spin_lock_irqsave(&subpage->lock, flags);
555
if (!test_bit(cur_bit, subpage->bitmaps)) {
556
spin_unlock_irqrestore(&subpage->lock, flags);
557
continue;
558
}
559
spin_unlock_irqrestore(&subpage->lock, flags);
560
cur = page_start + cur_bit * fs_info->sectorsize;
561
562
eb = find_extent_buffer(fs_info, cur);
563
ASSERT(eb);
564
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
565
ASSERT(atomic_read(&eb->refs));
566
btrfs_assert_tree_write_locked(eb);
567
free_extent_buffer(eb);
568
569
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
570
}
571
return filemap_dirty_folio(mapping, folio);
572
}
573
#else
574
#define btree_dirty_folio filemap_dirty_folio
575
#endif
576
577
static const struct address_space_operations btree_aops = {
578
.writepages = btree_writepages,
579
.release_folio = btree_release_folio,
580
.invalidate_folio = btree_invalidate_folio,
581
.migrate_folio = btree_migrate_folio,
582
.dirty_folio = btree_dirty_folio,
583
};
584
585
struct extent_buffer *btrfs_find_create_tree_block(
586
struct btrfs_fs_info *fs_info,
587
u64 bytenr, u64 owner_root,
588
int level)
589
{
590
if (btrfs_is_testing(fs_info))
591
return alloc_test_extent_buffer(fs_info, bytenr);
592
return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
593
}
594
595
/*
596
* Read tree block at logical address @bytenr and do variant basic but critical
597
* verification.
598
*
599
* @check: expected tree parentness check, see comments of the
600
* structure for details.
601
*/
602
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
603
struct btrfs_tree_parent_check *check)
604
{
605
struct extent_buffer *buf = NULL;
606
int ret;
607
608
ASSERT(check);
609
610
buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
611
check->level);
612
if (IS_ERR(buf))
613
return buf;
614
615
ret = btrfs_read_extent_buffer(buf, check);
616
if (ret) {
617
free_extent_buffer_stale(buf);
618
return ERR_PTR(ret);
619
}
620
return buf;
621
622
}
623
624
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
625
u64 objectid, gfp_t flags)
626
{
627
struct btrfs_root *root;
628
629
root = kzalloc(sizeof(*root), flags);
630
if (!root)
631
return NULL;
632
633
root->fs_info = fs_info;
634
root->root_key.objectid = objectid;
635
RB_CLEAR_NODE(&root->rb_node);
636
637
xa_init(&root->inodes);
638
xa_init(&root->delayed_nodes);
639
640
btrfs_init_root_block_rsv(root);
641
642
INIT_LIST_HEAD(&root->dirty_list);
643
INIT_LIST_HEAD(&root->root_list);
644
INIT_LIST_HEAD(&root->delalloc_inodes);
645
INIT_LIST_HEAD(&root->delalloc_root);
646
INIT_LIST_HEAD(&root->ordered_extents);
647
INIT_LIST_HEAD(&root->ordered_root);
648
INIT_LIST_HEAD(&root->reloc_dirty_list);
649
spin_lock_init(&root->delalloc_lock);
650
spin_lock_init(&root->ordered_extent_lock);
651
spin_lock_init(&root->accounting_lock);
652
spin_lock_init(&root->qgroup_meta_rsv_lock);
653
mutex_init(&root->objectid_mutex);
654
mutex_init(&root->log_mutex);
655
mutex_init(&root->ordered_extent_mutex);
656
mutex_init(&root->delalloc_mutex);
657
init_waitqueue_head(&root->qgroup_flush_wait);
658
init_waitqueue_head(&root->log_writer_wait);
659
init_waitqueue_head(&root->log_commit_wait[0]);
660
init_waitqueue_head(&root->log_commit_wait[1]);
661
INIT_LIST_HEAD(&root->log_ctxs[0]);
662
INIT_LIST_HEAD(&root->log_ctxs[1]);
663
atomic_set(&root->log_commit[0], 0);
664
atomic_set(&root->log_commit[1], 0);
665
atomic_set(&root->log_writers, 0);
666
atomic_set(&root->log_batch, 0);
667
refcount_set(&root->refs, 1);
668
atomic_set(&root->snapshot_force_cow, 0);
669
atomic_set(&root->nr_swapfiles, 0);
670
root->log_transid_committed = -1;
671
if (!btrfs_is_testing(fs_info)) {
672
btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
673
IO_TREE_ROOT_DIRTY_LOG_PAGES);
674
btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
675
IO_TREE_LOG_CSUM_RANGE);
676
}
677
678
spin_lock_init(&root->root_item_lock);
679
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
680
#ifdef CONFIG_BTRFS_DEBUG
681
INIT_LIST_HEAD(&root->leak_list);
682
spin_lock(&fs_info->fs_roots_radix_lock);
683
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
684
spin_unlock(&fs_info->fs_roots_radix_lock);
685
#endif
686
687
return root;
688
}
689
690
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
691
/* Should only be used by the testing infrastructure */
692
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
693
{
694
struct btrfs_root *root;
695
696
if (!fs_info)
697
return ERR_PTR(-EINVAL);
698
699
root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
700
if (!root)
701
return ERR_PTR(-ENOMEM);
702
703
/* We don't use the stripesize in selftest, set it as sectorsize */
704
root->alloc_bytenr = 0;
705
706
return root;
707
}
708
#endif
709
710
static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
711
{
712
const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
713
const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
714
715
return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
716
}
717
718
static int global_root_key_cmp(const void *k, const struct rb_node *node)
719
{
720
const struct btrfs_key *key = k;
721
const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
722
723
return btrfs_comp_cpu_keys(key, &root->root_key);
724
}
725
726
int btrfs_global_root_insert(struct btrfs_root *root)
727
{
728
struct btrfs_fs_info *fs_info = root->fs_info;
729
struct rb_node *tmp;
730
int ret = 0;
731
732
write_lock(&fs_info->global_root_lock);
733
tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
734
write_unlock(&fs_info->global_root_lock);
735
736
if (tmp) {
737
ret = -EEXIST;
738
btrfs_warn(fs_info, "global root %llu %llu already exists",
739
btrfs_root_id(root), root->root_key.offset);
740
}
741
return ret;
742
}
743
744
void btrfs_global_root_delete(struct btrfs_root *root)
745
{
746
struct btrfs_fs_info *fs_info = root->fs_info;
747
748
write_lock(&fs_info->global_root_lock);
749
rb_erase(&root->rb_node, &fs_info->global_root_tree);
750
write_unlock(&fs_info->global_root_lock);
751
}
752
753
struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
754
struct btrfs_key *key)
755
{
756
struct rb_node *node;
757
struct btrfs_root *root = NULL;
758
759
read_lock(&fs_info->global_root_lock);
760
node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
761
if (node)
762
root = container_of(node, struct btrfs_root, rb_node);
763
read_unlock(&fs_info->global_root_lock);
764
765
return root;
766
}
767
768
static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
769
{
770
struct btrfs_block_group *block_group;
771
u64 ret;
772
773
if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
774
return 0;
775
776
if (bytenr)
777
block_group = btrfs_lookup_block_group(fs_info, bytenr);
778
else
779
block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
780
ASSERT(block_group);
781
if (!block_group)
782
return 0;
783
ret = block_group->global_root_id;
784
btrfs_put_block_group(block_group);
785
786
return ret;
787
}
788
789
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
790
{
791
struct btrfs_key key = {
792
.objectid = BTRFS_CSUM_TREE_OBJECTID,
793
.type = BTRFS_ROOT_ITEM_KEY,
794
.offset = btrfs_global_root_id(fs_info, bytenr),
795
};
796
797
return btrfs_global_root(fs_info, &key);
798
}
799
800
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
801
{
802
struct btrfs_key key = {
803
.objectid = BTRFS_EXTENT_TREE_OBJECTID,
804
.type = BTRFS_ROOT_ITEM_KEY,
805
.offset = btrfs_global_root_id(fs_info, bytenr),
806
};
807
808
return btrfs_global_root(fs_info, &key);
809
}
810
811
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
812
u64 objectid)
813
{
814
struct btrfs_fs_info *fs_info = trans->fs_info;
815
struct extent_buffer *leaf;
816
struct btrfs_root *tree_root = fs_info->tree_root;
817
struct btrfs_root *root;
818
struct btrfs_key key;
819
unsigned int nofs_flag;
820
int ret = 0;
821
822
/*
823
* We're holding a transaction handle, so use a NOFS memory allocation
824
* context to avoid deadlock if reclaim happens.
825
*/
826
nofs_flag = memalloc_nofs_save();
827
root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
828
memalloc_nofs_restore(nofs_flag);
829
if (!root)
830
return ERR_PTR(-ENOMEM);
831
832
root->root_key.objectid = objectid;
833
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
834
root->root_key.offset = 0;
835
836
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
837
0, BTRFS_NESTING_NORMAL);
838
if (IS_ERR(leaf)) {
839
ret = PTR_ERR(leaf);
840
leaf = NULL;
841
goto fail;
842
}
843
844
root->node = leaf;
845
btrfs_mark_buffer_dirty(trans, leaf);
846
847
root->commit_root = btrfs_root_node(root);
848
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
849
850
btrfs_set_root_flags(&root->root_item, 0);
851
btrfs_set_root_limit(&root->root_item, 0);
852
btrfs_set_root_bytenr(&root->root_item, leaf->start);
853
btrfs_set_root_generation(&root->root_item, trans->transid);
854
btrfs_set_root_level(&root->root_item, 0);
855
btrfs_set_root_refs(&root->root_item, 1);
856
btrfs_set_root_used(&root->root_item, leaf->len);
857
btrfs_set_root_last_snapshot(&root->root_item, 0);
858
btrfs_set_root_dirid(&root->root_item, 0);
859
if (btrfs_is_fstree(objectid))
860
generate_random_guid(root->root_item.uuid);
861
else
862
export_guid(root->root_item.uuid, &guid_null);
863
btrfs_set_root_drop_level(&root->root_item, 0);
864
865
btrfs_tree_unlock(leaf);
866
867
key.objectid = objectid;
868
key.type = BTRFS_ROOT_ITEM_KEY;
869
key.offset = 0;
870
ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
871
if (ret)
872
goto fail;
873
874
return root;
875
876
fail:
877
btrfs_put_root(root);
878
879
return ERR_PTR(ret);
880
}
881
882
static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info)
883
{
884
struct btrfs_root *root;
885
886
root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
887
if (!root)
888
return ERR_PTR(-ENOMEM);
889
890
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
891
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
892
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
893
894
return root;
895
}
896
897
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
898
struct btrfs_root *root)
899
{
900
struct extent_buffer *leaf;
901
902
/*
903
* DON'T set SHAREABLE bit for log trees.
904
*
905
* Log trees are not exposed to user space thus can't be snapshotted,
906
* and they go away before a real commit is actually done.
907
*
908
* They do store pointers to file data extents, and those reference
909
* counts still get updated (along with back refs to the log tree).
910
*/
911
912
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
913
NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
914
if (IS_ERR(leaf))
915
return PTR_ERR(leaf);
916
917
root->node = leaf;
918
919
btrfs_mark_buffer_dirty(trans, root->node);
920
btrfs_tree_unlock(root->node);
921
922
return 0;
923
}
924
925
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
926
struct btrfs_fs_info *fs_info)
927
{
928
struct btrfs_root *log_root;
929
930
log_root = alloc_log_tree(fs_info);
931
if (IS_ERR(log_root))
932
return PTR_ERR(log_root);
933
934
if (!btrfs_is_zoned(fs_info)) {
935
int ret = btrfs_alloc_log_tree_node(trans, log_root);
936
937
if (ret) {
938
btrfs_put_root(log_root);
939
return ret;
940
}
941
}
942
943
WARN_ON(fs_info->log_root_tree);
944
fs_info->log_root_tree = log_root;
945
return 0;
946
}
947
948
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
949
struct btrfs_root *root)
950
{
951
struct btrfs_fs_info *fs_info = root->fs_info;
952
struct btrfs_root *log_root;
953
struct btrfs_inode_item *inode_item;
954
int ret;
955
956
log_root = alloc_log_tree(fs_info);
957
if (IS_ERR(log_root))
958
return PTR_ERR(log_root);
959
960
ret = btrfs_alloc_log_tree_node(trans, log_root);
961
if (ret) {
962
btrfs_put_root(log_root);
963
return ret;
964
}
965
966
btrfs_set_root_last_trans(log_root, trans->transid);
967
log_root->root_key.offset = btrfs_root_id(root);
968
969
inode_item = &log_root->root_item.inode;
970
btrfs_set_stack_inode_generation(inode_item, 1);
971
btrfs_set_stack_inode_size(inode_item, 3);
972
btrfs_set_stack_inode_nlink(inode_item, 1);
973
btrfs_set_stack_inode_nbytes(inode_item,
974
fs_info->nodesize);
975
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
976
977
btrfs_set_root_node(&log_root->root_item, log_root->node);
978
979
WARN_ON(root->log_root);
980
root->log_root = log_root;
981
btrfs_set_root_log_transid(root, 0);
982
root->log_transid_committed = -1;
983
btrfs_set_root_last_log_commit(root, 0);
984
return 0;
985
}
986
987
static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
988
struct btrfs_path *path,
989
const struct btrfs_key *key)
990
{
991
struct btrfs_root *root;
992
struct btrfs_tree_parent_check check = { 0 };
993
struct btrfs_fs_info *fs_info = tree_root->fs_info;
994
u64 generation;
995
int ret;
996
int level;
997
998
root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
999
if (!root)
1000
return ERR_PTR(-ENOMEM);
1001
1002
ret = btrfs_find_root(tree_root, key, path,
1003
&root->root_item, &root->root_key);
1004
if (ret) {
1005
if (ret > 0)
1006
ret = -ENOENT;
1007
goto fail;
1008
}
1009
1010
generation = btrfs_root_generation(&root->root_item);
1011
level = btrfs_root_level(&root->root_item);
1012
check.level = level;
1013
check.transid = generation;
1014
check.owner_root = key->objectid;
1015
root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
1016
&check);
1017
if (IS_ERR(root->node)) {
1018
ret = PTR_ERR(root->node);
1019
root->node = NULL;
1020
goto fail;
1021
}
1022
if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) {
1023
ret = -EIO;
1024
goto fail;
1025
}
1026
1027
/*
1028
* For real fs, and not log/reloc trees, root owner must
1029
* match its root node owner
1030
*/
1031
if (unlikely(!btrfs_is_testing(fs_info) &&
1032
btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
1033
btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
1034
btrfs_root_id(root) != btrfs_header_owner(root->node))) {
1035
btrfs_crit(fs_info,
1036
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
1037
btrfs_root_id(root), root->node->start,
1038
btrfs_header_owner(root->node),
1039
btrfs_root_id(root));
1040
ret = -EUCLEAN;
1041
goto fail;
1042
}
1043
root->commit_root = btrfs_root_node(root);
1044
return root;
1045
fail:
1046
btrfs_put_root(root);
1047
return ERR_PTR(ret);
1048
}
1049
1050
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1051
const struct btrfs_key *key)
1052
{
1053
struct btrfs_root *root;
1054
BTRFS_PATH_AUTO_FREE(path);
1055
1056
path = btrfs_alloc_path();
1057
if (!path)
1058
return ERR_PTR(-ENOMEM);
1059
root = read_tree_root_path(tree_root, path, key);
1060
1061
return root;
1062
}
1063
1064
/*
1065
* Initialize subvolume root in-memory structure.
1066
*
1067
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
1068
*
1069
* In case of failure the caller is responsible to call btrfs_free_fs_root()
1070
*/
1071
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1072
{
1073
int ret;
1074
1075
btrfs_drew_lock_init(&root->snapshot_lock);
1076
1077
if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
1078
!btrfs_is_data_reloc_root(root) &&
1079
btrfs_is_fstree(btrfs_root_id(root))) {
1080
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
1081
btrfs_check_and_init_root_item(&root->root_item);
1082
}
1083
1084
/*
1085
* Don't assign anonymous block device to roots that are not exposed to
1086
* userspace, the id pool is limited to 1M
1087
*/
1088
if (btrfs_is_fstree(btrfs_root_id(root)) &&
1089
btrfs_root_refs(&root->root_item) > 0) {
1090
if (!anon_dev) {
1091
ret = get_anon_bdev(&root->anon_dev);
1092
if (ret)
1093
return ret;
1094
} else {
1095
root->anon_dev = anon_dev;
1096
}
1097
}
1098
1099
mutex_lock(&root->objectid_mutex);
1100
ret = btrfs_init_root_free_objectid(root);
1101
if (ret) {
1102
mutex_unlock(&root->objectid_mutex);
1103
return ret;
1104
}
1105
1106
ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1107
1108
mutex_unlock(&root->objectid_mutex);
1109
1110
return 0;
1111
}
1112
1113
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1114
u64 root_id)
1115
{
1116
struct btrfs_root *root;
1117
1118
spin_lock(&fs_info->fs_roots_radix_lock);
1119
root = radix_tree_lookup(&fs_info->fs_roots_radix,
1120
(unsigned long)root_id);
1121
root = btrfs_grab_root(root);
1122
spin_unlock(&fs_info->fs_roots_radix_lock);
1123
return root;
1124
}
1125
1126
static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
1127
u64 objectid)
1128
{
1129
struct btrfs_key key = {
1130
.objectid = objectid,
1131
.type = BTRFS_ROOT_ITEM_KEY,
1132
.offset = 0,
1133
};
1134
1135
switch (objectid) {
1136
case BTRFS_ROOT_TREE_OBJECTID:
1137
return btrfs_grab_root(fs_info->tree_root);
1138
case BTRFS_EXTENT_TREE_OBJECTID:
1139
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1140
case BTRFS_CHUNK_TREE_OBJECTID:
1141
return btrfs_grab_root(fs_info->chunk_root);
1142
case BTRFS_DEV_TREE_OBJECTID:
1143
return btrfs_grab_root(fs_info->dev_root);
1144
case BTRFS_CSUM_TREE_OBJECTID:
1145
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1146
case BTRFS_QUOTA_TREE_OBJECTID:
1147
return btrfs_grab_root(fs_info->quota_root);
1148
case BTRFS_UUID_TREE_OBJECTID:
1149
return btrfs_grab_root(fs_info->uuid_root);
1150
case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
1151
return btrfs_grab_root(fs_info->block_group_root);
1152
case BTRFS_FREE_SPACE_TREE_OBJECTID:
1153
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1154
case BTRFS_RAID_STRIPE_TREE_OBJECTID:
1155
return btrfs_grab_root(fs_info->stripe_root);
1156
default:
1157
return NULL;
1158
}
1159
}
1160
1161
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1162
struct btrfs_root *root)
1163
{
1164
int ret;
1165
1166
ret = radix_tree_preload(GFP_NOFS);
1167
if (ret)
1168
return ret;
1169
1170
spin_lock(&fs_info->fs_roots_radix_lock);
1171
ret = radix_tree_insert(&fs_info->fs_roots_radix,
1172
(unsigned long)btrfs_root_id(root),
1173
root);
1174
if (ret == 0) {
1175
btrfs_grab_root(root);
1176
set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1177
}
1178
spin_unlock(&fs_info->fs_roots_radix_lock);
1179
radix_tree_preload_end();
1180
1181
return ret;
1182
}
1183
1184
void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info)
1185
{
1186
#ifdef CONFIG_BTRFS_DEBUG
1187
struct btrfs_root *root;
1188
1189
while (!list_empty(&fs_info->allocated_roots)) {
1190
char buf[BTRFS_ROOT_NAME_BUF_LEN];
1191
1192
root = list_first_entry(&fs_info->allocated_roots,
1193
struct btrfs_root, leak_list);
1194
btrfs_err(fs_info, "leaked root %s refcount %d",
1195
btrfs_root_name(&root->root_key, buf),
1196
refcount_read(&root->refs));
1197
WARN_ON_ONCE(1);
1198
while (refcount_read(&root->refs) > 1)
1199
btrfs_put_root(root);
1200
btrfs_put_root(root);
1201
}
1202
#endif
1203
}
1204
1205
static void free_global_roots(struct btrfs_fs_info *fs_info)
1206
{
1207
struct btrfs_root *root;
1208
struct rb_node *node;
1209
1210
while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
1211
root = rb_entry(node, struct btrfs_root, rb_node);
1212
rb_erase(&root->rb_node, &fs_info->global_root_tree);
1213
btrfs_put_root(root);
1214
}
1215
}
1216
1217
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1218
{
1219
struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
1220
1221
if (fs_info->fs_devices)
1222
btrfs_close_devices(fs_info->fs_devices);
1223
btrfs_free_compress_wsm(fs_info);
1224
percpu_counter_destroy(&fs_info->stats_read_blocks);
1225
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
1226
percpu_counter_destroy(&fs_info->delalloc_bytes);
1227
percpu_counter_destroy(&fs_info->ordered_bytes);
1228
if (percpu_counter_initialized(em_counter))
1229
ASSERT(percpu_counter_sum_positive(em_counter) == 0);
1230
percpu_counter_destroy(em_counter);
1231
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
1232
btrfs_free_csum_hash(fs_info);
1233
btrfs_free_stripe_hash_table(fs_info);
1234
btrfs_free_ref_cache(fs_info);
1235
kfree(fs_info->balance_ctl);
1236
kfree(fs_info->delayed_root);
1237
free_global_roots(fs_info);
1238
btrfs_put_root(fs_info->tree_root);
1239
btrfs_put_root(fs_info->chunk_root);
1240
btrfs_put_root(fs_info->dev_root);
1241
btrfs_put_root(fs_info->quota_root);
1242
btrfs_put_root(fs_info->uuid_root);
1243
btrfs_put_root(fs_info->fs_root);
1244
btrfs_put_root(fs_info->data_reloc_root);
1245
btrfs_put_root(fs_info->block_group_root);
1246
btrfs_put_root(fs_info->stripe_root);
1247
btrfs_check_leaked_roots(fs_info);
1248
btrfs_extent_buffer_leak_debug_check(fs_info);
1249
kfree(fs_info->super_copy);
1250
kfree(fs_info->super_for_commit);
1251
kvfree(fs_info);
1252
}
1253
1254
1255
/*
1256
* Get an in-memory reference of a root structure.
1257
*
1258
* For essential trees like root/extent tree, we grab it from fs_info directly.
1259
* For subvolume trees, we check the cached filesystem roots first. If not
1260
* found, then read it from disk and add it to cached fs roots.
1261
*
1262
* Caller should release the root by calling btrfs_put_root() after the usage.
1263
*
1264
* NOTE: Reloc and log trees can't be read by this function as they share the
1265
* same root objectid.
1266
*
1267
* @objectid: root id
1268
* @anon_dev: preallocated anonymous block device number for new roots,
1269
* pass NULL for a new allocation.
1270
* @check_ref: whether to check root item references, If true, return -ENOENT
1271
* for orphan roots
1272
*/
1273
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1274
u64 objectid, dev_t *anon_dev,
1275
bool check_ref)
1276
{
1277
struct btrfs_root *root;
1278
struct btrfs_path *path;
1279
struct btrfs_key key;
1280
int ret;
1281
1282
root = btrfs_get_global_root(fs_info, objectid);
1283
if (root)
1284
return root;
1285
1286
/*
1287
* If we're called for non-subvolume trees, and above function didn't
1288
* find one, do not try to read it from disk.
1289
*
1290
* This is namely for free-space-tree and quota tree, which can change
1291
* at runtime and should only be grabbed from fs_info.
1292
*/
1293
if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1294
return ERR_PTR(-ENOENT);
1295
again:
1296
root = btrfs_lookup_fs_root(fs_info, objectid);
1297
if (root) {
1298
/*
1299
* Some other caller may have read out the newly inserted
1300
* subvolume already (for things like backref walk etc). Not
1301
* that common but still possible. In that case, we just need
1302
* to free the anon_dev.
1303
*/
1304
if (unlikely(anon_dev && *anon_dev)) {
1305
free_anon_bdev(*anon_dev);
1306
*anon_dev = 0;
1307
}
1308
1309
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1310
btrfs_put_root(root);
1311
return ERR_PTR(-ENOENT);
1312
}
1313
return root;
1314
}
1315
1316
key.objectid = objectid;
1317
key.type = BTRFS_ROOT_ITEM_KEY;
1318
key.offset = (u64)-1;
1319
root = btrfs_read_tree_root(fs_info->tree_root, &key);
1320
if (IS_ERR(root))
1321
return root;
1322
1323
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1324
ret = -ENOENT;
1325
goto fail;
1326
}
1327
1328
ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
1329
if (ret)
1330
goto fail;
1331
1332
path = btrfs_alloc_path();
1333
if (!path) {
1334
ret = -ENOMEM;
1335
goto fail;
1336
}
1337
key.objectid = BTRFS_ORPHAN_OBJECTID;
1338
key.type = BTRFS_ORPHAN_ITEM_KEY;
1339
key.offset = objectid;
1340
1341
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1342
btrfs_free_path(path);
1343
if (ret < 0)
1344
goto fail;
1345
if (ret == 0)
1346
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1347
1348
ret = btrfs_insert_fs_root(fs_info, root);
1349
if (ret) {
1350
if (ret == -EEXIST) {
1351
btrfs_put_root(root);
1352
goto again;
1353
}
1354
goto fail;
1355
}
1356
return root;
1357
fail:
1358
/*
1359
* If our caller provided us an anonymous device, then it's his
1360
* responsibility to free it in case we fail. So we have to set our
1361
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
1362
* and once again by our caller.
1363
*/
1364
if (anon_dev && *anon_dev)
1365
root->anon_dev = 0;
1366
btrfs_put_root(root);
1367
return ERR_PTR(ret);
1368
}
1369
1370
/*
1371
* Get in-memory reference of a root structure
1372
*
1373
* @objectid: tree objectid
1374
* @check_ref: if set, verify that the tree exists and the item has at least
1375
* one reference
1376
*/
1377
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1378
u64 objectid, bool check_ref)
1379
{
1380
return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
1381
}
1382
1383
/*
1384
* Get in-memory reference of a root structure, created as new, optionally pass
1385
* the anonymous block device id
1386
*
1387
* @objectid: tree objectid
1388
* @anon_dev: if NULL, allocate a new anonymous block device or use the
1389
* parameter value if not NULL
1390
*/
1391
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1392
u64 objectid, dev_t *anon_dev)
1393
{
1394
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
1395
}
1396
1397
/*
1398
* Return a root for the given objectid.
1399
*
1400
* @fs_info: the fs_info
1401
* @objectid: the objectid we need to lookup
1402
*
1403
* This is exclusively used for backref walking, and exists specifically because
1404
* of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
1405
* creation time, which means we may have to read the tree_root in order to look
1406
* up a fs root that is not in memory. If the root is not in memory we will
1407
* read the tree root commit root and look up the fs root from there. This is a
1408
* temporary root, it will not be inserted into the radix tree as it doesn't
1409
* have the most uptodate information, it'll simply be discarded once the
1410
* backref code is finished using the root.
1411
*/
1412
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1413
struct btrfs_path *path,
1414
u64 objectid)
1415
{
1416
struct btrfs_root *root;
1417
struct btrfs_key key;
1418
1419
ASSERT(path->search_commit_root && path->skip_locking);
1420
1421
/*
1422
* This can return -ENOENT if we ask for a root that doesn't exist, but
1423
* since this is called via the backref walking code we won't be looking
1424
* up a root that doesn't exist, unless there's corruption. So if root
1425
* != NULL just return it.
1426
*/
1427
root = btrfs_get_global_root(fs_info, objectid);
1428
if (root)
1429
return root;
1430
1431
root = btrfs_lookup_fs_root(fs_info, objectid);
1432
if (root)
1433
return root;
1434
1435
key.objectid = objectid;
1436
key.type = BTRFS_ROOT_ITEM_KEY;
1437
key.offset = (u64)-1;
1438
root = read_tree_root_path(fs_info->tree_root, path, &key);
1439
btrfs_release_path(path);
1440
1441
return root;
1442
}
1443
1444
static int cleaner_kthread(void *arg)
1445
{
1446
struct btrfs_fs_info *fs_info = arg;
1447
int again;
1448
1449
while (1) {
1450
again = 0;
1451
1452
set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1453
1454
/* Make the cleaner go to sleep early. */
1455
if (btrfs_need_cleaner_sleep(fs_info))
1456
goto sleep;
1457
1458
/*
1459
* Do not do anything if we might cause open_ctree() to block
1460
* before we have finished mounting the filesystem.
1461
*/
1462
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1463
goto sleep;
1464
1465
if (!mutex_trylock(&fs_info->cleaner_mutex))
1466
goto sleep;
1467
1468
/*
1469
* Avoid the problem that we change the status of the fs
1470
* during the above check and trylock.
1471
*/
1472
if (btrfs_need_cleaner_sleep(fs_info)) {
1473
mutex_unlock(&fs_info->cleaner_mutex);
1474
goto sleep;
1475
}
1476
1477
if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
1478
btrfs_sysfs_feature_update(fs_info);
1479
1480
btrfs_run_delayed_iputs(fs_info);
1481
1482
again = btrfs_clean_one_deleted_snapshot(fs_info);
1483
mutex_unlock(&fs_info->cleaner_mutex);
1484
1485
/*
1486
* The defragger has dealt with the R/O remount and umount,
1487
* needn't do anything special here.
1488
*/
1489
btrfs_run_defrag_inodes(fs_info);
1490
1491
/*
1492
* Acquires fs_info->reclaim_bgs_lock to avoid racing
1493
* with relocation (btrfs_relocate_chunk) and relocation
1494
* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1495
* after acquiring fs_info->reclaim_bgs_lock. So we
1496
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
1497
* unused block groups.
1498
*/
1499
btrfs_delete_unused_bgs(fs_info);
1500
1501
/*
1502
* Reclaim block groups in the reclaim_bgs list after we deleted
1503
* all unused block_groups. This possibly gives us some more free
1504
* space.
1505
*/
1506
btrfs_reclaim_bgs(fs_info);
1507
sleep:
1508
clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1509
if (kthread_should_park())
1510
kthread_parkme();
1511
if (kthread_should_stop())
1512
return 0;
1513
if (!again) {
1514
set_current_state(TASK_INTERRUPTIBLE);
1515
schedule();
1516
__set_current_state(TASK_RUNNING);
1517
}
1518
}
1519
}
1520
1521
static int transaction_kthread(void *arg)
1522
{
1523
struct btrfs_root *root = arg;
1524
struct btrfs_fs_info *fs_info = root->fs_info;
1525
struct btrfs_trans_handle *trans;
1526
struct btrfs_transaction *cur;
1527
u64 transid;
1528
time64_t delta;
1529
unsigned long delay;
1530
bool cannot_commit;
1531
1532
do {
1533
cannot_commit = false;
1534
delay = secs_to_jiffies(fs_info->commit_interval);
1535
mutex_lock(&fs_info->transaction_kthread_mutex);
1536
1537
spin_lock(&fs_info->trans_lock);
1538
cur = fs_info->running_transaction;
1539
if (!cur) {
1540
spin_unlock(&fs_info->trans_lock);
1541
goto sleep;
1542
}
1543
1544
delta = ktime_get_seconds() - cur->start_time;
1545
if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
1546
cur->state < TRANS_STATE_COMMIT_PREP &&
1547
delta < fs_info->commit_interval) {
1548
spin_unlock(&fs_info->trans_lock);
1549
delay -= secs_to_jiffies(delta - 1);
1550
delay = min(delay,
1551
secs_to_jiffies(fs_info->commit_interval));
1552
goto sleep;
1553
}
1554
transid = cur->transid;
1555
spin_unlock(&fs_info->trans_lock);
1556
1557
/* If the file system is aborted, this will always fail. */
1558
trans = btrfs_attach_transaction(root);
1559
if (IS_ERR(trans)) {
1560
if (PTR_ERR(trans) != -ENOENT)
1561
cannot_commit = true;
1562
goto sleep;
1563
}
1564
if (transid == trans->transid) {
1565
btrfs_commit_transaction(trans);
1566
} else {
1567
btrfs_end_transaction(trans);
1568
}
1569
sleep:
1570
wake_up_process(fs_info->cleaner_kthread);
1571
mutex_unlock(&fs_info->transaction_kthread_mutex);
1572
1573
if (BTRFS_FS_ERROR(fs_info))
1574
btrfs_cleanup_transaction(fs_info);
1575
if (!kthread_should_stop() &&
1576
(!btrfs_transaction_blocked(fs_info) ||
1577
cannot_commit))
1578
schedule_timeout_interruptible(delay);
1579
} while (!kthread_should_stop());
1580
return 0;
1581
}
1582
1583
/*
1584
* This will find the highest generation in the array of root backups. The
1585
* index of the highest array is returned, or -EINVAL if we can't find
1586
* anything.
1587
*
1588
* We check to make sure the array is valid by comparing the
1589
* generation of the latest root in the array with the generation
1590
* in the super block. If they don't match we pitch it.
1591
*/
1592
static int find_newest_super_backup(struct btrfs_fs_info *info)
1593
{
1594
const u64 newest_gen = btrfs_super_generation(info->super_copy);
1595
u64 cur;
1596
struct btrfs_root_backup *root_backup;
1597
int i;
1598
1599
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1600
root_backup = info->super_copy->super_roots + i;
1601
cur = btrfs_backup_tree_root_gen(root_backup);
1602
if (cur == newest_gen)
1603
return i;
1604
}
1605
1606
return -EINVAL;
1607
}
1608
1609
/*
1610
* copy all the root pointers into the super backup array.
1611
* this will bump the backup pointer by one when it is
1612
* done
1613
*/
1614
static void backup_super_roots(struct btrfs_fs_info *info)
1615
{
1616
const int next_backup = info->backup_root_index;
1617
struct btrfs_root_backup *root_backup;
1618
1619
root_backup = info->super_for_commit->super_roots + next_backup;
1620
1621
/*
1622
* make sure all of our padding and empty slots get zero filled
1623
* regardless of which ones we use today
1624
*/
1625
memset(root_backup, 0, sizeof(*root_backup));
1626
1627
info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1628
1629
btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1630
btrfs_set_backup_tree_root_gen(root_backup,
1631
btrfs_header_generation(info->tree_root->node));
1632
1633
btrfs_set_backup_tree_root_level(root_backup,
1634
btrfs_header_level(info->tree_root->node));
1635
1636
btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1637
btrfs_set_backup_chunk_root_gen(root_backup,
1638
btrfs_header_generation(info->chunk_root->node));
1639
btrfs_set_backup_chunk_root_level(root_backup,
1640
btrfs_header_level(info->chunk_root->node));
1641
1642
if (!btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
1643
struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
1644
struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
1645
1646
btrfs_set_backup_extent_root(root_backup,
1647
extent_root->node->start);
1648
btrfs_set_backup_extent_root_gen(root_backup,
1649
btrfs_header_generation(extent_root->node));
1650
btrfs_set_backup_extent_root_level(root_backup,
1651
btrfs_header_level(extent_root->node));
1652
1653
btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
1654
btrfs_set_backup_csum_root_gen(root_backup,
1655
btrfs_header_generation(csum_root->node));
1656
btrfs_set_backup_csum_root_level(root_backup,
1657
btrfs_header_level(csum_root->node));
1658
}
1659
1660
/*
1661
* we might commit during log recovery, which happens before we set
1662
* the fs_root. Make sure it is valid before we fill it in.
1663
*/
1664
if (info->fs_root && info->fs_root->node) {
1665
btrfs_set_backup_fs_root(root_backup,
1666
info->fs_root->node->start);
1667
btrfs_set_backup_fs_root_gen(root_backup,
1668
btrfs_header_generation(info->fs_root->node));
1669
btrfs_set_backup_fs_root_level(root_backup,
1670
btrfs_header_level(info->fs_root->node));
1671
}
1672
1673
btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1674
btrfs_set_backup_dev_root_gen(root_backup,
1675
btrfs_header_generation(info->dev_root->node));
1676
btrfs_set_backup_dev_root_level(root_backup,
1677
btrfs_header_level(info->dev_root->node));
1678
1679
btrfs_set_backup_total_bytes(root_backup,
1680
btrfs_super_total_bytes(info->super_copy));
1681
btrfs_set_backup_bytes_used(root_backup,
1682
btrfs_super_bytes_used(info->super_copy));
1683
btrfs_set_backup_num_devices(root_backup,
1684
btrfs_super_num_devices(info->super_copy));
1685
1686
/*
1687
* if we don't copy this out to the super_copy, it won't get remembered
1688
* for the next commit
1689
*/
1690
memcpy(&info->super_copy->super_roots,
1691
&info->super_for_commit->super_roots,
1692
sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1693
}
1694
1695
/*
1696
* Reads a backup root based on the passed priority. Prio 0 is the newest, prio
1697
* 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
1698
*
1699
* @fs_info: filesystem whose backup roots need to be read
1700
* @priority: priority of backup root required
1701
*
1702
* Returns backup root index on success and -EINVAL otherwise.
1703
*/
1704
static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
1705
{
1706
int backup_index = find_newest_super_backup(fs_info);
1707
struct btrfs_super_block *super = fs_info->super_copy;
1708
struct btrfs_root_backup *root_backup;
1709
1710
if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
1711
if (priority == 0)
1712
return backup_index;
1713
1714
backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
1715
backup_index %= BTRFS_NUM_BACKUP_ROOTS;
1716
} else {
1717
return -EINVAL;
1718
}
1719
1720
root_backup = super->super_roots + backup_index;
1721
1722
btrfs_set_super_generation(super,
1723
btrfs_backup_tree_root_gen(root_backup));
1724
btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1725
btrfs_set_super_root_level(super,
1726
btrfs_backup_tree_root_level(root_backup));
1727
btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1728
1729
/*
1730
* Fixme: the total bytes and num_devices need to match or we should
1731
* need a fsck
1732
*/
1733
btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1734
btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1735
1736
return backup_index;
1737
}
1738
1739
/* helper to cleanup workers */
1740
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1741
{
1742
btrfs_destroy_workqueue(fs_info->fixup_workers);
1743
btrfs_destroy_workqueue(fs_info->delalloc_workers);
1744
btrfs_destroy_workqueue(fs_info->workers);
1745
if (fs_info->endio_workers)
1746
destroy_workqueue(fs_info->endio_workers);
1747
if (fs_info->rmw_workers)
1748
destroy_workqueue(fs_info->rmw_workers);
1749
btrfs_destroy_workqueue(fs_info->endio_write_workers);
1750
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
1751
btrfs_destroy_workqueue(fs_info->delayed_workers);
1752
btrfs_destroy_workqueue(fs_info->caching_workers);
1753
btrfs_destroy_workqueue(fs_info->flush_workers);
1754
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
1755
if (fs_info->discard_ctl.discard_workers)
1756
destroy_workqueue(fs_info->discard_ctl.discard_workers);
1757
/*
1758
* Now that all other work queues are destroyed, we can safely destroy
1759
* the queues used for metadata I/O, since tasks from those other work
1760
* queues can do metadata I/O operations.
1761
*/
1762
if (fs_info->endio_meta_workers)
1763
destroy_workqueue(fs_info->endio_meta_workers);
1764
}
1765
1766
static void free_root_extent_buffers(struct btrfs_root *root)
1767
{
1768
if (root) {
1769
free_extent_buffer(root->node);
1770
free_extent_buffer(root->commit_root);
1771
root->node = NULL;
1772
root->commit_root = NULL;
1773
}
1774
}
1775
1776
static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
1777
{
1778
struct btrfs_root *root, *tmp;
1779
1780
rbtree_postorder_for_each_entry_safe(root, tmp,
1781
&fs_info->global_root_tree,
1782
rb_node)
1783
free_root_extent_buffers(root);
1784
}
1785
1786
/* helper to cleanup tree roots */
1787
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
1788
{
1789
free_root_extent_buffers(info->tree_root);
1790
1791
free_global_root_pointers(info);
1792
free_root_extent_buffers(info->dev_root);
1793
free_root_extent_buffers(info->quota_root);
1794
free_root_extent_buffers(info->uuid_root);
1795
free_root_extent_buffers(info->fs_root);
1796
free_root_extent_buffers(info->data_reloc_root);
1797
free_root_extent_buffers(info->block_group_root);
1798
free_root_extent_buffers(info->stripe_root);
1799
if (free_chunk_root)
1800
free_root_extent_buffers(info->chunk_root);
1801
}
1802
1803
void btrfs_put_root(struct btrfs_root *root)
1804
{
1805
if (!root)
1806
return;
1807
1808
if (refcount_dec_and_test(&root->refs)) {
1809
if (WARN_ON(!xa_empty(&root->inodes)))
1810
xa_destroy(&root->inodes);
1811
if (WARN_ON(!xa_empty(&root->delayed_nodes)))
1812
xa_destroy(&root->delayed_nodes);
1813
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
1814
if (root->anon_dev)
1815
free_anon_bdev(root->anon_dev);
1816
free_root_extent_buffers(root);
1817
#ifdef CONFIG_BTRFS_DEBUG
1818
spin_lock(&root->fs_info->fs_roots_radix_lock);
1819
list_del_init(&root->leak_list);
1820
spin_unlock(&root->fs_info->fs_roots_radix_lock);
1821
#endif
1822
kfree(root);
1823
}
1824
}
1825
1826
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
1827
{
1828
int ret;
1829
struct btrfs_root *gang[8];
1830
int i;
1831
1832
while (!list_empty(&fs_info->dead_roots)) {
1833
gang[0] = list_first_entry(&fs_info->dead_roots,
1834
struct btrfs_root, root_list);
1835
list_del(&gang[0]->root_list);
1836
1837
if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
1838
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
1839
btrfs_put_root(gang[0]);
1840
}
1841
1842
while (1) {
1843
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1844
(void **)gang, 0,
1845
ARRAY_SIZE(gang));
1846
if (!ret)
1847
break;
1848
for (i = 0; i < ret; i++)
1849
btrfs_drop_and_free_fs_root(fs_info, gang[i]);
1850
}
1851
}
1852
1853
static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
1854
{
1855
mutex_init(&fs_info->scrub_lock);
1856
atomic_set(&fs_info->scrubs_running, 0);
1857
atomic_set(&fs_info->scrub_pause_req, 0);
1858
atomic_set(&fs_info->scrubs_paused, 0);
1859
atomic_set(&fs_info->scrub_cancel_req, 0);
1860
init_waitqueue_head(&fs_info->scrub_pause_wait);
1861
refcount_set(&fs_info->scrub_workers_refcnt, 0);
1862
}
1863
1864
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
1865
{
1866
spin_lock_init(&fs_info->balance_lock);
1867
mutex_init(&fs_info->balance_mutex);
1868
atomic_set(&fs_info->balance_pause_req, 0);
1869
atomic_set(&fs_info->balance_cancel_req, 0);
1870
fs_info->balance_ctl = NULL;
1871
init_waitqueue_head(&fs_info->balance_wait_q);
1872
atomic_set(&fs_info->reloc_cancel_req, 0);
1873
}
1874
1875
static int btrfs_init_btree_inode(struct super_block *sb)
1876
{
1877
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1878
unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
1879
fs_info->tree_root);
1880
struct inode *inode;
1881
1882
inode = new_inode(sb);
1883
if (!inode)
1884
return -ENOMEM;
1885
1886
btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID);
1887
set_nlink(inode, 1);
1888
/*
1889
* We set i_size on the btree inode to the maximum possible offset.
1890
* The real end of the address space is determined by all of
1891
* the devices in the system.
1892
*/
1893
inode->i_size = OFFSET_MAX;
1894
inode->i_mapping->a_ops = &btree_aops;
1895
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
1896
1897
btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
1898
IO_TREE_BTREE_INODE_IO);
1899
btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
1900
1901
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
1902
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
1903
__insert_inode_hash(inode, hash);
1904
set_bit(AS_KERNEL_FILE, &inode->i_mapping->flags);
1905
fs_info->btree_inode = inode;
1906
1907
return 0;
1908
}
1909
1910
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
1911
{
1912
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
1913
init_rwsem(&fs_info->dev_replace.rwsem);
1914
init_waitqueue_head(&fs_info->dev_replace.replace_wait);
1915
}
1916
1917
static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
1918
{
1919
spin_lock_init(&fs_info->qgroup_lock);
1920
mutex_init(&fs_info->qgroup_ioctl_lock);
1921
fs_info->qgroup_tree = RB_ROOT;
1922
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
1923
fs_info->qgroup_seq = 1;
1924
fs_info->qgroup_rescan_running = false;
1925
fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
1926
mutex_init(&fs_info->qgroup_rescan_lock);
1927
}
1928
1929
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
1930
{
1931
u32 max_active = fs_info->thread_pool_size;
1932
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
1933
unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU;
1934
1935
fs_info->workers =
1936
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
1937
1938
fs_info->delalloc_workers =
1939
btrfs_alloc_workqueue(fs_info, "delalloc",
1940
flags, max_active, 2);
1941
1942
fs_info->flush_workers =
1943
btrfs_alloc_workqueue(fs_info, "flush_delalloc",
1944
flags, max_active, 0);
1945
1946
fs_info->caching_workers =
1947
btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
1948
1949
fs_info->fixup_workers =
1950
btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
1951
1952
fs_info->endio_workers =
1953
alloc_workqueue("btrfs-endio", flags, max_active);
1954
fs_info->endio_meta_workers =
1955
alloc_workqueue("btrfs-endio-meta", flags, max_active);
1956
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
1957
fs_info->endio_write_workers =
1958
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
1959
max_active, 2);
1960
fs_info->endio_freespace_worker =
1961
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
1962
max_active, 0);
1963
fs_info->delayed_workers =
1964
btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
1965
max_active, 0);
1966
fs_info->qgroup_rescan_workers =
1967
btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
1968
ordered_flags);
1969
fs_info->discard_ctl.discard_workers =
1970
alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE);
1971
1972
if (!(fs_info->workers &&
1973
fs_info->delalloc_workers && fs_info->flush_workers &&
1974
fs_info->endio_workers && fs_info->endio_meta_workers &&
1975
fs_info->endio_write_workers &&
1976
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
1977
fs_info->caching_workers && fs_info->fixup_workers &&
1978
fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
1979
fs_info->discard_ctl.discard_workers)) {
1980
return -ENOMEM;
1981
}
1982
1983
return 0;
1984
}
1985
1986
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
1987
{
1988
struct crypto_shash *csum_shash;
1989
const char *csum_driver = btrfs_super_csum_driver(csum_type);
1990
1991
csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
1992
1993
if (IS_ERR(csum_shash)) {
1994
btrfs_err(fs_info, "error allocating %s hash for checksum",
1995
csum_driver);
1996
return PTR_ERR(csum_shash);
1997
}
1998
1999
fs_info->csum_shash = csum_shash;
2000
2001
/* Check if the checksum implementation is a fast accelerated one. */
2002
switch (csum_type) {
2003
case BTRFS_CSUM_TYPE_CRC32:
2004
if (crc32_optimizations() & CRC32C_OPTIMIZATION)
2005
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
2006
break;
2007
case BTRFS_CSUM_TYPE_XXHASH:
2008
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
2009
break;
2010
default:
2011
break;
2012
}
2013
2014
btrfs_info(fs_info, "using %s (%s) checksum algorithm",
2015
btrfs_super_csum_name(csum_type),
2016
crypto_shash_driver_name(csum_shash));
2017
return 0;
2018
}
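/*
 * Minimal usage sketch, not code used by this file: once fs_info->csum_shash
 * has been allocated above, a one-shot digest over an arbitrary buffer could
 * look roughly like the following (assuming crypto_shash_tfm_digest() is
 * available in the running kernel):
 *
 *	u8 result[BTRFS_CSUM_SIZE];
 *
 *	crypto_shash_tfm_digest(fs_info->csum_shash, data, len, result);
 *
 * where "data" and "len" are hypothetical placeholders for the buffer being
 * checksummed.
 */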
2019
2020
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2021
struct btrfs_fs_devices *fs_devices)
2022
{
2023
int ret;
2024
struct btrfs_tree_parent_check check = { 0 };
2025
struct btrfs_root *log_tree_root;
2026
struct btrfs_super_block *disk_super = fs_info->super_copy;
2027
u64 bytenr = btrfs_super_log_root(disk_super);
2028
int level = btrfs_super_log_root_level(disk_super);
2029
2030
if (unlikely(fs_devices->rw_devices == 0)) {
2031
btrfs_warn(fs_info, "log replay required on RO media");
2032
return -EIO;
2033
}
2034
2035
log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2036
GFP_KERNEL);
2037
if (!log_tree_root)
2038
return -ENOMEM;
2039
2040
check.level = level;
2041
check.transid = fs_info->generation + 1;
2042
check.owner_root = BTRFS_TREE_LOG_OBJECTID;
2043
log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
2044
if (IS_ERR(log_tree_root->node)) {
2045
btrfs_warn(fs_info, "failed to read log tree");
2046
ret = PTR_ERR(log_tree_root->node);
2047
log_tree_root->node = NULL;
2048
btrfs_put_root(log_tree_root);
2049
return ret;
2050
}
2051
if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) {
2052
btrfs_err(fs_info, "failed to read log tree");
2053
btrfs_put_root(log_tree_root);
2054
return -EIO;
2055
}
2056
2057
/* returns with log_tree_root freed on success */
2058
ret = btrfs_recover_log_trees(log_tree_root);
2059
btrfs_put_root(log_tree_root);
2060
if (ret) {
2061
btrfs_handle_fs_error(fs_info, ret,
2062
"Failed to recover log tree");
2063
return ret;
2064
}
2065
2066
if (sb_rdonly(fs_info->sb)) {
2067
ret = btrfs_commit_super(fs_info);
2068
if (ret)
2069
return ret;
2070
}
2071
2072
return 0;
2073
}
2074
2075
static int load_global_roots_objectid(struct btrfs_root *tree_root,
2076
struct btrfs_path *path, u64 objectid,
2077
const char *name)
2078
{
2079
struct btrfs_fs_info *fs_info = tree_root->fs_info;
2080
struct btrfs_root *root;
2081
u64 max_global_id = 0;
2082
int ret;
2083
struct btrfs_key key = {
2084
.objectid = objectid,
2085
.type = BTRFS_ROOT_ITEM_KEY,
2086
.offset = 0,
2087
};
2088
bool found = false;
2089
2090
/* If we have IGNOREDATACSUMS skip loading these roots. */
2091
if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
2092
btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2093
set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
2094
return 0;
2095
}
2096
2097
while (1) {
2098
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2099
if (ret < 0)
2100
break;
2101
2102
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2103
ret = btrfs_next_leaf(tree_root, path);
2104
if (ret) {
2105
if (ret > 0)
2106
ret = 0;
2107
break;
2108
}
2109
}
2110
ret = 0;
2111
2112
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2113
if (key.objectid != objectid)
2114
break;
2115
btrfs_release_path(path);
2116
2117
/*
2118
* Just worry about this for extent tree, it'll be the same for
2119
* everybody.
2120
*/
2121
if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2122
max_global_id = max(max_global_id, key.offset);
2123
2124
found = true;
2125
root = read_tree_root_path(tree_root, path, &key);
2126
if (IS_ERR(root)) {
2127
ret = PTR_ERR(root);
2128
break;
2129
}
2130
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2131
ret = btrfs_global_root_insert(root);
2132
if (ret) {
2133
btrfs_put_root(root);
2134
break;
2135
}
2136
key.offset++;
2137
}
2138
btrfs_release_path(path);
2139
2140
if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2141
fs_info->nr_global_roots = max_global_id + 1;
2142
2143
if (!found || ret) {
2144
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
2145
set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
2146
2147
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2148
ret = ret ? ret : -ENOENT;
2149
else
2150
ret = 0;
2151
btrfs_err(fs_info, "failed to load root %s", name);
2152
}
2153
return ret;
2154
}
2155
2156
static int load_global_roots(struct btrfs_root *tree_root)
2157
{
2158
BTRFS_PATH_AUTO_FREE(path);
2159
int ret;
2160
2161
path = btrfs_alloc_path();
2162
if (!path)
2163
return -ENOMEM;
2164
2165
ret = load_global_roots_objectid(tree_root, path,
2166
BTRFS_EXTENT_TREE_OBJECTID, "extent");
2167
if (ret)
2168
return ret;
2169
ret = load_global_roots_objectid(tree_root, path,
2170
BTRFS_CSUM_TREE_OBJECTID, "csum");
2171
if (ret)
2172
return ret;
2173
if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
2174
return ret;
2175
ret = load_global_roots_objectid(tree_root, path,
2176
BTRFS_FREE_SPACE_TREE_OBJECTID,
2177
"free space");
2178
2179
return ret;
2180
}
2181
2182
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2183
{
2184
struct btrfs_root *tree_root = fs_info->tree_root;
2185
struct btrfs_root *root;
2186
struct btrfs_key location;
2187
int ret;
2188
2189
ASSERT(fs_info->tree_root);
2190
2191
ret = load_global_roots(tree_root);
2192
if (ret)
2193
return ret;
2194
2195
location.type = BTRFS_ROOT_ITEM_KEY;
2196
location.offset = 0;
2197
2198
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
2199
location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
2200
root = btrfs_read_tree_root(tree_root, &location);
2201
if (IS_ERR(root)) {
2202
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2203
ret = PTR_ERR(root);
2204
goto out;
2205
}
2206
} else {
2207
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2208
fs_info->block_group_root = root;
2209
}
2210
}
2211
2212
location.objectid = BTRFS_DEV_TREE_OBJECTID;
2213
root = btrfs_read_tree_root(tree_root, &location);
2214
if (IS_ERR(root)) {
2215
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2216
ret = PTR_ERR(root);
2217
goto out;
2218
}
2219
} else {
2220
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2221
fs_info->dev_root = root;
2222
}
2223
/* Initialize fs_info for all devices in any case */
2224
ret = btrfs_init_devices_late(fs_info);
2225
if (ret)
2226
goto out;
2227
2228
/*
2229
* This tree can share blocks with some other fs tree during relocation,
2230
* so it needs a proper setup by btrfs_get_fs_root().
2231
*/
2232
root = btrfs_get_fs_root(tree_root->fs_info,
2233
BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2234
if (IS_ERR(root)) {
2235
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2236
location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
2237
ret = PTR_ERR(root);
2238
goto out;
2239
}
2240
} else {
2241
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2242
fs_info->data_reloc_root = root;
2243
}
2244
2245
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2246
root = btrfs_read_tree_root(tree_root, &location);
2247
if (!IS_ERR(root)) {
2248
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2249
fs_info->quota_root = root;
2250
}
2251
2252
location.objectid = BTRFS_UUID_TREE_OBJECTID;
2253
root = btrfs_read_tree_root(tree_root, &location);
2254
if (IS_ERR(root)) {
2255
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2256
ret = PTR_ERR(root);
2257
if (ret != -ENOENT)
2258
goto out;
2259
}
2260
} else {
2261
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2262
fs_info->uuid_root = root;
2263
}
2264
2265
if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
2266
location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
2267
root = btrfs_read_tree_root(tree_root, &location);
2268
if (IS_ERR(root)) {
2269
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2270
ret = PTR_ERR(root);
2271
goto out;
2272
}
2273
} else {
2274
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2275
fs_info->stripe_root = root;
2276
}
2277
}
2278
2279
return 0;
2280
out:
2281
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2282
location.objectid, ret);
2283
return ret;
2284
}
2285
2286
static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
2287
const struct btrfs_super_block *sb)
2288
{
2289
unsigned int cur = 0; /* Offset inside the sys chunk array */
2290
/*
2291
* At sb read time, fs_info is not fully initialized. Thus we have
2292
* to use super block sectorsize, which should have been validated.
2293
*/
2294
const u32 sectorsize = btrfs_super_sectorsize(sb);
2295
u32 sys_array_size = btrfs_super_sys_array_size(sb);
2296
2297
if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) {
2298
btrfs_err(fs_info, "system chunk array too big %u > %u",
2299
sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2300
return -EUCLEAN;
2301
}
2302
2303
while (cur < sys_array_size) {
2304
struct btrfs_disk_key *disk_key;
2305
struct btrfs_chunk *chunk;
2306
struct btrfs_key key;
2307
u64 type;
2308
u16 num_stripes;
2309
u32 len;
2310
int ret;
2311
2312
disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
2313
len = sizeof(*disk_key);
2314
2315
if (unlikely(cur + len > sys_array_size))
2316
goto short_read;
2317
cur += len;
2318
2319
btrfs_disk_key_to_cpu(&key, disk_key);
2320
if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) {
2321
btrfs_err(fs_info,
2322
"unexpected item type %u in sys_array at offset %u",
2323
key.type, cur);
2324
return -EUCLEAN;
2325
}
2326
chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
2327
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2328
if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size))
2329
goto short_read;
2330
type = btrfs_stack_chunk_type(chunk);
2331
if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) {
2332
btrfs_err(fs_info,
2333
"invalid chunk type %llu in sys_array at offset %u",
2334
type, cur);
2335
return -EUCLEAN;
2336
}
2337
ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset,
2338
sectorsize);
2339
if (ret < 0)
2340
return ret;
2341
cur += btrfs_chunk_item_size(num_stripes);
2342
}
2343
return 0;
2344
short_read:
2345
btrfs_err(fs_info,
2346
"super block sys chunk array short read, cur=%u sys_array_size=%u",
2347
cur, sys_array_size);
2348
return -EUCLEAN;
2349
}
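/*
 * Layout sketch for the array parsed above (illustrative, derived from the
 * loop itself): sys_chunk_array is a packed sequence of key/chunk pairs,
 *
 *	[struct btrfs_disk_key][struct btrfs_chunk + stripes]
 *	[struct btrfs_disk_key][struct btrfs_chunk + stripes] ...
 *
 * Each iteration advances cur by sizeof(struct btrfs_disk_key) and then by
 * btrfs_chunk_item_size(num_stripes), which covers the chunk item including
 * its embedded stripe entries.
 */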
2350
2351
/*
2352
* Real super block validation
2353
* NOTE: super csum type and incompat features will not be checked here.
2354
*
2355
* @sb: super block to check
2356
* @mirror_num: the super block number to check its bytenr:
2357
* 0 the primary (1st) sb
2358
* 1, 2 2nd and 3rd backup copy
2359
* -1 skip bytenr check
2360
*/
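/*
 * Illustration of the bytenr check below (the exact values are an assumption
 * here, btrfs_sb_offset() is the authoritative source): mirror 0 is expected
 * at 64KiB, and the two backup copies at roughly 64MiB and 256GiB, so a super
 * block read from the wrong offset is rejected.
 */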
2361
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
2362
const struct btrfs_super_block *sb, int mirror_num)
2363
{
2364
u64 nodesize = btrfs_super_nodesize(sb);
2365
u64 sectorsize = btrfs_super_sectorsize(sb);
2366
int ret = 0;
2367
const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS);
2368
2369
if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2370
btrfs_err(fs_info, "no valid FS found");
2371
ret = -EINVAL;
2372
}
2373
if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) {
2374
if (!ignore_flags) {
2375
btrfs_err(fs_info,
2376
"unrecognized or unsupported super flag 0x%llx",
2377
btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2378
ret = -EINVAL;
2379
} else {
2380
btrfs_info(fs_info,
2381
"unrecognized or unsupported super flags: 0x%llx, ignored",
2382
btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2383
}
2384
}
2385
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2386
btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2387
btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2388
ret = -EINVAL;
2389
}
2390
if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2391
btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2392
btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2393
ret = -EINVAL;
2394
}
2395
if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2396
btrfs_err(fs_info, "log_root level too big: %d >= %d",
2397
btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2398
ret = -EINVAL;
2399
}
2400
2401
/*
2402
* Check sectorsize and nodesize first, other checks will need them.
2403
* Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
2404
*/
2405
if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
2406
sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2407
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2408
ret = -EINVAL;
2409
}
2410
2411
if (!btrfs_supported_blocksize(sectorsize)) {
2412
btrfs_err(fs_info,
2413
"sectorsize %llu not yet supported for page size %lu",
2414
sectorsize, PAGE_SIZE);
2415
ret = -EINVAL;
2416
}
2417
2418
if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2419
nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2420
btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2421
ret = -EINVAL;
2422
}
2423
if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2424
btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2425
le32_to_cpu(sb->__unused_leafsize), nodesize);
2426
ret = -EINVAL;
2427
}
2428
2429
/* Root alignment check */
2430
if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2431
btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2432
btrfs_super_root(sb));
2433
ret = -EINVAL;
2434
}
2435
if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2436
btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2437
btrfs_super_chunk_root(sb));
2438
ret = -EINVAL;
2439
}
2440
if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2441
btrfs_warn(fs_info, "log_root block unaligned: %llu",
2442
btrfs_super_log_root(sb));
2443
ret = -EINVAL;
2444
}
2445
2446
if (!fs_info->fs_devices->temp_fsid &&
2447
memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
2448
btrfs_err(fs_info,
2449
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2450
sb->fsid, fs_info->fs_devices->fsid);
2451
ret = -EINVAL;
2452
}
2453
2454
if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
2455
BTRFS_FSID_SIZE) != 0) {
2456
btrfs_err(fs_info,
2457
"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2458
btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
2459
ret = -EINVAL;
2460
}
2461
2462
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2463
BTRFS_FSID_SIZE) != 0) {
2464
btrfs_err(fs_info,
2465
"dev_item UUID does not match metadata fsid: %pU != %pU",
2466
fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2467
ret = -EINVAL;
2468
}
2469
2470
/*
2471
* Artificial requirement for block-group-tree to force newer features
2472
* (free-space-tree, no-holes) so the test matrix is smaller.
2473
*/
2474
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
2475
(!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
2476
!btrfs_fs_incompat(fs_info, NO_HOLES))) {
2477
btrfs_err(fs_info,
2478
"block-group-tree feature requires free-space-tree and no-holes");
2479
ret = -EINVAL;
2480
}
2481
2482
/*
2483
* Hint to catch really bogus numbers, bit flips and the like; more exact
2484
* checks are done later.
2485
*/
2486
if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2487
btrfs_err(fs_info, "bytes_used is too small %llu",
2488
btrfs_super_bytes_used(sb));
2489
ret = -EINVAL;
2490
}
2491
if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2492
btrfs_err(fs_info, "invalid stripesize %u",
2493
btrfs_super_stripesize(sb));
2494
ret = -EINVAL;
2495
}
2496
if (btrfs_super_num_devices(sb) > (1UL << 31))
2497
btrfs_warn(fs_info, "suspicious number of devices: %llu",
2498
btrfs_super_num_devices(sb));
2499
if (btrfs_super_num_devices(sb) == 0) {
2500
btrfs_err(fs_info, "number of devices is 0");
2501
ret = -EINVAL;
2502
}
2503
2504
if (mirror_num >= 0 &&
2505
btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2506
btrfs_err(fs_info, "super offset mismatch %llu != %u",
2507
btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2508
ret = -EINVAL;
2509
}
2510
2511
if (ret)
2512
return ret;
2513
2514
ret = validate_sys_chunk_array(fs_info, sb);
2515
2516
/*
2517
* Obvious sys_chunk_array corruptions, it must hold at least one key
2518
* and one chunk
2519
*/
2520
if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2521
btrfs_err(fs_info, "system chunk array too big %u > %u",
2522
btrfs_super_sys_array_size(sb),
2523
BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2524
ret = -EINVAL;
2525
}
2526
if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2527
+ sizeof(struct btrfs_chunk)) {
2528
btrfs_err(fs_info, "system chunk array too small %u < %zu",
2529
btrfs_super_sys_array_size(sb),
2530
sizeof(struct btrfs_disk_key)
2531
+ sizeof(struct btrfs_chunk));
2532
ret = -EINVAL;
2533
}
2534
2535
/*
2536
* The generation is a global counter, we'll trust it more than the others
2537
* but it's still possible that it's the one that's wrong.
2538
*/
2539
if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2540
btrfs_warn(fs_info,
2541
"suspicious: generation < chunk_root_generation: %llu < %llu",
2542
btrfs_super_generation(sb),
2543
btrfs_super_chunk_root_generation(sb));
2544
if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2545
&& btrfs_super_cache_generation(sb) != (u64)-1)
2546
btrfs_warn(fs_info,
2547
"suspicious: generation < cache_generation: %llu < %llu",
2548
btrfs_super_generation(sb),
2549
btrfs_super_cache_generation(sb));
2550
2551
return ret;
2552
}
2553
2554
/*
2555
* Validation of super block at mount time.
2556
* Some checks already done early at mount time, like csum type and incompat
2557
* flags will be skipped.
2558
*/
2559
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2560
{
2561
return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
2562
}
2563
2564
/*
2565
* Validation of super block at write time.
2566
* Some checks like bytenr check will be skipped as their values will be
2567
* overwritten soon.
2568
* Extra checks like csum type and incompat flags will be done here.
2569
*/
2570
static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2571
struct btrfs_super_block *sb)
2572
{
2573
int ret;
2574
2575
ret = btrfs_validate_super(fs_info, sb, -1);
2576
if (ret < 0)
2577
goto out;
2578
if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) {
2579
ret = -EUCLEAN;
2580
btrfs_err(fs_info, "invalid csum type, has %u want %u",
2581
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2582
goto out;
2583
}
2584
if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) {
2585
ret = -EUCLEAN;
2586
btrfs_err(fs_info,
2587
"invalid incompat flags, has 0x%llx valid mask 0x%llx",
2588
btrfs_super_incompat_flags(sb),
2589
(unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2590
goto out;
2591
}
2592
out:
2593
if (ret < 0)
2594
btrfs_err(fs_info,
2595
"super block corruption detected before writing it to disk");
2596
return ret;
2597
}
2598
2599
static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
2600
{
2601
struct btrfs_tree_parent_check check = {
2602
.level = level,
2603
.transid = gen,
2604
.owner_root = btrfs_root_id(root)
2605
};
2606
int ret = 0;
2607
2608
root->node = read_tree_block(root->fs_info, bytenr, &check);
2609
if (IS_ERR(root->node)) {
2610
ret = PTR_ERR(root->node);
2611
root->node = NULL;
2612
return ret;
2613
}
2614
if (unlikely(!extent_buffer_uptodate(root->node))) {
2615
free_extent_buffer(root->node);
2616
root->node = NULL;
2617
return -EIO;
2618
}
2619
2620
btrfs_set_root_node(&root->root_item, root->node);
2621
root->commit_root = btrfs_root_node(root);
2622
btrfs_set_root_refs(&root->root_item, 1);
2623
return ret;
2624
}
2625
2626
static int load_important_roots(struct btrfs_fs_info *fs_info)
2627
{
2628
struct btrfs_super_block *sb = fs_info->super_copy;
2629
u64 gen, bytenr;
2630
int level, ret;
2631
2632
bytenr = btrfs_super_root(sb);
2633
gen = btrfs_super_generation(sb);
2634
level = btrfs_super_root_level(sb);
2635
ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
2636
if (ret) {
2637
btrfs_warn(fs_info, "couldn't read tree root");
2638
return ret;
2639
}
2640
return 0;
2641
}
2642
2643
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2644
{
2645
int backup_index = find_newest_super_backup(fs_info);
2646
struct btrfs_super_block *sb = fs_info->super_copy;
2647
struct btrfs_root *tree_root = fs_info->tree_root;
2648
bool handle_error = false;
2649
int ret = 0;
2650
int i;
2651
2652
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2653
if (handle_error) {
2654
if (!IS_ERR(tree_root->node))
2655
free_extent_buffer(tree_root->node);
2656
tree_root->node = NULL;
2657
2658
if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2659
break;
2660
2661
free_root_pointers(fs_info, 0);
2662
2663
/*
2664
* Don't use the log in recovery mode, it won't be
2665
* valid
2666
*/
2667
btrfs_set_super_log_root(sb, 0);
2668
2669
btrfs_warn(fs_info, "try to load backup roots slot %d", i);
2670
ret = read_backup_root(fs_info, i);
2671
backup_index = ret;
2672
if (ret < 0)
2673
return ret;
2674
}
2675
2676
ret = load_important_roots(fs_info);
2677
if (ret) {
2678
handle_error = true;
2679
continue;
2680
}
2681
2682
/*
2683
* No need to hold btrfs_root::objectid_mutex since the fs
2684
* hasn't been fully initialised and we are the only user
2685
*/
2686
ret = btrfs_init_root_free_objectid(tree_root);
2687
if (ret < 0) {
2688
handle_error = true;
2689
continue;
2690
}
2691
2692
ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
2693
2694
ret = btrfs_read_roots(fs_info);
2695
if (ret < 0) {
2696
handle_error = true;
2697
continue;
2698
}
2699
2700
/* All successful */
2701
fs_info->generation = btrfs_header_generation(tree_root->node);
2702
btrfs_set_last_trans_committed(fs_info, fs_info->generation);
2703
fs_info->last_reloc_trans = 0;
2704
2705
/* Always begin writing backup roots after the one being used */
2706
if (backup_index < 0) {
2707
fs_info->backup_root_index = 0;
2708
} else {
2709
fs_info->backup_root_index = backup_index + 1;
2710
fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
2711
}
2712
break;
2713
}
2714
2715
return ret;
2716
}
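/*
 * Worked example for the backup root rotation above (assuming
 * BTRFS_NUM_BACKUP_ROOTS == 4): if the newest valid backup was found in
 * slot 3, the next slot written is (3 + 1) % 4 == 0; if no valid backup was
 * found (backup_index < 0), writing starts at slot 0.
 */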
2717
2718
/*
2719
* Lockdep gets confused between our buffer_tree which requires IRQ locking because
2720
* we modify marks in the IRQ context, and our delayed inode xarray which doesn't
2721
* have these requirements. Use a class key so lockdep doesn't get them mixed up.
2722
*/
2723
static struct lock_class_key buffer_xa_class;
2724
2725
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
2726
{
2727
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2728
2729
/* Use the same flags as mapping->i_pages. */
2730
xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
2731
lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
2732
2733
INIT_LIST_HEAD(&fs_info->trans_list);
2734
INIT_LIST_HEAD(&fs_info->dead_roots);
2735
INIT_LIST_HEAD(&fs_info->delayed_iputs);
2736
INIT_LIST_HEAD(&fs_info->delalloc_roots);
2737
INIT_LIST_HEAD(&fs_info->caching_block_groups);
2738
spin_lock_init(&fs_info->delalloc_root_lock);
2739
spin_lock_init(&fs_info->trans_lock);
2740
spin_lock_init(&fs_info->fs_roots_radix_lock);
2741
spin_lock_init(&fs_info->delayed_iput_lock);
2742
spin_lock_init(&fs_info->defrag_inodes_lock);
2743
spin_lock_init(&fs_info->super_lock);
2744
spin_lock_init(&fs_info->unused_bgs_lock);
2745
spin_lock_init(&fs_info->treelog_bg_lock);
2746
spin_lock_init(&fs_info->zone_active_bgs_lock);
2747
spin_lock_init(&fs_info->relocation_bg_lock);
2748
rwlock_init(&fs_info->tree_mod_log_lock);
2749
rwlock_init(&fs_info->global_root_lock);
2750
mutex_init(&fs_info->unused_bg_unpin_mutex);
2751
mutex_init(&fs_info->reclaim_bgs_lock);
2752
mutex_init(&fs_info->reloc_mutex);
2753
mutex_init(&fs_info->delalloc_root_mutex);
2754
mutex_init(&fs_info->zoned_meta_io_lock);
2755
mutex_init(&fs_info->zoned_data_reloc_io_lock);
2756
seqlock_init(&fs_info->profiles_lock);
2757
2758
btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
2759
btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
2760
btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
2761
btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
2762
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep,
2763
BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2764
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
2765
BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2766
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
2767
BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2768
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
2769
BTRFS_LOCKDEP_TRANS_COMPLETED);
2770
2771
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2772
INIT_LIST_HEAD(&fs_info->space_info);
2773
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2774
INIT_LIST_HEAD(&fs_info->unused_bgs);
2775
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
2776
INIT_LIST_HEAD(&fs_info->zone_active_bgs);
2777
#ifdef CONFIG_BTRFS_DEBUG
2778
INIT_LIST_HEAD(&fs_info->allocated_roots);
2779
INIT_LIST_HEAD(&fs_info->allocated_ebs);
2780
spin_lock_init(&fs_info->eb_leak_lock);
2781
#endif
2782
fs_info->mapping_tree = RB_ROOT_CACHED;
2783
rwlock_init(&fs_info->mapping_tree_lock);
2784
btrfs_init_block_rsv(&fs_info->global_block_rsv,
2785
BTRFS_BLOCK_RSV_GLOBAL);
2786
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2787
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2788
btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
2789
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2790
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2791
BTRFS_BLOCK_RSV_DELOPS);
2792
btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
2793
BTRFS_BLOCK_RSV_DELREFS);
2794
2795
atomic_set(&fs_info->async_delalloc_pages, 0);
2796
atomic_set(&fs_info->defrag_running, 0);
2797
atomic_set(&fs_info->nr_delayed_iputs, 0);
2798
atomic64_set(&fs_info->tree_mod_seq, 0);
2799
fs_info->global_root_tree = RB_ROOT;
2800
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2801
fs_info->metadata_ratio = 0;
2802
fs_info->defrag_inodes = RB_ROOT;
2803
atomic64_set(&fs_info->free_chunk_space, 0);
2804
fs_info->tree_mod_log = RB_ROOT;
2805
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2806
btrfs_init_ref_verify(fs_info);
2807
2808
fs_info->thread_pool_size = min_t(unsigned long,
2809
num_online_cpus() + 2, 8);
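/* Illustrative values: 4 online CPUs give a pool size of 6, 16 CPUs are capped at 8. */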
2810
2811
INIT_LIST_HEAD(&fs_info->ordered_roots);
2812
spin_lock_init(&fs_info->ordered_root_lock);
2813
2814
btrfs_init_scrub(fs_info);
2815
btrfs_init_balance(fs_info);
2816
btrfs_init_async_reclaim_work(fs_info);
2817
btrfs_init_extent_map_shrinker_work(fs_info);
2818
2819
rwlock_init(&fs_info->block_group_cache_lock);
2820
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
2821
2822
btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents,
2823
IO_TREE_FS_EXCLUDED_EXTENTS);
2824
2825
mutex_init(&fs_info->ordered_operations_mutex);
2826
mutex_init(&fs_info->tree_log_mutex);
2827
mutex_init(&fs_info->chunk_mutex);
2828
mutex_init(&fs_info->transaction_kthread_mutex);
2829
mutex_init(&fs_info->cleaner_mutex);
2830
mutex_init(&fs_info->ro_block_group_mutex);
2831
init_rwsem(&fs_info->commit_root_sem);
2832
init_rwsem(&fs_info->cleanup_work_sem);
2833
init_rwsem(&fs_info->subvol_sem);
2834
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2835
2836
btrfs_init_dev_replace_locks(fs_info);
2837
btrfs_init_qgroup(fs_info);
2838
btrfs_discard_init(fs_info);
2839
2840
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2841
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2842
2843
init_waitqueue_head(&fs_info->transaction_throttle);
2844
init_waitqueue_head(&fs_info->transaction_wait);
2845
init_waitqueue_head(&fs_info->transaction_blocked_wait);
2846
init_waitqueue_head(&fs_info->async_submit_wait);
2847
init_waitqueue_head(&fs_info->delayed_iputs_wait);
2848
2849
/* Usable values until the real ones are cached from the superblock */
2850
fs_info->nodesize = 4096;
2851
fs_info->sectorsize = 4096;
2852
fs_info->sectorsize_bits = ilog2(4096);
2853
fs_info->stripesize = 4096;
2854
2855
/* Default compress algorithm when user does -o compress */
2856
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2857
2858
fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
2859
2860
spin_lock_init(&fs_info->swapfile_pins_lock);
2861
fs_info->swapfile_pins = RB_ROOT;
2862
2863
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
2864
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
2865
}
2866
2867
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
2868
{
2869
int ret;
2870
2871
fs_info->sb = sb;
2872
/* Temporary fixed values for block size until we read the superblock. */
2873
sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
2874
sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
2875
2876
ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
2877
if (ret)
2878
return ret;
2879
2880
ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL);
2881
if (ret)
2882
return ret;
2883
2884
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2885
if (ret)
2886
return ret;
2887
2888
ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL);
2889
if (ret)
2890
return ret;
2891
2892
fs_info->dirty_metadata_batch = PAGE_SIZE *
2893
(1 + ilog2(nr_cpu_ids));
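/*
 * Worked example (illustrative): with 4KiB pages and nr_cpu_ids == 8 this is
 * 4096 * (1 + ilog2(8)) = 4096 * 4 = 16KiB per batch.
 */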
2894
2895
ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2896
if (ret)
2897
return ret;
2898
2899
ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
2900
GFP_KERNEL);
2901
if (ret)
2902
return ret;
2903
2904
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2905
GFP_KERNEL);
2906
if (!fs_info->delayed_root)
2907
return -ENOMEM;
2908
btrfs_init_delayed_root(fs_info->delayed_root);
2909
2910
if (sb_rdonly(sb))
2911
set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
2912
if (btrfs_test_opt(fs_info, IGNOREMETACSUMS))
2913
set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state);
2914
2915
return btrfs_alloc_stripe_hash_table(fs_info);
2916
}
2917
2918
static int btrfs_uuid_rescan_kthread(void *data)
2919
{
2920
struct btrfs_fs_info *fs_info = data;
2921
int ret;
2922
2923
/*
2924
* 1st step is to iterate through the existing UUID tree and
2925
* to delete all entries that contain outdated data.
2926
* 2nd step is to add all missing entries to the UUID tree.
2927
*/
2928
ret = btrfs_uuid_tree_iterate(fs_info);
2929
if (ret < 0) {
2930
if (ret != -EINTR)
2931
btrfs_warn(fs_info, "iterating uuid_tree failed %d",
2932
ret);
2933
up(&fs_info->uuid_tree_rescan_sem);
2934
return ret;
2935
}
2936
return btrfs_uuid_scan_kthread(data);
2937
}
2938
2939
static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
2940
{
2941
struct task_struct *task;
2942
2943
down(&fs_info->uuid_tree_rescan_sem);
2944
task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
2945
if (IS_ERR(task)) {
2946
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
2947
btrfs_warn(fs_info, "failed to start uuid_rescan task");
2948
up(&fs_info->uuid_tree_rescan_sem);
2949
return PTR_ERR(task);
2950
}
2951
2952
return 0;
2953
}
2954
2955
static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2956
{
2957
u64 root_objectid = 0;
2958
struct btrfs_root *gang[8];
2959
int ret = 0;
2960
2961
while (1) {
2962
unsigned int found;
2963
2964
spin_lock(&fs_info->fs_roots_radix_lock);
2965
found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2966
(void **)gang, root_objectid,
2967
ARRAY_SIZE(gang));
2968
if (!found) {
2969
spin_unlock(&fs_info->fs_roots_radix_lock);
2970
break;
2971
}
2972
root_objectid = btrfs_root_id(gang[found - 1]) + 1;
2973
2974
for (int i = 0; i < found; i++) {
2975
/* Avoid grabbing roots in dead_roots. */
2976
if (btrfs_root_refs(&gang[i]->root_item) == 0) {
2977
gang[i] = NULL;
2978
continue;
2979
}
2980
/* Grab all the search results for later use. */
2981
gang[i] = btrfs_grab_root(gang[i]);
2982
}
2983
spin_unlock(&fs_info->fs_roots_radix_lock);
2984
2985
for (int i = 0; i < found; i++) {
2986
if (!gang[i])
2987
continue;
2988
root_objectid = btrfs_root_id(gang[i]);
2989
/*
2990
* Continue to release the remaining roots after the first
2991
* error without cleanup and preserve the first error
2992
* for the return.
2993
*/
2994
if (!ret)
2995
ret = btrfs_orphan_cleanup(gang[i]);
2996
btrfs_put_root(gang[i]);
2997
}
2998
if (ret)
2999
break;
3000
3001
root_objectid++;
3002
}
3003
return ret;
3004
}
3005
3006
/*
3007
* Mounting logic specific to read-write file systems. Shared by open_ctree
3008
* and btrfs_remount when remounting from read-only to read-write.
3009
*/
3010
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
3011
{
3012
int ret;
3013
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
3014
bool rebuild_free_space_tree = false;
3015
3016
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3017
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3018
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
3019
btrfs_warn(fs_info,
3020
"'clear_cache' option is ignored with extent tree v2");
3021
else
3022
rebuild_free_space_tree = true;
3023
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3024
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3025
btrfs_warn(fs_info, "free space tree is invalid");
3026
rebuild_free_space_tree = true;
3027
}
3028
3029
if (rebuild_free_space_tree) {
3030
btrfs_info(fs_info, "rebuilding free space tree");
3031
ret = btrfs_rebuild_free_space_tree(fs_info);
3032
if (ret) {
3033
btrfs_warn(fs_info,
3034
"failed to rebuild free space tree: %d", ret);
3035
goto out;
3036
}
3037
}
3038
3039
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3040
!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
3041
btrfs_info(fs_info, "disabling free space tree");
3042
ret = btrfs_delete_free_space_tree(fs_info);
3043
if (ret) {
3044
btrfs_warn(fs_info,
3045
"failed to disable free space tree: %d", ret);
3046
goto out;
3047
}
3048
}
3049
3050
/*
3051
* btrfs_find_orphan_roots() is responsible for finding all the dead
3052
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
3053
* them into the fs_info->fs_roots_radix tree. This must be done before
3054
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
3055
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
3056
* item before the root's tree is deleted - this means that if we unmount
3057
* or crash before the deletion completes, on the next mount we will not
3058
* delete what remains of the tree because the orphan item does not
3059
* exist anymore, which is what tells us we have a pending deletion.
3060
*/
3061
ret = btrfs_find_orphan_roots(fs_info);
3062
if (ret)
3063
goto out;
3064
3065
ret = btrfs_cleanup_fs_roots(fs_info);
3066
if (ret)
3067
goto out;
3068
3069
down_read(&fs_info->cleanup_work_sem);
3070
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3071
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3072
up_read(&fs_info->cleanup_work_sem);
3073
goto out;
3074
}
3075
up_read(&fs_info->cleanup_work_sem);
3076
3077
mutex_lock(&fs_info->cleaner_mutex);
3078
ret = btrfs_recover_relocation(fs_info);
3079
mutex_unlock(&fs_info->cleaner_mutex);
3080
if (ret < 0) {
3081
btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
3082
goto out;
3083
}
3084
3085
if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3086
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3087
btrfs_info(fs_info, "creating free space tree");
3088
ret = btrfs_create_free_space_tree(fs_info);
3089
if (ret) {
3090
btrfs_warn(fs_info,
3091
"failed to create free space tree: %d", ret);
3092
goto out;
3093
}
3094
}
3095
3096
if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
3097
ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
3098
if (ret)
3099
goto out;
3100
}
3101
3102
ret = btrfs_resume_balance_async(fs_info);
3103
if (ret)
3104
goto out;
3105
3106
ret = btrfs_resume_dev_replace_async(fs_info);
3107
if (ret) {
3108
btrfs_warn(fs_info, "failed to resume dev_replace");
3109
goto out;
3110
}
3111
3112
btrfs_qgroup_rescan_resume(fs_info);
3113
3114
if (!fs_info->uuid_root) {
3115
btrfs_info(fs_info, "creating UUID tree");
3116
ret = btrfs_create_uuid_tree(fs_info);
3117
if (ret) {
3118
btrfs_warn(fs_info,
3119
"failed to create the UUID tree %d", ret);
3120
goto out;
3121
}
3122
}
3123
3124
out:
3125
return ret;
3126
}
3127
3128
/*
3129
* Do various sanity and dependency checks of different features.
3130
*
3131
* @is_rw_mount: If the mount is read-write.
3132
*
3133
* This is the place for less strict checks (like for subpage or artificial
3134
* feature dependencies).
3135
*
3136
* For strict checks or possible corruption detection, see
3137
* btrfs_validate_super().
3138
*
3139
* This should be called after btrfs_parse_options(), as some mount options
3140
* (space cache related) can modify on-disk format like free space tree and
3141
* screw up certain feature dependencies.
3142
*/
3143
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
3144
{
3145
struct btrfs_super_block *disk_super = fs_info->super_copy;
3146
u64 incompat = btrfs_super_incompat_flags(disk_super);
3147
const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
3148
const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
3149
3150
if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
3151
btrfs_err(fs_info,
3152
"cannot mount because of unknown incompat features (0x%llx)",
3153
incompat);
3154
return -EINVAL;
3155
}
3156
3157
/* Runtime limitation for mixed block groups. */
3158
if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3159
(fs_info->sectorsize != fs_info->nodesize)) {
3160
btrfs_err(fs_info,
3161
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3162
fs_info->nodesize, fs_info->sectorsize);
3163
return -EINVAL;
3164
}
3165
3166
/* Mixed backref is an always-enabled feature. */
3167
incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3168
3169
/* Set compression related flags just in case. */
3170
if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3171
incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3172
else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3173
incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3174
3175
/*
3176
* An ancient flag, which should really be marked deprecated.
3177
* Such a runtime limitation doesn't really need an incompat flag.
3178
*/
3179
if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
3180
incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3181
3182
if (compat_ro_unsupp && is_rw_mount) {
3183
btrfs_err(fs_info,
3184
"cannot mount read-write because of unknown compat_ro features (0x%llx)",
3185
compat_ro);
3186
return -EINVAL;
3187
}
3188
3189
/*
3190
* We have unsupported RO compat features; even though we are mounted RO,
3191
* we must not cause any metadata writes, including log replay, or we
3192
* could screw up whatever the new feature requires.
3193
*/
3194
if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
3195
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3196
btrfs_err(fs_info,
3197
"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
3198
compat_ro);
3199
return -EINVAL;
3200
}
3201
3202
/*
3203
* Artificial limitations for block group tree, to force
3204
* block-group-tree to rely on no-holes and free-space-tree.
3205
*/
3206
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
3207
(!btrfs_fs_incompat(fs_info, NO_HOLES) ||
3208
!btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
3209
btrfs_err(fs_info,
3210
"block-group-tree feature requires no-holes and free-space-tree features");
3211
return -EINVAL;
3212
}
3213
3214
/*
3215
* Subpage/bs > ps runtime limitation on v1 cache.
3216
*
3217
* V1 space cache still has some hard coded PAGE_SIZE usage, while
3218
* we're already defaulting to v2 cache, no need to bother v1 as it's
3219
* going to be deprecated anyway.
3220
*/
3221
if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
3222
btrfs_warn(fs_info,
3223
"v1 space cache is not supported for page size %lu with sectorsize %u",
3224
PAGE_SIZE, fs_info->sectorsize);
3225
return -EINVAL;
3226
}
3227
3228
/* This can be called by remount, we need to protect the super block. */
3229
spin_lock(&fs_info->super_lock);
3230
btrfs_set_super_incompat_flags(disk_super, incompat);
3231
spin_unlock(&fs_info->super_lock);
3232
3233
return 0;
3234
}
3235
3236
static bool fs_is_full_ro(const struct btrfs_fs_info *fs_info)
3237
{
3238
if (!sb_rdonly(fs_info->sb))
3239
return false;
3240
if (unlikely(fs_info->mount_opt & BTRFS_MOUNT_FULL_RO_MASK))
3241
return true;
3242
return false;
3243
}
3244
3245
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
3246
{
3247
u32 sectorsize;
3248
u32 nodesize;
3249
u32 stripesize;
3250
u64 generation;
3251
u16 csum_type;
3252
struct btrfs_super_block *disk_super;
3253
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3254
struct btrfs_root *tree_root;
3255
struct btrfs_root *chunk_root;
3256
int ret;
3257
int level;
3258
3259
ret = init_mount_fs_info(fs_info, sb);
3260
if (ret)
3261
goto fail;
3262
3263
/* These need to be init'ed before we start creating inodes and such. */
3264
tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3265
GFP_KERNEL);
3266
fs_info->tree_root = tree_root;
3267
chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3268
GFP_KERNEL);
3269
fs_info->chunk_root = chunk_root;
3270
if (!tree_root || !chunk_root) {
3271
ret = -ENOMEM;
3272
goto fail;
3273
}
3274
3275
ret = btrfs_init_btree_inode(sb);
3276
if (ret)
3277
goto fail;
3278
3279
invalidate_bdev(fs_devices->latest_dev->bdev);
3280
3281
/*
3282
* Read super block and check the signature bytes only
3283
*/
3284
disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false);
3285
if (IS_ERR(disk_super)) {
3286
ret = PTR_ERR(disk_super);
3287
goto fail_alloc;
3288
}
3289
3290
btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
3291
/*
3292
* Verify the checksum type first; if that or the checksum value is
3293
* corrupted, we'll find out.
3294
*/
3295
csum_type = btrfs_super_csum_type(disk_super);
3296
if (!btrfs_supported_super_csum(csum_type)) {
3297
btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3298
csum_type);
3299
ret = -EINVAL;
3300
btrfs_release_disk_super(disk_super);
3301
goto fail_alloc;
3302
}
3303
3304
fs_info->csum_size = btrfs_super_csum_size(disk_super);
3305
3306
ret = btrfs_init_csum_hash(fs_info, csum_type);
3307
if (ret) {
3308
btrfs_release_disk_super(disk_super);
3309
goto fail_alloc;
3310
}
3311
3312
/*
3313
* We want to check superblock checksum, the type is stored inside.
3314
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3315
*/
3316
if (btrfs_check_super_csum(fs_info, disk_super)) {
3317
btrfs_err(fs_info, "superblock checksum mismatch");
3318
ret = -EINVAL;
3319
btrfs_release_disk_super(disk_super);
3320
goto fail_alloc;
3321
}
3322
3323
/*
3324
* super_copy is zeroed at allocation time and we never touch the
3325
* following bytes up to INFO_SIZE, the checksum is calculated from
3326
* the whole block of INFO_SIZE
3327
*/
3328
memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3329
btrfs_release_disk_super(disk_super);
3330
3331
disk_super = fs_info->super_copy;
3332
3333
memcpy(fs_info->super_for_commit, fs_info->super_copy,
3334
sizeof(*fs_info->super_for_commit));
3335
3336
ret = btrfs_validate_mount_super(fs_info);
3337
if (ret) {
3338
btrfs_err(fs_info, "superblock contains fatal errors");
3339
ret = -EINVAL;
3340
goto fail_alloc;
3341
}
3342
3343
if (!btrfs_super_root(disk_super)) {
3344
btrfs_err(fs_info, "invalid superblock tree root bytenr");
3345
ret = -EINVAL;
3346
goto fail_alloc;
3347
}
3348
3349
/* check FS state, whether FS is broken. */
3350
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3351
WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
3352
3353
/* If the fs has any rescue options, no transaction is allowed. */
3354
if (fs_is_full_ro(fs_info))
3355
WRITE_ONCE(fs_info->fs_error, -EROFS);
3356
3357
/* Set up fs_info before parsing mount options */
3358
nodesize = btrfs_super_nodesize(disk_super);
3359
sectorsize = btrfs_super_sectorsize(disk_super);
3360
stripesize = sectorsize;
3361
fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3362
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3363
3364
fs_info->nodesize = nodesize;
3365
fs_info->nodesize_bits = ilog2(nodesize);
3366
fs_info->sectorsize = sectorsize;
3367
fs_info->sectorsize_bits = ilog2(sectorsize);
3368
fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
3369
fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
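/*
 * Worked example (illustrative, 64-bit kernel, 4KiB pages, 4KiB sectors):
 * block_min_order = ilog2(4096 >> PAGE_SHIFT) = 0 and
 * block_max_order = ilog2((64 << 12) >> 12) = ilog2(64) = 6.
 */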
3370
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3371
fs_info->stripesize = stripesize;
3372
fs_info->fs_devices->fs_info = fs_info;
3373
3374
if (fs_info->sectorsize > PAGE_SIZE)
3375
btrfs_warn(fs_info,
3376
"support for block size %u with page size %lu is experimental, some features may be missing",
3377
fs_info->sectorsize, PAGE_SIZE);
3378
/*
3379
* Handle the space caching options appropriately now that we have the
3380
* super block loaded and validated.
3381
*/
3382
btrfs_set_free_space_cache_settings(fs_info);
3383
3384
if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
3385
ret = -EINVAL;
3386
goto fail_alloc;
3387
}
3388
3389
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
3390
if (ret < 0)
3391
goto fail_alloc;
3392
3393
/*
3394
* At this point our mount options are validated, if we set ->max_inline
3395
* to something non-standard make sure we truncate it to sectorsize.
3396
*/
3397
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
3398
3399
ret = btrfs_alloc_compress_wsm(fs_info);
3400
if (ret)
3401
goto fail_sb_buffer;
3402
ret = btrfs_init_workqueues(fs_info);
3403
if (ret)
3404
goto fail_sb_buffer;
3405
3406
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3407
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
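/*
 * Illustrative numbers: with 4KiB pages the floor applied here is
 * SZ_4M / 4096 = 1024 pages of readahead, after the per-device scaling above.
 */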
3408
3409
/* Update the values for the current filesystem. */
3410
sb->s_blocksize = sectorsize;
3411
sb->s_blocksize_bits = blksize_bits(sectorsize);
3412
memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3413
3414
mutex_lock(&fs_info->chunk_mutex);
3415
ret = btrfs_read_sys_array(fs_info);
3416
mutex_unlock(&fs_info->chunk_mutex);
3417
if (ret) {
3418
btrfs_err(fs_info, "failed to read the system array: %d", ret);
3419
goto fail_sb_buffer;
3420
}
3421
3422
generation = btrfs_super_chunk_root_generation(disk_super);
3423
level = btrfs_super_chunk_root_level(disk_super);
3424
ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
3425
generation, level);
3426
if (ret) {
3427
btrfs_err(fs_info, "failed to read chunk root");
3428
goto fail_tree_roots;
3429
}
3430
3431
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3432
offsetof(struct btrfs_header, chunk_tree_uuid),
3433
BTRFS_UUID_SIZE);
3434
3435
ret = btrfs_read_chunk_tree(fs_info);
3436
if (ret) {
3437
btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3438
goto fail_tree_roots;
3439
}
3440
3441
/*
3442
* At this point we know all the devices that make this filesystem,
3443
* including the seed devices but we don't know yet if the replace
3444
* target is required. So free devices that are not part of this
3445
* filesystem but skip the replace target device which is checked
3446
* below in btrfs_init_dev_replace().
3447
*/
3448
btrfs_free_extra_devids(fs_devices);
3449
if (unlikely(!fs_devices->latest_dev->bdev)) {
3450
btrfs_err(fs_info, "failed to read devices");
3451
ret = -EIO;
3452
goto fail_tree_roots;
3453
}
3454
3455
ret = init_tree_roots(fs_info);
3456
if (ret)
3457
goto fail_tree_roots;
3458
3459
/*
3460
* Get zone type information of zoned block devices. This will also
3461
* handle emulation of a zoned filesystem if a regular device has the
3462
* zoned incompat feature flag set.
3463
*/
3464
ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3465
if (ret) {
3466
btrfs_err(fs_info,
3467
"zoned: failed to read device zone info: %d", ret);
3468
goto fail_block_groups;
3469
}
3470
3471
/*
3472
* If we have a uuid root and we're not being told to rescan we need to
3473
* check the generation here so we can set the
3474
* BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
3475
* transaction during a balance or the log replay without updating the
3476
* uuid generation, and then if we crash we would rescan the uuid tree,
3477
* even though it was perfectly fine.
3478
*/
3479
if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3480
fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3481
set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3482
3483
if (unlikely(btrfs_verify_dev_items(fs_info))) {
3484
ret = -EUCLEAN;
3485
goto fail_block_groups;
3486
}
3487
ret = btrfs_verify_dev_extents(fs_info);
3488
if (ret) {
3489
btrfs_err(fs_info,
3490
"failed to verify dev extents against chunks: %d",
3491
ret);
3492
goto fail_block_groups;
3493
}
3494
ret = btrfs_recover_balance(fs_info);
3495
if (ret) {
3496
btrfs_err(fs_info, "failed to recover balance: %d", ret);
3497
goto fail_block_groups;
3498
}
3499
3500
ret = btrfs_init_dev_stats(fs_info);
3501
if (ret) {
3502
btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3503
goto fail_block_groups;
3504
}
3505
3506
ret = btrfs_init_dev_replace(fs_info);
3507
if (ret) {
3508
btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3509
goto fail_block_groups;
3510
}
3511
3512
ret = btrfs_check_zoned_mode(fs_info);
3513
if (ret) {
3514
btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3515
ret);
3516
goto fail_block_groups;
3517
}
3518
3519
ret = btrfs_sysfs_add_fsid(fs_devices);
3520
if (ret) {
3521
btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3522
ret);
3523
goto fail_block_groups;
3524
}
3525
3526
ret = btrfs_sysfs_add_mounted(fs_info);
3527
if (ret) {
3528
btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3529
goto fail_fsdev_sysfs;
3530
}
3531
3532
ret = btrfs_init_space_info(fs_info);
3533
if (ret) {
3534
btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3535
goto fail_sysfs;
3536
}
3537
3538
ret = btrfs_read_block_groups(fs_info);
3539
if (ret) {
3540
btrfs_err(fs_info, "failed to read block groups: %d", ret);
3541
goto fail_sysfs;
3542
}
3543
3544
btrfs_zoned_reserve_data_reloc_bg(fs_info);
3545
btrfs_free_zone_cache(fs_info);
3546
3547
btrfs_check_active_zone_reservation(fs_info);
3548
3549
if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
3550
!btrfs_check_rw_degradable(fs_info, NULL)) {
3551
btrfs_warn(fs_info,
3552
"writable mount is not allowed due to too many missing devices");
3553
ret = -EINVAL;
3554
goto fail_sysfs;
3555
}
3556
3557
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
3558
"btrfs-cleaner");
3559
if (IS_ERR(fs_info->cleaner_kthread)) {
3560
ret = PTR_ERR(fs_info->cleaner_kthread);
3561
goto fail_sysfs;
3562
}
3563
3564
fs_info->transaction_kthread = kthread_run(transaction_kthread,
3565
tree_root,
3566
"btrfs-transaction");
3567
if (IS_ERR(fs_info->transaction_kthread)) {
3568
ret = PTR_ERR(fs_info->transaction_kthread);
3569
goto fail_cleaner;
3570
}
3571
3572
ret = btrfs_read_qgroup_config(fs_info);
3573
if (ret)
3574
goto fail_trans_kthread;
3575
3576
if (btrfs_build_ref_tree(fs_info))
3577
btrfs_err(fs_info, "couldn't build ref tree");
3578
3579
/* do not make disk changes in broken FS or nologreplay is given */
3580
if (btrfs_super_log_root(disk_super) != 0 &&
3581
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3582
btrfs_info(fs_info, "start tree-log replay");
3583
ret = btrfs_replay_log(fs_info, fs_devices);
3584
if (ret)
3585
goto fail_qgroup;
3586
}
3587
3588
fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3589
if (IS_ERR(fs_info->fs_root)) {
3590
ret = PTR_ERR(fs_info->fs_root);
3591
btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
3592
fs_info->fs_root = NULL;
3593
goto fail_qgroup;
3594
}
3595
3596
if (sb_rdonly(sb))
3597
return 0;
3598
3599
ret = btrfs_start_pre_rw_mount(fs_info);
3600
if (ret) {
3601
close_ctree(fs_info);
3602
return ret;
3603
}
3604
btrfs_discard_resume(fs_info);
3605
3606
if (fs_info->uuid_root &&
3607
(btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3608
fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
3609
btrfs_info(fs_info, "checking UUID tree");
3610
ret = btrfs_check_uuid_tree(fs_info);
3611
if (ret) {
3612
btrfs_warn(fs_info,
3613
"failed to check the UUID tree: %d", ret);
3614
close_ctree(fs_info);
3615
return ret;
3616
}
3617
}
3618
3619
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3620
3621
/* Kick the cleaner thread so it'll start deleting snapshots. */
3622
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3623
wake_up_process(fs_info->cleaner_kthread);
3624
3625
return 0;
3626
3627
fail_qgroup:
3628
btrfs_free_qgroup_config(fs_info);
3629
fail_trans_kthread:
3630
kthread_stop(fs_info->transaction_kthread);
3631
btrfs_cleanup_transaction(fs_info);
3632
btrfs_free_fs_roots(fs_info);
3633
fail_cleaner:
3634
kthread_stop(fs_info->cleaner_kthread);
3635
3636
/*
3637
* make sure we're done with the btree inode before we stop our
3638
* kthreads
3639
*/
3640
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3641
3642
fail_sysfs:
3643
btrfs_sysfs_remove_mounted(fs_info);
3644
3645
fail_fsdev_sysfs:
3646
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3647
3648
fail_block_groups:
3649
btrfs_put_block_group_cache(fs_info);
3650
3651
fail_tree_roots:
3652
if (fs_info->data_reloc_root)
3653
btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3654
free_root_pointers(fs_info, true);
3655
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3656
3657
fail_sb_buffer:
3658
btrfs_stop_all_workers(fs_info);
3659
btrfs_free_block_groups(fs_info);
3660
fail_alloc:
3661
btrfs_mapping_tree_free(fs_info);
3662
3663
iput(fs_info->btree_inode);
3664
fail:
3665
ASSERT(ret < 0);
3666
return ret;
3667
}
3668
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3669
3670
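/*
 * Endio handler for super block writes submitted by write_dev_supers().
 * On error it bumps the per-device sb_write_errors counter (weighted so a
 * failed primary copy alone is enough to fail the whole write), then
 * unlocks and releases each folio used for the write.
 */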
static void btrfs_end_super_write(struct bio *bio)
3671
{
3672
struct btrfs_device *device = bio->bi_private;
3673
struct folio_iter fi;
3674
3675
bio_for_each_folio_all(fi, bio) {
3676
if (bio->bi_status) {
3677
btrfs_warn_rl(device->fs_info,
3678
"lost super block write due to IO error on %s (%d)",
3679
btrfs_dev_name(device),
3680
blk_status_to_errno(bio->bi_status));
3681
btrfs_dev_stat_inc_and_print(device,
3682
BTRFS_DEV_STAT_WRITE_ERRS);
3683
/* Ensure failure if the primary sb fails. */
3684
if (bio->bi_opf & REQ_FUA)
3685
atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
3686
&device->sb_write_errors);
3687
else
3688
atomic_inc(&device->sb_write_errors);
3689
}
3690
folio_unlock(fi.folio);
3691
folio_put(fi.folio);
3692
}
3693
3694
bio_put(bio);
3695
}
3696
3697
/*
3698
* Write superblock @sb to the @device. Do not wait for completion, all the
3699
* folios we use for writing are locked.
3700
*
3701
* Write @max_mirrors copies of the superblock, where 0 means the default:
3702
* all copies that fit within the expected device size at commit time. Note
3703
* that max_mirrors must be the same for the write and wait phases.
3704
*
3705
* Return number of errors when folio is not found or submission fails.
3706
*/
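/*
 * The super block copies live at fixed offsets returned by btrfs_sb_offset()
 * (the primary copy at 64K); on zoned devices btrfs_sb_log_location() maps
 * each copy to the location it should actually be written to.
 */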
3707
static int write_dev_supers(struct btrfs_device *device,
3708
struct btrfs_super_block *sb, int max_mirrors)
3709
{
3710
struct btrfs_fs_info *fs_info = device->fs_info;
3711
struct address_space *mapping = device->bdev->bd_mapping;
3712
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3713
int i;
3714
int ret;
3715
u64 bytenr, bytenr_orig;
3716
3717
atomic_set(&device->sb_write_errors, 0);
3718
3719
if (max_mirrors == 0)
3720
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3721
3722
shash->tfm = fs_info->csum_shash;
3723
3724
for (i = 0; i < max_mirrors; i++) {
3725
struct folio *folio;
3726
struct bio *bio;
3727
struct btrfs_super_block *disk_super;
3728
size_t offset;
3729
3730
bytenr_orig = btrfs_sb_offset(i);
3731
ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
3732
if (ret == -ENOENT) {
3733
continue;
3734
} else if (ret < 0) {
3735
btrfs_err(device->fs_info,
3736
"couldn't get super block location for mirror %d error %d",
3737
i, ret);
3738
atomic_inc(&device->sb_write_errors);
3739
continue;
3740
}
3741
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3742
device->commit_total_bytes)
3743
break;
3744
3745
btrfs_set_super_bytenr(sb, bytenr_orig);
3746
3747
crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
3748
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
3749
sb->csum);
3750
3751
folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
3752
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
3753
GFP_NOFS);
3754
if (IS_ERR(folio)) {
3755
btrfs_err(device->fs_info,
3756
"couldn't get super block page for bytenr %llu error %ld",
3757
bytenr, PTR_ERR(folio));
3758
atomic_inc(&device->sb_write_errors);
3759
continue;
3760
}
3761
3762
offset = offset_in_folio(folio, bytenr);
3763
disk_super = folio_address(folio) + offset;
3764
memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
3765
3766
/*
3767
* Directly use bios here instead of relying on the page cache
3768
* to do I/O, so we don't lose the ability to do integrity
3769
* checking.
3770
*/
3771
bio = bio_alloc(device->bdev, 1,
3772
REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
3773
GFP_NOFS);
3774
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
3775
bio->bi_private = device;
3776
bio->bi_end_io = btrfs_end_super_write;
3777
bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);
3778
3779
/*
3780
* We FUA only the first super block. The others we allow to
3781
* go down lazy and there's a short window where the on-disk
3782
* copies might still contain the older version.
3783
*/
3784
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
3785
bio->bi_opf |= REQ_FUA;
3786
submit_bio(bio);
3787
3788
if (btrfs_advance_sb_log(device, i))
3789
atomic_inc(&device->sb_write_errors);
3790
}
3791
return atomic_read(&device->sb_write_errors) < i ? 0 : -1;
3792
}
3793
3794
/*
3795
* Wait for write completion of superblocks done by write_dev_supers,
3796
* @max_mirrors same for write and wait phases.
3797
*
3798
* Return -1 if primary super block write failed or when there were no super block
3799
* copies written. Otherwise 0.
3800
*/
3801
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
3802
{
3803
int i;
3804
int errors = 0;
3805
bool primary_failed = false;
3806
int ret;
3807
u64 bytenr;
3808
3809
if (max_mirrors == 0)
3810
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3811
3812
for (i = 0; i < max_mirrors; i++) {
3813
struct folio *folio;
3814
3815
ret = btrfs_sb_log_location(device, i, READ, &bytenr);
3816
if (ret == -ENOENT) {
3817
break;
3818
} else if (ret < 0) {
3819
errors++;
3820
if (i == 0)
3821
primary_failed = true;
3822
continue;
3823
}
3824
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3825
device->commit_total_bytes)
3826
break;
3827
3828
folio = filemap_get_folio(device->bdev->bd_mapping,
3829
bytenr >> PAGE_SHIFT);
3830
/* If the folio has been removed, then we know it completed. */
3831
if (IS_ERR(folio))
3832
continue;
3833
3834
/* Folio will be unlocked once the write completes. */
3835
folio_wait_locked(folio);
3836
folio_put(folio);
3837
}
3838
3839
errors += atomic_read(&device->sb_write_errors);
3840
if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)
3841
primary_failed = true;
3842
if (primary_failed) {
3843
btrfs_err(device->fs_info, "error writing primary super block to device %llu",
3844
device->devid);
3845
return -1;
3846
}
3847
3848
return errors < i ? 0 : -1;
3849
}
3850
3851
/*
3852
* Endio for write_dev_flush; this wakes anyone waiting for the barrier
3853
* once it is done.
3854
*/
3855
static void btrfs_end_empty_barrier(struct bio *bio)
3856
{
3857
bio_uninit(bio);
3858
complete(bio->bi_private);
3859
}
3860
3861
/*
3862
* Submit a flush request to the device if it supports it. Error handling is
3863
* done in the waiting counterpart.
3864
*/
3865
static void write_dev_flush(struct btrfs_device *device)
3866
{
3867
struct bio *bio = &device->flush_bio;
3868
3869
device->last_flush_error = BLK_STS_OK;
3870
3871
bio_init(bio, device->bdev, NULL, 0,
3872
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
3873
bio->bi_end_io = btrfs_end_empty_barrier;
3874
init_completion(&device->flush_wait);
3875
bio->bi_private = &device->flush_wait;
3876
submit_bio(bio);
3877
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
3878
}
3879
3880
/*
3881
* If the flush bio has been submitted by write_dev_flush, wait for it.
3882
* Return true for any error, and false otherwise.
3883
*/
3884
static bool wait_dev_flush(struct btrfs_device *device)
3885
{
3886
struct bio *bio = &device->flush_bio;
3887
3888
if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
3889
return false;
3890
3891
wait_for_completion_io(&device->flush_wait);
3892
3893
if (bio->bi_status) {
3894
device->last_flush_error = bio->bi_status;
3895
btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
3896
return true;
3897
}
3898
3899
return false;
3900
}
3901
3902
/*
3903
* send an empty flush down to each device in parallel,
3904
* then wait for them
3905
*/
3906
static int barrier_all_devices(struct btrfs_fs_info *info)
3907
{
3908
struct list_head *head;
3909
struct btrfs_device *dev;
3910
int errors_wait = 0;
3911
3912
lockdep_assert_held(&info->fs_devices->device_list_mutex);
3913
/* send down all the barriers */
3914
head = &info->fs_devices->devices;
3915
list_for_each_entry(dev, head, dev_list) {
3916
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3917
continue;
3918
if (!dev->bdev)
3919
continue;
3920
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3921
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3922
continue;
3923
3924
write_dev_flush(dev);
3925
}
3926
3927
/* wait for all the barriers */
3928
list_for_each_entry(dev, head, dev_list) {
3929
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3930
continue;
3931
if (!dev->bdev) {
3932
errors_wait++;
3933
continue;
3934
}
3935
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3936
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3937
continue;
3938
3939
if (wait_dev_flush(dev))
3940
errors_wait++;
3941
}
3942
3943
/*
3944
* Checks last_flush_error of disks in order to determine the device
3945
* state.
3946
*/
3947
if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL)))
3948
return -EIO;
3949
3950
return 0;
3951
}
3952
3953
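/*
 * Given a set of block group profile flags, return how many devices may fail
 * a barrier/flush while the filesystem can still be considered consistent:
 * the minimum tolerated_failures value over all raid profiles present in
 * @flags.
 */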
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3954
{
3955
int raid_type;
3956
int min_tolerated = INT_MAX;
3957
3958
if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3959
(flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3960
min_tolerated = min_t(int, min_tolerated,
3961
btrfs_raid_array[BTRFS_RAID_SINGLE].
3962
tolerated_failures);
3963
3964
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3965
if (raid_type == BTRFS_RAID_SINGLE)
3966
continue;
3967
if (!(flags & btrfs_raid_array[raid_type].bg_flag))
3968
continue;
3969
min_tolerated = min_t(int, min_tolerated,
3970
btrfs_raid_array[raid_type].
3971
tolerated_failures);
3972
}
3973
3974
if (min_tolerated == INT_MAX) {
3975
btrfs_warn(NULL, "unknown raid flag: %llu", flags);
3976
min_tolerated = 0;
3977
}
3978
3979
return min_tolerated;
3980
}
3981
3982
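/*
 * Write the committed super block to every writeable device that is part of
 * the filesystem metadata, optionally preceded by a flush/barrier on each
 * device. Up to (number of devices - 1) failures are tolerated in each of
 * the write and wait phases; anything beyond that puts the filesystem into
 * error state and returns -EIO.
 */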
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
3983
{
3984
struct list_head *head;
3985
struct btrfs_device *dev;
3986
struct btrfs_super_block *sb;
3987
struct btrfs_dev_item *dev_item;
3988
int ret;
3989
int do_barriers;
3990
int max_errors;
3991
int total_errors = 0;
3992
u64 flags;
3993
3994
do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
3995
3996
/*
3997
* max_mirrors == 0 indicates we're from commit_transaction,
3998
* not from fsync where the tree roots in fs_info have not
3999
* been consistent on disk.
4000
*/
4001
if (max_mirrors == 0)
4002
backup_super_roots(fs_info);
4003
4004
sb = fs_info->super_for_commit;
4005
dev_item = &sb->dev_item;
4006
4007
mutex_lock(&fs_info->fs_devices->device_list_mutex);
4008
head = &fs_info->fs_devices->devices;
4009
max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
4010
4011
if (do_barriers) {
4012
ret = barrier_all_devices(fs_info);
4013
if (ret) {
4014
mutex_unlock(
4015
&fs_info->fs_devices->device_list_mutex);
4016
btrfs_handle_fs_error(fs_info, ret,
4017
"errors while submitting device barriers.");
4018
return ret;
4019
}
4020
}
4021
4022
list_for_each_entry(dev, head, dev_list) {
4023
if (!dev->bdev) {
4024
total_errors++;
4025
continue;
4026
}
4027
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4028
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4029
continue;
4030
4031
btrfs_set_stack_device_generation(dev_item, 0);
4032
btrfs_set_stack_device_type(dev_item, dev->type);
4033
btrfs_set_stack_device_id(dev_item, dev->devid);
4034
btrfs_set_stack_device_total_bytes(dev_item,
4035
dev->commit_total_bytes);
4036
btrfs_set_stack_device_bytes_used(dev_item,
4037
dev->commit_bytes_used);
4038
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
4039
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
4040
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
4041
memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4042
memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4043
BTRFS_FSID_SIZE);
4044
4045
flags = btrfs_super_flags(sb);
4046
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
4047
4048
ret = btrfs_validate_write_super(fs_info, sb);
4049
if (unlikely(ret < 0)) {
4050
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4051
btrfs_handle_fs_error(fs_info, -EUCLEAN,
4052
"unexpected superblock corruption detected");
4053
return -EUCLEAN;
4054
}
4055
4056
ret = write_dev_supers(dev, sb, max_mirrors);
4057
if (ret)
4058
total_errors++;
4059
}
4060
if (unlikely(total_errors > max_errors)) {
4061
btrfs_err(fs_info, "%d errors while writing supers",
4062
total_errors);
4063
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4064
4065
/* FUA is masked off if unsupported and can't be the reason */
4066
btrfs_handle_fs_error(fs_info, -EIO,
4067
"%d errors while writing supers",
4068
total_errors);
4069
return -EIO;
4070
}
4071
4072
total_errors = 0;
4073
list_for_each_entry(dev, head, dev_list) {
4074
if (!dev->bdev)
4075
continue;
4076
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4077
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4078
continue;
4079
4080
ret = wait_dev_supers(dev, max_mirrors);
4081
if (ret)
4082
total_errors++;
4083
}
4084
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4085
if (unlikely(total_errors > max_errors)) {
4086
btrfs_handle_fs_error(fs_info, -EIO,
4087
"%d errors while writing supers",
4088
total_errors);
4089
return -EIO;
4090
}
4091
return 0;
4092
}
4093
4094
/* Drop a fs root from the radix tree and free it. */
4095
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4096
struct btrfs_root *root)
4097
{
4098
bool drop_ref = false;
4099
4100
spin_lock(&fs_info->fs_roots_radix_lock);
4101
radix_tree_delete(&fs_info->fs_roots_radix,
4102
(unsigned long)btrfs_root_id(root));
4103
if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
4104
drop_ref = true;
4105
spin_unlock(&fs_info->fs_roots_radix_lock);
4106
4107
if (BTRFS_FS_ERROR(fs_info)) {
4108
ASSERT(root->log_root == NULL);
4109
if (root->reloc_root) {
4110
btrfs_put_root(root->reloc_root);
4111
root->reloc_root = NULL;
4112
}
4113
}
4114
4115
if (drop_ref)
4116
btrfs_put_root(root);
4117
}
4118
4119
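/*
 * Run pending delayed iputs, kick the cleaner, wait for any in-flight
 * cleanup work and then commit the current transaction.
 */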
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4120
{
4121
mutex_lock(&fs_info->cleaner_mutex);
4122
btrfs_run_delayed_iputs(fs_info);
4123
mutex_unlock(&fs_info->cleaner_mutex);
4124
wake_up_process(fs_info->cleaner_kthread);
4125
4126
/* Wait until ongoing cleanup work is done. */
4127
down_write(&fs_info->cleanup_work_sem);
4128
up_write(&fs_info->cleanup_work_sem);
4129
4130
return btrfs_commit_current_transaction(fs_info->tree_root);
4131
}
4132
4133
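/*
 * Called at the very end of close_ctree(). Any transaction still on
 * fs_info->trans_list at this point was never committed: warn about the
 * dirty metadata it holds, clean it up and drop it from the list.
 */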
static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
4134
{
4135
struct btrfs_transaction *trans;
4136
struct btrfs_transaction *tmp;
4137
bool found = false;
4138
4139
/*
4140
* This function is only called at the very end of close_ctree(),
4141
* thus no other running transaction, no need to take trans_lock.
4142
*/
4143
ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
4144
list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
4145
struct extent_state *cached = NULL;
4146
u64 dirty_bytes = 0;
4147
u64 cur = 0;
4148
u64 found_start;
4149
u64 found_end;
4150
4151
found = true;
4152
while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur,
4153
&found_start, &found_end,
4154
EXTENT_DIRTY, &cached)) {
4155
dirty_bytes += found_end + 1 - found_start;
4156
cur = found_end + 1;
4157
}
4158
btrfs_warn(fs_info,
4159
"transaction %llu (with %llu dirty metadata bytes) is not committed",
4160
trans->transid, dirty_bytes);
4161
btrfs_cleanup_one_transaction(trans);
4162
4163
if (trans == fs_info->running_transaction)
4164
fs_info->running_transaction = NULL;
4165
list_del_init(&trans->list);
4166
4167
btrfs_put_transaction(trans);
4168
trace_btrfs_transaction_commit(fs_info);
4169
}
4170
ASSERT(!found);
4171
}
4172
4173
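/*
 * Tear down a mounted filesystem: stop background work (cleaner and
 * transaction kthreads, reclaim, balance, scrub, ...), flush the work queues
 * that may still generate delayed iputs, commit the final transaction on a
 * read-write mount and free all roots, block groups and the chunk mapping
 * tree.
 */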
void __cold close_ctree(struct btrfs_fs_info *fs_info)
4174
{
4175
int ret;
4176
4177
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
4178
4179
/*
4180
* If we had UNFINISHED_DROPS we could still be processing them, so
4181
* clear that bit and wake up relocation so it can stop.
4182
* We must do this before stopping the block group reclaim task, because
4183
* at btrfs_relocate_block_group() we wait for this bit, and after the
4184
* wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
4185
* have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
4186
* return 1.
4187
*/
4188
btrfs_wake_unfinished_drop(fs_info);
4189
4190
/*
4191
* We may have the reclaim task running and relocating a data block group,
4192
* in which case it may create delayed iputs. So stop it before we park
4193
* the cleaner kthread, otherwise we can get new delayed iputs after
4194
* parking the cleaner, and that can make the async reclaim task hang
4195
* if it's waiting for delayed iputs to complete, since the cleaner is
4196
* parked and cannot run delayed iputs - this will make us hang when
4197
* trying to stop the async reclaim task.
4198
*/
4199
cancel_work_sync(&fs_info->reclaim_bgs_work);
4200
/*
4201
* We don't want the cleaner to start new transactions, add more delayed
4202
* iputs, etc. while we're closing. We can't use kthread_stop() yet
4203
* because that frees the task_struct, and the transaction kthread might
4204
* still try to wake up the cleaner.
4205
*/
4206
kthread_park(fs_info->cleaner_kthread);
4207
4208
/* wait for the qgroup rescan worker to stop */
4209
btrfs_qgroup_wait_for_completion(fs_info, false);
4210
4211
/* wait for the uuid_scan task to finish */
4212
down(&fs_info->uuid_tree_rescan_sem);
4213
/* Avoid complaints from lockdep et al., set sem back to initial state. */
4214
up(&fs_info->uuid_tree_rescan_sem);
4215
4216
/* pause restriper - we want to resume on mount */
4217
btrfs_pause_balance(fs_info);
4218
4219
btrfs_dev_replace_suspend_for_unmount(fs_info);
4220
4221
btrfs_scrub_cancel(fs_info);
4222
4223
/* wait for any defraggers to finish */
4224
wait_event(fs_info->transaction_wait,
4225
(atomic_read(&fs_info->defrag_running) == 0));
4226
4227
/* clear out the rbtree of defraggable inodes */
4228
btrfs_cleanup_defrag_inodes(fs_info);
4229
4230
/*
4231
* Handle the error fs first, as it will flush and wait for all ordered
4232
* extents. This will generate delayed iputs, thus we want to handle
4233
* it first.
4234
*/
4235
if (unlikely(BTRFS_FS_ERROR(fs_info)))
4236
btrfs_error_commit_super(fs_info);
4237
4238
/*
4239
* Wait for any fixup workers to complete.
4240
* If we don't wait for them here and they are still running by the time
4241
* we call kthread_stop() against the cleaner kthread further below, we
4242
* get a use-after-free on the cleaner because the fixup worker adds an
4243
* inode to the list of delayed iputs and then attempts to wake up the
4244
* cleaner kthread, which was already stopped and destroyed. We have
4245
* already parked the cleaner, but below we run all pending delayed iputs.
4246
*/
4247
btrfs_flush_workqueue(fs_info->fixup_workers);
4248
/*
4249
* Similar case here, we have to wait for delalloc workers before we
4250
* proceed below and stop the cleaner kthread, otherwise we trigger a
4251
* use-after-free on the cleaner kthread task_struct when a delalloc
4252
* worker running submit_compressed_extents() adds a delayed iput, which
4253
* does a wake up on the cleaner kthread, which was already freed below
4254
* when we call kthread_stop().
4255
*/
4256
btrfs_flush_workqueue(fs_info->delalloc_workers);
4257
4258
/*
4259
* We can have ordered extents getting their last reference dropped from
4260
* the fs_info->workers queue because for async writes for data bios we
4261
* queue a work for that queue, at btrfs_wq_submit_bio(), that runs
4262
* run_one_async_done() which calls btrfs_bio_end_io() in case the bio
4263
* has an error, and that later function can do the final
4264
* btrfs_put_ordered_extent() on the ordered extent attached to the bio,
4265
* which adds a delayed iput for the inode. So we must flush the queue
4266
* so that we don't have delayed iputs after committing the current
4267
* transaction below and stopping the cleaner and transaction kthreads.
4268
*/
4269
btrfs_flush_workqueue(fs_info->workers);
4270
4271
/*
4272
* When finishing a compressed write bio we schedule a work queue item
4273
* to finish an ordered extent - end_bbio_compressed_write()
4274
* calls btrfs_finish_ordered_extent(), which in turn does a call to
4275
* btrfs_queue_ordered_fn(), and that queues the ordered extent
4276
* completion either in the endio_write_workers work queue or in the
4277
* fs_info->endio_freespace_worker work queue. We flush those queues
4278
* below, so before we flush them we must flush this queue for the
4279
* workers of compressed writes.
4280
*/
4281
flush_workqueue(fs_info->endio_workers);
4282
4283
/*
4284
* After we parked the cleaner kthread, ordered extents may have
4285
* completed and created new delayed iputs. If one of the async reclaim
4286
* tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
4287
* can hang forever trying to stop it, because if a delayed iput is
4288
* added after it ran btrfs_run_delayed_iputs() and before it called
4289
* btrfs_wait_on_delayed_iputs(), it will hang forever since there is
4290
* no one else to run iputs.
4291
*
4292
* So wait for all ongoing ordered extents to complete and then run
4293
* delayed iputs. This works because once we reach this point no one
4294
* can create new ordered extents, but delayed iputs can still be added
4295
* by a reclaim worker (see comments further below).
4296
*
4297
* Also note that btrfs_wait_ordered_roots() is not safe here, because
4298
* it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
4299
* but the delayed iput for the respective inode is made only when doing
4300
* the final btrfs_put_ordered_extent() (which must happen at
4301
* btrfs_finish_ordered_io() when we are unmounting).
4302
*/
4303
btrfs_flush_workqueue(fs_info->endio_write_workers);
4304
/* Ordered extents for free space inodes. */
4305
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
4306
/*
4307
* Run delayed iputs in case an async reclaim worker is waiting for them
4308
* to be run as mentioned above.
4309
*/
4310
btrfs_run_delayed_iputs(fs_info);
4311
4312
cancel_work_sync(&fs_info->async_reclaim_work);
4313
cancel_work_sync(&fs_info->async_data_reclaim_work);
4314
cancel_work_sync(&fs_info->preempt_reclaim_work);
4315
cancel_work_sync(&fs_info->em_shrinker_work);
4316
4317
/*
4318
* Run delayed iputs again because an async reclaim worker may have
4319
* added new ones if it was flushing delalloc:
4320
*
4321
* shrink_delalloc() -> btrfs_start_delalloc_roots() ->
4322
* start_delalloc_inodes() -> btrfs_add_delayed_iput()
4323
*/
4324
btrfs_run_delayed_iputs(fs_info);
4325
4326
/* There should be no more workload to generate new delayed iputs. */
4327
set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
4328
4329
/* Cancel or finish ongoing discard work */
4330
btrfs_discard_cleanup(fs_info);
4331
4332
if (!sb_rdonly(fs_info->sb)) {
4333
/*
4334
* The cleaner kthread is stopped, so do one final pass over
4335
* unused block groups.
4336
*/
4337
btrfs_delete_unused_bgs(fs_info);
4338
4339
/*
4340
* There might be existing delayed inode workers still running
4341
* and holding an empty delayed inode item. We must wait for
4342
* them to complete first because they can create a transaction.
4343
* This happens when someone calls btrfs_balance_delayed_items()
4344
* and then a transaction commit runs the same delayed nodes
4345
* before any delayed worker has done something with the nodes.
4346
* We must wait for any worker here and not at transaction
4347
* commit time since that could cause a deadlock.
4348
* This is a very rare case.
4349
*/
4350
btrfs_flush_workqueue(fs_info->delayed_workers);
4351
4352
ret = btrfs_commit_super(fs_info);
4353
if (ret)
4354
btrfs_err(fs_info, "commit super ret %d", ret);
4355
}
4356
4357
kthread_stop(fs_info->transaction_kthread);
4358
kthread_stop(fs_info->cleaner_kthread);
4359
4360
ASSERT(list_empty(&fs_info->delayed_iputs));
4361
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4362
4363
if (btrfs_check_quota_leak(fs_info)) {
4364
DEBUG_WARN("qgroup reserved space leaked");
4365
btrfs_err(fs_info, "qgroup reserved space leaked");
4366
}
4367
4368
btrfs_free_qgroup_config(fs_info);
4369
ASSERT(list_empty(&fs_info->delalloc_roots));
4370
4371
if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
4372
btrfs_info(fs_info, "at unmount delalloc count %lld",
4373
percpu_counter_sum(&fs_info->delalloc_bytes));
4374
}
4375
4376
if (percpu_counter_sum(&fs_info->ordered_bytes))
4377
btrfs_info(fs_info, "at unmount dio bytes count %lld",
4378
percpu_counter_sum(&fs_info->ordered_bytes));
4379
4380
btrfs_sysfs_remove_mounted(fs_info);
4381
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4382
4383
btrfs_put_block_group_cache(fs_info);
4384
4385
/*
4386
* We must make sure there are no read requests left to
4387
* submit after we stop all workers.
4388
*/
4389
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
4390
btrfs_stop_all_workers(fs_info);
4391
4392
/* We shouldn't have any transaction open at this point */
4393
warn_about_uncommitted_trans(fs_info);
4394
4395
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
4396
free_root_pointers(fs_info, true);
4397
btrfs_free_fs_roots(fs_info);
4398
4399
/*
4400
* We must free the block groups after dropping the fs_roots as we could
4401
* have had an IO error and have left over tree log blocks that aren't
4402
* cleaned up until the fs roots are freed. This makes the block group
4403
* accounting appear to be wrong because there's pending reserved bytes,
4404
* so make sure we do the block group cleanup afterwards.
4405
*/
4406
btrfs_free_block_groups(fs_info);
4407
4408
iput(fs_info->btree_inode);
4409
4410
btrfs_mapping_tree_free(fs_info);
4411
}
4412
4413
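/*
 * Mark a tree block dirty as part of the given transaction. The buffer must
 * be write locked and its generation must match the running transaction,
 * otherwise the transaction is aborted.
 */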
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
4414
struct extent_buffer *buf)
4415
{
4416
struct btrfs_fs_info *fs_info = buf->fs_info;
4417
u64 transid = btrfs_header_generation(buf);
4418
4419
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4420
/*
4421
* This is a fast path so only do this check if we have sanity tests
4422
* enabled. Normal people shouldn't be using unmapped buffers as dirty
4423
* outside of the sanity tests.
4424
*/
4425
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4426
return;
4427
#endif
4428
/* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
4429
ASSERT(trans->transid == fs_info->generation);
4430
btrfs_assert_tree_write_locked(buf);
4431
if (unlikely(transid != fs_info->generation)) {
4432
btrfs_abort_transaction(trans, -EUCLEAN);
4433
btrfs_crit(fs_info,
4434
"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
4435
buf->start, transid, fs_info->generation);
4436
}
4437
set_extent_buffer_dirty(buf);
4438
}
4439
4440
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4441
int flush_delayed)
4442
{
4443
/*
4444
* It looks as though older kernels can get into trouble with
4445
* this code; they end up stuck in balance_dirty_pages forever.
4446
*/
4447
int ret;
4448
4449
if (current->flags & PF_MEMALLOC)
4450
return;
4451
4452
if (flush_delayed)
4453
btrfs_balance_delayed_items(fs_info);
4454
4455
ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4456
BTRFS_DIRTY_METADATA_THRESH,
4457
fs_info->dirty_metadata_batch);
4458
if (ret > 0) {
4459
balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4460
}
4461
}
4462
4463
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
4464
{
4465
__btrfs_btree_balance_dirty(fs_info, 1);
4466
}
4467
4468
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
4469
{
4470
__btrfs_btree_balance_dirty(fs_info, 0);
4471
}
4472
4473
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4474
{
4475
/* cleanup FS via transaction */
4476
btrfs_cleanup_transaction(fs_info);
4477
4478
down_write(&fs_info->cleanup_work_sem);
4479
up_write(&fs_info->cleanup_work_sem);
4480
}
4481
4482
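/*
 * During error cleanup, walk the fs roots radix tree in small batches and
 * free the log tree of every root found, then free the log root tree
 * itself.
 */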
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4483
{
4484
struct btrfs_root *gang[8];
4485
u64 root_objectid = 0;
4486
int ret;
4487
4488
spin_lock(&fs_info->fs_roots_radix_lock);
4489
while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4490
(void **)gang, root_objectid,
4491
ARRAY_SIZE(gang))) != 0) {
4492
int i;
4493
4494
for (i = 0; i < ret; i++)
4495
gang[i] = btrfs_grab_root(gang[i]);
4496
spin_unlock(&fs_info->fs_roots_radix_lock);
4497
4498
for (i = 0; i < ret; i++) {
4499
if (!gang[i])
4500
continue;
4501
root_objectid = btrfs_root_id(gang[i]);
4502
btrfs_free_log(NULL, gang[i]);
4503
btrfs_put_root(gang[i]);
4504
}
4505
root_objectid++;
4506
spin_lock(&fs_info->fs_roots_radix_lock);
4507
}
4508
spin_unlock(&fs_info->fs_roots_radix_lock);
4509
btrfs_free_log_root_tree(NULL, fs_info);
4510
}
4511
4512
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4513
{
4514
struct btrfs_ordered_extent *ordered;
4515
4516
spin_lock(&root->ordered_extent_lock);
4517
/*
4518
* This will just short-circuit the ordered completion path, which will
4519
* make sure the ordered extent gets properly cleaned up.
4520
*/
4521
list_for_each_entry(ordered, &root->ordered_extents,
4522
root_extent_list)
4523
set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4524
spin_unlock(&root->ordered_extent_lock);
4525
}
4526
4527
static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4528
{
4529
struct btrfs_root *root;
4530
LIST_HEAD(splice);
4531
4532
spin_lock(&fs_info->ordered_root_lock);
4533
list_splice_init(&fs_info->ordered_roots, &splice);
4534
while (!list_empty(&splice)) {
4535
root = list_first_entry(&splice, struct btrfs_root,
4536
ordered_root);
4537
list_move_tail(&root->ordered_root,
4538
&fs_info->ordered_roots);
4539
4540
spin_unlock(&fs_info->ordered_root_lock);
4541
btrfs_destroy_ordered_extents(root);
4542
4543
cond_resched();
4544
spin_lock(&fs_info->ordered_root_lock);
4545
}
4546
spin_unlock(&fs_info->ordered_root_lock);
4547
4548
/*
4549
* We need this here because if we've been flipped read-only we won't
4550
* get sync() from the umount, so we need to make sure any ordered
4551
* extents that haven't had their dirty pages IO start writeout yet
4552
* actually get run and error out properly.
4553
*/
4554
btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
4555
}
4556
4557
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4558
{
4559
struct btrfs_inode *btrfs_inode;
4560
LIST_HEAD(splice);
4561
4562
spin_lock(&root->delalloc_lock);
4563
list_splice_init(&root->delalloc_inodes, &splice);
4564
4565
while (!list_empty(&splice)) {
4566
struct inode *inode = NULL;
4567
btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4568
delalloc_inodes);
4569
btrfs_del_delalloc_inode(btrfs_inode);
4570
spin_unlock(&root->delalloc_lock);
4571
4572
/*
4573
* Make sure we get a live inode and that it won't disappear in the
4574
* meantime.
4575
*/
4576
inode = igrab(&btrfs_inode->vfs_inode);
4577
if (inode) {
4578
unsigned int nofs_flag;
4579
4580
nofs_flag = memalloc_nofs_save();
4581
invalidate_inode_pages2(inode->i_mapping);
4582
memalloc_nofs_restore(nofs_flag);
4583
iput(inode);
4584
}
4585
spin_lock(&root->delalloc_lock);
4586
}
4587
spin_unlock(&root->delalloc_lock);
4588
}
4589
4590
static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4591
{
4592
struct btrfs_root *root;
4593
LIST_HEAD(splice);
4594
4595
spin_lock(&fs_info->delalloc_root_lock);
4596
list_splice_init(&fs_info->delalloc_roots, &splice);
4597
while (!list_empty(&splice)) {
4598
root = list_first_entry(&splice, struct btrfs_root,
4599
delalloc_root);
4600
root = btrfs_grab_root(root);
4601
BUG_ON(!root);
4602
spin_unlock(&fs_info->delalloc_root_lock);
4603
4604
btrfs_destroy_delalloc_inodes(root);
4605
btrfs_put_root(root);
4606
4607
spin_lock(&fs_info->delalloc_root_lock);
4608
}
4609
spin_unlock(&fs_info->delalloc_root_lock);
4610
}
4611
4612
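/*
 * On transaction abort, clear the given mark from the transaction's dirty
 * extent io tree and, for every tree block in the affected ranges, wait for
 * writeback, clear its dirty bit and drop it as stale.
 */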
static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
4613
struct extent_io_tree *dirty_pages,
4614
int mark)
4615
{
4616
struct extent_buffer *eb;
4617
u64 start = 0;
4618
u64 end;
4619
4620
while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
4621
mark, NULL)) {
4622
btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL);
4623
while (start <= end) {
4624
eb = find_extent_buffer(fs_info, start);
4625
start += fs_info->nodesize;
4626
if (!eb)
4627
continue;
4628
4629
btrfs_tree_lock(eb);
4630
wait_on_extent_buffer_writeback(eb);
4631
btrfs_clear_buffer_dirty(NULL, eb);
4632
btrfs_tree_unlock(eb);
4633
4634
free_extent_buffer_stale(eb);
4635
}
4636
}
4637
}
4638
4639
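/*
 * On transaction abort, clear every range still marked dirty in the pinned
 * extents tree and unpin it, returning the space to the free space
 * accounting. The unused_bg_unpin_mutex guards against racing with
 * btrfs_finish_extent_commit() unpinning the same range.
 */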
static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
4640
struct extent_io_tree *unpin)
4641
{
4642
u64 start;
4643
u64 end;
4644
4645
while (1) {
4646
struct extent_state *cached_state = NULL;
4647
4648
/*
4649
* The btrfs_finish_extent_commit() may get the same range as
4650
* ours between find_first_extent_bit and clear_extent_dirty.
4651
* Hence, hold the unused_bg_unpin_mutex to avoid double unpin
4652
* the same extent range.
4653
*/
4654
mutex_lock(&fs_info->unused_bg_unpin_mutex);
4655
if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end,
4656
EXTENT_DIRTY, &cached_state)) {
4657
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4658
break;
4659
}
4660
4661
btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
4662
btrfs_free_extent_state(cached_state);
4663
btrfs_error_unpin_extent_range(fs_info, start, end);
4664
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4665
cond_resched();
4666
}
4667
}
4668
4669
static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
4670
{
4671
struct inode *inode;
4672
4673
inode = cache->io_ctl.inode;
4674
if (inode) {
4675
unsigned int nofs_flag;
4676
4677
nofs_flag = memalloc_nofs_save();
4678
invalidate_inode_pages2(inode->i_mapping);
4679
memalloc_nofs_restore(nofs_flag);
4680
4681
BTRFS_I(inode)->generation = 0;
4682
cache->io_ctl.inode = NULL;
4683
iput(inode);
4684
}
4685
ASSERT(cache->io_ctl.pages == NULL);
4686
btrfs_put_block_group(cache);
4687
}
4688
4689
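/*
 * Error-path cleanup of a transaction's dirty block groups: drop them from
 * the dirty_bgs and io_bgs lists, mark their free space cache as errored
 * and release the references held by the transaction.
 */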
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
4690
struct btrfs_fs_info *fs_info)
4691
{
4692
struct btrfs_block_group *cache;
4693
4694
spin_lock(&cur_trans->dirty_bgs_lock);
4695
while (!list_empty(&cur_trans->dirty_bgs)) {
4696
cache = list_first_entry(&cur_trans->dirty_bgs,
4697
struct btrfs_block_group,
4698
dirty_list);
4699
4700
if (!list_empty(&cache->io_list)) {
4701
spin_unlock(&cur_trans->dirty_bgs_lock);
4702
list_del_init(&cache->io_list);
4703
btrfs_cleanup_bg_io(cache);
4704
spin_lock(&cur_trans->dirty_bgs_lock);
4705
}
4706
4707
list_del_init(&cache->dirty_list);
4708
spin_lock(&cache->lock);
4709
cache->disk_cache_state = BTRFS_DC_ERROR;
4710
spin_unlock(&cache->lock);
4711
4712
spin_unlock(&cur_trans->dirty_bgs_lock);
4713
btrfs_put_block_group(cache);
4714
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
4715
spin_lock(&cur_trans->dirty_bgs_lock);
4716
}
4717
spin_unlock(&cur_trans->dirty_bgs_lock);
4718
4719
/*
4720
* Refer to the definition of the io_bgs member for details on why it's safe
4721
* to use it without any locking
4722
*/
4723
while (!list_empty(&cur_trans->io_bgs)) {
4724
cache = list_first_entry(&cur_trans->io_bgs,
4725
struct btrfs_block_group,
4726
io_list);
4727
4728
list_del_init(&cache->io_list);
4729
spin_lock(&cache->lock);
4730
cache->disk_cache_state = BTRFS_DC_ERROR;
4731
spin_unlock(&cache->lock);
4732
btrfs_cleanup_bg_io(cache);
4733
}
4734
}
4735
4736
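/*
 * Free the per-transaction qgroup metadata reservations of every root that
 * is still tagged as part of the current transaction and clear the
 * BTRFS_ROOT_TRANS_TAG from it.
 */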
static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
4737
{
4738
struct btrfs_root *gang[8];
4739
int i;
4740
int ret;
4741
4742
spin_lock(&fs_info->fs_roots_radix_lock);
4743
while (1) {
4744
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
4745
(void **)gang, 0,
4746
ARRAY_SIZE(gang),
4747
BTRFS_ROOT_TRANS_TAG);
4748
if (ret == 0)
4749
break;
4750
for (i = 0; i < ret; i++) {
4751
struct btrfs_root *root = gang[i];
4752
4753
btrfs_qgroup_free_meta_all_pertrans(root);
4754
radix_tree_tag_clear(&fs_info->fs_roots_radix,
4755
(unsigned long)btrfs_root_id(root),
4756
BTRFS_ROOT_TRANS_TAG);
4757
}
4758
}
4759
spin_unlock(&fs_info->fs_roots_radix_lock);
4760
}
4761
4762
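/*
 * Error-path teardown of a single transaction: clean up its dirty block
 * groups and delayed refs, discard dirty and pinned extents, and advance
 * the transaction state so anyone waiting on it is woken up.
 */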
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
4763
{
4764
struct btrfs_fs_info *fs_info = cur_trans->fs_info;
4765
struct btrfs_device *dev, *tmp;
4766
4767
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
4768
ASSERT(list_empty(&cur_trans->dirty_bgs));
4769
ASSERT(list_empty(&cur_trans->io_bgs));
4770
4771
list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
4772
post_commit_list) {
4773
list_del_init(&dev->post_commit_list);
4774
}
4775
4776
btrfs_destroy_delayed_refs(cur_trans);
4777
4778
cur_trans->state = TRANS_STATE_COMMIT_START;
4779
wake_up(&fs_info->transaction_blocked_wait);
4780
4781
cur_trans->state = TRANS_STATE_UNBLOCKED;
4782
wake_up(&fs_info->transaction_wait);
4783
4784
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
4785
EXTENT_DIRTY);
4786
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
4787
4788
cur_trans->state = TRANS_STATE_COMPLETED;
4789
wake_up(&cur_trans->commit_wait);
4790
}
4791
4792
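/*
 * Error-path cleanup of every transaction still on fs_info->trans_list,
 * followed by cleanup of ordered extents, delayed inodes, delalloc inodes,
 * log trees and per-transaction qgroup reservations.
 */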
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
4793
{
4794
struct btrfs_transaction *t;
4795
4796
mutex_lock(&fs_info->transaction_kthread_mutex);
4797
4798
spin_lock(&fs_info->trans_lock);
4799
while (!list_empty(&fs_info->trans_list)) {
4800
t = list_first_entry(&fs_info->trans_list,
4801
struct btrfs_transaction, list);
4802
if (t->state >= TRANS_STATE_COMMIT_PREP) {
4803
refcount_inc(&t->use_count);
4804
spin_unlock(&fs_info->trans_lock);
4805
btrfs_wait_for_commit(fs_info, t->transid);
4806
btrfs_put_transaction(t);
4807
spin_lock(&fs_info->trans_lock);
4808
continue;
4809
}
4810
if (t == fs_info->running_transaction) {
4811
t->state = TRANS_STATE_COMMIT_DOING;
4812
spin_unlock(&fs_info->trans_lock);
4813
/*
4814
* We wait for 0 num_writers since we don't hold a trans
4815
* handle open currently for this transaction.
4816
*/
4817
wait_event(t->writer_wait,
4818
atomic_read(&t->num_writers) == 0);
4819
} else {
4820
spin_unlock(&fs_info->trans_lock);
4821
}
4822
btrfs_cleanup_one_transaction(t);
4823
4824
spin_lock(&fs_info->trans_lock);
4825
if (t == fs_info->running_transaction)
4826
fs_info->running_transaction = NULL;
4827
list_del_init(&t->list);
4828
spin_unlock(&fs_info->trans_lock);
4829
4830
btrfs_put_transaction(t);
4831
trace_btrfs_transaction_commit(fs_info);
4832
spin_lock(&fs_info->trans_lock);
4833
}
4834
spin_unlock(&fs_info->trans_lock);
4835
btrfs_destroy_all_ordered_extents(fs_info);
4836
btrfs_destroy_delayed_inodes(fs_info);
4837
btrfs_assert_delayed_root_empty(fs_info);
4838
btrfs_destroy_all_delalloc_inodes(fs_info);
4839
btrfs_drop_all_logs(fs_info);
4840
btrfs_free_all_qgroup_pertrans(fs_info);
4841
mutex_unlock(&fs_info->transaction_kthread_mutex);
4842
4843
return 0;
4844
}
4845
4846
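/*
 * Look up the highest objectid currently used in @root and initialize
 * root->free_objectid to the next usable value (but never below
 * BTRFS_FIRST_FREE_OBJECTID).
 */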
int btrfs_init_root_free_objectid(struct btrfs_root *root)
4847
{
4848
BTRFS_PATH_AUTO_FREE(path);
4849
int ret;
4850
struct extent_buffer *l;
4851
struct btrfs_key search_key;
4852
struct btrfs_key found_key;
4853
int slot;
4854
4855
path = btrfs_alloc_path();
4856
if (!path)
4857
return -ENOMEM;
4858
4859
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
4860
search_key.type = -1;
4861
search_key.offset = (u64)-1;
4862
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
4863
if (ret < 0)
4864
return ret;
4865
if (unlikely(ret == 0)) {
4866
/*
4867
* Key with offset -1 found; there would have to exist a root
4868
* with such an id, but that is outside the valid range.
4869
*/
4870
return -EUCLEAN;
4871
}
4872
if (path->slots[0] > 0) {
4873
slot = path->slots[0] - 1;
4874
l = path->nodes[0];
4875
btrfs_item_key_to_cpu(l, &found_key, slot);
4876
root->free_objectid = max_t(u64, found_key.objectid + 1,
4877
BTRFS_FIRST_FREE_OBJECTID);
4878
} else {
4879
root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
4880
}
4881
4882
return 0;
4883
}
4884
4885
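/*
 * Hand out the next unused objectid of @root, serialized by objectid_mutex.
 * Returns -ENOSPC once the objectid space of the root is exhausted.
 */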
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
4886
{
4887
int ret;
4888
mutex_lock(&root->objectid_mutex);
4889
4890
if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
4891
btrfs_warn(root->fs_info,
4892
"the objectid of root %llu reaches its highest value",
4893
btrfs_root_id(root));
4894
ret = -ENOSPC;
4895
goto out;
4896
}
4897
4898
*objectid = root->free_objectid++;
4899
ret = 0;
4900
out:
4901
mutex_unlock(&root->objectid_mutex);
4902
return ret;
4903
}
4904
4905