1
// SPDX-License-Identifier: GPL-2.0
2
3
#include <linux/bitops.h>
4
#include <linux/slab.h>
5
#include <linux/bio.h>
6
#include <linux/mm.h>
7
#include <linux/pagemap.h>
8
#include <linux/page-flags.h>
9
#include <linux/sched/mm.h>
10
#include <linux/spinlock.h>
11
#include <linux/blkdev.h>
12
#include <linux/swap.h>
13
#include <linux/writeback.h>
14
#include <linux/pagevec.h>
15
#include <linux/prefetch.h>
16
#include <linux/fsverity.h>
17
#include "extent_io.h"
18
#include "extent-io-tree.h"
19
#include "extent_map.h"
20
#include "ctree.h"
21
#include "btrfs_inode.h"
22
#include "bio.h"
23
#include "locking.h"
24
#include "backref.h"
25
#include "disk-io.h"
26
#include "subpage.h"
27
#include "zoned.h"
28
#include "block-group.h"
29
#include "compression.h"
30
#include "fs.h"
31
#include "accessors.h"
32
#include "file-item.h"
33
#include "file.h"
34
#include "dev-replace.h"
35
#include "super.h"
36
#include "transaction.h"
37
38
static struct kmem_cache *extent_buffer_cache;
39
40
#ifdef CONFIG_BTRFS_DEBUG
41
static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
42
{
43
struct btrfs_fs_info *fs_info = eb->fs_info;
44
unsigned long flags;
45
46
spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
47
list_add(&eb->leak_list, &fs_info->allocated_ebs);
48
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
49
}
50
51
static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
52
{
53
struct btrfs_fs_info *fs_info = eb->fs_info;
54
unsigned long flags;
55
56
spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
57
list_del(&eb->leak_list);
58
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
59
}
60
61
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
62
{
63
struct extent_buffer *eb;
64
unsigned long flags;
65
66
/*
67
* If we didn't get into open_ctree our allocated_ebs will not be
68
* initialized, so just skip this.
69
*/
70
if (!fs_info->allocated_ebs.next)
71
return;
72
73
WARN_ON(!list_empty(&fs_info->allocated_ebs));
74
spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
75
while (!list_empty(&fs_info->allocated_ebs)) {
76
eb = list_first_entry(&fs_info->allocated_ebs,
77
struct extent_buffer, leak_list);
78
btrfs_err(fs_info,
79
"buffer leak start %llu len %u refs %d bflags %lu owner %llu",
80
eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
81
btrfs_header_owner(eb));
82
list_del(&eb->leak_list);
83
WARN_ON_ONCE(1);
84
kmem_cache_free(extent_buffer_cache, eb);
85
}
86
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
87
}
88
#else
89
#define btrfs_leak_debug_add_eb(eb) do {} while (0)
90
#define btrfs_leak_debug_del_eb(eb) do {} while (0)
91
#endif
92
93
/*
94
* Structure to record info about the bio being assembled, and other info like
95
* how many bytes are left before the stripe/ordered extent boundary.
96
*/
97
struct btrfs_bio_ctrl {
98
struct btrfs_bio *bbio;
99
/* Last byte contained in bbio + 1. */
100
loff_t next_file_offset;
101
enum btrfs_compression_type compress_type;
102
u32 len_to_oe_boundary;
103
blk_opf_t opf;
104
/*
105
* For data read bios, we attempt to optimize csum lookups if the extent
106
* generation is older than the current one. To make this possible, we
107
* need to track the maximum generation of an extent in a bio_ctrl to
108
* make the decision when submitting the bio.
109
*
110
* The pattern between do_readpage(), submit_one_bio() and
111
* submit_extent_folio() is quite subtle, so tracking this is tricky.
112
*
113
* As we process extent E, we might submit a bio with existing built up
114
* extents before adding E to a new bio, or we might just add E to the
115
* bio. As a result, E's generation could apply to the current bio or
116
* to the next one, so we need to be careful to update the bio_ctrl's
117
* generation with E's only when we are sure E is added to bio_ctrl->bbio
118
* in submit_extent_folio().
119
*
120
* See the comment in btrfs_lookup_bio_sums() for more detail on the
121
* need for this optimization.
122
*/
123
u64 generation;
124
btrfs_bio_end_io_t end_io_func;
125
struct writeback_control *wbc;
126
127
/*
128
* The sectors of the page which are going to be submitted by
129
* extent_writepage_io().
130
* This is to avoid touching ranges covered by compression/inline.
131
*/
132
unsigned long submit_bitmap;
133
struct readahead_control *ractl;
134
135
/*
136
* The start offset of the last used extent map by a read operation.
137
*
138
* This is for proper compressed read merge.
139
* U64_MAX means we are starting the read and have made no progress yet.
140
*
141
* The current btrfs_bio_is_contig() only uses disk_bytenr as
142
* the condition to check if the read can be merged with previous
143
* bio, which is not correct. E.g. two file extents pointing to the
144
* same extent but with different offset.
145
*
146
* So here we need to do extra checks to only merge reads that are
147
* covered by the same extent map.
148
* Just extent_map::start will be enough, as they are unique
149
* inside the same inode.
150
*/
151
u64 last_em_start;
152
};
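/*
 * Illustrative sketch (not part of the original file): a bio_ctrl is a
 * short-lived, usually on-stack object.  For a read it is initialized the
 * same way btrfs_read_folio() below does it, with last_em_start set to
 * U64_MAX to mean "no extent map used yet":
 *
 *	struct btrfs_bio_ctrl bio_ctrl = {
 *		.opf = REQ_OP_READ,
 *		.last_em_start = U64_MAX,
 *	};
 *
 *	(add folios via submit_extent_folio(), then submit_one_bio(&bio_ctrl))
 */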
153
154
/*
155
* Helper to set the csum search commit root option for a bio_ctrl's bbio
156
* before submitting the bio.
157
*
158
* Only for use by submit_one_bio().
159
*/
160
static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl)
161
{
162
struct btrfs_bio *bbio = bio_ctrl->bbio;
163
164
ASSERT(bbio);
165
166
if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode)))
167
return;
168
169
bio_ctrl->bbio->csum_search_commit_root =
170
(bio_ctrl->generation &&
171
bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info));
172
}
173
174
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
175
{
176
struct btrfs_bio *bbio = bio_ctrl->bbio;
177
178
if (!bbio)
179
return;
180
181
/* Caller should ensure the bio has at least some range added */
182
ASSERT(bbio->bio.bi_iter.bi_size);
183
184
bio_set_csum_search_commit_root(bio_ctrl);
185
186
if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
187
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
188
btrfs_submit_compressed_read(bbio);
189
else
190
btrfs_submit_bbio(bbio, 0);
191
192
/* The bbio is owned by the end_io handler now */
193
bio_ctrl->bbio = NULL;
194
/*
195
* We used the generation to decide whether to lookup csums in the
196
* commit_root or not when we called bio_set_csum_search_commit_root()
197
* above. Now, reset the generation for the next bio.
198
*/
199
bio_ctrl->generation = 0;
200
}
201
202
/*
203
* Submit or fail the current bio in the bio_ctrl structure.
204
*/
205
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
206
{
207
struct btrfs_bio *bbio = bio_ctrl->bbio;
208
209
if (!bbio)
210
return;
211
212
if (ret) {
213
ASSERT(ret < 0);
214
btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
215
/* The bio is owned by the end_io handler now */
216
bio_ctrl->bbio = NULL;
217
} else {
218
submit_one_bio(bio_ctrl);
219
}
220
}
221
222
int __init extent_buffer_init_cachep(void)
223
{
224
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
225
sizeof(struct extent_buffer), 0, 0,
226
NULL);
227
if (!extent_buffer_cache)
228
return -ENOMEM;
229
230
return 0;
231
}
232
233
void __cold extent_buffer_free_cachep(void)
234
{
235
/*
236
* Make sure all delayed rcu free are flushed before we
237
* destroy caches.
238
*/
239
rcu_barrier();
240
kmem_cache_destroy(extent_buffer_cache);
241
}
242
243
static void process_one_folio(struct btrfs_fs_info *fs_info,
244
struct folio *folio, const struct folio *locked_folio,
245
unsigned long page_ops, u64 start, u64 end)
246
{
247
u32 len;
248
249
ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
250
len = end + 1 - start;
251
252
if (page_ops & PAGE_SET_ORDERED)
253
btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
254
if (page_ops & PAGE_START_WRITEBACK) {
255
btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
256
btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
257
}
258
if (page_ops & PAGE_END_WRITEBACK)
259
btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
260
261
if (folio != locked_folio && (page_ops & PAGE_UNLOCK))
262
btrfs_folio_end_lock(fs_info, folio, start, len);
263
}
264
265
static void __process_folios_contig(struct address_space *mapping,
266
const struct folio *locked_folio, u64 start,
267
u64 end, unsigned long page_ops)
268
{
269
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
270
pgoff_t index = start >> PAGE_SHIFT;
271
pgoff_t end_index = end >> PAGE_SHIFT;
272
struct folio_batch fbatch;
273
int i;
274
275
folio_batch_init(&fbatch);
276
while (index <= end_index) {
277
int found_folios;
278
279
found_folios = filemap_get_folios_contig(mapping, &index,
280
end_index, &fbatch);
281
for (i = 0; i < found_folios; i++) {
282
struct folio *folio = fbatch.folios[i];
283
284
process_one_folio(fs_info, folio, locked_folio,
285
page_ops, start, end);
286
}
287
folio_batch_release(&fbatch);
288
cond_resched();
289
}
290
}
291
292
static noinline void unlock_delalloc_folio(const struct inode *inode,
293
struct folio *locked_folio,
294
u64 start, u64 end)
295
{
296
ASSERT(locked_folio);
297
298
__process_folios_contig(inode->i_mapping, locked_folio, start, end,
299
PAGE_UNLOCK);
300
}
301
302
static noinline int lock_delalloc_folios(struct inode *inode,
303
struct folio *locked_folio,
304
u64 start, u64 end)
305
{
306
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
307
struct address_space *mapping = inode->i_mapping;
308
pgoff_t index = start >> PAGE_SHIFT;
309
pgoff_t end_index = end >> PAGE_SHIFT;
310
u64 processed_end = start;
311
struct folio_batch fbatch;
312
313
folio_batch_init(&fbatch);
314
while (index <= end_index) {
315
unsigned int found_folios, i;
316
317
found_folios = filemap_get_folios_contig(mapping, &index,
318
end_index, &fbatch);
319
if (found_folios == 0)
320
goto out;
321
322
for (i = 0; i < found_folios; i++) {
323
struct folio *folio = fbatch.folios[i];
324
u64 range_start;
325
u32 range_len;
326
327
if (folio == locked_folio)
328
continue;
329
330
folio_lock(folio);
331
if (!folio_test_dirty(folio) || folio->mapping != mapping) {
332
folio_unlock(folio);
333
goto out;
334
}
335
range_start = max_t(u64, folio_pos(folio), start);
336
range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start;
337
btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
338
339
processed_end = range_start + range_len - 1;
340
}
341
folio_batch_release(&fbatch);
342
cond_resched();
343
}
344
345
return 0;
346
out:
347
folio_batch_release(&fbatch);
348
if (processed_end > start)
349
unlock_delalloc_folio(inode, locked_folio, start, processed_end);
350
return -EAGAIN;
351
}
352
353
/*
354
* Find and lock a contiguous range of bytes in the file marked as delalloc, no
355
* more than @max_bytes.
356
*
357
* @start: The original start bytenr to search.
358
* Will store the extent range start bytenr.
359
* @end: The original end bytenr of the search range
360
* Will store the extent range end bytenr.
361
*
362
* Return true if we find a delalloc range which starts inside the original
363
* range, and @start/@end will store the delalloc range start/end.
364
*
365
* Return false if we can't find any delalloc range which starts inside the
366
* original range, and @start/@end will be the non-delalloc range start/end.
367
*/
368
EXPORT_FOR_TESTS
369
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
370
struct folio *locked_folio,
371
u64 *start, u64 *end)
372
{
373
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
374
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
375
const u64 orig_start = *start;
376
const u64 orig_end = *end;
377
u64 max_bytes = fs_info->max_extent_size;
378
u64 delalloc_start;
379
u64 delalloc_end;
380
bool found;
381
struct extent_state *cached_state = NULL;
382
int ret;
383
int loops = 0;
384
385
/* Caller should pass a valid @end to indicate the search range end */
386
ASSERT(orig_end > orig_start);
387
388
/* The range should at least cover part of the folio */
389
ASSERT(!(orig_start >= folio_next_pos(locked_folio) ||
390
orig_end <= folio_pos(locked_folio)));
391
again:
392
/* step one, find a bunch of delalloc bytes starting at start */
393
delalloc_start = *start;
394
delalloc_end = 0;
395
396
/*
397
* If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
398
* return early without handling any dirty ranges.
399
*/
400
ASSERT(max_bytes >= fs_info->sectorsize);
401
402
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
403
max_bytes, &cached_state);
404
if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
405
*start = delalloc_start;
406
407
/* @delalloc_end can be -1, never go beyond @orig_end */
408
*end = min(delalloc_end, orig_end);
409
btrfs_free_extent_state(cached_state);
410
return false;
411
}
412
413
/*
414
* start comes from the offset of locked_folio. We have to lock
415
* folios in order, so we can't process delalloc bytes before
416
* locked_folio
417
*/
418
if (delalloc_start < *start)
419
delalloc_start = *start;
420
421
/*
422
* make sure to limit the number of folios we try to lock down
423
*/
424
if (delalloc_end + 1 - delalloc_start > max_bytes)
425
delalloc_end = delalloc_start + max_bytes - 1;
426
427
/* step two, lock all the folios after the folio that has start */
428
ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
429
delalloc_end);
430
ASSERT(!ret || ret == -EAGAIN);
431
if (ret == -EAGAIN) {
432
/*
433
* Some of the folios are gone, lets avoid looping by
434
* shortening the size of the delalloc range we're searching.
435
*/
436
btrfs_free_extent_state(cached_state);
437
cached_state = NULL;
438
if (!loops) {
439
max_bytes = fs_info->sectorsize;
440
loops = 1;
441
goto again;
442
} else {
443
return false;
444
}
445
}
446
447
/* step three, lock the state bits for the whole range */
448
btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
449
450
/* then test to make sure it is all still delalloc */
451
ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
452
EXTENT_DELALLOC, cached_state);
453
454
btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
455
if (!ret) {
456
unlock_delalloc_folio(inode, locked_folio, delalloc_start,
457
delalloc_end);
458
cond_resched();
459
goto again;
460
}
461
*start = delalloc_start;
462
*end = delalloc_end;
463
464
return found;
465
}
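/*
 * Illustrative sketch (not part of the original file): the typical caller
 * pattern, mirroring writepage_delalloc() further below, walks a locked
 * folio and repeatedly asks for the next delalloc range:
 *
 *	u64 dstart = folio_pos(folio);
 *	const u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
 *
 *	while (dstart < page_end) {
 *		u64 dend = page_end;
 *
 *		if (!find_lock_delalloc_range(&inode->vfs_inode, folio,
 *					      &dstart, &dend)) {
 *			dstart = dend + 1;
 *			continue;
 *		}
 *		(range [dstart, dend] is delalloc, folios and extent bits locked)
 *		dstart = dend + 1;
 *	}
 */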
466
467
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
468
const struct folio *locked_folio,
469
struct extent_state **cached,
470
u32 clear_bits, unsigned long page_ops)
471
{
472
btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
473
474
__process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
475
end, page_ops);
476
}
477
478
static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len)
479
{
480
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
481
482
if (!fsverity_active(folio->mapping->host) ||
483
btrfs_folio_test_uptodate(fs_info, folio, start, len) ||
484
start >= i_size_read(folio->mapping->host))
485
return true;
486
return fsverity_verify_folio(folio);
487
}
488
489
static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len)
490
{
491
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
492
493
ASSERT(folio_pos(folio) <= start &&
494
start + len <= folio_next_pos(folio));
495
496
if (uptodate && btrfs_verify_folio(folio, start, len))
497
btrfs_folio_set_uptodate(fs_info, folio, start, len);
498
else
499
btrfs_folio_clear_uptodate(fs_info, folio, start, len);
500
501
if (!btrfs_is_subpage(fs_info, folio))
502
folio_unlock(folio);
503
else
504
btrfs_folio_end_lock(fs_info, folio, start, len);
505
}
506
507
/*
508
* After a write IO is done, we need to:
509
*
510
* - clear the uptodate bits on error
511
* - clear the writeback bits in the extent tree for the range
512
* - folio_end_writeback() if there is no more pending io for the folio
513
*
514
* Scheduling is not allowed, so the extent state tree is expected
515
* to have one and only one object corresponding to this IO.
516
*/
517
static void end_bbio_data_write(struct btrfs_bio *bbio)
518
{
519
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
520
struct bio *bio = &bbio->bio;
521
int error = blk_status_to_errno(bio->bi_status);
522
struct folio_iter fi;
523
const u32 sectorsize = fs_info->sectorsize;
524
525
ASSERT(!bio_flagged(bio, BIO_CLONED));
526
bio_for_each_folio_all(fi, bio) {
527
struct folio *folio = fi.folio;
528
u64 start = folio_pos(folio) + fi.offset;
529
u32 len = fi.length;
530
531
/* Our read/write should always be sector aligned. */
532
if (!IS_ALIGNED(fi.offset, sectorsize))
533
btrfs_err(fs_info,
534
"partial page write in btrfs with offset %zu and length %zu",
535
fi.offset, fi.length);
536
else if (!IS_ALIGNED(fi.length, sectorsize))
537
btrfs_info(fs_info,
538
"incomplete page write with offset %zu and length %zu",
539
fi.offset, fi.length);
540
541
btrfs_finish_ordered_extent(bbio->ordered, folio, start, len,
542
!error);
543
if (error)
544
mapping_set_error(folio->mapping, error);
545
btrfs_folio_clear_writeback(fs_info, folio, start, len);
546
}
547
548
bio_put(bio);
549
}
550
551
static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
552
{
553
ASSERT(folio_test_locked(folio));
554
if (!btrfs_is_subpage(fs_info, folio))
555
return;
556
557
ASSERT(folio_test_private(folio));
558
btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
559
}
560
561
/*
562
* After a data read IO is done, we need to:
563
*
564
* - clear the uptodate bits on error
565
* - set the uptodate bits if things worked
566
* - set the folio up to date if all extents in the tree are uptodate
567
* - clear the lock bit in the extent tree
568
* - unlock the folio if there are no other extents locked for it
569
*
570
* Scheduling is not allowed, so the extent state tree is expected
571
* to have one and only one object corresponding to this IO.
572
*/
573
static void end_bbio_data_read(struct btrfs_bio *bbio)
574
{
575
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
576
struct bio *bio = &bbio->bio;
577
struct folio_iter fi;
578
579
ASSERT(!bio_flagged(bio, BIO_CLONED));
580
bio_for_each_folio_all(fi, &bbio->bio) {
581
bool uptodate = !bio->bi_status;
582
struct folio *folio = fi.folio;
583
struct inode *inode = folio->mapping->host;
584
u64 start = folio_pos(folio) + fi.offset;
585
586
btrfs_debug(fs_info,
587
"%s: bi_sector=%llu, err=%d, mirror=%u",
588
__func__, bio->bi_iter.bi_sector, bio->bi_status,
589
bbio->mirror_num);
590
591
592
if (likely(uptodate)) {
593
u64 end = start + fi.length - 1;
594
loff_t i_size = i_size_read(inode);
595
596
/*
597
* Zero out the remaining part if this range straddles
598
* i_size.
599
*
600
* Here we should only zero the range inside the folio,
601
* not touch anything else.
602
*
603
* NOTE: i_size is exclusive while end is inclusive and
604
* folio_contains() takes PAGE_SIZE units.
605
*/
606
if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
607
i_size <= end) {
608
u32 zero_start = max(offset_in_folio(folio, i_size),
609
offset_in_folio(folio, start));
610
u32 zero_len = offset_in_folio(folio, end) + 1 -
611
zero_start;
612
613
folio_zero_range(folio, zero_start, zero_len);
614
}
615
}
616
617
/* Update page status and unlock. */
618
end_folio_read(folio, uptodate, start, fi.length);
619
}
620
bio_put(bio);
621
}
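/*
 * Worked example for the i_size zeroing above (illustrative numbers only):
 * with a 16K folio at file offset 0, a bvec covering bytes [8192, 16383]
 * and i_size == 10000, the folio contains the i_size page and
 * i_size <= end, so:
 *
 *	zero_start = max(offset_in_folio(folio, 10000),
 *			 offset_in_folio(folio, 8192)) = 10000
 *	zero_len   = offset_in_folio(folio, 16383) + 1 - 10000 = 6384
 *
 * i.e. the stale bytes in [10000, 16383] are zeroed before the range is
 * marked uptodate.
 */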
622
623
/*
624
* Populate every free slot in a provided array with folios using GFP_NOFS.
625
*
626
* @nr_folios: number of folios to allocate
627
* @order: the order of the folios to be allocated
628
* @folio_array: the array to fill with folios; any existing non-NULL entries in
629
* the array will be skipped
630
*
631
* Return: 0 if all folios were able to be allocated;
632
* -ENOMEM otherwise, the partially allocated folios would be freed and
633
* the array slots zeroed
634
*/
635
int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
636
struct folio **folio_array)
637
{
638
for (int i = 0; i < nr_folios; i++) {
639
if (folio_array[i])
640
continue;
641
folio_array[i] = folio_alloc(GFP_NOFS, order);
642
if (!folio_array[i])
643
goto error;
644
}
645
return 0;
646
error:
647
for (int i = 0; i < nr_folios; i++) {
648
if (folio_array[i])
649
folio_put(folio_array[i]);
650
folio_array[i] = NULL;
651
}
652
return -ENOMEM;
653
}
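/*
 * Illustrative sketch (not part of the original file): a caller that wants
 * a handful of order-0 folios and releases them again; the count of 4 is
 * arbitrary for the example:
 *
 *	struct folio *folios[4] = { 0 };
 *
 *	if (btrfs_alloc_folio_array(4, 0, folios))
 *		return -ENOMEM;		(array slots are already zeroed)
 *	...
 *	for (int i = 0; i < 4; i++)
 *		folio_put(folios[i]);
 */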
654
655
/*
656
* Populate every free slot in a provided array with pages, using GFP_NOFS.
657
*
658
* @nr_pages: number of pages to allocate
659
* @page_array: the array to fill with pages; any existing non-null entries in
660
* the array will be skipped
661
* @nofail: whether using __GFP_NOFAIL flag
662
*
663
* Return: 0 if all pages were able to be allocated;
664
* -ENOMEM otherwise, the partially allocated pages would be freed and
665
* the array slots zeroed
666
*/
667
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
668
bool nofail)
669
{
670
const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
671
unsigned int allocated;
672
673
for (allocated = 0; allocated < nr_pages;) {
674
unsigned int last = allocated;
675
676
allocated = alloc_pages_bulk(gfp, nr_pages, page_array);
677
if (unlikely(allocated == last)) {
678
/* No progress, fail and do cleanup. */
679
for (int i = 0; i < allocated; i++) {
680
__free_page(page_array[i]);
681
page_array[i] = NULL;
682
}
683
return -ENOMEM;
684
}
685
}
686
return 0;
687
}
688
689
/*
690
* Populate needed folios for the extent buffer.
691
*
692
* For now, the folios populated are always in order 0 (aka, single page).
693
*/
694
static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
695
{
696
struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
697
int num_pages = num_extent_pages(eb);
698
int ret;
699
700
ret = btrfs_alloc_page_array(num_pages, page_array, nofail);
701
if (ret < 0)
702
return ret;
703
704
for (int i = 0; i < num_pages; i++)
705
eb->folios[i] = page_folio(page_array[i]);
706
eb->folio_size = PAGE_SIZE;
707
eb->folio_shift = PAGE_SHIFT;
708
return 0;
709
}
710
711
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
712
u64 disk_bytenr, loff_t file_offset)
713
{
714
struct bio *bio = &bio_ctrl->bbio->bio;
715
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
716
717
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
718
/*
719
* For compression, all IO should have its logical bytenr set
720
* to the starting bytenr of the compressed extent.
721
*/
722
return bio->bi_iter.bi_sector == sector;
723
}
724
725
/*
726
* To merge into a bio both the disk sector and the logical offset in
727
* the file need to be contiguous.
728
*/
729
return bio_ctrl->next_file_offset == file_offset &&
730
bio_end_sector(bio) == sector;
731
}
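/*
 * Worked example for the contiguity check above (illustrative numbers):
 * if the current bio ends at disk byte 1 MiB (bio_end_sector() == 2048)
 * and bio_ctrl->next_file_offset == 64K, then an uncompressed extent with
 * disk_bytenr == 1 MiB and file_offset == 64K can be merged, while one
 * with disk_bytenr == 1 MiB but file_offset == 128K cannot, even though
 * it is physically contiguous on disk.
 */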
732
733
static void alloc_new_bio(struct btrfs_inode *inode,
734
struct btrfs_bio_ctrl *bio_ctrl,
735
u64 disk_bytenr, u64 file_offset)
736
{
737
struct btrfs_fs_info *fs_info = inode->root->fs_info;
738
struct btrfs_bio *bbio;
739
740
bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode,
741
file_offset, bio_ctrl->end_io_func, NULL);
742
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
743
bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint;
744
bio_ctrl->bbio = bbio;
745
bio_ctrl->len_to_oe_boundary = U32_MAX;
746
bio_ctrl->next_file_offset = file_offset;
747
748
/* Limit data write bios to the ordered boundary. */
749
if (bio_ctrl->wbc) {
750
struct btrfs_ordered_extent *ordered;
751
752
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
753
if (ordered) {
754
bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
755
ordered->file_offset +
756
ordered->disk_num_bytes - file_offset);
757
bbio->ordered = ordered;
758
}
759
760
/*
761
* Pick the last added device to support cgroup writeback. For
762
* multi-device file systems this means blk-cgroup policies have
763
* to always be set on the last added/replaced device.
764
* This is a bit odd but has been like that for a long time.
765
*/
766
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
767
wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
768
}
769
}
770
771
/*
772
* @disk_bytenr: logical bytenr where the write will be
773
* @folio: folio to add to the bio
774
* @size: portion of the folio that we want to write to
775
* @pg_offset: offset of the new bio or to check whether we are adding
776
* a contiguous folio to the previous one
777
* @read_em_generation: generation of the extent_map we are submitting
778
* (only used for read)
779
*
780
* This will either add the folio into the existing @bio_ctrl->bbio, or allocate a
781
* new one in @bio_ctrl->bbio.
782
* The mirror number for this IO should already be initialized in
783
* @bio_ctrl->mirror_num.
784
*/
785
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
786
u64 disk_bytenr, struct folio *folio,
787
size_t size, unsigned long pg_offset,
788
u64 read_em_generation)
789
{
790
struct btrfs_inode *inode = folio_to_inode(folio);
791
loff_t file_offset = folio_pos(folio) + pg_offset;
792
793
ASSERT(pg_offset + size <= folio_size(folio));
794
ASSERT(bio_ctrl->end_io_func);
795
796
if (bio_ctrl->bbio &&
797
!btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
798
submit_one_bio(bio_ctrl);
799
800
do {
801
u32 len = size;
802
803
/* Allocate new bio if needed */
804
if (!bio_ctrl->bbio)
805
alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
806
807
/* Cap to the current ordered extent boundary if there is one. */
808
if (len > bio_ctrl->len_to_oe_boundary) {
809
ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
810
ASSERT(is_data_inode(inode));
811
len = bio_ctrl->len_to_oe_boundary;
812
}
813
814
if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) {
815
/* bio full: move on to a new one */
816
submit_one_bio(bio_ctrl);
817
continue;
818
}
819
/*
820
* Now that the folio is definitely added to the bio, include its
821
* generation in the max generation calculation.
822
*/
823
bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);
824
bio_ctrl->next_file_offset += len;
825
826
if (bio_ctrl->wbc)
827
wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len);
828
829
size -= len;
830
pg_offset += len;
831
disk_bytenr += len;
832
file_offset += len;
833
834
/*
835
* len_to_oe_boundary defaults to U32_MAX, which isn't folio or
836
* sector aligned. alloc_new_bio() then sets it to the end of
837
* our ordered extent for writes into zoned devices.
838
*
839
* When len_to_oe_boundary is tracking an ordered extent, we
840
* trust the ordered extent code to align things properly, and
841
* the check above to cap our write to the ordered extent
842
* boundary is correct.
843
*
844
* When len_to_oe_boundary is U32_MAX, the cap above would
845
* result in a 4095 byte IO for the last folio right before
846
* we hit the bio limit of UINT_MAX. bio_add_folio() has all
847
* the checks required to make sure we don't overflow the bio,
848
* and we should just ignore len_to_oe_boundary completely
849
* unless we're using it to track an ordered extent.
850
*
851
* It's pretty hard to make a bio sized U32_MAX, but it can
852
* happen when the page cache is able to feed us contiguous
853
* folios for large extents.
854
*/
855
if (bio_ctrl->len_to_oe_boundary != U32_MAX)
856
bio_ctrl->len_to_oe_boundary -= len;
857
858
/* Ordered extent boundary: move on to a new bio. */
859
if (bio_ctrl->len_to_oe_boundary == 0)
860
submit_one_bio(bio_ctrl);
861
} while (size);
862
}
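/*
 * Worked example for the loop above (illustrative numbers): a 16K data
 * write that starts 4K before its ordered extent boundary
 * (len_to_oe_boundary == 4K) is split in two: the first 4K is added,
 * len_to_oe_boundary drops to 0 and the bio is submitted, then a new bio
 * is allocated with a fresh boundary from the next ordered extent and the
 * remaining 12K is added to it.
 */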
863
864
static int attach_extent_buffer_folio(struct extent_buffer *eb,
865
struct folio *folio,
866
struct btrfs_folio_state *prealloc)
867
{
868
struct btrfs_fs_info *fs_info = eb->fs_info;
869
int ret = 0;
870
871
/*
872
* If the page is mapped to btree inode, we should hold the private
873
* lock to prevent race.
874
* For cloned or dummy extent buffers, their pages are not mapped and
875
* will not race with any other ebs.
876
*/
877
if (folio->mapping)
878
lockdep_assert_held(&folio->mapping->i_private_lock);
879
880
if (!btrfs_meta_is_subpage(fs_info)) {
881
if (!folio_test_private(folio))
882
folio_attach_private(folio, eb);
883
else
884
WARN_ON(folio_get_private(folio) != eb);
885
return 0;
886
}
887
888
/* Already mapped, just free prealloc */
889
if (folio_test_private(folio)) {
890
btrfs_free_folio_state(prealloc);
891
return 0;
892
}
893
894
if (prealloc)
895
/* Has preallocated memory for subpage */
896
folio_attach_private(folio, prealloc);
897
else
898
/* Do new allocation to attach subpage */
899
ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
900
return ret;
901
}
902
903
int set_folio_extent_mapped(struct folio *folio)
904
{
905
struct btrfs_fs_info *fs_info;
906
907
ASSERT(folio->mapping);
908
909
if (folio_test_private(folio))
910
return 0;
911
912
fs_info = folio_to_fs_info(folio);
913
914
if (btrfs_is_subpage(fs_info, folio))
915
return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
916
917
folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
918
return 0;
919
}
920
921
void clear_folio_extent_mapped(struct folio *folio)
922
{
923
struct btrfs_fs_info *fs_info;
924
925
ASSERT(folio->mapping);
926
927
if (!folio_test_private(folio))
928
return;
929
930
fs_info = folio_to_fs_info(folio);
931
if (btrfs_is_subpage(fs_info, folio))
932
return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
933
934
folio_detach_private(folio);
935
}
936
937
static struct extent_map *get_extent_map(struct btrfs_inode *inode,
938
struct folio *folio, u64 start,
939
u64 len, struct extent_map **em_cached)
940
{
941
struct extent_map *em;
942
943
ASSERT(em_cached);
944
945
if (*em_cached) {
946
em = *em_cached;
947
if (btrfs_extent_map_in_tree(em) && start >= em->start &&
948
start < btrfs_extent_map_end(em)) {
949
refcount_inc(&em->refs);
950
return em;
951
}
952
953
btrfs_free_extent_map(em);
954
*em_cached = NULL;
955
}
956
957
em = btrfs_get_extent(inode, folio, start, len);
958
if (!IS_ERR(em)) {
959
BUG_ON(*em_cached);
960
refcount_inc(&em->refs);
961
*em_cached = em;
962
}
963
964
return em;
965
}
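/*
 * Illustrative sketch (not part of the original file): each returned
 * extent map carries its own reference, and the caller also owns the
 * cached one, as btrfs_do_readpage() and btrfs_read_folio() below show:
 *
 *	struct extent_map *em_cached = NULL;
 *
 *	em = get_extent_map(BTRFS_I(inode), folio, cur, len, &em_cached);
 *	(use em, then btrfs_free_extent_map(em))
 *	...
 *	btrfs_free_extent_map(em_cached);
 */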
966
967
static void btrfs_readahead_expand(struct readahead_control *ractl,
968
const struct extent_map *em)
969
{
970
const u64 ra_pos = readahead_pos(ractl);
971
const u64 ra_end = ra_pos + readahead_length(ractl);
972
const u64 em_end = btrfs_extent_map_end(em);
973
974
/* No expansion for holes and inline extents. */
975
if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
976
return;
977
978
ASSERT(em_end >= ra_pos,
979
"extent_map %llu %llu ends before current readahead position %llu",
980
em->start, em->len, ra_pos);
981
if (em_end > ra_end)
982
readahead_expand(ractl, ra_pos, em_end - ra_pos);
983
}
984
985
/*
986
* basic readpage implementation. Locked extent state structs are inserted
987
* into the tree that are removed when the IO is done (by the end_io
988
* handlers)
989
* XXX JDM: This needs looking at to ensure proper page locking
990
* return 0 on success, otherwise return error
991
*/
992
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
993
struct btrfs_bio_ctrl *bio_ctrl)
994
{
995
struct inode *inode = folio->mapping->host;
996
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
997
u64 start = folio_pos(folio);
998
const u64 end = start + folio_size(folio) - 1;
999
u64 extent_offset;
1000
u64 locked_end;
1001
u64 last_byte = i_size_read(inode);
1002
struct extent_map *em;
1003
int ret = 0;
1004
const size_t blocksize = fs_info->sectorsize;
1005
1006
if (bio_ctrl->ractl)
1007
locked_end = readahead_pos(bio_ctrl->ractl) + readahead_length(bio_ctrl->ractl) - 1;
1008
else
1009
locked_end = end;
1010
1011
ret = set_folio_extent_mapped(folio);
1012
if (ret < 0) {
1013
folio_unlock(folio);
1014
return ret;
1015
}
1016
1017
if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
1018
size_t zero_offset = offset_in_folio(folio, last_byte);
1019
1020
if (zero_offset)
1021
folio_zero_range(folio, zero_offset,
1022
folio_size(folio) - zero_offset);
1023
}
1024
bio_ctrl->end_io_func = end_bbio_data_read;
1025
begin_folio_read(fs_info, folio);
1026
for (u64 cur = start; cur <= end; cur += blocksize) {
1027
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
1028
unsigned long pg_offset = offset_in_folio(folio, cur);
1029
bool force_bio_submit = false;
1030
u64 disk_bytenr;
1031
u64 block_start;
1032
u64 em_gen;
1033
1034
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
1035
if (cur >= last_byte) {
1036
folio_zero_range(folio, pg_offset, end - cur + 1);
1037
end_folio_read(folio, true, cur, end - cur + 1);
1038
break;
1039
}
1040
if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
1041
end_folio_read(folio, true, cur, blocksize);
1042
continue;
1043
}
1044
/*
1045
* Search extent map for the whole locked range.
1046
* This will allow btrfs_get_extent() to return a larger hole
1047
* when possible.
1048
* This can reduce duplicated btrfs_get_extent() calls for large
1049
* holes.
1050
*/
1051
em = get_extent_map(BTRFS_I(inode), folio, cur, locked_end - cur + 1, em_cached);
1052
if (IS_ERR(em)) {
1053
end_folio_read(folio, false, cur, end + 1 - cur);
1054
return PTR_ERR(em);
1055
}
1056
extent_offset = cur - em->start;
1057
BUG_ON(btrfs_extent_map_end(em) <= cur);
1058
BUG_ON(end < cur);
1059
1060
compress_type = btrfs_extent_map_compression(em);
1061
1062
/*
1063
* Only expand readahead for extents which are already creating
1064
* the pages anyway in add_ra_bio_pages, which is compressed
1065
* extents in the non subpage case.
1066
*/
1067
if (bio_ctrl->ractl &&
1068
!btrfs_is_subpage(fs_info, folio) &&
1069
compress_type != BTRFS_COMPRESS_NONE)
1070
btrfs_readahead_expand(bio_ctrl->ractl, em);
1071
1072
if (compress_type != BTRFS_COMPRESS_NONE)
1073
disk_bytenr = em->disk_bytenr;
1074
else
1075
disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
1076
1077
if (em->flags & EXTENT_FLAG_PREALLOC)
1078
block_start = EXTENT_MAP_HOLE;
1079
else
1080
block_start = btrfs_extent_map_block_start(em);
1081
1082
/*
1083
* If we have a file range that points to a compressed extent
1084
* and it's followed by a consecutive file range that points
1085
* to the same compressed extent (possibly with a different
1086
* offset and/or length, so it either points to the whole extent
1087
* or only part of it), we must make sure we do not submit a
1088
* single bio to populate the folios for the 2 ranges because
1089
* this makes the compressed extent read zero out the folios
1090
* belonging to the 2nd range. Imagine the following scenario:
1091
*
1092
* File layout
1093
* [0 - 8K] [8K - 24K]
1094
* | |
1095
* | |
1096
* points to extent X, points to extent X,
1097
* offset 4K, length of 8K offset 0, length 16K
1098
*
1099
* [extent X, compressed length = 4K uncompressed length = 16K]
1100
*
1101
* If the bio to read the compressed extent covers both ranges,
1102
* it will decompress extent X into the folios belonging to the
1103
* first range and then it will stop, zeroing out the remaining
1104
* folios that belong to the other range that points to extent X.
1105
* So here we make sure we submit 2 bios, one for the first
1106
* range and another one for the second range. Both will target
1107
* the same physical extent from disk, but we can't currently
1108
* make the compressed bio endio callback populate the folios
1109
* for both ranges because each compressed bio is tightly
1110
* coupled with a single extent map, and each range can have
1111
* an extent map with a different offset value relative to the
1112
* uncompressed data of our extent and different lengths. This
1113
* is a corner case so we prioritize correctness over
1114
* non-optimal behavior (submitting 2 bios for the same extent).
1115
*/
1116
if (compress_type != BTRFS_COMPRESS_NONE &&
1117
bio_ctrl->last_em_start != U64_MAX &&
1118
bio_ctrl->last_em_start != em->start)
1119
force_bio_submit = true;
1120
1121
bio_ctrl->last_em_start = em->start;
1122
1123
em_gen = em->generation;
1124
btrfs_free_extent_map(em);
1125
em = NULL;
1126
1127
/* we've found a hole, just zero and go on */
1128
if (block_start == EXTENT_MAP_HOLE) {
1129
folio_zero_range(folio, pg_offset, blocksize);
1130
end_folio_read(folio, true, cur, blocksize);
1131
continue;
1132
}
1133
/* the get_extent function already copied into the folio */
1134
if (block_start == EXTENT_MAP_INLINE) {
1135
end_folio_read(folio, true, cur, blocksize);
1136
continue;
1137
}
1138
1139
if (bio_ctrl->compress_type != compress_type) {
1140
submit_one_bio(bio_ctrl);
1141
bio_ctrl->compress_type = compress_type;
1142
}
1143
1144
if (force_bio_submit)
1145
submit_one_bio(bio_ctrl);
1146
submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
1147
pg_offset, em_gen);
1148
}
1149
return 0;
1150
}
1151
1152
/*
1153
* Check if we can skip waiting for the @ordered extent covering the block at @fileoff.
1154
*
1155
* @fileoff: Both input and output.
1156
* Input as the file offset where the check should start at.
1157
* Output as where the next check should start at,
1158
* if the function returns true.
1159
*
1160
* Return true if we can skip to @fileoff. The caller needs to check the new
1161
* @fileoff value to make sure it covers the full range, before skipping the
1162
* full OE.
1163
*
1164
* Return false if we must wait for the ordered extent.
1165
*/
1166
static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
1167
struct btrfs_ordered_extent *ordered,
1168
u64 *fileoff)
1169
{
1170
const struct btrfs_fs_info *fs_info = inode->root->fs_info;
1171
struct folio *folio;
1172
const u32 blocksize = fs_info->sectorsize;
1173
u64 cur = *fileoff;
1174
bool ret;
1175
1176
folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);
1177
1178
/*
1179
* We should have locked the folio(s) for range [start, end], thus
1180
* there must be a folio and it must be locked.
1181
*/
1182
ASSERT(!IS_ERR(folio));
1183
ASSERT(folio_test_locked(folio));
1184
1185
/*
1186
* There are several cases for the folio and OE combination:
1187
*
1188
* 1) Folio has no private flag
1189
* The OE has all its IO done but not yet finished, and folio got
1190
* invalidated.
1191
*
1192
* Hence we have to wait for the OE to finish, as it may contain the
1193
* to-be-inserted data checksum.
1194
* Without the data checksum inserted into the csum tree, read will
1195
* just fail with missing csum.
1196
*/
1197
if (!folio_test_private(folio)) {
1198
ret = false;
1199
goto out;
1200
}
1201
1202
/*
1203
* 2) The first block is DIRTY.
1204
*
1205
* This means the OE is created by some other folios whose file pos is
1206
* before this one. And since we are holding the folio lock, the writeback
1207
* of this folio cannot start.
1208
*
1209
* We must skip the whole OE, because it will never start until we
1210
* finished our folio read and unlocked the folio.
1211
*/
1212
if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
1213
u64 range_len = umin(folio_next_pos(folio),
1214
ordered->file_offset + ordered->num_bytes) - cur;
1215
1216
ret = true;
1217
/*
1218
* At least inside the folio, all the remaining blocks should
1219
* also be dirty.
1220
*/
1221
ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
1222
*fileoff = ordered->file_offset + ordered->num_bytes;
1223
goto out;
1224
}
1225
1226
/*
1227
* 3) The first block is uptodate.
1228
*
1229
* At least the first block can be skipped, but we are still not fully
1230
* sure. E.g. if the OE has some other folios in the range that cannot
1231
* be skipped.
1232
* So we return true and update @fileoff to the OE/folio boundary.
1233
*/
1234
if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
1235
u64 range_len = umin(folio_next_pos(folio),
1236
ordered->file_offset + ordered->num_bytes) - cur;
1237
1238
/*
1239
* The whole range to the OE end or folio boundary should also
1240
* be uptodate.
1241
*/
1242
ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
1243
ret = true;
1244
*fileoff = cur + range_len;
1245
goto out;
1246
}
1247
1248
/*
1249
* 4) The first block is not uptodate.
1250
*
1251
* This means the folio is invalidated after the writeback was finished,
1252
* but some other operation (e.g. a block aligned buffered write) has
1253
* re-inserted the folio into the filemap.
1254
* Very much the same as case 1).
1255
*/
1256
ret = false;
1257
out:
1258
folio_put(folio);
1259
return ret;
1260
}
1261
1262
static bool can_skip_ordered_extent(struct btrfs_inode *inode,
1263
struct btrfs_ordered_extent *ordered,
1264
u64 start, u64 end)
1265
{
1266
const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
1267
u64 cur = max(start, ordered->file_offset);
1268
1269
while (cur < range_end) {
1270
bool can_skip;
1271
1272
can_skip = can_skip_one_ordered_range(inode, ordered, &cur);
1273
if (!can_skip)
1274
return false;
1275
}
1276
return true;
1277
}
1278
1279
/*
1280
* Locking helper to make sure we get a stable view of extent maps for the
1281
* involved range.
1282
*
1283
* This is for folio read paths (read and readahead), thus the involved range
1284
* should have all the folios locked.
1285
*/
1286
static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
1287
struct extent_state **cached_state)
1288
{
1289
u64 cur_pos;
1290
1291
/* Caller must provide a valid @cached_state. */
1292
ASSERT(cached_state);
1293
1294
/* The range must at least be page aligned, as all read paths are folio based. */
1295
ASSERT(IS_ALIGNED(start, PAGE_SIZE));
1296
ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
1297
1298
again:
1299
btrfs_lock_extent(&inode->io_tree, start, end, cached_state);
1300
cur_pos = start;
1301
while (cur_pos < end) {
1302
struct btrfs_ordered_extent *ordered;
1303
1304
ordered = btrfs_lookup_ordered_range(inode, cur_pos,
1305
end - cur_pos + 1);
1306
/*
1307
* No ordered extents in the range, and we hold the extent lock,
1308
* no one can modify the extent maps in the range, so we're safe to return.
1309
*/
1310
if (!ordered)
1311
break;
1312
1313
/* Check if we can skip waiting for the whole OE. */
1314
if (can_skip_ordered_extent(inode, ordered, start, end)) {
1315
cur_pos = min(ordered->file_offset + ordered->num_bytes,
1316
end + 1);
1317
btrfs_put_ordered_extent(ordered);
1318
continue;
1319
}
1320
1321
/* Now wait for the OE to finish. */
1322
btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
1323
btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
1324
btrfs_put_ordered_extent(ordered);
1325
/* We have unlocked the whole range, restart from the beginning. */
1326
goto again;
1327
}
1328
}
1329
1330
int btrfs_read_folio(struct file *file, struct folio *folio)
1331
{
1332
struct btrfs_inode *inode = folio_to_inode(folio);
1333
const u64 start = folio_pos(folio);
1334
const u64 end = start + folio_size(folio) - 1;
1335
struct extent_state *cached_state = NULL;
1336
struct btrfs_bio_ctrl bio_ctrl = {
1337
.opf = REQ_OP_READ,
1338
.last_em_start = U64_MAX,
1339
};
1340
struct extent_map *em_cached = NULL;
1341
int ret;
1342
1343
lock_extents_for_read(inode, start, end, &cached_state);
1344
ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
1345
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
1346
1347
btrfs_free_extent_map(em_cached);
1348
1349
/*
1350
* If btrfs_do_readpage() failed we will want to submit the assembled
1351
* bio to do the cleanup.
1352
*/
1353
submit_one_bio(&bio_ctrl);
1354
return ret;
1355
}
1356
1357
static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap,
1358
u64 start, u32 len)
1359
{
1360
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
1361
const u64 folio_start = folio_pos(folio);
1362
unsigned int start_bit;
1363
unsigned int nbits;
1364
1365
ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
1366
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
1367
nbits = len >> fs_info->sectorsize_bits;
1368
ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
1369
bitmap_set(delalloc_bitmap, start_bit, nbits);
1370
}
1371
1372
static bool find_next_delalloc_bitmap(struct folio *folio,
1373
unsigned long *delalloc_bitmap, u64 start,
1374
u64 *found_start, u32 *found_len)
1375
{
1376
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
1377
const u64 folio_start = folio_pos(folio);
1378
const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio);
1379
unsigned int start_bit;
1380
unsigned int first_zero;
1381
unsigned int first_set;
1382
1383
ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
1384
1385
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
1386
first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
1387
if (first_set >= bitmap_size)
1388
return false;
1389
1390
*found_start = folio_start + (first_set << fs_info->sectorsize_bits);
1391
first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set);
1392
*found_len = (first_zero - first_set) << fs_info->sectorsize_bits;
1393
return true;
1394
}
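/*
 * Worked example for the two bitmap helpers above (illustrative numbers):
 * with a 4K sectorsize and a 16K folio there are 4 bits per folio.  If the
 * second and third blocks are delalloc, set_delalloc_bitmap() sets bits
 * 1-2, and find_next_delalloc_bitmap(folio, &bitmap, folio_pos(folio), ...)
 * then reports found_start == folio_pos(folio) + 4K and found_len == 8K.
 */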
1395
1396
/*
1397
* Do all of the delayed allocation setup.
1398
*
1399
* Return >0 if all the dirty blocks are submitted async (compression) or inlined.
1400
* The @folio should no longer be touched (treat it as already unlocked).
1401
*
1402
* Return 0 if there is still dirty block that needs to be submitted through
1403
* extent_writepage_io().
1404
* bio_ctrl->submit_bitmap will indicate which blocks of the folio should be
1405
* submitted, and @folio is still kept locked.
1406
*
1407
* Return <0 if there is any error hit.
1408
* Any allocated ordered extent range covering this folio will be marked
1409
* finished (IOERR), and @folio is still kept locked.
1410
*/
1411
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
1412
struct folio *folio,
1413
struct btrfs_bio_ctrl *bio_ctrl)
1414
{
1415
struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
1416
struct writeback_control *wbc = bio_ctrl->wbc;
1417
const bool is_subpage = btrfs_is_subpage(fs_info, folio);
1418
const u64 page_start = folio_pos(folio);
1419
const u64 page_end = page_start + folio_size(folio) - 1;
1420
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
1421
unsigned long delalloc_bitmap = 0;
1422
/*
1423
* Save the last found delalloc end. As the delalloc end can go beyond
1424
* page boundary, thus we cannot rely on subpage bitmap to locate the
1425
* last delalloc end.
1426
*/
1427
u64 last_delalloc_end = 0;
1428
/*
1429
* The range end (exclusive) of the last successfully finished delalloc
1430
* range.
1431
* Any range covered by ordered extent must either be manually marked
1432
* finished (error handling), or has IO submitted (and finish the
1433
* ordered extent normally).
1434
*
1435
* This records the end of ordered extent cleanup if we hit an error.
1436
*/
1437
u64 last_finished_delalloc_end = page_start;
1438
u64 delalloc_start = page_start;
1439
u64 delalloc_end = page_end;
1440
u64 delalloc_to_write = 0;
1441
unsigned int start_bit;
1442
unsigned int end_bit;
1443
int ret = 0;
1444
1445
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
1446
if (btrfs_is_subpage(fs_info, folio)) {
1447
ASSERT(blocks_per_folio > 1);
1448
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
1449
} else {
1450
bio_ctrl->submit_bitmap = 1;
1451
}
1452
1453
for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap,
1454
blocks_per_folio) {
1455
u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
1456
u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;
1457
1458
btrfs_folio_set_lock(fs_info, folio, start, len);
1459
}
1460
1461
/* Lock all (subpage) delalloc ranges inside the folio first. */
1462
while (delalloc_start < page_end) {
1463
delalloc_end = page_end;
1464
if (!find_lock_delalloc_range(&inode->vfs_inode, folio,
1465
&delalloc_start, &delalloc_end)) {
1466
delalloc_start = delalloc_end + 1;
1467
continue;
1468
}
1469
set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
1470
min(delalloc_end, page_end) + 1 - delalloc_start);
1471
last_delalloc_end = delalloc_end;
1472
delalloc_start = delalloc_end + 1;
1473
}
1474
delalloc_start = page_start;
1475
1476
if (!last_delalloc_end)
1477
goto out;
1478
1479
/* Run the delalloc ranges for the above locked ranges. */
1480
while (delalloc_start < page_end) {
1481
u64 found_start;
1482
u32 found_len;
1483
bool found;
1484
1485
if (!is_subpage) {
1486
/*
1487
* For non-subpage case, the found delalloc range must
1488
* cover this folio and there must be only one locked
1489
* delalloc range.
1490
*/
1491
found_start = page_start;
1492
found_len = last_delalloc_end + 1 - found_start;
1493
found = true;
1494
} else {
1495
found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
1496
delalloc_start, &found_start, &found_len);
1497
}
1498
if (!found)
1499
break;
1500
/*
1501
* If the subpage range covers the last sector, the delalloc range may
1502
* end beyond the folio boundary, so use the saved delalloc_end
1503
* instead.
1504
*/
1505
if (found_start + found_len >= page_end)
1506
found_len = last_delalloc_end + 1 - found_start;
1507
1508
if (ret >= 0) {
1509
/*
1510
* Some delalloc range may be created by previous folios.
1511
* Thus we still need to clean up this range during error
1512
* handling.
1513
*/
1514
last_finished_delalloc_end = found_start;
1515
/* No errors hit so far, run the current delalloc range. */
1516
ret = btrfs_run_delalloc_range(inode, folio,
1517
found_start,
1518
found_start + found_len - 1,
1519
wbc);
1520
if (ret >= 0)
1521
last_finished_delalloc_end = found_start + found_len;
1522
if (unlikely(ret < 0))
1523
btrfs_err_rl(fs_info,
1524
"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
1525
btrfs_root_id(inode->root),
1526
btrfs_ino(inode),
1527
folio_pos(folio),
1528
blocks_per_folio,
1529
&bio_ctrl->submit_bitmap,
1530
found_start, found_len, ret);
1531
} else {
1532
/*
1533
* We've hit an error during previous delalloc range,
1534
* have to cleanup the remaining locked ranges.
1535
*/
1536
btrfs_unlock_extent(&inode->io_tree, found_start,
1537
found_start + found_len - 1, NULL);
1538
unlock_delalloc_folio(&inode->vfs_inode, folio,
1539
found_start,
1540
found_start + found_len - 1);
1541
}
1542
1543
/*
1544
* We have some ranges that's going to be submitted asynchronously
1545
* (compression or inline). These ranges have their own control
1546
* on when to unlock the pages. We should not touch them
1547
* anymore, so clear the range from the submission bitmap.
1548
*/
1549
if (ret > 0) {
1550
unsigned int start_bit = (found_start - page_start) >>
1551
fs_info->sectorsize_bits;
1552
unsigned int end_bit = (min(page_end + 1, found_start + found_len) -
1553
page_start) >> fs_info->sectorsize_bits;
1554
bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
1555
}
1556
/*
1557
* Above btrfs_run_delalloc_range() may have unlocked the folio,
1558
* thus for the last range, we cannot touch the folio anymore.
1559
*/
1560
if (found_start + found_len >= last_delalloc_end + 1)
1561
break;
1562
1563
delalloc_start = found_start + found_len;
1564
}
1565
/*
1566
* It's possible we had some ordered extents created before we hit
1567
* an error, cleanup non-async successfully created delalloc ranges.
1568
*/
1569
if (unlikely(ret < 0)) {
1570
unsigned int bitmap_size = min(
1571
(last_finished_delalloc_end - page_start) >>
1572
fs_info->sectorsize_bits,
1573
blocks_per_folio);
1574
1575
for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap,
1576
bitmap_size) {
1577
u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
1578
u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;
1579
1580
btrfs_mark_ordered_io_finished(inode, folio, start, len, false);
1581
}
1582
return ret;
1583
}
1584
out:
1585
if (last_delalloc_end)
1586
delalloc_end = last_delalloc_end;
1587
else
1588
delalloc_end = page_end;
1589
/*
1590
* delalloc_end is already one less than the total length, so
1591
* we don't subtract one from PAGE_SIZE.
1592
*/
1593
delalloc_to_write +=
1594
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
1595
1596
/*
1597
* If all ranges are submitted asynchronously, we just need to account
1598
* for them here.
1599
*/
1600
if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
1601
wbc->nr_to_write -= delalloc_to_write;
1602
return 1;
1603
}
1604
1605
if (wbc->nr_to_write < delalloc_to_write) {
1606
int thresh = 8192;
1607
1608
if (delalloc_to_write < thresh * 2)
1609
thresh = delalloc_to_write;
1610
wbc->nr_to_write = min_t(u64, delalloc_to_write,
1611
thresh);
1612
}
1613
1614
return 0;
1615
}
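/*
 * Illustrative sketch (not part of the original file, local names are
 * hypothetical): how a caller is expected to act on the return values
 * documented above:
 *
 *	ret = writepage_delalloc(inode, folio, bio_ctrl);
 *	if (ret < 0)
 *		goto error;	(ordered extent ranges already cleaned up)
 *	if (ret > 0)
 *		return 0;	(all async/inline, treat folio as unlocked)
 *	(ret == 0: write the blocks left in bio_ctrl->submit_bitmap via
 *	 extent_writepage_io())
 */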
1616
1617
/*
1618
* Return 0 if we have submitted or queued the sector for submission.
1619
* Return <0 for critical errors, and the involved sector will be cleaned up.
1620
*
1621
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
1622
*/
1623
static int submit_one_sector(struct btrfs_inode *inode,
1624
struct folio *folio,
1625
u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
1626
loff_t i_size)
1627
{
1628
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1629
struct extent_map *em;
1630
u64 block_start;
1631
u64 disk_bytenr;
1632
u64 extent_offset;
1633
u64 em_end;
1634
const u32 sectorsize = fs_info->sectorsize;
1635
1636
ASSERT(IS_ALIGNED(filepos, sectorsize));
1637
1638
/* @filepos >= i_size case should be handled by the caller. */
1639
ASSERT(filepos < i_size);
1640
1641
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
1642
if (IS_ERR(em)) {
1643
/*
1644
* bio_ctrl may contain a bio crossing several folios.
1645
* Submit it immediately so that the bio has a chance
1646
* to finish normally, other than marked as error.
1647
*/
1648
submit_one_bio(bio_ctrl);
1649
1650
/*
1651
* When submission failed, we should still clear the folio dirty.
1652
* Otherwise the folio will be written back again but without any
1653
* ordered extent.
1654
*/
1655
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
1656
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
1657
btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
1658
1659
/*
1660
* Since there is no bio submitted to finish the ordered
1661
* extent, we have to manually finish this sector.
1662
*/
1663
btrfs_mark_ordered_io_finished(inode, folio, filepos,
1664
fs_info->sectorsize, false);
1665
return PTR_ERR(em);
1666
}
1667
1668
extent_offset = filepos - em->start;
1669
em_end = btrfs_extent_map_end(em);
1670
ASSERT(filepos <= em_end);
1671
ASSERT(IS_ALIGNED(em->start, sectorsize));
1672
ASSERT(IS_ALIGNED(em->len, sectorsize));
1673
1674
block_start = btrfs_extent_map_block_start(em);
1675
disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
1676
1677
ASSERT(!btrfs_extent_map_is_compressed(em));
1678
ASSERT(block_start != EXTENT_MAP_HOLE);
1679
ASSERT(block_start != EXTENT_MAP_INLINE);
1680
1681
btrfs_free_extent_map(em);
1682
em = NULL;
1683
1684
/*
1685
* Although the PageDirty bit is cleared before entering this
1686
* function, subpage dirty bit is not cleared.
1687
* So clear subpage dirty bit here so next time we won't submit
1688
* a folio for a range already written to disk.
1689
*/
1690
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
1691
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
1692
/*
1693
* Above call should set the whole folio with writeback flag, even
1694
* just for a single subpage sector.
1695
* As long as the folio is properly locked and the range is correct,
1696
* we should always get the folio with writeback flag.
1697
*/
1698
ASSERT(folio_test_writeback(folio));
1699
1700
submit_extent_folio(bio_ctrl, disk_bytenr, folio,
1701
sectorsize, filepos - folio_pos(folio), 0);
1702
return 0;
1703
}
1704
1705
/*
1706
* Helper for extent_writepage(). This calls the writepage start hooks,
1707
* and does the loop to map the page into extents and bios.
1708
*
1709
* We return 1 if the IO is started and the page is unlocked,
1710
* 0 if all went well (page still locked)
1711
* < 0 if there were errors (page still locked)
1712
*/
1713
static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
1714
struct folio *folio,
1715
u64 start, u32 len,
1716
struct btrfs_bio_ctrl *bio_ctrl,
1717
loff_t i_size)
1718
{
1719
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1720
unsigned long range_bitmap = 0;
1721
bool submitted_io = false;
1722
int found_error = 0;
1723
const u64 end = start + len;
1724
const u64 folio_start = folio_pos(folio);
1725
const u64 folio_end = folio_start + folio_size(folio);
1726
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
1727
u64 cur;
1728
int bit;
1729
int ret = 0;
1730
1731
ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start);
1732
ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu",
1733
start, len, folio_start, folio_size(folio));
1734
1735
ret = btrfs_writepage_cow_fixup(folio);
1736
if (ret == -EAGAIN) {
1737
/* Fixup worker will requeue */
1738
folio_redirty_for_writepage(bio_ctrl->wbc, folio);
1739
folio_unlock(folio);
1740
return 1;
1741
}
1742
if (ret < 0) {
1743
btrfs_folio_clear_dirty(fs_info, folio, start, len);
1744
btrfs_folio_set_writeback(fs_info, folio, start, len);
1745
btrfs_folio_clear_writeback(fs_info, folio, start, len);
1746
return ret;
1747
}
1748
1749
bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits,
1750
len >> fs_info->sectorsize_bits);
1751
bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
1752
blocks_per_folio);
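/*
* Each bit of submit_bitmap stands for one block of the folio. For
* example, with a 64K folio and 4K blocks there are 16 bits; if
* [start, start + len) covers blocks 4-7 then range_bitmap has bits
* 4-7 set, and the AND above limits submission to exactly that
* sub-range of the folio.
*/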
1753
1754
bio_ctrl->end_io_func = end_bbio_data_write;
1755
1756
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
1757
cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
1758
1759
if (cur >= i_size) {
1760
struct btrfs_ordered_extent *ordered;
1761
1762
ordered = btrfs_lookup_first_ordered_range(inode, cur,
1763
fs_info->sectorsize);
1764
/*
1765
* We have just run delalloc before getting here, so
1766
* there must be an ordered extent.
1767
*/
1768
ASSERT(ordered != NULL);
1769
spin_lock(&inode->ordered_tree_lock);
1770
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
1771
ordered->truncated_len = min(ordered->truncated_len,
1772
cur - ordered->file_offset);
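/*
* Example: for an ordered extent covering [8K, 24K) with i_size at
* 10K, the block at 8K (which contains EOF) is still submitted, the
* first block past EOF is at 12K, and truncated_len is clamped to
* 12K - 8K = 4K so that completion of the ordered extent is limited
* to the part before the truncation point.
*/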
1773
spin_unlock(&inode->ordered_tree_lock);
1774
btrfs_put_ordered_extent(ordered);
1775
1776
btrfs_mark_ordered_io_finished(inode, folio, cur,
1777
fs_info->sectorsize, true);
1778
/*
1779
* This range is beyond i_size, thus we don't need to bother
* writing it back.
* But we still need to clear the subpage dirty bit, or the
* next time the folio gets dirtied we will try to write back
* the sectors with stale subpage dirty bits, causing writeback
* without an ordered extent.
1785
*/
1786
btrfs_folio_clear_dirty(fs_info, folio, cur, fs_info->sectorsize);
1787
continue;
1788
}
1789
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
1790
if (unlikely(ret < 0)) {
1791
if (!found_error)
1792
found_error = ret;
1793
continue;
1794
}
1795
submitted_io = true;
1796
}
1797
1798
/*
1799
* If we didn't submit any sector (all sectors >= i_size), the folio
* dirty flag gets cleared but PAGECACHE_TAG_DIRTY is not (it is only
* cleared by folio_start_writeback() if the folio is not dirty).
*
* Here we set writeback and immediately clear it for the range. If
* the full folio is no longer dirty then we clear the
* PAGECACHE_TAG_DIRTY tag.
*
* If we hit any error, the corresponding sector will have its dirty
* flag cleared and writeback finished, thus no need to handle the error case.
1808
*/
1809
if (!submitted_io && !found_error) {
1810
btrfs_folio_set_writeback(fs_info, folio, start, len);
1811
btrfs_folio_clear_writeback(fs_info, folio, start, len);
1812
}
1813
return found_error;
1814
}
1815
1816
/*
1817
* The writepage semantics are similar to regular writepage. Extent
* records are inserted to lock ranges in the tree, and as dirty areas
* are found they are marked for writeback. Then the lock bits are
* removed and the end_io handler clears the writeback ranges.
1821
*
1822
* Return 0 if everything goes well.
1823
* Return <0 for error.
1824
*/
1825
static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
1826
{
1827
struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
1828
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1829
int ret;
1830
size_t pg_offset;
1831
loff_t i_size = i_size_read(&inode->vfs_inode);
1832
const pgoff_t end_index = i_size >> PAGE_SHIFT;
1833
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
1834
1835
trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
1836
1837
WARN_ON(!folio_test_locked(folio));
1838
1839
pg_offset = offset_in_folio(folio, i_size);
1840
if (folio->index > end_index ||
1841
(folio->index == end_index && !pg_offset)) {
1842
folio_invalidate(folio, 0, folio_size(folio));
1843
folio_unlock(folio);
1844
return 0;
1845
}
1846
1847
if (folio_contains(folio, end_index))
1848
folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
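/*
* Zero the part of the folio past i_size so that stale in-memory data
* beyond EOF is never written to disk. For example, with 4K pages and
* i_size == 10K, the folio containing EOF starts at 8K, pg_offset is
* 2K and the range [2K, folio_size) of that folio is zeroed.
*/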
1849
1850
/*
1851
* Default to unlocking the whole folio.
* The proper bitmap can only be initialized by writepage_delalloc().
1853
*/
1854
bio_ctrl->submit_bitmap = (unsigned long)-1;
1855
1856
/*
1857
* If the folio is dirty but has no private set, it was marked dirty
* without informing the fs.
* Nowadays that is a bug, since the introduction of
* pin_user_pages*().
*
* So here we check whether the folio has private set to rule out
* such a case.
* But we also have a long history of relying on the COW fixup, so
* here we only enable this check for experimental builds until
* we're sure it's safe.
1867
*/
1868
if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
1869
unlikely(!folio_test_private(folio))) {
1870
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
1871
btrfs_err_rl(fs_info,
1872
"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
1873
btrfs_root_id(inode->root),
1874
btrfs_ino(inode), folio_pos(folio));
1875
ret = -EUCLEAN;
1876
goto done;
1877
}
1878
1879
ret = set_folio_extent_mapped(folio);
1880
if (ret < 0)
1881
goto done;
1882
1883
ret = writepage_delalloc(inode, folio, bio_ctrl);
1884
if (ret == 1)
1885
return 0;
1886
if (ret)
1887
goto done;
1888
1889
ret = extent_writepage_io(inode, folio, folio_pos(folio),
1890
folio_size(folio), bio_ctrl, i_size);
1891
if (ret == 1)
1892
return 0;
1893
if (unlikely(ret < 0))
1894
btrfs_err_rl(fs_info,
1895
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
1896
btrfs_root_id(inode->root), btrfs_ino(inode),
1897
folio_pos(folio), blocks_per_folio,
1898
&bio_ctrl->submit_bitmap, ret);
1899
1900
bio_ctrl->wbc->nr_to_write--;
1901
1902
done:
1903
if (ret < 0)
1904
mapping_set_error(folio->mapping, ret);
1905
/*
1906
* Only unlock the ranges that were submitted here, as there can be
* ranges inside the folio that were submitted asynchronously.
1908
*/
1909
btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
1910
ASSERT(ret <= 0);
1911
return ret;
1912
}
1913
1914
/*
1915
* Lock extent buffer status and pages for writeback.
1916
*
1917
* Return %false if the extent buffer doesn't need to be submitted (e.g. the
* extent buffer is not dirty).
* Return %true if the extent buffer is submitted to a bio.
1920
*/
1921
static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb,
1922
struct writeback_control *wbc)
1923
{
1924
struct btrfs_fs_info *fs_info = eb->fs_info;
1925
bool ret = false;
1926
1927
btrfs_tree_lock(eb);
1928
while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
1929
btrfs_tree_unlock(eb);
1930
if (wbc->sync_mode != WB_SYNC_ALL)
1931
return false;
1932
wait_on_extent_buffer_writeback(eb);
1933
btrfs_tree_lock(eb);
1934
}
1935
1936
/*
1937
* We need to do this to prevent races with anyone checking whether the
* eb is under IO, since we can end up having no IO bits set for a
* short period of time.
1940
*/
1941
spin_lock(&eb->refs_lock);
1942
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
1943
XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
1944
unsigned long flags;
1945
1946
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
1947
spin_unlock(&eb->refs_lock);
1948
1949
xas_lock_irqsave(&xas, flags);
1950
xas_load(&xas);
1951
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
1952
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
1953
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
1954
xas_unlock_irqrestore(&xas, flags);
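/*
* The buffer_tree xarray mirrors the page cache tags for extent
* buffers: flipping the marks from DIRTY/TOWRITE to WRITEBACK under
* the xa lock keeps writeback walkers such as buffer_tree_get_ebs_tag()
* from seeing a half-updated state, similar to what the page cache
* does for data folios when writeback starts.
*/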
1955
1956
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
1957
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
1958
-eb->len,
1959
fs_info->dirty_metadata_batch);
1960
ret = true;
1961
} else {
1962
spin_unlock(&eb->refs_lock);
1963
}
1964
btrfs_tree_unlock(eb);
1965
return ret;
1966
}
1967
1968
static void set_btree_ioerr(struct extent_buffer *eb)
1969
{
1970
struct btrfs_fs_info *fs_info = eb->fs_info;
1971
1972
set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
1973
1974
/*
1975
* A read may stumble upon this buffer later; make sure that it gets an
* error and knows there was a write error.
1977
*/
1978
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
1979
1980
/*
1981
* We need to set the IO error on the mapping as well, because a write
* error flips the file system read-only, and once read-only syncfs()
* would return 0 unless we bump the error sequence for the
* superblock's mapping.
1985
*/
1986
mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO);
1987
1988
/*
1989
* If writeback for a btree extent that doesn't belong to a log tree
1990
* failed, increment the counter transaction->eb_write_errors.
1991
* We do this because while the transaction is running and before it's
1992
* committing (when we call filemap_fdata[write|wait]_range against
1993
* the btree inode), we might have
1994
* btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
1995
* returns an error or an error happens during writeback, when we're
1996
* committing the transaction we wouldn't know about it, since the pages
* might no longer be dirty nor marked for writeback (if a subsequent
* modification to the extent buffer didn't happen before the
* transaction commit), which makes filemap_fdata[write|wait]_range
* unable to find the pages which contain errors at transaction
2001
* commit time. So if this happens we must abort the transaction,
2002
* otherwise we commit a super block with btree roots that point to
2003
* btree nodes/leafs whose content on disk is invalid - either garbage
2004
* or the content of some node/leaf from a past generation that got
2005
* cowed or deleted and is no longer valid.
2006
*
2007
* Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
2008
* not be enough - we need to distinguish between log tree extents vs
2009
* non-log tree extents, and the next filemap_fdatawait_range() call
2010
* will catch and clear such errors in the mapping - and that call might
2011
* be from a log sync and not from a transaction commit. Also, checking
2012
* for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
2013
* not done and would not be reliable - the eb might have been released
2014
* from memory and reading it back again means that flag would not be
2015
* set (since it's a runtime flag, not persisted on disk).
2016
*
2017
* Using the flags below in the btree inode also covers the case where
* writepages() returns success after having started writeback for all
* dirty pages, but by the time filemap_fdatawait_range() is called that
* writeback has already finished with errors - without AS_EIO/AS_ENOSPC
* filemap_fdatawait_range() would return success, as it could not know
* that writeback errors happened (the pages were no longer tagged for
* writeback).
2025
*/
2026
switch (eb->log_index) {
2027
case -1:
2028
set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
2029
break;
2030
case 0:
2031
set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
2032
break;
2033
case 1:
2034
set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
2035
break;
2036
default:
2037
BUG(); /* unexpected, logic error */
2038
}
2039
}
2040
2041
static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
2042
{
2043
struct btrfs_fs_info *fs_info = eb->fs_info;
2044
XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
2045
unsigned long flags;
2046
2047
xas_lock_irqsave(&xas, flags);
2048
xas_load(&xas);
2049
xas_set_mark(&xas, mark);
2050
xas_unlock_irqrestore(&xas, flags);
2051
}
2052
2053
static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
2054
{
2055
struct btrfs_fs_info *fs_info = eb->fs_info;
2056
XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
2057
unsigned long flags;
2058
2059
xas_lock_irqsave(&xas, flags);
2060
xas_load(&xas);
2061
xas_clear_mark(&xas, mark);
2062
xas_unlock_irqrestore(&xas, flags);
2063
}
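/*
* Tag all currently DIRTY extent buffers in the index range
* [start, end] with TOWRITE, so that a data integrity writeback
* (WB_SYNC_ALL) only processes buffers that were dirty when it began
* and cannot livelock behind buffers dirtied while it runs. This is
* the buffer_tree counterpart of tag_pages_for_writeback().
*/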
static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info,
2066
unsigned long start, unsigned long end)
2067
{
2068
XA_STATE(xas, &fs_info->buffer_tree, start);
2069
unsigned int tagged = 0;
2070
void *eb;
2071
2072
xas_lock_irq(&xas);
2073
xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) {
2074
xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
2075
if (++tagged % XA_CHECK_SCHED)
2076
continue;
2077
xas_pause(&xas);
2078
xas_unlock_irq(&xas);
2079
cond_resched();
2080
xas_lock_irq(&xas);
2081
}
2082
xas_unlock_irq(&xas);
2083
}
2084
2085
struct eb_batch {
2086
unsigned int nr;
2087
unsigned int cur;
2088
struct extent_buffer *ebs[PAGEVEC_SIZE];
2089
};
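/*
* Gathers up to PAGEVEC_SIZE extent buffers per xarray walk so the
* callers can process them outside the RCU read section and then drop
* all the references in one go, mirroring how a folio_batch is used
* for regular page cache folios.
*/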
2090
2091
static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb)
2092
{
2093
batch->ebs[batch->nr++] = eb;
2094
return (batch->nr < PAGEVEC_SIZE);
2095
}
2096
2097
static inline void eb_batch_init(struct eb_batch *batch)
2098
{
2099
batch->nr = 0;
2100
batch->cur = 0;
2101
}
2102
2103
static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch)
2104
{
2105
if (batch->cur >= batch->nr)
2106
return NULL;
2107
return batch->ebs[batch->cur++];
2108
}
2109
2110
static inline void eb_batch_release(struct eb_batch *batch)
2111
{
2112
for (unsigned int i = 0; i < batch->nr; i++)
2113
free_extent_buffer(batch->ebs[i]);
2114
eb_batch_init(batch);
2115
}
2116
2117
static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max,
2118
xa_mark_t mark)
2119
{
2120
struct extent_buffer *eb;
2121
2122
retry:
2123
eb = xas_find_marked(xas, max, mark);
2124
2125
if (xas_retry(xas, eb))
2126
goto retry;
2127
2128
if (!eb)
2129
return NULL;
2130
2131
if (!refcount_inc_not_zero(&eb->refs)) {
2132
xas_reset(xas);
2133
goto retry;
2134
}
2135
2136
if (unlikely(eb != xas_reload(xas))) {
2137
free_extent_buffer(eb);
2138
xas_reset(xas);
2139
goto retry;
2140
}
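/*
* Typical RCU lookup pattern: find a marked entry, try to grab a
* reference with refcount_inc_not_zero(), then re-check with
* xas_reload() in case the buffer was freed and the slot reused while
* we were taking the reference. Only an eb that passes both checks is
* returned.
*/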
2141
2142
return eb;
2143
}
2144
2145
static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
2146
unsigned long *start,
2147
unsigned long end, xa_mark_t tag,
2148
struct eb_batch *batch)
2149
{
2150
XA_STATE(xas, &fs_info->buffer_tree, *start);
2151
struct extent_buffer *eb;
2152
2153
rcu_read_lock();
2154
while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
2155
if (!eb_batch_add(batch, eb)) {
2156
*start = ((eb->start + eb->len) >> fs_info->nodesize_bits);
2157
goto out;
2158
}
2159
}
2160
if (end == ULONG_MAX)
2161
*start = ULONG_MAX;
2162
else
2163
*start = end + 1;
2164
out:
2165
rcu_read_unlock();
2166
2167
return batch->nr;
2168
}
2169
2170
/*
2171
* The endio specific version which won't touch any unsafe spinlock in endio
2172
* context.
2173
*/
2174
static struct extent_buffer *find_extent_buffer_nolock(
2175
struct btrfs_fs_info *fs_info, u64 start)
2176
{
2177
struct extent_buffer *eb;
2178
unsigned long index = (start >> fs_info->nodesize_bits);
2179
2180
rcu_read_lock();
2181
eb = xa_load(&fs_info->buffer_tree, index);
2182
if (eb && !refcount_inc_not_zero(&eb->refs))
2183
eb = NULL;
2184
rcu_read_unlock();
2185
return eb;
2186
}
2187
2188
static void end_bbio_meta_write(struct btrfs_bio *bbio)
2189
{
2190
struct extent_buffer *eb = bbio->private;
2191
struct folio_iter fi;
2192
2193
if (bbio->bio.bi_status != BLK_STS_OK)
2194
set_btree_ioerr(eb);
2195
2196
bio_for_each_folio_all(fi, &bbio->bio) {
2197
btrfs_meta_folio_clear_writeback(fi.folio, eb);
2198
}
2199
2200
buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK);
2201
clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
2202
bio_put(&bbio->bio);
2203
}
2204
2205
static void prepare_eb_write(struct extent_buffer *eb)
2206
{
2207
u32 nritems;
2208
unsigned long start;
2209
unsigned long end;
2210
2211
clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
2212
2213
/* Zero the unused areas beyond nritems to avoid writing out stale content. */
2214
nritems = btrfs_header_nritems(eb);
2215
if (btrfs_header_level(eb) > 0) {
2216
end = btrfs_node_key_ptr_offset(eb, nritems);
2217
memzero_extent_buffer(eb, end, eb->len - end);
2218
} else {
2219
/*
2220
* Leaf:
2221
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
2222
*/
2223
start = btrfs_item_nr_offset(eb, nritems);
2224
end = btrfs_item_nr_offset(eb, 0);
2225
if (nritems == 0)
2226
end += BTRFS_LEAF_DATA_SIZE(eb->fs_info);
2227
else
2228
end += btrfs_item_offset(eb, nritems - 1);
2229
memzero_extent_buffer(eb, start, end - start);
2230
}
2231
}
2232
2233
static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
2234
struct writeback_control *wbc)
2235
{
2236
struct btrfs_fs_info *fs_info = eb->fs_info;
2237
struct btrfs_bio *bbio;
2238
2239
prepare_eb_write(eb);
2240
2241
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
2242
REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
2243
BTRFS_I(fs_info->btree_inode), eb->start,
2244
end_bbio_meta_write, eb);
2245
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
2246
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
2247
wbc_init_bio(wbc, &bbio->bio);
2248
for (int i = 0; i < num_extent_folios(eb); i++) {
2249
struct folio *folio = eb->folios[i];
2250
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
2251
u32 range_len = min_t(u64, folio_next_pos(folio),
2252
eb->start + eb->len) - range_start;
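/*
* Clamp the range to the part of the eb that lives in this folio. For
* example, a 16K nodesize eb at 48K inside a 64K folio gives
* range_start == 48K and range_len == 16K, while an eb spanning
* multiple folios gets one bio segment added per folio.
*/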
2253
2254
folio_lock(folio);
2255
btrfs_meta_folio_clear_dirty(folio, eb);
2256
btrfs_meta_folio_set_writeback(folio, eb);
2257
if (!folio_test_dirty(folio))
2258
wbc->nr_to_write -= folio_nr_pages(folio);
2259
bio_add_folio_nofail(&bbio->bio, folio, range_len,
2260
offset_in_folio(folio, range_start));
2261
wbc_account_cgroup_owner(wbc, folio, range_len);
2262
folio_unlock(folio);
2263
}
2264
/*
2265
* If the fs is already in error status, do not submit any writeback
2266
* but immediately finish it.
2267
*/
2268
if (unlikely(BTRFS_FS_ERROR(fs_info))) {
2269
btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info)));
2270
return;
2271
}
2272
btrfs_submit_bbio(bbio, 0);
2273
}
2274
2275
/*
2276
* Wait for all eb writeback in the given range to finish.
2277
*
2278
* @fs_info: The fs_info for this file system.
2279
* @start: The offset of the range to start waiting on writeback.
2280
* @end: The end of the range, inclusive. This is meant to be used in
2281
* conjunction with wait_marked_extents, so this will usually be
2282
* the_next_eb->start - 1.
2283
*/
2284
void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
2285
u64 end)
2286
{
2287
struct eb_batch batch;
2288
unsigned long start_index = (start >> fs_info->nodesize_bits);
2289
unsigned long end_index = (end >> fs_info->nodesize_bits);
2290
2291
eb_batch_init(&batch);
2292
while (start_index <= end_index) {
2293
struct extent_buffer *eb;
2294
unsigned int nr_ebs;
2295
2296
nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index,
2297
PAGECACHE_TAG_WRITEBACK, &batch);
2298
if (!nr_ebs)
2299
break;
2300
2301
while ((eb = eb_batch_next(&batch)) != NULL)
2302
wait_on_extent_buffer_writeback(eb);
2303
eb_batch_release(&batch);
2304
cond_resched();
2305
}
2306
}
2307
2308
int btree_writepages(struct address_space *mapping, struct writeback_control *wbc)
2309
{
2310
struct btrfs_eb_write_context ctx = { .wbc = wbc };
2311
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
2312
int ret = 0;
2313
int done = 0;
2314
int nr_to_write_done = 0;
2315
struct eb_batch batch;
2316
unsigned int nr_ebs;
2317
unsigned long index;
2318
unsigned long end;
2319
int scanned = 0;
2320
xa_mark_t tag;
2321
2322
eb_batch_init(&batch);
2323
if (wbc->range_cyclic) {
2324
index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits);
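/*
* mapping->writeback_index is in units of pages; shift it to a byte
* offset first and then down to an extent buffer index, since the
* buffer_tree xarray is indexed by eb->start >> nodesize_bits.
*/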
2325
end = -1;
2326
2327
/*
2328
* When starting from the beginning there is no need to wrap
* around the range later, so mark it as scanned.
2330
*/
2331
scanned = (index == 0);
2332
} else {
2333
index = (wbc->range_start >> fs_info->nodesize_bits);
2334
end = (wbc->range_end >> fs_info->nodesize_bits);
2335
2336
scanned = 1;
2337
}
2338
if (wbc->sync_mode == WB_SYNC_ALL)
2339
tag = PAGECACHE_TAG_TOWRITE;
2340
else
2341
tag = PAGECACHE_TAG_DIRTY;
2342
btrfs_zoned_meta_io_lock(fs_info);
2343
retry:
2344
if (wbc->sync_mode == WB_SYNC_ALL)
2345
buffer_tree_tag_for_writeback(fs_info, index, end);
2346
while (!done && !nr_to_write_done && (index <= end) &&
2347
(nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) {
2348
struct extent_buffer *eb;
2349
2350
while ((eb = eb_batch_next(&batch)) != NULL) {
2351
ctx.eb = eb;
2352
2353
ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx);
2354
if (ret) {
2355
if (ret == -EBUSY)
2356
ret = 0;
2357
2358
if (ret) {
2359
done = 1;
2360
break;
2361
}
2362
continue;
2363
}
2364
2365
if (!lock_extent_buffer_for_io(eb, wbc))
2366
continue;
2367
2368
/* Implies write in zoned mode. */
2369
if (ctx.zoned_bg) {
2370
/* Mark the last eb in the block group. */
2371
btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb);
2372
ctx.zoned_bg->meta_write_pointer += eb->len;
2373
}
2374
write_one_eb(eb, wbc);
2375
}
2376
nr_to_write_done = (wbc->nr_to_write <= 0);
2377
eb_batch_release(&batch);
2378
cond_resched();
2379
}
2380
if (!scanned && !done) {
2381
/*
2382
* We hit the last page and there is more work to be done: wrap
2383
* back to the start of the file
2384
*/
2385
scanned = 1;
2386
index = 0;
2387
goto retry;
2388
}
2389
/*
2390
* If something went wrong, don't allow any metadata write bio to be
2391
* submitted.
2392
*
2393
* This would prevent use-after-free if we had dirty pages not
* cleaned up, which can still happen with fuzzed images.
*
* - Bad extent tree
* Allowing an existing tree block to be allocated to other trees.
*
* - Log tree operations
* Existing tree blocks get allocated to the log tree, which bumps
* their generation, then they get cleaned in tree re-balance.
* Such a tree block will not be written back, since it's clean,
* thus no WRITTEN flag is set.
* And after the log is written back, this tree block is not traced by
* any dirty extent_io_tree.
*
* - Offending tree block gets re-dirtied from its original owner
* Since it has a bumped generation and no WRITTEN flag, it can be
* reused without COWing. This tree block will not be traced
* by btrfs_transaction::dirty_pages.
*
* Now such a dirty tree block will not be cleaned by any dirty
* extent io tree. Thus we don't want to submit such a wild eb
* if the fs already has errors.
2415
*
2416
* We can get ret > 0 from submit_extent_folio() indicating how many ebs
2417
* were submitted. Reset it to 0 to avoid false alerts for the caller.
2418
*/
2419
if (ret > 0)
2420
ret = 0;
2421
if (!ret && BTRFS_FS_ERROR(fs_info))
2422
ret = -EROFS;
2423
2424
if (ctx.zoned_bg)
2425
btrfs_put_block_group(ctx.zoned_bg);
2426
btrfs_zoned_meta_io_unlock(fs_info);
2427
return ret;
2428
}
2429
2430
/*
2431
* Walk the list of dirty pages of the given address space and write all of them.
2432
*
2433
* @mapping: address space structure to write
2434
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
2435
* @bio_ctrl: holds context for the write, namely the bio
2436
*
2437
* If a page is already under I/O, write_cache_pages() skips it, even
2438
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2439
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2440
* and msync() need to guarantee that all the data which was dirty at the time
2441
* the call was made get new I/O started against them. If wbc->sync_mode is
2442
* WB_SYNC_ALL then we were called for data integrity and we must wait for
2443
* existing IO to complete.
2444
*/
2445
static int extent_write_cache_pages(struct address_space *mapping,
2446
struct btrfs_bio_ctrl *bio_ctrl)
2447
{
2448
struct writeback_control *wbc = bio_ctrl->wbc;
2449
struct inode *inode = mapping->host;
2450
int ret = 0;
2451
int done = 0;
2452
int nr_to_write_done = 0;
2453
struct folio_batch fbatch;
2454
unsigned int nr_folios;
2455
pgoff_t index;
2456
pgoff_t end; /* Inclusive */
2457
pgoff_t done_index;
2458
int range_whole = 0;
2459
int scanned = 0;
2460
xa_mark_t tag;
2461
2462
/*
2463
* We have to hold onto the inode so that ordered extents can do their
2464
* work when the IO finishes. The alternative to this is failing to add
2465
* an ordered extent if the igrab() fails there and that is a huge pain
2466
* to deal with, so instead just hold onto the inode throughout the
2467
* writepages operation. If it fails here we are freeing up the inode
2468
* anyway and we'd rather not waste our time writing out stuff that is
2469
* going to be truncated anyway.
2470
*/
2471
if (!igrab(inode))
2472
return 0;
2473
2474
folio_batch_init(&fbatch);
2475
if (wbc->range_cyclic) {
2476
index = mapping->writeback_index; /* Start from prev offset */
2477
end = -1;
2478
/*
2479
* When starting from the beginning there is no need to wrap
* around the range later, so mark it as scanned.
2481
*/
2482
scanned = (index == 0);
2483
} else {
2484
index = wbc->range_start >> PAGE_SHIFT;
2485
end = wbc->range_end >> PAGE_SHIFT;
2486
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2487
range_whole = 1;
2488
scanned = 1;
2489
}
2490
2491
/*
2492
* We do the tagged writepage as long as the snapshot flush bit is set
2493
* and we are the first one to do the filemap_flush() on this inode.
2494
*
2495
* The nr_to_write == LONG_MAX is needed to make sure other flushers do
2496
* not race in and drop the bit.
2497
*/
2498
if (range_whole && wbc->nr_to_write == LONG_MAX &&
2499
test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
2500
&BTRFS_I(inode)->runtime_flags))
2501
wbc->tagged_writepages = 1;
2502
2503
tag = wbc_to_tag(wbc);
2504
retry:
2505
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2506
tag_pages_for_writeback(mapping, index, end);
2507
done_index = index;
2508
while (!done && !nr_to_write_done && (index <= end) &&
2509
(nr_folios = filemap_get_folios_tag(mapping, &index,
2510
end, tag, &fbatch))) {
2511
unsigned i;
2512
2513
for (i = 0; i < nr_folios; i++) {
2514
struct folio *folio = fbatch.folios[i];
2515
2516
done_index = folio_next_index(folio);
2517
/*
2518
* At this point we hold neither the i_pages lock nor
2519
* the folio lock: the folio may be truncated or
2520
* invalidated (changing folio->mapping to NULL).
2521
*/
2522
if (!folio_trylock(folio)) {
2523
submit_write_bio(bio_ctrl, 0);
2524
folio_lock(folio);
2525
}
2526
2527
if (unlikely(folio->mapping != mapping)) {
2528
folio_unlock(folio);
2529
continue;
2530
}
2531
2532
if (!folio_test_dirty(folio)) {
2533
/* Someone wrote it for us. */
2534
folio_unlock(folio);
2535
continue;
2536
}
2537
2538
/*
2539
* For subpage case, compression can lead to mixed
2540
* writeback and dirty flags, e.g:
2541
* 0 32K 64K 96K 128K
2542
* | |//////||/////| |//|
2543
*
2544
* In the above case, [32K, 96K) is asynchronously submitted
* for compression, and [124K, 128K) needs to be written back.
*
* If we didn't wait for writeback on the page at 64K, the range
* [124K, 128K) wouldn't be submitted, as the page still has the
* writeback flag and would be skipped in the next check.
2550
*
2551
* This mixed writeback and dirty case is only possible for
2552
* subpage case.
2553
*
2554
* TODO: Remove this check after migrating compression to
2555
* regular submission.
2556
*/
2557
if (wbc->sync_mode != WB_SYNC_NONE ||
2558
btrfs_is_subpage(inode_to_fs_info(inode), folio)) {
2559
if (folio_test_writeback(folio))
2560
submit_write_bio(bio_ctrl, 0);
2561
folio_wait_writeback(folio);
2562
}
2563
2564
if (folio_test_writeback(folio) ||
2565
!folio_clear_dirty_for_io(folio)) {
2566
folio_unlock(folio);
2567
continue;
2568
}
2569
2570
ret = extent_writepage(folio, bio_ctrl);
2571
if (ret < 0) {
2572
done = 1;
2573
break;
2574
}
2575
2576
/*
2577
* The filesystem may choose to bump up nr_to_write.
2578
* We have to make sure to honor the new nr_to_write
2579
* at any time.
2580
*/
2581
nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
2582
wbc->nr_to_write <= 0);
2583
}
2584
folio_batch_release(&fbatch);
2585
cond_resched();
2586
}
2587
if (!scanned && !done) {
2588
/*
2589
* We hit the last page and there is more work to be done: wrap
2590
* back to the start of the file
2591
*/
2592
scanned = 1;
2593
index = 0;
2594
2595
/*
2596
* If we're looping we could run into a page that is locked by a
2597
* writer and that writer could be waiting on writeback for a
2598
* page in our current bio, and thus deadlock, so flush the
2599
* write bio here.
2600
*/
2601
submit_write_bio(bio_ctrl, 0);
2602
goto retry;
2603
}
2604
2605
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
2606
mapping->writeback_index = done_index;
2607
2608
btrfs_add_delayed_iput(BTRFS_I(inode));
2609
return ret;
2610
}
2611
2612
/*
2613
* Submit the pages in the range to bio for call sites where the delalloc range
* has already been run (aka, ordered extent inserted) and all pages are still
* locked.
2616
*/
2617
void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio,
2618
u64 start, u64 end, struct writeback_control *wbc,
2619
bool pages_dirty)
2620
{
2621
bool found_error = false;
2622
int ret = 0;
2623
struct address_space *mapping = inode->i_mapping;
2624
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2625
const u32 sectorsize = fs_info->sectorsize;
2626
loff_t i_size = i_size_read(inode);
2627
u64 cur = start;
2628
struct btrfs_bio_ctrl bio_ctrl = {
2629
.wbc = wbc,
2630
.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
2631
};
2632
2633
if (wbc->no_cgroup_owner)
2634
bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT;
2635
2636
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
2637
2638
while (cur <= end) {
2639
u64 cur_end;
2640
u32 cur_len;
2641
struct folio *folio;
2642
2643
folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
2644
2645
/*
2646
* This shouldn't happen, the pages are pinned and locked, this
2647
* code is just in case, but shouldn't actually be run.
2648
*/
2649
if (IS_ERR(folio)) {
2650
cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
2651
cur_len = cur_end + 1 - cur;
2652
btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
2653
cur, cur_len, false);
2654
mapping_set_error(mapping, PTR_ERR(folio));
2655
cur = cur_end;
2656
continue;
2657
}
2658
2659
cur_end = min_t(u64, folio_next_pos(folio) - 1, end);
2660
cur_len = cur_end + 1 - cur;
2661
2662
ASSERT(folio_test_locked(folio));
2663
if (pages_dirty && folio != locked_folio)
2664
ASSERT(folio_test_dirty(folio));
2665
2666
/*
2667
* Set the submission bitmap to submit all sectors.
2668
* extent_writepage_io() will do the truncation correctly.
2669
*/
2670
bio_ctrl.submit_bitmap = (unsigned long)-1;
2671
ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len,
2672
&bio_ctrl, i_size);
2673
if (ret == 1)
2674
goto next_page;
2675
2676
if (ret)
2677
mapping_set_error(mapping, ret);
2678
btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
2679
if (ret < 0)
2680
found_error = true;
2681
next_page:
2682
folio_put(folio);
2683
cur = cur_end + 1;
2684
}
2685
2686
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
2687
}
2688
2689
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
2690
{
2691
struct inode *inode = mapping->host;
2692
int ret = 0;
2693
struct btrfs_bio_ctrl bio_ctrl = {
2694
.wbc = wbc,
2695
.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
2696
};
2697
2698
/*
2699
* Allow only a single thread to do the reloc work in zoned mode to
2700
* protect the write pointer updates.
2701
*/
2702
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
2703
ret = extent_write_cache_pages(mapping, &bio_ctrl);
2704
submit_write_bio(&bio_ctrl, ret);
2705
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
2706
return ret;
2707
}
2708
2709
void btrfs_readahead(struct readahead_control *rac)
2710
{
2711
struct btrfs_bio_ctrl bio_ctrl = {
2712
.opf = REQ_OP_READ | REQ_RAHEAD,
2713
.ractl = rac,
2714
.last_em_start = U64_MAX,
2715
};
2716
struct folio *folio;
2717
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
2718
const u64 start = readahead_pos(rac);
2719
const u64 end = start + readahead_length(rac) - 1;
2720
struct extent_state *cached_state = NULL;
2721
struct extent_map *em_cached = NULL;
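/*
* Lock the extent range once for the whole readahead window, feed
* every folio to btrfs_do_readpage() (which batches them into
* bio_ctrl and reuses em_cached across folios), then submit whatever
* bio is left at the end.
*/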
2722
2723
lock_extents_for_read(inode, start, end, &cached_state);
2724
2725
while ((folio = readahead_folio(rac)) != NULL)
2726
btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
2727
2728
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
2729
2730
if (em_cached)
2731
btrfs_free_extent_map(em_cached);
2732
submit_one_bio(&bio_ctrl);
2733
}
2734
2735
/*
2736
* basic invalidate_folio code, this waits on any locked or writeback
2737
* ranges corresponding to the folio, and then deletes any extent state
2738
* records from the tree
2739
*/
2740
int extent_invalidate_folio(struct extent_io_tree *tree,
2741
struct folio *folio, size_t offset)
2742
{
2743
struct extent_state *cached_state = NULL;
2744
u64 start = folio_pos(folio);
2745
u64 end = start + folio_size(folio) - 1;
2746
size_t blocksize = folio_to_fs_info(folio)->sectorsize;
2747
2748
/* This function is only called for the btree inode */
2749
ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
2750
2751
start += ALIGN(offset, blocksize);
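/*
* Round the partial offset up to a full block so we never drop extent
* state for a block that is only partially invalidated; if that pushes
* start past end, no complete block is affected and there is nothing
* to do.
*/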
2752
if (start > end)
2753
return 0;
2754
2755
btrfs_lock_extent(tree, start, end, &cached_state);
2756
folio_wait_writeback(folio);
2757
2758
/*
2759
* Currently for btree io tree, only EXTENT_LOCKED is utilized,
2760
* so here we only need to unlock the extent range to free any
2761
* existing extent state.
2762
*/
2763
btrfs_unlock_extent(tree, start, end, &cached_state);
2764
return 0;
2765
}
2766
2767
/*
2768
* A helper for struct address_space_operations::release_folio, this tests for
2769
* areas of the folio that are locked or under IO and drops the related state
2770
* bits if it is safe to drop the folio.
2771
*/
2772
static bool try_release_extent_state(struct extent_io_tree *tree,
2773
struct folio *folio)
2774
{
2775
struct extent_state *cached_state = NULL;
2776
u64 start = folio_pos(folio);
2777
u64 end = start + folio_size(folio) - 1;
2778
u32 range_bits;
2779
u32 clear_bits;
2780
bool ret = false;
2781
int ret2;
2782
2783
btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state);
2784
2785
/*
2786
* We can release the folio if it's locked only for ordered extent
2787
* completion, since that doesn't require using the folio.
2788
*/
2789
if ((range_bits & EXTENT_LOCKED) &&
2790
!(range_bits & EXTENT_FINISHING_ORDERED))
2791
goto out;
2792
2793
clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW |
2794
EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED |
2795
EXTENT_FINISHING_ORDERED);
2796
/*
2797
* At this point we can safely clear everything except the locked,
2798
* nodatasum, delalloc new and finishing ordered bits. The delalloc new
2799
* bit will be cleared by ordered extent completion.
2800
*/
2801
ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state);
2802
/*
2803
* If btrfs_clear_extent_bit() failed due to -ENOMEM, we can't allow
* the release to continue.
2805
*/
2806
if (ret2 == 0)
2807
ret = true;
2808
out:
2809
btrfs_free_extent_state(cached_state);
2810
2811
return ret;
2812
}
2813
2814
/*
2815
* a helper for release_folio. As long as there are no locked extents
2816
* in the range corresponding to the page, both state records and extent
2817
* map records are removed
2818
*/
2819
bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
2820
{
2821
u64 start = folio_pos(folio);
2822
u64 end = start + folio_size(folio) - 1;
2823
struct btrfs_inode *inode = folio_to_inode(folio);
2824
struct extent_io_tree *io_tree = &inode->io_tree;
2825
2826
while (start <= end) {
2827
const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info);
2828
const u64 len = end - start + 1;
2829
struct extent_map_tree *extent_tree = &inode->extent_tree;
2830
struct extent_map *em;
2831
2832
write_lock(&extent_tree->lock);
2833
em = btrfs_lookup_extent_mapping(extent_tree, start, len);
2834
if (!em) {
2835
write_unlock(&extent_tree->lock);
2836
break;
2837
}
2838
if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
2839
write_unlock(&extent_tree->lock);
2840
btrfs_free_extent_map(em);
2841
break;
2842
}
2843
if (btrfs_test_range_bit_exists(io_tree, em->start,
2844
btrfs_extent_map_end(em) - 1,
2845
EXTENT_LOCKED))
2846
goto next;
2847
/*
2848
* If it's not in the list of modified extents, used by a fast
2849
* fsync, we can remove it. If it's being logged we can safely
2850
* remove it since fsync took an extra reference on the em.
2851
*/
2852
if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING))
2853
goto remove_em;
2854
/*
2855
* If it's in the list of modified extents, remove it only if
2856
* its generation is older than the current one, in which case
2857
* we don't need it for a fast fsync. Otherwise don't remove it,
2858
* we could be racing with an ongoing fast fsync that could miss
2859
* the new extent.
2860
*/
2861
if (em->generation >= cur_gen)
2862
goto next;
2863
remove_em:
2864
/*
2865
* We only remove extent maps that are not in the list of
2866
* modified extents or that are in the list but with a
2867
* generation lower than the current generation, so there is no
2868
* need to set the full fsync flag on the inode (it hurts the
2869
* fsync performance for workloads with a data size that exceeds
2870
* or is close to the system's memory).
2871
*/
2872
btrfs_remove_extent_mapping(inode, em);
2873
/* Once for the inode's extent map tree. */
2874
btrfs_free_extent_map(em);
2875
next:
2876
start = btrfs_extent_map_end(em);
2877
write_unlock(&extent_tree->lock);
2878
2879
/* Once for us, for the lookup_extent_mapping() reference. */
2880
btrfs_free_extent_map(em);
2881
2882
if (need_resched()) {
2883
/*
2884
* If we need to resched but we can't block just exit
2885
* and leave any remaining extent maps.
2886
*/
2887
if (!gfpflags_allow_blocking(mask))
2888
break;
2889
2890
cond_resched();
2891
}
2892
}
2893
return try_release_extent_state(io_tree, folio);
2894
}
2895
2896
static int extent_buffer_under_io(const struct extent_buffer *eb)
2897
{
2898
return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
2899
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
2900
}
2901
2902
static bool folio_range_has_eb(struct folio *folio)
2903
{
2904
struct btrfs_folio_state *bfs;
2905
2906
lockdep_assert_held(&folio->mapping->i_private_lock);
2907
2908
if (folio_test_private(folio)) {
2909
bfs = folio_get_private(folio);
2910
if (atomic_read(&bfs->eb_refs))
2911
return true;
2912
}
2913
return false;
2914
}
2915
2916
static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
2917
{
2918
struct btrfs_fs_info *fs_info = eb->fs_info;
2919
struct address_space *mapping = folio->mapping;
2920
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
2921
2922
/*
2923
* For mapped eb, we're going to change the folio private, which should
2924
* be done under the i_private_lock.
2925
*/
2926
if (mapped)
2927
spin_lock(&mapping->i_private_lock);
2928
2929
if (!folio_test_private(folio)) {
2930
if (mapped)
2931
spin_unlock(&mapping->i_private_lock);
2932
return;
2933
}
2934
2935
if (!btrfs_meta_is_subpage(fs_info)) {
2936
/*
2937
* We do this since we'll remove the pages after we've removed
2938
* the eb from the xarray, so we could race and have this page
2939
* now attached to a new eb. So only clear the folio private if
* it's still connected to this eb.
2941
*/
2942
if (folio_test_private(folio) && folio_get_private(folio) == eb) {
2943
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
2944
BUG_ON(folio_test_dirty(folio));
2945
BUG_ON(folio_test_writeback(folio));
2946
/* We need to make sure we haven't been attached to a new eb. */
2947
folio_detach_private(folio);
2948
}
2949
if (mapped)
2950
spin_unlock(&mapping->i_private_lock);
2951
return;
2952
}
2953
2954
/*
2955
* For subpage, we can have dummy eb with folio private attached. In
2956
* this case, we can directly detach the private as such folio is only
2957
* attached to one dummy eb, no sharing.
2958
*/
2959
if (!mapped) {
2960
btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
2961
return;
2962
}
2963
2964
btrfs_folio_dec_eb_refs(fs_info, folio);
2965
2966
/*
2967
* We can only detach the folio private if there are no other ebs in the
2968
* page range and no unfinished IO.
2969
*/
2970
if (!folio_range_has_eb(folio))
2971
btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
2972
2973
spin_unlock(&mapping->i_private_lock);
2974
}
2975
2976
/* Release all folios attached to the extent buffer */
2977
static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
2978
{
2979
ASSERT(!extent_buffer_under_io(eb));
2980
2981
for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
2982
struct folio *folio = eb->folios[i];
2983
2984
if (!folio)
2985
continue;
2986
2987
detach_extent_buffer_folio(eb, folio);
2988
}
2989
}
2990
2991
/*
2992
* Helper for releasing the extent buffer.
2993
*/
2994
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
2995
{
2996
btrfs_release_extent_buffer_folios(eb);
2997
btrfs_leak_debug_del_eb(eb);
2998
kmem_cache_free(extent_buffer_cache, eb);
2999
}
3000
3001
static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info,
3002
u64 start)
3003
{
3004
struct extent_buffer *eb = NULL;
3005
3006
eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
3007
eb->start = start;
3008
eb->len = fs_info->nodesize;
3009
eb->fs_info = fs_info;
3010
init_rwsem(&eb->lock);
3011
3012
btrfs_leak_debug_add_eb(eb);
3013
3014
spin_lock_init(&eb->refs_lock);
3015
refcount_set(&eb->refs, 1);
3016
3017
ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
3018
3019
return eb;
3020
}
3021
3022
/*
3023
* For use in eb allocation error cleanup paths, since btrfs_release_extent_buffer()
* does not call folio_put(). We also need to set the folios to NULL so that
* btrfs_release_extent_buffer() will not detach them a second time.
3026
*/
3027
static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
3028
{
3029
const int num_folios = num_extent_folios(eb);
3030
3031
/* We cannot use num_extent_folios() as loop bound as eb->folios changes. */
3032
for (int i = 0; i < num_folios; i++) {
3033
ASSERT(eb->folios[i]);
3034
detach_extent_buffer_folio(eb, eb->folios[i]);
3035
folio_put(eb->folios[i]);
3036
eb->folios[i] = NULL;
3037
}
3038
}
3039
3040
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
3041
{
3042
struct extent_buffer *new;
3043
int num_folios;
3044
int ret;
3045
3046
new = __alloc_extent_buffer(src->fs_info, src->start);
3047
if (new == NULL)
3048
return NULL;
3049
3050
/*
3051
* Set UNMAPPED before calling btrfs_release_extent_buffer(), as
* btrfs_release_extent_buffer() has different behavior for
* UNMAPPED subpage extent buffers.
3054
*/
3055
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
3056
3057
ret = alloc_eb_folio_array(new, false);
3058
if (ret)
3059
goto release_eb;
3060
3061
ASSERT(num_extent_folios(src) == num_extent_folios(new),
3062
"%d != %d", num_extent_folios(src), num_extent_folios(new));
3063
/* Explicitly use the cached num_folios value from now on. */
3064
num_folios = num_extent_folios(src);
3065
for (int i = 0; i < num_folios; i++) {
3066
struct folio *folio = new->folios[i];
3067
3068
ret = attach_extent_buffer_folio(new, folio, NULL);
3069
if (ret < 0)
3070
goto cleanup_folios;
3071
WARN_ON(folio_test_dirty(folio));
3072
}
3073
for (int i = 0; i < num_folios; i++)
3074
folio_put(new->folios[i]);
3075
3076
copy_extent_buffer_full(new, src);
3077
set_extent_buffer_uptodate(new);
3078
3079
return new;
3080
3081
cleanup_folios:
3082
cleanup_extent_buffer_folios(new);
3083
release_eb:
3084
btrfs_release_extent_buffer(new);
3085
return NULL;
3086
}
3087
3088
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
3089
u64 start)
3090
{
3091
struct extent_buffer *eb;
3092
int ret;
3093
3094
eb = __alloc_extent_buffer(fs_info, start);
3095
if (!eb)
3096
return NULL;
3097
3098
ret = alloc_eb_folio_array(eb, false);
3099
if (ret)
3100
goto release_eb;
3101
3102
for (int i = 0; i < num_extent_folios(eb); i++) {
3103
ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
3104
if (ret < 0)
3105
goto cleanup_folios;
3106
}
3107
for (int i = 0; i < num_extent_folios(eb); i++)
3108
folio_put(eb->folios[i]);
3109
3110
set_extent_buffer_uptodate(eb);
3111
btrfs_set_header_nritems(eb, 0);
3112
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
3113
3114
return eb;
3115
3116
cleanup_folios:
3117
cleanup_extent_buffer_folios(eb);
3118
release_eb:
3119
btrfs_release_extent_buffer(eb);
3120
return NULL;
3121
}
3122
3123
static void check_buffer_tree_ref(struct extent_buffer *eb)
3124
{
3125
int refs;
3126
/*
3127
* The TREE_REF bit is first set when the extent_buffer is added to the
3128
* xarray. It is also reset, if unset, when a new reference is created
3129
* by find_extent_buffer.
3130
*
3131
* It is only cleared in two cases: freeing the last non-tree
3132
* reference to the extent_buffer when its STALE bit is set or
3133
* calling release_folio when the tree reference is the only reference.
3134
*
3135
* In both cases, care is taken to ensure that the extent_buffer's
3136
* pages are not under io. However, release_folio can be concurrently
3137
* called with creating new references, which is prone to race
3138
* conditions between the calls to check_buffer_tree_ref in those
3139
* codepaths and clearing TREE_REF in try_release_extent_buffer.
3140
*
3141
* The actual lifetime of the extent_buffer in the xarray is adequately
3142
* protected by the refcount, but the TREE_REF bit and its corresponding
3143
* reference are not. To protect against this class of races, we call
3144
* check_buffer_tree_ref() from the code paths which trigger io. Note that
3145
* once io is initiated, TREE_REF can no longer be cleared, so that is
3146
* the moment at which any such race is best fixed.
3147
*/
3148
refs = refcount_read(&eb->refs);
3149
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
3150
return;
3151
3152
spin_lock(&eb->refs_lock);
3153
if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
3154
refcount_inc(&eb->refs);
3155
spin_unlock(&eb->refs_lock);
3156
}
3157
3158
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
3159
{
3160
check_buffer_tree_ref(eb);
3161
3162
for (int i = 0; i < num_extent_folios(eb); i++)
3163
folio_mark_accessed(eb->folios[i]);
3164
}
3165
3166
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
3167
u64 start)
3168
{
3169
struct extent_buffer *eb;
3170
3171
eb = find_extent_buffer_nolock(fs_info, start);
3172
if (!eb)
3173
return NULL;
3174
/*
3175
* Lock our eb's refs_lock to avoid races with free_extent_buffer().
3176
* When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
3177
* another task running free_extent_buffer() might have seen that flag
3178
* set, eb->refs == 2, that the buffer isn't under IO (dirty and
3179
* writeback flags not set) and it's still in the tree (flag
3180
* EXTENT_BUFFER_TREE_REF set), therefore being in the process of
3181
* decrementing the extent buffer's reference count twice. So here we
3182
* could race and increment the eb's reference count, clear its stale
3183
* flag, mark it as dirty and drop our reference before the other task
3184
* finishes executing free_extent_buffer, which would later result in
3185
* an attempt to free an extent buffer that is dirty.
3186
*/
3187
if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
3188
spin_lock(&eb->refs_lock);
3189
spin_unlock(&eb->refs_lock);
3190
}
3191
mark_extent_buffer_accessed(eb);
3192
return eb;
3193
}
3194
3195
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
3196
u64 start)
3197
{
3198
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3199
struct extent_buffer *eb, *exists = NULL;
3200
int ret;
3201
3202
eb = find_extent_buffer(fs_info, start);
3203
if (eb)
3204
return eb;
3205
eb = alloc_dummy_extent_buffer(fs_info, start);
3206
if (!eb)
3207
return ERR_PTR(-ENOMEM);
3208
eb->fs_info = fs_info;
3209
again:
3210
xa_lock_irq(&fs_info->buffer_tree);
3211
exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
3212
NULL, eb, GFP_NOFS);
3213
if (xa_is_err(exists)) {
3214
ret = xa_err(exists);
3215
xa_unlock_irq(&fs_info->buffer_tree);
3216
btrfs_release_extent_buffer(eb);
3217
return ERR_PTR(ret);
3218
}
3219
if (exists) {
3220
if (!refcount_inc_not_zero(&exists->refs)) {
3221
/* The extent buffer is being freed, retry. */
3222
xa_unlock_irq(&fs_info->buffer_tree);
3223
goto again;
3224
}
3225
xa_unlock_irq(&fs_info->buffer_tree);
3226
btrfs_release_extent_buffer(eb);
3227
return exists;
3228
}
3229
xa_unlock_irq(&fs_info->buffer_tree);
3230
check_buffer_tree_ref(eb);
3231
3232
return eb;
3233
#else
3234
/* Stub to avoid linker error when compiled with optimizations turned off. */
3235
return NULL;
3236
#endif
3237
}
3238
3239
static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
3240
struct folio *folio)
3241
{
3242
struct extent_buffer *exists;
3243
3244
lockdep_assert_held(&folio->mapping->i_private_lock);
3245
3246
/*
3247
* For subpage case, we completely rely on xarray to ensure we don't try
3248
* to insert two ebs for the same bytenr. So here we always return NULL
3249
* and just continue.
3250
*/
3251
if (btrfs_meta_is_subpage(fs_info))
3252
return NULL;
3253
3254
/* Page not yet attached to an extent buffer */
3255
if (!folio_test_private(folio))
3256
return NULL;
3257
3258
/*
3259
* We could have already allocated an eb for this folio and attached one,
* so let's see if we can get a ref on the existing eb. If we can, we
* know it's good and we can just return that one; otherwise we know we
* can just overwrite the folio private.
3263
*/
3264
exists = folio_get_private(folio);
3265
if (refcount_inc_not_zero(&exists->refs))
3266
return exists;
3267
3268
WARN_ON(folio_test_dirty(folio));
3269
folio_detach_private(folio);
3270
return NULL;
3271
}
3272
3273
/*
3274
* Validate alignment constraints of eb at logical address @start.
3275
*/
3276
static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
3277
{
3278
const u32 nodesize = fs_info->nodesize;
3279
3280
if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) {
3281
btrfs_err(fs_info, "bad tree block start %llu", start);
3282
return true;
3283
}
3284
3285
if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) {
3286
btrfs_err(fs_info,
3287
"tree block is not nodesize aligned, start %llu nodesize %u",
3288
start, nodesize);
3289
return true;
3290
}
3291
if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) {
3292
btrfs_err(fs_info,
3293
"tree block is not page aligned, start %llu nodesize %u",
3294
start, nodesize);
3295
return true;
3296
}
3297
if (unlikely(!IS_ALIGNED(start, nodesize) &&
3298
!test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) {
3299
btrfs_warn(fs_info,
3300
"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
3301
start, nodesize);
3302
}
3303
return false;
3304
}
3305
3306
/*
3307
* Return 0 if eb->folios[i] is attached to the btree inode successfully.
* Return >0 if there is already another extent buffer for the range,
* and @found_eb_ret will be updated.
* Return -EAGAIN if the filemap has an existing folio but with a different
* size than @eb.
3312
* The caller needs to free the existing folios and retry using the same order.
3313
*/
3314
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
3315
struct btrfs_folio_state *prealloc,
3316
struct extent_buffer **found_eb_ret)
3317
{
3318
3319
struct btrfs_fs_info *fs_info = eb->fs_info;
3320
struct address_space *mapping = fs_info->btree_inode->i_mapping;
3321
const pgoff_t index = eb->start >> PAGE_SHIFT;
3322
struct folio *existing_folio;
3323
int ret;
3324
3325
ASSERT(found_eb_ret);
3326
3327
/* Caller should ensure the folio exists. */
3328
ASSERT(eb->folios[i]);
3329
3330
retry:
3331
existing_folio = NULL;
3332
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
3333
GFP_NOFS | __GFP_NOFAIL);
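/*
* The usual failure here is -EEXIST, i.e. some folio already occupies
* this index; in that case lock the existing folio below and decide
* whether to reuse it or back off with -EAGAIN.
*/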
3334
if (!ret)
3335
goto finish;
3336
3337
existing_folio = filemap_lock_folio(mapping, index + i);
3338
/* The page cache only exists for a very short time, just retry. */
3339
if (IS_ERR(existing_folio))
3340
goto retry;
3341
3342
/* For now, we should only have single-page folios for btree inode. */
3343
ASSERT(folio_nr_pages(existing_folio) == 1);
3344
3345
if (folio_size(existing_folio) != eb->folio_size) {
3346
folio_unlock(existing_folio);
3347
folio_put(existing_folio);
3348
return -EAGAIN;
3349
}
3350
3351
finish:
3352
spin_lock(&mapping->i_private_lock);
3353
if (existing_folio && btrfs_meta_is_subpage(fs_info)) {
3354
/* We're going to reuse the existing page, can drop our folio now. */
3355
__free_page(folio_page(eb->folios[i], 0));
3356
eb->folios[i] = existing_folio;
3357
} else if (existing_folio) {
3358
struct extent_buffer *existing_eb;
3359
3360
existing_eb = grab_extent_buffer(fs_info, existing_folio);
3361
if (existing_eb) {
3362
/* The extent buffer still exists, we can use it directly. */
3363
*found_eb_ret = existing_eb;
3364
spin_unlock(&mapping->i_private_lock);
3365
folio_unlock(existing_folio);
3366
folio_put(existing_folio);
3367
return 1;
3368
}
3369
/* The extent buffer no longer exists, we can reuse the folio. */
3370
__free_page(folio_page(eb->folios[i], 0));
3371
eb->folios[i] = existing_folio;
3372
}
3373
eb->folio_size = folio_size(eb->folios[i]);
3374
eb->folio_shift = folio_shift(eb->folios[i]);
3375
/* Should not fail, as we have preallocated the memory. */
3376
ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
3377
ASSERT(!ret);
3378
/*
3379
* Record that we have an extra eb under allocation, so that
* detach_extent_buffer_page() won't release the folio private when the
* eb hasn't been inserted into the xarray yet.
*
* The ref will be decreased when the eb releases the page, in
* detach_extent_buffer_page(). Thus it needs no special handling in
* the error path.
3386
*/
3387
btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
3388
spin_unlock(&mapping->i_private_lock);
3389
return 0;
3390
}
3391
3392
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
3393
u64 start, u64 owner_root, int level)
3394
{
3395
int attached = 0;
3396
struct extent_buffer *eb;
3397
struct extent_buffer *existing_eb = NULL;
3398
struct btrfs_folio_state *prealloc = NULL;
3399
u64 lockdep_owner = owner_root;
3400
bool page_contig = true;
3401
int uptodate = 1;
3402
int ret;
3403
3404
if (check_eb_alignment(fs_info, start))
3405
return ERR_PTR(-EINVAL);
3406
3407
#if BITS_PER_LONG == 32
3408
if (start >= MAX_LFS_FILESIZE) {
3409
btrfs_err_rl(fs_info,
3410
"extent buffer %llu is beyond 32bit page cache limit", start);
3411
btrfs_err_32bit_limit(fs_info);
3412
return ERR_PTR(-EOVERFLOW);
3413
}
3414
if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
3415
btrfs_warn_32bit_limit(fs_info);
3416
#endif
3417
3418
eb = find_extent_buffer(fs_info, start);
3419
if (eb)
3420
return eb;
3421
3422
eb = __alloc_extent_buffer(fs_info, start);
3423
if (!eb)
3424
return ERR_PTR(-ENOMEM);
3425
3426
/*
3427
* The reloc trees are just snapshots, so we need them to appear to be
3428
* just like any other fs tree WRT lockdep.
3429
*/
3430
if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
3431
lockdep_owner = BTRFS_FS_TREE_OBJECTID;
3432
3433
btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
3434
3435
/*
3436
* Preallocate folio private for the subpage case, so that we won't
* allocate memory with the i_private_lock or the folio lock held.
3438
*
3439
* The memory will be freed by attach_extent_buffer_page() or freed
3440
* manually if we exit earlier.
3441
*/
3442
if (btrfs_meta_is_subpage(fs_info)) {
3443
prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
3444
if (IS_ERR(prealloc)) {
3445
ret = PTR_ERR(prealloc);
3446
goto out;
3447
}
3448
}
3449
3450
reallocate:
3451
/* Allocate all pages first. */
3452
ret = alloc_eb_folio_array(eb, true);
3453
if (ret < 0) {
3454
btrfs_free_folio_state(prealloc);
3455
goto out;
3456
}
3457
3458
/* Attach all pages to the filemap. */
3459
for (int i = 0; i < num_extent_folios(eb); i++) {
3460
struct folio *folio;
3461
3462
ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
3463
if (ret > 0) {
3464
ASSERT(existing_eb);
3465
goto out;
3466
}
3467
3468
/*
3469
 * TODO: Special handling for a corner case where the folio order
3470
 * mismatches between the new eb and the filemap.
3471
 *
3472
 * This happens when:
3473
 *
3474
 * - the new eb is using a higher order folio
3475
 *
3476
 * - the filemap is still using 0-order folios for the range
3477
 * This can happen at the previous eb allocation, when we didn't
3478
 * have a higher order folio for the call.
3479
 *
3480
 * - the existing eb has already been freed
3481
 *
3482
 * In this case, we have to free the existing folios first, and
3483
 * re-allocate using the same order.
3484
 * Thankfully this is not going to happen yet, as we're still
3485
 * using 0-order folios.
3486
 */
3487
if (unlikely(ret == -EAGAIN)) {
3488
DEBUG_WARN("folio order mismatch between new eb and filemap");
3489
goto reallocate;
3490
}
3491
attached++;
3492
3493
/*
3494
* Only after attach_eb_folio_to_filemap(), eb->folios[] is
3495
* reliable, as we may choose to reuse the existing page cache
3496
* and free the allocated page.
3497
*/
3498
folio = eb->folios[i];
3499
WARN_ON(btrfs_meta_folio_test_dirty(folio, eb));
3500
3501
/*
3502
 * Check if the current page is physically contiguous with the previous
3503
 * eb page.
3504
 * At this stage, either we allocated a large folio, thus @i
3505
 * would only be 0, or we fell back to per-page allocation.
3506
 */
3507
if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
3508
page_contig = false;
3509
3510
if (!btrfs_meta_folio_test_uptodate(folio, eb))
3511
uptodate = 0;
3512
3513
/*
3514
* We can't unlock the pages just yet since the extent buffer
3515
* hasn't been properly inserted into the xarray; this opens a
3516
* race with btree_release_folio() which can free a page while we
3517
* are still filling in all pages for the buffer and we could crash.
3518
*/
3519
}
3520
if (uptodate)
3521
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3522
/* All pages are physically contiguous, can skip cross page handling. */
3523
if (page_contig)
3524
eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
3525
again:
3526
xa_lock_irq(&fs_info->buffer_tree);
3527
existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
3528
start >> fs_info->nodesize_bits, NULL, eb,
3529
GFP_NOFS);
3530
if (xa_is_err(existing_eb)) {
3531
ret = xa_err(existing_eb);
3532
xa_unlock_irq(&fs_info->buffer_tree);
3533
goto out;
3534
}
3535
if (existing_eb) {
3536
if (!refcount_inc_not_zero(&existing_eb->refs)) {
3537
xa_unlock_irq(&fs_info->buffer_tree);
3538
goto again;
3539
}
3540
xa_unlock_irq(&fs_info->buffer_tree);
3541
goto out;
3542
}
3543
xa_unlock_irq(&fs_info->buffer_tree);
3544
3545
/* add one reference for the tree */
3546
check_buffer_tree_ref(eb);
3547
3548
/*
3549
* Now it's safe to unlock the pages because any calls to
3550
* btree_release_folio will correctly detect that a page belongs to a
3551
* live buffer and won't free them prematurely.
3552
*/
3553
for (int i = 0; i < num_extent_folios(eb); i++) {
3554
folio_unlock(eb->folios[i]);
3555
/*
3556
* A folio that has been added to an address_space mapping
3557
* should not continue holding the refcount from its original
3558
* allocation indefinitely.
3559
*/
3560
folio_put(eb->folios[i]);
3561
}
3562
return eb;
3563
3564
out:
3565
WARN_ON(!refcount_dec_and_test(&eb->refs));
3566
3567
/*
3568
 * Any attached folios need to be detached before we unlock them. This
3569
 * is because when we insert our new folios into the mapping we also
3570
 * attach our eb to each folio. If we fail to insert a folio, we look up
3571
 * the folio already at that index and grab the eb attached to it. We do
3572
 * not want that lookup to grab this eb, as we're getting ready to free
3573
 * it. So we have to detach it first and then unlock it.
3574
 *
3575
 * Note: the bound is num_extent_pages() as we need to go through all slots.
3576
 */
3577
for (int i = 0; i < num_extent_pages(eb); i++) {
3578
struct folio *folio = eb->folios[i];
3579
3580
if (i < attached) {
3581
ASSERT(folio);
3582
detach_extent_buffer_folio(eb, folio);
3583
folio_unlock(folio);
3584
} else if (!folio) {
3585
continue;
3586
}
3587
3588
folio_put(folio);
3589
eb->folios[i] = NULL;
3590
}
3591
btrfs_release_extent_buffer(eb);
3592
if (ret < 0)
3593
return ERR_PTR(ret);
3594
ASSERT(existing_eb);
3595
return existing_eb;
3596
}
3597
3598
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3599
{
3600
struct extent_buffer *eb =
3601
container_of(head, struct extent_buffer, rcu_head);
3602
3603
kmem_cache_free(extent_buffer_cache, eb);
3604
}
3605
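/*
 * Drop one reference with eb->refs_lock held and release the lock.
 *
 * If this was the last reference, remove the eb from the buffer tree, release
 * its folios and free the structure after an RCU grace period. Returns 1 if
 * the eb was freed, 0 otherwise.
 */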
3606
static int release_extent_buffer(struct extent_buffer *eb)
3607
__releases(&eb->refs_lock)
3608
{
3609
lockdep_assert_held(&eb->refs_lock);
3610
3611
if (refcount_dec_and_test(&eb->refs)) {
3612
struct btrfs_fs_info *fs_info = eb->fs_info;
3613
3614
spin_unlock(&eb->refs_lock);
3615
3616
/*
3617
* We're erasing, theoretically there will be no allocations, so
3618
* just use GFP_ATOMIC.
3619
*
3620
* We use cmpxchg instead of erase because we do not know if
3621
* this eb is actually in the tree or not, we could be cleaning
3622
* up an eb that we allocated but never inserted into the tree.
3623
* Thus use cmpxchg to remove it from the tree if it is there,
3624
* or leave the other entry if this isn't in the tree.
3625
*
3626
* The documentation says that putting a NULL value is the same
3627
* as erase as long as XA_FLAGS_ALLOC is not set, which it isn't
3628
* in this case.
3629
*/
3630
xa_cmpxchg_irq(&fs_info->buffer_tree,
3631
eb->start >> fs_info->nodesize_bits, eb, NULL,
3632
GFP_ATOMIC);
3633
3634
btrfs_leak_debug_del_eb(eb);
3635
/* Should be safe to release folios at this point. */
3636
btrfs_release_extent_buffer_folios(eb);
3637
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3638
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
3639
kmem_cache_free(extent_buffer_cache, eb);
3640
return 1;
3641
}
3642
#endif
3643
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3644
return 1;
3645
}
3646
spin_unlock(&eb->refs_lock);
3647
3648
return 0;
3649
}
3650
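/*
 * Drop a reference on an extent buffer.
 *
 * When the refcount is high enough that this put needs no special handling,
 * the decrement is done locklessly via cmpxchg; otherwise take eb->refs_lock
 * and go through release_extent_buffer().
 */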
3651
void free_extent_buffer(struct extent_buffer *eb)
3652
{
3653
int refs;
3654
if (!eb)
3655
return;
3656
3657
refs = refcount_read(&eb->refs);
3658
while (1) {
3659
if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) {
3660
if (refs == 1)
3661
break;
3662
} else if (refs <= 3) {
3663
break;
3664
}
3665
3666
/* Optimization to avoid locking eb->refs_lock. */
3667
if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
3668
return;
3669
}
3670
3671
spin_lock(&eb->refs_lock);
3672
if (refcount_read(&eb->refs) == 2 &&
3673
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
3674
!extent_buffer_under_io(eb) &&
3675
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
3676
refcount_dec(&eb->refs);
3677
3678
/*
3679
* I know this is terrible, but it's temporary until we stop tracking
3680
* the uptodate bits and such for the extent buffers.
3681
*/
3682
release_extent_buffer(eb);
3683
}
3684
3685
void free_extent_buffer_stale(struct extent_buffer *eb)
3686
{
3687
if (!eb)
3688
return;
3689
3690
spin_lock(&eb->refs_lock);
3691
set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
3692
3693
if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3694
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
3695
refcount_dec(&eb->refs);
3696
release_extent_buffer(eb);
3697
}
3698
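/*
 * Clear the PAGECACHE_TAG_DIRTY xarray tag of a folio whose dirty flag has
 * already been cleared. The caller must hold the folio lock.
 */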
3699
static void btree_clear_folio_dirty_tag(struct folio *folio)
3700
{
3701
ASSERT(!folio_test_dirty(folio));
3702
ASSERT(folio_test_locked(folio));
3703
xa_lock_irq(&folio->mapping->i_pages);
3704
if (!folio_test_dirty(folio))
3705
__xa_clear_mark(&folio->mapping->i_pages, folio->index,
3706
PAGECACHE_TAG_DIRTY);
3707
xa_unlock_irq(&folio->mapping->i_pages);
3708
}
3709
3710
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
3711
struct extent_buffer *eb)
3712
{
3713
struct btrfs_fs_info *fs_info = eb->fs_info;
3714
3715
btrfs_assert_tree_write_locked(eb);
3716
3717
if (trans && btrfs_header_generation(eb) != trans->transid)
3718
return;
3719
3720
/*
3721
* Instead of clearing the dirty flag off of the buffer, mark it as
3722
* EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve
3723
* write-ordering in zoned mode, without the need to later re-dirty
3724
* the extent_buffer.
3725
*
3726
* The actual zeroout of the buffer will happen later in
3727
* btree_csum_one_bio.
3728
*/
3729
if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3730
set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
3731
return;
3732
}
3733
3734
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
3735
return;
3736
3737
buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY);
3738
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
3739
fs_info->dirty_metadata_batch);
3740
3741
for (int i = 0; i < num_extent_folios(eb); i++) {
3742
struct folio *folio = eb->folios[i];
3743
bool last;
3744
3745
if (!folio_test_dirty(folio))
3746
continue;
3747
folio_lock(folio);
3748
last = btrfs_meta_folio_clear_and_test_dirty(folio, eb);
3749
if (last)
3750
btree_clear_folio_dirty_tag(folio);
3751
folio_unlock(folio);
3752
}
3753
WARN_ON(refcount_read(&eb->refs) == 0);
3754
}
3755
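/*
 * Mark an extent buffer dirty.
 *
 * If it was not already dirty, also dirty the backing folios, tag the eb dirty
 * in the buffer tree and account the bytes in dirty_metadata_bytes.
 */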
3756
void set_extent_buffer_dirty(struct extent_buffer *eb)
3757
{
3758
bool was_dirty;
3759
3760
check_buffer_tree_ref(eb);
3761
3762
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3763
3764
WARN_ON(refcount_read(&eb->refs) == 0);
3765
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
3766
WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
3767
3768
if (!was_dirty) {
3769
bool subpage = btrfs_meta_is_subpage(eb->fs_info);
3770
3771
/*
3772
 * For the subpage case, we can have other extent buffers in the
3773
 * same page, and in btrfs_clear_buffer_dirty() we
3774
 * have to clear the page dirty bit without the subpage lock held.
3775
 * This can cause a race where our page dirty bit gets cleared after
3776
 * we just set it.
3777
 *
3778
 * Thankfully, btrfs_clear_buffer_dirty() has locked
3779
 * its page for other reasons, so we can use the page lock to prevent
3780
 * the above race.
3781
 */
3782
if (subpage)
3783
folio_lock(eb->folios[0]);
3784
for (int i = 0; i < num_extent_folios(eb); i++)
3785
btrfs_meta_folio_set_dirty(eb->folios[i], eb);
3786
buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY);
3787
if (subpage)
3788
folio_unlock(eb->folios[0]);
3789
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
3790
eb->len,
3791
eb->fs_info->dirty_metadata_batch);
3792
}
3793
#ifdef CONFIG_BTRFS_DEBUG
3794
for (int i = 0; i < num_extent_folios(eb); i++)
3795
ASSERT(folio_test_dirty(eb->folios[i]));
3796
#endif
3797
}
3798
3799
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
3800
{
3801
3802
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3803
for (int i = 0; i < num_extent_folios(eb); i++) {
3804
struct folio *folio = eb->folios[i];
3805
3806
if (!folio)
3807
continue;
3808
3809
btrfs_meta_folio_clear_uptodate(folio, eb);
3810
}
3811
}
3812
3813
void set_extent_buffer_uptodate(struct extent_buffer *eb)
3814
{
3815
3816
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3817
for (int i = 0; i < num_extent_folios(eb); i++)
3818
btrfs_meta_folio_set_uptodate(eb->folios[i], eb);
3819
}
3820
3821
static void clear_extent_buffer_reading(struct extent_buffer *eb)
3822
{
3823
clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags);
3824
}
3825
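/*
 * Read end io handler for metadata bios.
 *
 * Validate the extent buffer, update its uptodate state, wake up any waiter on
 * EXTENT_BUFFER_READING and drop the reference taken at submission time.
 */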
3826
static void end_bbio_meta_read(struct btrfs_bio *bbio)
3827
{
3828
struct extent_buffer *eb = bbio->private;
3829
bool uptodate = !bbio->bio.bi_status;
3830
3831
/*
3832
* If the extent buffer is marked UPTODATE before the read operation
3833
* completes, other calls to read_extent_buffer_pages() will return
3834
* early without waiting for the read to finish, causing data races.
3835
*/
3836
WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));
3837
3838
eb->read_mirror = bbio->mirror_num;
3839
3840
if (uptodate &&
3841
btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0)
3842
uptodate = false;
3843
3844
if (uptodate)
3845
set_extent_buffer_uptodate(eb);
3846
else
3847
clear_extent_buffer_uptodate(eb);
3848
3849
clear_extent_buffer_reading(eb);
3850
free_extent_buffer(eb);
3851
3852
bio_put(&bbio->bio);
3853
}
3854
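/*
 * Start reading an extent buffer without waiting for the read to finish.
 *
 * Returns 0 if the eb is already uptodate, if another reader is in flight, or
 * once the read bio has been submitted. Returns -EIO if a previous write error
 * invalidated the buffer.
 */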
3855
int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
3856
const struct btrfs_tree_parent_check *check)
3857
{
3858
struct btrfs_fs_info *fs_info = eb->fs_info;
3859
struct btrfs_bio *bbio;
3860
3861
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3862
return 0;
3863
3864
/*
3865
* We could have had EXTENT_BUFFER_UPTODATE cleared by the write
3866
* operation, which could potentially still be in flight. In this case
3867
* we simply want to return an error.
3868
*/
3869
if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
3870
return -EIO;
3871
3872
/* Someone else is already reading the buffer, just wait for it. */
3873
if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
3874
return 0;
3875
3876
/*
3877
* Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above
3878
* test_and_set_bit(EXTENT_BUFFER_READING), someone else could have
3879
* started and finished reading the same eb. In this case, UPTODATE
3880
* will now be set, and we shouldn't read it in again.
3881
*/
3882
if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
3883
clear_extent_buffer_reading(eb);
3884
return 0;
3885
}
3886
3887
eb->read_mirror = 0;
3888
check_buffer_tree_ref(eb);
3889
refcount_inc(&eb->refs);
3890
3891
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
3892
REQ_OP_READ | REQ_META, BTRFS_I(fs_info->btree_inode),
3893
eb->start, end_bbio_meta_read, eb);
3894
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
3895
memcpy(&bbio->parent_check, check, sizeof(*check));
3896
for (int i = 0; i < num_extent_folios(eb); i++) {
3897
struct folio *folio = eb->folios[i];
3898
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
3899
u32 range_len = min_t(u64, folio_next_pos(folio),
3900
eb->start + eb->len) - range_start;
3901
3902
bio_add_folio_nofail(&bbio->bio, folio, range_len,
3903
offset_in_folio(folio, range_start));
3904
}
3905
btrfs_submit_bbio(bbio, mirror_num);
3906
return 0;
3907
}
3908
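/*
 * Read an extent buffer and wait for the read to finish.
 *
 * Returns 0 if the eb ends up uptodate, -EIO otherwise.
 */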
3909
int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
3910
const struct btrfs_tree_parent_check *check)
3911
{
3912
int ret;
3913
3914
ret = read_extent_buffer_pages_nowait(eb, mirror_num, check);
3915
if (ret < 0)
3916
return ret;
3917
3918
wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
3919
if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)))
3920
return -EIO;
3921
return 0;
3922
}
3923
3924
static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
3925
unsigned long len)
3926
{
3927
btrfs_warn(eb->fs_info,
3928
"access to eb bytenr %llu len %u out of range start %lu len %lu",
3929
eb->start, eb->len, start, len);
3930
DEBUG_WARN();
3931
3932
return true;
3933
}
3934
3935
/*
3936
* Check if the [start, start + len) range is valid before reading/writing
3937
* the eb.
3938
* NOTE: @start and @len are offset inside the eb, not logical address.
3939
*
3940
* Caller should not touch the dst/src memory if this function returns error.
3941
*/
3942
static inline int check_eb_range(const struct extent_buffer *eb,
3943
unsigned long start, unsigned long len)
3944
{
3945
unsigned long offset;
3946
3947
/* start, start + len should not go beyond eb->len nor overflow */
3948
if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
3949
return report_eb_range(eb, start, len);
3950
3951
return false;
3952
}
3953
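/*
 * Copy @len bytes from offset @start inside the eb into @dstv, walking the
 * backing folios when the eb has no single contiguous mapping (eb->addr is
 * NULL). On an invalid range the destination is zeroed instead.
 */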
3954
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
3955
unsigned long start, unsigned long len)
3956
{
3957
const int unit_size = eb->folio_size;
3958
size_t cur;
3959
size_t offset;
3960
char *dst = (char *)dstv;
3961
unsigned long i = get_eb_folio_index(eb, start);
3962
3963
if (check_eb_range(eb, start, len)) {
3964
/*
3965
* Invalid range hit, reset the memory, so callers won't get
3966
* some random garbage for their uninitialized memory.
3967
*/
3968
memset(dstv, 0, len);
3969
return;
3970
}
3971
3972
if (eb->addr) {
3973
memcpy(dstv, eb->addr + start, len);
3974
return;
3975
}
3976
3977
offset = get_eb_offset_in_folio(eb, start);
3978
3979
while (len > 0) {
3980
char *kaddr;
3981
3982
cur = min(len, unit_size - offset);
3983
kaddr = folio_address(eb->folios[i]);
3984
memcpy(dst, kaddr + offset, cur);
3985
3986
dst += cur;
3987
len -= cur;
3988
offset = 0;
3989
i++;
3990
}
3991
}
3992
3993
int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
3994
void __user *dstv,
3995
unsigned long start, unsigned long len)
3996
{
3997
const int unit_size = eb->folio_size;
3998
size_t cur;
3999
size_t offset;
4000
char __user *dst = (char __user *)dstv;
4001
unsigned long i = get_eb_folio_index(eb, start);
4002
int ret = 0;
4003
4004
WARN_ON(start > eb->len);
4005
WARN_ON(start + len > eb->start + eb->len);
4006
4007
if (eb->addr) {
4008
if (copy_to_user_nofault(dstv, eb->addr + start, len))
4009
ret = -EFAULT;
4010
return ret;
4011
}
4012
4013
offset = get_eb_offset_in_folio(eb, start);
4014
4015
while (len > 0) {
4016
char *kaddr;
4017
4018
cur = min(len, unit_size - offset);
4019
kaddr = folio_address(eb->folios[i]);
4020
if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
4021
ret = -EFAULT;
4022
break;
4023
}
4024
4025
dst += cur;
4026
len -= cur;
4027
offset = 0;
4028
i++;
4029
}
4030
4031
return ret;
4032
}
4033
4034
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
4035
unsigned long start, unsigned long len)
4036
{
4037
const int unit_size = eb->folio_size;
4038
size_t cur;
4039
size_t offset;
4040
char *kaddr;
4041
char *ptr = (char *)ptrv;
4042
unsigned long i = get_eb_folio_index(eb, start);
4043
int ret = 0;
4044
4045
if (check_eb_range(eb, start, len))
4046
return -EINVAL;
4047
4048
if (eb->addr)
4049
return memcmp(ptrv, eb->addr + start, len);
4050
4051
offset = get_eb_offset_in_folio(eb, start);
4052
4053
while (len > 0) {
4054
cur = min(len, unit_size - offset);
4055
kaddr = folio_address(eb->folios[i]);
4056
ret = memcmp(ptr, kaddr + offset, cur);
4057
if (ret)
4058
break;
4059
4060
ptr += cur;
4061
len -= cur;
4062
offset = 0;
4063
i++;
4064
}
4065
return ret;
4066
}
4067
4068
/*
4069
 * Check that the extent buffer is uptodate.
4070
 *
4071
 * For the regular case, check if the folio at index @i is uptodate.
4072
 * For the subpage case, check if the range covered by the eb is uptodate.
4073
 */
4074
static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
4075
{
4076
struct btrfs_fs_info *fs_info = eb->fs_info;
4077
struct folio *folio = eb->folios[i];
4078
4079
ASSERT(folio);
4080
4081
/*
4082
* If we are using the commit root we could potentially clear a page
4083
* Uptodate while we're using the extent buffer that we've previously
4084
* looked up. We don't want to complain in this case, as the page was
4085
* valid before, we just didn't write it out. Instead we want to catch
4086
* the case where we didn't actually read the block properly, which
4087
* would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR.
4088
*/
4089
if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4090
return;
4091
4092
if (btrfs_meta_is_subpage(fs_info)) {
4093
folio = eb->folios[0];
4094
ASSERT(i == 0);
4095
if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
4096
eb->start, eb->len)))
4097
btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len);
4098
} else {
4099
WARN_ON(!folio_test_uptodate(folio));
4100
}
4101
}
4102
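/*
 * Copy @len bytes from @srcv into the eb at offset @start. @use_memmove selects
 * memmove() over memcpy() for the case where source and destination overlap.
 */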
4103
static void __write_extent_buffer(const struct extent_buffer *eb,
4104
const void *srcv, unsigned long start,
4105
unsigned long len, bool use_memmove)
4106
{
4107
const int unit_size = eb->folio_size;
4108
size_t cur;
4109
size_t offset;
4110
char *kaddr;
4111
const char *src = (const char *)srcv;
4112
unsigned long i = get_eb_folio_index(eb, start);
4113
/* For unmapped (dummy) ebs, no need to check their uptodate status. */
4114
const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
4115
4116
if (check_eb_range(eb, start, len))
4117
return;
4118
4119
if (eb->addr) {
4120
if (use_memmove)
4121
memmove(eb->addr + start, srcv, len);
4122
else
4123
memcpy(eb->addr + start, srcv, len);
4124
return;
4125
}
4126
4127
offset = get_eb_offset_in_folio(eb, start);
4128
4129
while (len > 0) {
4130
if (check_uptodate)
4131
assert_eb_folio_uptodate(eb, i);
4132
4133
cur = min(len, unit_size - offset);
4134
kaddr = folio_address(eb->folios[i]);
4135
if (use_memmove)
4136
memmove(kaddr + offset, src, cur);
4137
else
4138
memcpy(kaddr + offset, src, cur);
4139
4140
src += cur;
4141
len -= cur;
4142
offset = 0;
4143
i++;
4144
}
4145
}
4146
4147
void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
4148
unsigned long start, unsigned long len)
4149
{
4150
return __write_extent_buffer(eb, srcv, start, len, false);
4151
}
4152
4153
static void memset_extent_buffer(const struct extent_buffer *eb, int c,
4154
unsigned long start, unsigned long len)
4155
{
4156
const int unit_size = eb->folio_size;
4157
unsigned long cur = start;
4158
4159
if (eb->addr) {
4160
memset(eb->addr + start, c, len);
4161
return;
4162
}
4163
4164
while (cur < start + len) {
4165
unsigned long index = get_eb_folio_index(eb, cur);
4166
unsigned int offset = get_eb_offset_in_folio(eb, cur);
4167
unsigned int cur_len = min(start + len - cur, unit_size - offset);
4168
4169
assert_eb_folio_uptodate(eb, index);
4170
memset(folio_address(eb->folios[index]) + offset, c, cur_len);
4171
4172
cur += cur_len;
4173
}
4174
}
4175
4176
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
4177
unsigned long len)
4178
{
4179
if (check_eb_range(eb, start, len))
4180
return;
4181
return memset_extent_buffer(eb, 0, start, len);
4182
}
4183
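/* Copy the entire content of @src into @dst. Both ebs must have the same length. */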
4184
void copy_extent_buffer_full(const struct extent_buffer *dst,
4185
const struct extent_buffer *src)
4186
{
4187
const int unit_size = src->folio_size;
4188
unsigned long cur = 0;
4189
4190
ASSERT(dst->len == src->len);
4191
4192
while (cur < src->len) {
4193
unsigned long index = get_eb_folio_index(src, cur);
4194
unsigned long offset = get_eb_offset_in_folio(src, cur);
4195
unsigned long cur_len = min(src->len, unit_size - offset);
4196
void *addr = folio_address(src->folios[index]) + offset;
4197
4198
write_extent_buffer(dst, addr, cur, cur_len);
4199
4200
cur += cur_len;
4201
}
4202
}
4203
4204
void copy_extent_buffer(const struct extent_buffer *dst,
4205
const struct extent_buffer *src,
4206
unsigned long dst_offset, unsigned long src_offset,
4207
unsigned long len)
4208
{
4209
const int unit_size = dst->folio_size;
4210
u64 dst_len = dst->len;
4211
size_t cur;
4212
size_t offset;
4213
char *kaddr;
4214
unsigned long i = get_eb_folio_index(dst, dst_offset);
4215
4216
if (check_eb_range(dst, dst_offset, len) ||
4217
check_eb_range(src, src_offset, len))
4218
return;
4219
4220
WARN_ON(src->len != dst_len);
4221
4222
offset = get_eb_offset_in_folio(dst, dst_offset);
4223
4224
while (len > 0) {
4225
assert_eb_folio_uptodate(dst, i);
4226
4227
cur = min(len, (unsigned long)(unit_size - offset));
4228
4229
kaddr = folio_address(dst->folios[i]);
4230
read_extent_buffer(src, kaddr + offset, src_offset, cur);
4231
4232
src_offset += cur;
4233
len -= cur;
4234
offset = 0;
4235
i++;
4236
}
4237
}
4238
4239
/*
4240
* Calculate the folio and offset of the byte containing the given bit number.
4241
*
4242
* @eb: the extent buffer
4243
* @start: offset of the bitmap item in the extent buffer
4244
* @nr: bit number
4245
* @folio_index: return index of the folio in the extent buffer that contains
4246
* the given bit number
4247
* @folio_offset: return offset into the folio given by folio_index
4248
*
4249
* This helper hides the ugliness of finding the byte in an extent buffer which
4250
* contains a given bit.
4251
*/
4252
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
4253
unsigned long start, unsigned long nr,
4254
unsigned long *folio_index,
4255
size_t *folio_offset)
4256
{
4257
size_t byte_offset = BIT_BYTE(nr);
4258
size_t offset;
4259
4260
/*
4261
* The byte we want is the offset of the extent buffer + the offset of
4262
* the bitmap item in the extent buffer + the offset of the byte in the
4263
* bitmap item.
4264
*/
4265
offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset;
4266
4267
*folio_index = offset >> eb->folio_shift;
4268
*folio_offset = offset_in_eb_folio(eb, offset);
4269
}
4270
4271
/*
4272
* Determine whether a bit in a bitmap item is set.
4273
*
4274
* @eb: the extent buffer
4275
* @start: offset of the bitmap item in the extent buffer
4276
* @nr: bit number to test
4277
*/
4278
bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
4279
unsigned long nr)
4280
{
4281
unsigned long i;
4282
size_t offset;
4283
u8 *kaddr;
4284
4285
eb_bitmap_offset(eb, start, nr, &i, &offset);
4286
assert_eb_folio_uptodate(eb, i);
4287
kaddr = folio_address(eb->folios[i]);
4288
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
4289
}
4290
4291
static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
4292
{
4293
unsigned long index = get_eb_folio_index(eb, bytenr);
4294
4295
if (check_eb_range(eb, bytenr, 1))
4296
return NULL;
4297
return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr);
4298
}
4299
4300
/*
4301
* Set an area of a bitmap to 1.
4302
*
4303
* @eb: the extent buffer
4304
* @start: offset of the bitmap item in the extent buffer
4305
* @pos: bit number of the first bit
4306
* @len: number of bits to set
4307
*/
4308
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
4309
unsigned long pos, unsigned long len)
4310
{
4311
unsigned int first_byte = start + BIT_BYTE(pos);
4312
unsigned int last_byte = start + BIT_BYTE(pos + len - 1);
4313
const bool same_byte = (first_byte == last_byte);
4314
u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
4315
u8 *kaddr;
4316
4317
if (same_byte)
4318
mask &= BITMAP_LAST_BYTE_MASK(pos + len);
4319
4320
/* Handle the first byte. */
4321
kaddr = extent_buffer_get_byte(eb, first_byte);
4322
*kaddr |= mask;
4323
if (same_byte)
4324
return;
4325
4326
/* Handle the byte aligned part. */
4327
ASSERT(first_byte + 1 <= last_byte);
4328
memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1);
4329
4330
/* Handle the last byte. */
4331
kaddr = extent_buffer_get_byte(eb, last_byte);
4332
*kaddr |= BITMAP_LAST_BYTE_MASK(pos + len);
4333
}
4334
4335
4336
/*
4337
* Clear an area of a bitmap.
4338
*
4339
* @eb: the extent buffer
4340
* @start: offset of the bitmap item in the extent buffer
4341
* @pos: bit number of the first bit
4342
* @len: number of bits to clear
4343
*/
4344
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
4345
unsigned long start, unsigned long pos,
4346
unsigned long len)
4347
{
4348
unsigned int first_byte = start + BIT_BYTE(pos);
4349
unsigned int last_byte = start + BIT_BYTE(pos + len - 1);
4350
const bool same_byte = (first_byte == last_byte);
4351
u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
4352
u8 *kaddr;
4353
4354
if (same_byte)
4355
mask &= BITMAP_LAST_BYTE_MASK(pos + len);
4356
4357
/* Handle the first byte. */
4358
kaddr = extent_buffer_get_byte(eb, first_byte);
4359
*kaddr &= ~mask;
4360
if (same_byte)
4361
return;
4362
4363
/* Handle the byte aligned part. */
4364
ASSERT(first_byte + 1 <= last_byte);
4365
memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1);
4366
4367
/* Handle the last byte. */
4368
kaddr = extent_buffer_get_byte(eb, last_byte);
4369
*kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len);
4370
}
4371
4372
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4373
{
4374
unsigned long distance = (src > dst) ? src - dst : dst - src;
4375
return distance < len;
4376
}
4377
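/*
 * Copy @len bytes inside @dst from @src_offset to @dst_offset, switching to
 * memmove() for any chunk where the source and destination ranges overlap.
 */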
4378
void memcpy_extent_buffer(const struct extent_buffer *dst,
4379
unsigned long dst_offset, unsigned long src_offset,
4380
unsigned long len)
4381
{
4382
const int unit_size = dst->folio_size;
4383
unsigned long cur_off = 0;
4384
4385
if (check_eb_range(dst, dst_offset, len) ||
4386
check_eb_range(dst, src_offset, len))
4387
return;
4388
4389
if (dst->addr) {
4390
const bool use_memmove = areas_overlap(src_offset, dst_offset, len);
4391
4392
if (use_memmove)
4393
memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
4394
else
4395
memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
4396
return;
4397
}
4398
4399
while (cur_off < len) {
4400
unsigned long cur_src = cur_off + src_offset;
4401
unsigned long folio_index = get_eb_folio_index(dst, cur_src);
4402
unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src);
4403
unsigned long cur_len = min(src_offset + len - cur_src,
4404
unit_size - folio_off);
4405
void *src_addr = folio_address(dst->folios[folio_index]) + folio_off;
4406
const bool use_memmove = areas_overlap(src_offset + cur_off,
4407
dst_offset + cur_off, cur_len);
4408
4409
__write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len,
4410
use_memmove);
4411
cur_off += cur_len;
4412
}
4413
}
4414
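/*
 * Overlap-safe move of @len bytes inside @dst from @src_offset to @dst_offset.
 * Moves towards lower offsets are delegated to memcpy_extent_buffer(), moves
 * towards higher offsets are copied backwards from the end.
 */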
4415
void memmove_extent_buffer(const struct extent_buffer *dst,
4416
unsigned long dst_offset, unsigned long src_offset,
4417
unsigned long len)
4418
{
4419
unsigned long dst_end = dst_offset + len - 1;
4420
unsigned long src_end = src_offset + len - 1;
4421
4422
if (check_eb_range(dst, dst_offset, len) ||
4423
check_eb_range(dst, src_offset, len))
4424
return;
4425
4426
if (dst_offset < src_offset) {
4427
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4428
return;
4429
}
4430
4431
if (dst->addr) {
4432
memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
4433
return;
4434
}
4435
4436
while (len > 0) {
4437
unsigned long src_i;
4438
size_t cur;
4439
size_t dst_off_in_folio;
4440
size_t src_off_in_folio;
4441
void *src_addr;
4442
bool use_memmove;
4443
4444
src_i = get_eb_folio_index(dst, src_end);
4445
4446
dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end);
4447
src_off_in_folio = get_eb_offset_in_folio(dst, src_end);
4448
4449
cur = min_t(unsigned long, len, src_off_in_folio + 1);
4450
cur = min(cur, dst_off_in_folio + 1);
4451
4452
src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
4453
cur + 1;
4454
use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
4455
cur);
4456
4457
__write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur,
4458
use_memmove);
4459
4460
dst_end -= cur;
4461
src_end -= cur;
4462
len -= cur;
4463
}
4464
}
4465
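/*
 * Subpage variant of try_release_extent_buffer(): try to release every eb
 * covering the folio. Returns 1 only if the folio private got cleared, i.e.
 * all ebs in the folio have been released.
 */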
4466
static int try_release_subpage_extent_buffer(struct folio *folio)
4467
{
4468
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
4469
struct extent_buffer *eb;
4470
unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits);
4471
unsigned long index = start;
4472
unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
4473
int ret;
4474
4475
rcu_read_lock();
4476
xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
4477
/*
4478
* The same as try_release_extent_buffer(), to ensure the eb
4479
* won't disappear out from under us.
4480
*/
4481
spin_lock(&eb->refs_lock);
4482
rcu_read_unlock();
4483
4484
if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
4485
spin_unlock(&eb->refs_lock);
4486
rcu_read_lock();
4487
continue;
4488
}
4489
4490
/*
4491
* If tree ref isn't set then we know the ref on this eb is a
4492
* real ref, so just return, this eb will likely be freed soon
4493
* anyway.
4494
*/
4495
if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4496
spin_unlock(&eb->refs_lock);
4497
break;
4498
}
4499
4500
/*
4501
* Here we don't care about the return value, we will always
4502
* check the folio private at the end. And
4503
* release_extent_buffer() will release the refs_lock.
4504
*/
4505
release_extent_buffer(eb);
4506
rcu_read_lock();
4507
}
4508
rcu_read_unlock();
4509
4510
/*
4511
 * Finally, check if we have cleared the folio private, as if we have
4512
 * released all ebs in the page, the folio private should be cleared by now.
4513
 */
4514
spin_lock(&folio->mapping->i_private_lock);
4515
if (!folio_test_private(folio))
4516
ret = 1;
4517
else
4518
ret = 0;
4519
spin_unlock(&folio->mapping->i_private_lock);
4520
return ret;
4521
}
4522
4523
int try_release_extent_buffer(struct folio *folio)
4524
{
4525
struct extent_buffer *eb;
4526
4527
if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
4528
return try_release_subpage_extent_buffer(folio);
4529
4530
/*
4531
* We need to make sure nobody is changing folio private, as we rely on
4532
* folio private as the pointer to extent buffer.
4533
*/
4534
spin_lock(&folio->mapping->i_private_lock);
4535
if (!folio_test_private(folio)) {
4536
spin_unlock(&folio->mapping->i_private_lock);
4537
return 1;
4538
}
4539
4540
eb = folio_get_private(folio);
4541
BUG_ON(!eb);
4542
4543
/*
4544
* This is a little awful but should be ok, we need to make sure that
4545
* the eb doesn't disappear out from under us while we're looking at
4546
* this page.
4547
*/
4548
spin_lock(&eb->refs_lock);
4549
if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
4550
spin_unlock(&eb->refs_lock);
4551
spin_unlock(&folio->mapping->i_private_lock);
4552
return 0;
4553
}
4554
spin_unlock(&folio->mapping->i_private_lock);
4555
4556
/*
4557
* If tree ref isn't set then we know the ref on this eb is a real ref,
4558
* so just return, this page will likely be freed soon anyway.
4559
*/
4560
if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4561
spin_unlock(&eb->refs_lock);
4562
return 0;
4563
}
4564
4565
return release_extent_buffer(eb);
4566
}
4567
4568
/*
4569
* Attempt to readahead a child block.
4570
*
4571
* @fs_info: the fs_info
4572
* @bytenr: bytenr to read
4573
* @owner_root: objectid of the root that owns this eb
4574
* @gen: generation for the uptodate check, can be 0
4575
* @level: level for the eb
4576
*
4577
* Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
4578
* normal uptodate check of the eb, without checking the generation. If we have
4579
* to read the block we will not block on anything.
4580
*/
4581
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
4582
u64 bytenr, u64 owner_root, u64 gen, int level)
4583
{
4584
struct btrfs_tree_parent_check check = {
4585
.level = level,
4586
.transid = gen
4587
};
4588
struct extent_buffer *eb;
4589
int ret;
4590
4591
eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
4592
if (IS_ERR(eb))
4593
return;
4594
4595
if (btrfs_buffer_uptodate(eb, gen, true)) {
4596
free_extent_buffer(eb);
4597
return;
4598
}
4599
4600
ret = read_extent_buffer_pages_nowait(eb, 0, &check);
4601
if (ret < 0)
4602
free_extent_buffer_stale(eb);
4603
else
4604
free_extent_buffer(eb);
4605
}
4606
4607
/*
4608
* Readahead a node's child block.
4609
*
4610
* @node: parent node we're reading from
4611
* @slot: slot in the parent node for the child we want to read
4612
*
4613
* A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
4614
* the slot in the node provided.
4615
*/
4616
void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
4617
{
4618
btrfs_readahead_tree_block(node->fs_info,
4619
btrfs_node_blockptr(node, slot),
4620
btrfs_header_owner(node),
4621
btrfs_node_ptr_generation(node, slot),
4622
btrfs_header_level(node) - 1);
4623
}
4624
4625