GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/direct-io.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/fsverity.h>
#include <linux/iomap.h>
#include "ctree.h"
#include "delalloc-space.h"
#include "direct-io.h"
#include "extent-tree.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"
#include "volumes.h"
#include "bio.h"
#include "ordered-data.h"

struct btrfs_dio_data {
	ssize_t submitted;
	struct extent_changeset *data_reserved;
	struct btrfs_ordered_extent *ordered;
	bool data_space_reserved;
	bool nocow_done;
};

struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};

static struct bio_set btrfs_dio_bioset;

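/*
 * Direct I/O locks a file range at two levels: the EXTENT_DIO_LOCKED bit is
 * taken first and, for reads, held until the bios complete (it is cleared in
 * btrfs_dio_end_io() or btrfs_dio_iomap_end()), while for writes it is
 * dropped at the end of btrfs_dio_iomap_begin() together with the regular
 * EXTENT_LOCKED bit. lock_extent_direct() takes both bits and retries until
 * the range has no ordered extents and, for writes, no buffered pages.
 */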
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	/* Direct lock must be taken before the extent lock. */
	if (nowait) {
		if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
			return -EAGAIN;
	} else {
		btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	}

	while (1) {
		if (nowait) {
			if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
						   cached_state)) {
				ret = -EAGAIN;
				break;
			}
		} else {
			btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent. The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point it has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	if (ret)
		btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
	return ret;
}

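/*
 * Create the extent map (unless this is a NOCOW write into an existing
 * extent) and the ordered extent for a direct I/O write, and stash the
 * ordered extent in dio_data->ordered so the submission and completion paths
 * can find it.
 */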
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  const u64 start,
						  const struct btrfs_file_extent *file_extent,
						  const int type)
{
	struct extent_map *em = NULL;
	struct btrfs_ordered_extent *ordered;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = btrfs_create_io_em(inode, start, file_extent, type);
		if (IS_ERR(em))
			goto out;
	}

	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
					     (1U << type) |
					     (1U << BTRFS_ORDERED_DIRECT));
	if (IS_ERR(ordered)) {
		if (em) {
			btrfs_free_extent_map(em);
			btrfs_drop_extent_map_range(inode, start,
						    start + file_extent->num_bytes - 1, false);
		}
		em = ERR_CAST(ordered);
	} else {
		ASSERT(!dio_data->ordered);
		dio_data->ordered = ordered;
	}
out:

	return em;
}

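/*
 * Allocate a new data extent for a COW direct I/O write and create the
 * matching BTRFS_ORDERED_REGULAR ordered extent. On zoned filesystems the
 * reservation may return -EAGAIN, in which case we wait for a zone finish
 * and retry.
 */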
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, true, true);
	if (ret == -EAGAIN) {
		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
		goto again;
	}
	if (ret)
		return ERR_PTR(ret);

	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = ins.offset;
	file_extent.ram_bytes = ins.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;
	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
				     BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);

	return em;
}

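/*
 * Set up the write side of the mapping for btrfs_dio_iomap_begin(): decide
 * whether the range can be written NOCOW (NODATACOW inode or prealloc
 * extent) or needs a new COW extent, reserve the metadata space, create the
 * ordered extent and trim *lenp to what was actually mapped.
 */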
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 *    existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 *    just use the extent.
	 *
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = btrfs_extent_map_block_start(em) + (start - em->start);

		if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
				     false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			btrfs_free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			btrfs_free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		btrfs_free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}

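/*
 * Our ->iomap_begin callback: lock the file range, look up (or create, for
 * writes) the extent covering it and translate the result into the iomap
 * structure. Compressed and inline extents, as well as several NOWAIT corner
 * cases, bail out so the caller falls back to buffered I/O.
 */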
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
				 loff_t length, unsigned int flags, struct iomap *iomap,
				 struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	u32 unlock_bits = EXTENT_LOCKED;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages are no longer under
	 * writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (!(BTRFS_I(inode)->flags &
			   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io. INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety let's just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		btrfs_free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		btrfs_free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	btrfs_free_extent_map(em);

	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part. We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
					lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}

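/*
 * Our ->iomap_end callback: if fewer bytes were submitted than the range
 * mapped in ->iomap_begin, clean up the unused tail (finish the ordered
 * extent for writes, unlock the dio extent range for reads) and return
 * -ENOTBLK. Reads from holes only need to drop the dio extent lock.
 */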
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			       ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
					pos + length - 1, NULL);
		return 0;
	}

	if (submitted < length) {
		pos += submitted;
		length -= submitted;
		if (write)
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    pos, length, false);
		else
			btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
						pos + length - 1, NULL);
		ret = -ENOTBLK;
	}
	if (write) {
		btrfs_put_ordered_extent(dio_data->ordered);
		dio_data->ordered = NULL;
	}

	if (write)
		extent_changeset_free(dio_data->data_reserved);
	return ret;
}

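/*
 * Bio completion for direct I/O: writes finish their ordered extent, reads
 * drop the EXTENT_DIO_LOCKED range taken in btrfs_dio_iomap_begin(), and in
 * both cases the completion is then handed back to iomap via
 * iomap_dio_bio_end_io().
 */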
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_inode *inode = bbio->inode;
	struct bio *bio = &bbio->bio;

	if (bio->bi_status) {
		btrfs_warn(inode->root->fs_info,
			   "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
			   btrfs_ino(inode), bio->bi_opf,
			   dip->file_offset, dip->bytes, bio->bi_status);
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
					    dip->file_offset, dip->bytes,
					    !bio->bi_status);
	} else {
		btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
					dip->file_offset + dip->bytes - 1, NULL);
	}

	bbio->bio.bi_private = bbio->private;
	iomap_dio_bio_end_io(bio);
}

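/*
 * Match the ordered extent to the bio being submitted. If the bio only
 * covers a prefix of the ordered extent (a partial write), split off that
 * prefix (and, except for NOCOW writes, the corresponding extent map) and
 * attach it to the bio; the remainder stays in dio_data->ordered.
 */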
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
					struct btrfs_ordered_extent *ordered)
{
	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 len = bbio->bio.bi_iter.bi_size;
	struct btrfs_ordered_extent *new;
	int ret;

	/* Must always be called for the beginning of an ordered extent. */
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
		return -EINVAL;

	/* No need to split if the ordered extent covers the entire bio. */
	if (ordered->disk_num_bytes == len) {
		refcount_inc(&ordered->refs);
		bbio->ordered = ordered;
		return 0;
	}

	/*
	 * Don't split the extent_map for NOCOW extents, as we're writing into
	 * a pre-existing one.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
					     ordered->num_bytes, len,
					     ordered->disk_bytenr);
		if (ret)
			return ret;
	}

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	bbio->ordered = new;
	return 0;
}

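/*
 * Our ->submit_io callback: initialize the btrfs_bio embedded in the bio
 * allocated from btrfs_dio_bioset, record how much of the mapped range this
 * bio covers in dio_data->submitted, split the ordered extent for partial
 * writes and hand the bio to the btrfs bio layer.
 */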
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
				loff_t file_offset)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_dio_data *dio_data = iter->private;

	btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset,
		       btrfs_dio_end_io, bio->bi_private);

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/*
	 * Check if we are doing a partial write. If we are, we need to split
	 * the ordered extent to match the submitted bio. Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}

	btrfs_submit_bbio(bbio, 0);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin = btrfs_dio_iomap_begin,
	.iomap_end = btrfs_dio_iomap_end,
};

static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io = btrfs_dio_submit_io,
	.bio_set = &btrfs_dio_bioset,
};

static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
			      size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before);
}

static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
					 size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			      IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before);
}

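/*
 * Direct I/O requires the file offset and the iov_iter alignment to be
 * multiples of the sector size. For example, with a 4096 byte sector size
 * the mask is 0xfff, so an offset of 8192 passes (8192 & 0xfff == 0) while
 * an offset of 4100 does not (4100 & 0xfff == 4).
 */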
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 blocksize_mask = fs_info->sectorsize - 1;

	if (offset & blocksize_mask)
		return -EINVAL;

	if (iov_iter_alignment(iter) & blocksize_mask)
		return -EINVAL;
	return 0;
}

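/*
 * Entry point for direct I/O writes. Falls back to buffered I/O whenever a
 * zero-copy direct write is not safe or not possible: duplicated data
 * profiles, misaligned requests, inodes that need data checksums, or a
 * partial/failed direct write. Page faults are disabled on the iov_iter
 * while the DIO is in flight and the missing pages are faulted in between
 * retries, see the comment above the "again:" label.
 */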
ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t ret;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio;
	const u64 data_profile = btrfs_data_alloc_profile(fs_info) &
				 BTRFS_BLOCK_GROUP_PROFILE_MASK;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock was acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

	/*
	 * If our data profile has duplication (either extra mirrors or RAID56),
	 * we can not trust the direct IO buffer, the content may change during
	 * writeback and cause different contents written to different mirrors.
	 *
	 * Thus only RAID0 and SINGLE can go true zero-copy direct IO.
	 */
	if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0)
		goto buffered;

relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since file size may have changed just before taking the
	 * lock or pos may have changed because of O_APPEND in generic_write_checks()
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}
	/*
	 * We can't control the folios being passed in, applications can write
	 * to them while a direct IO write is in progress. This means the
	 * content might change after we calculated the data checksum.
	 * Therefore we can end up storing a checksum that doesn't match the
	 * persisted data.
	 *
	 * To be extra safe and avoid false data checksum mismatch, if the
	 * inode requires data checksum, just fallback to buffered IO.
	 * For buffered IO we have full control of page cache and can ensure
	 * no one is modifying the content during writeback.
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
	from->nofault = true;
	dio = btrfs_dio_write(iocb, from, written);
	from->nofault = false;

	if (IS_ERR_OR_NULL(dio)) {
		ret = PTR_ERR_OR_ZERO(dio);
	} else {
		/*
		 * If we have a synchronous write, we must make sure the fsync
		 * triggered by the iomap_dio_complete() call below doesn't
		 * deadlock on the inode lock - we are already holding it and we
		 * can't call it after unlocking because we may need to complete
		 * partial writes due to the input buffer (or parts of it) not
		 * being already faulted in.
		 */
		ASSERT(current->journal_info == NULL);
		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
		ret = iomap_dio_complete(dio);
		current->journal_info = NULL;
	}

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
	}

	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fallback to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;

buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}

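/*
 * Direct reads additionally reject iov arrays in which two iovec segments
 * share the same base address.
 */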
static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	int ret;
	int i, seg;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			const struct iovec *iov1 = iter_iov(iter) + seg;
			const struct iovec *iov2 = iter_iov(iter) + i;

			if (iov1->iov_base == iov2->iov_base)
				return -EINVAL;
		}
	}
	return 0;
}

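/*
 * Entry point for direct I/O reads. Returns 0 without doing any direct I/O
 * for fsverity inodes and for requests that fail check_direct_read(). Like
 * the write side, page faults are disabled while the DIO is in flight and
 * the destination pages are faulted in between retries.
 */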
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
		return 0;

	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins despite having set ->nofault
	 * to true of our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during the page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = btrfs_dio_read(iocb, to, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	return ret < 0 ? ret : read;
}

int __init btrfs_init_dio(void)
{
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_dio_private, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	return 0;
}

void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}