Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/direct-io.c
26282 views
1
// SPDX-License-Identifier: GPL-2.0
2
3
#include <linux/fsverity.h>
4
#include <linux/iomap.h>
5
#include "ctree.h"
6
#include "delalloc-space.h"
7
#include "direct-io.h"
8
#include "extent-tree.h"
9
#include "file.h"
10
#include "fs.h"
11
#include "transaction.h"
12
#include "volumes.h"
13
14
/*
 * Per-request direct I/O state. The iomap callbacks in this file reach it
 * through iter->private, and btrfs_dio_iomap_begin() zeroes it with memset()
 * before it reserves space and locks the file range.
 */
struct btrfs_dio_data {
	/* Running total of bio bytes seen by btrfs_dio_submit_io(). */
	ssize_t submitted;
	/* Changeset handed to btrfs_check_data_free_space() for writes. */
	struct extent_changeset *data_reserved;
	/* Ordered extent stored by btrfs_create_dio_extent() for a write. */
	struct btrfs_ordered_extent *ordered;
	/* True once the up-front data space reservation succeeded. */
	bool data_space_reserved;
	/* True when the write was set up through the NOCOW path. */
	bool nocow_done;
};
21
22
/*
 * Per-bio bookkeeping that wraps the btrfs_bio. It is recovered from the
 * embedded btrfs_bio with container_of() in the submit and end_io paths.
 */
struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};
30
31
/* Bio set that btrfs_dio_ops.bio_set points at. */
static struct bio_set btrfs_dio_bioset;
32
33
/*
 * Take the DIO lock and then the extent lock over [lockstart, lockend] for a
 * direct I/O request, retrying until the range has no ordered extents and,
 * for writes, no page cache pages.
 *
 * When IOMAP_NOWAIT is set in @iomap_flags only trylock variants are used and
 * every conflict is reported as -EAGAIN.
 *
 * Returns 0 with both locks held. Returns -EAGAIN or -ENOTBLK with neither
 * lock held; -ENOTBLK tells the caller to fall back to buffered I/O.
 */
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool for_write = (iomap_flags & IOMAP_WRITE);
	const bool no_block = (iomap_flags & IOMAP_NOWAIT);
	/* Error used whenever we give up on a conflict we must not wait for. */
	const int fallback_err = no_block ? -EAGAIN : -ENOTBLK;
	struct btrfs_inode *binode = BTRFS_I(inode);
	struct extent_io_tree *tree = &binode->io_tree;
	struct btrfs_ordered_extent *oe;
	int err = 0;

	/* The DIO lock always comes first, the extent lock second. */
	if (!no_block)
		btrfs_lock_dio_extent(tree, lockstart, lockend, cached_state);
	else if (!btrfs_try_lock_dio_extent(tree, lockstart, lockend, cached_state))
		return -EAGAIN;

	for (;;) {
		if (!no_block) {
			btrfs_lock_extent(tree, lockstart, lockend, cached_state);
		} else if (!btrfs_try_lock_extent(tree, lockstart, lockend,
						  cached_state)) {
			err = -EAGAIN;
			break;
		}

		/*
		 * The whole range we are about to do DIO on must be free of
		 * ordered extents.
		 */
		oe = btrfs_lookup_ordered_range(binode, lockstart,
						lockend - lockstart + 1);

		/*
		 * Buffered pages must be gone as well: we may have raced
		 * between the invalidation done in generic_file_direct_write
		 * and taking the extent lock. Without the invalidation a read
		 * following a write could see stale data.
		 */
		if (!oe) {
			if (!for_write)
				break;
			if (!filemap_range_has_page(inode->i_mapping,
						    lockstart, lockend))
				break;
		}

		btrfs_unlock_extent(tree, lockstart, lockend, cached_state);

		if (!oe) {
			/*
			 * Pages are present. Triggering writeback, waiting for
			 * it and then invalidating the pages (through
			 * invalidate_inode_pages2_range()) could deadlock on a
			 * page lock against a concurrent readahead (from a
			 * buffered read or a defrag): readahead holds the page
			 * lock while waiting for an ordered DIO extent that we
			 * created earlier but have not submitted a bio for
			 * yet, so that extent can not complete.
			 */
			err = fallback_err;
			break;
		}

		if (no_block) {
			btrfs_put_ordered_extent(oe);
			err = -EAGAIN;
			break;
		}

		/*
		 * A DIO read must not wait for an ordered extent created by a
		 * buffered write, or it can deadlock with concurrent buffered
		 * writes on page locks. That only happens when the read spans
		 * more than one extent map, an ordered extent was already
		 * created for an earlier extent map with its range locked in
		 * the inode's io tree, and a concurrent write covering both
		 * that earlier range and this one has started. Ranges in the
		 * io tree are unlocked only when the bios complete, while
		 * buffered writes lock pages before they try to lock the
		 * range in the io tree.
		 */
		if (for_write || test_bit(BTRFS_ORDERED_DIRECT, &oe->flags))
			btrfs_start_ordered_extent(oe);
		else
			err = fallback_err;
		btrfs_put_ordered_extent(oe);

		if (err)
			break;

		cond_resched();
	}

	if (err)
		btrfs_unlock_dio_extent(tree, lockstart, lockend, cached_state);
	return err;
}
137
138
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
139
struct btrfs_dio_data *dio_data,
140
const u64 start,
141
const struct btrfs_file_extent *file_extent,
142
const int type)
143
{
144
struct extent_map *em = NULL;
145
struct btrfs_ordered_extent *ordered;
146
147
if (type != BTRFS_ORDERED_NOCOW) {
148
em = btrfs_create_io_em(inode, start, file_extent, type);
149
if (IS_ERR(em))
150
goto out;
151
}
152
153
ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
154
(1U << type) |
155
(1U << BTRFS_ORDERED_DIRECT));
156
if (IS_ERR(ordered)) {
157
if (em) {
158
btrfs_free_extent_map(em);
159
btrfs_drop_extent_map_range(inode, start,
160
start + file_extent->num_bytes - 1, false);
161
}
162
em = ERR_CAST(ordered);
163
} else {
164
ASSERT(!dio_data->ordered);
165
dio_data->ordered = ordered;
166
}
167
out:
168
169
return em;
170
}
171
172
/*
 * Reserve a new on-disk extent of up to @len bytes for a COW direct write at
 * file offset @start and create the matching extent map and ordered extent.
 *
 * Returns the new extent map or an ERR_PTR. If creating the extent map or
 * ordered extent fails, the reserved extent is released again.
 */
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct btrfs_key reserved;
	struct extent_map *new_em;
	u64 hint;
	int ret;

	hint = btrfs_get_extent_allocation_hint(inode, start, len);

	/* -EAGAIN is only expected on zoned filesystems; wait and retry. */
	for (;;) {
		ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
					   0, hint, &reserved, 1, 1);
		if (ret != -EAGAIN)
			break;

		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
	}
	if (ret)
		return ERR_PTR(ret);

	/* Describe the reservation: uncompressed, starting at offset 0. */
	file_extent.disk_bytenr = reserved.objectid;
	file_extent.disk_num_bytes = reserved.offset;
	file_extent.num_bytes = reserved.offset;
	file_extent.ram_bytes = reserved.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;

	new_em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
					 BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, reserved.objectid);
	if (IS_ERR(new_em))
		btrfs_free_reserved_extent(fs_info, reserved.objectid,
					   reserved.offset, true);

	return new_em;
}
211
212
/*
 * Set up the extent map and ordered extent for a direct write at @start.
 *
 * @map:         In: extent map covering @start, with a reference owned by the
 *               caller. Out on success: the extent map the caller must use.
 *               That is a newly created one for COW and prealloc writes, or
 *               the unchanged input one for a plain NOCOW write.
 * @inode:       Inode being written.
 * @dio_data:    Per-request state; receives the ordered extent and has
 *               nocow_done set when the NOCOW path was taken.
 * @start:       File offset of the write.
 * @lenp:        In: requested length. Out: length that was actually set up.
 * @iomap_flags: IOMAP_* flags; IOMAP_NOWAIT forbids blocking.
 *
 * Returns 0 on success or a negative errno. On every error path the input
 * extent map reference has already been dropped here and *map is NULL or an
 * ERR_PTR, so the caller must not free it again.
 */
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 * just use the extent.
	 *
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = btrfs_extent_map_block_start(em) + (start - em->start);

		if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
				     false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	/* Metadata and outstanding extents are reserved for this length. */
	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			btrfs_free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			btrfs_free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			/*
			 * For BTRFS_ORDERED_NOCOW the input extent map was not
			 * replaced above, so it is still referenced. Our caller
			 * expects us to free it on error, like every other
			 * failure path in this function; otherwise the
			 * reference leaks.
			 */
			if (type != BTRFS_ORDERED_PREALLOC) {
				btrfs_free_extent_map(em);
				*map = NULL;
			}
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		btrfs_free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		/* The new extent may be shorter; give back the excess metadata. */
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	/* Roll back the metadata reservation if we failed after taking it. */
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}
352
353
/*
 * iomap ->iomap_begin callback for btrfs direct I/O.
 *
 * Locks the file range, looks up (and for writes creates) the extent backing
 * @start and fills @iomap with a mapping of at most @length bytes.
 *
 * Returns 0 on success. Returns -ENOTBLK when the caller has to fall back to
 * buffered I/O, -EAGAIN when a NOWAIT request would have to block or could
 * only be served partially, or another negative errno on failure. On failure
 * the range is unlocked and any data space reserved here is released.
 */
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_inode *binode = BTRFS_I(inode);
	struct extent_io_tree *io_tree = &binode->io_tree;
	struct extent_state *cached_state = NULL;
	struct extent_map *em;
	const bool write = !!(flags & IOMAP_WRITE);
	const bool nowait = !!(flags & IOMAP_NOWAIT);
	const u64 data_alloc_len = length;
	u64 lockstart, lockend;
	u64 len = length;
	u32 unlock_bits;
	int ret = 0;

	/*
	 * A buffer larger than PAGE_SIZE may fault. For a NOWAIT read that
	 * could mean submitting a bio for only part of the range and
	 * returning EIOCBQUEUED, i.e. an errant short read.
	 *
	 * Ideally iocbs would support partial completion, so that we could
	 * submit the partial bio, fault in the remaining pages and then submit
	 * the rest. That does not exist yet, so return -EAGAIN here and let
	 * the normal path handle it.
	 */
	if (!write && nowait && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Reads need a contiguous array for the checksums, so cap them to the
	 * size usually seen in buffered I/O.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(). With
	 * compressed writes in the range that is not enough: the first flush
	 * merely starts compression while keeping the pages locked. Flushing
	 * again guarantees that by the time it returns the bios for the
	 * compressed pages were submitted and finished and the pages are no
	 * longer under writeback.
	 *
	 * A NOWAIT request must not block on locked pages (likely compression
	 * still in progress) nor on dirty or writeback pages, same as in the
	 * non-compression case. iomap_dio_rw() already checked, but since the
	 * file range is not locked yet, mmap'ed writes or buffered reads
	 * (readpage() and readahead(), which lock pages) may have started in
	 * the meantime.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &binode->runtime_flags)) {
		if (!nowait) {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		} else if (filemap_range_needs_writeback(inode->i_mapping,
							 lockstart, lockend)) {
			return -EAGAIN;
		}
	}

	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * Data space is always reserved before locking the file range. Doing
	 * it afterwards could deadlock with concurrent writes to the same
	 * range when the range has several extents and the writes do not
	 * expand i_size (the inode lock is then taken in shared mode). A
	 * failure here is tolerated for now: once the range is locked we fail
	 * with ENOSPC only if it turns out a NOCOW write is not possible.
	 */
	if (write && !nowait) {
		ret = btrfs_check_data_free_space(binode,
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (ret == 0)
			dio_data->data_space_reserved = true;
		else if (!(binode->flags &
			   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * This fails when the page cache for the range could not be
	 * invalidated and we must fall back to buffered IO, or when a NOWAIT
	 * read/write would need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(binode, NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * INLINE and COMPRESSED extents go through buffered IO. INLINE is
	 * special and could probably be kludged in here, but it is still
	 * buffered, so play safe and use the generic buffered path.
	 *
	 * A COMPRESSED extent has to be read in full to be decompressed, so
	 * buffering is unavoidable anyway.
	 *
	 * -ENOTBLK is the value that makes the generic DIO code fall back to
	 * buffered IO.
	 */
	if (btrfs_extent_map_is_compressed(em) ||
	    em->disk_bytenr == EXTENT_MAP_INLINE) {
		btrfs_free_extent_map(em);
		/*
		 * In a NOWAIT context return -EAGAIN instead. Buffered IO can
		 * block (it has no NOWAIT support at the moment), and this
		 * also avoids short reads reaching user space: if data was
		 * already read from previous non-compressed extents and the
		 * buffered fallback in btrfs_file_read_iter() (through
		 * filemap_read()) then fails to fault in pages of the read
		 * buffer, filemap_read() returns a short read rather than
		 * -EFAULT because the number of bytes read so far is > 0.
		 */
		ret = nowait ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * A NOWAIT request that spans several extents (or a mix of extents and
	 * holes) gets -EAGAIN, so that the caller retries from a context where
	 * it may block. That avoids doing partial IO and reporting success,
	 * which is not optimal for writes and can surprise applications for
	 * reads.
	 *
	 * For reads, because iomap_dio_rw() is called with IOMAP_DIO_PARTIAL,
	 * we could otherwise return fewer bytes than requested without
	 * crossing EOF, which is wrong. It happens when faulting in pages of
	 * the iov_iter buffer fails after bios for earlier extents in the
	 * range were already submitted: iomap_dio_rw() may then return
	 * EIOCBQUEUED if not all of those bios completed yet, and we pass that
	 * on. EIOCBQUEUED must only be returned once bios for all extents in
	 * the range have been submitted.
	 */
	if (nowait && len < length) {
		btrfs_free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;

		/* The extent map may have changed and be shorter than asked. */
		len = min(len, em->len - (start - em->start));

		if (dio_data->data_space_reserved) {
			u64 free_from;
			u64 free_len = 0;

			if (dio_data->nocow_done) {
				/* NOCOW needs none of the reserved data space. */
				free_from = start;
				free_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				/* Give back the part beyond what we mapped. */
				free_from = start + len;
				free_len = data_alloc_len - len;
			}

			if (free_len > 0)
				btrfs_free_reserved_data_space(binode,
							dio_data->data_reserved,
							free_from, free_len);
		}
	}

	/*
	 * Translate the extent map into the iomap. The extent is trimmed (and
	 * the address moved) here even though the iomap code does that too,
	 * because only the part we perform I/O on is locked.
	 */
	if (em->disk_bytenr == EXTENT_MAP_HOLE ||
	    (!write && (em->flags & EXTENT_FLAG_PREALLOC))) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	btrfs_free_extent_map(em);

	/*
	 * Reads keep EXTENT_DIO_LOCKED until the io completes, writes only
	 * hold it up to here. The extent lock itself is held until we are
	 * completely done with the extent map, so that it stays valid.
	 */
	unlock_bits = write ? (EXTENT_LOCKED | EXTENT_DIO_LOCKED) : EXTENT_LOCKED;
	btrfs_clear_extent_bit(io_tree, lockstart, lockend, unlock_bits,
			       &cached_state);

	/* A read that mapped less than was locked drops the DIO lock on the tail. */
	if (!write && (start + len) < lockend)
		btrfs_unlock_dio_extent(io_tree, start + len, lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Deliberately not EXTENT_LOCK_BITS, in case that gets extended later
	 * and this site is forgotten: EXTENT_LOCKED and EXTENT_DIO_LOCKED are
	 * exactly what is expected to be set here, so clear exactly those.
	 */
	btrfs_clear_extent_bit(io_tree, lockstart, lockend,
			       EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(binode, dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}
605
606
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
607
ssize_t written, unsigned int flags, struct iomap *iomap)
608
{
609
struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
610
struct btrfs_dio_data *dio_data = iter->private;
611
size_t submitted = dio_data->submitted;
612
const bool write = !!(flags & IOMAP_WRITE);
613
int ret = 0;
614
615
if (!write && (iomap->type == IOMAP_HOLE)) {
616
/* If reading from a hole, unlock and return */
617
btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
618
pos + length - 1, NULL);
619
return 0;
620
}
621
622
if (submitted < length) {
623
pos += submitted;
624
length -= submitted;
625
if (write)
626
btrfs_finish_ordered_extent(dio_data->ordered, NULL,
627
pos, length, false);
628
else
629
btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
630
pos + length - 1, NULL);
631
ret = -ENOTBLK;
632
}
633
if (write) {
634
btrfs_put_ordered_extent(dio_data->ordered);
635
dio_data->ordered = NULL;
636
}
637
638
if (write)
639
extent_changeset_free(dio_data->data_reserved);
640
return ret;
641
}
642
643
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
644
{
645
struct btrfs_dio_private *dip =
646
container_of(bbio, struct btrfs_dio_private, bbio);
647
struct btrfs_inode *inode = bbio->inode;
648
struct bio *bio = &bbio->bio;
649
650
if (bio->bi_status) {
651
btrfs_warn(inode->root->fs_info,
652
"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
653
btrfs_ino(inode), bio->bi_opf,
654
dip->file_offset, dip->bytes, bio->bi_status);
655
}
656
657
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
658
btrfs_finish_ordered_extent(bbio->ordered, NULL,
659
dip->file_offset, dip->bytes,
660
!bio->bi_status);
661
} else {
662
btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
663
dip->file_offset + dip->bytes - 1, NULL);
664
}
665
666
bbio->bio.bi_private = bbio->private;
667
iomap_dio_bio_end_io(bio);
668
}
669
670
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
671
struct btrfs_ordered_extent *ordered)
672
{
673
u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
674
u64 len = bbio->bio.bi_iter.bi_size;
675
struct btrfs_ordered_extent *new;
676
int ret;
677
678
/* Must always be called for the beginning of an ordered extent. */
679
if (WARN_ON_ONCE(start != ordered->disk_bytenr))
680
return -EINVAL;
681
682
/* No need to split if the ordered extent covers the entire bio. */
683
if (ordered->disk_num_bytes == len) {
684
refcount_inc(&ordered->refs);
685
bbio->ordered = ordered;
686
return 0;
687
}
688
689
/*
690
* Don't split the extent_map for NOCOW extents, as we're writing into
691
* a pre-existing one.
692
*/
693
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
694
ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
695
ordered->num_bytes, len,
696
ordered->disk_bytenr);
697
if (ret)
698
return ret;
699
}
700
701
new = btrfs_split_ordered_extent(ordered, len);
702
if (IS_ERR(new))
703
return PTR_ERR(new);
704
bbio->ordered = new;
705
return 0;
706
}
707
708
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
709
loff_t file_offset)
710
{
711
struct btrfs_bio *bbio = btrfs_bio(bio);
712
struct btrfs_dio_private *dip =
713
container_of(bbio, struct btrfs_dio_private, bbio);
714
struct btrfs_dio_data *dio_data = iter->private;
715
716
btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
717
btrfs_dio_end_io, bio->bi_private);
718
bbio->inode = BTRFS_I(iter->inode);
719
bbio->file_offset = file_offset;
720
721
dip->file_offset = file_offset;
722
dip->bytes = bio->bi_iter.bi_size;
723
724
dio_data->submitted += bio->bi_iter.bi_size;
725
726
/*
727
* Check if we are doing a partial write. If we are, we need to split
728
* the ordered extent to match the submitted bio. Hang on to the
729
* remaining unfinishable ordered_extent in dio_data so that it can be
730
* cancelled in iomap_end to avoid a deadlock wherein faulting the
731
* remaining pages is blocked on the outstanding ordered extent.
732
*/
733
if (iter->flags & IOMAP_WRITE) {
734
int ret;
735
736
ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
737
if (ret) {
738
btrfs_finish_ordered_extent(dio_data->ordered, NULL,
739
file_offset, dip->bytes,
740
!ret);
741
bio->bi_status = errno_to_blk_status(ret);
742
iomap_dio_bio_end_io(bio);
743
return;
744
}
745
}
746
747
btrfs_submit_bbio(bbio, 0);
748
}
749
750
/*
 * Mapping callbacks handed to iomap_dio_rw()/__iomap_dio_rw() by
 * btrfs_dio_read() and btrfs_dio_write().
 */
static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin = btrfs_dio_iomap_begin,
	.iomap_end = btrfs_dio_iomap_end,
};
754
755
/*
 * Direct I/O hooks handed to iomap together with btrfs_dio_iomap_ops: the
 * bio submission callback and the bio set named by .bio_set.
 */
static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io = btrfs_dio_submit_io,
	.bio_set = &btrfs_dio_bioset,
};
759
760
static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
761
size_t done_before)
762
{
763
struct btrfs_dio_data data = { 0 };
764
765
return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
766
IOMAP_DIO_PARTIAL, &data, done_before);
767
}
768
769
static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
770
size_t done_before)
771
{
772
struct btrfs_dio_data data = { 0 };
773
774
return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
775
IOMAP_DIO_PARTIAL, &data, done_before);
776
}
777
778
/*
 * Check that a direct I/O request is aligned to the filesystem sector size.
 *
 * Returns 0 if both the file @offset and the alignment reported by
 * iov_iter_alignment() for @iter are multiples of the sector size, -EINVAL
 * otherwise.
 */
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 sector_mask = fs_info->sectorsize - 1;

	/* The iterator is only inspected when the offset itself is aligned. */
	if ((offset & sector_mask) ||
	    (iov_iter_alignment(iter) & sector_mask))
		return -EINVAL;

	return 0;
}
791
792
ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
793
{
794
struct file *file = iocb->ki_filp;
795
struct inode *inode = file_inode(file);
796
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
797
loff_t pos;
798
ssize_t written = 0;
799
ssize_t written_buffered;
800
size_t prev_left = 0;
801
loff_t endbyte;
802
ssize_t ret;
803
unsigned int ilock_flags = 0;
804
struct iomap_dio *dio;
805
806
if (iocb->ki_flags & IOCB_NOWAIT)
807
ilock_flags |= BTRFS_ILOCK_TRY;
808
809
/*
810
* If the write DIO is within EOF, use a shared lock and also only if
811
* security bits will likely not be dropped by file_remove_privs() called
812
* from btrfs_write_check(). Either will need to be rechecked after the
813
* lock was acquired.
814
*/
815
if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
816
ilock_flags |= BTRFS_ILOCK_SHARED;
817
818
relock:
819
ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
820
if (ret < 0)
821
return ret;
822
823
/* Shared lock cannot be used with security bits set. */
824
if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
825
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
826
ilock_flags &= ~BTRFS_ILOCK_SHARED;
827
goto relock;
828
}
829
830
ret = generic_write_checks(iocb, from);
831
if (ret <= 0) {
832
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
833
return ret;
834
}
835
836
ret = btrfs_write_check(iocb, ret);
837
if (ret < 0) {
838
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
839
goto out;
840
}
841
842
pos = iocb->ki_pos;
843
/*
844
* Re-check since file size may have changed just before taking the
845
* lock or pos may have changed because of O_APPEND in generic_write_check()
846
*/
847
if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
848
pos + iov_iter_count(from) > i_size_read(inode)) {
849
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
850
ilock_flags &= ~BTRFS_ILOCK_SHARED;
851
goto relock;
852
}
853
854
if (check_direct_IO(fs_info, from, pos)) {
855
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
856
goto buffered;
857
}
858
/*
859
* We can't control the folios being passed in, applications can write
860
* to them while a direct IO write is in progress. This means the
861
* content might change after we calculated the data checksum.
862
* Therefore we can end up storing a checksum that doesn't match the
863
* persisted data.
864
*
865
* To be extra safe and avoid false data checksum mismatch, if the
866
* inode requires data checksum, just fallback to buffered IO.
867
* For buffered IO we have full control of page cache and can ensure
868
* no one is modifying the content during writeback.
869
*/
870
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
871
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
872
goto buffered;
873
}
874
875
/*
876
* The iov_iter can be mapped to the same file range we are writing to.
877
* If that's the case, then we will deadlock in the iomap code, because
878
* it first calls our callback btrfs_dio_iomap_begin(), which will create
879
* an ordered extent, and after that it will fault in the pages that the
880
* iov_iter refers to. During the fault in we end up in the readahead
881
* pages code (starting at btrfs_readahead()), which will lock the range,
882
* find that ordered extent and then wait for it to complete (at
883
* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
884
* obviously the ordered extent can never complete as we didn't submit
885
* yet the respective bio(s). This always happens when the buffer is
886
* memory mapped to the same file range, since the iomap DIO code always
887
* invalidates pages in the target file range (after starting and waiting
888
* for any writeback).
889
*
890
* So here we disable page faults in the iov_iter and then retry if we
891
* got -EFAULT, faulting in the pages before the retry.
892
*/
893
again:
894
from->nofault = true;
895
dio = btrfs_dio_write(iocb, from, written);
896
from->nofault = false;
897
898
if (IS_ERR_OR_NULL(dio)) {
899
ret = PTR_ERR_OR_ZERO(dio);
900
} else {
901
/*
902
* If we have a synchronous write, we must make sure the fsync
903
* triggered by the iomap_dio_complete() call below doesn't
904
* deadlock on the inode lock - we are already holding it and we
905
* can't call it after unlocking because we may need to complete
906
* partial writes due to the input buffer (or parts of it) not
907
* being already faulted in.
908
*/
909
ASSERT(current->journal_info == NULL);
910
current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
911
ret = iomap_dio_complete(dio);
912
current->journal_info = NULL;
913
}
914
915
/* No increment (+=) because iomap returns a cumulative value. */
916
if (ret > 0)
917
written = ret;
918
919
if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
920
const size_t left = iov_iter_count(from);
921
/*
922
* We have more data left to write. Try to fault in as many as
923
* possible of the remainder pages and retry. We do this without
924
* releasing and locking again the inode, to prevent races with
925
* truncate.
926
*
927
* Also, in case the iov refers to pages in the file range of the
928
* file we want to write to (due to a mmap), we could enter an
929
* infinite loop if we retry after faulting the pages in, since
930
* iomap will invalidate any pages in the range early on, before
931
* it tries to fault in the pages of the iov. So we keep track of
932
* how much was left of iov in the previous EFAULT and fallback
933
* to buffered IO in case we haven't made any progress.
934
*/
935
if (left == prev_left) {
936
ret = -ENOTBLK;
937
} else {
938
fault_in_iov_iter_readable(from, left);
939
prev_left = left;
940
goto again;
941
}
942
}
943
944
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
945
946
/*
947
* If 'ret' is -ENOTBLK or we have not written all data, then it means
948
* we must fallback to buffered IO.
949
*/
950
if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
951
goto out;
952
953
buffered:
954
/*
955
* If we are in a NOWAIT context, then return -EAGAIN to signal the caller
956
* it must retry the operation in a context where blocking is acceptable,
957
* because even if we end up not blocking during the buffered IO attempt
958
* below, we will block when flushing and waiting for the IO.
959
*/
960
if (iocb->ki_flags & IOCB_NOWAIT) {
961
ret = -EAGAIN;
962
goto out;
963
}
964
965
pos = iocb->ki_pos;
966
written_buffered = btrfs_buffered_write(iocb, from);
967
if (written_buffered < 0) {
968
ret = written_buffered;
969
goto out;
970
}
971
/*
972
* Ensure all data is persisted. We want the next direct IO read to be
973
* able to read what was just written.
974
*/
975
endbyte = pos + written_buffered - 1;
976
ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
977
if (ret)
978
goto out;
979
ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
980
if (ret)
981
goto out;
982
written += written_buffered;
983
iocb->ki_pos = pos + written_buffered;
984
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
985
endbyte >> PAGE_SHIFT);
986
out:
987
return ret < 0 ? ret : written;
988
}
989
990
/*
 * Validate a direct IO read request before it is attempted.
 *
 * The generic check_direct_IO() validation runs first and any negative value
 * it returns is propagated unchanged. Iterators that are not iovec based need
 * no further checking. For iovec based iterators every pair of segments is
 * compared, and the request is rejected if two segments share the same
 * iov_base.
 *
 * Return: 0 if the request passed all checks, a negative value from
 * check_direct_IO(), or -EINVAL when two segments have the same base address.
 */
static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	const struct iovec *segments;
	int outer, inner;
	int ret;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	segments = iter_iov(iter);

	/* Pairwise comparison of the base address of every segment. */
	for (outer = 0; outer < iter->nr_segs; outer++) {
		const struct iovec *first = segments + outer;

		for (inner = outer + 1; inner < iter->nr_segs; inner++) {
			const struct iovec *second = segments + inner;

			if (first->iov_base == second->iov_base)
				return -EINVAL;
		}
	}

	return 0;
}
1014
1015
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
1016
{
1017
struct inode *inode = file_inode(iocb->ki_filp);
1018
size_t prev_left = 0;
1019
ssize_t read = 0;
1020
ssize_t ret;
1021
1022
if (fsverity_active(inode))
1023
return 0;
1024
1025
if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
1026
return 0;
1027
1028
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1029
again:
1030
/*
1031
* This is similar to what we do for direct IO writes, see the comment
1032
* at btrfs_direct_write(), but we also disable page faults in addition
1033
* to disabling them only at the iov_iter level. This is because when
1034
* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
1035
* which can still trigger page fault ins despite having set ->nofault
1036
* to true of our 'to' iov_iter.
1037
*
1038
* The difference to direct IO writes is that we deadlock when trying
1039
* to lock the extent range in the inode's tree during he page reads
1040
* triggered by the fault in (while for writes it is due to waiting for
1041
* our own ordered extent). This is because for direct IO reads,
1042
* btrfs_dio_iomap_begin() returns with the extent range locked, which
1043
* is only unlocked in the endio callback (end_bio_extent_readpage()).
1044
*/
1045
pagefault_disable();
1046
to->nofault = true;
1047
ret = btrfs_dio_read(iocb, to, read);
1048
to->nofault = false;
1049
pagefault_enable();
1050
1051
/* No increment (+=) because iomap returns a cumulative value. */
1052
if (ret > 0)
1053
read = ret;
1054
1055
if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
1056
const size_t left = iov_iter_count(to);
1057
1058
if (left == prev_left) {
1059
/*
1060
* We didn't make any progress since the last attempt,
1061
* fallback to a buffered read for the remainder of the
1062
* range. This is just to avoid any possibility of looping
1063
* for too long.
1064
*/
1065
ret = read;
1066
} else {
1067
/*
1068
* We made some progress since the last retry or this is
1069
* the first time we are retrying. Fault in as many pages
1070
* as possible and retry.
1071
*/
1072
fault_in_iov_iter_writeable(to, left);
1073
prev_left = left;
1074
goto again;
1075
}
1076
}
1077
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1078
return ret < 0 ? ret : read;
1079
}
1080
1081
/*
 * Set up the bio set used for direct IO.
 *
 * The front padding passed to bioset_init() is the offset of bbio.bio inside
 * struct btrfs_dio_private, so each bio handed out by the set is preceded by
 * room for the rest of that structure.
 *
 * Return: 0 on success, -ENOMEM if bioset_init() fails.
 */
int __init btrfs_init_dio(void)
{
	const int err = bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
				    offsetof(struct btrfs_dio_private, bbio.bio),
				    BIOSET_NEED_BVECS);

	return err ? -ENOMEM : 0;
}
1090
1091
/* Tear down the direct IO bio set, undoing btrfs_init_dio(). */
void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}
1095
1096