Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/mm/filemap.c
49108 views
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* linux/mm/filemap.c
4
*
5
* Copyright (C) 1994-1999 Linus Torvalds
6
*/
7
8
/*
9
* This file handles the generic file mmap semantics used by
10
* most "normal" filesystems (but you don't /have/ to use this:
11
* the NFS filesystem used to do this differently, for example)
12
*/
13
#include <linux/export.h>
14
#include <linux/compiler.h>
15
#include <linux/dax.h>
16
#include <linux/fs.h>
17
#include <linux/sched/signal.h>
18
#include <linux/uaccess.h>
19
#include <linux/capability.h>
20
#include <linux/kernel_stat.h>
21
#include <linux/gfp.h>
22
#include <linux/mm.h>
23
#include <linux/swap.h>
24
#include <linux/leafops.h>
25
#include <linux/syscalls.h>
26
#include <linux/mman.h>
27
#include <linux/pagemap.h>
28
#include <linux/file.h>
29
#include <linux/uio.h>
30
#include <linux/error-injection.h>
31
#include <linux/hash.h>
32
#include <linux/writeback.h>
33
#include <linux/backing-dev.h>
34
#include <linux/pagevec.h>
35
#include <linux/security.h>
36
#include <linux/cpuset.h>
37
#include <linux/hugetlb.h>
38
#include <linux/memcontrol.h>
39
#include <linux/shmem_fs.h>
40
#include <linux/rmap.h>
41
#include <linux/delayacct.h>
42
#include <linux/psi.h>
43
#include <linux/ramfs.h>
44
#include <linux/page_idle.h>
45
#include <linux/migrate.h>
46
#include <linux/pipe_fs_i.h>
47
#include <linux/splice.h>
48
#include <linux/rcupdate_wait.h>
49
#include <linux/sched/mm.h>
50
#include <linux/sysctl.h>
51
#include <linux/pgalloc.h>
52
53
#include <asm/tlbflush.h>
54
#include "internal.h"
55
56
#define CREATE_TRACE_POINTS
57
#include <trace/events/filemap.h>
58
59
/*
60
* FIXME: remove all knowledge of the buffer layer from the core VM
61
*/
62
#include <linux/buffer_head.h> /* for try_to_free_buffers */
63
64
#include <asm/mman.h>
65
66
#include "swap.h"
67
68
/*
69
* Shared mappings implemented 30.11.1994. It's not fully working yet,
70
* though.
71
*
72
* Shared mappings now work. 15.8.1995 Bruno.
73
*
74
* finished 'unifying' the page and buffer cache and SMP-threaded the
75
* page-cache, 21.05.1999, Ingo Molnar <[email protected]>
76
*
77
* SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <[email protected]>
78
*/
79
80
/*
81
* Lock ordering:
82
*
83
* ->i_mmap_rwsem (truncate_pagecache)
84
* ->private_lock (__free_pte->block_dirty_folio)
85
* ->swap_lock (exclusive_swap_page, others)
86
* ->i_pages lock
87
*
88
* ->i_rwsem
89
* ->invalidate_lock (acquired by fs in truncate path)
90
* ->i_mmap_rwsem (truncate->unmap_mapping_range)
91
*
92
* ->mmap_lock
93
* ->i_mmap_rwsem
94
* ->page_table_lock or pte_lock (various, mainly in memory.c)
95
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
96
*
97
* ->mmap_lock
98
* ->invalidate_lock (filemap_fault)
99
* ->lock_page (filemap_fault, access_process_vm)
100
*
101
* ->i_rwsem (generic_perform_write)
102
* ->mmap_lock (fault_in_readable->do_page_fault)
103
*
104
* bdi->wb.list_lock
105
* sb_lock (fs/fs-writeback.c)
106
* ->i_pages lock (__sync_single_inode)
107
*
108
* ->i_mmap_rwsem
109
* ->anon_vma.lock (vma_merge)
110
*
111
* ->anon_vma.lock
112
* ->page_table_lock or pte_lock (anon_vma_prepare and various)
113
*
114
* ->page_table_lock or pte_lock
115
* ->swap_lock (try_to_unmap_one)
116
* ->private_lock (try_to_unmap_one)
117
* ->i_pages lock (try_to_unmap_one)
118
* ->lruvec->lru_lock (follow_page_mask->mark_page_accessed)
119
* ->lruvec->lru_lock (check_pte_range->folio_isolate_lru)
120
* ->private_lock (folio_remove_rmap_pte->set_page_dirty)
121
* ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
122
* bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
123
* ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty)
124
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
125
* ->inode->i_lock (zap_pte_range->set_page_dirty)
126
* ->private_lock (zap_pte_range->block_dirty_folio)
127
*/
128
129
static void page_cache_delete(struct address_space *mapping,
130
struct folio *folio, void *shadow)
131
{
132
XA_STATE(xas, &mapping->i_pages, folio->index);
133
long nr = 1;
134
135
mapping_set_update(&xas, mapping);
136
137
xas_set_order(&xas, folio->index, folio_order(folio));
138
nr = folio_nr_pages(folio);
139
140
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
141
142
xas_store(&xas, shadow);
143
xas_init_marks(&xas);
144
145
folio->mapping = NULL;
146
/* Leave folio->index set: truncation lookup relies upon it */
147
mapping->nrpages -= nr;
148
}
149
150
static void filemap_unaccount_folio(struct address_space *mapping,
151
struct folio *folio)
152
{
153
long nr;
154
155
VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
156
if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
157
pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
158
current->comm, folio_pfn(folio));
159
dump_page(&folio->page, "still mapped when deleted");
160
dump_stack();
161
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
162
163
if (mapping_exiting(mapping) && !folio_test_large(folio)) {
164
int mapcount = folio_mapcount(folio);
165
166
if (folio_ref_count(folio) >= mapcount + 2) {
167
/*
168
* All vmas have already been torn down, so it's
169
* a good bet that actually the page is unmapped
170
* and we'd rather not leak it: if we're wrong,
171
* another bad page check should catch it later.
172
*/
173
atomic_set(&folio->_mapcount, -1);
174
folio_ref_sub(folio, mapcount);
175
}
176
}
177
}
178
179
/* hugetlb folios do not participate in page cache accounting. */
180
if (folio_test_hugetlb(folio))
181
return;
182
183
nr = folio_nr_pages(folio);
184
185
lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
186
if (folio_test_swapbacked(folio)) {
187
lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
188
if (folio_test_pmd_mappable(folio))
189
lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
190
} else if (folio_test_pmd_mappable(folio)) {
191
lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
192
filemap_nr_thps_dec(mapping);
193
}
194
if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
195
mod_node_page_state(folio_pgdat(folio),
196
NR_KERNEL_FILE_PAGES, -nr);
197
198
/*
199
* At this point folio must be either written or cleaned by
200
* truncate. Dirty folio here signals a bug and loss of
201
* unwritten data - on ordinary filesystems.
202
*
203
* But it's harmless on in-memory filesystems like tmpfs; and can
204
* occur when a driver which did get_user_pages() sets page dirty
205
* before putting it, while the inode is being finally evicted.
206
*
207
* Below fixes dirty accounting after removing the folio entirely
208
* but leaves the dirty flag set: it has no effect for truncated
209
* folio and anyway will be cleared before returning folio to
210
* buddy allocator.
211
*/
212
if (WARN_ON_ONCE(folio_test_dirty(folio) &&
213
mapping_can_writeback(mapping)))
214
folio_account_cleaned(folio, inode_to_wb(mapping->host));
215
}
216
217
/*
218
* Delete a page from the page cache and free it. Caller has to make
219
* sure the page is locked and that nobody else uses it - or that usage
220
* is safe. The caller must hold the i_pages lock.
221
*/
222
void __filemap_remove_folio(struct folio *folio, void *shadow)
223
{
224
struct address_space *mapping = folio->mapping;
225
226
trace_mm_filemap_delete_from_page_cache(folio);
227
filemap_unaccount_folio(mapping, folio);
228
page_cache_delete(mapping, folio, shadow);
229
}
230
231
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
232
{
233
void (*free_folio)(struct folio *);
234
235
free_folio = mapping->a_ops->free_folio;
236
if (free_folio)
237
free_folio(folio);
238
239
folio_put_refs(folio, folio_nr_pages(folio));
240
}
241
242
/**
243
* filemap_remove_folio - Remove folio from page cache.
244
* @folio: The folio.
245
*
246
* This must be called only on folios that are locked and have been
247
* verified to be in the page cache. It will never put the folio into
248
* the free list because the caller has a reference on the page.
249
*/
250
void filemap_remove_folio(struct folio *folio)
251
{
252
struct address_space *mapping = folio->mapping;
253
254
BUG_ON(!folio_test_locked(folio));
255
spin_lock(&mapping->host->i_lock);
256
xa_lock_irq(&mapping->i_pages);
257
__filemap_remove_folio(folio, NULL);
258
xa_unlock_irq(&mapping->i_pages);
259
if (mapping_shrinkable(mapping))
260
inode_lru_list_add(mapping->host);
261
spin_unlock(&mapping->host->i_lock);
262
263
filemap_free_folio(mapping, folio);
264
}
265
266
/*
267
* page_cache_delete_batch - delete several folios from page cache
268
* @mapping: the mapping to which folios belong
269
* @fbatch: batch of folios to delete
270
*
271
* The function walks over mapping->i_pages and removes folios passed in
272
* @fbatch from the mapping. The function expects @fbatch to be sorted
273
* by page index and is optimised for it to be dense.
274
* It tolerates holes in @fbatch (mapping entries at those indices are not
275
* modified).
276
*
277
* The function expects the i_pages lock to be held.
278
*/
279
static void page_cache_delete_batch(struct address_space *mapping,
280
struct folio_batch *fbatch)
281
{
282
XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
283
long total_pages = 0;
284
int i = 0;
285
struct folio *folio;
286
287
mapping_set_update(&xas, mapping);
288
xas_for_each(&xas, folio, ULONG_MAX) {
289
if (i >= folio_batch_count(fbatch))
290
break;
291
292
/* A swap/dax/shadow entry got inserted? Skip it. */
293
if (xa_is_value(folio))
294
continue;
295
/*
296
* A page got inserted in our range? Skip it. We have our
297
* pages locked so they are protected from being removed.
298
* If we see a page whose index is higher than ours, it
299
* means our page has been removed, which shouldn't be
300
* possible because we're holding the PageLock.
301
*/
302
if (folio != fbatch->folios[i]) {
303
VM_BUG_ON_FOLIO(folio->index >
304
fbatch->folios[i]->index, folio);
305
continue;
306
}
307
308
WARN_ON_ONCE(!folio_test_locked(folio));
309
310
folio->mapping = NULL;
311
/* Leave folio->index set: truncation lookup relies on it */
312
313
i++;
314
xas_store(&xas, NULL);
315
total_pages += folio_nr_pages(folio);
316
}
317
mapping->nrpages -= total_pages;
318
}
319
320
void delete_from_page_cache_batch(struct address_space *mapping,
321
struct folio_batch *fbatch)
322
{
323
int i;
324
325
if (!folio_batch_count(fbatch))
326
return;
327
328
spin_lock(&mapping->host->i_lock);
329
xa_lock_irq(&mapping->i_pages);
330
for (i = 0; i < folio_batch_count(fbatch); i++) {
331
struct folio *folio = fbatch->folios[i];
332
333
trace_mm_filemap_delete_from_page_cache(folio);
334
filemap_unaccount_folio(mapping, folio);
335
}
336
page_cache_delete_batch(mapping, fbatch);
337
xa_unlock_irq(&mapping->i_pages);
338
if (mapping_shrinkable(mapping))
339
inode_lru_list_add(mapping->host);
340
spin_unlock(&mapping->host->i_lock);
341
342
for (i = 0; i < folio_batch_count(fbatch); i++)
343
filemap_free_folio(mapping, fbatch->folios[i]);
344
}
345
346
int filemap_check_errors(struct address_space *mapping)
347
{
348
int ret = 0;
349
/* Check for outstanding write errors */
350
if (test_bit(AS_ENOSPC, &mapping->flags) &&
351
test_and_clear_bit(AS_ENOSPC, &mapping->flags))
352
ret = -ENOSPC;
353
if (test_bit(AS_EIO, &mapping->flags) &&
354
test_and_clear_bit(AS_EIO, &mapping->flags))
355
ret = -EIO;
356
return ret;
357
}
358
EXPORT_SYMBOL(filemap_check_errors);
359
360
static int filemap_check_and_keep_errors(struct address_space *mapping)
361
{
362
/* Check for outstanding write errors */
363
if (test_bit(AS_EIO, &mapping->flags))
364
return -EIO;
365
if (test_bit(AS_ENOSPC, &mapping->flags))
366
return -ENOSPC;
367
return 0;
368
}
369
370
static int filemap_writeback(struct address_space *mapping, loff_t start,
371
loff_t end, enum writeback_sync_modes sync_mode,
372
long *nr_to_write)
373
{
374
struct writeback_control wbc = {
375
.sync_mode = sync_mode,
376
.nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX,
377
.range_start = start,
378
.range_end = end,
379
};
380
int ret;
381
382
if (!mapping_can_writeback(mapping) ||
383
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
384
return 0;
385
386
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
387
ret = do_writepages(mapping, &wbc);
388
wbc_detach_inode(&wbc);
389
390
if (!ret && nr_to_write)
391
*nr_to_write = wbc.nr_to_write;
392
return ret;
393
}
394
395
/**
396
* filemap_fdatawrite_range - start writeback on mapping dirty pages in range
397
* @mapping: address space structure to write
398
* @start: offset in bytes where the range starts
399
* @end: offset in bytes where the range ends (inclusive)
400
*
401
* Start writeback against all of a mapping's dirty pages that lie
402
* within the byte offsets <start, end> inclusive.
403
*
404
* This is a data integrity operation that waits upon dirty or in writeback
405
* pages.
406
*
407
* Return: %0 on success, negative error code otherwise.
408
*/
409
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
410
loff_t end)
411
{
412
return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL);
413
}
414
EXPORT_SYMBOL(filemap_fdatawrite_range);
415
416
int filemap_fdatawrite(struct address_space *mapping)
417
{
418
return filemap_fdatawrite_range(mapping, 0, LLONG_MAX);
419
}
420
EXPORT_SYMBOL(filemap_fdatawrite);
421
422
/**
423
* filemap_flush_range - start writeback on a range
424
* @mapping: target address_space
425
* @start: index to start writeback on
426
* @end: last (inclusive) index for writeback
427
*
428
* This is a non-integrity writeback helper, to start writing back folios
429
* for the indicated range.
430
*
431
* Return: %0 on success, negative error code otherwise.
432
*/
433
int filemap_flush_range(struct address_space *mapping, loff_t start,
434
loff_t end)
435
{
436
return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL);
437
}
438
EXPORT_SYMBOL_GPL(filemap_flush_range);
439
440
/**
441
* filemap_flush - mostly a non-blocking flush
442
* @mapping: target address_space
443
*
444
* This is a mostly non-blocking flush. Not suitable for data-integrity
445
* purposes - I/O may not be started against all dirty pages.
446
*
447
* Return: %0 on success, negative error code otherwise.
448
*/
449
int filemap_flush(struct address_space *mapping)
450
{
451
return filemap_flush_range(mapping, 0, LLONG_MAX);
452
}
453
EXPORT_SYMBOL(filemap_flush);
454
455
/*
456
* Start writeback on @nr_to_write pages from @mapping. No one but the existing
457
* btrfs caller should be using this. Talk to linux-mm if you think adding a
458
* new caller is a good idea.
459
*/
460
int filemap_flush_nr(struct address_space *mapping, long *nr_to_write)
461
{
462
return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE,
463
nr_to_write);
464
}
465
EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs");
466
467
/**
468
* filemap_range_has_page - check if a page exists in range.
469
* @mapping: address space within which to check
470
* @start_byte: offset in bytes where the range starts
471
* @end_byte: offset in bytes where the range ends (inclusive)
472
*
473
* Find at least one page in the range supplied, usually used to check if
474
* direct writing in this range will trigger a writeback.
475
*
476
* Return: %true if at least one page exists in the specified range,
477
* %false otherwise.
478
*/
479
bool filemap_range_has_page(struct address_space *mapping,
480
loff_t start_byte, loff_t end_byte)
481
{
482
struct folio *folio;
483
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
484
pgoff_t max = end_byte >> PAGE_SHIFT;
485
486
if (end_byte < start_byte)
487
return false;
488
489
rcu_read_lock();
490
for (;;) {
491
folio = xas_find(&xas, max);
492
if (xas_retry(&xas, folio))
493
continue;
494
/* Shadow entries don't count */
495
if (xa_is_value(folio))
496
continue;
497
/*
498
* We don't need to try to pin this page; we're about to
499
* release the RCU lock anyway. It is enough to know that
500
* there was a page here recently.
501
*/
502
break;
503
}
504
rcu_read_unlock();
505
506
return folio != NULL;
507
}
508
EXPORT_SYMBOL(filemap_range_has_page);
509
510
static void __filemap_fdatawait_range(struct address_space *mapping,
511
loff_t start_byte, loff_t end_byte)
512
{
513
pgoff_t index = start_byte >> PAGE_SHIFT;
514
pgoff_t end = end_byte >> PAGE_SHIFT;
515
struct folio_batch fbatch;
516
unsigned nr_folios;
517
518
folio_batch_init(&fbatch);
519
520
while (index <= end) {
521
unsigned i;
522
523
nr_folios = filemap_get_folios_tag(mapping, &index, end,
524
PAGECACHE_TAG_WRITEBACK, &fbatch);
525
526
if (!nr_folios)
527
break;
528
529
for (i = 0; i < nr_folios; i++) {
530
struct folio *folio = fbatch.folios[i];
531
532
folio_wait_writeback(folio);
533
}
534
folio_batch_release(&fbatch);
535
cond_resched();
536
}
537
}
538
539
/**
540
* filemap_fdatawait_range - wait for writeback to complete
541
* @mapping: address space structure to wait for
542
* @start_byte: offset in bytes where the range starts
543
* @end_byte: offset in bytes where the range ends (inclusive)
544
*
545
* Walk the list of under-writeback pages of the given address space
546
* in the given range and wait for all of them. Check error status of
547
* the address space and return it.
548
*
549
* Since the error status of the address space is cleared by this function,
550
* callers are responsible for checking the return value and handling and/or
551
* reporting the error.
552
*
553
* Return: error status of the address space.
554
*/
555
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
556
loff_t end_byte)
557
{
558
__filemap_fdatawait_range(mapping, start_byte, end_byte);
559
return filemap_check_errors(mapping);
560
}
561
EXPORT_SYMBOL(filemap_fdatawait_range);
562
563
/**
564
* filemap_fdatawait_range_keep_errors - wait for writeback to complete
565
* @mapping: address space structure to wait for
566
* @start_byte: offset in bytes where the range starts
567
* @end_byte: offset in bytes where the range ends (inclusive)
568
*
569
* Walk the list of under-writeback pages of the given address space in the
570
* given range and wait for all of them. Unlike filemap_fdatawait_range(),
571
* this function does not clear error status of the address space.
572
*
573
* Use this function if callers don't handle errors themselves. Expected
574
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
575
* fsfreeze(8)
576
*/
577
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
578
loff_t start_byte, loff_t end_byte)
579
{
580
__filemap_fdatawait_range(mapping, start_byte, end_byte);
581
return filemap_check_and_keep_errors(mapping);
582
}
583
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
584
585
/**
586
* file_fdatawait_range - wait for writeback to complete
587
* @file: file pointing to address space structure to wait for
588
* @start_byte: offset in bytes where the range starts
589
* @end_byte: offset in bytes where the range ends (inclusive)
590
*
591
* Walk the list of under-writeback pages of the address space that file
592
* refers to, in the given range and wait for all of them. Check error
593
* status of the address space vs. the file->f_wb_err cursor and return it.
594
*
595
* Since the error status of the file is advanced by this function,
596
* callers are responsible for checking the return value and handling and/or
597
* reporting the error.
598
*
599
* Return: error status of the address space vs. the file->f_wb_err cursor.
600
*/
601
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
602
{
603
struct address_space *mapping = file->f_mapping;
604
605
__filemap_fdatawait_range(mapping, start_byte, end_byte);
606
return file_check_and_advance_wb_err(file);
607
}
608
EXPORT_SYMBOL(file_fdatawait_range);
609
610
/**
611
* filemap_fdatawait_keep_errors - wait for writeback without clearing errors
612
* @mapping: address space structure to wait for
613
*
614
* Walk the list of under-writeback pages of the given address space
615
* and wait for all of them. Unlike filemap_fdatawait(), this function
616
* does not clear error status of the address space.
617
*
618
* Use this function if callers don't handle errors themselves. Expected
619
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
620
* fsfreeze(8)
621
*
622
* Return: error status of the address space.
623
*/
624
int filemap_fdatawait_keep_errors(struct address_space *mapping)
625
{
626
__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
627
return filemap_check_and_keep_errors(mapping);
628
}
629
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
630
631
/* Returns true if writeback might be needed or already in progress. */
632
static bool mapping_needs_writeback(struct address_space *mapping)
633
{
634
return mapping->nrpages;
635
}
636
637
bool filemap_range_has_writeback(struct address_space *mapping,
638
loff_t start_byte, loff_t end_byte)
639
{
640
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
641
pgoff_t max = end_byte >> PAGE_SHIFT;
642
struct folio *folio;
643
644
if (end_byte < start_byte)
645
return false;
646
647
rcu_read_lock();
648
xas_for_each(&xas, folio, max) {
649
if (xas_retry(&xas, folio))
650
continue;
651
if (xa_is_value(folio))
652
continue;
653
if (folio_test_dirty(folio) || folio_test_locked(folio) ||
654
folio_test_writeback(folio))
655
break;
656
}
657
rcu_read_unlock();
658
return folio != NULL;
659
}
660
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
661
662
/**
663
* filemap_write_and_wait_range - write out & wait on a file range
664
* @mapping: the address_space for the pages
665
* @lstart: offset in bytes where the range starts
666
* @lend: offset in bytes where the range ends (inclusive)
667
*
668
* Write out and wait upon file offsets lstart->lend, inclusive.
669
*
670
* Note that @lend is inclusive (describes the last byte to be written) so
671
* that this function can be used to write to the very end-of-file (end = -1).
672
*
673
* Return: error status of the address space.
674
*/
675
int filemap_write_and_wait_range(struct address_space *mapping,
676
loff_t lstart, loff_t lend)
677
{
678
int err = 0, err2;
679
680
if (lend < lstart)
681
return 0;
682
683
if (mapping_needs_writeback(mapping)) {
684
err = filemap_fdatawrite_range(mapping, lstart, lend);
685
/*
686
* Even if the above returned error, the pages may be
687
* written partially (e.g. -ENOSPC), so we wait for it.
688
* But the -EIO is special case, it may indicate the worst
689
* thing (e.g. bug) happened, so we avoid waiting for it.
690
*/
691
if (err != -EIO)
692
__filemap_fdatawait_range(mapping, lstart, lend);
693
}
694
err2 = filemap_check_errors(mapping);
695
if (!err)
696
err = err2;
697
return err;
698
}
699
EXPORT_SYMBOL(filemap_write_and_wait_range);
700
701
void __filemap_set_wb_err(struct address_space *mapping, int err)
702
{
703
errseq_t eseq = errseq_set(&mapping->wb_err, err);
704
705
trace_filemap_set_wb_err(mapping, eseq);
706
}
707
EXPORT_SYMBOL(__filemap_set_wb_err);
708
709
/**
710
* file_check_and_advance_wb_err - report wb error (if any) that was previously
711
* and advance wb_err to current one
712
* @file: struct file on which the error is being reported
713
*
714
* When userland calls fsync (or something like nfsd does the equivalent), we
715
* want to report any writeback errors that occurred since the last fsync (or
716
* since the file was opened if there haven't been any).
717
*
718
* Grab the wb_err from the mapping. If it matches what we have in the file,
719
* then just quickly return 0. The file is all caught up.
720
*
721
* If it doesn't match, then take the mapping value, set the "seen" flag in
722
* it and try to swap it into place. If it works, or another task beat us
723
* to it with the new value, then update the f_wb_err and return the error
724
* portion. The error at this point must be reported via proper channels
725
* (a'la fsync, or NFS COMMIT operation, etc.).
726
*
727
* While we handle mapping->wb_err with atomic operations, the f_wb_err
728
* value is protected by the f_lock since we must ensure that it reflects
729
* the latest value swapped in for this file descriptor.
730
*
731
* Return: %0 on success, negative error code otherwise.
732
*/
733
int file_check_and_advance_wb_err(struct file *file)
734
{
735
int err = 0;
736
errseq_t old = READ_ONCE(file->f_wb_err);
737
struct address_space *mapping = file->f_mapping;
738
739
/* Locklessly handle the common case where nothing has changed */
740
if (errseq_check(&mapping->wb_err, old)) {
741
/* Something changed, must use slow path */
742
spin_lock(&file->f_lock);
743
old = file->f_wb_err;
744
err = errseq_check_and_advance(&mapping->wb_err,
745
&file->f_wb_err);
746
trace_file_check_and_advance_wb_err(file, old);
747
spin_unlock(&file->f_lock);
748
}
749
750
/*
751
* We're mostly using this function as a drop in replacement for
752
* filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
753
* that the legacy code would have had on these flags.
754
*/
755
clear_bit(AS_EIO, &mapping->flags);
756
clear_bit(AS_ENOSPC, &mapping->flags);
757
return err;
758
}
759
EXPORT_SYMBOL(file_check_and_advance_wb_err);
760
761
/**
762
* file_write_and_wait_range - write out & wait on a file range
763
* @file: file pointing to address_space with pages
764
* @lstart: offset in bytes where the range starts
765
* @lend: offset in bytes where the range ends (inclusive)
766
*
767
* Write out and wait upon file offsets lstart->lend, inclusive.
768
*
769
* Note that @lend is inclusive (describes the last byte to be written) so
770
* that this function can be used to write to the very end-of-file (end = -1).
771
*
772
* After writing out and waiting on the data, we check and advance the
773
* f_wb_err cursor to the latest value, and return any errors detected there.
774
*
775
* Return: %0 on success, negative error code otherwise.
776
*/
777
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
778
{
779
int err = 0, err2;
780
struct address_space *mapping = file->f_mapping;
781
782
if (lend < lstart)
783
return 0;
784
785
if (mapping_needs_writeback(mapping)) {
786
err = filemap_fdatawrite_range(mapping, lstart, lend);
787
/* See comment of filemap_write_and_wait() */
788
if (err != -EIO)
789
__filemap_fdatawait_range(mapping, lstart, lend);
790
}
791
err2 = file_check_and_advance_wb_err(file);
792
if (!err)
793
err = err2;
794
return err;
795
}
796
EXPORT_SYMBOL(file_write_and_wait_range);
797
798
/**
799
* replace_page_cache_folio - replace a pagecache folio with a new one
800
* @old: folio to be replaced
801
* @new: folio to replace with
802
*
803
* This function replaces a folio in the pagecache with a new one. On
804
* success it acquires the pagecache reference for the new folio and
805
* drops it for the old folio. Both the old and new folios must be
806
* locked. This function does not add the new folio to the LRU, the
807
* caller must do that.
808
*
809
* The remove + add is atomic. This function cannot fail.
810
*/
811
void replace_page_cache_folio(struct folio *old, struct folio *new)
812
{
813
struct address_space *mapping = old->mapping;
814
void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
815
pgoff_t offset = old->index;
816
XA_STATE(xas, &mapping->i_pages, offset);
817
818
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
819
VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
820
VM_BUG_ON_FOLIO(new->mapping, new);
821
822
folio_get(new);
823
new->mapping = mapping;
824
new->index = offset;
825
826
mem_cgroup_replace_folio(old, new);
827
828
xas_lock_irq(&xas);
829
xas_store(&xas, new);
830
831
old->mapping = NULL;
832
/* hugetlb pages do not participate in page cache accounting. */
833
if (!folio_test_hugetlb(old))
834
lruvec_stat_sub_folio(old, NR_FILE_PAGES);
835
if (!folio_test_hugetlb(new))
836
lruvec_stat_add_folio(new, NR_FILE_PAGES);
837
if (folio_test_swapbacked(old))
838
lruvec_stat_sub_folio(old, NR_SHMEM);
839
if (folio_test_swapbacked(new))
840
lruvec_stat_add_folio(new, NR_SHMEM);
841
xas_unlock_irq(&xas);
842
if (free_folio)
843
free_folio(old);
844
folio_put(old);
845
}
846
EXPORT_SYMBOL_GPL(replace_page_cache_folio);
847
848
noinline int __filemap_add_folio(struct address_space *mapping,
849
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
850
{
851
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
852
bool huge;
853
long nr;
854
unsigned int forder = folio_order(folio);
855
856
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
857
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
858
VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
859
folio);
860
mapping_set_update(&xas, mapping);
861
862
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
863
huge = folio_test_hugetlb(folio);
864
nr = folio_nr_pages(folio);
865
866
gfp &= GFP_RECLAIM_MASK;
867
folio_ref_add(folio, nr);
868
folio->mapping = mapping;
869
folio->index = xas.xa_index;
870
871
for (;;) {
872
int order = -1;
873
void *entry, *old = NULL;
874
875
xas_lock_irq(&xas);
876
xas_for_each_conflict(&xas, entry) {
877
old = entry;
878
if (!xa_is_value(entry)) {
879
xas_set_err(&xas, -EEXIST);
880
goto unlock;
881
}
882
/*
883
* If a larger entry exists,
884
* it will be the first and only entry iterated.
885
*/
886
if (order == -1)
887
order = xas_get_order(&xas);
888
}
889
890
if (old) {
891
if (order > 0 && order > forder) {
892
unsigned int split_order = max(forder,
893
xas_try_split_min_order(order));
894
895
/* How to handle large swap entries? */
896
BUG_ON(shmem_mapping(mapping));
897
898
while (order > forder) {
899
xas_set_order(&xas, index, split_order);
900
xas_try_split(&xas, old, order);
901
if (xas_error(&xas))
902
goto unlock;
903
order = split_order;
904
split_order =
905
max(xas_try_split_min_order(
906
split_order),
907
forder);
908
}
909
xas_reset(&xas);
910
}
911
if (shadowp)
912
*shadowp = old;
913
}
914
915
xas_store(&xas, folio);
916
if (xas_error(&xas))
917
goto unlock;
918
919
mapping->nrpages += nr;
920
921
/* hugetlb pages do not participate in page cache accounting */
922
if (!huge) {
923
lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
924
if (folio_test_pmd_mappable(folio))
925
lruvec_stat_mod_folio(folio,
926
NR_FILE_THPS, nr);
927
}
928
929
unlock:
930
xas_unlock_irq(&xas);
931
932
if (!xas_nomem(&xas, gfp))
933
break;
934
}
935
936
if (xas_error(&xas))
937
goto error;
938
939
trace_mm_filemap_add_to_page_cache(folio);
940
return 0;
941
error:
942
folio->mapping = NULL;
943
/* Leave folio->index set: truncation relies upon it */
944
folio_put_refs(folio, nr);
945
return xas_error(&xas);
946
}
947
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
948
949
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
950
pgoff_t index, gfp_t gfp)
951
{
952
void *shadow = NULL;
953
int ret;
954
struct mem_cgroup *tmp;
955
bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);
956
957
if (kernel_file)
958
tmp = set_active_memcg(root_mem_cgroup);
959
ret = mem_cgroup_charge(folio, NULL, gfp);
960
if (kernel_file)
961
set_active_memcg(tmp);
962
if (ret)
963
return ret;
964
965
__folio_set_locked(folio);
966
ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
967
if (unlikely(ret)) {
968
mem_cgroup_uncharge(folio);
969
__folio_clear_locked(folio);
970
} else {
971
/*
972
* The folio might have been evicted from cache only
973
* recently, in which case it should be activated like
974
* any other repeatedly accessed folio.
975
* The exception is folios getting rewritten; evicting other
976
* data from the working set, only to cache data that will
977
* get overwritten with something else, is a waste of memory.
978
*/
979
WARN_ON_ONCE(folio_test_active(folio));
980
if (!(gfp & __GFP_WRITE) && shadow)
981
workingset_refault(folio, shadow);
982
folio_add_lru(folio);
983
if (kernel_file)
984
mod_node_page_state(folio_pgdat(folio),
985
NR_KERNEL_FILE_PAGES,
986
folio_nr_pages(folio));
987
}
988
return ret;
989
}
990
EXPORT_SYMBOL_GPL(filemap_add_folio);
991
992
#ifdef CONFIG_NUMA
993
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
994
struct mempolicy *policy)
995
{
996
int n;
997
struct folio *folio;
998
999
if (policy)
1000
return folio_alloc_mpol_noprof(gfp, order, policy,
1001
NO_INTERLEAVE_INDEX, numa_node_id());
1002
1003
if (cpuset_do_page_mem_spread()) {
1004
unsigned int cpuset_mems_cookie;
1005
do {
1006
cpuset_mems_cookie = read_mems_allowed_begin();
1007
n = cpuset_mem_spread_node();
1008
folio = __folio_alloc_node_noprof(gfp, order, n);
1009
} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
1010
1011
return folio;
1012
}
1013
return folio_alloc_noprof(gfp, order);
1014
}
1015
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
1016
#endif
1017
1018
/*
1019
* filemap_invalidate_lock_two - lock invalidate_lock for two mappings
1020
*
1021
* Lock exclusively invalidate_lock of any passed mapping that is not NULL.
1022
*
1023
* @mapping1: the first mapping to lock
1024
* @mapping2: the second mapping to lock
1025
*/
1026
void filemap_invalidate_lock_two(struct address_space *mapping1,
1027
struct address_space *mapping2)
1028
{
1029
if (mapping1 > mapping2)
1030
swap(mapping1, mapping2);
1031
if (mapping1)
1032
down_write(&mapping1->invalidate_lock);
1033
if (mapping2 && mapping1 != mapping2)
1034
down_write_nested(&mapping2->invalidate_lock, 1);
1035
}
1036
EXPORT_SYMBOL(filemap_invalidate_lock_two);
1037
1038
/*
1039
* filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
1040
*
1041
* Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
1042
*
1043
* @mapping1: the first mapping to unlock
1044
* @mapping2: the second mapping to unlock
1045
*/
1046
void filemap_invalidate_unlock_two(struct address_space *mapping1,
1047
struct address_space *mapping2)
1048
{
1049
if (mapping1)
1050
up_write(&mapping1->invalidate_lock);
1051
if (mapping2 && mapping1 != mapping2)
1052
up_write(&mapping2->invalidate_lock);
1053
}
1054
EXPORT_SYMBOL(filemap_invalidate_unlock_two);
1055
1056
/*
1057
* In order to wait for pages to become available there must be
1058
* waitqueues associated with pages. By using a hash table of
1059
* waitqueues where the bucket discipline is to maintain all
1060
* waiters on the same queue and wake all when any of the pages
1061
* become available, and for the woken contexts to check to be
1062
* sure the appropriate page became available, this saves space
1063
* at a cost of "thundering herd" phenomena during rare hash
1064
* collisions.
1065
*/
1066
#define PAGE_WAIT_TABLE_BITS 8
1067
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
1068
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
1069
1070
static wait_queue_head_t *folio_waitqueue(struct folio *folio)
1071
{
1072
return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
1073
}
1074
1075
/* How many times do we accept lock stealing from under a waiter? */
1076
static int sysctl_page_lock_unfairness = 5;
1077
static const struct ctl_table filemap_sysctl_table[] = {
1078
{
1079
.procname = "page_lock_unfairness",
1080
.data = &sysctl_page_lock_unfairness,
1081
.maxlen = sizeof(sysctl_page_lock_unfairness),
1082
.mode = 0644,
1083
.proc_handler = proc_dointvec_minmax,
1084
.extra1 = SYSCTL_ZERO,
1085
}
1086
};
1087
1088
void __init pagecache_init(void)
1089
{
1090
int i;
1091
1092
for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
1093
init_waitqueue_head(&folio_wait_table[i]);
1094
1095
page_writeback_init();
1096
register_sysctl_init("vm", filemap_sysctl_table);
1097
}
1098
1099
/*
1100
* The page wait code treats the "wait->flags" somewhat unusually, because
1101
* we have multiple different kinds of waits, not just the usual "exclusive"
1102
* one.
1103
*
1104
* We have:
1105
*
1106
* (a) no special bits set:
1107
*
1108
* We're just waiting for the bit to be released, and when a waker
1109
* calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
1110
* and remove it from the wait queue.
1111
*
1112
* Simple and straightforward.
1113
*
1114
* (b) WQ_FLAG_EXCLUSIVE:
1115
*
1116
* The waiter is waiting to get the lock, and only one waiter should
1117
* be woken up to avoid any thundering herd behavior. We'll set the
1118
* WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
1119
*
1120
* This is the traditional exclusive wait.
1121
*
1122
* (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
1123
*
1124
* The waiter is waiting to get the bit, and additionally wants the
1125
* lock to be transferred to it for fair lock behavior. If the lock
1126
* cannot be taken, we stop walking the wait queue without waking
1127
* the waiter.
1128
*
1129
* This is the "fair lock handoff" case, and in addition to setting
1130
* WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
1131
* that it now has the lock.
1132
*/
1133
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
1134
{
1135
unsigned int flags;
1136
struct wait_page_key *key = arg;
1137
struct wait_page_queue *wait_page
1138
= container_of(wait, struct wait_page_queue, wait);
1139
1140
if (!wake_page_match(wait_page, key))
1141
return 0;
1142
1143
/*
1144
* If it's a lock handoff wait, we get the bit for it, and
1145
* stop walking (and do not wake it up) if we can't.
1146
*/
1147
flags = wait->flags;
1148
if (flags & WQ_FLAG_EXCLUSIVE) {
1149
if (test_bit(key->bit_nr, &key->folio->flags.f))
1150
return -1;
1151
if (flags & WQ_FLAG_CUSTOM) {
1152
if (test_and_set_bit(key->bit_nr, &key->folio->flags.f))
1153
return -1;
1154
flags |= WQ_FLAG_DONE;
1155
}
1156
}
1157
1158
/*
1159
* We are holding the wait-queue lock, but the waiter that
1160
* is waiting for this will be checking the flags without
1161
* any locking.
1162
*
1163
* So update the flags atomically, and wake up the waiter
1164
* afterwards to avoid any races. This store-release pairs
1165
* with the load-acquire in folio_wait_bit_common().
1166
*/
1167
smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
1168
wake_up_state(wait->private, mode);
1169
1170
/*
1171
* Ok, we have successfully done what we're waiting for,
1172
* and we can unconditionally remove the wait entry.
1173
*
1174
* Note that this pairs with the "finish_wait()" in the
1175
* waiter, and has to be the absolute last thing we do.
1176
* After this list_del_init(&wait->entry) the wait entry
1177
* might be de-allocated and the process might even have
1178
* exited.
1179
*/
1180
list_del_init_careful(&wait->entry);
1181
return (flags & WQ_FLAG_EXCLUSIVE) != 0;
1182
}
1183
1184
static void folio_wake_bit(struct folio *folio, int bit_nr)
1185
{
1186
wait_queue_head_t *q = folio_waitqueue(folio);
1187
struct wait_page_key key;
1188
unsigned long flags;
1189
1190
key.folio = folio;
1191
key.bit_nr = bit_nr;
1192
key.page_match = 0;
1193
1194
spin_lock_irqsave(&q->lock, flags);
1195
__wake_up_locked_key(q, TASK_NORMAL, &key);
1196
1197
/*
1198
* It's possible to miss clearing waiters here, when we woke our page
1199
* waiters, but the hashed waitqueue has waiters for other pages on it.
1200
* That's okay, it's a rare case. The next waker will clear it.
1201
*
1202
* Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
1203
* other), the flag may be cleared in the course of freeing the page;
1204
* but that is not required for correctness.
1205
*/
1206
if (!waitqueue_active(q) || !key.page_match)
1207
folio_clear_waiters(folio);
1208
1209
spin_unlock_irqrestore(&q->lock, flags);
1210
}
1211
1212
/*
1213
* A choice of three behaviors for folio_wait_bit_common():
1214
*/
1215
enum behavior {
1216
EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
1217
* __folio_lock() waiting on then setting PG_locked.
1218
*/
1219
SHARED, /* Hold ref to page and check the bit when woken, like
1220
* folio_wait_writeback() waiting on PG_writeback.
1221
*/
1222
DROP, /* Drop ref to page before wait, no check when woken,
1223
* like folio_put_wait_locked() on PG_locked.
1224
*/
1225
};
1226
1227
/*
1228
* Attempt to check (or get) the folio flag, and mark us done
1229
* if successful.
1230
*/
1231
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
1232
struct wait_queue_entry *wait)
1233
{
1234
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
1235
if (test_and_set_bit(bit_nr, &folio->flags.f))
1236
return false;
1237
} else if (test_bit(bit_nr, &folio->flags.f))
1238
return false;
1239
1240
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
1241
return true;
1242
}
1243
1244
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
1245
int state, enum behavior behavior)
1246
{
1247
wait_queue_head_t *q = folio_waitqueue(folio);
1248
int unfairness = sysctl_page_lock_unfairness;
1249
struct wait_page_queue wait_page;
1250
wait_queue_entry_t *wait = &wait_page.wait;
1251
bool thrashing = false;
1252
unsigned long pflags;
1253
bool in_thrashing;
1254
1255
if (bit_nr == PG_locked &&
1256
!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
1257
delayacct_thrashing_start(&in_thrashing);
1258
psi_memstall_enter(&pflags);
1259
thrashing = true;
1260
}
1261
1262
init_wait(wait);
1263
wait->func = wake_page_function;
1264
wait_page.folio = folio;
1265
wait_page.bit_nr = bit_nr;
1266
1267
repeat:
1268
wait->flags = 0;
1269
if (behavior == EXCLUSIVE) {
1270
wait->flags = WQ_FLAG_EXCLUSIVE;
1271
if (--unfairness < 0)
1272
wait->flags |= WQ_FLAG_CUSTOM;
1273
}
1274
1275
/*
1276
* Do one last check whether we can get the
1277
* page bit synchronously.
1278
*
1279
* Do the folio_set_waiters() marking before that
1280
* to let any waker we _just_ missed know they
1281
* need to wake us up (otherwise they'll never
1282
* even go to the slow case that looks at the
1283
* page queue), and add ourselves to the wait
1284
* queue if we need to sleep.
1285
*
1286
* This part needs to be done under the queue
1287
* lock to avoid races.
1288
*/
1289
spin_lock_irq(&q->lock);
1290
folio_set_waiters(folio);
1291
if (!folio_trylock_flag(folio, bit_nr, wait))
1292
__add_wait_queue_entry_tail(q, wait);
1293
spin_unlock_irq(&q->lock);
1294
1295
/*
1296
* From now on, all the logic will be based on
1297
* the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
1298
* see whether the page bit testing has already
1299
* been done by the wake function.
1300
*
1301
* We can drop our reference to the folio.
1302
*/
1303
if (behavior == DROP)
1304
folio_put(folio);
1305
1306
/*
1307
* Note that until the "finish_wait()", or until
1308
* we see the WQ_FLAG_WOKEN flag, we need to
1309
* be very careful with the 'wait->flags', because
1310
* we may race with a waker that sets them.
1311
*/
1312
for (;;) {
1313
unsigned int flags;
1314
1315
set_current_state(state);
1316
1317
/* Loop until we've been woken or interrupted */
1318
flags = smp_load_acquire(&wait->flags);
1319
if (!(flags & WQ_FLAG_WOKEN)) {
1320
if (signal_pending_state(state, current))
1321
break;
1322
1323
io_schedule();
1324
continue;
1325
}
1326
1327
/* If we were non-exclusive, we're done */
1328
if (behavior != EXCLUSIVE)
1329
break;
1330
1331
/* If the waker got the lock for us, we're done */
1332
if (flags & WQ_FLAG_DONE)
1333
break;
1334
1335
/*
1336
* Otherwise, if we're getting the lock, we need to
1337
* try to get it ourselves.
1338
*
1339
* And if that fails, we'll have to retry this all.
1340
*/
1341
if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
1342
goto repeat;
1343
1344
wait->flags |= WQ_FLAG_DONE;
1345
break;
1346
}
1347
1348
/*
1349
* If a signal happened, this 'finish_wait()' may remove the last
1350
* waiter from the wait-queues, but the folio waiters bit will remain
1351
* set. That's ok. The next wakeup will take care of it, and trying
1352
* to do it here would be difficult and prone to races.
1353
*/
1354
finish_wait(q, wait);
1355
1356
if (thrashing) {
1357
delayacct_thrashing_end(&in_thrashing);
1358
psi_memstall_leave(&pflags);
1359
}
1360
1361
/*
1362
* NOTE! The wait->flags weren't stable until we've done the
1363
* 'finish_wait()', and we could have exited the loop above due
1364
* to a signal, and had a wakeup event happen after the signal
1365
* test but before the 'finish_wait()'.
1366
*
1367
* So only after the finish_wait() can we reliably determine
1368
* if we got woken up or not, so we can now figure out the final
1369
* return value based on that state without races.
1370
*
1371
* Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
1372
* waiter, but an exclusive one requires WQ_FLAG_DONE.
1373
*/
1374
if (behavior == EXCLUSIVE)
1375
return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
1376
1377
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
1378
}
1379
1380
#ifdef CONFIG_MIGRATION
1381
/**
1382
* migration_entry_wait_on_locked - Wait for a migration entry to be removed
1383
* @entry: migration swap entry.
1384
* @ptl: already locked ptl. This function will drop the lock.
1385
*
1386
* Wait for a migration entry referencing the given page to be removed. This is
1387
* equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
1388
* this can be called without taking a reference on the page. Instead this
1389
* should be called while holding the ptl for the migration entry referencing
1390
* the page.
1391
*
1392
* Returns after unlocking the ptl.
1393
*
1394
* This follows the same logic as folio_wait_bit_common() so see the comments
1395
* there.
1396
*/
1397
void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
1398
__releases(ptl)
1399
{
1400
struct wait_page_queue wait_page;
1401
wait_queue_entry_t *wait = &wait_page.wait;
1402
bool thrashing = false;
1403
unsigned long pflags;
1404
bool in_thrashing;
1405
wait_queue_head_t *q;
1406
struct folio *folio = softleaf_to_folio(entry);
1407
1408
q = folio_waitqueue(folio);
1409
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
1410
delayacct_thrashing_start(&in_thrashing);
1411
psi_memstall_enter(&pflags);
1412
thrashing = true;
1413
}
1414
1415
init_wait(wait);
1416
wait->func = wake_page_function;
1417
wait_page.folio = folio;
1418
wait_page.bit_nr = PG_locked;
1419
wait->flags = 0;
1420
1421
spin_lock_irq(&q->lock);
1422
folio_set_waiters(folio);
1423
if (!folio_trylock_flag(folio, PG_locked, wait))
1424
__add_wait_queue_entry_tail(q, wait);
1425
spin_unlock_irq(&q->lock);
1426
1427
/*
1428
* If a migration entry exists for the page the migration path must hold
1429
* a valid reference to the page, and it must take the ptl to remove the
1430
* migration entry. So the page is valid until the ptl is dropped.
1431
*/
1432
spin_unlock(ptl);
1433
1434
for (;;) {
1435
unsigned int flags;
1436
1437
set_current_state(TASK_UNINTERRUPTIBLE);
1438
1439
/* Loop until we've been woken or interrupted */
1440
flags = smp_load_acquire(&wait->flags);
1441
if (!(flags & WQ_FLAG_WOKEN)) {
1442
if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
1443
break;
1444
1445
io_schedule();
1446
continue;
1447
}
1448
break;
1449
}
1450
1451
finish_wait(q, wait);
1452
1453
if (thrashing) {
1454
delayacct_thrashing_end(&in_thrashing);
1455
psi_memstall_leave(&pflags);
1456
}
1457
}
1458
#endif
1459
1460
void folio_wait_bit(struct folio *folio, int bit_nr)
1461
{
1462
folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
1463
}
1464
EXPORT_SYMBOL(folio_wait_bit);
1465
1466
int folio_wait_bit_killable(struct folio *folio, int bit_nr)
1467
{
1468
return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
1469
}
1470
EXPORT_SYMBOL(folio_wait_bit_killable);
1471
1472
/**
1473
* folio_put_wait_locked - Drop a reference and wait for it to be unlocked
1474
* @folio: The folio to wait for.
1475
* @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
1476
*
1477
* The caller should hold a reference on @folio. They expect the page to
1478
* become unlocked relatively soon, but do not wish to hold up migration
1479
* (for example) by holding the reference while waiting for the folio to
1480
* come unlocked. After this function returns, the caller should not
1481
* dereference @folio.
1482
*
1483
* Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
1484
*/
1485
static int folio_put_wait_locked(struct folio *folio, int state)
1486
{
1487
return folio_wait_bit_common(folio, PG_locked, state, DROP);
1488
}
1489
1490
/**
1491
* folio_unlock - Unlock a locked folio.
1492
* @folio: The folio.
1493
*
1494
* Unlocks the folio and wakes up any thread sleeping on the page lock.
1495
*
1496
* Context: May be called from interrupt or process context. May not be
1497
* called from NMI context.
1498
*/
1499
void folio_unlock(struct folio *folio)
1500
{
1501
/* Bit 7 allows x86 to check the byte's sign bit */
1502
BUILD_BUG_ON(PG_waiters != 7);
1503
BUILD_BUG_ON(PG_locked > 7);
1504
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1505
if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
1506
folio_wake_bit(folio, PG_locked);
1507
}
1508
EXPORT_SYMBOL(folio_unlock);
1509
1510
/**
1511
* folio_end_read - End read on a folio.
1512
* @folio: The folio.
1513
* @success: True if all reads completed successfully.
1514
*
1515
* When all reads against a folio have completed, filesystems should
1516
* call this function to let the pagecache know that no more reads
1517
* are outstanding. This will unlock the folio and wake up any thread
1518
* sleeping on the lock. The folio will also be marked uptodate if all
1519
* reads succeeded.
1520
*
1521
* Context: May be called from interrupt or process context. May not be
1522
* called from NMI context.
1523
*/
1524
void folio_end_read(struct folio *folio, bool success)
1525
{
1526
unsigned long mask = 1 << PG_locked;
1527
1528
/* Must be in bottom byte for x86 to work */
1529
BUILD_BUG_ON(PG_uptodate > 7);
1530
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1531
VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);
1532
1533
if (likely(success))
1534
mask |= 1 << PG_uptodate;
1535
if (folio_xor_flags_has_waiters(folio, mask))
1536
folio_wake_bit(folio, PG_locked);
1537
}
1538
EXPORT_SYMBOL(folio_end_read);
1539
1540
/**
1541
* folio_end_private_2 - Clear PG_private_2 and wake any waiters.
1542
* @folio: The folio.
1543
*
1544
* Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
1545
* it. The folio reference held for PG_private_2 being set is released.
1546
*
1547
* This is, for example, used when a netfs folio is being written to a local
1548
* disk cache, thereby allowing writes to the cache for the same folio to be
1549
* serialised.
1550
*/
1551
void folio_end_private_2(struct folio *folio)
1552
{
1553
VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
1554
clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
1555
folio_wake_bit(folio, PG_private_2);
1556
folio_put(folio);
1557
}
1558
EXPORT_SYMBOL(folio_end_private_2);
1559
1560
/**
1561
* folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
1562
* @folio: The folio to wait on.
1563
*
1564
* Wait for PG_private_2 to be cleared on a folio.
1565
*/
1566
void folio_wait_private_2(struct folio *folio)
1567
{
1568
while (folio_test_private_2(folio))
1569
folio_wait_bit(folio, PG_private_2);
1570
}
1571
EXPORT_SYMBOL(folio_wait_private_2);
1572
1573
/**
1574
* folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
1575
* @folio: The folio to wait on.
1576
*
1577
* Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
1578
* received by the calling task.
1579
*
1580
* Return:
1581
* - 0 if successful.
1582
* - -EINTR if a fatal signal was encountered.
1583
*/
1584
int folio_wait_private_2_killable(struct folio *folio)
1585
{
1586
int ret = 0;
1587
1588
while (folio_test_private_2(folio)) {
1589
ret = folio_wait_bit_killable(folio, PG_private_2);
1590
if (ret < 0)
1591
break;
1592
}
1593
1594
return ret;
1595
}
1596
EXPORT_SYMBOL(folio_wait_private_2_killable);
1597
1598
static void filemap_end_dropbehind(struct folio *folio)
1599
{
1600
struct address_space *mapping = folio->mapping;
1601
1602
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1603
1604
if (folio_test_writeback(folio) || folio_test_dirty(folio))
1605
return;
1606
if (!folio_test_clear_dropbehind(folio))
1607
return;
1608
if (mapping)
1609
folio_unmap_invalidate(mapping, folio, 0);
1610
}
1611
1612
/*
1613
* If folio was marked as dropbehind, then pages should be dropped when writeback
1614
* completes. Do that now. If we fail, it's likely because of a big folio -
1615
* just reset dropbehind for that case and latter completions should invalidate.
1616
*/
1617
void folio_end_dropbehind(struct folio *folio)
1618
{
1619
if (!folio_test_dropbehind(folio))
1620
return;
1621
1622
/*
1623
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
1624
* but can happen if normal writeback just happens to find dirty folios
1625
* that were created as part of uncached writeback, and that writeback
1626
* would otherwise not need non-IRQ handling. Just skip the
1627
* invalidation in that case.
1628
*/
1629
if (in_task() && folio_trylock(folio)) {
1630
filemap_end_dropbehind(folio);
1631
folio_unlock(folio);
1632
}
1633
}
1634
EXPORT_SYMBOL_GPL(folio_end_dropbehind);
1635
1636
/**
1637
* folio_end_writeback_no_dropbehind - End writeback against a folio.
1638
* @folio: The folio.
1639
*
1640
* The folio must actually be under writeback.
1641
* This call is intended for filesystems that need to defer dropbehind.
1642
*
1643
* Context: May be called from process or interrupt context.
1644
*/
1645
void folio_end_writeback_no_dropbehind(struct folio *folio)
1646
{
1647
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
1648
1649
/*
1650
* folio_test_clear_reclaim() could be used here but it is an
1651
* atomic operation and overkill in this particular case. Failing
1652
* to shuffle a folio marked for immediate reclaim is too mild
1653
* a gain to justify taking an atomic operation penalty at the
1654
* end of every folio writeback.
1655
*/
1656
if (folio_test_reclaim(folio)) {
1657
folio_clear_reclaim(folio);
1658
folio_rotate_reclaimable(folio);
1659
}
1660
1661
if (__folio_end_writeback(folio))
1662
folio_wake_bit(folio, PG_writeback);
1663
1664
acct_reclaim_writeback(folio);
1665
}
1666
EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind);
1667
1668
/**
1669
* folio_end_writeback - End writeback against a folio.
1670
* @folio: The folio.
1671
*
1672
* The folio must actually be under writeback.
1673
*
1674
* Context: May be called from process or interrupt context.
1675
*/
1676
void folio_end_writeback(struct folio *folio)
1677
{
1678
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
1679
1680
/*
1681
* Writeback does not hold a folio reference of its own, relying
1682
* on truncation to wait for the clearing of PG_writeback.
1683
* But here we must make sure that the folio is not freed and
1684
* reused before the folio_wake_bit().
1685
*/
1686
folio_get(folio);
1687
folio_end_writeback_no_dropbehind(folio);
1688
folio_end_dropbehind(folio);
1689
folio_put(folio);
1690
}
1691
EXPORT_SYMBOL(folio_end_writeback);
1692
1693
/**
1694
* __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
1695
* @folio: The folio to lock
1696
*/
1697
void __folio_lock(struct folio *folio)
1698
{
1699
folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
1700
EXCLUSIVE);
1701
}
1702
EXPORT_SYMBOL(__folio_lock);
1703
1704
int __folio_lock_killable(struct folio *folio)
1705
{
1706
return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
1707
EXCLUSIVE);
1708
}
1709
EXPORT_SYMBOL_GPL(__folio_lock_killable);
1710
1711
static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
1712
{
1713
struct wait_queue_head *q = folio_waitqueue(folio);
1714
int ret;
1715
1716
wait->folio = folio;
1717
wait->bit_nr = PG_locked;
1718
1719
spin_lock_irq(&q->lock);
1720
__add_wait_queue_entry_tail(q, &wait->wait);
1721
folio_set_waiters(folio);
1722
ret = !folio_trylock(folio);
1723
/*
1724
* If we were successful now, we know we're still on the
1725
* waitqueue as we're still under the lock. This means it's
1726
* safe to remove and return success, we know the callback
1727
* isn't going to trigger.
1728
*/
1729
if (!ret)
1730
__remove_wait_queue(q, &wait->wait);
1731
else
1732
ret = -EIOCBQUEUED;
1733
spin_unlock_irq(&q->lock);
1734
return ret;
1735
}
1736
1737
/*
* Return values:
* 0 - folio is locked.
* non-zero - folio is not locked.
* mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
* vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
* FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
*
* If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
* with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
*/
1748
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
1749
{
1750
unsigned int flags = vmf->flags;
1751
1752
if (fault_flag_allow_retry_first(flags)) {
1753
/*
1754
* CAUTION! In this case, mmap_lock/per-VMA lock is not
1755
* released even though returning VM_FAULT_RETRY.
1756
*/
1757
if (flags & FAULT_FLAG_RETRY_NOWAIT)
1758
return VM_FAULT_RETRY;
1759
1760
release_fault_lock(vmf);
1761
if (flags & FAULT_FLAG_KILLABLE)
1762
folio_wait_locked_killable(folio);
1763
else
1764
folio_wait_locked(folio);
1765
return VM_FAULT_RETRY;
1766
}
1767
if (flags & FAULT_FLAG_KILLABLE) {
1768
bool ret;
1769
1770
ret = __folio_lock_killable(folio);
1771
if (ret) {
1772
release_fault_lock(vmf);
1773
return VM_FAULT_RETRY;
1774
}
1775
} else {
1776
__folio_lock(folio);
1777
}
1778
1779
return 0;
1780
}
1781
1782
/**
* page_cache_next_miss() - Find the next gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
*
* Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
* gap with the lowest index.
*
* This function may be called under the rcu_read_lock. However, this will
* not atomically search a snapshot of the cache at a single point in time.
* For example, if a gap is created at index 5, then subsequently a gap is
* created at index 10, page_cache_next_miss covering both indices may
* return 10 if called under the rcu_read_lock.
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'return - index >= max_scan' will be true).
* In the rare case of index wrap-around, 0 will be returned.
*/
1801
pgoff_t page_cache_next_miss(struct address_space *mapping,
1802
pgoff_t index, unsigned long max_scan)
1803
{
1804
XA_STATE(xas, &mapping->i_pages, index);
1805
unsigned long nr = max_scan;
1806
1807
while (nr--) {
1808
void *entry = xas_next(&xas);
1809
if (!entry || xa_is_value(entry))
1810
return xas.xa_index;
1811
if (xas.xa_index == 0)
1812
return 0;
1813
}
1814
1815
return index + max_scan;
1816
}
1817
EXPORT_SYMBOL(page_cache_next_miss);
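
/*
* Editorial example (illustrative sketch): page_cache_next_miss() can be used
* to measure how long a contiguous run of cached folios is starting at a given
* index, e.g. when deciding how much further to read ahead. The example_* name
* is hypothetical; the rare wrap-around case documented above is ignored here.
*/
static unsigned long __maybe_unused
example_cached_run_pages(struct address_space *mapping, pgoff_t index,
		unsigned long max_scan)
{
	pgoff_t gap = page_cache_next_miss(mapping, index, max_scan);

	/* 0 if @index itself is not cached, max_scan if no gap was found */
	return gap - index;
}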
1818
1819
/**
* page_cache_prev_miss() - Find the previous gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
*
* Search the range [max(index - max_scan + 1, 0), index] for the
* gap with the highest index.
*
* This function may be called under the rcu_read_lock. However, this will
* not atomically search a snapshot of the cache at a single point in time.
* For example, if a gap is created at index 10, then subsequently a gap is
* created at index 5, page_cache_prev_miss() covering both indices may
* return 5 if called under the rcu_read_lock.
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'index - return >= max_scan' will be true).
* In the rare case of wrap-around, ULONG_MAX will be returned.
*/
1838
pgoff_t page_cache_prev_miss(struct address_space *mapping,
1839
pgoff_t index, unsigned long max_scan)
1840
{
1841
XA_STATE(xas, &mapping->i_pages, index);
1842
1843
while (max_scan--) {
1844
void *entry = xas_prev(&xas);
1845
if (!entry || xa_is_value(entry))
1846
break;
1847
if (xas.xa_index == ULONG_MAX)
1848
break;
1849
}
1850
1851
return xas.xa_index;
1852
}
1853
EXPORT_SYMBOL(page_cache_prev_miss);
1854
1855
/*
* Lockless page cache protocol:
* On the lookup side:
* 1. Load the folio from i_pages
* 2. Increment the refcount if it's not zero
* 3. If the folio is not found by xas_reload(), put the refcount and retry
*
* On the removal side:
* A. Freeze the page (by zeroing the refcount if nobody else has a reference)
* B. Remove the page from i_pages
* C. Return the page to the page allocator
*
* This means that any page may have its reference count temporarily
* increased by a speculative page cache (or GUP-fast) lookup as it can
* be allocated by another user before the RCU grace period expires.
* Because the refcount temporarily acquired here may end up being the
* last refcount on the page, any page allocation must be freeable by
* folio_put().
*/
1874
1875
/*
* filemap_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
* Looks up the page cache entry at @mapping & @index. If it is a folio,
* it is returned with an increased refcount. If it is a shadow entry
* of a previously evicted folio, or a swap entry from shmem/tmpfs,
* it is returned without further action.
*
* Return: The folio, swap or shadow entry, %NULL if nothing is found.
*/
1887
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
1888
{
1889
XA_STATE(xas, &mapping->i_pages, index);
1890
struct folio *folio;
1891
1892
rcu_read_lock();
1893
repeat:
1894
xas_reset(&xas);
1895
folio = xas_load(&xas);
1896
if (xas_retry(&xas, folio))
1897
goto repeat;
1898
/*
1899
* A shadow entry of a recently evicted page, or a swap entry from
1900
* shmem/tmpfs. Return it without attempting to raise page count.
1901
*/
1902
if (!folio || xa_is_value(folio))
1903
goto out;
1904
1905
if (!folio_try_get(folio))
1906
goto repeat;
1907
1908
if (unlikely(folio != xas_reload(&xas))) {
1909
folio_put(folio);
1910
goto repeat;
1911
}
1912
out:
1913
rcu_read_unlock();
1914
1915
return folio;
1916
}
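
/*
* Editorial example (illustrative sketch): how a caller typically
* distinguishes the three kinds of return value from filemap_get_entry().
* The example_* name is hypothetical.
*/
static struct folio *__maybe_unused
example_lookup_folio(struct address_space *mapping, pgoff_t index)
{
	void *entry = filemap_get_entry(mapping, index);

	/* NULL: nothing cached; xa_is_value(): shadow or swap entry */
	if (!entry || xa_is_value(entry))
		return NULL;

	/* A real folio with an elevated refcount; folio_put() when done */
	return entry;
}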
1917
1918
/**
* __filemap_get_folio_mpol - Find and get a reference to a folio.
* @mapping: The address_space to search.
* @index: The page index.
* @fgp_flags: %FGP flags modify how the folio is returned.
* @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
* @policy: NUMA memory allocation policy to follow.
*
* Looks up the page cache entry at @mapping & @index.
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
* If this function returns a folio, it is returned with an increased refcount.
*
* Return: The found folio or an ERR_PTR() otherwise.
*/
1935
struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
1936
pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy)
1937
{
1938
struct folio *folio;
1939
1940
repeat:
1941
folio = filemap_get_entry(mapping, index);
1942
if (xa_is_value(folio))
1943
folio = NULL;
1944
if (!folio)
1945
goto no_page;
1946
1947
if (fgp_flags & FGP_LOCK) {
1948
if (fgp_flags & FGP_NOWAIT) {
1949
if (!folio_trylock(folio)) {
1950
folio_put(folio);
1951
return ERR_PTR(-EAGAIN);
1952
}
1953
} else {
1954
folio_lock(folio);
1955
}
1956
1957
/* Has the page been truncated? */
1958
if (unlikely(folio->mapping != mapping)) {
1959
folio_unlock(folio);
1960
folio_put(folio);
1961
goto repeat;
1962
}
1963
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
1964
}
1965
1966
if (fgp_flags & FGP_ACCESSED)
1967
folio_mark_accessed(folio);
1968
else if (fgp_flags & FGP_WRITE) {
1969
/* Clear idle flag for buffer write */
1970
if (folio_test_idle(folio))
1971
folio_clear_idle(folio);
1972
}
1973
1974
if (fgp_flags & FGP_STABLE)
1975
folio_wait_stable(folio);
1976
no_page:
1977
if (!folio && (fgp_flags & FGP_CREAT)) {
1978
unsigned int min_order = mapping_min_folio_order(mapping);
1979
unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
1980
int err;
1981
index = mapping_align_index(mapping, index);
1982
1983
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
1984
gfp |= __GFP_WRITE;
1985
if (fgp_flags & FGP_NOFS)
1986
gfp &= ~__GFP_FS;
1987
if (fgp_flags & FGP_NOWAIT) {
1988
gfp &= ~GFP_KERNEL;
1989
gfp |= GFP_NOWAIT;
1990
}
1991
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
1992
fgp_flags |= FGP_LOCK;
1993
1994
if (order > mapping_max_folio_order(mapping))
1995
order = mapping_max_folio_order(mapping);
1996
/* If we're not aligned, allocate a smaller folio */
1997
if (index & ((1UL << order) - 1))
1998
order = __ffs(index);
1999
2000
do {
2001
gfp_t alloc_gfp = gfp;
2002
2003
err = -ENOMEM;
2004
if (order > min_order)
2005
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
2006
folio = filemap_alloc_folio(alloc_gfp, order, policy);
2007
if (!folio)
2008
continue;
2009
2010
/* Init accessed so avoid atomic mark_page_accessed later */
2011
if (fgp_flags & FGP_ACCESSED)
2012
__folio_set_referenced(folio);
2013
if (fgp_flags & FGP_DONTCACHE)
2014
__folio_set_dropbehind(folio);
2015
2016
err = filemap_add_folio(mapping, folio, index, gfp);
2017
if (!err)
2018
break;
2019
folio_put(folio);
2020
folio = NULL;
2021
} while (order-- > min_order);
2022
2023
if (err == -EEXIST)
2024
goto repeat;
2025
if (err) {
2026
/*
* When NOWAIT I/O fails to allocate folios this could
* be due to a nonblocking memory allocation and not
* because the system actually is out of memory.
* Return -EAGAIN so that the caller retries in a
* blocking fashion instead of propagating -ENOMEM
* to the application.
*/
2034
if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
2035
err = -EAGAIN;
2036
return ERR_PTR(err);
2037
}
2038
/*
2039
* filemap_add_folio locks the page, and for mmap
2040
* we expect an unlocked page.
2041
*/
2042
if (folio && (fgp_flags & FGP_FOR_MMAP))
2043
folio_unlock(folio);
2044
}
2045
2046
if (!folio)
2047
return ERR_PTR(-ENOENT);
2048
/* not an uncached lookup, clear uncached if set */
2049
if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
2050
folio_clear_dropbehind(folio);
2051
return folio;
2052
}
2053
EXPORT_SYMBOL(__filemap_get_folio_mpol);
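
/*
* Editorial example (illustrative sketch): find-or-create a locked folio, as a
* buffered-write style path would. The example_* name is hypothetical; the FGP
* flags and mapping_gfp_mask() are existing interfaces used elsewhere in this
* file, and a NULL policy is assumed to mean the default NUMA policy.
*/
static struct folio *__maybe_unused
example_grab_folio(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio_mpol(mapping, index,
			FGP_LOCK | FGP_CREAT | FGP_ACCESSED,
			mapping_gfp_mask(mapping), NULL);
	if (IS_ERR(folio))
		return NULL;

	/* Locked and referenced: caller must folio_unlock() and folio_put() */
	return folio;
}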
2054
2055
static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
2056
xa_mark_t mark)
2057
{
2058
struct folio *folio;
2059
2060
retry:
2061
if (mark == XA_PRESENT)
2062
folio = xas_find(xas, max);
2063
else
2064
folio = xas_find_marked(xas, max, mark);
2065
2066
if (xas_retry(xas, folio))
2067
goto retry;
2068
/*
2069
* A shadow entry of a recently evicted page, a swap
2070
* entry from shmem/tmpfs or a DAX entry. Return it
2071
* without attempting to raise page count.
2072
*/
2073
if (!folio || xa_is_value(folio))
2074
return folio;
2075
2076
if (!folio_try_get(folio))
2077
goto reset;
2078
2079
if (unlikely(folio != xas_reload(xas))) {
2080
folio_put(folio);
2081
goto reset;
2082
}
2083
2084
return folio;
2085
reset:
2086
xas_reset(xas);
2087
goto retry;
2088
}
2089
2090
/**
* find_get_entries - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page cache index
* @end: The final page index (inclusive).
* @fbatch: Where the resulting entries are placed.
* @indices: The cache indices corresponding to the entries in @fbatch
*
* find_get_entries() will search for and return a batch of entries in
* the mapping. The entries are placed in @fbatch. find_get_entries()
* takes a reference on any actual folios it returns.
*
* The entries have ascending indexes. The indices may not be consecutive
* due to not-present entries or large folios.
*
* Any shadow entries of evicted folios, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
* Return: The number of entries which were found.
*/
2110
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
2111
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
2112
{
2113
XA_STATE(xas, &mapping->i_pages, *start);
2114
struct folio *folio;
2115
2116
rcu_read_lock();
2117
while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
2118
indices[fbatch->nr] = xas.xa_index;
2119
if (!folio_batch_add(fbatch, folio))
2120
break;
2121
}
2122
2123
if (folio_batch_count(fbatch)) {
2124
unsigned long nr;
2125
int idx = folio_batch_count(fbatch) - 1;
2126
2127
folio = fbatch->folios[idx];
2128
if (!xa_is_value(folio))
2129
nr = folio_nr_pages(folio);
2130
else
2131
nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);
2132
*start = round_down(indices[idx] + nr, nr);
2133
}
2134
rcu_read_unlock();
2135
2136
return folio_batch_count(fbatch);
2137
}
2138
2139
/**
* find_lock_entries - Find a batch of pagecache entries.
* @mapping: The address_space to search.
* @start: The starting page cache index.
* @end: The final page index (inclusive).
* @fbatch: Where the resulting entries are placed.
* @indices: The cache indices of the entries in @fbatch.
*
* find_lock_entries() will return a batch of entries from @mapping.
* Swap, shadow and DAX entries are included. Folios are returned
* locked and with an incremented refcount. Folios which are locked
* by somebody else or under writeback are skipped. Folios which are
* partially outside the range are not returned.
*
* The entries have ascending indexes. The indices may not be consecutive
* due to not-present entries, large folios, folios which could not be
* locked or folios under writeback.
*
* Return: The number of entries which were found.
*/
2159
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
2160
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
2161
{
2162
XA_STATE(xas, &mapping->i_pages, *start);
2163
struct folio *folio;
2164
2165
rcu_read_lock();
2166
while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
2167
unsigned long base;
2168
unsigned long nr;
2169
2170
if (!xa_is_value(folio)) {
2171
nr = folio_nr_pages(folio);
2172
base = folio->index;
2173
/* Omit large folio which begins before the start */
2174
if (base < *start)
2175
goto put;
2176
/* Omit large folio which extends beyond the end */
2177
if (base + nr - 1 > end)
2178
goto put;
2179
if (!folio_trylock(folio))
2180
goto put;
2181
if (folio->mapping != mapping ||
2182
folio_test_writeback(folio))
2183
goto unlock;
2184
VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
2185
folio);
2186
} else {
2187
nr = 1 << xas_get_order(&xas);
2188
base = xas.xa_index & ~(nr - 1);
2189
/* Omit order>0 value which begins before the start */
2190
if (base < *start)
2191
continue;
2192
/* Omit order>0 value which extends beyond the end */
2193
if (base + nr - 1 > end)
2194
break;
2195
}
2196
2197
/* Update start now so that last update is correct on return */
2198
*start = base + nr;
2199
indices[fbatch->nr] = xas.xa_index;
2200
if (!folio_batch_add(fbatch, folio))
2201
break;
2202
continue;
2203
unlock:
2204
folio_unlock(folio);
2205
put:
2206
folio_put(folio);
2207
}
2208
rcu_read_unlock();
2209
2210
return folio_batch_count(fbatch);
2211
}
2212
2213
/**
* filemap_get_folios - Get a batch of folios
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index (inclusive)
* @fbatch: The batch to fill.
*
* Search for and return a batch of folios in the mapping starting at
* index @start and up to index @end (inclusive). The folios are returned
* in @fbatch with an elevated reference count.
*
* Return: The number of folios which were found.
* We also update @start to index the next folio for the traversal.
*/
2227
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
2228
pgoff_t end, struct folio_batch *fbatch)
2229
{
2230
return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
2231
}
2232
EXPORT_SYMBOL(filemap_get_folios);
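
/*
* Editorial example (illustrative sketch): the common batched-iteration
* pattern for walking every cached folio in a range, here used to count how
* many pages of a range are resident. The example_* name is hypothetical.
*/
static unsigned long __maybe_unused
example_count_cached_pages(struct address_space *mapping, pgoff_t start,
		pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned long pages = 0;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (filemap_get_folios(mapping, &start, end, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			pages += folio_nr_pages(fbatch.folios[i]);
		/* Drops the references taken by filemap_get_folios() */
		folio_batch_release(&fbatch);
		cond_resched();
	}
	return pages;
}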
2233
2234
/**
* filemap_get_folios_contig - Get a batch of contiguous folios
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index (inclusive)
* @fbatch: The batch to fill
*
* filemap_get_folios_contig() works exactly like filemap_get_folios(),
* except the returned folios are guaranteed to be contiguous. This may
* not return all contiguous folios if the batch gets filled up.
*
* Return: The number of folios found.
* Also update @start to be positioned for traversal of the next folio.
*/
2248
2249
unsigned filemap_get_folios_contig(struct address_space *mapping,
2250
pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
2251
{
2252
XA_STATE(xas, &mapping->i_pages, *start);
2253
unsigned long nr;
2254
struct folio *folio;
2255
2256
rcu_read_lock();
2257
2258
for (folio = xas_load(&xas); folio && xas.xa_index <= end;
2259
folio = xas_next(&xas)) {
2260
if (xas_retry(&xas, folio))
2261
continue;
2262
/*
2263
* If the entry has been swapped out, we can stop looking.
2264
* No current caller is looking for DAX entries.
2265
*/
2266
if (xa_is_value(folio))
2267
goto update_start;
2268
2269
/* If we landed in the middle of a THP, continue at its end. */
2270
if (xa_is_sibling(folio))
2271
goto update_start;
2272
2273
if (!folio_try_get(folio))
2274
goto retry;
2275
2276
if (unlikely(folio != xas_reload(&xas)))
2277
goto put_folio;
2278
2279
if (!folio_batch_add(fbatch, folio)) {
2280
nr = folio_nr_pages(folio);
2281
*start = folio->index + nr;
2282
goto out;
2283
}
2284
xas_advance(&xas, folio_next_index(folio) - 1);
2285
continue;
2286
put_folio:
2287
folio_put(folio);
2288
2289
retry:
2290
xas_reset(&xas);
2291
}
2292
2293
update_start:
2294
nr = folio_batch_count(fbatch);
2295
2296
if (nr) {
2297
folio = fbatch->folios[nr - 1];
2298
*start = folio_next_index(folio);
2299
}
2300
out:
2301
rcu_read_unlock();
2302
return folio_batch_count(fbatch);
2303
}
2304
EXPORT_SYMBOL(filemap_get_folios_contig);
2305
2306
/**
* filemap_get_folios_tag - Get a batch of folios matching @tag
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index (inclusive)
* @tag: The tag index
* @fbatch: The batch to fill
*
* The first folio may start before @start; if it does, it will contain
* @start. The final folio may extend beyond @end; if it does, it will
* contain @end. The folios have ascending indices. There may be gaps
* between the folios if there are indices which have no folio in the
* page cache. If folios are added to or removed from the page cache
* while this is running, they may or may not be found by this call.
* Only returns folios that are tagged with @tag.
*
* Return: The number of folios found.
* Also update @start to index the next folio for traversal.
*/
2325
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
2326
pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
2327
{
2328
XA_STATE(xas, &mapping->i_pages, *start);
2329
struct folio *folio;
2330
2331
rcu_read_lock();
2332
while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
2333
/*
2334
* Shadow entries should never be tagged, but this iteration
2335
* is lockless so there is a window for page reclaim to evict
2336
* a page we saw tagged. Skip over it.
2337
*/
2338
if (xa_is_value(folio))
2339
continue;
2340
if (!folio_batch_add(fbatch, folio)) {
2341
unsigned long nr = folio_nr_pages(folio);
2342
*start = folio->index + nr;
2343
goto out;
2344
}
2345
}
2346
/*
* We come here when there is no page beyond @end. We take care not to
* overflow the index @start as it confuses some of the callers. This
* breaks the iteration when there is a page at index -1 but that is
* already broken anyway.
*/
2352
if (end == (pgoff_t)-1)
2353
*start = (pgoff_t)-1;
2354
else
2355
*start = end + 1;
2356
out:
2357
rcu_read_unlock();
2358
2359
return folio_batch_count(fbatch);
2360
}
2361
EXPORT_SYMBOL(filemap_get_folios_tag);
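
/*
* Editorial example (illustrative sketch): a writeback-style walk over
* dirty-tagged folios. Because the lookup is lockless, state must be
* rechecked under the folio lock before acting on it. The example_* name is
* hypothetical and the actual writeback submission is elided.
*/
static void __maybe_unused
example_walk_dirty_folios(struct address_space *mapping, pgoff_t start,
		pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (filemap_get_folios_tag(mapping, &start, end,
				      PAGECACHE_TAG_DIRTY, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			folio_lock(folio);
			if (folio->mapping == mapping &&
			    folio_test_dirty(folio)) {
				/* ... start writeback on @folio ... */
			}
			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}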
2362
2363
/**
* filemap_get_folios_dirty - Get a batch of dirty folios
* @mapping: The address_space to search
* @start: The starting folio index
* @end: The final folio index (inclusive)
* @fbatch: The batch to fill
*
* filemap_get_folios_dirty() works exactly like filemap_get_folios(), except
* the returned folios are presumed to be dirty or undergoing writeback. Dirty
* state is presumed because we don't block on folio lock nor want to miss
* folios. Callers that need to can recheck state upon locking the folio.
*
* This may not return all dirty folios if the batch gets filled up.
*
* Return: The number of folios found.
* Also update @start to be positioned for traversal of the next folio.
*/
2380
unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
2381
pgoff_t end, struct folio_batch *fbatch)
2382
{
2383
XA_STATE(xas, &mapping->i_pages, *start);
2384
struct folio *folio;
2385
2386
rcu_read_lock();
2387
while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
2388
if (xa_is_value(folio))
2389
continue;
2390
if (folio_trylock(folio)) {
2391
bool clean = !folio_test_dirty(folio) &&
2392
!folio_test_writeback(folio);
2393
folio_unlock(folio);
2394
if (clean) {
2395
folio_put(folio);
2396
continue;
2397
}
2398
}
2399
if (!folio_batch_add(fbatch, folio)) {
2400
unsigned long nr = folio_nr_pages(folio);
2401
*start = folio->index + nr;
2402
goto out;
2403
}
2404
}
2405
/*
* We come here when there is no folio beyond @end. We take care not to
* overflow the index @start as it confuses some of the callers. This
* breaks the iteration when there is a folio at index -1 but that is
* already broken anyway.
*/
2411
if (end == (pgoff_t)-1)
2412
*start = (pgoff_t)-1;
2413
else
2414
*start = end + 1;
2415
out:
2416
rcu_read_unlock();
2417
2418
return folio_batch_count(fbatch);
2419
}
2420
2421
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
*
* ---R__________________________________________B__________
* ^ reading here ^ bad block (assume 4k)
*
* read(R) => miss => readahead(R...B) => media error => frustrating retries
* => failing the whole request => read(R) => read(R+1) =>
* readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
* readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
* readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
*
* It is going insane. Fix it by quickly scaling down the readahead size.
*/
2436
static void shrink_readahead_size_eio(struct file_ra_state *ra)
2437
{
2438
ra->ra_pages /= 4;
2439
}
2440
2441
/*
* filemap_get_read_batch - Get a batch of folios for read
*
* Get a batch of folios which represent a contiguous range of bytes in
* the file. No exceptional entries will be returned. If @index is in
* the middle of a folio, the entire folio will be returned. The last
* folio in the batch may have the readahead flag set or the uptodate flag
* clear so that the caller can take the appropriate action.
*/
2450
static void filemap_get_read_batch(struct address_space *mapping,
2451
pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
2452
{
2453
XA_STATE(xas, &mapping->i_pages, index);
2454
struct folio *folio;
2455
2456
rcu_read_lock();
2457
for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
2458
if (xas_retry(&xas, folio))
2459
continue;
2460
if (xas.xa_index > max || xa_is_value(folio))
2461
break;
2462
if (xa_is_sibling(folio))
2463
break;
2464
if (!folio_try_get(folio))
2465
goto retry;
2466
2467
if (unlikely(folio != xas_reload(&xas)))
2468
goto put_folio;
2469
2470
if (!folio_batch_add(fbatch, folio))
2471
break;
2472
if (!folio_test_uptodate(folio))
2473
break;
2474
if (folio_test_readahead(folio))
2475
break;
2476
xas_advance(&xas, folio_next_index(folio) - 1);
2477
continue;
2478
put_folio:
2479
folio_put(folio);
2480
retry:
2481
xas_reset(&xas);
2482
}
2483
rcu_read_unlock();
2484
}
2485
2486
static int filemap_read_folio(struct file *file, filler_t filler,
2487
struct folio *folio)
2488
{
2489
bool workingset = folio_test_workingset(folio);
2490
unsigned long pflags;
2491
int error;
2492
2493
/* Start the actual read. The read will unlock the page. */
2494
if (unlikely(workingset))
2495
psi_memstall_enter(&pflags);
2496
error = filler(file, folio);
2497
if (unlikely(workingset))
2498
psi_memstall_leave(&pflags);
2499
if (error)
2500
return error;
2501
2502
error = folio_wait_locked_killable(folio);
2503
if (error)
2504
return error;
2505
if (folio_test_uptodate(folio))
2506
return 0;
2507
if (file)
2508
shrink_readahead_size_eio(&file->f_ra);
2509
return -EIO;
2510
}
2511
2512
static bool filemap_range_uptodate(struct address_space *mapping,
2513
loff_t pos, size_t count, struct folio *folio,
2514
bool need_uptodate)
2515
{
2516
if (folio_test_uptodate(folio))
2517
return true;
2518
/* pipes can't handle partially uptodate pages */
2519
if (need_uptodate)
2520
return false;
2521
if (!mapping->a_ops->is_partially_uptodate)
2522
return false;
2523
if (mapping->host->i_blkbits >= folio_shift(folio))
2524
return false;
2525
2526
if (folio_pos(folio) > pos) {
2527
count -= folio_pos(folio) - pos;
2528
pos = 0;
2529
} else {
2530
pos -= folio_pos(folio);
2531
}
2532
2533
if (pos == 0 && count >= folio_size(folio))
2534
return false;
2535
2536
return mapping->a_ops->is_partially_uptodate(folio, pos, count);
2537
}
2538
2539
static int filemap_update_page(struct kiocb *iocb,
2540
struct address_space *mapping, size_t count,
2541
struct folio *folio, bool need_uptodate)
2542
{
2543
int error;
2544
2545
if (iocb->ki_flags & IOCB_NOWAIT) {
2546
if (!filemap_invalidate_trylock_shared(mapping))
2547
return -EAGAIN;
2548
} else {
2549
filemap_invalidate_lock_shared(mapping);
2550
}
2551
2552
if (!folio_trylock(folio)) {
2553
error = -EAGAIN;
2554
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
2555
goto unlock_mapping;
2556
if (!(iocb->ki_flags & IOCB_WAITQ)) {
2557
filemap_invalidate_unlock_shared(mapping);
2558
/*
2559
* This is where we usually end up waiting for a
2560
* previously submitted readahead to finish.
2561
*/
2562
folio_put_wait_locked(folio, TASK_KILLABLE);
2563
return AOP_TRUNCATED_PAGE;
2564
}
2565
error = __folio_lock_async(folio, iocb->ki_waitq);
2566
if (error)
2567
goto unlock_mapping;
2568
}
2569
2570
error = AOP_TRUNCATED_PAGE;
2571
if (!folio->mapping)
2572
goto unlock;
2573
2574
error = 0;
2575
if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
2576
need_uptodate))
2577
goto unlock;
2578
2579
error = -EAGAIN;
2580
if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
2581
goto unlock;
2582
2583
error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
2584
folio);
2585
goto unlock_mapping;
2586
unlock:
2587
folio_unlock(folio);
2588
unlock_mapping:
2589
filemap_invalidate_unlock_shared(mapping);
2590
if (error == AOP_TRUNCATED_PAGE)
2591
folio_put(folio);
2592
return error;
2593
}
2594
2595
static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
2596
{
2597
struct address_space *mapping = iocb->ki_filp->f_mapping;
2598
struct folio *folio;
2599
int error;
2600
unsigned int min_order = mapping_min_folio_order(mapping);
2601
pgoff_t index;
2602
2603
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
2604
return -EAGAIN;
2605
2606
folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL);
2607
if (!folio)
2608
return -ENOMEM;
2609
if (iocb->ki_flags & IOCB_DONTCACHE)
2610
__folio_set_dropbehind(folio);
2611
2612
/*
2613
* Protect against truncate / hole punch. Grabbing invalidate_lock
2614
* here assures we cannot instantiate and bring uptodate new
2615
* pagecache folios after evicting page cache during truncate
2616
* and before actually freeing blocks. Note that we could
2617
* release invalidate_lock after inserting the folio into
2618
* the page cache as the locked folio would then be enough to
2619
* synchronize with hole punching. But there are code paths
2620
* such as filemap_update_page() filling in partially uptodate
2621
* pages or ->readahead() that need to hold invalidate_lock
2622
* while mapping blocks for IO so let's hold the lock here as
2623
* well to keep locking rules simple.
2624
*/
2625
filemap_invalidate_lock_shared(mapping);
2626
index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;
2627
error = filemap_add_folio(mapping, folio, index,
2628
mapping_gfp_constraint(mapping, GFP_KERNEL));
2629
if (error == -EEXIST)
2630
error = AOP_TRUNCATED_PAGE;
2631
if (error)
2632
goto error;
2633
2634
error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
2635
folio);
2636
if (error)
2637
goto error;
2638
2639
filemap_invalidate_unlock_shared(mapping);
2640
folio_batch_add(fbatch, folio);
2641
return 0;
2642
error:
2643
filemap_invalidate_unlock_shared(mapping);
2644
folio_put(folio);
2645
return error;
2646
}
2647
2648
static int filemap_readahead(struct kiocb *iocb, struct file *file,
2649
struct address_space *mapping, struct folio *folio,
2650
pgoff_t last_index)
2651
{
2652
DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
2653
2654
if (iocb->ki_flags & IOCB_NOIO)
2655
return -EAGAIN;
2656
if (iocb->ki_flags & IOCB_DONTCACHE)
2657
ractl.dropbehind = 1;
2658
page_cache_async_ra(&ractl, folio, last_index - folio->index);
2659
return 0;
2660
}
2661
2662
static int filemap_get_pages(struct kiocb *iocb, size_t count,
2663
struct folio_batch *fbatch, bool need_uptodate)
2664
{
2665
struct file *filp = iocb->ki_filp;
2666
struct address_space *mapping = filp->f_mapping;
2667
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
2668
pgoff_t last_index;
2669
struct folio *folio;
2670
unsigned int flags;
2671
int err = 0;
2672
2673
/* "last_index" is the index of the folio beyond the end of the read */
2674
last_index = round_up(iocb->ki_pos + count,
2675
mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT;
2676
retry:
2677
if (fatal_signal_pending(current))
2678
return -EINTR;
2679
2680
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
2681
if (!folio_batch_count(fbatch)) {
2682
DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);
2683
2684
if (iocb->ki_flags & IOCB_NOIO)
2685
return -EAGAIN;
2686
if (iocb->ki_flags & IOCB_NOWAIT)
2687
flags = memalloc_noio_save();
2688
if (iocb->ki_flags & IOCB_DONTCACHE)
2689
ractl.dropbehind = 1;
2690
page_cache_sync_ra(&ractl, last_index - index);
2691
if (iocb->ki_flags & IOCB_NOWAIT)
2692
memalloc_noio_restore(flags);
2693
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
2694
}
2695
if (!folio_batch_count(fbatch)) {
2696
err = filemap_create_folio(iocb, fbatch);
2697
if (err == AOP_TRUNCATED_PAGE)
2698
goto retry;
2699
return err;
2700
}
2701
2702
folio = fbatch->folios[folio_batch_count(fbatch) - 1];
2703
if (folio_test_readahead(folio)) {
2704
err = filemap_readahead(iocb, filp, mapping, folio, last_index);
2705
if (err)
2706
goto err;
2707
}
2708
if (!folio_test_uptodate(folio)) {
2709
if (folio_batch_count(fbatch) > 1) {
2710
err = -EAGAIN;
2711
goto err;
2712
}
2713
err = filemap_update_page(iocb, mapping, count, folio,
2714
need_uptodate);
2715
if (err)
2716
goto err;
2717
}
2718
2719
trace_mm_filemap_get_pages(mapping, index, last_index - 1);
2720
return 0;
2721
err:
2722
if (err < 0)
2723
folio_put(folio);
2724
if (likely(--fbatch->nr))
2725
return 0;
2726
if (err == AOP_TRUNCATED_PAGE)
2727
goto retry;
2728
return err;
2729
}
2730
2731
static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
2732
{
2733
unsigned int shift = folio_shift(folio);
2734
2735
return (pos1 >> shift == pos2 >> shift);
2736
}
2737
2738
static void filemap_end_dropbehind_read(struct folio *folio)
2739
{
2740
if (!folio_test_dropbehind(folio))
2741
return;
2742
if (folio_test_writeback(folio) || folio_test_dirty(folio))
2743
return;
2744
if (folio_trylock(folio)) {
2745
filemap_end_dropbehind(folio);
2746
folio_unlock(folio);
2747
}
2748
}
2749
2750
/**
* filemap_read - Read data from the page cache.
* @iocb: The iocb to read.
* @iter: Destination for the data.
* @already_read: Number of bytes already read by the caller.
*
* Copies data from the page cache. If the data is not currently present,
* uses the readahead and read_folio address_space operations to fetch it.
*
* Return: Total number of bytes copied, including those already read by
* the caller. If an error happens before any bytes are copied, returns
* a negative error number.
*/
2763
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
2764
ssize_t already_read)
2765
{
2766
struct file *filp = iocb->ki_filp;
2767
struct file_ra_state *ra = &filp->f_ra;
2768
struct address_space *mapping = filp->f_mapping;
2769
struct inode *inode = mapping->host;
2770
struct folio_batch fbatch;
2771
int i, error = 0;
2772
bool writably_mapped;
2773
loff_t isize, end_offset;
2774
loff_t last_pos = ra->prev_pos;
2775
2776
if (unlikely(iocb->ki_pos < 0))
2777
return -EINVAL;
2778
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
2779
return 0;
2780
if (unlikely(!iov_iter_count(iter)))
2781
return 0;
2782
2783
iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
2784
folio_batch_init(&fbatch);
2785
2786
do {
2787
cond_resched();
2788
2789
/*
2790
* If we've already successfully copied some data, then we
2791
* can no longer safely return -EIOCBQUEUED. Hence mark
2792
* an async read NOWAIT at that point.
2793
*/
2794
if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
2795
iocb->ki_flags |= IOCB_NOWAIT;
2796
2797
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
2798
break;
2799
2800
error = filemap_get_pages(iocb, iter->count, &fbatch, false);
2801
if (error < 0)
2802
break;
2803
2804
/*
2805
* i_size must be checked after we know the pages are Uptodate.
2806
*
2807
* Checking i_size after the check allows us to calculate
2808
* the correct value for "nr", which means the zero-filled
2809
* part of the page is not copied back to userspace (unless
2810
* another truncate extends the file - this is desired though).
2811
*/
2812
isize = i_size_read(inode);
2813
if (unlikely(iocb->ki_pos >= isize))
2814
goto put_folios;
2815
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
2816
2817
/*
2818
* Once we start copying data, we don't want to be touching any
2819
* cachelines that might be contended:
2820
*/
2821
writably_mapped = mapping_writably_mapped(mapping);
2822
2823
/*
2824
* When a read accesses the same folio several times, only
2825
* mark it as accessed the first time.
2826
*/
2827
if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
2828
fbatch.folios[0]))
2829
folio_mark_accessed(fbatch.folios[0]);
2830
2831
for (i = 0; i < folio_batch_count(&fbatch); i++) {
2832
struct folio *folio = fbatch.folios[i];
2833
size_t fsize = folio_size(folio);
2834
size_t offset = iocb->ki_pos & (fsize - 1);
2835
size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
2836
fsize - offset);
2837
size_t copied;
2838
2839
if (end_offset < folio_pos(folio))
2840
break;
2841
if (i > 0)
2842
folio_mark_accessed(folio);
2843
/*
2844
* If users can be writing to this folio using arbitrary
2845
* virtual addresses, take care of potential aliasing
2846
* before reading the folio on the kernel side.
2847
*/
2848
if (writably_mapped)
2849
flush_dcache_folio(folio);
2850
2851
copied = copy_folio_to_iter(folio, offset, bytes, iter);
2852
2853
already_read += copied;
2854
iocb->ki_pos += copied;
2855
last_pos = iocb->ki_pos;
2856
2857
if (copied < bytes) {
2858
error = -EFAULT;
2859
break;
2860
}
2861
}
2862
put_folios:
2863
for (i = 0; i < folio_batch_count(&fbatch); i++) {
2864
struct folio *folio = fbatch.folios[i];
2865
2866
filemap_end_dropbehind_read(folio);
2867
folio_put(folio);
2868
}
2869
folio_batch_init(&fbatch);
2870
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
2871
2872
file_accessed(filp);
2873
ra->prev_pos = last_pos;
2874
return already_read ? already_read : error;
2875
}
2876
EXPORT_SYMBOL_GPL(filemap_read);
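
/*
* Editorial example (illustrative sketch): a minimal, buffered-only
* ->read_iter() built directly on filemap_read(), for a filesystem that does
* not support direct I/O. The example_* name is hypothetical.
*/
static ssize_t __maybe_unused
example_buffered_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	/* No bytes read so far, so pass 0 for @already_read */
	return filemap_read(iocb, to, 0);
}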
2877
2878
int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
2879
{
2880
struct address_space *mapping = iocb->ki_filp->f_mapping;
2881
loff_t pos = iocb->ki_pos;
2882
loff_t end = pos + count - 1;
2883
2884
if (iocb->ki_flags & IOCB_NOWAIT) {
2885
if (filemap_range_needs_writeback(mapping, pos, end))
2886
return -EAGAIN;
2887
return 0;
2888
}
2889
2890
return filemap_write_and_wait_range(mapping, pos, end);
2891
}
2892
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
2893
2894
int filemap_invalidate_pages(struct address_space *mapping,
2895
loff_t pos, loff_t end, bool nowait)
2896
{
2897
int ret;
2898
2899
if (nowait) {
2900
/* we could block if there are any pages in the range */
2901
if (filemap_range_has_page(mapping, pos, end))
2902
return -EAGAIN;
2903
} else {
2904
ret = filemap_write_and_wait_range(mapping, pos, end);
2905
if (ret)
2906
return ret;
2907
}
2908
2909
/*
* After a write we want buffered reads to be sure to go to disk to get
* the new data. We invalidate clean cached pages from the region we're
* about to write. We do this *before* the write so that we can return
* without clobbering -EIOCBQUEUED from ->direct_IO().
*/
2915
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
2916
end >> PAGE_SHIFT);
2917
}
2918
2919
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
2920
{
2921
struct address_space *mapping = iocb->ki_filp->f_mapping;
2922
2923
return filemap_invalidate_pages(mapping, iocb->ki_pos,
2924
iocb->ki_pos + count - 1,
2925
iocb->ki_flags & IOCB_NOWAIT);
2926
}
2927
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
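
/*
* Editorial example (illustrative sketch): how a direct-I/O write path would
* typically use kiocb_invalidate_pages() before submitting the write, so that
* later buffered reads see the new data. The example_* name is hypothetical
* and the actual I/O submission is elided.
*/
static int __maybe_unused
example_dio_write_prep(struct kiocb *iocb, size_t count)
{
	int ret;

	/* Flushes dirty pagecache and invalidates clean pages in the range */
	ret = kiocb_invalidate_pages(iocb, count);
	if (ret)
		return ret;

	/* ... now submit the direct write for @count bytes at iocb->ki_pos ... */
	return 0;
}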
2928
2929
/**
* generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
* @iter: destination for the data read
*
* This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
*
* The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
* be returned when no data can be read without waiting for I/O requests
* to complete; it doesn't prevent readahead.
*
* The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
* requests shall be made for the read or for readahead. When no data
* can be read, -EAGAIN shall be returned. When readahead would be
* triggered, a partial, possibly empty read shall be returned.
*
* Return:
* * number of bytes copied, even for partial reads
* * negative error code (or 0 if IOCB_NOIO) if nothing was read
*/
2950
ssize_t
2951
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2952
{
2953
size_t count = iov_iter_count(iter);
2954
ssize_t retval = 0;
2955
2956
if (!count)
2957
return 0; /* skip atime */
2958
2959
if (iocb->ki_flags & IOCB_DIRECT) {
2960
struct file *file = iocb->ki_filp;
2961
struct address_space *mapping = file->f_mapping;
2962
struct inode *inode = mapping->host;
2963
2964
retval = kiocb_write_and_wait(iocb, count);
2965
if (retval < 0)
2966
return retval;
2967
file_accessed(file);
2968
2969
retval = mapping->a_ops->direct_IO(iocb, iter);
2970
if (retval >= 0) {
2971
iocb->ki_pos += retval;
2972
count -= retval;
2973
}
2974
if (retval != -EIOCBQUEUED)
2975
iov_iter_revert(iter, count - iov_iter_count(iter));
2976
2977
/*
2978
* Btrfs can have a short DIO read if we encounter
2979
* compressed extents, so if there was an error, or if
2980
* we've already read everything we wanted to, or if
2981
* there was a short read because we hit EOF, go ahead
2982
* and return. Otherwise fallthrough to buffered io for
2983
* the rest of the read. Buffered reads will not work for
2984
* DAX files, so don't bother trying.
2985
*/
2986
if (retval < 0 || !count || IS_DAX(inode))
2987
return retval;
2988
if (iocb->ki_pos >= i_size_read(inode))
2989
return retval;
2990
}
2991
2992
return filemap_read(iocb, iter, retval);
2993
}
2994
EXPORT_SYMBOL(generic_file_read_iter);
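
/*
* Editorial example (illustrative sketch): a page-cache based filesystem
* usually wires generic_file_read_iter() straight into its file_operations.
* The example_* name is hypothetical; the other helpers shown are existing
* generic implementations.
*/
static const struct file_operations example_file_fops __maybe_unused = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= filemap_splice_read,
};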
2995
2996
/*
2997
* Splice subpages from a folio into a pipe.
2998
*/
2999
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
3000
struct folio *folio, loff_t fpos, size_t size)
3001
{
3002
struct page *page;
3003
size_t spliced = 0, offset = offset_in_folio(folio, fpos);
3004
3005
page = folio_page(folio, offset / PAGE_SIZE);
3006
size = min(size, folio_size(folio) - offset);
3007
offset %= PAGE_SIZE;
3008
3009
while (spliced < size && !pipe_is_full(pipe)) {
3010
struct pipe_buffer *buf = pipe_head_buf(pipe);
3011
size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
3012
3013
*buf = (struct pipe_buffer) {
3014
.ops = &page_cache_pipe_buf_ops,
3015
.page = page,
3016
.offset = offset,
3017
.len = part,
3018
};
3019
folio_get(folio);
3020
pipe->head++;
3021
page++;
3022
spliced += part;
3023
offset = 0;
3024
}
3025
3026
return spliced;
3027
}
3028
3029
/**
* filemap_splice_read - Splice data from a file's pagecache into a pipe
* @in: The file to read from
* @ppos: Pointer to the file position to read from
* @pipe: The pipe to splice into
* @len: The amount to splice
* @flags: The SPLICE_F_* flags
*
* This function gets folios from a file's pagecache and splices them into the
* pipe. Readahead will be called as necessary to fill more folios. This may
* be used for blockdevs also.
*
* Return: On success, the number of bytes read will be returned and *@ppos
* will be updated if appropriate; 0 will be returned if there is no more data
* to be read; -EAGAIN will be returned if the pipe had no space, and some
* other negative error code will be returned on error. A short read may occur
* if the pipe has insufficient space, we reach the end of the data or we hit a
* hole.
*/
3048
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
3049
struct pipe_inode_info *pipe,
3050
size_t len, unsigned int flags)
3051
{
3052
struct folio_batch fbatch;
3053
struct kiocb iocb;
3054
size_t total_spliced = 0, used, npages;
3055
loff_t isize, end_offset;
3056
bool writably_mapped;
3057
int i, error = 0;
3058
3059
if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
3060
return 0;
3061
3062
init_sync_kiocb(&iocb, in);
3063
iocb.ki_pos = *ppos;
3064
3065
/* Work out how much data we can actually add into the pipe */
3066
used = pipe_buf_usage(pipe);
3067
npages = max_t(ssize_t, pipe->max_usage - used, 0);
3068
len = min_t(size_t, len, npages * PAGE_SIZE);
3069
3070
folio_batch_init(&fbatch);
3071
3072
do {
3073
cond_resched();
3074
3075
if (*ppos >= i_size_read(in->f_mapping->host))
3076
break;
3077
3078
iocb.ki_pos = *ppos;
3079
error = filemap_get_pages(&iocb, len, &fbatch, true);
3080
if (error < 0)
3081
break;
3082
3083
/*
3084
* i_size must be checked after we know the pages are Uptodate.
3085
*
3086
* Checking i_size after the check allows us to calculate
3087
* the correct value for "nr", which means the zero-filled
3088
* part of the page is not copied back to userspace (unless
3089
* another truncate extends the file - this is desired though).
3090
*/
3091
isize = i_size_read(in->f_mapping->host);
3092
if (unlikely(*ppos >= isize))
3093
break;
3094
end_offset = min_t(loff_t, isize, *ppos + len);
3095
3096
/*
3097
* Once we start copying data, we don't want to be touching any
3098
* cachelines that might be contended:
3099
*/
3100
writably_mapped = mapping_writably_mapped(in->f_mapping);
3101
3102
for (i = 0; i < folio_batch_count(&fbatch); i++) {
3103
struct folio *folio = fbatch.folios[i];
3104
size_t n;
3105
3106
if (folio_pos(folio) >= end_offset)
3107
goto out;
3108
folio_mark_accessed(folio);
3109
3110
/*
3111
* If users can be writing to this folio using arbitrary
3112
* virtual addresses, take care of potential aliasing
3113
* before reading the folio on the kernel side.
3114
*/
3115
if (writably_mapped)
3116
flush_dcache_folio(folio);
3117
3118
n = min_t(loff_t, len, isize - *ppos);
3119
n = splice_folio_into_pipe(pipe, folio, *ppos, n);
3120
if (!n)
3121
goto out;
3122
len -= n;
3123
total_spliced += n;
3124
*ppos += n;
3125
in->f_ra.prev_pos = *ppos;
3126
if (pipe_is_full(pipe))
3127
goto out;
3128
}
3129
3130
folio_batch_release(&fbatch);
3131
} while (len);
3132
3133
out:
3134
folio_batch_release(&fbatch);
3135
file_accessed(in);
3136
3137
return total_spliced ? total_spliced : error;
3138
}
3139
EXPORT_SYMBOL(filemap_splice_read);
3140
3141
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
3142
struct address_space *mapping, struct folio *folio,
3143
loff_t start, loff_t end, bool seek_data)
3144
{
3145
const struct address_space_operations *ops = mapping->a_ops;
3146
size_t offset, bsz = i_blocksize(mapping->host);
3147
3148
if (xa_is_value(folio) || folio_test_uptodate(folio))
3149
return seek_data ? start : end;
3150
if (!ops->is_partially_uptodate)
3151
return seek_data ? end : start;
3152
3153
xas_pause(xas);
3154
rcu_read_unlock();
3155
folio_lock(folio);
3156
if (unlikely(folio->mapping != mapping))
3157
goto unlock;
3158
3159
offset = offset_in_folio(folio, start) & ~(bsz - 1);
3160
3161
do {
3162
if (ops->is_partially_uptodate(folio, offset, bsz) ==
3163
seek_data)
3164
break;
3165
start = (start + bsz) & ~((u64)bsz - 1);
3166
offset += bsz;
3167
} while (offset < folio_size(folio));
3168
unlock:
3169
folio_unlock(folio);
3170
rcu_read_lock();
3171
return start;
3172
}
3173
3174
static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
3175
{
3176
if (xa_is_value(folio))
3177
return PAGE_SIZE << xas_get_order(xas);
3178
return folio_size(folio);
3179
}
3180
3181
/**
* mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
* @mapping: Address space to search.
* @start: First byte to consider.
* @end: Limit of search (exclusive).
* @whence: Either SEEK_HOLE or SEEK_DATA.
*
* If the page cache knows which blocks contain holes and which blocks
* contain data, your filesystem can use this function to implement
* SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are
* entirely memory-based such as tmpfs, and filesystems which support
* unwritten extents.
*
* Return: The requested offset on success, or -ENXIO if @whence specifies
* SEEK_DATA and there is no data after @start. There is an implicit hole
* after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
* and @end contain data.
*/
3199
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
3200
loff_t end, int whence)
3201
{
3202
XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
3203
pgoff_t max = (end - 1) >> PAGE_SHIFT;
3204
bool seek_data = (whence == SEEK_DATA);
3205
struct folio *folio;
3206
3207
if (end <= start)
3208
return -ENXIO;
3209
3210
rcu_read_lock();
3211
while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
3212
loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
3213
size_t seek_size;
3214
3215
if (start < pos) {
3216
if (!seek_data)
3217
goto unlock;
3218
start = pos;
3219
}
3220
3221
seek_size = seek_folio_size(&xas, folio);
3222
pos = round_up((u64)pos + 1, seek_size);
3223
start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
3224
seek_data);
3225
if (start < pos)
3226
goto unlock;
3227
if (start >= end)
3228
break;
3229
if (seek_size > PAGE_SIZE)
3230
xas_set(&xas, pos >> PAGE_SHIFT);
3231
if (!xa_is_value(folio))
3232
folio_put(folio);
3233
}
3234
if (seek_data)
3235
start = -ENXIO;
3236
unlock:
3237
rcu_read_unlock();
3238
if (folio && !xa_is_value(folio))
3239
folio_put(folio);
3240
if (start > end)
3241
return end;
3242
return start;
3243
}
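
/*
* Editorial example (illustrative sketch): an llseek helper for SEEK_HOLE /
* SEEK_DATA on a purely page-cache backed file, in the spirit of what tmpfs
* does. The example_* name is hypothetical and the usual inode locking is
* elided.
*/
static loff_t __maybe_unused
example_llseek_hole_data(struct file *file, loff_t offset, int whence)
{
	struct address_space *mapping = file->f_mapping;
	loff_t isize = i_size_read(mapping->host);

	if (offset < 0 || offset >= isize)
		return -ENXIO;

	return mapping_seek_hole_data(mapping, offset, isize, whence);
}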
3244
3245
#ifdef CONFIG_MMU
3246
#define MMAP_LOTSAMISS (100)
3247
/*
* lock_folio_maybe_drop_mmap - lock the folio, possibly dropping the mmap_lock
* @vmf - the vm_fault for this fault.
* @folio - the folio to lock.
* @fpin - the pointer to the file we may pin (or is already pinned).
*
* This works similarly to __folio_lock_or_retry() in that it can drop the
* mmap_lock. It differs in that it actually returns the folio locked
* if it returns 1 and 0 if it couldn't lock the folio. If we did have
* to drop the mmap_lock then fpin will point to the pinned file and
* needs to be fput()'ed at a later point.
*/
3259
static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
3260
struct file **fpin)
3261
{
3262
if (folio_trylock(folio))
3263
return 1;
3264
3265
/*
3266
* NOTE! This will make us return with VM_FAULT_RETRY, but with
3267
* the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
3268
* is supposed to work. We have way too many special cases..
3269
*/
3270
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
3271
return 0;
3272
3273
*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
3274
if (vmf->flags & FAULT_FLAG_KILLABLE) {
3275
if (__folio_lock_killable(folio)) {
3276
/*
3277
* We didn't have the right flags to drop the
3278
* fault lock, but all fault_handlers only check
3279
* for fatal signals if we return VM_FAULT_RETRY,
3280
* so we need to drop the fault lock here and
3281
* return 0 if we don't have a fpin.
3282
*/
3283
if (*fpin == NULL)
3284
release_fault_lock(vmf);
3285
return 0;
3286
}
3287
} else
3288
__folio_lock(folio);
3289
3290
return 1;
3291
}
3292
3293
/*
* Synchronous readahead happens when we don't even find a page in the page
* cache at all. We don't want to perform IO under the mmap_lock, so if we
* have to drop it we return the file that was pinned in order to do so.
* If we didn't pin a file then we return NULL. The file that is returned
* needs to be fput()'ed when we're done with it.
*/
3300
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
3301
{
3302
struct file *file = vmf->vma->vm_file;
3303
struct file_ra_state *ra = &file->f_ra;
3304
struct address_space *mapping = file->f_mapping;
3305
DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
3306
struct file *fpin = NULL;
3307
vm_flags_t vm_flags = vmf->vma->vm_flags;
3308
bool force_thp_readahead = false;
3309
unsigned short mmap_miss;
3310
3311
/* Use the readahead code, even if readahead is disabled */
3312
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3313
(vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER)
3314
force_thp_readahead = true;
3315
3316
if (!force_thp_readahead) {
3317
/*
3318
* If we don't want any read-ahead, don't bother.
3319
* VM_EXEC case below is already intended for random access.
3320
*/
3321
if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
3322
return fpin;
3323
3324
if (!ra->ra_pages)
3325
return fpin;
3326
3327
if (vm_flags & VM_SEQ_READ) {
3328
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3329
page_cache_sync_ra(&ractl, ra->ra_pages);
3330
return fpin;
3331
}
3332
}
3333
3334
if (!(vm_flags & VM_SEQ_READ)) {
3335
/* Avoid banging the cache line if not needed */
3336
mmap_miss = READ_ONCE(ra->mmap_miss);
3337
if (mmap_miss < MMAP_LOTSAMISS * 10)
3338
WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
3339
3340
/*
3341
* Do we miss much more than hit in this file? If so,
3342
* stop bothering with read-ahead. It will only hurt.
3343
*/
3344
if (mmap_miss > MMAP_LOTSAMISS)
3345
return fpin;
3346
}
3347
3348
if (force_thp_readahead) {
3349
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3350
ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
3351
ra->size = HPAGE_PMD_NR;
3352
/*
3353
* Fetch two PMD folios, so we get the chance to actually
3354
* readahead, unless we've been told not to.
3355
*/
3356
if (!(vm_flags & VM_RAND_READ))
3357
ra->size *= 2;
3358
ra->async_size = HPAGE_PMD_NR;
3359
ra->order = HPAGE_PMD_ORDER;
3360
page_cache_ra_order(&ractl, ra);
3361
return fpin;
3362
}
3363
3364
if (vm_flags & VM_EXEC) {
3365
/*
3366
* Allow arch to request a preferred minimum folio order for
3367
* executable memory. This can often be beneficial to
3368
* performance if (e.g.) arm64 can contpte-map the folio.
3369
* Executable memory rarely benefits from readahead, due to its
3370
* random access nature, so set async_size to 0.
3371
*
3372
* Limit to the boundaries of the VMA to avoid reading in any
3373
* pad that might exist between sections, which would be a waste
3374
* of memory.
3375
*/
3376
struct vm_area_struct *vma = vmf->vma;
3377
unsigned long start = vma->vm_pgoff;
3378
unsigned long end = start + vma_pages(vma);
3379
unsigned long ra_end;
3380
3381
ra->order = exec_folio_order();
3382
ra->start = round_down(vmf->pgoff, 1UL << ra->order);
3383
ra->start = max(ra->start, start);
3384
ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);
3385
ra_end = min(ra_end, end);
3386
ra->size = ra_end - ra->start;
3387
ra->async_size = 0;
3388
} else {
3389
/*
3390
* mmap read-around
3391
*/
3392
ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
3393
ra->size = ra->ra_pages;
3394
ra->async_size = ra->ra_pages / 4;
3395
ra->order = 0;
3396
}
3397
3398
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3399
ractl._index = ra->start;
3400
page_cache_ra_order(&ractl, ra);
3401
return fpin;
3402
}
3403
3404
/*
3405
* Asynchronous readahead happens when we find the page and PG_readahead,
3406
* so we want to possibly extend the readahead further. We return the file that
3407
* was pinned if we have to drop the mmap_lock in order to do IO.
3408
*/
3409
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
3410
struct folio *folio)
3411
{
3412
struct file *file = vmf->vma->vm_file;
3413
struct file_ra_state *ra = &file->f_ra;
3414
DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
3415
struct file *fpin = NULL;
3416
unsigned short mmap_miss;
3417
3418
/* If we don't want any read-ahead, don't bother */
3419
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
3420
return fpin;
3421
3422
/*
3423
* If the folio is locked, we're likely racing against another fault.
3424
* Don't touch the mmap_miss counter to avoid decreasing it multiple
3425
* times for a single folio and break the balance with mmap_miss
3426
* increase in do_sync_mmap_readahead().
3427
*/
3428
if (likely(!folio_test_locked(folio))) {
3429
mmap_miss = READ_ONCE(ra->mmap_miss);
3430
if (mmap_miss)
3431
WRITE_ONCE(ra->mmap_miss, --mmap_miss);
3432
}
3433
3434
if (folio_test_readahead(folio)) {
3435
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3436
page_cache_async_ra(&ractl, folio, ra->ra_pages);
3437
}
3438
return fpin;
3439
}
3440
3441
static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
3442
{
3443
struct vm_area_struct *vma = vmf->vma;
3444
vm_fault_t ret = 0;
3445
pte_t *ptep;
3446
3447
/*
3448
* We might have COW'ed a pagecache folio and might now have an mlocked
3449
* anon folio mapped. The original pagecache folio is not mlocked and
3450
* might have been evicted. During a read+clear/modify/write update of
3451
* the PTE, such as done in do_numa_page()/change_pte_range(), we
3452
* temporarily clear the PTE under PT lock and might detect it here as
3453
* "none" when not holding the PT lock.
3454
*
3455
* Not rechecking the PTE under PT lock could result in an unexpected
3456
* major fault in an mlock'ed region. Recheck only for this special
3457
* scenario while holding the PT lock, to not degrade non-mlocked
3458
* scenarios. Recheck the PTE without PT lock firstly, thereby reducing
3459
* the number of times we hold PT lock.
3460
*/
3461
if (!(vma->vm_flags & VM_LOCKED))
3462
return 0;
3463
3464
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
3465
return 0;
3466
3467
ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,
3468
&vmf->ptl);
3469
if (unlikely(!ptep))
3470
return VM_FAULT_NOPAGE;
3471
3472
if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
3473
ret = VM_FAULT_NOPAGE;
3474
} else {
3475
spin_lock(vmf->ptl);
3476
if (unlikely(!pte_none(ptep_get(ptep))))
3477
ret = VM_FAULT_NOPAGE;
3478
spin_unlock(vmf->ptl);
3479
}
3480
pte_unmap(ptep);
3481
return ret;
3482
}
3483
3484
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
* filemap_fault() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*
* vma->vm_mm->mmap_lock must be held on entry.
*
* If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
* may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_lock
* has not been released.
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
*
* Return: bitwise-OR of %VM_FAULT_ codes.
*/
3507
vm_fault_t filemap_fault(struct vm_fault *vmf)
3508
{
3509
int error;
3510
struct file *file = vmf->vma->vm_file;
3511
struct file *fpin = NULL;
3512
struct address_space *mapping = file->f_mapping;
3513
struct inode *inode = mapping->host;
3514
pgoff_t max_idx, index = vmf->pgoff;
3515
struct folio *folio;
3516
vm_fault_t ret = 0;
3517
bool mapping_locked = false;
3518
3519
max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3520
if (unlikely(index >= max_idx))
3521
return VM_FAULT_SIGBUS;
3522
3523
trace_mm_filemap_fault(mapping, index);
3524
3525
/*
3526
* Do we have something in the page cache already?
3527
*/
3528
folio = filemap_get_folio(mapping, index);
3529
if (likely(!IS_ERR(folio))) {
3530
/*
3531
* We found the page, so try async readahead before waiting for
3532
* the lock.
3533
*/
3534
if (!(vmf->flags & FAULT_FLAG_TRIED))
3535
fpin = do_async_mmap_readahead(vmf, folio);
3536
if (unlikely(!folio_test_uptodate(folio))) {
3537
filemap_invalidate_lock_shared(mapping);
3538
mapping_locked = true;
3539
}
3540
} else {
3541
ret = filemap_fault_recheck_pte_none(vmf);
3542
if (unlikely(ret))
3543
return ret;
3544
3545
/* No page in the page cache at all */
3546
count_vm_event(PGMAJFAULT);
3547
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
3548
ret = VM_FAULT_MAJOR;
3549
fpin = do_sync_mmap_readahead(vmf);
3550
retry_find:
3551
/*
3552
* See comment in filemap_create_folio() why we need
3553
* invalidate_lock
3554
*/
3555
if (!mapping_locked) {
3556
filemap_invalidate_lock_shared(mapping);
3557
mapping_locked = true;
3558
}
3559
folio = __filemap_get_folio(mapping, index,
3560
FGP_CREAT|FGP_FOR_MMAP,
3561
vmf->gfp_mask);
3562
if (IS_ERR(folio)) {
3563
if (fpin)
3564
goto out_retry;
3565
filemap_invalidate_unlock_shared(mapping);
3566
return VM_FAULT_OOM;
3567
}
3568
}
3569
3570
if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
3571
goto out_retry;
3572
3573
/* Did it get truncated? */
3574
if (unlikely(folio->mapping != mapping)) {
3575
folio_unlock(folio);
3576
folio_put(folio);
3577
goto retry_find;
3578
}
3579
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
3580
3581
/*
3582
* We have a locked folio in the page cache, now we need to check
3583
* that it's up-to-date. If not, it is going to be due to an error,
3584
* or because readahead was otherwise unable to retrieve it.
3585
*/
3586
if (unlikely(!folio_test_uptodate(folio))) {
3587
/*
3588
* If the invalidate lock is not held, the folio was in cache
3589
* and uptodate and now it is not. Strange but possible since we
3590
* didn't hold the page lock all the time. Let's drop
3591
* everything, get the invalidate lock and try again.
3592
*/
3593
if (!mapping_locked) {
3594
folio_unlock(folio);
3595
folio_put(folio);
3596
goto retry_find;
3597
}
3598
3599
/*
3600
* OK, the folio is really not uptodate. This can be because the
3601
* VMA has the VM_RAND_READ flag set, or because an error
3602
* arose. Let's read it in directly.
3603
*/
3604
goto page_not_uptodate;
3605
}
3606
3607
/*
3608
* We've made it this far and we had to drop our mmap_lock, now is the
3609
* time to return to the upper layer and have it re-find the vma and
3610
* redo the fault.
3611
*/
3612
if (fpin) {
3613
folio_unlock(folio);
3614
goto out_retry;
3615
}
3616
if (mapping_locked)
3617
filemap_invalidate_unlock_shared(mapping);
3618
3619
/*
3620
* Found the page and have a reference on it.
3621
* We must recheck i_size under page lock.
3622
*/
3623
max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3624
if (unlikely(index >= max_idx)) {
3625
folio_unlock(folio);
3626
folio_put(folio);
3627
return VM_FAULT_SIGBUS;
3628
}
3629
3630
vmf->page = folio_file_page(folio, index);
3631
return ret | VM_FAULT_LOCKED;
3632
3633
page_not_uptodate:
3634
/*
3635
* Umm, take care of errors if the page isn't up-to-date.
3636
* Try to re-read it _once_. We do this synchronously,
3637
* because there really aren't any performance issues here
3638
* and we need to check for errors.
3639
*/
3640
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3641
error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
3642
if (fpin)
3643
goto out_retry;
3644
folio_put(folio);
3645
3646
if (!error || error == AOP_TRUNCATED_PAGE)
3647
goto retry_find;
3648
filemap_invalidate_unlock_shared(mapping);
3649
3650
return VM_FAULT_SIGBUS;
3651
3652
out_retry:
3653
/*
3654
* We dropped the mmap_lock, we need to return to the fault handler to
3655
* re-find the vma and come back and find our hopefully still populated
3656
* page.
3657
*/
3658
if (!IS_ERR(folio))
3659
folio_put(folio);
3660
if (mapping_locked)
3661
filemap_invalidate_unlock_shared(mapping);
3662
if (fpin)
3663
fput(fpin);
3664
return ret | VM_FAULT_RETRY;
3665
}
3666
EXPORT_SYMBOL(filemap_fault);
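
/*
 * Illustrative sketch (not part of this file): a filesystem can provide its
 * own ->fault handler that layers extra work (here, a hypothetical fault
 * counter) on top of filemap_fault(), while reusing the other generic
 * callbacks unchanged.  All "example_*" names are hypothetical.
 */
static atomic_long_t example_fault_count;

static vm_fault_t example_fs_fault(struct vm_fault *vmf)
{
	/* Account the fault, then let the generic code do the real work. */
	atomic_long_inc(&example_fault_count);
	return filemap_fault(vmf);
}

static const struct vm_operations_struct example_fs_vm_ops __maybe_unused = {
	.fault		= example_fs_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};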
3667
3668
static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
3669
pgoff_t start)
3670
{
3671
struct mm_struct *mm = vmf->vma->vm_mm;
3672
3673
/* Huge page is mapped? No need to proceed. */
3674
if (pmd_trans_huge(*vmf->pmd)) {
3675
folio_unlock(folio);
3676
folio_put(folio);
3677
return true;
3678
}
3679
3680
if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
3681
struct page *page = folio_file_page(folio, start);
3682
vm_fault_t ret = do_set_pmd(vmf, folio, page);
3683
if (!ret) {
3684
/* The page is mapped successfully, reference consumed. */
3685
folio_unlock(folio);
3686
return true;
3687
}
3688
}
3689
3690
if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
3691
pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
3692
3693
return false;
3694
}
3695
3696
static struct folio *next_uptodate_folio(struct xa_state *xas,
3697
struct address_space *mapping, pgoff_t end_pgoff)
3698
{
3699
struct folio *folio = xas_next_entry(xas, end_pgoff);
3700
unsigned long max_idx;
3701
3702
do {
3703
if (!folio)
3704
return NULL;
3705
if (xas_retry(xas, folio))
3706
continue;
3707
if (xa_is_value(folio))
3708
continue;
3709
if (!folio_try_get(folio))
3710
continue;
3711
if (folio_test_locked(folio))
3712
goto skip;
3713
/* Has the page moved or been split? */
3714
if (unlikely(folio != xas_reload(xas)))
3715
goto skip;
3716
if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
3717
goto skip;
3718
if (!folio_trylock(folio))
3719
goto skip;
3720
if (folio->mapping != mapping)
3721
goto unlock;
3722
if (!folio_test_uptodate(folio))
3723
goto unlock;
3724
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3725
if (xas->xa_index >= max_idx)
3726
goto unlock;
3727
return folio;
3728
unlock:
3729
folio_unlock(folio);
3730
skip:
3731
folio_put(folio);
3732
} while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);
3733
3734
return NULL;
3735
}
3736
3737
/*
3738
* Map page range [start_page, start_page + nr_pages) of folio.
3739
* start_page is derived from start via folio_page(folio, start).
3740
*/
3741
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
3742
struct folio *folio, unsigned long start,
3743
unsigned long addr, unsigned int nr_pages,
3744
unsigned long *rss, unsigned short *mmap_miss,
3745
pgoff_t file_end)
3746
{
3747
struct address_space *mapping = folio->mapping;
3748
unsigned int ref_from_caller = 1;
3749
vm_fault_t ret = 0;
3750
struct page *page = folio_page(folio, start);
3751
unsigned int count = 0;
3752
pte_t *old_ptep = vmf->pte;
3753
unsigned long addr0;
3754
3755
/*
3756
* Map the large folio fully where possible:
3757
*
3758
* - The folio is fully within the size of the file or belongs
3759
* to shmem/tmpfs;
3760
* - The folio doesn't cross VMA boundary;
3761
* - The folio doesn't cross page table boundary;
3762
*/
3763
addr0 = addr - start * PAGE_SIZE;
3764
if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
3765
folio_within_vma(folio, vmf->vma) &&
3766
(addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
3767
vmf->pte -= start;
3768
page -= start;
3769
addr = addr0;
3770
nr_pages = folio_nr_pages(folio);
3771
}
3772
3773
do {
3774
if (PageHWPoison(page + count))
3775
goto skip;
3776
3777
/*
3778
* If there are too many folios that have recently been evicted
3779
* in a file, they will probably continue to be evicted.
3780
* In such a situation, read-ahead is only a waste of IO.
3781
* Don't decrease mmap_miss in this scenario to make sure
3782
* we can stop read-ahead.
3783
*/
3784
if (!folio_test_workingset(folio))
3785
(*mmap_miss)++;
3786
3787
/*
3788
* NOTE: If there are PTE markers, we'll leave them to be
3789
* handled in the specific fault path; this prohibits the
3790
* fault-around logic.
3791
*/
3792
if (!pte_none(ptep_get(&vmf->pte[count])))
3793
goto skip;
3794
3795
count++;
3796
continue;
3797
skip:
3798
if (count) {
3799
set_pte_range(vmf, folio, page, count, addr);
3800
*rss += count;
3801
folio_ref_add(folio, count - ref_from_caller);
3802
ref_from_caller = 0;
3803
if (in_range(vmf->address, addr, count * PAGE_SIZE))
3804
ret = VM_FAULT_NOPAGE;
3805
}
3806
3807
count++;
3808
page += count;
3809
vmf->pte += count;
3810
addr += count * PAGE_SIZE;
3811
count = 0;
3812
} while (--nr_pages > 0);
3813
3814
if (count) {
3815
set_pte_range(vmf, folio, page, count, addr);
3816
*rss += count;
3817
folio_ref_add(folio, count - ref_from_caller);
3818
ref_from_caller = 0;
3819
if (in_range(vmf->address, addr, count * PAGE_SIZE))
3820
ret = VM_FAULT_NOPAGE;
3821
}
3822
3823
vmf->pte = old_ptep;
3824
if (ref_from_caller)
3825
/* Locked folios cannot get truncated. */
3826
folio_ref_dec(folio);
3827
3828
return ret;
3829
}
3830
3831
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
3832
struct folio *folio, unsigned long addr,
3833
unsigned long *rss, unsigned short *mmap_miss)
3834
{
3835
vm_fault_t ret = 0;
3836
struct page *page = &folio->page;
3837
3838
if (PageHWPoison(page))
3839
goto out;
3840
3841
/* See comment of filemap_map_folio_range() */
3842
if (!folio_test_workingset(folio))
3843
(*mmap_miss)++;
3844
3845
/*
3846
* NOTE: If there are PTE markers, we'll leave them to be
3847
* handled in the specific fault path; this prohibits
3848
* the fault-around logic.
3849
*/
3850
if (!pte_none(ptep_get(vmf->pte)))
3851
goto out;
3852
3853
if (vmf->address == addr)
3854
ret = VM_FAULT_NOPAGE;
3855
3856
set_pte_range(vmf, folio, page, 1, addr);
3857
(*rss)++;
3858
return ret;
3859
3860
out:
3861
/* Locked folios cannot get truncated. */
3862
folio_ref_dec(folio);
3863
return ret;
3864
}
3865
3866
vm_fault_t filemap_map_pages(struct vm_fault *vmf,
3867
pgoff_t start_pgoff, pgoff_t end_pgoff)
3868
{
3869
struct vm_area_struct *vma = vmf->vma;
3870
struct file *file = vma->vm_file;
3871
struct address_space *mapping = file->f_mapping;
3872
pgoff_t file_end, last_pgoff = start_pgoff;
3873
unsigned long addr;
3874
XA_STATE(xas, &mapping->i_pages, start_pgoff);
3875
struct folio *folio;
3876
vm_fault_t ret = 0;
3877
unsigned long rss = 0;
3878
unsigned int nr_pages = 0, folio_type;
3879
unsigned short mmap_miss = 0, mmap_miss_saved;
3880
3881
rcu_read_lock();
3882
folio = next_uptodate_folio(&xas, mapping, end_pgoff);
3883
if (!folio)
3884
goto out;
3885
3886
file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
3887
end_pgoff = min(end_pgoff, file_end);
3888
3889
/*
3890
* Do not allow mapping with a PMD across i_size, to preserve
3891
* SIGBUS semantics.
3892
*
3893
* Make an exception for shmem/tmpfs, which has long been
3894
* intentionally mapped with PMDs across i_size.
3895
*/
3896
if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
3897
filemap_map_pmd(vmf, folio, start_pgoff)) {
3898
ret = VM_FAULT_NOPAGE;
3899
goto out;
3900
}
3901
3902
addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
3903
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
3904
if (!vmf->pte) {
3905
folio_unlock(folio);
3906
folio_put(folio);
3907
goto out;
3908
}
3909
3910
folio_type = mm_counter_file(folio);
3911
do {
3912
unsigned long end;
3913
3914
addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
3915
vmf->pte += xas.xa_index - last_pgoff;
3916
last_pgoff = xas.xa_index;
3917
end = folio_next_index(folio) - 1;
3918
nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
3919
3920
if (!folio_test_large(folio))
3921
ret |= filemap_map_order0_folio(vmf,
3922
folio, addr, &rss, &mmap_miss);
3923
else
3924
ret |= filemap_map_folio_range(vmf, folio,
3925
xas.xa_index - folio->index, addr,
3926
nr_pages, &rss, &mmap_miss, file_end);
3927
3928
folio_unlock(folio);
3929
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
3930
add_mm_counter(vma->vm_mm, folio_type, rss);
3931
pte_unmap_unlock(vmf->pte, vmf->ptl);
3932
trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
3933
out:
3934
rcu_read_unlock();
3935
3936
mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
3937
if (mmap_miss >= mmap_miss_saved)
3938
WRITE_ONCE(file->f_ra.mmap_miss, 0);
3939
else
3940
WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);
3941
3942
return ret;
3943
}
3944
EXPORT_SYMBOL(filemap_map_pages);
3945
3946
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
3947
{
3948
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
3949
struct folio *folio = page_folio(vmf->page);
3950
vm_fault_t ret = VM_FAULT_LOCKED;
3951
3952
sb_start_pagefault(mapping->host->i_sb);
3953
file_update_time(vmf->vma->vm_file);
3954
folio_lock(folio);
3955
if (folio->mapping != mapping) {
3956
folio_unlock(folio);
3957
ret = VM_FAULT_NOPAGE;
3958
goto out;
3959
}
3960
/*
3961
* We already mark the folio dirty here so that, when a freeze is in
3962
* progress, we are guaranteed that writeback during freezing will
3963
* see the dirty folio and writeprotect it again.
3964
*/
3965
folio_mark_dirty(folio);
3966
folio_wait_stable(folio);
3967
out:
3968
sb_end_pagefault(mapping->host->i_sb);
3969
return ret;
3970
}
3971
3972
const struct vm_operations_struct generic_file_vm_ops = {
3973
.fault = filemap_fault,
3974
.map_pages = filemap_map_pages,
3975
.page_mkwrite = filemap_page_mkwrite,
3976
};
3977
3978
/* This is used for a general mmap of a disk file */
3979
3980
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
3981
{
3982
struct address_space *mapping = file->f_mapping;
3983
3984
if (!mapping->a_ops->read_folio)
3985
return -ENOEXEC;
3986
file_accessed(file);
3987
vma->vm_ops = &generic_file_vm_ops;
3988
return 0;
3989
}
3990
3991
int generic_file_mmap_prepare(struct vm_area_desc *desc)
3992
{
3993
struct file *file = desc->file;
3994
struct address_space *mapping = file->f_mapping;
3995
3996
if (!mapping->a_ops->read_folio)
3997
return -ENOEXEC;
3998
file_accessed(file);
3999
desc->vm_ops = &generic_file_vm_ops;
4000
return 0;
4001
}
4002
4003
/*
4004
* This is for filesystems which do not implement ->writepage.
4005
*/
4006
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
4007
{
4008
if (vma_is_shared_maywrite(vma))
4009
return -EINVAL;
4010
return generic_file_mmap(file, vma);
4011
}
4012
4013
int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
4014
{
4015
if (is_shared_maywrite(desc->vm_flags))
4016
return -EINVAL;
4017
return generic_file_mmap_prepare(desc);
4018
}
4019
#else
4020
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
4021
{
4022
return VM_FAULT_SIGBUS;
4023
}
4024
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
4025
{
4026
return -ENOSYS;
4027
}
4028
int generic_file_mmap_prepare(struct vm_area_desc *desc)
4029
{
4030
return -ENOSYS;
4031
}
4032
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
4033
{
4034
return -ENOSYS;
4035
}
4036
int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
4037
{
4038
return -ENOSYS;
4039
}
4040
#endif /* CONFIG_MMU */
4041
4042
EXPORT_SYMBOL(filemap_page_mkwrite);
4043
EXPORT_SYMBOL(generic_file_mmap);
4044
EXPORT_SYMBOL(generic_file_mmap_prepare);
4045
EXPORT_SYMBOL(generic_file_readonly_mmap);
4046
EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);
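
/*
 * Illustrative sketch (not part of this file): a filesystem whose data lives
 * entirely in the page cache can wire the generic helpers above straight into
 * its file_operations.  "examplefs_file_operations" is a hypothetical name;
 * the callbacks themselves are the real generic implementations.
 */
static const struct file_operations examplefs_file_operations __maybe_unused = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= filemap_splice_read,
	.fsync		= noop_fsync,
};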
4047
4048
static struct folio *do_read_cache_folio(struct address_space *mapping,
4049
pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
4050
{
4051
struct folio *folio;
4052
int err;
4053
4054
if (!filler)
4055
filler = mapping->a_ops->read_folio;
4056
repeat:
4057
folio = filemap_get_folio(mapping, index);
4058
if (IS_ERR(folio)) {
4059
folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL);
4060
if (!folio)
4061
return ERR_PTR(-ENOMEM);
4062
index = mapping_align_index(mapping, index);
4063
err = filemap_add_folio(mapping, folio, index, gfp);
4064
if (unlikely(err)) {
4065
folio_put(folio);
4066
if (err == -EEXIST)
4067
goto repeat;
4068
/* Presumably ENOMEM for xarray node */
4069
return ERR_PTR(err);
4070
}
4071
4072
goto filler;
4073
}
4074
if (folio_test_uptodate(folio))
4075
goto out;
4076
4077
if (!folio_trylock(folio)) {
4078
folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
4079
goto repeat;
4080
}
4081
4082
/* Folio was truncated from mapping */
4083
if (!folio->mapping) {
4084
folio_unlock(folio);
4085
folio_put(folio);
4086
goto repeat;
4087
}
4088
4089
/* Someone else locked and filled the page in a very small window */
4090
if (folio_test_uptodate(folio)) {
4091
folio_unlock(folio);
4092
goto out;
4093
}
4094
4095
filler:
4096
err = filemap_read_folio(file, filler, folio);
4097
if (err) {
4098
folio_put(folio);
4099
if (err == AOP_TRUNCATED_PAGE)
4100
goto repeat;
4101
return ERR_PTR(err);
4102
}
4103
4104
out:
4105
folio_mark_accessed(folio);
4106
return folio;
4107
}
4108
4109
/**
4110
* read_cache_folio - Read into page cache, fill it if needed.
4111
* @mapping: The address_space to read from.
4112
* @index: The index to read.
4113
* @filler: Function to perform the read, or NULL to use aops->read_folio().
4114
* @file: Passed to filler function, may be NULL if not required.
4115
*
4116
* Read one page into the page cache. If it succeeds, the folio returned
4117
* will contain @index, but it may not be the first page of the folio.
4118
*
4119
* If the filler function returns an error, it will be returned to the
4120
* caller.
4121
*
4122
* Context: May sleep. Expects mapping->invalidate_lock to be held.
4123
* Return: An uptodate folio on success, ERR_PTR() on failure.
4124
*/
4125
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
4126
filler_t filler, struct file *file)
4127
{
4128
return do_read_cache_folio(mapping, index, filler, file,
4129
mapping_gfp_mask(mapping));
4130
}
4131
EXPORT_SYMBOL(read_cache_folio);
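
/*
 * Illustrative sketch (not part of this file): reading a single folio of file
 * data through the page cache with the helper above.  Per the kernel-doc, the
 * caller is expected to hold mapping->invalidate_lock; here it is taken
 * shared.  "example_read_index" is a hypothetical helper.
 */
static int __maybe_unused example_read_index(struct address_space *mapping,
		pgoff_t index)
{
	struct folio *folio;

	filemap_invalidate_lock_shared(mapping);
	folio = read_cache_folio(mapping, index, NULL, NULL);
	filemap_invalidate_unlock_shared(mapping);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/* The folio is uptodate and referenced; use it, then drop the ref. */
	folio_put(folio);
	return 0;
}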
4132
4133
/**
4134
* mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
4135
* @mapping: The address_space for the folio.
4136
* @index: The index that the allocated folio will contain.
4137
* @gfp: The page allocator flags to use if allocating.
4138
*
4139
* This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
4140
* any new memory allocations done using the specified allocation flags.
4141
*
4142
* The most likely error from this function is EIO, but ENOMEM is
4143
* possible and so is EINTR. If ->read_folio returns another error,
4144
* that will be returned to the caller.
4145
*
4146
* The function expects mapping->invalidate_lock to be already held.
4147
*
4148
* Return: Uptodate folio on success, ERR_PTR() on failure.
4149
*/
4150
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
4151
pgoff_t index, gfp_t gfp)
4152
{
4153
return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
4154
}
4155
EXPORT_SYMBOL(mapping_read_folio_gfp);
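
/*
 * Illustrative sketch (not part of this file): as above, but for a caller
 * that must not recurse into filesystem reclaim and therefore restricts the
 * allocation context with GFP_NOFS.  "example_read_nofs" is a hypothetical
 * helper.
 */
static struct folio * __maybe_unused example_read_nofs(struct address_space *mapping,
		pgoff_t index)
{
	return mapping_read_folio_gfp(mapping, index, GFP_NOFS);
}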
4156
4157
static struct page *do_read_cache_page(struct address_space *mapping,
4158
pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
4159
{
4160
struct folio *folio;
4161
4162
folio = do_read_cache_folio(mapping, index, filler, file, gfp);
4163
if (IS_ERR(folio))
4164
return &folio->page;
4165
return folio_file_page(folio, index);
4166
}
4167
4168
struct page *read_cache_page(struct address_space *mapping,
4169
pgoff_t index, filler_t *filler, struct file *file)
4170
{
4171
return do_read_cache_page(mapping, index, filler, file,
4172
mapping_gfp_mask(mapping));
4173
}
4174
EXPORT_SYMBOL(read_cache_page);
4175
4176
/**
4177
* read_cache_page_gfp - read into page cache, using specified page allocation flags.
4178
* @mapping: the page's address_space
4179
* @index: the page index
4180
* @gfp: the page allocator flags to use if allocating
4181
*
4182
* This is the same as "read_mapping_page(mapping, index, NULL)", but with
4183
* any new page allocations done using the specified allocation flags.
4184
*
4185
* If the page does not get brought uptodate, return -EIO.
4186
*
4187
* The function expects mapping->invalidate_lock to be already held.
4188
*
4189
* Return: up to date page on success, ERR_PTR() on failure.
4190
*/
4191
struct page *read_cache_page_gfp(struct address_space *mapping,
4192
pgoff_t index,
4193
gfp_t gfp)
4194
{
4195
return do_read_cache_page(mapping, index, NULL, NULL, gfp);
4196
}
4197
EXPORT_SYMBOL(read_cache_page_gfp);
4198
4199
/*
4200
* Warn about a page cache invalidation failure during a direct I/O write.
4201
*/
4202
static void dio_warn_stale_pagecache(struct file *filp)
4203
{
4204
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
4205
char pathname[128];
4206
char *path;
4207
4208
errseq_set(&filp->f_mapping->wb_err, -EIO);
4209
if (__ratelimit(&_rs)) {
4210
path = file_path(filp, pathname, sizeof(pathname));
4211
if (IS_ERR(path))
4212
path = "(unknown)";
4213
pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
4214
pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
4215
current->comm);
4216
}
4217
}
4218
4219
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
4220
{
4221
struct address_space *mapping = iocb->ki_filp->f_mapping;
4222
4223
if (mapping->nrpages &&
4224
invalidate_inode_pages2_range(mapping,
4225
iocb->ki_pos >> PAGE_SHIFT,
4226
(iocb->ki_pos + count - 1) >> PAGE_SHIFT))
4227
dio_warn_stale_pagecache(iocb->ki_filp);
4228
}
4229
4230
ssize_t
4231
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
4232
{
4233
struct address_space *mapping = iocb->ki_filp->f_mapping;
4234
size_t write_len = iov_iter_count(from);
4235
ssize_t written;
4236
4237
/*
4238
* If a page cannot be invalidated, return 0 to fall back
4239
* to buffered write.
4240
*/
4241
written = kiocb_invalidate_pages(iocb, write_len);
4242
if (written) {
4243
if (written == -EBUSY)
4244
return 0;
4245
return written;
4246
}
4247
4248
written = mapping->a_ops->direct_IO(iocb, from);
4249
4250
/*
4251
* Finally, try again to invalidate clean pages which might have been
4252
* cached by non-direct readahead, or faulted in by get_user_pages()
4253
* if the source of the write was an mmap'ed region of the file
4254
* we're writing. Either one is a pretty crazy thing to do,
4255
* so we don't support it 100%. If this invalidation
4256
* fails, tough, the write still worked...
4257
*
4258
* Most of the time we do not need this since dio_complete() will do
4259
* the invalidation for us. However there are some file systems that
4260
* do not end up with dio_complete() being called, so let's not break
4261
* them by removing it completely.
4262
*
4263
* A notable example is blkdev_direct_IO().
4264
*
4265
* Skip invalidation for async writes or if mapping has no pages.
4266
*/
4267
if (written > 0) {
4268
struct inode *inode = mapping->host;
4269
loff_t pos = iocb->ki_pos;
4270
4271
kiocb_invalidate_post_direct_write(iocb, written);
4272
pos += written;
4273
write_len -= written;
4274
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
4275
i_size_write(inode, pos);
4276
mark_inode_dirty(inode);
4277
}
4278
iocb->ki_pos = pos;
4279
}
4280
if (written != -EIOCBQUEUED)
4281
iov_iter_revert(from, write_len - iov_iter_count(from));
4282
return written;
4283
}
4284
EXPORT_SYMBOL(generic_file_direct_write);
4285
4286
ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
4287
{
4288
struct file *file = iocb->ki_filp;
4289
loff_t pos = iocb->ki_pos;
4290
struct address_space *mapping = file->f_mapping;
4291
const struct address_space_operations *a_ops = mapping->a_ops;
4292
size_t chunk = mapping_max_folio_size(mapping);
4293
long status = 0;
4294
ssize_t written = 0;
4295
4296
do {
4297
struct folio *folio;
4298
size_t offset; /* Offset into folio */
4299
size_t bytes; /* Bytes to write to folio */
4300
size_t copied; /* Bytes copied from user */
4301
void *fsdata = NULL;
4302
4303
bytes = iov_iter_count(i);
4304
retry:
4305
offset = pos & (chunk - 1);
4306
bytes = min(chunk - offset, bytes);
4307
balance_dirty_pages_ratelimited(mapping);
4308
4309
if (fatal_signal_pending(current)) {
4310
status = -EINTR;
4311
break;
4312
}
4313
4314
status = a_ops->write_begin(iocb, mapping, pos, bytes,
4315
&folio, &fsdata);
4316
if (unlikely(status < 0))
4317
break;
4318
4319
offset = offset_in_folio(folio, pos);
4320
if (bytes > folio_size(folio) - offset)
4321
bytes = folio_size(folio) - offset;
4322
4323
if (mapping_writably_mapped(mapping))
4324
flush_dcache_folio(folio);
4325
4326
/*
4327
* Faults here on mmap()s can recurse into arbitrary
4328
* filesystem code. Lots of locks are held that can
4329
* deadlock. Use an atomic copy to avoid deadlocking
4330
* in page fault handling.
4331
*/
4332
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
4333
flush_dcache_folio(folio);
4334
4335
status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
4336
folio, fsdata);
4337
if (unlikely(status != copied)) {
4338
iov_iter_revert(i, copied - max(status, 0L));
4339
if (unlikely(status < 0))
4340
break;
4341
}
4342
cond_resched();
4343
4344
if (unlikely(status == 0)) {
4345
/*
4346
* A short copy made ->write_end() reject the
4347
* thing entirely. Might be memory poisoning
4348
* halfway through, might be a race with munmap,
4349
* might be severe memory pressure.
4350
*/
4351
if (chunk > PAGE_SIZE)
4352
chunk /= 2;
4353
if (copied) {
4354
bytes = copied;
4355
goto retry;
4356
}
4357
4358
/*
4359
* 'folio' is now unlocked and faults on it can be
4360
* handled. Ensure forward progress by trying to
4361
* fault it in now.
4362
*/
4363
if (fault_in_iov_iter_readable(i, bytes) == bytes) {
4364
status = -EFAULT;
4365
break;
4366
}
4367
} else {
4368
pos += status;
4369
written += status;
4370
}
4371
} while (iov_iter_count(i));
4372
4373
if (!written)
4374
return status;
4375
iocb->ki_pos += written;
4376
return written;
4377
}
4378
EXPORT_SYMBOL(generic_perform_write);
4379
4380
/**
4381
* __generic_file_write_iter - write data to a file
4382
* @iocb: IO state structure (file, offset, etc.)
4383
* @from: iov_iter with data to write
4384
*
4385
* This function does all the work needed for actually writing data to a
4386
* file. It does all basic checks, removes SUID from the file, updates
4387
* modification times and calls proper subroutines depending on whether we
4388
* do direct IO or a standard buffered write.
4389
*
4390
* It expects i_rwsem to be grabbed unless we work on a block device or similar
4391
* object which does not need locking at all.
4392
*
4393
* This function does *not* take care of syncing data in case of O_SYNC write.
4394
* A caller has to handle it. This is mainly due to the fact that we want to
4395
* avoid syncing under i_rwsem.
4396
*
4397
* Return:
4398
* * number of bytes written, even for truncated writes
4399
* * negative error code if no data has been written at all
4400
*/
4401
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
4402
{
4403
struct file *file = iocb->ki_filp;
4404
struct address_space *mapping = file->f_mapping;
4405
struct inode *inode = mapping->host;
4406
ssize_t ret;
4407
4408
ret = file_remove_privs(file);
4409
if (ret)
4410
return ret;
4411
4412
ret = file_update_time(file);
4413
if (ret)
4414
return ret;
4415
4416
if (iocb->ki_flags & IOCB_DIRECT) {
4417
ret = generic_file_direct_write(iocb, from);
4418
/*
4419
* If the write stopped short of completing, fall back to
4420
* buffered writes. Some filesystems do this for writes to
4421
* holes, for example. For DAX files, a buffered write will
4422
* not succeed (even if it did, DAX does not handle dirty
4423
* page-cache pages correctly).
4424
*/
4425
if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
4426
return ret;
4427
return direct_write_fallback(iocb, from, ret,
4428
generic_perform_write(iocb, from));
4429
}
4430
4431
return generic_perform_write(iocb, from);
4432
}
4433
EXPORT_SYMBOL(__generic_file_write_iter);
4434
4435
/**
4436
* generic_file_write_iter - write data to a file
4437
* @iocb: IO state structure
4438
* @from: iov_iter with data to write
4439
*
4440
* This is a wrapper around __generic_file_write_iter() to be used by most
4441
* filesystems. It takes care of syncing the file in case of O_SYNC file
4442
* and acquires i_rwsem as needed.
4443
* Return:
4444
* * negative error code if no data has been written at all or
4445
* vfs_fsync_range() failed for a synchronous write
4446
* * number of bytes written, even for truncated writes
4447
*/
4448
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
4449
{
4450
struct file *file = iocb->ki_filp;
4451
struct inode *inode = file->f_mapping->host;
4452
ssize_t ret;
4453
4454
inode_lock(inode);
4455
ret = generic_write_checks(iocb, from);
4456
if (ret > 0)
4457
ret = __generic_file_write_iter(iocb, from);
4458
inode_unlock(inode);
4459
4460
if (ret > 0)
4461
ret = generic_write_sync(iocb, ret);
4462
return ret;
4463
}
4464
EXPORT_SYMBOL(generic_file_write_iter);
4465
4466
/**
4467
* filemap_release_folio() - Release fs-specific metadata on a folio.
4468
* @folio: The folio which the kernel is trying to free.
4469
* @gfp: Memory allocation flags (and I/O mode).
4470
*
4471
* The address_space is trying to release any data attached to a folio
4472
* (presumably at folio->private).
4473
*
4474
* This will also be called if the private_2 flag is set on a page,
4475
* indicating that the folio has other metadata associated with it.
4476
*
4477
* The @gfp argument specifies whether I/O may be performed to release
4478
* this page (__GFP_IO), and whether the call may block
4479
* (__GFP_RECLAIM & __GFP_FS).
4480
*
4481
* Return: %true if the release was successful, otherwise %false.
4482
*/
4483
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
4484
{
4485
struct address_space * const mapping = folio->mapping;
4486
4487
BUG_ON(!folio_test_locked(folio));
4488
if (!folio_needs_release(folio))
4489
return true;
4490
if (folio_test_writeback(folio))
4491
return false;
4492
4493
if (mapping && mapping->a_ops->release_folio)
4494
return mapping->a_ops->release_folio(folio, gfp);
4495
return try_to_free_buffers(folio);
4496
}
4497
EXPORT_SYMBOL(filemap_release_folio);
4498
4499
/**
4500
* filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
4501
* @inode: The inode to flush
4502
* @flush: Set to write back rather than simply invalidate.
4503
* @start: First byte in the range.
4504
* @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
4505
* onwards.
4506
*
4507
* Invalidate all the folios on an inode that contribute to the specified
4508
* range, possibly writing them back first. Whilst the operation is
4509
* undertaken, the invalidate lock is held to prevent new folios from being
4510
* installed.
4511
*/
4512
int filemap_invalidate_inode(struct inode *inode, bool flush,
4513
loff_t start, loff_t end)
4514
{
4515
struct address_space *mapping = inode->i_mapping;
4516
pgoff_t first = start >> PAGE_SHIFT;
4517
pgoff_t last = end >> PAGE_SHIFT;
4518
pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;
4519
4520
if (!mapping || !mapping->nrpages || end < start)
4521
goto out;
4522
4523
/* Prevent new folios from being added to the inode. */
4524
filemap_invalidate_lock(mapping);
4525
4526
if (!mapping->nrpages)
4527
goto unlock;
4528
4529
unmap_mapping_pages(mapping, first, nr, false);
4530
4531
/* Write back the data if we're asked to. */
4532
if (flush)
4533
filemap_fdatawrite_range(mapping, start, end);
4534
4535
/* Wait for writeback to complete on all folios and discard. */
4536
invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
4537
4538
unlock:
4539
filemap_invalidate_unlock(mapping);
4540
out:
4541
return filemap_check_errors(mapping);
4542
}
4543
EXPORT_SYMBOL_GPL(filemap_invalidate_inode);
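
/*
 * Illustrative sketch (not part of this file): forcibly writing back and then
 * dropping every cached folio of an inode with the helper above.
 * "example_drop_pagecache" is a hypothetical helper.
 */
static int __maybe_unused example_drop_pagecache(struct inode *inode)
{
	/* flush=true: write dirty folios back before invalidating them. */
	return filemap_invalidate_inode(inode, true, 0, LLONG_MAX);
}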
4544
4545
#ifdef CONFIG_CACHESTAT_SYSCALL
4546
/**
4547
* filemap_cachestat() - compute the page cache statistics of a mapping
4548
* @mapping: The mapping to compute the statistics for.
4549
* @first_index: The starting page cache index.
4550
* @last_index: The final page index (inclusive).
4551
* @cs: the cachestat struct to write the result to.
4552
*
4553
* This will query the page cache statistics of a mapping in the
4554
* page range of [first_index, last_index] (inclusive). The statistics
4555
* queried include: number of dirty pages, number of pages marked for
4556
* writeback, and the number of (recently) evicted pages.
4557
*/
4558
static void filemap_cachestat(struct address_space *mapping,
4559
pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
4560
{
4561
XA_STATE(xas, &mapping->i_pages, first_index);
4562
struct folio *folio;
4563
4564
/* Flush stats (and potentially sleep) outside the RCU read section. */
4565
mem_cgroup_flush_stats_ratelimited(NULL);
4566
4567
rcu_read_lock();
4568
xas_for_each(&xas, folio, last_index) {
4569
int order;
4570
unsigned long nr_pages;
4571
pgoff_t folio_first_index, folio_last_index;
4572
4573
/*
4574
* Don't deref the folio. It is not pinned, and might
4575
* get freed (and reused) underneath us.
4576
*
4577
* We *could* pin it, but that would be expensive for
4578
* what should be a fast and lightweight syscall.
4579
*
4580
* Instead, derive all information of interest from
4581
* the rcu-protected xarray.
4582
*/
4583
4584
if (xas_retry(&xas, folio))
4585
continue;
4586
4587
order = xas_get_order(&xas);
4588
nr_pages = 1 << order;
4589
folio_first_index = round_down(xas.xa_index, 1 << order);
4590
folio_last_index = folio_first_index + nr_pages - 1;
4591
4592
/* Folios might straddle the range boundaries; only count covered pages */
4593
if (folio_first_index < first_index)
4594
nr_pages -= first_index - folio_first_index;
4595
4596
if (folio_last_index > last_index)
4597
nr_pages -= folio_last_index - last_index;
4598
4599
if (xa_is_value(folio)) {
4600
/* page is evicted */
4601
void *shadow = (void *)folio;
4602
bool workingset; /* not used */
4603
4604
cs->nr_evicted += nr_pages;
4605
4606
#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
4607
if (shmem_mapping(mapping)) {
4608
/* shmem file - in swap cache */
4609
swp_entry_t swp = radix_to_swp_entry(folio);
4610
4611
/* swapin error results in poisoned entry */
4612
if (!softleaf_is_swap(swp))
4613
goto resched;
4614
4615
/*
4616
* Getting a swap entry from the shmem
4617
* inode means we beat
4618
* shmem_unuse(). rcu_read_lock()
4619
* ensures swapoff waits for us before
4620
* freeing the swapper space. However,
4621
* we can race with swapping and
4622
* invalidation, so there might not be
4623
* a shadow in the swapcache (yet).
4624
*/
4625
shadow = swap_cache_get_shadow(swp);
4626
if (!shadow)
4627
goto resched;
4628
}
4629
#endif
4630
if (workingset_test_recent(shadow, true, &workingset, false))
4631
cs->nr_recently_evicted += nr_pages;
4632
4633
goto resched;
4634
}
4635
4636
/* page is in cache */
4637
cs->nr_cache += nr_pages;
4638
4639
if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
4640
cs->nr_dirty += nr_pages;
4641
4642
if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
4643
cs->nr_writeback += nr_pages;
4644
4645
resched:
4646
if (need_resched()) {
4647
xas_pause(&xas);
4648
cond_resched_rcu();
4649
}
4650
}
4651
rcu_read_unlock();
4652
}
4653
4654
/*
4655
* See mincore: reveal pagecache information only for files
4656
* that the calling process has write access to, or could (if
4657
* tried) open for writing.
4658
*/
4659
static inline bool can_do_cachestat(struct file *f)
4660
{
4661
if (f->f_mode & FMODE_WRITE)
4662
return true;
4663
if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
4664
return true;
4665
return file_permission(f, MAY_WRITE) == 0;
4666
}
4667
4668
/*
4669
* The cachestat(2) system call.
4670
*
4671
* cachestat() returns the page cache statistics of a file in the
4672
* bytes range specified by `off` and `len`: number of cached pages,
4673
* number of dirty pages, number of pages marked for writeback,
4674
* number of evicted pages, and number of recently evicted pages.
4675
*
4676
* An evicted page is a page that was previously in the page cache
4677
* but has been evicted since. A page is recently evicted if its last
4678
* eviction was recent enough that its reentry to the cache would
4679
* indicate that it is actively being used by the system, and that
4680
* there is memory pressure on the system.
4681
*
4682
* `off` and `len` must be non-negative integers. If `len` > 0,
4683
* the queried range is [`off`, `off` + `len`]. If `len` == 0,
4684
* we will query in the range from `off` to the end of the file.
4685
*
4686
* The `flags` argument is unused for now, but is included for future
4687
* extensibility. Users should pass 0 (i.e. no flags specified).
4688
*
4689
* Currently, hugetlbfs is not supported.
4690
*
4691
* Because the status of a page can change after cachestat() checks it
4692
* but before it returns to the application, the returned values may
4693
* contain stale information.
4694
*
4695
* return values:
4696
* zero - success
4697
* -EFAULT - cstat or cstat_range points to an illegal address
4698
* -EINVAL - invalid flags
4699
* -EBADF - invalid file descriptor
4700
* -EOPNOTSUPP - file descriptor is of a hugetlbfs file
4701
*/
4702
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
4703
struct cachestat_range __user *, cstat_range,
4704
struct cachestat __user *, cstat, unsigned int, flags)
4705
{
4706
CLASS(fd, f)(fd);
4707
struct address_space *mapping;
4708
struct cachestat_range csr;
4709
struct cachestat cs;
4710
pgoff_t first_index, last_index;
4711
4712
if (fd_empty(f))
4713
return -EBADF;
4714
4715
if (copy_from_user(&csr, cstat_range,
4716
sizeof(struct cachestat_range)))
4717
return -EFAULT;
4718
4719
/* hugetlbfs is not supported */
4720
if (is_file_hugepages(fd_file(f)))
4721
return -EOPNOTSUPP;
4722
4723
if (!can_do_cachestat(fd_file(f)))
4724
return -EPERM;
4725
4726
if (flags != 0)
4727
return -EINVAL;
4728
4729
first_index = csr.off >> PAGE_SHIFT;
4730
last_index =
4731
csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
4732
memset(&cs, 0, sizeof(struct cachestat));
4733
mapping = fd_file(f)->f_mapping;
4734
filemap_cachestat(mapping, first_index, last_index, &cs);
4735
4736
if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
4737
return -EFAULT;
4738
4739
return 0;
4740
}
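
/*
 * Illustrative userspace sketch (not part of this file): querying the cache
 * residency of an open file with cachestat(2).  It is shown inside a comment
 * because it is a separate userspace program, not kernel code.  As documented
 * above, len == 0 queries from off to the end of the file and flags must be 0.
 *
 *	#include <linux/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	static int report_cache(int fd)
 *	{
 *		struct cachestat_range range = { .off = 0, .len = 0 };
 *		struct cachestat cs;
 *
 *		if (syscall(__NR_cachestat, fd, &range, &cs, 0))
 *			return -1;
 *		printf("cached %llu dirty %llu writeback %llu evicted %llu\n",
 *		       (unsigned long long)cs.nr_cache,
 *		       (unsigned long long)cs.nr_dirty,
 *		       (unsigned long long)cs.nr_writeback,
 *		       (unsigned long long)cs.nr_evicted);
 *		return 0;
 *	}
 */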
4741
#endif /* CONFIG_CACHESTAT_SYSCALL */
4742
4743