// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <linux/sysctl.h>
#include <linux/pgalloc.h>

#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <[email protected]>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <[email protected]>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem			(truncate_pagecache)
 *    ->private_lock			(__free_pte->block_dirty_folio)
 *      ->swap_lock			(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock			(acquired by fs in truncate path)
 *      ->i_mmap_rwsem			(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock		(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock			(filemap_fault)
 *      ->lock_page			(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem				(generic_perform_write)
 *    ->mmap_lock			(fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock				(fs/fs-writeback.c)
 *    ->i_pages lock			(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock			(vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock			(try_to_unmap_one)
 *    ->private_lock			(try_to_unmap_one)
 *    ->i_pages lock			(try_to_unmap_one)
 *    ->lruvec->lru_lock		(follow_page_mask->mark_page_accessed)
 *    ->lruvec->lru_lock		(check_pte_range->folio_isolate_lru)
 *    ->private_lock			(folio_remove_rmap_pte->set_page_dirty)
 *    ->i_pages lock			(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock			(folio_remove_rmap_pte->set_page_dirty)
 *    ->inode->i_lock			(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock			(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock			(zap_pte_range->set_page_dirty)
 *    ->private_lock			(zap_pte_range->block_dirty_folio)
 */

static void page_cache_delete(struct address_space *mapping,
				   struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);
	long nr = 1;

	mapping_set_update(&xas, mapping);

	xas_set_order(&xas, folio->index, folio_order(folio));
	nr = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	folio->mapping = NULL;
	/* Leave folio->index set: truncation lookup relies upon it */
	mapping->nrpages -= nr;
}

static void filemap_unaccount_folio(struct address_space *mapping,
		struct folio *folio)
{
	long nr;

	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, folio_pfn(folio));
		dump_page(&folio->page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
			int mapcount = folio_mapcount(folio);

			if (folio_ref_count(folio) >= mapcount + 2) {
				/*
				 * All vmas have already been torn down, so it's
				 * a good bet that actually the page is unmapped
				 * and we'd rather not leak it: if we're wrong,
				 * another bad page check should catch it later.
				 */
				atomic_set(&folio->_mapcount, -1);
				folio_ref_sub(folio, mapcount);
			}
		}
	}

	/* hugetlb folios do not participate in page cache accounting. */
	if (folio_test_hugetlb(folio))
		return;

	nr = folio_nr_pages(folio);

	lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	if (folio_test_swapbacked(folio)) {
		lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
		if (folio_test_pmd_mappable(folio))
			lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
	} else if (folio_test_pmd_mappable(folio)) {
		lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
		filemap_nr_thps_dec(mapping);
	}
	if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
		mod_node_page_state(folio_pgdat(folio),
				    NR_KERNEL_FILE_PAGES, -nr);

	/*
	 * At this point folio must be either written or cleaned by
	 * truncate.  Dirty folio here signals a bug and loss of
	 * unwritten data - on ordinary filesystems.
	 *
	 * But it's harmless on in-memory filesystems like tmpfs; and can
	 * occur when a driver which did get_user_pages() sets page dirty
	 * before putting it, while the inode is being finally evicted.
	 *
	 * Below fixes dirty accounting after removing the folio entirely
	 * but leaves the dirty flag set: it has no effect for truncated
	 * folio and anyway will be cleared before returning folio to
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
			 mapping_can_writeback(mapping)))
		folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
	struct address_space *mapping = folio->mapping;

	trace_mm_filemap_delete_from_page_cache(folio);
	filemap_unaccount_folio(mapping, folio);
	page_cache_delete(mapping, folio, shadow);
}

void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
	void (*free_folio)(struct folio *);

	free_folio = mapping->a_ops->free_folio;
	if (free_folio)
		free_folio(folio);

	folio_put_refs(folio, folio_nr_pages(folio));
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_lru_list_add(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	filemap_free_folio(mapping, folio);
}

/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping.
 * The function expects @fbatch to be sorted by page index and is
 * optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
			     struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
	long total_pages = 0;
	int i = 0;
	struct folio *folio;

	mapping_set_update(&xas, mapping);
	xas_for_each(&xas, folio, ULONG_MAX) {
		if (i >= folio_batch_count(fbatch))
			break;

		/* A swap/dax/shadow entry got inserted? Skip it. */
		if (xa_is_value(folio))
			continue;
		/*
		 * A page got inserted in our range? Skip it. We have our
		 * pages locked so they are protected from being removed.
		 * If we see a page whose index is higher than ours, it
		 * means our page has been removed, which shouldn't be
		 * possible because we're holding the PageLock.
		 */
		if (folio != fbatch->folios[i]) {
			VM_BUG_ON_FOLIO(folio->index >
					fbatch->folios[i]->index, folio);
			continue;
		}

		WARN_ON_ONCE(!folio_test_locked(folio));

		folio->mapping = NULL;
		/* Leave folio->index set: truncation lookup relies on it */

		i++;
		xas_store(&xas, NULL);
		total_pages += folio_nr_pages(folio);
	}
	mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
				  struct folio_batch *fbatch)
{
	int i;

	if (!folio_batch_count(fbatch))
		return;

	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		trace_mm_filemap_delete_from_page_cache(folio);
		filemap_unaccount_folio(mapping, folio);
	}
	page_cache_delete_batch(mapping, fbatch);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_lru_list_add(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	for (i = 0; i < folio_batch_count(fbatch); i++)
		filemap_free_folio(mapping, fbatch->folios[i]);
}

int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}
EXPORT_SYMBOL(filemap_check_errors);

static int filemap_check_and_keep_errors(struct address_space *mapping)
{
	/* Check for outstanding write errors */
	if (test_bit(AS_EIO, &mapping->flags))
		return -EIO;
	if (test_bit(AS_ENOSPC, &mapping->flags))
		return -ENOSPC;
	return 0;
}

static int filemap_writeback(struct address_space *mapping, loff_t start,
		loff_t end, enum writeback_sync_modes sync_mode,
		long *nr_to_write)
{
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX,
		.range_start = start,
		.range_end = end,
	};
	int ret;

	if (!mapping_can_writeback(mapping) ||
	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
	ret = do_writepages(mapping, &wbc);
	wbc_detach_inode(&wbc);

	if (!ret && nr_to_write)
		*nr_to_write = wbc.nr_to_write;
	return ret;
}

/**
 * filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends (inclusive)
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * This is a data integrity operation that waits upon dirty or in writeback
 * pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
		loff_t end)
{
	return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

int filemap_fdatawrite(struct address_space *mapping)
{
	return filemap_fdatawrite_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL(filemap_fdatawrite);

/**
 * filemap_flush_range - start writeback on a range
 * @mapping:	target address_space
 * @start:	index to start writeback on
 * @end:	last (inclusive) index for writeback
 *
 * This is a non-integrity writeback helper, to start writing back folios
 * for the indicated range.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush_range(struct address_space *mapping, loff_t start,
		loff_t end)
{
	return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL);
}
EXPORT_SYMBOL_GPL(filemap_flush_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
	return filemap_flush_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL(filemap_flush);
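/*
 * Illustrative sketch (editor's addition, not part of this file's build):
 * how a caller might combine the helpers above to push a byte range to
 * storage and then wait for it.  The name myfs_flush_range() is
 * hypothetical; most callers would instead use
 * filemap_write_and_wait_range() below, which also folds in error checking.
 *
 *	static int myfs_flush_range(struct address_space *mapping,
 *				    loff_t start, loff_t end)
 *	{
 *		int err;
 *
 *		// Queue WB_SYNC_ALL writeback for the dirty pages in range.
 *		err = filemap_fdatawrite_range(mapping, start, end);
 *		if (err)
 *			return err;
 *		// Wait for writeback to finish and report AS_EIO/AS_ENOSPC.
 *		return filemap_fdatawait_range(mapping, start, end);
 *	}
 */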
/*
 * Start writeback on @nr_to_write pages from @mapping. No one but the existing
 * btrfs caller should be using this.  Talk to linux-mm if you think adding a
 * new caller is a good idea.
 */
int filemap_flush_nr(struct address_space *mapping, long *nr_to_write)
{
	return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE,
				 nr_to_write);
}
EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs");

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	struct folio *folio;
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	for (;;) {
		folio = xas_find(&xas, max);
		if (xas_retry(&xas, folio))
			continue;
		/* Shadow entries don't count */
		if (xa_is_value(folio))
			continue;
		/*
		 * We don't need to try to pin this page; we're about to
		 * release the RCU lock anyway.  It is enough to know that
		 * there was a page here recently.
		 */
		break;
	}
	rcu_read_unlock();

	return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct folio_batch fbatch;
	unsigned nr_folios;

	folio_batch_init(&fbatch);

	while (index <= end) {
		unsigned i;

		nr_folios = filemap_get_folios_tag(mapping, &index, end,
				PAGECACHE_TAG_WRITEBACK, &fbatch);

		if (!nr_folios)
			break;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			folio_wait_writeback(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.
 * Unlike filemap_fdatawait_range(), this function does not clear error
 * status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages;
}

bool filemap_range_has_writeback(struct address_space *mapping,
				 loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, folio, max) {
		if (xas_retry(&xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
				folio_test_writeback(folio))
			break;
	}
	rcu_read_unlock();
	return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0, err2;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = filemap_fdatawrite_range(mapping, lstart, lend);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = filemap_check_errors(mapping);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 *				   reported, and advance wb_err to the current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
The error at this point must be reported via proper channels724* (a'la fsync, or NFS COMMIT operation, etc.).725*726* While we handle mapping->wb_err with atomic operations, the f_wb_err727* value is protected by the f_lock since we must ensure that it reflects728* the latest value swapped in for this file descriptor.729*730* Return: %0 on success, negative error code otherwise.731*/732int file_check_and_advance_wb_err(struct file *file)733{734int err = 0;735errseq_t old = READ_ONCE(file->f_wb_err);736struct address_space *mapping = file->f_mapping;737738/* Locklessly handle the common case where nothing has changed */739if (errseq_check(&mapping->wb_err, old)) {740/* Something changed, must use slow path */741spin_lock(&file->f_lock);742old = file->f_wb_err;743err = errseq_check_and_advance(&mapping->wb_err,744&file->f_wb_err);745trace_file_check_and_advance_wb_err(file, old);746spin_unlock(&file->f_lock);747}748749/*750* We're mostly using this function as a drop in replacement for751* filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect752* that the legacy code would have had on these flags.753*/754clear_bit(AS_EIO, &mapping->flags);755clear_bit(AS_ENOSPC, &mapping->flags);756return err;757}758EXPORT_SYMBOL(file_check_and_advance_wb_err);759760/**761* file_write_and_wait_range - write out & wait on a file range762* @file: file pointing to address_space with pages763* @lstart: offset in bytes where the range starts764* @lend: offset in bytes where the range ends (inclusive)765*766* Write out and wait upon file offsets lstart->lend, inclusive.767*768* Note that @lend is inclusive (describes the last byte to be written) so769* that this function can be used to write to the very end-of-file (end = -1).770*771* After writing out and waiting on the data, we check and advance the772* f_wb_err cursor to the latest value, and return any errors detected there.773*774* Return: %0 on success, negative error code otherwise.775*/776int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)777{778int err = 0, err2;779struct address_space *mapping = file->f_mapping;780781if (lend < lstart)782return 0;783784if (mapping_needs_writeback(mapping)) {785err = filemap_fdatawrite_range(mapping, lstart, lend);786/* See comment of filemap_write_and_wait() */787if (err != -EIO)788__filemap_fdatawait_range(mapping, lstart, lend);789}790err2 = file_check_and_advance_wb_err(file);791if (!err)792err = err2;793return err;794}795EXPORT_SYMBOL(file_write_and_wait_range);796797/**798* replace_page_cache_folio - replace a pagecache folio with a new one799* @old: folio to be replaced800* @new: folio to replace with801*802* This function replaces a folio in the pagecache with a new one. On803* success it acquires the pagecache reference for the new folio and804* drops it for the old folio. Both the old and new folios must be805* locked. This function does not add the new folio to the LRU, the806* caller must do that.807*808* The remove + add is atomic. 
/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:	folio to be replaced
 * @new:	folio to replace with
 *
 * This function replaces a folio in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new folio and
 * drops it for the old folio.  Both the old and new folios must be
 * locked.  This function does not add the new folio to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
	struct address_space *mapping = old->mapping;
	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(new->mapping, new);

	folio_get(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_replace_folio(old, new);

	xas_lock_irq(&xas);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!folio_test_hugetlb(old))
		lruvec_stat_sub_folio(old, NR_FILE_PAGES);
	if (!folio_test_hugetlb(new))
		lruvec_stat_add_folio(new, NR_FILE_PAGES);
	if (folio_test_swapbacked(old))
		lruvec_stat_sub_folio(old, NR_SHMEM);
	if (folio_test_swapbacked(new))
		lruvec_stat_add_folio(new, NR_SHMEM);
	xas_unlock_irq(&xas);
	if (free_folio)
		free_folio(old);
	folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

noinline int __filemap_add_folio(struct address_space *mapping,
		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	bool huge;
	long nr;
	unsigned int forder = folio_order(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
	VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
			folio);
	mapping_set_update(&xas, mapping);

	VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
	huge = folio_test_hugetlb(folio);
	nr = folio_nr_pages(folio);

	gfp &= GFP_RECLAIM_MASK;
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = xas.xa_index;

	for (;;) {
		int order = -1;
		void *entry, *old = NULL;

		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
			/*
			 * If a larger entry exists,
			 * it will be the first and only entry iterated.
			 */
			if (order == -1)
				order = xas_get_order(&xas);
		}

		if (old) {
			if (order > 0 && order > forder) {
				unsigned int split_order = max(forder,
						xas_try_split_min_order(order));

				/* How to handle large swap entries? */
				BUG_ON(shmem_mapping(mapping));

				while (order > forder) {
					xas_set_order(&xas, index, split_order);
					xas_try_split(&xas, old, order);
					if (xas_error(&xas))
						goto unlock;
					order = split_order;
					split_order =
						max(xas_try_split_min_order(
								split_order),
						    forder);
				}
				xas_reset(&xas);
			}
			if (shadowp)
				*shadowp = old;
		}

		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;

		mapping->nrpages += nr;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge) {
			lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
			if (folio_test_pmd_mappable(folio))
				lruvec_stat_mod_folio(folio,
						NR_FILE_THPS, nr);
		}

unlock:
		xas_unlock_irq(&xas);

		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		goto error;

	trace_mm_filemap_add_to_page_cache(folio);
	return 0;
error:
	folio->mapping = NULL;
	/* Leave folio->index set: truncation relies upon it */
	folio_put_refs(folio, nr);
	return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

int filemap_add_folio(struct address_space *mapping, struct folio *folio,
				pgoff_t index, gfp_t gfp)
{
	void *shadow = NULL;
	int ret;
	struct mem_cgroup *tmp;
	bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);

	if (kernel_file)
		tmp = set_active_memcg(root_mem_cgroup);
	ret = mem_cgroup_charge(folio, NULL, gfp);
	if (kernel_file)
		set_active_memcg(tmp);
	if (ret)
		return ret;

	__folio_set_locked(folio);
	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
	if (unlikely(ret)) {
		mem_cgroup_uncharge(folio);
		__folio_clear_locked(folio);
	} else {
		/*
		 * The folio might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed folio.
		 * The exception is folios getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(folio_test_active(folio));
		if (!(gfp & __GFP_WRITE) && shadow)
			workingset_refault(folio, shadow);
		folio_add_lru(folio);
		if (kernel_file)
			mod_node_page_state(folio_pgdat(folio),
					    NR_KERNEL_FILE_PAGES,
					    folio_nr_pages(folio));
	}
	return ret;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);

#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
		struct mempolicy *policy)
{
	int n;
	struct folio *folio;

	if (policy)
		return folio_alloc_mpol_noprof(gfp, order, policy,
				NO_INTERLEAVE_INDEX, numa_node_id());

	if (cpuset_do_page_mem_spread()) {
		unsigned int cpuset_mems_cookie;
		do {
			cpuset_mems_cookie = read_mems_allowed_begin();
			n = cpuset_mem_spread_node();
			folio = __folio_alloc_node_noprof(gfp, order, n);
		} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));

		return folio;
	}
	return folio_alloc_noprof(gfp, order);
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif
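/*
 * Illustrative sketch (editor's addition): the usual pattern for populating
 * the page cache by hand, as a read or readahead-like path might do.  It
 * assumes the three-argument filemap_alloc_folio() of this tree (the final
 * argument being a mempolicy, NULL here); myfs_read_folio_locked() is a
 * hypothetical I/O routine.
 *
 *	static struct folio *myfs_populate(struct address_space *mapping,
 *					   pgoff_t index, gfp_t gfp)
 *	{
 *		struct folio *folio;
 *		int err;
 *
 *		folio = filemap_alloc_folio(gfp, 0, NULL);
 *		if (!folio)
 *			return ERR_PTR(-ENOMEM);
 *
 *		// Charges the memcg, locks the folio and inserts it, or
 *		// fails with -EEXIST if someone else got there first.
 *		err = filemap_add_folio(mapping, folio, index, gfp);
 *		if (err) {
 *			folio_put(folio);
 *			return ERR_PTR(err);
 *		}
 *		err = myfs_read_folio_locked(folio);
 *		if (err) {
 *			folio_unlock(folio);
 *			folio_put(folio);
 *			return ERR_PTR(err);
 *		}
 *		return folio;	// returned locked, with a reference
 *	}
 */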
/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2)
{
	if (mapping1 > mapping2)
		swap(mapping1, mapping2);
	if (mapping1)
		down_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		down_write_nested(&mapping2->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2)
{
	if (mapping1)
		up_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}

/* How many times do we accept lock stealing from under a waiter? */
static int sysctl_page_lock_unfairness = 5;
static const struct ctl_table filemap_sysctl_table[] = {
	{
		.procname	= "page_lock_unfairness",
		.data		= &sysctl_page_lock_unfairness,
		.maxlen		= sizeof(sysctl_page_lock_unfairness),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	}
};

void __init pagecache_init(void)
{
	int i;

	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(&folio_wait_table[i]);

	page_writeback_init();
	register_sysctl_init("vm", filemap_sysctl_table);
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *	Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *	This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior.
 *	If the lock cannot be taken, we stop walking the wait queue without
 *	waking the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->folio->flags.f))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->folio->flags.f))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in folio_wait_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	struct wait_page_key key;
	unsigned long flags;

	key.folio = folio;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key(q, TASK_NORMAL, &key);

	/*
	 * It's possible to miss clearing waiters here, when we woke our page
	 * waiters, but the hashed waitqueue has waiters for other pages on it.
	 * That's okay, it's a rare case.
	 * The next waker will clear it.
	 *
	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
	 * other), the flag may be cleared in the course of freeing the page;
	 * but that is not required for correctness.
	 */
	if (!waitqueue_active(q) || !key.page_match)
		folio_clear_waiters(folio);

	spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __folio_lock() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * folio_wait_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like folio_put_wait_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the folio flag, and mark us done
 * if successful.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &folio->flags.f))
			return false;
	} else if (test_bit(bit_nr, &folio->flags.f))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
		int state, enum behavior behavior)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;

	if (bit_nr == PG_locked &&
	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the folio_set_waiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the folio.
	 */
	if (behavior == DROP)
		folio_put(folio);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the folio waiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;
	wait_queue_head_t *q;
	struct folio *folio = softleaf_to_folio(entry);

	q = folio_waitqueue(folio);
	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = PG_locked;
	wait->flags = 0;

	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * If a migration entry exists for the page the migration path must hold
	 * a valid reference to the page, and it must take the ptl to remove the
	 * migration entry.
	 * So the page is valid until the ptl is dropped.
	 */
	spin_unlock(ptl);

	for (;;) {
		unsigned int flags;

		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
				break;

			io_schedule();
			continue;
		}
		break;
	}

	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
#endif

void folio_wait_bit(struct folio *folio, int bit_nr)
{
	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
	return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Unlocks the folio and wakes up any thread sleeping on the page lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
	/* Bit 7 allows x86 to check the byte's sign bit */
	BUILD_BUG_ON(PG_waiters != 7);
	BUILD_BUG_ON(PG_locked > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);
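/*
 * Illustrative sketch (editor's addition): the lock-and-revalidate pattern
 * that callers of the folio lock must follow.  Because a folio can be
 * truncated between the lookup and folio_lock(), the mapping has to be
 * rechecked once the lock is held (__filemap_get_folio_mpol() below does
 * the same with FGP_LOCK).  myfs_find_lock() is a hypothetical wrapper.
 *
 *	static struct folio *myfs_find_lock(struct address_space *mapping,
 *					    pgoff_t index)
 *	{
 *		struct folio *folio;
 *
 *	repeat:
 *		folio = filemap_get_folio(mapping, index);
 *		if (IS_ERR(folio))
 *			return folio;
 *		folio_lock(folio);
 *		if (unlikely(folio->mapping != mapping)) {
 *			// Truncated while we slept on the lock: retry.
 *			folio_unlock(folio);
 *			folio_put(folio);
 *			goto repeat;
 *		}
 *		return folio;
 *	}
 */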
/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * When all reads against a folio have completed, filesystems should
 * call this function to let the pagecache know that no more reads
 * are outstanding.  This will unlock the folio and wake up any thread
 * sleeping on the lock.  The folio will also be marked uptodate if all
 * reads succeeded.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
	unsigned long mask = 1 << PG_locked;

	/* Must be in bottom byte for x86 to work */
	BUILD_BUG_ON(PG_uptodate > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);

	if (likely(success))
		mask |= 1 << PG_uptodate;
	if (folio_xor_flags_has_waiters(folio, mask))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 */
void folio_end_private_2(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
	folio_wake_bit(folio, PG_private_2);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 to be cleared on a folio.
 */
void folio_wait_private_2(struct folio *folio)
{
	while (folio_test_private_2(folio))
		folio_wait_bit(folio, PG_private_2);
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
 * received by the calling task.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int folio_wait_private_2_killable(struct folio *folio)
{
	int ret = 0;

	while (folio_test_private_2(folio)) {
		ret = folio_wait_bit_killable(folio, PG_private_2);
		if (ret < 0)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

static void filemap_end_dropbehind(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return;
	if (!folio_test_clear_dropbehind(folio))
		return;
	if (mapping)
		folio_unmap_invalidate(mapping, folio, 0);
}

/*
 * If folio was marked as dropbehind, then pages should be dropped when writeback
 * completes. Do that now. If we fail, it's likely because of a big folio -
 * just reset dropbehind for that case and later completions should invalidate.
 */
void folio_end_dropbehind(struct folio *folio)
{
	if (!folio_test_dropbehind(folio))
		return;

	/*
	 * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
	 * but can happen if normal writeback just happens to find dirty folios
	 * that were created as part of uncached writeback, and that writeback
	 * would otherwise not need non-IRQ handling.
	 * Just skip the invalidation in that case.
	 */
	if (in_task() && folio_trylock(folio)) {
		filemap_end_dropbehind(folio);
		folio_unlock(folio);
	}
}
EXPORT_SYMBOL_GPL(folio_end_dropbehind);

/**
 * folio_end_writeback_no_dropbehind - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 * This call is intended for filesystems that need to defer dropbehind.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback_no_dropbehind(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

	/*
	 * folio_test_clear_reclaim() could be used here but it is an
	 * atomic operation and overkill in this particular case. Failing
	 * to shuffle a folio marked for immediate reclaim is too mild
	 * a gain to justify taking an atomic operation penalty at the
	 * end of every folio writeback.
	 */
	if (folio_test_reclaim(folio)) {
		folio_clear_reclaim(folio);
		folio_rotate_reclaimable(folio);
	}

	if (__folio_end_writeback(folio))
		folio_wake_bit(folio, PG_writeback);

	acct_reclaim_writeback(folio);
}
EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind);

/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

	/*
	 * Writeback does not hold a folio reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the folio is not freed and
	 * reused before the folio_wake_bit().
	 */
	folio_get(folio);
	folio_end_writeback_no_dropbehind(folio);
	folio_end_dropbehind(folio);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
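/*
 * Illustrative sketch (editor's addition): how a filesystem's writeback
 * completion (e.g. a bio end_io handler) typically ends writeback on each
 * folio it covered.  myfs_end_writeback_bio() is a hypothetical name and
 * the error handling is simplified.
 *
 *	static void myfs_end_writeback_bio(struct bio *bio)
 *	{
 *		struct folio_iter fi;
 *
 *		bio_for_each_folio_all(fi, bio) {
 *			if (bio->bi_status)
 *				mapping_set_error(fi.folio->mapping, -EIO);
 *			folio_end_writeback(fi.folio);
 *		}
 *		bio_put(bio);
 *	}
 */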
/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 */
void __folio_lock(struct folio *folio)
{
	folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

int __folio_lock_killable(struct folio *folio)
{
	return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
	struct wait_queue_head *q = folio_waitqueue(folio);
	int ret;

	wait->folio = folio;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	folio_set_waiters(folio);
	ret = !folio_trylock(folio);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}

/*
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
	unsigned int flags = vmf->flags;

	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock/per-VMA lock is not
		 * released even though returning VM_FAULT_RETRY.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return VM_FAULT_RETRY;

		release_fault_lock(vmf);
		if (flags & FAULT_FLAG_KILLABLE)
			folio_wait_locked_killable(folio);
		else
			folio_wait_locked(folio);
		return VM_FAULT_RETRY;
	}
	if (flags & FAULT_FLAG_KILLABLE) {
		bool ret;

		ret = __folio_lock_killable(folio);
		if (ret) {
			release_fault_lock(vmf);
			return VM_FAULT_RETRY;
		}
	} else {
		__folio_lock(folio);
	}

	return 0;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);
	unsigned long nr = max_scan;

	while (nr--) {
		void *entry = xas_next(&xas);
		if (!entry || xa_is_value(entry))
			return xas.xa_index;
		if (xas.xa_index == 0)
			return 0;
	}

	return index + max_scan;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.
 * However, this will not atomically search a snapshot of the cache at a
 * single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_prev(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == ULONG_MAX)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/*
 * Lockless page cache protocol:
 * On the lookup side:
 * 1. Load the folio from i_pages
 * 2. Increment the refcount if it's not zero
 * 3. If the folio is not found by xas_reload(), put the refcount and retry
 *
 * On the removal side:
 * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
 * B. Remove the page from i_pages
 * C. Return the page to the page allocator
 *
 * This means that any page may have its reference count temporarily
 * increased by a speculative page cache (or GUP-fast) lookup as it can
 * be allocated by another user before the RCU grace period expires.
 * Because the refcount temporarily acquired here may end up being the
 * last refcount on the page, any page allocation must be freeable by
 * folio_put().
 */

/*
 * filemap_get_entry - Get a page cache entry.
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache entry at @mapping & @index.  If it is a folio,
 * it is returned with an increased refcount.  If it is a shadow entry
 * of a previously evicted folio, or a swap entry from shmem/tmpfs,
 * it is returned without further action.
 *
 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	folio = xas_load(&xas);
	if (xas_retry(&xas, folio))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.
Return it without attempting to raise page count.1900*/1901if (!folio || xa_is_value(folio))1902goto out;19031904if (!folio_try_get(folio))1905goto repeat;19061907if (unlikely(folio != xas_reload(&xas))) {1908folio_put(folio);1909goto repeat;1910}1911out:1912rcu_read_unlock();19131914return folio;1915}19161917/**1918* __filemap_get_folio_mpol - Find and get a reference to a folio.1919* @mapping: The address_space to search.1920* @index: The page index.1921* @fgp_flags: %FGP flags modify how the folio is returned.1922* @gfp: Memory allocation flags to use if %FGP_CREAT is specified.1923* @policy: NUMA memory allocation policy to follow.1924*1925* Looks up the page cache entry at @mapping & @index.1926*1927* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even1928* if the %GFP flags specified for %FGP_CREAT are atomic.1929*1930* If this function returns a folio, it is returned with an increased refcount.1931*1932* Return: The found folio or an ERR_PTR() otherwise.1933*/1934struct folio *__filemap_get_folio_mpol(struct address_space *mapping,1935pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy)1936{1937struct folio *folio;19381939repeat:1940folio = filemap_get_entry(mapping, index);1941if (xa_is_value(folio))1942folio = NULL;1943if (!folio)1944goto no_page;19451946if (fgp_flags & FGP_LOCK) {1947if (fgp_flags & FGP_NOWAIT) {1948if (!folio_trylock(folio)) {1949folio_put(folio);1950return ERR_PTR(-EAGAIN);1951}1952} else {1953folio_lock(folio);1954}19551956/* Has the page been truncated? */1957if (unlikely(folio->mapping != mapping)) {1958folio_unlock(folio);1959folio_put(folio);1960goto repeat;1961}1962VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);1963}19641965if (fgp_flags & FGP_ACCESSED)1966folio_mark_accessed(folio);1967else if (fgp_flags & FGP_WRITE) {1968/* Clear idle flag for buffer write */1969if (folio_test_idle(folio))1970folio_clear_idle(folio);1971}19721973if (fgp_flags & FGP_STABLE)1974folio_wait_stable(folio);1975no_page:1976if (!folio && (fgp_flags & FGP_CREAT)) {1977unsigned int min_order = mapping_min_folio_order(mapping);1978unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));1979int err;1980index = mapping_align_index(mapping, index);19811982if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))1983gfp |= __GFP_WRITE;1984if (fgp_flags & FGP_NOFS)1985gfp &= ~__GFP_FS;1986if (fgp_flags & FGP_NOWAIT) {1987gfp &= ~GFP_KERNEL;1988gfp |= GFP_NOWAIT;1989}1990if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))1991fgp_flags |= FGP_LOCK;19921993if (order > mapping_max_folio_order(mapping))1994order = mapping_max_folio_order(mapping);1995/* If we're not aligned, allocate a smaller folio */1996if (index & ((1UL << order) - 1))1997order = __ffs(index);19981999do {2000gfp_t alloc_gfp = gfp;20012002err = -ENOMEM;2003if (order > min_order)2004alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;2005folio = filemap_alloc_folio(alloc_gfp, order, policy);2006if (!folio)2007continue;20082009/* Init accessed so avoid atomic mark_page_accessed later */2010if (fgp_flags & FGP_ACCESSED)2011__folio_set_referenced(folio);2012if (fgp_flags & FGP_DONTCACHE)2013__folio_set_dropbehind(folio);20142015err = filemap_add_folio(mapping, folio, index, gfp);2016if (!err)2017break;2018folio_put(folio);2019folio = NULL;2020} while (order-- > min_order);20212022if (err == -EEXIST)2023goto repeat;2024if (err) {2025/*2026* When NOWAIT I/O fails to allocate folios this could2027* be due to a nonblocking memory allocation and not2028* because the system 
actually is out of memory.
			 * Return -EAGAIN so that the caller retries in a
			 * blocking fashion instead of propagating -ENOMEM
			 * to the application.
			 */
			if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
				err = -EAGAIN;
			return ERR_PTR(err);
		}
		/*
		 * filemap_add_folio locks the page, and for mmap
		 * we expect an unlocked page.
		 */
		if (folio && (fgp_flags & FGP_FOR_MMAP))
			folio_unlock(folio);
	}

	if (!folio)
		return ERR_PTR(-ENOENT);
	/* not an uncached lookup, clear uncached if set */
	if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
		folio_clear_dropbehind(folio);
	return folio;
}
EXPORT_SYMBOL(__filemap_get_folio_mpol);

static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
		xa_mark_t mark)
{
	struct folio *folio;

retry:
	if (mark == XA_PRESENT)
		folio = xas_find(xas, max);
	else
		folio = xas_find_marked(xas, max, mark);

	if (xas_retry(xas, folio))
		goto retry;
	/*
	 * A shadow entry of a recently evicted page, a swap
	 * entry from shmem/tmpfs or a DAX entry.  Return it
	 * without attempting to raise page count.
	 */
	if (!folio || xa_is_value(folio))
		return folio;

	if (!folio_try_get(folio))
		goto reset;

	if (unlikely(folio != xas_reload(xas))) {
		folio_put(folio);
		goto reset;
	}

	return folio;
reset:
	xas_reset(xas);
	goto retry;
}

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @end:	The final page index (inclusive).
 * @fbatch:	Where the resulting entries are placed.
 * @indices:	The cache indices corresponding to the entries in @fbatch
 *
 * find_get_entries() will search for and return a batch of entries in
 * the mapping.  The entries are placed in @fbatch.  find_get_entries()
 * takes a reference on any actual folios it returns.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries or large folios.
 *
 * Any shadow entries of evicted folios, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * Return: The number of entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct folio *folio;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;
	}

	if (folio_batch_count(fbatch)) {
		unsigned long nr;
		int idx = folio_batch_count(fbatch) - 1;

		folio = fbatch->folios[idx];
		if (!xa_is_value(folio))
			nr = folio_nr_pages(folio);
		else
			nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);
		*start = round_down(indices[idx] + nr, nr);
	}
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}

/**
 * find_lock_entries - Find a batch of pagecache entries.
 * @mapping:	The address_space to search.
 * @start:	The starting page cache index.
 * @end:	The final page index (inclusive).
 * @fbatch:	Where the resulting entries are placed.
 * @indices:	The cache indices of the entries in @fbatch.
 *
 * find_lock_entries() will return a batch of entries from @mapping.
 * Swap, shadow and DAX entries are included.
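 *
 * Purely as an illustration (a sketch of a caller, not part of this file,
 * with 'first' and 'last' standing in for the range bounds): the usual
 * consumption pattern, as in the truncate and invalidate paths, looks
 * roughly like this.  Note that value entries carry no reference, so they
 * must not be released like folios:
 *
 *	struct folio_batch fbatch;
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	pgoff_t index = first;
 *	unsigned int i;
 *
 *	folio_batch_init(&fbatch);
 *	while (find_lock_entries(mapping, &index, last, &fbatch, indices)) {
 *		for (i = 0; i < folio_batch_count(&fbatch); i++) {
 *			struct folio *folio = fbatch.folios[i];
 *
 *			if (xa_is_value(folio))
 *				continue;	/* shadow/swap/DAX entry */
 *			/* ... folio is locked here ... */
 *			folio_unlock(folio);
 *			folio_put(folio);
 *		}
 *		folio_batch_reinit(&fbatch);
 *		cond_resched();
 *	}
 *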
Folios are returned2148* locked and with an incremented refcount. Folios which are locked2149* by somebody else or under writeback are skipped. Folios which are2150* partially outside the range are not returned.2151*2152* The entries have ascending indexes. The indices may not be consecutive2153* due to not-present entries, large folios, folios which could not be2154* locked or folios under writeback.2155*2156* Return: The number of entries which were found.2157*/2158unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,2159pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)2160{2161XA_STATE(xas, &mapping->i_pages, *start);2162struct folio *folio;21632164rcu_read_lock();2165while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {2166unsigned long base;2167unsigned long nr;21682169if (!xa_is_value(folio)) {2170nr = folio_nr_pages(folio);2171base = folio->index;2172/* Omit large folio which begins before the start */2173if (base < *start)2174goto put;2175/* Omit large folio which extends beyond the end */2176if (base + nr - 1 > end)2177goto put;2178if (!folio_trylock(folio))2179goto put;2180if (folio->mapping != mapping ||2181folio_test_writeback(folio))2182goto unlock;2183VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),2184folio);2185} else {2186nr = 1 << xas_get_order(&xas);2187base = xas.xa_index & ~(nr - 1);2188/* Omit order>0 value which begins before the start */2189if (base < *start)2190continue;2191/* Omit order>0 value which extends beyond the end */2192if (base + nr - 1 > end)2193break;2194}21952196/* Update start now so that last update is correct on return */2197*start = base + nr;2198indices[fbatch->nr] = xas.xa_index;2199if (!folio_batch_add(fbatch, folio))2200break;2201continue;2202unlock:2203folio_unlock(folio);2204put:2205folio_put(folio);2206}2207rcu_read_unlock();22082209return folio_batch_count(fbatch);2210}22112212/**2213* filemap_get_folios - Get a batch of folios2214* @mapping: The address_space to search2215* @start: The starting page index2216* @end: The final page index (inclusive)2217* @fbatch: The batch to fill.2218*2219* Search for and return a batch of folios in the mapping starting at2220* index @start and up to index @end (inclusive). The folios are returned2221* in @fbatch with an elevated reference count.2222*2223* Return: The number of folios which were found.2224* We also update @start to index the next folio for the traversal.2225*/2226unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,2227pgoff_t end, struct folio_batch *fbatch)2228{2229return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);2230}2231EXPORT_SYMBOL(filemap_get_folios);22322233/**2234* filemap_get_folios_contig - Get a batch of contiguous folios2235* @mapping: The address_space to search2236* @start: The starting page index2237* @end: The final page index (inclusive)2238* @fbatch: The batch to fill2239*2240* filemap_get_folios_contig() works exactly like filemap_get_folios(),2241* except the returned folios are guaranteed to be contiguous. 
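 *
 * As an illustration only ('first' and 'last' are hypothetical range
 * bounds), a bounded-batch consumer loops until a gap or the end of the
 * range stops the walk:
 *
 *	struct folio_batch fbatch;
 *	pgoff_t index = first;
 *	unsigned int i, nr;
 *
 *	folio_batch_init(&fbatch);
 *	while ((nr = filemap_get_folios_contig(mapping, &index, last,
 *					       &fbatch)) != 0) {
 *		for (i = 0; i < nr; i++)
 *			;	/* fbatch.folios[i] is part of a gap-free run */
 *		folio_batch_release(&fbatch);
 *	}
 *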
This may2242* not return all contiguous folios if the batch gets filled up.2243*2244* Return: The number of folios found.2245* Also update @start to be positioned for traversal of the next folio.2246*/22472248unsigned filemap_get_folios_contig(struct address_space *mapping,2249pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)2250{2251XA_STATE(xas, &mapping->i_pages, *start);2252unsigned long nr;2253struct folio *folio;22542255rcu_read_lock();22562257for (folio = xas_load(&xas); folio && xas.xa_index <= end;2258folio = xas_next(&xas)) {2259if (xas_retry(&xas, folio))2260continue;2261/*2262* If the entry has been swapped out, we can stop looking.2263* No current caller is looking for DAX entries.2264*/2265if (xa_is_value(folio))2266goto update_start;22672268/* If we landed in the middle of a THP, continue at its end. */2269if (xa_is_sibling(folio))2270goto update_start;22712272if (!folio_try_get(folio))2273goto retry;22742275if (unlikely(folio != xas_reload(&xas)))2276goto put_folio;22772278if (!folio_batch_add(fbatch, folio)) {2279nr = folio_nr_pages(folio);2280*start = folio->index + nr;2281goto out;2282}2283xas_advance(&xas, folio_next_index(folio) - 1);2284continue;2285put_folio:2286folio_put(folio);22872288retry:2289xas_reset(&xas);2290}22912292update_start:2293nr = folio_batch_count(fbatch);22942295if (nr) {2296folio = fbatch->folios[nr - 1];2297*start = folio_next_index(folio);2298}2299out:2300rcu_read_unlock();2301return folio_batch_count(fbatch);2302}2303EXPORT_SYMBOL(filemap_get_folios_contig);23042305/**2306* filemap_get_folios_tag - Get a batch of folios matching @tag2307* @mapping: The address_space to search2308* @start: The starting page index2309* @end: The final page index (inclusive)2310* @tag: The tag index2311* @fbatch: The batch to fill2312*2313* The first folio may start before @start; if it does, it will contain2314* @start. The final folio may extend beyond @end; if it does, it will2315* contain @end. The folios have ascending indices. There may be gaps2316* between the folios if there are indices which have no folio in the2317* page cache. If folios are added to or removed from the page cache2318* while this is running, they may or may not be found by this call.2319* Only returns folios that are tagged with @tag.2320*2321* Return: The number of folios found.2322* Also update @start to index the next folio for traversal.2323*/2324unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,2325pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)2326{2327XA_STATE(xas, &mapping->i_pages, *start);2328struct folio *folio;23292330rcu_read_lock();2331while ((folio = find_get_entry(&xas, end, tag)) != NULL) {2332/*2333* Shadow entries should never be tagged, but this iteration2334* is lockless so there is a window for page reclaim to evict2335* a page we saw tagged. Skip over it.2336*/2337if (xa_is_value(folio))2338continue;2339if (!folio_batch_add(fbatch, folio)) {2340unsigned long nr = folio_nr_pages(folio);2341*start = folio->index + nr;2342goto out;2343}2344}2345/*2346* We come here when there is no page beyond @end. We take care to not2347* overflow the index @start as it confuses some of the callers. 
This
 * breaks the iteration when there is a page at index -1 but that is
 * already broken anyway.
 */
	if (end == (pgoff_t)-1)
		*start = (pgoff_t)-1;
	else
		*start = end + 1;
out:
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_tag);

/**
 * filemap_get_folios_dirty - Get a batch of dirty folios
 * @mapping:	The address_space to search
 * @start:	The starting folio index
 * @end:	The final folio index (inclusive)
 * @fbatch:	The batch to fill
 *
 * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except
 * the returned folios are presumed to be dirty or undergoing writeback. Dirty
 * state is presumed because we don't block on the folio lock nor want to miss
 * folios. Callers that need to can recheck the state upon locking the folio.
 *
 * This may not return all dirty folios if the batch gets filled up.
 *
 * Return: The number of folios found.
 * Also update @start to be positioned for traversal of the next folio.
 */
unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct folio *folio;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
		if (xa_is_value(folio))
			continue;
		if (folio_trylock(folio)) {
			bool clean = !folio_test_dirty(folio) &&
				     !folio_test_writeback(folio);
			folio_unlock(folio);
			if (clean) {
				folio_put(folio);
				continue;
			}
		}
		if (!folio_batch_add(fbatch, folio)) {
			unsigned long nr = folio_nr_pages(folio);
			*start = folio->index + nr;
			goto out;
		}
	}
	/*
	 * We come here when there is no folio beyond @end. We take care to not
	 * overflow the index @start as it confuses some of the callers. This
	 * breaks the iteration when there is a folio at index -1 but that is
	 * already broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*start = (pgoff_t)-1;
	else
		*start = end + 1;
out:
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *	---R__________________________________________B__________
 *	   ^ reading here                              ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}

/*
 * filemap_get_read_batch - Get a batch of folios for read
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.
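 *
 * The caller-side shape (see filemap_get_pages() below for the real
 * consumer; this is only a sketch) is roughly:
 *
 *	filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
 *	folio = fbatch->folios[folio_batch_count(fbatch) - 1];
 *	if (folio_test_readahead(folio))
 *		;	/* kick off asynchronous readahead */
 *	if (!folio_test_uptodate(folio))
 *		;	/* lock the folio and bring it uptodate */
 *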
The last2446* folio in the batch may have the readahead flag set or the uptodate flag2447* clear so that the caller can take the appropriate action.2448*/2449static void filemap_get_read_batch(struct address_space *mapping,2450pgoff_t index, pgoff_t max, struct folio_batch *fbatch)2451{2452XA_STATE(xas, &mapping->i_pages, index);2453struct folio *folio;24542455rcu_read_lock();2456for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {2457if (xas_retry(&xas, folio))2458continue;2459if (xas.xa_index > max || xa_is_value(folio))2460break;2461if (xa_is_sibling(folio))2462break;2463if (!folio_try_get(folio))2464goto retry;24652466if (unlikely(folio != xas_reload(&xas)))2467goto put_folio;24682469if (!folio_batch_add(fbatch, folio))2470break;2471if (!folio_test_uptodate(folio))2472break;2473if (folio_test_readahead(folio))2474break;2475xas_advance(&xas, folio_next_index(folio) - 1);2476continue;2477put_folio:2478folio_put(folio);2479retry:2480xas_reset(&xas);2481}2482rcu_read_unlock();2483}24842485static int filemap_read_folio(struct file *file, filler_t filler,2486struct folio *folio)2487{2488bool workingset = folio_test_workingset(folio);2489unsigned long pflags;2490int error;24912492/* Start the actual read. The read will unlock the page. */2493if (unlikely(workingset))2494psi_memstall_enter(&pflags);2495error = filler(file, folio);2496if (unlikely(workingset))2497psi_memstall_leave(&pflags);2498if (error)2499return error;25002501error = folio_wait_locked_killable(folio);2502if (error)2503return error;2504if (folio_test_uptodate(folio))2505return 0;2506if (file)2507shrink_readahead_size_eio(&file->f_ra);2508return -EIO;2509}25102511static bool filemap_range_uptodate(struct address_space *mapping,2512loff_t pos, size_t count, struct folio *folio,2513bool need_uptodate)2514{2515if (folio_test_uptodate(folio))2516return true;2517/* pipes can't handle partially uptodate pages */2518if (need_uptodate)2519return false;2520if (!mapping->a_ops->is_partially_uptodate)2521return false;2522if (mapping->host->i_blkbits >= folio_shift(folio))2523return false;25242525if (folio_pos(folio) > pos) {2526count -= folio_pos(folio) - pos;2527pos = 0;2528} else {2529pos -= folio_pos(folio);2530}25312532if (pos == 0 && count >= folio_size(folio))2533return false;25342535return mapping->a_ops->is_partially_uptodate(folio, pos, count);2536}25372538static int filemap_update_page(struct kiocb *iocb,2539struct address_space *mapping, size_t count,2540struct folio *folio, bool need_uptodate)2541{2542int error;25432544if (iocb->ki_flags & IOCB_NOWAIT) {2545if (!filemap_invalidate_trylock_shared(mapping))2546return -EAGAIN;2547} else {2548filemap_invalidate_lock_shared(mapping);2549}25502551if (!folio_trylock(folio)) {2552error = -EAGAIN;2553if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))2554goto unlock_mapping;2555if (!(iocb->ki_flags & IOCB_WAITQ)) {2556filemap_invalidate_unlock_shared(mapping);2557/*2558* This is where we usually end up waiting for a2559* previously submitted readahead to finish.2560*/2561folio_put_wait_locked(folio, TASK_KILLABLE);2562return AOP_TRUNCATED_PAGE;2563}2564error = __folio_lock_async(folio, iocb->ki_waitq);2565if (error)2566goto unlock_mapping;2567}25682569error = AOP_TRUNCATED_PAGE;2570if (!folio->mapping)2571goto unlock;25722573error = 0;2574if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,2575need_uptodate))2576goto unlock;25772578error = -EAGAIN;2579if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))2580goto unlock;25812582error = 
filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,2583folio);2584goto unlock_mapping;2585unlock:2586folio_unlock(folio);2587unlock_mapping:2588filemap_invalidate_unlock_shared(mapping);2589if (error == AOP_TRUNCATED_PAGE)2590folio_put(folio);2591return error;2592}25932594static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)2595{2596struct address_space *mapping = iocb->ki_filp->f_mapping;2597struct folio *folio;2598int error;2599unsigned int min_order = mapping_min_folio_order(mapping);2600pgoff_t index;26012602if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))2603return -EAGAIN;26042605folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL);2606if (!folio)2607return -ENOMEM;2608if (iocb->ki_flags & IOCB_DONTCACHE)2609__folio_set_dropbehind(folio);26102611/*2612* Protect against truncate / hole punch. Grabbing invalidate_lock2613* here assures we cannot instantiate and bring uptodate new2614* pagecache folios after evicting page cache during truncate2615* and before actually freeing blocks. Note that we could2616* release invalidate_lock after inserting the folio into2617* the page cache as the locked folio would then be enough to2618* synchronize with hole punching. But there are code paths2619* such as filemap_update_page() filling in partially uptodate2620* pages or ->readahead() that need to hold invalidate_lock2621* while mapping blocks for IO so let's hold the lock here as2622* well to keep locking rules simple.2623*/2624filemap_invalidate_lock_shared(mapping);2625index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;2626error = filemap_add_folio(mapping, folio, index,2627mapping_gfp_constraint(mapping, GFP_KERNEL));2628if (error == -EEXIST)2629error = AOP_TRUNCATED_PAGE;2630if (error)2631goto error;26322633error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,2634folio);2635if (error)2636goto error;26372638filemap_invalidate_unlock_shared(mapping);2639folio_batch_add(fbatch, folio);2640return 0;2641error:2642filemap_invalidate_unlock_shared(mapping);2643folio_put(folio);2644return error;2645}26462647static int filemap_readahead(struct kiocb *iocb, struct file *file,2648struct address_space *mapping, struct folio *folio,2649pgoff_t last_index)2650{2651DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);26522653if (iocb->ki_flags & IOCB_NOIO)2654return -EAGAIN;2655if (iocb->ki_flags & IOCB_DONTCACHE)2656ractl.dropbehind = 1;2657page_cache_async_ra(&ractl, folio, last_index - folio->index);2658return 0;2659}26602661static int filemap_get_pages(struct kiocb *iocb, size_t count,2662struct folio_batch *fbatch, bool need_uptodate)2663{2664struct file *filp = iocb->ki_filp;2665struct address_space *mapping = filp->f_mapping;2666pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;2667pgoff_t last_index;2668struct folio *folio;2669unsigned int flags;2670int err = 0;26712672/* "last_index" is the index of the folio beyond the end of the read */2673last_index = round_up(iocb->ki_pos + count,2674mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT;2675retry:2676if (fatal_signal_pending(current))2677return -EINTR;26782679filemap_get_read_batch(mapping, index, last_index - 1, fbatch);2680if (!folio_batch_count(fbatch)) {2681DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);26822683if (iocb->ki_flags & IOCB_NOIO)2684return -EAGAIN;2685if (iocb->ki_flags & IOCB_NOWAIT)2686flags = memalloc_noio_save();2687if (iocb->ki_flags & IOCB_DONTCACHE)2688ractl.dropbehind = 1;2689page_cache_sync_ra(&ractl, last_index - 
index);2690if (iocb->ki_flags & IOCB_NOWAIT)2691memalloc_noio_restore(flags);2692filemap_get_read_batch(mapping, index, last_index - 1, fbatch);2693}2694if (!folio_batch_count(fbatch)) {2695err = filemap_create_folio(iocb, fbatch);2696if (err == AOP_TRUNCATED_PAGE)2697goto retry;2698return err;2699}27002701folio = fbatch->folios[folio_batch_count(fbatch) - 1];2702if (folio_test_readahead(folio)) {2703err = filemap_readahead(iocb, filp, mapping, folio, last_index);2704if (err)2705goto err;2706}2707if (!folio_test_uptodate(folio)) {2708if (folio_batch_count(fbatch) > 1) {2709err = -EAGAIN;2710goto err;2711}2712err = filemap_update_page(iocb, mapping, count, folio,2713need_uptodate);2714if (err)2715goto err;2716}27172718trace_mm_filemap_get_pages(mapping, index, last_index - 1);2719return 0;2720err:2721if (err < 0)2722folio_put(folio);2723if (likely(--fbatch->nr))2724return 0;2725if (err == AOP_TRUNCATED_PAGE)2726goto retry;2727return err;2728}27292730static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)2731{2732unsigned int shift = folio_shift(folio);27332734return (pos1 >> shift == pos2 >> shift);2735}27362737static void filemap_end_dropbehind_read(struct folio *folio)2738{2739if (!folio_test_dropbehind(folio))2740return;2741if (folio_test_writeback(folio) || folio_test_dirty(folio))2742return;2743if (folio_trylock(folio)) {2744filemap_end_dropbehind(folio);2745folio_unlock(folio);2746}2747}27482749/**2750* filemap_read - Read data from the page cache.2751* @iocb: The iocb to read.2752* @iter: Destination for the data.2753* @already_read: Number of bytes already read by the caller.2754*2755* Copies data from the page cache. If the data is not currently present,2756* uses the readahead and read_folio address_space operations to fetch it.2757*2758* Return: Total number of bytes copied, including those already read by2759* the caller. If an error happens before any bytes are copied, returns2760* a negative error number.2761*/2762ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,2763ssize_t already_read)2764{2765struct file *filp = iocb->ki_filp;2766struct file_ra_state *ra = &filp->f_ra;2767struct address_space *mapping = filp->f_mapping;2768struct inode *inode = mapping->host;2769struct folio_batch fbatch;2770int i, error = 0;2771bool writably_mapped;2772loff_t isize, end_offset;2773loff_t last_pos = ra->prev_pos;27742775if (unlikely(iocb->ki_pos < 0))2776return -EINVAL;2777if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))2778return 0;2779if (unlikely(!iov_iter_count(iter)))2780return 0;27812782iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);2783folio_batch_init(&fbatch);27842785do {2786cond_resched();27872788/*2789* If we've already successfully copied some data, then we2790* can no longer safely return -EIOCBQUEUED. 
Hence mark2791* an async read NOWAIT at that point.2792*/2793if ((iocb->ki_flags & IOCB_WAITQ) && already_read)2794iocb->ki_flags |= IOCB_NOWAIT;27952796if (unlikely(iocb->ki_pos >= i_size_read(inode)))2797break;27982799error = filemap_get_pages(iocb, iter->count, &fbatch, false);2800if (error < 0)2801break;28022803/*2804* i_size must be checked after we know the pages are Uptodate.2805*2806* Checking i_size after the check allows us to calculate2807* the correct value for "nr", which means the zero-filled2808* part of the page is not copied back to userspace (unless2809* another truncate extends the file - this is desired though).2810*/2811isize = i_size_read(inode);2812if (unlikely(iocb->ki_pos >= isize))2813goto put_folios;2814end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);28152816/*2817* Once we start copying data, we don't want to be touching any2818* cachelines that might be contended:2819*/2820writably_mapped = mapping_writably_mapped(mapping);28212822/*2823* When a read accesses the same folio several times, only2824* mark it as accessed the first time.2825*/2826if (!pos_same_folio(iocb->ki_pos, last_pos - 1,2827fbatch.folios[0]))2828folio_mark_accessed(fbatch.folios[0]);28292830for (i = 0; i < folio_batch_count(&fbatch); i++) {2831struct folio *folio = fbatch.folios[i];2832size_t fsize = folio_size(folio);2833size_t offset = iocb->ki_pos & (fsize - 1);2834size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,2835fsize - offset);2836size_t copied;28372838if (end_offset < folio_pos(folio))2839break;2840if (i > 0)2841folio_mark_accessed(folio);2842/*2843* If users can be writing to this folio using arbitrary2844* virtual addresses, take care of potential aliasing2845* before reading the folio on the kernel side.2846*/2847if (writably_mapped)2848flush_dcache_folio(folio);28492850copied = copy_folio_to_iter(folio, offset, bytes, iter);28512852already_read += copied;2853iocb->ki_pos += copied;2854last_pos = iocb->ki_pos;28552856if (copied < bytes) {2857error = -EFAULT;2858break;2859}2860}2861put_folios:2862for (i = 0; i < folio_batch_count(&fbatch); i++) {2863struct folio *folio = fbatch.folios[i];28642865filemap_end_dropbehind_read(folio);2866folio_put(folio);2867}2868folio_batch_init(&fbatch);2869} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);28702871file_accessed(filp);2872ra->prev_pos = last_pos;2873return already_read ? already_read : error;2874}2875EXPORT_SYMBOL_GPL(filemap_read);28762877int kiocb_write_and_wait(struct kiocb *iocb, size_t count)2878{2879struct address_space *mapping = iocb->ki_filp->f_mapping;2880loff_t pos = iocb->ki_pos;2881loff_t end = pos + count - 1;28822883if (iocb->ki_flags & IOCB_NOWAIT) {2884if (filemap_range_needs_writeback(mapping, pos, end))2885return -EAGAIN;2886return 0;2887}28882889return filemap_write_and_wait_range(mapping, pos, end);2890}2891EXPORT_SYMBOL_GPL(kiocb_write_and_wait);28922893int filemap_invalidate_pages(struct address_space *mapping,2894loff_t pos, loff_t end, bool nowait)2895{2896int ret;28972898if (nowait) {2899/* we could block if there are any pages in the range */2900if (filemap_range_has_page(mapping, pos, end))2901return -EAGAIN;2902} else {2903ret = filemap_write_and_wait_range(mapping, pos, end);2904if (ret)2905return ret;2906}29072908/*2909* After a write we want buffered reads to be sure to go to disk to get2910* the new data. We invalidate clean cached page from the region we're2911* about to write. 
We do this *before* the write so that we can return2912* without clobbering -EIOCBQUEUED from ->direct_IO().2913*/2914return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,2915end >> PAGE_SHIFT);2916}29172918int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)2919{2920struct address_space *mapping = iocb->ki_filp->f_mapping;29212922return filemap_invalidate_pages(mapping, iocb->ki_pos,2923iocb->ki_pos + count - 1,2924iocb->ki_flags & IOCB_NOWAIT);2925}2926EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);29272928/**2929* generic_file_read_iter - generic filesystem read routine2930* @iocb: kernel I/O control block2931* @iter: destination for the data read2932*2933* This is the "read_iter()" routine for all filesystems2934* that can use the page cache directly.2935*2936* The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall2937* be returned when no data can be read without waiting for I/O requests2938* to complete; it doesn't prevent readahead.2939*2940* The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O2941* requests shall be made for the read or for readahead. When no data2942* can be read, -EAGAIN shall be returned. When readahead would be2943* triggered, a partial, possibly empty read shall be returned.2944*2945* Return:2946* * number of bytes copied, even for partial reads2947* * negative error code (or 0 if IOCB_NOIO) if nothing was read2948*/2949ssize_t2950generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)2951{2952size_t count = iov_iter_count(iter);2953ssize_t retval = 0;29542955if (!count)2956return 0; /* skip atime */29572958if (iocb->ki_flags & IOCB_DIRECT) {2959struct file *file = iocb->ki_filp;2960struct address_space *mapping = file->f_mapping;2961struct inode *inode = mapping->host;29622963retval = kiocb_write_and_wait(iocb, count);2964if (retval < 0)2965return retval;2966file_accessed(file);29672968retval = mapping->a_ops->direct_IO(iocb, iter);2969if (retval >= 0) {2970iocb->ki_pos += retval;2971count -= retval;2972}2973if (retval != -EIOCBQUEUED)2974iov_iter_revert(iter, count - iov_iter_count(iter));29752976/*2977* Btrfs can have a short DIO read if we encounter2978* compressed extents, so if there was an error, or if2979* we've already read everything we wanted to, or if2980* there was a short read because we hit EOF, go ahead2981* and return. Otherwise fallthrough to buffered io for2982* the rest of the read. 
Buffered reads will not work for2983* DAX files, so don't bother trying.2984*/2985if (retval < 0 || !count || IS_DAX(inode))2986return retval;2987if (iocb->ki_pos >= i_size_read(inode))2988return retval;2989}29902991return filemap_read(iocb, iter, retval);2992}2993EXPORT_SYMBOL(generic_file_read_iter);29942995/*2996* Splice subpages from a folio into a pipe.2997*/2998size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,2999struct folio *folio, loff_t fpos, size_t size)3000{3001struct page *page;3002size_t spliced = 0, offset = offset_in_folio(folio, fpos);30033004page = folio_page(folio, offset / PAGE_SIZE);3005size = min(size, folio_size(folio) - offset);3006offset %= PAGE_SIZE;30073008while (spliced < size && !pipe_is_full(pipe)) {3009struct pipe_buffer *buf = pipe_head_buf(pipe);3010size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);30113012*buf = (struct pipe_buffer) {3013.ops = &page_cache_pipe_buf_ops,3014.page = page,3015.offset = offset,3016.len = part,3017};3018folio_get(folio);3019pipe->head++;3020page++;3021spliced += part;3022offset = 0;3023}30243025return spliced;3026}30273028/**3029* filemap_splice_read - Splice data from a file's pagecache into a pipe3030* @in: The file to read from3031* @ppos: Pointer to the file position to read from3032* @pipe: The pipe to splice into3033* @len: The amount to splice3034* @flags: The SPLICE_F_* flags3035*3036* This function gets folios from a file's pagecache and splices them into the3037* pipe. Readahead will be called as necessary to fill more folios. This may3038* be used for blockdevs also.3039*3040* Return: On success, the number of bytes read will be returned and *@ppos3041* will be updated if appropriate; 0 will be returned if there is no more data3042* to be read; -EAGAIN will be returned if the pipe had no space, and some3043* other negative error code will be returned on error. 
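 *
 * (Illustrative only: a filesystem that reads through the page cache
 * would typically wire this up in its file_operations, e.g. for a
 * hypothetical "myfs":
 *
 *	static const struct file_operations myfs_file_operations = {
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_mmap,
 *		.splice_read	= filemap_splice_read,
 *	};
 *
 * plus whatever other methods the filesystem needs.)
 *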
A short read may occur3044* if the pipe has insufficient space, we reach the end of the data or we hit a3045* hole.3046*/3047ssize_t filemap_splice_read(struct file *in, loff_t *ppos,3048struct pipe_inode_info *pipe,3049size_t len, unsigned int flags)3050{3051struct folio_batch fbatch;3052struct kiocb iocb;3053size_t total_spliced = 0, used, npages;3054loff_t isize, end_offset;3055bool writably_mapped;3056int i, error = 0;30573058if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))3059return 0;30603061init_sync_kiocb(&iocb, in);3062iocb.ki_pos = *ppos;30633064/* Work out how much data we can actually add into the pipe */3065used = pipe_buf_usage(pipe);3066npages = max_t(ssize_t, pipe->max_usage - used, 0);3067len = min_t(size_t, len, npages * PAGE_SIZE);30683069folio_batch_init(&fbatch);30703071do {3072cond_resched();30733074if (*ppos >= i_size_read(in->f_mapping->host))3075break;30763077iocb.ki_pos = *ppos;3078error = filemap_get_pages(&iocb, len, &fbatch, true);3079if (error < 0)3080break;30813082/*3083* i_size must be checked after we know the pages are Uptodate.3084*3085* Checking i_size after the check allows us to calculate3086* the correct value for "nr", which means the zero-filled3087* part of the page is not copied back to userspace (unless3088* another truncate extends the file - this is desired though).3089*/3090isize = i_size_read(in->f_mapping->host);3091if (unlikely(*ppos >= isize))3092break;3093end_offset = min_t(loff_t, isize, *ppos + len);30943095/*3096* Once we start copying data, we don't want to be touching any3097* cachelines that might be contended:3098*/3099writably_mapped = mapping_writably_mapped(in->f_mapping);31003101for (i = 0; i < folio_batch_count(&fbatch); i++) {3102struct folio *folio = fbatch.folios[i];3103size_t n;31043105if (folio_pos(folio) >= end_offset)3106goto out;3107folio_mark_accessed(folio);31083109/*3110* If users can be writing to this folio using arbitrary3111* virtual addresses, take care of potential aliasing3112* before reading the folio on the kernel side.3113*/3114if (writably_mapped)3115flush_dcache_folio(folio);31163117n = min_t(loff_t, len, isize - *ppos);3118n = splice_folio_into_pipe(pipe, folio, *ppos, n);3119if (!n)3120goto out;3121len -= n;3122total_spliced += n;3123*ppos += n;3124in->f_ra.prev_pos = *ppos;3125if (pipe_is_full(pipe))3126goto out;3127}31283129folio_batch_release(&fbatch);3130} while (len);31313132out:3133folio_batch_release(&fbatch);3134file_accessed(in);31353136return total_spliced ? total_spliced : error;3137}3138EXPORT_SYMBOL(filemap_splice_read);31393140static inline loff_t folio_seek_hole_data(struct xa_state *xas,3141struct address_space *mapping, struct folio *folio,3142loff_t start, loff_t end, bool seek_data)3143{3144const struct address_space_operations *ops = mapping->a_ops;3145size_t offset, bsz = i_blocksize(mapping->host);31463147if (xa_is_value(folio) || folio_test_uptodate(folio))3148return seek_data ? start : end;3149if (!ops->is_partially_uptodate)3150return seek_data ? 
end : start;31513152xas_pause(xas);3153rcu_read_unlock();3154folio_lock(folio);3155if (unlikely(folio->mapping != mapping))3156goto unlock;31573158offset = offset_in_folio(folio, start) & ~(bsz - 1);31593160do {3161if (ops->is_partially_uptodate(folio, offset, bsz) ==3162seek_data)3163break;3164start = (start + bsz) & ~((u64)bsz - 1);3165offset += bsz;3166} while (offset < folio_size(folio));3167unlock:3168folio_unlock(folio);3169rcu_read_lock();3170return start;3171}31723173static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)3174{3175if (xa_is_value(folio))3176return PAGE_SIZE << xas_get_order(xas);3177return folio_size(folio);3178}31793180/**3181* mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.3182* @mapping: Address space to search.3183* @start: First byte to consider.3184* @end: Limit of search (exclusive).3185* @whence: Either SEEK_HOLE or SEEK_DATA.3186*3187* If the page cache knows which blocks contain holes and which blocks3188* contain data, your filesystem can use this function to implement3189* SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are3190* entirely memory-based such as tmpfs, and filesystems which support3191* unwritten extents.3192*3193* Return: The requested offset on success, or -ENXIO if @whence specifies3194* SEEK_DATA and there is no data after @start. There is an implicit hole3195* after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start3196* and @end contain data.3197*/3198loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,3199loff_t end, int whence)3200{3201XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);3202pgoff_t max = (end - 1) >> PAGE_SHIFT;3203bool seek_data = (whence == SEEK_DATA);3204struct folio *folio;32053206if (end <= start)3207return -ENXIO;32083209rcu_read_lock();3210while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {3211loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;3212size_t seek_size;32133214if (start < pos) {3215if (!seek_data)3216goto unlock;3217start = pos;3218}32193220seek_size = seek_folio_size(&xas, folio);3221pos = round_up((u64)pos + 1, seek_size);3222start = folio_seek_hole_data(&xas, mapping, folio, start, pos,3223seek_data);3224if (start < pos)3225goto unlock;3226if (start >= end)3227break;3228if (seek_size > PAGE_SIZE)3229xas_set(&xas, pos >> PAGE_SHIFT);3230if (!xa_is_value(folio))3231folio_put(folio);3232}3233if (seek_data)3234start = -ENXIO;3235unlock:3236rcu_read_unlock();3237if (folio && !xa_is_value(folio))3238folio_put(folio);3239if (start > end)3240return end;3241return start;3242}32433244#ifdef CONFIG_MMU3245#define MMAP_LOTSAMISS (100)3246/*3247* lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock3248* @vmf - the vm_fault for this fault.3249* @folio - the folio to lock.3250* @fpin - the pointer to the file we may pin (or is already pinned).3251*3252* This works similar to lock_folio_or_retry in that it can drop the3253* mmap_lock. It differs in that it actually returns the folio locked3254* if it returns 1 and 0 if it couldn't lock the folio. If we did have3255* to drop the mmap_lock then fpin will point to the pinned file and3256* needs to be fput()'ed at a later point.3257*/3258static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,3259struct file **fpin)3260{3261if (folio_trylock(folio))3262return 1;32633264/*3265* NOTE! This will make us return with VM_FAULT_RETRY, but with3266* the fault lock still held. 
That's how FAULT_FLAG_RETRY_NOWAIT3267* is supposed to work. We have way too many special cases..3268*/3269if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)3270return 0;32713272*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);3273if (vmf->flags & FAULT_FLAG_KILLABLE) {3274if (__folio_lock_killable(folio)) {3275/*3276* We didn't have the right flags to drop the3277* fault lock, but all fault_handlers only check3278* for fatal signals if we return VM_FAULT_RETRY,3279* so we need to drop the fault lock here and3280* return 0 if we don't have a fpin.3281*/3282if (*fpin == NULL)3283release_fault_lock(vmf);3284return 0;3285}3286} else3287__folio_lock(folio);32883289return 1;3290}32913292/*3293* Synchronous readahead happens when we don't even find a page in the page3294* cache at all. We don't want to perform IO under the mmap sem, so if we have3295* to drop the mmap sem we return the file that was pinned in order for us to do3296* that. If we didn't pin a file then we return NULL. The file that is3297* returned needs to be fput()'ed when we're done with it.3298*/3299static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)3300{3301struct file *file = vmf->vma->vm_file;3302struct file_ra_state *ra = &file->f_ra;3303struct address_space *mapping = file->f_mapping;3304DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);3305struct file *fpin = NULL;3306vm_flags_t vm_flags = vmf->vma->vm_flags;3307bool force_thp_readahead = false;3308unsigned short mmap_miss;33093310/* Use the readahead code, even if readahead is disabled */3311if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&3312(vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER)3313force_thp_readahead = true;33143315if (!force_thp_readahead) {3316/*3317* If we don't want any read-ahead, don't bother.3318* VM_EXEC case below is already intended for random access.3319*/3320if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)3321return fpin;33223323if (!ra->ra_pages)3324return fpin;33253326if (vm_flags & VM_SEQ_READ) {3327fpin = maybe_unlock_mmap_for_io(vmf, fpin);3328page_cache_sync_ra(&ractl, ra->ra_pages);3329return fpin;3330}3331}33323333if (!(vm_flags & VM_SEQ_READ)) {3334/* Avoid banging the cache line if not needed */3335mmap_miss = READ_ONCE(ra->mmap_miss);3336if (mmap_miss < MMAP_LOTSAMISS * 10)3337WRITE_ONCE(ra->mmap_miss, ++mmap_miss);33383339/*3340* Do we miss much more than hit in this file? If so,3341* stop bothering with read-ahead. It will only hurt.3342*/3343if (mmap_miss > MMAP_LOTSAMISS)3344return fpin;3345}33463347if (force_thp_readahead) {3348fpin = maybe_unlock_mmap_for_io(vmf, fpin);3349ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);3350ra->size = HPAGE_PMD_NR;3351/*3352* Fetch two PMD folios, so we get the chance to actually3353* readahead, unless we've been told not to.3354*/3355if (!(vm_flags & VM_RAND_READ))3356ra->size *= 2;3357ra->async_size = HPAGE_PMD_NR;3358ra->order = HPAGE_PMD_ORDER;3359page_cache_ra_order(&ractl, ra);3360return fpin;3361}33623363if (vm_flags & VM_EXEC) {3364/*3365* Allow arch to request a preferred minimum folio order for3366* executable memory. This can often be beneficial to3367* performance if (e.g.) 
arm64 can contpte-map the folio.3368* Executable memory rarely benefits from readahead, due to its3369* random access nature, so set async_size to 0.3370*3371* Limit to the boundaries of the VMA to avoid reading in any3372* pad that might exist between sections, which would be a waste3373* of memory.3374*/3375struct vm_area_struct *vma = vmf->vma;3376unsigned long start = vma->vm_pgoff;3377unsigned long end = start + vma_pages(vma);3378unsigned long ra_end;33793380ra->order = exec_folio_order();3381ra->start = round_down(vmf->pgoff, 1UL << ra->order);3382ra->start = max(ra->start, start);3383ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);3384ra_end = min(ra_end, end);3385ra->size = ra_end - ra->start;3386ra->async_size = 0;3387} else {3388/*3389* mmap read-around3390*/3391ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);3392ra->size = ra->ra_pages;3393ra->async_size = ra->ra_pages / 4;3394ra->order = 0;3395}33963397fpin = maybe_unlock_mmap_for_io(vmf, fpin);3398ractl._index = ra->start;3399page_cache_ra_order(&ractl, ra);3400return fpin;3401}34023403/*3404* Asynchronous readahead happens when we find the page and PG_readahead,3405* so we want to possibly extend the readahead further. We return the file that3406* was pinned if we have to drop the mmap_lock in order to do IO.3407*/3408static struct file *do_async_mmap_readahead(struct vm_fault *vmf,3409struct folio *folio)3410{3411struct file *file = vmf->vma->vm_file;3412struct file_ra_state *ra = &file->f_ra;3413DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);3414struct file *fpin = NULL;3415unsigned short mmap_miss;34163417/* If we don't want any read-ahead, don't bother */3418if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)3419return fpin;34203421/*3422* If the folio is locked, we're likely racing against another fault.3423* Don't touch the mmap_miss counter to avoid decreasing it multiple3424* times for a single folio and break the balance with mmap_miss3425* increase in do_sync_mmap_readahead().3426*/3427if (likely(!folio_test_locked(folio))) {3428mmap_miss = READ_ONCE(ra->mmap_miss);3429if (mmap_miss)3430WRITE_ONCE(ra->mmap_miss, --mmap_miss);3431}34323433if (folio_test_readahead(folio)) {3434fpin = maybe_unlock_mmap_for_io(vmf, fpin);3435page_cache_async_ra(&ractl, folio, ra->ra_pages);3436}3437return fpin;3438}34393440static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)3441{3442struct vm_area_struct *vma = vmf->vma;3443vm_fault_t ret = 0;3444pte_t *ptep;34453446/*3447* We might have COW'ed a pagecache folio and might now have an mlocked3448* anon folio mapped. The original pagecache folio is not mlocked and3449* might have been evicted. During a read+clear/modify/write update of3450* the PTE, such as done in do_numa_page()/change_pte_range(), we3451* temporarily clear the PTE under PT lock and might detect it here as3452* "none" when not holding the PT lock.3453*3454* Not rechecking the PTE under PT lock could result in an unexpected3455* major fault in an mlock'ed region. Recheck only for this special3456* scenario while holding the PT lock, to not degrade non-mlocked3457* scenarios. 
Recheck the PTE without PT lock firstly, thereby reducing3458* the number of times we hold PT lock.3459*/3460if (!(vma->vm_flags & VM_LOCKED))3461return 0;34623463if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))3464return 0;34653466ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,3467&vmf->ptl);3468if (unlikely(!ptep))3469return VM_FAULT_NOPAGE;34703471if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {3472ret = VM_FAULT_NOPAGE;3473} else {3474spin_lock(vmf->ptl);3475if (unlikely(!pte_none(ptep_get(ptep))))3476ret = VM_FAULT_NOPAGE;3477spin_unlock(vmf->ptl);3478}3479pte_unmap(ptep);3480return ret;3481}34823483/**3484* filemap_fault - read in file data for page fault handling3485* @vmf: struct vm_fault containing details of the fault3486*3487* filemap_fault() is invoked via the vma operations vector for a3488* mapped memory region to read in file data during a page fault.3489*3490* The goto's are kind of ugly, but this streamlines the normal case of having3491* it in the page cache, and handles the special cases reasonably without3492* having a lot of duplicated code.3493*3494* vma->vm_mm->mmap_lock must be held on entry.3495*3496* If our return value has VM_FAULT_RETRY set, it's because the mmap_lock3497* may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().3498*3499* If our return value does not have VM_FAULT_RETRY set, the mmap_lock3500* has not been released.3501*3502* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.3503*3504* Return: bitwise-OR of %VM_FAULT_ codes.3505*/3506vm_fault_t filemap_fault(struct vm_fault *vmf)3507{3508int error;3509struct file *file = vmf->vma->vm_file;3510struct file *fpin = NULL;3511struct address_space *mapping = file->f_mapping;3512struct inode *inode = mapping->host;3513pgoff_t max_idx, index = vmf->pgoff;3514struct folio *folio;3515vm_fault_t ret = 0;3516bool mapping_locked = false;35173518max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);3519if (unlikely(index >= max_idx))3520return VM_FAULT_SIGBUS;35213522trace_mm_filemap_fault(mapping, index);35233524/*3525* Do we have something in the page cache already?3526*/3527folio = filemap_get_folio(mapping, index);3528if (likely(!IS_ERR(folio))) {3529/*3530* We found the page, so try async readahead before waiting for3531* the lock.3532*/3533if (!(vmf->flags & FAULT_FLAG_TRIED))3534fpin = do_async_mmap_readahead(vmf, folio);3535if (unlikely(!folio_test_uptodate(folio))) {3536filemap_invalidate_lock_shared(mapping);3537mapping_locked = true;3538}3539} else {3540ret = filemap_fault_recheck_pte_none(vmf);3541if (unlikely(ret))3542return ret;35433544/* No page in the page cache at all */3545count_vm_event(PGMAJFAULT);3546count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);3547ret = VM_FAULT_MAJOR;3548fpin = do_sync_mmap_readahead(vmf);3549retry_find:3550/*3551* See comment in filemap_create_folio() why we need3552* invalidate_lock3553*/3554if (!mapping_locked) {3555filemap_invalidate_lock_shared(mapping);3556mapping_locked = true;3557}3558folio = __filemap_get_folio(mapping, index,3559FGP_CREAT|FGP_FOR_MMAP,3560vmf->gfp_mask);3561if (IS_ERR(folio)) {3562if (fpin)3563goto out_retry;3564filemap_invalidate_unlock_shared(mapping);3565return VM_FAULT_OOM;3566}3567}35683569if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))3570goto out_retry;35713572/* Did it get truncated? 
*/3573if (unlikely(folio->mapping != mapping)) {3574folio_unlock(folio);3575folio_put(folio);3576goto retry_find;3577}3578VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);35793580/*3581* We have a locked folio in the page cache, now we need to check3582* that it's up-to-date. If not, it is going to be due to an error,3583* or because readahead was otherwise unable to retrieve it.3584*/3585if (unlikely(!folio_test_uptodate(folio))) {3586/*3587* If the invalidate lock is not held, the folio was in cache3588* and uptodate and now it is not. Strange but possible since we3589* didn't hold the page lock all the time. Let's drop3590* everything, get the invalidate lock and try again.3591*/3592if (!mapping_locked) {3593folio_unlock(folio);3594folio_put(folio);3595goto retry_find;3596}35973598/*3599* OK, the folio is really not uptodate. This can be because the3600* VMA has the VM_RAND_READ flag set, or because an error3601* arose. Let's read it in directly.3602*/3603goto page_not_uptodate;3604}36053606/*3607* We've made it this far and we had to drop our mmap_lock, now is the3608* time to return to the upper layer and have it re-find the vma and3609* redo the fault.3610*/3611if (fpin) {3612folio_unlock(folio);3613goto out_retry;3614}3615if (mapping_locked)3616filemap_invalidate_unlock_shared(mapping);36173618/*3619* Found the page and have a reference on it.3620* We must recheck i_size under page lock.3621*/3622max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);3623if (unlikely(index >= max_idx)) {3624folio_unlock(folio);3625folio_put(folio);3626return VM_FAULT_SIGBUS;3627}36283629vmf->page = folio_file_page(folio, index);3630return ret | VM_FAULT_LOCKED;36313632page_not_uptodate:3633/*3634* Umm, take care of errors if the page isn't up-to-date.3635* Try to re-read it _once_. We do this synchronously,3636* because there really aren't any performance issues here3637* and we need to check for errors.3638*/3639fpin = maybe_unlock_mmap_for_io(vmf, fpin);3640error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);3641if (fpin)3642goto out_retry;3643folio_put(folio);36443645if (!error || error == AOP_TRUNCATED_PAGE)3646goto retry_find;3647filemap_invalidate_unlock_shared(mapping);36483649return VM_FAULT_SIGBUS;36503651out_retry:3652/*3653* We dropped the mmap_lock, we need to return to the fault handler to3654* re-find the vma and come back and find our hopefully still populated3655* page.3656*/3657if (!IS_ERR(folio))3658folio_put(folio);3659if (mapping_locked)3660filemap_invalidate_unlock_shared(mapping);3661if (fpin)3662fput(fpin);3663return ret | VM_FAULT_RETRY;3664}3665EXPORT_SYMBOL(filemap_fault);36663667static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,3668pgoff_t start)3669{3670struct mm_struct *mm = vmf->vma->vm_mm;36713672/* Huge page is mapped? No need to proceed. */3673if (pmd_trans_huge(*vmf->pmd)) {3674folio_unlock(folio);3675folio_put(folio);3676return true;3677}36783679if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {3680struct page *page = folio_file_page(folio, start);3681vm_fault_t ret = do_set_pmd(vmf, folio, page);3682if (!ret) {3683/* The page is mapped successfully, reference consumed. 
*/3684folio_unlock(folio);3685return true;3686}3687}36883689if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)3690pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);36913692return false;3693}36943695static struct folio *next_uptodate_folio(struct xa_state *xas,3696struct address_space *mapping, pgoff_t end_pgoff)3697{3698struct folio *folio = xas_next_entry(xas, end_pgoff);3699unsigned long max_idx;37003701do {3702if (!folio)3703return NULL;3704if (xas_retry(xas, folio))3705continue;3706if (xa_is_value(folio))3707continue;3708if (!folio_try_get(folio))3709continue;3710if (folio_test_locked(folio))3711goto skip;3712/* Has the page moved or been split? */3713if (unlikely(folio != xas_reload(xas)))3714goto skip;3715if (!folio_test_uptodate(folio) || folio_test_readahead(folio))3716goto skip;3717if (!folio_trylock(folio))3718goto skip;3719if (folio->mapping != mapping)3720goto unlock;3721if (!folio_test_uptodate(folio))3722goto unlock;3723max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);3724if (xas->xa_index >= max_idx)3725goto unlock;3726return folio;3727unlock:3728folio_unlock(folio);3729skip:3730folio_put(folio);3731} while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);37323733return NULL;3734}37353736/*3737* Map page range [start_page, start_page + nr_pages) of folio.3738* start_page is gotten from start by folio_page(folio, start)3739*/3740static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,3741struct folio *folio, unsigned long start,3742unsigned long addr, unsigned int nr_pages,3743unsigned long *rss, unsigned short *mmap_miss,3744pgoff_t file_end)3745{3746struct address_space *mapping = folio->mapping;3747unsigned int ref_from_caller = 1;3748vm_fault_t ret = 0;3749struct page *page = folio_page(folio, start);3750unsigned int count = 0;3751pte_t *old_ptep = vmf->pte;3752unsigned long addr0;37533754/*3755* Map the large folio fully where possible:3756*3757* - The folio is fully within size of the file or belong3758* to shmem/tmpfs;3759* - The folio doesn't cross VMA boundary;3760* - The folio doesn't cross page table boundary;3761*/3762addr0 = addr - start * PAGE_SIZE;3763if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&3764folio_within_vma(folio, vmf->vma) &&3765(addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {3766vmf->pte -= start;3767page -= start;3768addr = addr0;3769nr_pages = folio_nr_pages(folio);3770}37713772do {3773if (PageHWPoison(page + count))3774goto skip;37753776/*3777* If there are too many folios that are recently evicted3778* in a file, they will probably continue to be evicted.3779* In such situation, read-ahead is only a waste of IO.3780* Don't decrease mmap_miss in this scenario to make sure3781* we can stop read-ahead.3782*/3783if (!folio_test_workingset(folio))3784(*mmap_miss)++;37853786/*3787* NOTE: If there're PTE markers, we'll leave them to be3788* handled in the specific fault path, and it'll prohibit the3789* fault-around logic.3790*/3791if (!pte_none(ptep_get(&vmf->pte[count])))3792goto skip;37933794count++;3795continue;3796skip:3797if (count) {3798set_pte_range(vmf, folio, page, count, addr);3799*rss += count;3800folio_ref_add(folio, count - ref_from_caller);3801ref_from_caller = 0;3802if (in_range(vmf->address, addr, count * PAGE_SIZE))3803ret = VM_FAULT_NOPAGE;3804}38053806count++;3807page += count;3808vmf->pte += count;3809addr += count * PAGE_SIZE;3810count = 0;3811} while (--nr_pages > 0);38123813if (count) {3814set_pte_range(vmf, folio, page, count, addr);3815*rss += 
		*rss += count;
		folio_ref_add(folio, count - ref_from_caller);
		ref_from_caller = 0;
		if (in_range(vmf->address, addr, count * PAGE_SIZE))
			ret = VM_FAULT_NOPAGE;
	}

	vmf->pte = old_ptep;
	if (ref_from_caller)
		/* Locked folios cannot get truncated. */
		folio_ref_dec(folio);

	return ret;
}

static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
		struct folio *folio, unsigned long addr,
		unsigned long *rss, unsigned short *mmap_miss)
{
	vm_fault_t ret = 0;
	struct page *page = &folio->page;

	if (PageHWPoison(page))
		goto out;

	/* See comment of filemap_map_folio_range() */
	if (!folio_test_workingset(folio))
		(*mmap_miss)++;

	/*
	 * NOTE: If there're PTE markers, we'll leave them to be
	 * handled in the specific fault path, and it'll prohibit
	 * the fault-around logic.
	 */
	if (!pte_none(ptep_get(vmf->pte)))
		goto out;

	if (vmf->address == addr)
		ret = VM_FAULT_NOPAGE;

	set_pte_range(vmf, folio, page, 1, addr);
	(*rss)++;
	return ret;

out:
	/* Locked folios cannot get truncated. */
	folio_ref_dec(folio);
	return ret;
}

vm_fault_t filemap_map_pages(struct vm_fault *vmf,
			     pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct vm_area_struct *vma = vmf->vma;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t file_end, last_pgoff = start_pgoff;
	unsigned long addr;
	XA_STATE(xas, &mapping->i_pages, start_pgoff);
	struct folio *folio;
	vm_fault_t ret = 0;
	unsigned long rss = 0;
	unsigned int nr_pages = 0, folio_type;
	unsigned short mmap_miss = 0, mmap_miss_saved;

	rcu_read_lock();
	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
	if (!folio)
		goto out;

	file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
	end_pgoff = min(end_pgoff, file_end);

	/*
	 * Do not allow to map with PMD across i_size to preserve
	 * SIGBUS semantics.
	 *
	 * Make an exception for shmem/tmpfs that for long time
	 * intentionally mapped with PMDs across i_size.
	 */
	if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
	    filemap_map_pmd(vmf, folio, start_pgoff)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
	if (!vmf->pte) {
		folio_unlock(folio);
		folio_put(folio);
		goto out;
	}

	folio_type = mm_counter_file(folio);
	do {
		unsigned long end;

		addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
		vmf->pte += xas.xa_index - last_pgoff;
		last_pgoff = xas.xa_index;
		end = folio_next_index(folio) - 1;
		nr_pages = min(end, end_pgoff) - xas.xa_index + 1;

		if (!folio_test_large(folio))
			ret |= filemap_map_order0_folio(vmf,
					folio, addr, &rss, &mmap_miss);
		else
			ret |= filemap_map_folio_range(vmf, folio,
					xas.xa_index - folio->index, addr,
					nr_pages, &rss, &mmap_miss, file_end);

		folio_unlock(folio);
	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
	add_mm_counter(vma->vm_mm, folio_type, rss);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
out:
	rcu_read_unlock();

	mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
	if (mmap_miss >= mmap_miss_saved)
		WRITE_ONCE(file->f_ra.mmap_miss, 0);
	else
		WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);

	return ret;
}
EXPORT_SYMBOL(filemap_map_pages);

vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct folio *folio = page_folio(vmf->page);
	vm_fault_t ret = VM_FAULT_LOCKED;

	sb_start_pagefault(mapping->host->i_sb);
	file_update_time(vmf->vma->vm_file);
	folio_lock(folio);
	if (folio->mapping != mapping) {
		folio_unlock(folio);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the folio dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty folio and writeprotect it again.
	 */
	folio_mark_dirty(folio);
	folio_wait_stable(folio);
out:
	sb_end_pagefault(mapping->host->i_sb);
	return ret;
}

const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};
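
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a filesystem that needs its own write-fault handling can still reuse
 * filemap_fault() and filemap_map_pages() for the read side of its file
 * mappings and only override ->page_mkwrite.  The myfs_* names below are
 * hypothetical placeholders:
 *
 *	static const struct vm_operations_struct myfs_file_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= myfs_page_mkwrite,
 *	};
 *
 *	static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		file_accessed(file);
 *		vma->vm_ops = &myfs_file_vm_ops;
 *		return 0;
 *	}
 */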

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->read_folio)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	return 0;
}

int generic_file_mmap_prepare(struct vm_area_desc *desc)
{
	struct file *file = desc->file;
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->read_folio)
		return -ENOEXEC;
	file_accessed(file);
	desc->vm_ops = &generic_file_vm_ops;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma_is_shared_maywrite(vma))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}

int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
{
	if (is_shared_maywrite(desc->vm_flags))
		return -EINVAL;
	return generic_file_mmap_prepare(desc);
}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
int generic_file_mmap_prepare(struct vm_area_desc *desc)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_mmap_prepare);
EXPORT_SYMBOL(generic_file_readonly_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);
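
/*
 * Illustrative sketch (editor's addition): a simple filesystem that keeps
 * its file data in the page cache can wire the generic helpers straight
 * into its file_operations, much as ext2 and similar filesystems do.  The
 * myfs_* name is a hypothetical placeholder; the generic_* and filemap_*
 * helpers are declared in <linux/fs.h>:
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *		.fsync		= generic_file_fsync,
 *		.splice_read	= filemap_splice_read,
 *	};
 */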

static struct folio *do_read_cache_folio(struct address_space *mapping,
		pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;
	int err;

	if (!filler)
		filler = mapping->a_ops->read_folio;
repeat:
	folio = filemap_get_folio(mapping, index);
	if (IS_ERR(folio)) {
		folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL);
		if (!folio)
			return ERR_PTR(-ENOMEM);
		index = mapping_align_index(mapping, index);
		err = filemap_add_folio(mapping, folio, index, gfp);
		if (unlikely(err)) {
			folio_put(folio);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

		goto filler;
	}
	if (folio_test_uptodate(folio))
		goto out;

	if (!folio_trylock(folio)) {
		folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
		goto repeat;
	}

	/* Folio was truncated from mapping */
	if (!folio->mapping) {
		folio_unlock(folio);
		folio_put(folio);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (folio_test_uptodate(folio)) {
		folio_unlock(folio);
		goto out;
	}

filler:
	err = filemap_read_folio(file, filler, folio);
	if (err) {
		folio_put(folio);
		if (err == AOP_TRUNCATED_PAGE)
			goto repeat;
		return ERR_PTR(err);
	}

out:
	folio_mark_accessed(folio);
	return folio;
}

/**
 * read_cache_folio - Read into page cache, fill it if needed.
 * @mapping: The address_space to read from.
 * @index: The index to read.
 * @filler: Function to perform the read, or NULL to use aops->read_folio().
 * @file: Passed to filler function, may be NULL if not required.
 *
 * Read one page into the page cache. If it succeeds, the folio returned
 * will contain @index, but it may not be the first page of the folio.
 *
 * If the filler function returns an error, it will be returned to the
 * caller.
 *
 * Context: May sleep. Expects mapping->invalidate_lock to be held.
 * Return: An uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
		filler_t filler, struct file *file)
{
	return do_read_cache_folio(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_folio);
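
/*
 * Illustrative sketch (editor's addition): a typical caller passes a NULL
 * filler so that ->read_folio() is used, and per the context note above
 * must hold mapping->invalidate_lock.  The helper name is hypothetical;
 * the caller drops the returned reference with folio_put() when done:
 *
 *	static struct folio *myfs_get_folio(struct address_space *mapping,
 *					    pgoff_t index)
 *	{
 *		struct folio *folio;
 *
 *		folio = read_cache_folio(mapping, index, NULL, NULL);
 *		if (IS_ERR(folio))
 *			return folio;
 *		return folio;	// uptodate, refcount held by the caller
 *	}
 */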

/**
 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
 * @mapping: The address_space for the folio.
 * @index: The index that the allocated folio will contain.
 * @gfp: The page allocator flags to use if allocating.
 *
 * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
 * any new memory allocations done using the specified allocation flags.
 *
 * The most likely error from this function is EIO, but ENOMEM is
 * possible and so is EINTR. If ->read_folio returns another error,
 * that will be returned to the caller.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: Uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
		pgoff_t index, gfp_t gfp)
{
	return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(mapping_read_folio_gfp);

static struct page *do_read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;

	folio = do_read_cache_folio(mapping, index, filler, file, gfp);
	if (IS_ERR(folio))
		return &folio->page;
	return folio_file_page(folio, index);
}

struct page *read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file)
{
	return do_read_cache_page(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the page's address_space
 * @index: the page index
 * @gfp: the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
static void dio_warn_stale_pagecache(struct file *filp)
{
	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
	char pathname[128];
	char *path;

	errseq_set(&filp->f_mapping->wb_err, -EIO);
	if (__ratelimit(&_rs)) {
		path = file_path(filp, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
			current->comm);
	}
}

void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;

	if (mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping,
			iocb->ki_pos >> PAGE_SHIFT,
			(iocb->ki_pos + count - 1) >> PAGE_SHIFT))
		dio_warn_stale_pagecache(iocb->ki_filp);
}

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	size_t write_len = iov_iter_count(from);
	ssize_t written;

	/*
	 * If a page can not be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	written = kiocb_invalidate_pages(iocb, write_len);
	if (written) {
		if (written == -EBUSY)
			return 0;
		return written;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing. Either one is a pretty crazy thing to do,
	 * so we don't support it 100%. If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * Noticeable example is a blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0) {
		struct inode *inode = mapping->host;
		loff_t pos = iocb->ki_pos;

		kiocb_invalidate_post_direct_write(iocb, written);
		pos += written;
		write_len -= written;
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	if (written != -EIOCBQUEUED)
		iov_iter_revert(from, write_len - iov_iter_count(from));
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	size_t chunk = mapping_max_folio_size(mapping);
	long status = 0;
	ssize_t written = 0;

	do {
		struct folio *folio;
		size_t offset;		/* Offset into folio */
		size_t bytes;		/* Bytes to write to folio */
		size_t copied;		/* Bytes copied from user */
		void *fsdata = NULL;

		bytes = iov_iter_count(i);
retry:
		offset = pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);
		balance_dirty_pages_ratelimited(mapping);

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(iocb, mapping, pos, bytes,
					    &folio, &fsdata);
		if (unlikely(status < 0))
			break;

		offset = offset_in_folio(folio, pos);
		if (bytes > folio_size(folio) - offset)
			bytes = folio_size(folio) - offset;

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		/*
		 * Faults here on mmap()s can recurse into arbitrary
		 * filesystem code. Lots of locks are held that can
		 * deadlock. Use an atomic copy to avoid deadlocking
		 * in page fault handling.
		 */
		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		flush_dcache_folio(folio);

		status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
					  folio, fsdata);
		if (unlikely(status != copied)) {
			iov_iter_revert(i, copied - max(status, 0L));
			if (unlikely(status < 0))
				break;
		}
		cond_resched();

		if (unlikely(status == 0)) {
			/*
			 * A short copy made ->write_end() reject the
			 * thing entirely. Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			if (chunk > PAGE_SIZE)
				chunk /= 2;
			if (copied) {
				bytes = copied;
				goto retry;
			}

			/*
			 * 'folio' is now unlocked and faults on it can be
			 * handled. Ensure forward progress by trying to
			 * fault it in now.
			 */
			if (fault_in_iov_iter_readable(i, bytes) == bytes) {
				status = -EFAULT;
				break;
			}
		} else {
			pos += status;
			written += status;
		}
	} while (iov_iter_count(i));

	if (!written)
		return status;
	iocb->ki_pos += written;
	return written;
}
EXPORT_SYMBOL(generic_perform_write);

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_rwsem to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes. Some filesystems do this for writes to
		 * holes, for example. For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
			return ret;
		return direct_write_fallback(iocb, from, ret,
				generic_perform_write(iocb, from));
	}

	return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:	IO state structure
 * @from:	iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in case of O_SYNC file
 * and acquires i_rwsem as needed.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);

/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * The address_space is trying to release any data attached to a folio
 * (presumably at folio->private).
 *
 * This will also be called if the private_2 flag is set on a page,
 * indicating that the folio has other metadata associated with it.
 *
 * The @gfp argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block
 * (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
	struct address_space * const mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	if (!folio_needs_release(folio))
		return true;
	if (folio_test_writeback(folio))
		return false;

	if (mapping && mapping->a_ops->release_folio)
		return mapping->a_ops->release_folio(folio, gfp);
	return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);

/**
 * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
 * @inode: The inode to flush
 * @flush: Set to write back rather than simply invalidate.
 * @start: First byte in the range.
 * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
 *       onwards.
 *
 * Invalidate all the folios on an inode that contribute to the specified
 * range, possibly writing them back first. Whilst the operation is
 * undertaken, the invalidate lock is held to prevent new folios from being
 * installed.
 */
int filemap_invalidate_inode(struct inode *inode, bool flush,
			     loff_t start, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t first = start >> PAGE_SHIFT;
	pgoff_t last = end >> PAGE_SHIFT;
	pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;

	if (!mapping || !mapping->nrpages || end < start)
		goto out;

	/* Prevent new folios from being added to the inode. */
	filemap_invalidate_lock(mapping);

	if (!mapping->nrpages)
		goto unlock;

	unmap_mapping_pages(mapping, first, nr, false);

	/* Write back the data if we're asked to. */
	if (flush)
		filemap_fdatawrite_range(mapping, start, end);

	/* Wait for writeback to complete on all folios and discard. */
	invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);

unlock:
	filemap_invalidate_unlock(mapping);
out:
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL_GPL(filemap_invalidate_inode);
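
/*
 * Illustrative sketch (editor's addition): to write back and then drop
 * every cached folio of an inode, a (GPL) caller can do:
 *
 *	ret = filemap_invalidate_inode(inode, true, 0, LLONG_MAX);
 *
 * Passing flush == false skips the writeback step and only unmaps and
 * invalidates the range.
 */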

#ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping:	The mapping to compute the statistics for.
 * @first_index:	The starting page cache index.
 * @last_index:	The final page index (inclusive).
 * @cs:	the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of dirty pages, number of pages marked for
 * writeback, and the number of (recently) evicted pages.
 */
static void filemap_cachestat(struct address_space *mapping,
		pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
	XA_STATE(xas, &mapping->i_pages, first_index);
	struct folio *folio;

	/* Flush stats (and potentially sleep) outside the RCU read section. */
	mem_cgroup_flush_stats_ratelimited(NULL);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_index) {
		int order;
		unsigned long nr_pages;
		pgoff_t folio_first_index, folio_last_index;

		/*
		 * Don't deref the folio. It is not pinned, and might
		 * get freed (and reused) underneath us.
		 *
		 * We *could* pin it, but that would be expensive for
		 * what should be a fast and lightweight syscall.
		 *
		 * Instead, derive all information of interest from
		 * the rcu-protected xarray.
		 */

		if (xas_retry(&xas, folio))
			continue;

		order = xas_get_order(&xas);
		nr_pages = 1 << order;
		folio_first_index = round_down(xas.xa_index, 1 << order);
		folio_last_index = folio_first_index + nr_pages - 1;

		/* Folios might straddle the range boundaries, only count covered pages */
		if (folio_first_index < first_index)
			nr_pages -= first_index - folio_first_index;

		if (folio_last_index > last_index)
			nr_pages -= folio_last_index - last_index;

		if (xa_is_value(folio)) {
			/* page is evicted */
			void *shadow = (void *)folio;
			bool workingset; /* not used */

			cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
			if (shmem_mapping(mapping)) {
				/* shmem file - in swap cache */
				swp_entry_t swp = radix_to_swp_entry(folio);

				/* swapin error results in poisoned entry */
				if (!softleaf_is_swap(swp))
					goto resched;

				/*
				 * Getting a swap entry from the shmem
				 * inode means we beat
				 * shmem_unuse(). rcu_read_lock()
				 * ensures swapoff waits for us before
				 * freeing the swapper space. However,
				 * we can race with swapping and
				 * invalidation, so there might not be
				 * a shadow in the swapcache (yet).
				 */
				shadow = swap_cache_get_shadow(swp);
				if (!shadow)
					goto resched;
			}
#endif
			if (workingset_test_recent(shadow, true, &workingset, false))
				cs->nr_recently_evicted += nr_pages;

			goto resched;
		}

		/* page is in cache */
		cs->nr_cache += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
			cs->nr_dirty += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
			cs->nr_writeback += nr_pages;

resched:
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}

/*
 * See mincore: reveal pagecache information only for files
 * that the calling process has write access to, or could (if
 * tried) open for writing.
 */
static inline bool can_do_cachestat(struct file *f)
{
	if (f->f_mode & FMODE_WRITE)
		return true;
	if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
		return true;
	return file_permission(f, MAY_WRITE) == 0;
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * bytes range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that was previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried range is [`off`, `off` + `len`]. If `len` == 0,
 * we will query in the range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. User should pass 0 (i.e. no flag specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
 *
 * return values:
 *  zero        - success
 *  -EFAULT     - cstat or cstat_range points to an illegal address
 *  -EINVAL     - invalid flags
 *  -EBADF      - invalid file descriptor
 *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
 */
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
		struct cachestat_range __user *, cstat_range,
		struct cachestat __user *, cstat, unsigned int, flags)
{
	CLASS(fd, f)(fd);
	struct address_space *mapping;
	struct cachestat_range csr;
	struct cachestat cs;
	pgoff_t first_index, last_index;

	if (fd_empty(f))
		return -EBADF;

	if (copy_from_user(&csr, cstat_range,
			sizeof(struct cachestat_range)))
		return -EFAULT;

	/* hugetlbfs is not supported */
	if (is_file_hugepages(fd_file(f)))
		return -EOPNOTSUPP;

	if (!can_do_cachestat(fd_file(f)))
		return -EPERM;

	if (flags != 0)
		return -EINVAL;

	first_index = csr.off >> PAGE_SHIFT;
	last_index =
		csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
	memset(&cs, 0, sizeof(struct cachestat));
	mapping = fd_file(f)->f_mapping;
	filemap_cachestat(mapping, first_index, last_index, &cs);

	if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
		return -EFAULT;

	return 0;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */
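
/*
 * Illustrative sketch (editor's addition): userspace invokes cachestat(2)
 * through syscall(2) until a libc wrapper is available.  This assumes a
 * uapi that provides __NR_cachestat and the cachestat structures from
 * <linux/mman.h>; per the comment above, len == 0 queries from off to the
 * end of the file:
 *
 *	#include <linux/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int report_cache(int fd)
 *	{
 *		struct cachestat_range range = { .off = 0, .len = 0 };
 *		struct cachestat cs;
 *
 *		if (syscall(__NR_cachestat, fd, &range, &cs, 0))
 *			return -1;
 *		printf("cached %llu dirty %llu writeback %llu evicted %llu recent %llu\n",
 *		       (unsigned long long)cs.nr_cache,
 *		       (unsigned long long)cs.nr_dirty,
 *		       (unsigned long long)cs.nr_writeback,
 *		       (unsigned long long)cs.nr_evicted,
 *		       (unsigned long long)cs.nr_recently_evicted);
 *		return 0;
 *	}
 */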