// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <[email protected]>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <[email protected]>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem			(truncate_pagecache)
 *    ->private_lock			(__free_pte->block_dirty_folio)
 *      ->swap_lock			(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock			(acquired by fs in truncate path)
 *      ->i_mmap_rwsem			(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock		(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock			(filemap_fault)
 *      ->lock_page			(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem				(generic_perform_write)
 *    ->mmap_lock			(fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock				(fs/fs-writeback.c)
 *    ->i_pages lock			(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock			(vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock			(try_to_unmap_one)
 *    ->private_lock			(try_to_unmap_one)
 *    ->i_pages lock			(try_to_unmap_one)
 *    ->lruvec->lru_lock		(follow_page_mask->mark_page_accessed)
 *    ->lruvec->lru_lock		(check_pte_range->folio_isolate_lru)
 *    ->private_lock			(folio_remove_rmap_pte->set_page_dirty)
 *    ->i_pages lock			(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock			(folio_remove_rmap_pte->set_page_dirty)
 *    ->inode->i_lock			(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock			(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock			(zap_pte_range->set_page_dirty)
 *    ->private_lock			(zap_pte_range->block_dirty_folio)
 */

static void page_cache_delete(struct address_space *mapping,
				struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);
	long nr = 1;

	mapping_set_update(&xas, mapping);

	xas_set_order(&xas, folio->index, folio_order(folio));
	nr = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	folio->mapping = NULL;
	/* Leave folio->index set: truncation lookup relies upon it */
	mapping->nrpages -= nr;
}

static void filemap_unaccount_folio(struct address_space *mapping,
		struct folio *folio)
{
	long nr;

	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
		pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
			 current->comm, folio_pfn(folio));
		dump_page(&folio->page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
			int mapcount = folio_mapcount(folio);

			if (folio_ref_count(folio) >= mapcount + 2) {
				/*
				 * All vmas have already been torn down, so it's
				 * a good bet that actually the page is unmapped
				 * and we'd rather not leak it: if we're wrong,
				 * another bad page check should catch it later.
				 */
				atomic_set(&folio->_mapcount, -1);
				folio_ref_sub(folio, mapcount);
			}
		}
	}

	/* hugetlb folios do not participate in page cache accounting. */
	if (folio_test_hugetlb(folio))
		return;

	nr = folio_nr_pages(folio);

	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	if (folio_test_swapbacked(folio)) {
		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
		if (folio_test_pmd_mappable(folio))
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
	} else if (folio_test_pmd_mappable(folio)) {
		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
		filemap_nr_thps_dec(mapping);
	}

	/*
	 * At this point folio must be either written or cleaned by
	 * truncate.  Dirty folio here signals a bug and loss of
	 * unwritten data - on ordinary filesystems.
	 *
	 * But it's harmless on in-memory filesystems like tmpfs; and can
	 * occur when a driver which did get_user_pages() sets page dirty
	 * before putting it, while the inode is being finally evicted.
	 *
	 * Below fixes dirty accounting after removing the folio entirely
	 * but leaves the dirty flag set: it has no effect for truncated
	 * folio and anyway will be cleared before returning folio to
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
			 mapping_can_writeback(mapping)))
		folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
	struct address_space *mapping = folio->mapping;

	trace_mm_filemap_delete_from_page_cache(folio);
	filemap_unaccount_folio(mapping, folio);
	page_cache_delete(mapping, folio, shadow);
}

void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
	void (*free_folio)(struct folio *);

	free_folio = mapping->a_ops->free_folio;
	if (free_folio)
		free_folio(folio);

	folio_put_refs(folio, folio_nr_pages(folio));
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	filemap_free_folio(mapping, folio);
}

/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping.  The function expects @fbatch to be sorted
 * by page index and is optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
			     struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
	long total_pages = 0;
	int i = 0;
	struct folio *folio;

	mapping_set_update(&xas, mapping);
	xas_for_each(&xas, folio, ULONG_MAX) {
		if (i >= folio_batch_count(fbatch))
			break;

		/* A swap/dax/shadow entry got inserted? Skip it. */
		if (xa_is_value(folio))
			continue;
		/*
		 * A page got inserted in our range? Skip it. We have our
		 * pages locked so they are protected from being removed.
		 * If we see a page whose index is higher than ours, it
		 * means our page has been removed, which shouldn't be
		 * possible because we're holding the PageLock.
		 */
		if (folio != fbatch->folios[i]) {
			VM_BUG_ON_FOLIO(folio->index >
					fbatch->folios[i]->index, folio);
			continue;
		}

		WARN_ON_ONCE(!folio_test_locked(folio));

		folio->mapping = NULL;
		/* Leave folio->index set: truncation lookup relies on it */

		i++;
		xas_store(&xas, NULL);
		total_pages += folio_nr_pages(folio);
	}
	mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
				  struct folio_batch *fbatch)
{
	int i;

	if (!folio_batch_count(fbatch))
		return;

	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		trace_mm_filemap_delete_from_page_cache(folio);
		filemap_unaccount_folio(mapping, folio);
	}
	page_cache_delete_batch(mapping, fbatch);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	for (i = 0; i < folio_batch_count(fbatch); i++)
		filemap_free_folio(mapping, fbatch->folios[i]);
}

int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}
EXPORT_SYMBOL(filemap_check_errors);

static int filemap_check_and_keep_errors(struct address_space *mapping)
{
	/* Check for outstanding write errors */
	if (test_bit(AS_EIO, &mapping->flags))
		return -EIO;
	if (test_bit(AS_ENOSPC, &mapping->flags))
		return -ENOSPC;
	return 0;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping: address space structure to write
 * @wbc: the writeback_control controlling the writeout
 *
 * Call writepages on the mapping using the provided wbc to control the
 * writeout.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	int ret;

	if (!mapping_can_writeback(mapping) ||
	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	wbc_attach_fdatawrite_inode(wbc, mapping->host);
	ret = do_writepages(mapping, wbc);
	wbc_detach_inode(wbc);
	return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping: address space structure to write
 * @start: offset in bytes where the range starts
 * @end: offset in bytes where the range ends (inclusive)
 * @sync_mode: enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
		loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	return filemap_fdatawrite_wbc(mapping, &wbc);
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_fdatawrite_range_kick - start writeback on a range
 * @mapping: target address_space
 * @start: index to start writeback on
 * @end: last (inclusive) index for writeback
 *
 * This is a non-integrity writeback helper, to start writing back folios
 * for the indicated range.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
		loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
}
EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);

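/*
 * Example (illustrative sketch only, not used by this file): a filesystem
 * that wants to push a byte range for data integrity would use the
 * WB_SYNC_ALL helper above, while an opportunistic cleaning pass can use
 * WB_SYNC_NONE.  "mapping", "pos", "len" and "err" are hypothetical caller
 * state; both end offsets are inclusive, as the kerneldoc above notes:
 *
 *	// data-integrity writeback of [pos, pos + len) before a direct read
 *	err = filemap_fdatawrite_range(mapping, pos, pos + len - 1);
 *
 *	// background, non-waiting writeback of the same byte range
 *	err = __filemap_fdatawrite_range(mapping, pos, pos + len - 1,
 *					 WB_SYNC_NONE);
 */
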
/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping: target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping: address space within which to check
 * @start_byte: offset in bytes where the range starts
 * @end_byte: offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	struct folio *folio;
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	for (;;) {
		folio = xas_find(&xas, max);
		if (xas_retry(&xas, folio))
			continue;
		/* Shadow entries don't count */
		if (xa_is_value(folio))
			continue;
		/*
		 * We don't need to try to pin this page; we're about to
		 * release the RCU lock anyway.  It is enough to know that
		 * there was a page here recently.
		 */
		break;
	}
	rcu_read_unlock();

	return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct folio_batch fbatch;
	unsigned nr_folios;

	folio_batch_init(&fbatch);

	while (index <= end) {
		unsigned i;

		nr_folios = filemap_get_folios_tag(mapping, &index, end,
				PAGECACHE_TAG_WRITEBACK, &fbatch);

		if (!nr_folios)
			break;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			folio_wait_writeback(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages;
}

bool filemap_range_has_writeback(struct address_space *mapping,
				 loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, folio, max) {
		if (xas_retry(&xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
				folio_test_writeback(folio))
			break;
	}
	rcu_read_unlock();
	return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0, err2;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = filemap_check_errors(mapping);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 *				   reported and advance wb_err to the current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop-in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:	file pointing to address_space with pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	int err = 0, err2;
	struct address_space *mapping = file->f_mapping;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = file_check_and_advance_wb_err(file);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);

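/*
 * Example (illustrative sketch, not part of this file): a minimal ->fsync()
 * built from the helpers above simply writes and waits on the range; the
 * per-file wb_err cursor is advanced internally via
 * file_check_and_advance_wb_err(), so each open file sees a given writeback
 * error exactly once.  "example_fsync" is a hypothetical name:
 *
 *	static int example_fsync(struct file *file, loff_t start, loff_t end,
 *				 int datasync)
 *	{
 *		return file_write_and_wait_range(file, start, end);
 *	}
 */
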
/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:	folio to be replaced
 * @new:	folio to replace with
 *
 * This function replaces a folio in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new folio and
 * drops it for the old folio.  Both the old and new folios must be
 * locked.  This function does not add the new folio to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
	struct address_space *mapping = old->mapping;
	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(new->mapping, new);

	folio_get(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_replace_folio(old, new);

	xas_lock_irq(&xas);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!folio_test_hugetlb(old))
		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
	if (!folio_test_hugetlb(new))
		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
	if (folio_test_swapbacked(old))
		__lruvec_stat_sub_folio(old, NR_SHMEM);
	if (folio_test_swapbacked(new))
		__lruvec_stat_add_folio(new, NR_SHMEM);
	xas_unlock_irq(&xas);
	if (free_folio)
		free_folio(old);
	folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

noinline int __filemap_add_folio(struct address_space *mapping,
		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	bool huge;
	long nr;
	unsigned int forder = folio_order(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
	VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
			folio);
	mapping_set_update(&xas, mapping);

	VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
	huge = folio_test_hugetlb(folio);
	nr = folio_nr_pages(folio);

	gfp &= GFP_RECLAIM_MASK;
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = xas.xa_index;

	for (;;) {
		int order = -1;
		void *entry, *old = NULL;

		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
			/*
			 * If a larger entry exists,
			 * it will be the first and only entry iterated.
			 */
			if (order == -1)
				order = xas_get_order(&xas);
		}

		if (old) {
			if (order > 0 && order > forder) {
				unsigned int split_order = max(forder,
						xas_try_split_min_order(order));

				/* How to handle large swap entries? */
				BUG_ON(shmem_mapping(mapping));

				while (order > forder) {
					xas_set_order(&xas, index, split_order);
					xas_try_split(&xas, old, order);
					if (xas_error(&xas))
						goto unlock;
					order = split_order;
					split_order =
						max(xas_try_split_min_order(
							    split_order),
						    forder);
				}
				xas_reset(&xas);
			}
			if (shadowp)
				*shadowp = old;
		}

		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;

		mapping->nrpages += nr;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge) {
			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
			if (folio_test_pmd_mappable(folio))
				__lruvec_stat_mod_folio(folio,
						NR_FILE_THPS, nr);
		}

unlock:
		xas_unlock_irq(&xas);

		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		goto error;

	trace_mm_filemap_add_to_page_cache(folio);
	return 0;
error:
	folio->mapping = NULL;
	/* Leave folio->index set: truncation relies upon it */
	folio_put_refs(folio, nr);
	return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

int filemap_add_folio(struct address_space *mapping, struct folio *folio,
				pgoff_t index, gfp_t gfp)
{
	void *shadow = NULL;
	int ret;

	ret = mem_cgroup_charge(folio, NULL, gfp);
	if (ret)
		return ret;

	__folio_set_locked(folio);
	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
	if (unlikely(ret)) {
		mem_cgroup_uncharge(folio);
		__folio_clear_locked(folio);
	} else {
		/*
		 * The folio might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed folio.
		 * The exception is folios getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(folio_test_active(folio));
		if (!(gfp & __GFP_WRITE) && shadow)
			workingset_refault(folio, shadow);
		folio_add_lru(folio);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);

#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
	int n;
	struct folio *folio;

	if (cpuset_do_page_mem_spread()) {
		unsigned int cpuset_mems_cookie;
		do {
			cpuset_mems_cookie = read_mems_allowed_begin();
			n = cpuset_mem_spread_node();
			folio = __folio_alloc_node_noprof(gfp, order, n);
		} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));

		return folio;
	}
	return folio_alloc_noprof(gfp, order);
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif

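/*
 * Example (illustrative sketch, not taken from this file): the typical
 * read-path pattern is to allocate a folio and insert it at @index, falling
 * back to a fresh lookup on -EEXIST because another task won the insertion
 * race.  "mapping", "index", "err" and the retry label are hypothetical
 * caller state:
 *
 *	struct folio *folio;
 *
 *	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
 *	if (!folio)
 *		return -ENOMEM;
 *	err = filemap_add_folio(mapping, folio, index,
 *				mapping_gfp_mask(mapping));
 *	if (err) {
 *		folio_put(folio);
 *		if (err == -EEXIST)
 *			goto retry_lookup;
 *		return err;
 *	}
 *	// on success the folio is locked, in the page cache and on the LRU
 */
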
/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2)
{
	if (mapping1 > mapping2)
		swap(mapping1, mapping2);
	if (mapping1)
		down_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		down_write_nested(&mapping2->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2)
{
	if (mapping1)
		up_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}

/* How many times do we accept lock stealing from under a waiter? */
static int sysctl_page_lock_unfairness = 5;
static const struct ctl_table filemap_sysctl_table[] = {
	{
		.procname	= "page_lock_unfairness",
		.data		= &sysctl_page_lock_unfairness,
		.maxlen		= sizeof(sysctl_page_lock_unfairness),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	}
};

void __init pagecache_init(void)
{
	int i;

	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(&folio_wait_table[i]);

	page_writeback_init();
	register_sysctl_init("vm", filemap_sysctl_table);
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *	Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *	This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior. If the lock
 *	cannot be taken, we stop walking the wait queue without waking
 *	the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->folio->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->folio->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in folio_wait_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	struct wait_page_key key;
	unsigned long flags;

	key.folio = folio;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key(q, TASK_NORMAL, &key);

	/*
	 * It's possible to miss clearing waiters here, when we woke our page
	 * waiters, but the hashed waitqueue has waiters for other pages on it.
	 * That's okay, it's a rare case. The next waker will clear it.
	 *
	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
	 * other), the flag may be cleared in the course of freeing the page;
	 * but that is not required for correctness.
	 */
	if (!waitqueue_active(q) || !key.page_match)
		folio_clear_waiters(folio);

	spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __folio_lock() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * folio_wait_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like folio_put_wait_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the folio flag, and mark us done
 * if successful.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &folio->flags))
			return false;
	} else if (test_bit(bit_nr, &folio->flags))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
		int state, enum behavior behavior)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;

	if (bit_nr == PG_locked &&
	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the folio_set_waiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the folio.
	 */
	if (behavior == DROP)
		folio_put(folio);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the folio waiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;
	wait_queue_head_t *q;
	struct folio *folio = pfn_swap_entry_folio(entry);

	q = folio_waitqueue(folio);
	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = PG_locked;
	wait->flags = 0;

	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * If a migration entry exists for the page the migration path must hold
	 * a valid reference to the page, and it must take the ptl to remove the
	 * migration entry. So the page is valid until the ptl is dropped.
	 */
	spin_unlock(ptl);

	for (;;) {
		unsigned int flags;

		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
				break;

			io_schedule();
			continue;
		}
		break;
	}

	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
#endif

void folio_wait_bit(struct folio *folio, int bit_nr)
{
	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
	return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Unlocks the folio and wakes up any thread sleeping on the page lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
	/* Bit 7 allows x86 to check the byte's sign bit */
	BUILD_BUG_ON(PG_waiters != 7);
	BUILD_BUG_ON(PG_locked > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);

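/*
 * Example (illustrative sketch, not from this file): callers that sleep on
 * the folio lock normally recheck ->mapping afterwards, because the folio
 * may have been truncated while they slept.  "mapping", "index" and the
 * retry label are hypothetical caller state:
 *
 *	folio = filemap_get_folio(mapping, index);
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	folio_lock(folio);
 *	if (folio->mapping != mapping) {	// raced with truncation
 *		folio_unlock(folio);
 *		folio_put(folio);
 *		goto retry;
 *	}
 *	// ... use the locked folio ...
 *	folio_unlock(folio);
 *	folio_put(folio);
 */
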
/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * When all reads against a folio have completed, filesystems should
 * call this function to let the pagecache know that no more reads
 * are outstanding.  This will unlock the folio and wake up any thread
 * sleeping on the lock.  The folio will also be marked uptodate if all
 * reads succeeded.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
	unsigned long mask = 1 << PG_locked;

	/* Must be in bottom byte for x86 to work */
	BUILD_BUG_ON(PG_uptodate > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);

	if (likely(success))
		mask |= 1 << PG_uptodate;
	if (folio_xor_flags_has_waiters(folio, mask))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 */
void folio_end_private_2(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
	folio_wake_bit(folio, PG_private_2);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 to be cleared on a folio.
 */
void folio_wait_private_2(struct folio *folio)
{
	while (folio_test_private_2(folio))
		folio_wait_bit(folio, PG_private_2);
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
 * received by the calling task.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int folio_wait_private_2_killable(struct folio *folio)
{
	int ret = 0;

	while (folio_test_private_2(folio)) {
		ret = folio_wait_bit_killable(folio, PG_private_2);
		if (ret < 0)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

static void filemap_end_dropbehind(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return;
	if (!folio_test_clear_dropbehind(folio))
		return;
	if (mapping)
		folio_unmap_invalidate(mapping, folio, 0);
}

/*
 * If folio was marked as dropbehind, then pages should be dropped when writeback
 * completes. Do that now. If we fail, it's likely because of a big folio -
 * just reset dropbehind for that case and latter completions should invalidate.
 */
static void filemap_end_dropbehind_write(struct folio *folio)
{
	if (!folio_test_dropbehind(folio))
		return;

	/*
	 * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
	 * but can happen if normal writeback just happens to find dirty folios
	 * that were created as part of uncached writeback, and that writeback
	 * would otherwise not need non-IRQ handling. Just skip the
	 * invalidation in that case.
	 */
	if (in_task() && folio_trylock(folio)) {
		filemap_end_dropbehind(folio);
		folio_unlock(folio);
	}
}

/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

	/*
	 * folio_test_clear_reclaim() could be used here but it is an
	 * atomic operation and overkill in this particular case. Failing
	 * to shuffle a folio marked for immediate reclaim is too mild
	 * a gain to justify taking an atomic operation penalty at the
	 * end of every folio writeback.
	 */
	if (folio_test_reclaim(folio)) {
		folio_clear_reclaim(folio);
		folio_rotate_reclaimable(folio);
	}

	/*
	 * Writeback does not hold a folio reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the folio is not freed and
	 * reused before the folio_wake_bit().
	 */
	folio_get(folio);
	if (__folio_end_writeback(folio))
		folio_wake_bit(folio, PG_writeback);

	filemap_end_dropbehind_write(folio);
	acct_reclaim_writeback(folio);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);

/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 */
void __folio_lock(struct folio *folio)
{
	folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

int __folio_lock_killable(struct folio *folio)
{
	return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
	struct wait_queue_head *q = folio_waitqueue(folio);
	int ret;

	wait->folio = folio;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	folio_set_waiters(folio);
	ret = !folio_trylock(folio);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}

/*
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
	unsigned int flags = vmf->flags;

	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock/per-VMA lock is not
		 * released even though returning VM_FAULT_RETRY.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return VM_FAULT_RETRY;

		release_fault_lock(vmf);
		if (flags & FAULT_FLAG_KILLABLE)
			folio_wait_locked_killable(folio);
		else
			folio_wait_locked(folio);
		return VM_FAULT_RETRY;
	}
	if (flags & FAULT_FLAG_KILLABLE) {
		bool ret;

		ret = __folio_lock_killable(folio);
		if (ret) {
			release_fault_lock(vmf);
			return VM_FAULT_RETRY;
		}
	} else {
		__folio_lock(folio);
	}

	return 0;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);
	unsigned long nr = max_scan;

	while (nr--) {
		void *entry = xas_next(&xas);
		if (!entry || xa_is_value(entry))
			return xas.xa_index;
		if (xas.xa_index == 0)
			return 0;
	}

	return index + max_scan;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_prev(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == ULONG_MAX)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

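/*
 * Example (illustrative sketch, "index" is hypothetical caller state):
 * readahead-style code uses the gap helpers above to size a contiguous run
 * of already-cached slots.  Per the kerneldoc, "no gap found" is signalled
 * by the return value landing outside the scanned range:
 *
 *	pgoff_t gap = page_cache_next_miss(mapping, index, 32);
 *
 *	if (gap - index >= 32)
 *		;	// no hole within the scanned window
 *	else
 *		;	// gap is the first scanned index with no entry
 */
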
Return the page to the page allocator1841*1842* This means that any page may have its reference count temporarily1843* increased by a speculative page cache (or GUP-fast) lookup as it can1844* be allocated by another user before the RCU grace period expires.1845* Because the refcount temporarily acquired here may end up being the1846* last refcount on the page, any page allocation must be freeable by1847* folio_put().1848*/18491850/*1851* filemap_get_entry - Get a page cache entry.1852* @mapping: the address_space to search1853* @index: The page cache index.1854*1855* Looks up the page cache entry at @mapping & @index. If it is a folio,1856* it is returned with an increased refcount. If it is a shadow entry1857* of a previously evicted folio, or a swap entry from shmem/tmpfs,1858* it is returned without further action.1859*1860* Return: The folio, swap or shadow entry, %NULL if nothing is found.1861*/1862void *filemap_get_entry(struct address_space *mapping, pgoff_t index)1863{1864XA_STATE(xas, &mapping->i_pages, index);1865struct folio *folio;18661867rcu_read_lock();1868repeat:1869xas_reset(&xas);1870folio = xas_load(&xas);1871if (xas_retry(&xas, folio))1872goto repeat;1873/*1874* A shadow entry of a recently evicted page, or a swap entry from1875* shmem/tmpfs. Return it without attempting to raise page count.1876*/1877if (!folio || xa_is_value(folio))1878goto out;18791880if (!folio_try_get(folio))1881goto repeat;18821883if (unlikely(folio != xas_reload(&xas))) {1884folio_put(folio);1885goto repeat;1886}1887out:1888rcu_read_unlock();18891890return folio;1891}18921893/**1894* __filemap_get_folio - Find and get a reference to a folio.1895* @mapping: The address_space to search.1896* @index: The page index.1897* @fgp_flags: %FGP flags modify how the folio is returned.1898* @gfp: Memory allocation flags to use if %FGP_CREAT is specified.1899*1900* Looks up the page cache entry at @mapping & @index.1901*1902* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even1903* if the %GFP flags specified for %FGP_CREAT are atomic.1904*1905* If this function returns a folio, it is returned with an increased refcount.1906*1907* Return: The found folio or an ERR_PTR() otherwise.1908*/1909struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,1910fgf_t fgp_flags, gfp_t gfp)1911{1912struct folio *folio;19131914repeat:1915folio = filemap_get_entry(mapping, index);1916if (xa_is_value(folio))1917folio = NULL;1918if (!folio)1919goto no_page;19201921if (fgp_flags & FGP_LOCK) {1922if (fgp_flags & FGP_NOWAIT) {1923if (!folio_trylock(folio)) {1924folio_put(folio);1925return ERR_PTR(-EAGAIN);1926}1927} else {1928folio_lock(folio);1929}19301931/* Has the page been truncated? 
*/1932if (unlikely(folio->mapping != mapping)) {1933folio_unlock(folio);1934folio_put(folio);1935goto repeat;1936}1937VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);1938}19391940if (fgp_flags & FGP_ACCESSED)1941folio_mark_accessed(folio);1942else if (fgp_flags & FGP_WRITE) {1943/* Clear idle flag for buffer write */1944if (folio_test_idle(folio))1945folio_clear_idle(folio);1946}19471948if (fgp_flags & FGP_STABLE)1949folio_wait_stable(folio);1950no_page:1951if (!folio && (fgp_flags & FGP_CREAT)) {1952unsigned int min_order = mapping_min_folio_order(mapping);1953unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));1954int err;1955index = mapping_align_index(mapping, index);19561957if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))1958gfp |= __GFP_WRITE;1959if (fgp_flags & FGP_NOFS)1960gfp &= ~__GFP_FS;1961if (fgp_flags & FGP_NOWAIT) {1962gfp &= ~GFP_KERNEL;1963gfp |= GFP_NOWAIT | __GFP_NOWARN;1964}1965if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))1966fgp_flags |= FGP_LOCK;19671968if (order > mapping_max_folio_order(mapping))1969order = mapping_max_folio_order(mapping);1970/* If we're not aligned, allocate a smaller folio */1971if (index & ((1UL << order) - 1))1972order = __ffs(index);19731974do {1975gfp_t alloc_gfp = gfp;19761977err = -ENOMEM;1978if (order > min_order)1979alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;1980folio = filemap_alloc_folio(alloc_gfp, order);1981if (!folio)1982continue;19831984/* Init accessed so avoid atomic mark_page_accessed later */1985if (fgp_flags & FGP_ACCESSED)1986__folio_set_referenced(folio);1987if (fgp_flags & FGP_DONTCACHE)1988__folio_set_dropbehind(folio);19891990err = filemap_add_folio(mapping, folio, index, gfp);1991if (!err)1992break;1993folio_put(folio);1994folio = NULL;1995} while (order-- > min_order);19961997if (err == -EEXIST)1998goto repeat;1999if (err) {2000/*2001* When NOWAIT I/O fails to allocate folios this could2002* be due to a nonblocking memory allocation and not2003* because the system actually is out of memory.2004* Return -EAGAIN so that there caller retries in a2005* blocking fashion instead of propagating -ENOMEM2006* to the application.2007*/2008if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)2009err = -EAGAIN;2010return ERR_PTR(err);2011}2012/*2013* filemap_add_folio locks the page, and for mmap2014* we expect an unlocked page.2015*/2016if (folio && (fgp_flags & FGP_FOR_MMAP))2017folio_unlock(folio);2018}20192020if (!folio)2021return ERR_PTR(-ENOENT);2022/* not an uncached lookup, clear uncached if set */2023if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))2024folio_clear_dropbehind(folio);2025return folio;2026}2027EXPORT_SYMBOL(__filemap_get_folio);20282029static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,2030xa_mark_t mark)2031{2032struct folio *folio;20332034retry:2035if (mark == XA_PRESENT)2036folio = xas_find(xas, max);2037else2038folio = xas_find_marked(xas, max, mark);20392040if (xas_retry(xas, folio))2041goto retry;2042/*2043* A shadow entry of a recently evicted page, a swap2044* entry from shmem/tmpfs or a DAX entry. 
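/*
 * Illustrative sketch (not part of this file): the usual find-or-create
 * pattern with __filemap_get_folio(), which returns an ERR_PTR() rather than
 * NULL on failure.  The helper name and the flag combination are assumptions
 * chosen for the example.
 */
static int example_grab_and_release(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT,
			mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);	/* -ENOMEM, or -EAGAIN with FGP_NOWAIT */

	/* Returned locked (FGP_LOCK) and with a reference held. */
	folio_unlock(folio);
	folio_put(folio);
	return 0;
}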
Return it2045* without attempting to raise page count.2046*/2047if (!folio || xa_is_value(folio))2048return folio;20492050if (!folio_try_get(folio))2051goto reset;20522053if (unlikely(folio != xas_reload(xas))) {2054folio_put(folio);2055goto reset;2056}20572058return folio;2059reset:2060xas_reset(xas);2061goto retry;2062}20632064/**2065* find_get_entries - gang pagecache lookup2066* @mapping: The address_space to search2067* @start: The starting page cache index2068* @end: The final page index (inclusive).2069* @fbatch: Where the resulting entries are placed.2070* @indices: The cache indices corresponding to the entries in @entries2071*2072* find_get_entries() will search for and return a batch of entries in2073* the mapping. The entries are placed in @fbatch. find_get_entries()2074* takes a reference on any actual folios it returns.2075*2076* The entries have ascending indexes. The indices may not be consecutive2077* due to not-present entries or large folios.2078*2079* Any shadow entries of evicted folios, or swap entries from2080* shmem/tmpfs, are included in the returned array.2081*2082* Return: The number of entries which were found.2083*/2084unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,2085pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)2086{2087XA_STATE(xas, &mapping->i_pages, *start);2088struct folio *folio;20892090rcu_read_lock();2091while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {2092indices[fbatch->nr] = xas.xa_index;2093if (!folio_batch_add(fbatch, folio))2094break;2095}20962097if (folio_batch_count(fbatch)) {2098unsigned long nr;2099int idx = folio_batch_count(fbatch) - 1;21002101folio = fbatch->folios[idx];2102if (!xa_is_value(folio))2103nr = folio_nr_pages(folio);2104else2105nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);2106*start = round_down(indices[idx] + nr, nr);2107}2108rcu_read_unlock();21092110return folio_batch_count(fbatch);2111}21122113/**2114* find_lock_entries - Find a batch of pagecache entries.2115* @mapping: The address_space to search.2116* @start: The starting page cache index.2117* @end: The final page index (inclusive).2118* @fbatch: Where the resulting entries are placed.2119* @indices: The cache indices of the entries in @fbatch.2120*2121* find_lock_entries() will return a batch of entries from @mapping.2122* Swap, shadow and DAX entries are included. Folios are returned2123* locked and with an incremented refcount. Folios which are locked2124* by somebody else or under writeback are skipped. Folios which are2125* partially outside the range are not returned.2126*2127* The entries have ascending indexes. 
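/*
 * Illustrative sketch (not part of this file): walking the mixed batch that
 * find_get_entries() fills, in the style of the truncate/invalidate callers.
 * Value entries (shadow/swap) carry no reference; only real folios need
 * folio_put().  The helper name is made up for the example.
 */
static void example_scan_entries(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	unsigned int i;

	folio_batch_init(&fbatch);
	while (find_get_entries(mapping, &start, end, &fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			if (xa_is_value(folio))
				continue;	/* shadow or swap entry */
			folio_put(folio);
		}
		folio_batch_reinit(&fbatch);
		cond_resched();
	}
}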
The indices may not be consecutive2128* due to not-present entries, large folios, folios which could not be2129* locked or folios under writeback.2130*2131* Return: The number of entries which were found.2132*/2133unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,2134pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)2135{2136XA_STATE(xas, &mapping->i_pages, *start);2137struct folio *folio;21382139rcu_read_lock();2140while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {2141unsigned long base;2142unsigned long nr;21432144if (!xa_is_value(folio)) {2145nr = folio_nr_pages(folio);2146base = folio->index;2147/* Omit large folio which begins before the start */2148if (base < *start)2149goto put;2150/* Omit large folio which extends beyond the end */2151if (base + nr - 1 > end)2152goto put;2153if (!folio_trylock(folio))2154goto put;2155if (folio->mapping != mapping ||2156folio_test_writeback(folio))2157goto unlock;2158VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),2159folio);2160} else {2161nr = 1 << xas_get_order(&xas);2162base = xas.xa_index & ~(nr - 1);2163/* Omit order>0 value which begins before the start */2164if (base < *start)2165continue;2166/* Omit order>0 value which extends beyond the end */2167if (base + nr - 1 > end)2168break;2169}21702171/* Update start now so that last update is correct on return */2172*start = base + nr;2173indices[fbatch->nr] = xas.xa_index;2174if (!folio_batch_add(fbatch, folio))2175break;2176continue;2177unlock:2178folio_unlock(folio);2179put:2180folio_put(folio);2181}2182rcu_read_unlock();21832184return folio_batch_count(fbatch);2185}21862187/**2188* filemap_get_folios - Get a batch of folios2189* @mapping: The address_space to search2190* @start: The starting page index2191* @end: The final page index (inclusive)2192* @fbatch: The batch to fill.2193*2194* Search for and return a batch of folios in the mapping starting at2195* index @start and up to index @end (inclusive). The folios are returned2196* in @fbatch with an elevated reference count.2197*2198* Return: The number of folios which were found.2199* We also update @start to index the next folio for the traversal.2200*/2201unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,2202pgoff_t end, struct folio_batch *fbatch)2203{2204return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);2205}2206EXPORT_SYMBOL(filemap_get_folios);22072208/**2209* filemap_get_folios_contig - Get a batch of contiguous folios2210* @mapping: The address_space to search2211* @start: The starting page index2212* @end: The final page index (inclusive)2213* @fbatch: The batch to fill2214*2215* filemap_get_folios_contig() works exactly like filemap_get_folios(),2216* except the returned folios are guaranteed to be contiguous. 
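/*
 * Illustrative sketch (not part of this file): the common batched loop over
 * filemap_get_folios(), letting it advance @start between iterations.  Folios
 * straddling the range boundaries are counted in full here.  The helper name
 * is invented for the example.
 */
static unsigned long example_count_cached_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned long pages = 0;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (filemap_get_folios(mapping, &start, end, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			pages += folio_nr_pages(fbatch.folios[i]);
		folio_batch_release(&fbatch);
		cond_resched();
	}
	return pages;
}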
This may2217* not return all contiguous folios if the batch gets filled up.2218*2219* Return: The number of folios found.2220* Also update @start to be positioned for traversal of the next folio.2221*/22222223unsigned filemap_get_folios_contig(struct address_space *mapping,2224pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)2225{2226XA_STATE(xas, &mapping->i_pages, *start);2227unsigned long nr;2228struct folio *folio;22292230rcu_read_lock();22312232for (folio = xas_load(&xas); folio && xas.xa_index <= end;2233folio = xas_next(&xas)) {2234if (xas_retry(&xas, folio))2235continue;2236/*2237* If the entry has been swapped out, we can stop looking.2238* No current caller is looking for DAX entries.2239*/2240if (xa_is_value(folio))2241goto update_start;22422243/* If we landed in the middle of a THP, continue at its end. */2244if (xa_is_sibling(folio))2245goto update_start;22462247if (!folio_try_get(folio))2248goto retry;22492250if (unlikely(folio != xas_reload(&xas)))2251goto put_folio;22522253if (!folio_batch_add(fbatch, folio)) {2254nr = folio_nr_pages(folio);2255*start = folio->index + nr;2256goto out;2257}2258xas_advance(&xas, folio_next_index(folio) - 1);2259continue;2260put_folio:2261folio_put(folio);22622263retry:2264xas_reset(&xas);2265}22662267update_start:2268nr = folio_batch_count(fbatch);22692270if (nr) {2271folio = fbatch->folios[nr - 1];2272*start = folio_next_index(folio);2273}2274out:2275rcu_read_unlock();2276return folio_batch_count(fbatch);2277}2278EXPORT_SYMBOL(filemap_get_folios_contig);22792280/**2281* filemap_get_folios_tag - Get a batch of folios matching @tag2282* @mapping: The address_space to search2283* @start: The starting page index2284* @end: The final page index (inclusive)2285* @tag: The tag index2286* @fbatch: The batch to fill2287*2288* The first folio may start before @start; if it does, it will contain2289* @start. The final folio may extend beyond @end; if it does, it will2290* contain @end. The folios have ascending indices. There may be gaps2291* between the folios if there are indices which have no folio in the2292* page cache. If folios are added to or removed from the page cache2293* while this is running, they may or may not be found by this call.2294* Only returns folios that are tagged with @tag.2295*2296* Return: The number of folios found.2297* Also update @start to index the next folio for traversal.2298*/2299unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,2300pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)2301{2302XA_STATE(xas, &mapping->i_pages, *start);2303struct folio *folio;23042305rcu_read_lock();2306while ((folio = find_get_entry(&xas, end, tag)) != NULL) {2307/*2308* Shadow entries should never be tagged, but this iteration2309* is lockless so there is a window for page reclaim to evict2310* a page we saw tagged. Skip over it.2311*/2312if (xa_is_value(folio))2313continue;2314if (!folio_batch_add(fbatch, folio)) {2315unsigned long nr = folio_nr_pages(folio);2316*start = folio->index + nr;2317goto out;2318}2319}2320/*2321* We come here when there is no page beyond @end. We take care to not2322* overflow the index @start as it confuses some of the callers. This2323* breaks the iteration when there is a page at index -1 but that is2324* already broke anyway.2325*/2326if (end == (pgoff_t)-1)2327*start = (pgoff_t)-1;2328else2329*start = end + 1;2330out:2331rcu_read_unlock();23322333return folio_batch_count(fbatch);2334}2335EXPORT_SYMBOL(filemap_get_folios_tag);23362337/*2338* CD/DVDs are error prone. 
When a medium error occurs, the driver may fail2339* a _large_ part of the i/o request. Imagine the worst scenario:2340*2341* ---R__________________________________________B__________2342* ^ reading here ^ bad block(assume 4k)2343*2344* read(R) => miss => readahead(R...B) => media error => frustrating retries2345* => failing the whole request => read(R) => read(R+1) =>2346* readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>2347* readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>2348* readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......2349*2350* It is going insane. Fix it by quickly scaling down the readahead size.2351*/2352static void shrink_readahead_size_eio(struct file_ra_state *ra)2353{2354ra->ra_pages /= 4;2355}23562357/*2358* filemap_get_read_batch - Get a batch of folios for read2359*2360* Get a batch of folios which represent a contiguous range of bytes in2361* the file. No exceptional entries will be returned. If @index is in2362* the middle of a folio, the entire folio will be returned. The last2363* folio in the batch may have the readahead flag set or the uptodate flag2364* clear so that the caller can take the appropriate action.2365*/2366static void filemap_get_read_batch(struct address_space *mapping,2367pgoff_t index, pgoff_t max, struct folio_batch *fbatch)2368{2369XA_STATE(xas, &mapping->i_pages, index);2370struct folio *folio;23712372rcu_read_lock();2373for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {2374if (xas_retry(&xas, folio))2375continue;2376if (xas.xa_index > max || xa_is_value(folio))2377break;2378if (xa_is_sibling(folio))2379break;2380if (!folio_try_get(folio))2381goto retry;23822383if (unlikely(folio != xas_reload(&xas)))2384goto put_folio;23852386if (!folio_batch_add(fbatch, folio))2387break;2388if (!folio_test_uptodate(folio))2389break;2390if (folio_test_readahead(folio))2391break;2392xas_advance(&xas, folio_next_index(folio) - 1);2393continue;2394put_folio:2395folio_put(folio);2396retry:2397xas_reset(&xas);2398}2399rcu_read_unlock();2400}24012402static int filemap_read_folio(struct file *file, filler_t filler,2403struct folio *folio)2404{2405bool workingset = folio_test_workingset(folio);2406unsigned long pflags;2407int error;24082409/* Start the actual read. The read will unlock the page. 
*/2410if (unlikely(workingset))2411psi_memstall_enter(&pflags);2412error = filler(file, folio);2413if (unlikely(workingset))2414psi_memstall_leave(&pflags);2415if (error)2416return error;24172418error = folio_wait_locked_killable(folio);2419if (error)2420return error;2421if (folio_test_uptodate(folio))2422return 0;2423if (file)2424shrink_readahead_size_eio(&file->f_ra);2425return -EIO;2426}24272428static bool filemap_range_uptodate(struct address_space *mapping,2429loff_t pos, size_t count, struct folio *folio,2430bool need_uptodate)2431{2432if (folio_test_uptodate(folio))2433return true;2434/* pipes can't handle partially uptodate pages */2435if (need_uptodate)2436return false;2437if (!mapping->a_ops->is_partially_uptodate)2438return false;2439if (mapping->host->i_blkbits >= folio_shift(folio))2440return false;24412442if (folio_pos(folio) > pos) {2443count -= folio_pos(folio) - pos;2444pos = 0;2445} else {2446pos -= folio_pos(folio);2447}24482449return mapping->a_ops->is_partially_uptodate(folio, pos, count);2450}24512452static int filemap_update_page(struct kiocb *iocb,2453struct address_space *mapping, size_t count,2454struct folio *folio, bool need_uptodate)2455{2456int error;24572458if (iocb->ki_flags & IOCB_NOWAIT) {2459if (!filemap_invalidate_trylock_shared(mapping))2460return -EAGAIN;2461} else {2462filemap_invalidate_lock_shared(mapping);2463}24642465if (!folio_trylock(folio)) {2466error = -EAGAIN;2467if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))2468goto unlock_mapping;2469if (!(iocb->ki_flags & IOCB_WAITQ)) {2470filemap_invalidate_unlock_shared(mapping);2471/*2472* This is where we usually end up waiting for a2473* previously submitted readahead to finish.2474*/2475folio_put_wait_locked(folio, TASK_KILLABLE);2476return AOP_TRUNCATED_PAGE;2477}2478error = __folio_lock_async(folio, iocb->ki_waitq);2479if (error)2480goto unlock_mapping;2481}24822483error = AOP_TRUNCATED_PAGE;2484if (!folio->mapping)2485goto unlock;24862487error = 0;2488if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,2489need_uptodate))2490goto unlock;24912492error = -EAGAIN;2493if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))2494goto unlock;24952496error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,2497folio);2498goto unlock_mapping;2499unlock:2500folio_unlock(folio);2501unlock_mapping:2502filemap_invalidate_unlock_shared(mapping);2503if (error == AOP_TRUNCATED_PAGE)2504folio_put(folio);2505return error;2506}25072508static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)2509{2510struct address_space *mapping = iocb->ki_filp->f_mapping;2511struct folio *folio;2512int error;2513unsigned int min_order = mapping_min_folio_order(mapping);2514pgoff_t index;25152516if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))2517return -EAGAIN;25182519folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);2520if (!folio)2521return -ENOMEM;2522if (iocb->ki_flags & IOCB_DONTCACHE)2523__folio_set_dropbehind(folio);25242525/*2526* Protect against truncate / hole punch. Grabbing invalidate_lock2527* here assures we cannot instantiate and bring uptodate new2528* pagecache folios after evicting page cache during truncate2529* and before actually freeing blocks. Note that we could2530* release invalidate_lock after inserting the folio into2531* the page cache as the locked folio would then be enough to2532* synchronize with hole punching. 
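/*
 * Illustrative sketch (not part of this file): the invalidate_lock pairing
 * described in the comment above, taken shared around pagecache
 * instantiation.  read_cache_folio() further down documents the same
 * expectation of its callers.  The helper name is made up.
 */
static struct folio *example_read_locked(struct address_space *mapping,
		struct file *file, pgoff_t index)
{
	struct folio *folio;

	filemap_invalidate_lock_shared(mapping);
	folio = read_cache_folio(mapping, index, NULL, file);
	filemap_invalidate_unlock_shared(mapping);

	return folio;		/* uptodate folio or ERR_PTR() */
}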
But there are code paths2533* such as filemap_update_page() filling in partially uptodate2534* pages or ->readahead() that need to hold invalidate_lock2535* while mapping blocks for IO so let's hold the lock here as2536* well to keep locking rules simple.2537*/2538filemap_invalidate_lock_shared(mapping);2539index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;2540error = filemap_add_folio(mapping, folio, index,2541mapping_gfp_constraint(mapping, GFP_KERNEL));2542if (error == -EEXIST)2543error = AOP_TRUNCATED_PAGE;2544if (error)2545goto error;25462547error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,2548folio);2549if (error)2550goto error;25512552filemap_invalidate_unlock_shared(mapping);2553folio_batch_add(fbatch, folio);2554return 0;2555error:2556filemap_invalidate_unlock_shared(mapping);2557folio_put(folio);2558return error;2559}25602561static int filemap_readahead(struct kiocb *iocb, struct file *file,2562struct address_space *mapping, struct folio *folio,2563pgoff_t last_index)2564{2565DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);25662567if (iocb->ki_flags & IOCB_NOIO)2568return -EAGAIN;2569if (iocb->ki_flags & IOCB_DONTCACHE)2570ractl.dropbehind = 1;2571page_cache_async_ra(&ractl, folio, last_index - folio->index);2572return 0;2573}25742575static int filemap_get_pages(struct kiocb *iocb, size_t count,2576struct folio_batch *fbatch, bool need_uptodate)2577{2578struct file *filp = iocb->ki_filp;2579struct address_space *mapping = filp->f_mapping;2580pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;2581pgoff_t last_index;2582struct folio *folio;2583unsigned int flags;2584int err = 0;25852586/* "last_index" is the index of the page beyond the end of the read */2587last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);2588retry:2589if (fatal_signal_pending(current))2590return -EINTR;25912592filemap_get_read_batch(mapping, index, last_index - 1, fbatch);2593if (!folio_batch_count(fbatch)) {2594DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);25952596if (iocb->ki_flags & IOCB_NOIO)2597return -EAGAIN;2598if (iocb->ki_flags & IOCB_NOWAIT)2599flags = memalloc_noio_save();2600if (iocb->ki_flags & IOCB_DONTCACHE)2601ractl.dropbehind = 1;2602page_cache_sync_ra(&ractl, last_index - index);2603if (iocb->ki_flags & IOCB_NOWAIT)2604memalloc_noio_restore(flags);2605filemap_get_read_batch(mapping, index, last_index - 1, fbatch);2606}2607if (!folio_batch_count(fbatch)) {2608err = filemap_create_folio(iocb, fbatch);2609if (err == AOP_TRUNCATED_PAGE)2610goto retry;2611return err;2612}26132614folio = fbatch->folios[folio_batch_count(fbatch) - 1];2615if (folio_test_readahead(folio)) {2616err = filemap_readahead(iocb, filp, mapping, folio, last_index);2617if (err)2618goto err;2619}2620if (!folio_test_uptodate(folio)) {2621if ((iocb->ki_flags & IOCB_WAITQ) &&2622folio_batch_count(fbatch) > 1)2623iocb->ki_flags |= IOCB_NOWAIT;2624err = filemap_update_page(iocb, mapping, count, folio,2625need_uptodate);2626if (err)2627goto err;2628}26292630trace_mm_filemap_get_pages(mapping, index, last_index - 1);2631return 0;2632err:2633if (err < 0)2634folio_put(folio);2635if (likely(--fbatch->nr))2636return 0;2637if (err == AOP_TRUNCATED_PAGE)2638goto retry;2639return err;2640}26412642static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)2643{2644unsigned int shift = folio_shift(folio);26452646return (pos1 >> shift == pos2 >> shift);2647}26482649static void filemap_end_dropbehind_read(struct folio *folio)2650{2651if 
(!folio_test_dropbehind(folio))2652return;2653if (folio_test_writeback(folio) || folio_test_dirty(folio))2654return;2655if (folio_trylock(folio)) {2656filemap_end_dropbehind(folio);2657folio_unlock(folio);2658}2659}26602661/**2662* filemap_read - Read data from the page cache.2663* @iocb: The iocb to read.2664* @iter: Destination for the data.2665* @already_read: Number of bytes already read by the caller.2666*2667* Copies data from the page cache. If the data is not currently present,2668* uses the readahead and read_folio address_space operations to fetch it.2669*2670* Return: Total number of bytes copied, including those already read by2671* the caller. If an error happens before any bytes are copied, returns2672* a negative error number.2673*/2674ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,2675ssize_t already_read)2676{2677struct file *filp = iocb->ki_filp;2678struct file_ra_state *ra = &filp->f_ra;2679struct address_space *mapping = filp->f_mapping;2680struct inode *inode = mapping->host;2681struct folio_batch fbatch;2682int i, error = 0;2683bool writably_mapped;2684loff_t isize, end_offset;2685loff_t last_pos = ra->prev_pos;26862687if (unlikely(iocb->ki_pos < 0))2688return -EINVAL;2689if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))2690return 0;2691if (unlikely(!iov_iter_count(iter)))2692return 0;26932694iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);2695folio_batch_init(&fbatch);26962697do {2698cond_resched();26992700/*2701* If we've already successfully copied some data, then we2702* can no longer safely return -EIOCBQUEUED. Hence mark2703* an async read NOWAIT at that point.2704*/2705if ((iocb->ki_flags & IOCB_WAITQ) && already_read)2706iocb->ki_flags |= IOCB_NOWAIT;27072708if (unlikely(iocb->ki_pos >= i_size_read(inode)))2709break;27102711error = filemap_get_pages(iocb, iter->count, &fbatch, false);2712if (error < 0)2713break;27142715/*2716* i_size must be checked after we know the pages are Uptodate.2717*2718* Checking i_size after the check allows us to calculate2719* the correct value for "nr", which means the zero-filled2720* part of the page is not copied back to userspace (unless2721* another truncate extends the file - this is desired though).2722*/2723isize = i_size_read(inode);2724if (unlikely(iocb->ki_pos >= isize))2725goto put_folios;2726end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);27272728/*2729* Once we start copying data, we don't want to be touching any2730* cachelines that might be contended:2731*/2732writably_mapped = mapping_writably_mapped(mapping);27332734/*2735* When a read accesses the same folio several times, only2736* mark it as accessed the first time.2737*/2738if (!pos_same_folio(iocb->ki_pos, last_pos - 1,2739fbatch.folios[0]))2740folio_mark_accessed(fbatch.folios[0]);27412742for (i = 0; i < folio_batch_count(&fbatch); i++) {2743struct folio *folio = fbatch.folios[i];2744size_t fsize = folio_size(folio);2745size_t offset = iocb->ki_pos & (fsize - 1);2746size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,2747fsize - offset);2748size_t copied;27492750if (end_offset < folio_pos(folio))2751break;2752if (i > 0)2753folio_mark_accessed(folio);2754/*2755* If users can be writing to this folio using arbitrary2756* virtual addresses, take care of potential aliasing2757* before reading the folio on the kernel side.2758*/2759if (writably_mapped)2760flush_dcache_folio(folio);27612762copied = copy_folio_to_iter(folio, offset, bytes, iter);27632764already_read += copied;2765iocb->ki_pos += 
copied;2766last_pos = iocb->ki_pos;27672768if (copied < bytes) {2769error = -EFAULT;2770break;2771}2772}2773put_folios:2774for (i = 0; i < folio_batch_count(&fbatch); i++) {2775struct folio *folio = fbatch.folios[i];27762777filemap_end_dropbehind_read(folio);2778folio_put(folio);2779}2780folio_batch_init(&fbatch);2781} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);27822783file_accessed(filp);2784ra->prev_pos = last_pos;2785return already_read ? already_read : error;2786}2787EXPORT_SYMBOL_GPL(filemap_read);27882789int kiocb_write_and_wait(struct kiocb *iocb, size_t count)2790{2791struct address_space *mapping = iocb->ki_filp->f_mapping;2792loff_t pos = iocb->ki_pos;2793loff_t end = pos + count - 1;27942795if (iocb->ki_flags & IOCB_NOWAIT) {2796if (filemap_range_needs_writeback(mapping, pos, end))2797return -EAGAIN;2798return 0;2799}28002801return filemap_write_and_wait_range(mapping, pos, end);2802}2803EXPORT_SYMBOL_GPL(kiocb_write_and_wait);28042805int filemap_invalidate_pages(struct address_space *mapping,2806loff_t pos, loff_t end, bool nowait)2807{2808int ret;28092810if (nowait) {2811/* we could block if there are any pages in the range */2812if (filemap_range_has_page(mapping, pos, end))2813return -EAGAIN;2814} else {2815ret = filemap_write_and_wait_range(mapping, pos, end);2816if (ret)2817return ret;2818}28192820/*2821* After a write we want buffered reads to be sure to go to disk to get2822* the new data. We invalidate clean cached page from the region we're2823* about to write. We do this *before* the write so that we can return2824* without clobbering -EIOCBQUEUED from ->direct_IO().2825*/2826return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,2827end >> PAGE_SHIFT);2828}28292830int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)2831{2832struct address_space *mapping = iocb->ki_filp->f_mapping;28332834return filemap_invalidate_pages(mapping, iocb->ki_pos,2835iocb->ki_pos + count - 1,2836iocb->ki_flags & IOCB_NOWAIT);2837}2838EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);28392840/**2841* generic_file_read_iter - generic filesystem read routine2842* @iocb: kernel I/O control block2843* @iter: destination for the data read2844*2845* This is the "read_iter()" routine for all filesystems2846* that can use the page cache directly.2847*2848* The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall2849* be returned when no data can be read without waiting for I/O requests2850* to complete; it doesn't prevent readahead.2851*2852* The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O2853* requests shall be made for the read or for readahead. When no data2854* can be read, -EAGAIN shall be returned. 
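/*
 * Illustrative sketch (not part of this file): how the helpers above are
 * used around direct I/O.  A direct read flushes dirty pagecache first with
 * kiocb_write_and_wait(); a direct write instead invalidates the range with
 * kiocb_invalidate_pages(), which itself writes back first when blocking is
 * allowed.  The helper name is an assumption.
 */
static int example_dio_prep(struct kiocb *iocb, size_t count, bool is_write)
{
	if (is_write)
		return kiocb_invalidate_pages(iocb, count);

	return kiocb_write_and_wait(iocb, count);
}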
When readahead would be2855* triggered, a partial, possibly empty read shall be returned.2856*2857* Return:2858* * number of bytes copied, even for partial reads2859* * negative error code (or 0 if IOCB_NOIO) if nothing was read2860*/2861ssize_t2862generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)2863{2864size_t count = iov_iter_count(iter);2865ssize_t retval = 0;28662867if (!count)2868return 0; /* skip atime */28692870if (iocb->ki_flags & IOCB_DIRECT) {2871struct file *file = iocb->ki_filp;2872struct address_space *mapping = file->f_mapping;2873struct inode *inode = mapping->host;28742875retval = kiocb_write_and_wait(iocb, count);2876if (retval < 0)2877return retval;2878file_accessed(file);28792880retval = mapping->a_ops->direct_IO(iocb, iter);2881if (retval >= 0) {2882iocb->ki_pos += retval;2883count -= retval;2884}2885if (retval != -EIOCBQUEUED)2886iov_iter_revert(iter, count - iov_iter_count(iter));28872888/*2889* Btrfs can have a short DIO read if we encounter2890* compressed extents, so if there was an error, or if2891* we've already read everything we wanted to, or if2892* there was a short read because we hit EOF, go ahead2893* and return. Otherwise fallthrough to buffered io for2894* the rest of the read. Buffered reads will not work for2895* DAX files, so don't bother trying.2896*/2897if (retval < 0 || !count || IS_DAX(inode))2898return retval;2899if (iocb->ki_pos >= i_size_read(inode))2900return retval;2901}29022903return filemap_read(iocb, iter, retval);2904}2905EXPORT_SYMBOL(generic_file_read_iter);29062907/*2908* Splice subpages from a folio into a pipe.2909*/2910size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,2911struct folio *folio, loff_t fpos, size_t size)2912{2913struct page *page;2914size_t spliced = 0, offset = offset_in_folio(folio, fpos);29152916page = folio_page(folio, offset / PAGE_SIZE);2917size = min(size, folio_size(folio) - offset);2918offset %= PAGE_SIZE;29192920while (spliced < size && !pipe_is_full(pipe)) {2921struct pipe_buffer *buf = pipe_head_buf(pipe);2922size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);29232924*buf = (struct pipe_buffer) {2925.ops = &page_cache_pipe_buf_ops,2926.page = page,2927.offset = offset,2928.len = part,2929};2930folio_get(folio);2931pipe->head++;2932page++;2933spliced += part;2934offset = 0;2935}29362937return spliced;2938}29392940/**2941* filemap_splice_read - Splice data from a file's pagecache into a pipe2942* @in: The file to read from2943* @ppos: Pointer to the file position to read from2944* @pipe: The pipe to splice into2945* @len: The amount to splice2946* @flags: The SPLICE_F_* flags2947*2948* This function gets folios from a file's pagecache and splices them into the2949* pipe. Readahead will be called as necessary to fill more folios. This may2950* be used for blockdevs also.2951*2952* Return: On success, the number of bytes read will be returned and *@ppos2953* will be updated if appropriate; 0 will be returned if there is no more data2954* to be read; -EAGAIN will be returned if the pipe had no space, and some2955* other negative error code will be returned on error. 
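/*
 * Illustrative sketch (not part of this file): driving generic_file_read_iter()
 * synchronously from kernel context with an on-stack kiocb, much as
 * filemap_splice_read() below sets one up with init_sync_kiocb().  Real
 * callers would normally go through kernel_read(); names here are invented.
 */
static ssize_t example_sync_read(struct file *file, void *buf, size_t len,
		loff_t *pos)
{
	struct kvec kv = { .iov_base = buf, .iov_len = len };
	struct iov_iter iter;
	struct kiocb iocb;
	ssize_t ret;

	init_sync_kiocb(&iocb, file);
	iocb.ki_pos = *pos;
	iov_iter_kvec(&iter, ITER_DEST, &kv, 1, len);

	ret = generic_file_read_iter(&iocb, &iter);
	if (ret > 0)
		*pos = iocb.ki_pos;
	return ret;
}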
A short read may occur2956* if the pipe has insufficient space, we reach the end of the data or we hit a2957* hole.2958*/2959ssize_t filemap_splice_read(struct file *in, loff_t *ppos,2960struct pipe_inode_info *pipe,2961size_t len, unsigned int flags)2962{2963struct folio_batch fbatch;2964struct kiocb iocb;2965size_t total_spliced = 0, used, npages;2966loff_t isize, end_offset;2967bool writably_mapped;2968int i, error = 0;29692970if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))2971return 0;29722973init_sync_kiocb(&iocb, in);2974iocb.ki_pos = *ppos;29752976/* Work out how much data we can actually add into the pipe */2977used = pipe_buf_usage(pipe);2978npages = max_t(ssize_t, pipe->max_usage - used, 0);2979len = min_t(size_t, len, npages * PAGE_SIZE);29802981folio_batch_init(&fbatch);29822983do {2984cond_resched();29852986if (*ppos >= i_size_read(in->f_mapping->host))2987break;29882989iocb.ki_pos = *ppos;2990error = filemap_get_pages(&iocb, len, &fbatch, true);2991if (error < 0)2992break;29932994/*2995* i_size must be checked after we know the pages are Uptodate.2996*2997* Checking i_size after the check allows us to calculate2998* the correct value for "nr", which means the zero-filled2999* part of the page is not copied back to userspace (unless3000* another truncate extends the file - this is desired though).3001*/3002isize = i_size_read(in->f_mapping->host);3003if (unlikely(*ppos >= isize))3004break;3005end_offset = min_t(loff_t, isize, *ppos + len);30063007/*3008* Once we start copying data, we don't want to be touching any3009* cachelines that might be contended:3010*/3011writably_mapped = mapping_writably_mapped(in->f_mapping);30123013for (i = 0; i < folio_batch_count(&fbatch); i++) {3014struct folio *folio = fbatch.folios[i];3015size_t n;30163017if (folio_pos(folio) >= end_offset)3018goto out;3019folio_mark_accessed(folio);30203021/*3022* If users can be writing to this folio using arbitrary3023* virtual addresses, take care of potential aliasing3024* before reading the folio on the kernel side.3025*/3026if (writably_mapped)3027flush_dcache_folio(folio);30283029n = min_t(loff_t, len, isize - *ppos);3030n = splice_folio_into_pipe(pipe, folio, *ppos, n);3031if (!n)3032goto out;3033len -= n;3034total_spliced += n;3035*ppos += n;3036in->f_ra.prev_pos = *ppos;3037if (pipe_is_full(pipe))3038goto out;3039}30403041folio_batch_release(&fbatch);3042} while (len);30433044out:3045folio_batch_release(&fbatch);3046file_accessed(in);30473048return total_spliced ? total_spliced : error;3049}3050EXPORT_SYMBOL(filemap_splice_read);30513052static inline loff_t folio_seek_hole_data(struct xa_state *xas,3053struct address_space *mapping, struct folio *folio,3054loff_t start, loff_t end, bool seek_data)3055{3056const struct address_space_operations *ops = mapping->a_ops;3057size_t offset, bsz = i_blocksize(mapping->host);30583059if (xa_is_value(folio) || folio_test_uptodate(folio))3060return seek_data ? start : end;3061if (!ops->is_partially_uptodate)3062return seek_data ? 
end : start;30633064xas_pause(xas);3065rcu_read_unlock();3066folio_lock(folio);3067if (unlikely(folio->mapping != mapping))3068goto unlock;30693070offset = offset_in_folio(folio, start) & ~(bsz - 1);30713072do {3073if (ops->is_partially_uptodate(folio, offset, bsz) ==3074seek_data)3075break;3076start = (start + bsz) & ~((u64)bsz - 1);3077offset += bsz;3078} while (offset < folio_size(folio));3079unlock:3080folio_unlock(folio);3081rcu_read_lock();3082return start;3083}30843085static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)3086{3087if (xa_is_value(folio))3088return PAGE_SIZE << xas_get_order(xas);3089return folio_size(folio);3090}30913092/**3093* mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.3094* @mapping: Address space to search.3095* @start: First byte to consider.3096* @end: Limit of search (exclusive).3097* @whence: Either SEEK_HOLE or SEEK_DATA.3098*3099* If the page cache knows which blocks contain holes and which blocks3100* contain data, your filesystem can use this function to implement3101* SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are3102* entirely memory-based such as tmpfs, and filesystems which support3103* unwritten extents.3104*3105* Return: The requested offset on success, or -ENXIO if @whence specifies3106* SEEK_DATA and there is no data after @start. There is an implicit hole3107* after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start3108* and @end contain data.3109*/3110loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,3111loff_t end, int whence)3112{3113XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);3114pgoff_t max = (end - 1) >> PAGE_SHIFT;3115bool seek_data = (whence == SEEK_DATA);3116struct folio *folio;31173118if (end <= start)3119return -ENXIO;31203121rcu_read_lock();3122while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {3123loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;3124size_t seek_size;31253126if (start < pos) {3127if (!seek_data)3128goto unlock;3129start = pos;3130}31313132seek_size = seek_folio_size(&xas, folio);3133pos = round_up((u64)pos + 1, seek_size);3134start = folio_seek_hole_data(&xas, mapping, folio, start, pos,3135seek_data);3136if (start < pos)3137goto unlock;3138if (start >= end)3139break;3140if (seek_size > PAGE_SIZE)3141xas_set(&xas, pos >> PAGE_SHIFT);3142if (!xa_is_value(folio))3143folio_put(folio);3144}3145if (seek_data)3146start = -ENXIO;3147unlock:3148rcu_read_unlock();3149if (folio && !xa_is_value(folio))3150folio_put(folio);3151if (start > end)3152return end;3153return start;3154}31553156#ifdef CONFIG_MMU3157#define MMAP_LOTSAMISS (100)3158/*3159* lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock3160* @vmf - the vm_fault for this fault.3161* @folio - the folio to lock.3162* @fpin - the pointer to the file we may pin (or is already pinned).3163*3164* This works similar to lock_folio_or_retry in that it can drop the3165* mmap_lock. It differs in that it actually returns the folio locked3166* if it returns 1 and 0 if it couldn't lock the folio. If we did have3167* to drop the mmap_lock then fpin will point to the pinned file and3168* needs to be fput()'ed at a later point.3169*/3170static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,3171struct file **fpin)3172{3173if (folio_trylock(folio))3174return 1;31753176/*3177* NOTE! This will make us return with VM_FAULT_RETRY, but with3178* the fault lock still held. 
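/*
 * Illustrative sketch (not part of this file): wiring SEEK_HOLE/SEEK_DATA up
 * to mapping_seek_hole_data() for a purely pagecache-backed file, as the
 * kernel-doc above suggests for tmpfs-like filesystems.  Bounds checking,
 * other whence values and vfs_setpos() are omitted; names are assumptions.
 */
static loff_t examplefs_llseek_hole_data(struct file *file, loff_t offset,
		int whence)
{
	struct inode *inode = file_inode(file);

	return mapping_seek_hole_data(inode->i_mapping, offset,
				      i_size_read(inode), whence);
}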
That's how FAULT_FLAG_RETRY_NOWAIT3179* is supposed to work. We have way too many special cases..3180*/3181if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)3182return 0;31833184*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);3185if (vmf->flags & FAULT_FLAG_KILLABLE) {3186if (__folio_lock_killable(folio)) {3187/*3188* We didn't have the right flags to drop the3189* fault lock, but all fault_handlers only check3190* for fatal signals if we return VM_FAULT_RETRY,3191* so we need to drop the fault lock here and3192* return 0 if we don't have a fpin.3193*/3194if (*fpin == NULL)3195release_fault_lock(vmf);3196return 0;3197}3198} else3199__folio_lock(folio);32003201return 1;3202}32033204/*3205* Synchronous readahead happens when we don't even find a page in the page3206* cache at all. We don't want to perform IO under the mmap sem, so if we have3207* to drop the mmap sem we return the file that was pinned in order for us to do3208* that. If we didn't pin a file then we return NULL. The file that is3209* returned needs to be fput()'ed when we're done with it.3210*/3211static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)3212{3213struct file *file = vmf->vma->vm_file;3214struct file_ra_state *ra = &file->f_ra;3215struct address_space *mapping = file->f_mapping;3216DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);3217struct file *fpin = NULL;3218vm_flags_t vm_flags = vmf->vma->vm_flags;3219unsigned short mmap_miss;32203221#ifdef CONFIG_TRANSPARENT_HUGEPAGE3222/* Use the readahead code, even if readahead is disabled */3223if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {3224fpin = maybe_unlock_mmap_for_io(vmf, fpin);3225ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);3226ra->size = HPAGE_PMD_NR;3227/*3228* Fetch two PMD folios, so we get the chance to actually3229* readahead, unless we've been told not to.3230*/3231if (!(vm_flags & VM_RAND_READ))3232ra->size *= 2;3233ra->async_size = HPAGE_PMD_NR;3234ra->order = HPAGE_PMD_ORDER;3235page_cache_ra_order(&ractl, ra);3236return fpin;3237}3238#endif32393240/*3241* If we don't want any read-ahead, don't bother. VM_EXEC case below is3242* already intended for random access.3243*/3244if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)3245return fpin;3246if (!ra->ra_pages)3247return fpin;32483249if (vm_flags & VM_SEQ_READ) {3250fpin = maybe_unlock_mmap_for_io(vmf, fpin);3251page_cache_sync_ra(&ractl, ra->ra_pages);3252return fpin;3253}32543255/* Avoid banging the cache line if not needed */3256mmap_miss = READ_ONCE(ra->mmap_miss);3257if (mmap_miss < MMAP_LOTSAMISS * 10)3258WRITE_ONCE(ra->mmap_miss, ++mmap_miss);32593260/*3261* Do we miss much more than hit in this file? If so,3262* stop bothering with read-ahead. It will only hurt.3263*/3264if (mmap_miss > MMAP_LOTSAMISS)3265return fpin;32663267if (vm_flags & VM_EXEC) {3268/*3269* Allow arch to request a preferred minimum folio order for3270* executable memory. This can often be beneficial to3271* performance if (e.g.) 
arm64 can contpte-map the folio.3272* Executable memory rarely benefits from readahead, due to its3273* random access nature, so set async_size to 0.3274*3275* Limit to the boundaries of the VMA to avoid reading in any3276* pad that might exist between sections, which would be a waste3277* of memory.3278*/3279struct vm_area_struct *vma = vmf->vma;3280unsigned long start = vma->vm_pgoff;3281unsigned long end = start + vma_pages(vma);3282unsigned long ra_end;32833284ra->order = exec_folio_order();3285ra->start = round_down(vmf->pgoff, 1UL << ra->order);3286ra->start = max(ra->start, start);3287ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);3288ra_end = min(ra_end, end);3289ra->size = ra_end - ra->start;3290ra->async_size = 0;3291} else {3292/*3293* mmap read-around3294*/3295ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);3296ra->size = ra->ra_pages;3297ra->async_size = ra->ra_pages / 4;3298ra->order = 0;3299}33003301fpin = maybe_unlock_mmap_for_io(vmf, fpin);3302ractl._index = ra->start;3303page_cache_ra_order(&ractl, ra);3304return fpin;3305}33063307/*3308* Asynchronous readahead happens when we find the page and PG_readahead,3309* so we want to possibly extend the readahead further. We return the file that3310* was pinned if we have to drop the mmap_lock in order to do IO.3311*/3312static struct file *do_async_mmap_readahead(struct vm_fault *vmf,3313struct folio *folio)3314{3315struct file *file = vmf->vma->vm_file;3316struct file_ra_state *ra = &file->f_ra;3317DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);3318struct file *fpin = NULL;3319unsigned short mmap_miss;33203321/* If we don't want any read-ahead, don't bother */3322if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)3323return fpin;33243325mmap_miss = READ_ONCE(ra->mmap_miss);3326if (mmap_miss)3327WRITE_ONCE(ra->mmap_miss, --mmap_miss);33283329if (folio_test_readahead(folio)) {3330fpin = maybe_unlock_mmap_for_io(vmf, fpin);3331page_cache_async_ra(&ractl, folio, ra->ra_pages);3332}3333return fpin;3334}33353336static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)3337{3338struct vm_area_struct *vma = vmf->vma;3339vm_fault_t ret = 0;3340pte_t *ptep;33413342/*3343* We might have COW'ed a pagecache folio and might now have an mlocked3344* anon folio mapped. The original pagecache folio is not mlocked and3345* might have been evicted. During a read+clear/modify/write update of3346* the PTE, such as done in do_numa_page()/change_pte_range(), we3347* temporarily clear the PTE under PT lock and might detect it here as3348* "none" when not holding the PT lock.3349*3350* Not rechecking the PTE under PT lock could result in an unexpected3351* major fault in an mlock'ed region. Recheck only for this special3352* scenario while holding the PT lock, to not degrade non-mlocked3353* scenarios. 
Recheck the PTE without PT lock firstly, thereby reducing3354* the number of times we hold PT lock.3355*/3356if (!(vma->vm_flags & VM_LOCKED))3357return 0;33583359if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))3360return 0;33613362ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,3363&vmf->ptl);3364if (unlikely(!ptep))3365return VM_FAULT_NOPAGE;33663367if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {3368ret = VM_FAULT_NOPAGE;3369} else {3370spin_lock(vmf->ptl);3371if (unlikely(!pte_none(ptep_get(ptep))))3372ret = VM_FAULT_NOPAGE;3373spin_unlock(vmf->ptl);3374}3375pte_unmap(ptep);3376return ret;3377}33783379/**3380* filemap_fault - read in file data for page fault handling3381* @vmf: struct vm_fault containing details of the fault3382*3383* filemap_fault() is invoked via the vma operations vector for a3384* mapped memory region to read in file data during a page fault.3385*3386* The goto's are kind of ugly, but this streamlines the normal case of having3387* it in the page cache, and handles the special cases reasonably without3388* having a lot of duplicated code.3389*3390* vma->vm_mm->mmap_lock must be held on entry.3391*3392* If our return value has VM_FAULT_RETRY set, it's because the mmap_lock3393* may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().3394*3395* If our return value does not have VM_FAULT_RETRY set, the mmap_lock3396* has not been released.3397*3398* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.3399*3400* Return: bitwise-OR of %VM_FAULT_ codes.3401*/3402vm_fault_t filemap_fault(struct vm_fault *vmf)3403{3404int error;3405struct file *file = vmf->vma->vm_file;3406struct file *fpin = NULL;3407struct address_space *mapping = file->f_mapping;3408struct inode *inode = mapping->host;3409pgoff_t max_idx, index = vmf->pgoff;3410struct folio *folio;3411vm_fault_t ret = 0;3412bool mapping_locked = false;34133414max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);3415if (unlikely(index >= max_idx))3416return VM_FAULT_SIGBUS;34173418trace_mm_filemap_fault(mapping, index);34193420/*3421* Do we have something in the page cache already?3422*/3423folio = filemap_get_folio(mapping, index);3424if (likely(!IS_ERR(folio))) {3425/*3426* We found the page, so try async readahead before waiting for3427* the lock.3428*/3429if (!(vmf->flags & FAULT_FLAG_TRIED))3430fpin = do_async_mmap_readahead(vmf, folio);3431if (unlikely(!folio_test_uptodate(folio))) {3432filemap_invalidate_lock_shared(mapping);3433mapping_locked = true;3434}3435} else {3436ret = filemap_fault_recheck_pte_none(vmf);3437if (unlikely(ret))3438return ret;34393440/* No page in the page cache at all */3441count_vm_event(PGMAJFAULT);3442count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);3443ret = VM_FAULT_MAJOR;3444fpin = do_sync_mmap_readahead(vmf);3445retry_find:3446/*3447* See comment in filemap_create_folio() why we need3448* invalidate_lock3449*/3450if (!mapping_locked) {3451filemap_invalidate_lock_shared(mapping);3452mapping_locked = true;3453}3454folio = __filemap_get_folio(mapping, index,3455FGP_CREAT|FGP_FOR_MMAP,3456vmf->gfp_mask);3457if (IS_ERR(folio)) {3458if (fpin)3459goto out_retry;3460filemap_invalidate_unlock_shared(mapping);3461return VM_FAULT_OOM;3462}3463}34643465if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))3466goto out_retry;34673468/* Did it get truncated? 
*/3469if (unlikely(folio->mapping != mapping)) {3470folio_unlock(folio);3471folio_put(folio);3472goto retry_find;3473}3474VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);34753476/*3477* We have a locked folio in the page cache, now we need to check3478* that it's up-to-date. If not, it is going to be due to an error,3479* or because readahead was otherwise unable to retrieve it.3480*/3481if (unlikely(!folio_test_uptodate(folio))) {3482/*3483* If the invalidate lock is not held, the folio was in cache3484* and uptodate and now it is not. Strange but possible since we3485* didn't hold the page lock all the time. Let's drop3486* everything, get the invalidate lock and try again.3487*/3488if (!mapping_locked) {3489folio_unlock(folio);3490folio_put(folio);3491goto retry_find;3492}34933494/*3495* OK, the folio is really not uptodate. This can be because the3496* VMA has the VM_RAND_READ flag set, or because an error3497* arose. Let's read it in directly.3498*/3499goto page_not_uptodate;3500}35013502/*3503* We've made it this far and we had to drop our mmap_lock, now is the3504* time to return to the upper layer and have it re-find the vma and3505* redo the fault.3506*/3507if (fpin) {3508folio_unlock(folio);3509goto out_retry;3510}3511if (mapping_locked)3512filemap_invalidate_unlock_shared(mapping);35133514/*3515* Found the page and have a reference on it.3516* We must recheck i_size under page lock.3517*/3518max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);3519if (unlikely(index >= max_idx)) {3520folio_unlock(folio);3521folio_put(folio);3522return VM_FAULT_SIGBUS;3523}35243525vmf->page = folio_file_page(folio, index);3526return ret | VM_FAULT_LOCKED;35273528page_not_uptodate:3529/*3530* Umm, take care of errors if the page isn't up-to-date.3531* Try to re-read it _once_. We do this synchronously,3532* because there really aren't any performance issues here3533* and we need to check for errors.3534*/3535fpin = maybe_unlock_mmap_for_io(vmf, fpin);3536error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);3537if (fpin)3538goto out_retry;3539folio_put(folio);35403541if (!error || error == AOP_TRUNCATED_PAGE)3542goto retry_find;3543filemap_invalidate_unlock_shared(mapping);35443545return VM_FAULT_SIGBUS;35463547out_retry:3548/*3549* We dropped the mmap_lock, we need to return to the fault handler to3550* re-find the vma and come back and find our hopefully still populated3551* page.3552*/3553if (!IS_ERR(folio))3554folio_put(folio);3555if (mapping_locked)3556filemap_invalidate_unlock_shared(mapping);3557if (fpin)3558fput(fpin);3559return ret | VM_FAULT_RETRY;3560}3561EXPORT_SYMBOL(filemap_fault);35623563static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,3564pgoff_t start)3565{3566struct mm_struct *mm = vmf->vma->vm_mm;35673568/* Huge page is mapped? No need to proceed. */3569if (pmd_trans_huge(*vmf->pmd)) {3570folio_unlock(folio);3571folio_put(folio);3572return true;3573}35743575if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {3576struct page *page = folio_file_page(folio, start);3577vm_fault_t ret = do_set_pmd(vmf, folio, page);3578if (!ret) {3579/* The page is mapped successfully, reference consumed. 
*/3580folio_unlock(folio);3581return true;3582}3583}35843585if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)3586pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);35873588return false;3589}35903591static struct folio *next_uptodate_folio(struct xa_state *xas,3592struct address_space *mapping, pgoff_t end_pgoff)3593{3594struct folio *folio = xas_next_entry(xas, end_pgoff);3595unsigned long max_idx;35963597do {3598if (!folio)3599return NULL;3600if (xas_retry(xas, folio))3601continue;3602if (xa_is_value(folio))3603continue;3604if (!folio_try_get(folio))3605continue;3606if (folio_test_locked(folio))3607goto skip;3608/* Has the page moved or been split? */3609if (unlikely(folio != xas_reload(xas)))3610goto skip;3611if (!folio_test_uptodate(folio) || folio_test_readahead(folio))3612goto skip;3613if (!folio_trylock(folio))3614goto skip;3615if (folio->mapping != mapping)3616goto unlock;3617if (!folio_test_uptodate(folio))3618goto unlock;3619max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);3620if (xas->xa_index >= max_idx)3621goto unlock;3622return folio;3623unlock:3624folio_unlock(folio);3625skip:3626folio_put(folio);3627} while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);36283629return NULL;3630}36313632/*3633* Map page range [start_page, start_page + nr_pages) of folio.3634* start_page is gotten from start by folio_page(folio, start)3635*/3636static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,3637struct folio *folio, unsigned long start,3638unsigned long addr, unsigned int nr_pages,3639unsigned long *rss, unsigned short *mmap_miss)3640{3641vm_fault_t ret = 0;3642struct page *page = folio_page(folio, start);3643unsigned int count = 0;3644pte_t *old_ptep = vmf->pte;36453646do {3647if (PageHWPoison(page + count))3648goto skip;36493650/*3651* If there are too many folios that are recently evicted3652* in a file, they will probably continue to be evicted.3653* In such situation, read-ahead is only a waste of IO.3654* Don't decrease mmap_miss in this scenario to make sure3655* we can stop read-ahead.3656*/3657if (!folio_test_workingset(folio))3658(*mmap_miss)++;36593660/*3661* NOTE: If there're PTE markers, we'll leave them to be3662* handled in the specific fault path, and it'll prohibit the3663* fault-around logic.3664*/3665if (!pte_none(ptep_get(&vmf->pte[count])))3666goto skip;36673668count++;3669continue;3670skip:3671if (count) {3672set_pte_range(vmf, folio, page, count, addr);3673*rss += count;3674folio_ref_add(folio, count);3675if (in_range(vmf->address, addr, count * PAGE_SIZE))3676ret = VM_FAULT_NOPAGE;3677}36783679count++;3680page += count;3681vmf->pte += count;3682addr += count * PAGE_SIZE;3683count = 0;3684} while (--nr_pages > 0);36853686if (count) {3687set_pte_range(vmf, folio, page, count, addr);3688*rss += count;3689folio_ref_add(folio, count);3690if (in_range(vmf->address, addr, count * PAGE_SIZE))3691ret = VM_FAULT_NOPAGE;3692}36933694vmf->pte = old_ptep;36953696return ret;3697}36983699static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,3700struct folio *folio, unsigned long addr,3701unsigned long *rss, unsigned short *mmap_miss)3702{3703vm_fault_t ret = 0;3704struct page *page = &folio->page;37053706if (PageHWPoison(page))3707return ret;37083709/* See comment of filemap_map_folio_range() */3710if (!folio_test_workingset(folio))3711(*mmap_miss)++;37123713/*3714* NOTE: If there're PTE markers, we'll leave them to be3715* handled in the specific fault path, and it'll prohibit3716* the fault-around logic.3717*/3718if 
(!pte_none(ptep_get(vmf->pte)))3719return ret;37203721if (vmf->address == addr)3722ret = VM_FAULT_NOPAGE;37233724set_pte_range(vmf, folio, page, 1, addr);3725(*rss)++;3726folio_ref_inc(folio);37273728return ret;3729}37303731vm_fault_t filemap_map_pages(struct vm_fault *vmf,3732pgoff_t start_pgoff, pgoff_t end_pgoff)3733{3734struct vm_area_struct *vma = vmf->vma;3735struct file *file = vma->vm_file;3736struct address_space *mapping = file->f_mapping;3737pgoff_t file_end, last_pgoff = start_pgoff;3738unsigned long addr;3739XA_STATE(xas, &mapping->i_pages, start_pgoff);3740struct folio *folio;3741vm_fault_t ret = 0;3742unsigned long rss = 0;3743unsigned int nr_pages = 0, folio_type;3744unsigned short mmap_miss = 0, mmap_miss_saved;37453746rcu_read_lock();3747folio = next_uptodate_folio(&xas, mapping, end_pgoff);3748if (!folio)3749goto out;37503751if (filemap_map_pmd(vmf, folio, start_pgoff)) {3752ret = VM_FAULT_NOPAGE;3753goto out;3754}37553756addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);3757vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);3758if (!vmf->pte) {3759folio_unlock(folio);3760folio_put(folio);3761goto out;3762}37633764file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;3765if (end_pgoff > file_end)3766end_pgoff = file_end;37673768folio_type = mm_counter_file(folio);3769do {3770unsigned long end;37713772addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;3773vmf->pte += xas.xa_index - last_pgoff;3774last_pgoff = xas.xa_index;3775end = folio_next_index(folio) - 1;3776nr_pages = min(end, end_pgoff) - xas.xa_index + 1;37773778if (!folio_test_large(folio))3779ret |= filemap_map_order0_folio(vmf,3780folio, addr, &rss, &mmap_miss);3781else3782ret |= filemap_map_folio_range(vmf, folio,3783xas.xa_index - folio->index, addr,3784nr_pages, &rss, &mmap_miss);37853786folio_unlock(folio);3787folio_put(folio);3788} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);3789add_mm_counter(vma->vm_mm, folio_type, rss);3790pte_unmap_unlock(vmf->pte, vmf->ptl);3791trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);3792out:3793rcu_read_unlock();37943795mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);3796if (mmap_miss >= mmap_miss_saved)3797WRITE_ONCE(file->f_ra.mmap_miss, 0);3798else3799WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);38003801return ret;3802}3803EXPORT_SYMBOL(filemap_map_pages);38043805vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)3806{3807struct address_space *mapping = vmf->vma->vm_file->f_mapping;3808struct folio *folio = page_folio(vmf->page);3809vm_fault_t ret = VM_FAULT_LOCKED;38103811sb_start_pagefault(mapping->host->i_sb);3812file_update_time(vmf->vma->vm_file);3813folio_lock(folio);3814if (folio->mapping != mapping) {3815folio_unlock(folio);3816ret = VM_FAULT_NOPAGE;3817goto out;3818}3819/*3820* We mark the folio dirty already here so that when freeze is in3821* progress, we are guaranteed that writeback during freezing will3822* see the dirty folio and writeprotect it again.3823*/3824folio_mark_dirty(folio);3825folio_wait_stable(folio);3826out:3827sb_end_pagefault(mapping->host->i_sb);3828return ret;3829}38303831const struct vm_operations_struct generic_file_vm_ops = {3832.fault = filemap_fault,3833.map_pages = filemap_map_pages,3834.page_mkwrite = filemap_page_mkwrite,3835};38363837/* This is used for a general mmap of a disk file */38383839int generic_file_mmap(struct file *file, struct vm_area_struct *vma)3840{3841struct address_space *mapping = 
file->f_mapping;38423843if (!mapping->a_ops->read_folio)3844return -ENOEXEC;3845file_accessed(file);3846vma->vm_ops = &generic_file_vm_ops;3847return 0;3848}38493850int generic_file_mmap_prepare(struct vm_area_desc *desc)3851{3852struct file *file = desc->file;3853struct address_space *mapping = file->f_mapping;38543855if (!mapping->a_ops->read_folio)3856return -ENOEXEC;3857file_accessed(file);3858desc->vm_ops = &generic_file_vm_ops;3859return 0;3860}38613862/*3863* This is for filesystems which do not implement ->writepage.3864*/3865int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)3866{3867if (vma_is_shared_maywrite(vma))3868return -EINVAL;3869return generic_file_mmap(file, vma);3870}38713872int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)3873{3874if (is_shared_maywrite(desc->vm_flags))3875return -EINVAL;3876return generic_file_mmap_prepare(desc);3877}3878#else3879vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)3880{3881return VM_FAULT_SIGBUS;3882}3883int generic_file_mmap(struct file *file, struct vm_area_struct *vma)3884{3885return -ENOSYS;3886}3887int generic_file_mmap_prepare(struct vm_area_desc *desc)3888{3889return -ENOSYS;3890}3891int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)3892{3893return -ENOSYS;3894}3895int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)3896{3897return -ENOSYS;3898}3899#endif /* CONFIG_MMU */39003901EXPORT_SYMBOL(filemap_page_mkwrite);3902EXPORT_SYMBOL(generic_file_mmap);3903EXPORT_SYMBOL(generic_file_mmap_prepare);3904EXPORT_SYMBOL(generic_file_readonly_mmap);3905EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);39063907static struct folio *do_read_cache_folio(struct address_space *mapping,3908pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)3909{3910struct folio *folio;3911int err;39123913if (!filler)3914filler = mapping->a_ops->read_folio;3915repeat:3916folio = filemap_get_folio(mapping, index);3917if (IS_ERR(folio)) {3918folio = filemap_alloc_folio(gfp,3919mapping_min_folio_order(mapping));3920if (!folio)3921return ERR_PTR(-ENOMEM);3922index = mapping_align_index(mapping, index);3923err = filemap_add_folio(mapping, folio, index, gfp);3924if (unlikely(err)) {3925folio_put(folio);3926if (err == -EEXIST)3927goto repeat;3928/* Presumably ENOMEM for xarray node */3929return ERR_PTR(err);3930}39313932goto filler;3933}3934if (folio_test_uptodate(folio))3935goto out;39363937if (!folio_trylock(folio)) {3938folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);3939goto repeat;3940}39413942/* Folio was truncated from mapping */3943if (!folio->mapping) {3944folio_unlock(folio);3945folio_put(folio);3946goto repeat;3947}39483949/* Someone else locked and filled the page in a very small window */3950if (folio_test_uptodate(folio)) {3951folio_unlock(folio);3952goto out;3953}39543955filler:3956err = filemap_read_folio(file, filler, folio);3957if (err) {3958folio_put(folio);3959if (err == AOP_TRUNCATED_PAGE)3960goto repeat;3961return ERR_PTR(err);3962}39633964out:3965folio_mark_accessed(folio);3966return folio;3967}39683969/**3970* read_cache_folio - Read into page cache, fill it if needed.3971* @mapping: The address_space to read from.3972* @index: The index to read.3973* @filler: Function to perform the read, or NULL to use aops->read_folio().3974* @file: Passed to filler function, may be NULL if not required.3975*3976* Read one page into the page cache. 
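/*
 * Illustrative sketch (not part of this file): the file_operations a simple
 * pagecache-backed filesystem might assemble, mostly from the generic
 * helpers in this file.  "examplefs" is a made-up name and ->write_iter is
 * omitted.
 */
static const struct file_operations examplefs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.splice_read	= filemap_splice_read,
	.mmap		= generic_file_mmap,
};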

static struct folio *do_read_cache_folio(struct address_space *mapping,
		pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;
	int err;

	if (!filler)
		filler = mapping->a_ops->read_folio;
repeat:
	folio = filemap_get_folio(mapping, index);
	if (IS_ERR(folio)) {
		folio = filemap_alloc_folio(gfp,
					    mapping_min_folio_order(mapping));
		if (!folio)
			return ERR_PTR(-ENOMEM);
		index = mapping_align_index(mapping, index);
		err = filemap_add_folio(mapping, folio, index, gfp);
		if (unlikely(err)) {
			folio_put(folio);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

		goto filler;
	}
	if (folio_test_uptodate(folio))
		goto out;

	if (!folio_trylock(folio)) {
		folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
		goto repeat;
	}

	/* Folio was truncated from mapping */
	if (!folio->mapping) {
		folio_unlock(folio);
		folio_put(folio);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (folio_test_uptodate(folio)) {
		folio_unlock(folio);
		goto out;
	}

filler:
	err = filemap_read_folio(file, filler, folio);
	if (err) {
		folio_put(folio);
		if (err == AOP_TRUNCATED_PAGE)
			goto repeat;
		return ERR_PTR(err);
	}

out:
	folio_mark_accessed(folio);
	return folio;
}

/**
 * read_cache_folio - Read into page cache, fill it if needed.
 * @mapping: The address_space to read from.
 * @index: The index to read.
 * @filler: Function to perform the read, or NULL to use aops->read_folio().
 * @file: Passed to filler function, may be NULL if not required.
 *
 * Read one page into the page cache. If it succeeds, the folio returned
 * will contain @index, but it may not be the first page of the folio.
 *
 * If the filler function returns an error, it will be returned to the
 * caller.
 *
 * Context: May sleep. Expects mapping->invalidate_lock to be held.
 * Return: An uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
		filler_t filler, struct file *file)
{
	return do_read_cache_folio(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_folio);

/**
 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
 * @mapping: The address_space for the folio.
 * @index: The index that the allocated folio will contain.
 * @gfp: The page allocator flags to use if allocating.
 *
 * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
 * any new memory allocations done using the specified allocation flags.
 *
 * The most likely error from this function is EIO, but ENOMEM is
 * possible and so is EINTR.  If ->read_folio returns another error,
 * that will be returned to the caller.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: Uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
		pgoff_t index, gfp_t gfp)
{
	return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(mapping_read_folio_gfp);

static struct page *do_read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;

	folio = do_read_cache_folio(mapping, index, filler, file, gfp);
	if (IS_ERR(folio))
		return &folio->page;
	return folio_file_page(folio, index);
}

struct page *read_cache_page(struct address_space *mapping,
			     pgoff_t index, filler_t *filler, struct file *file)
{
	return do_read_cache_page(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the page's address_space
 * @index:	the page index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
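
/*
 * Illustrative sketch: a typical caller goes through the read_mapping_folio()
 * wrapper (which ends up in read_cache_folio() above), checks for ERR_PTR()
 * and drops its reference when done. The variable names are hypothetical.
 *
 *	struct folio *folio;
 *
 *	folio = read_mapping_folio(inode->i_mapping, index, NULL);
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	// folio is uptodate and referenced here, but not locked
 *	folio_put(folio);
 */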

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
static void dio_warn_stale_pagecache(struct file *filp)
{
	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
	char pathname[128];
	char *path;

	errseq_set(&filp->f_mapping->wb_err, -EIO);
	if (__ratelimit(&_rs)) {
		path = file_path(filp, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
			current->comm);
	}
}

void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;

	if (mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping,
			iocb->ki_pos >> PAGE_SHIFT,
			(iocb->ki_pos + count - 1) >> PAGE_SHIFT))
		dio_warn_stale_pagecache(iocb->ki_filp);
}

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	size_t write_len = iov_iter_count(from);
	ssize_t written;

	/*
	 * If a page can not be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	written = kiocb_invalidate_pages(iocb, write_len);
	if (written) {
		if (written == -EBUSY)
			return 0;
		return written;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * Noticeable example is a blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0) {
		struct inode *inode = mapping->host;
		loff_t pos = iocb->ki_pos;

		kiocb_invalidate_post_direct_write(iocb, written);
		pos += written;
		write_len -= written;
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	if (written != -EIOCBQUEUED)
		iov_iter_revert(from, write_len - iov_iter_count(from));
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
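
/*
 * Illustrative sketch of the caller-side contract: generic_file_direct_write()
 * returns 0 when the page cache could not be invalidated (-EBUSY from
 * kiocb_invalidate_pages()), which callers treat as "fall back to a buffered
 * write", exactly as __generic_file_write_iter() does further below.
 *
 *	written = generic_file_direct_write(iocb, from);
 *	if (written < 0 || !iov_iter_count(from))
 *		return written;
 *	// otherwise write the remainder through the page cache
 */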

ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	size_t chunk = mapping_max_folio_size(mapping);
	long status = 0;
	ssize_t written = 0;

	do {
		struct folio *folio;
		size_t offset;		/* Offset into folio */
		size_t bytes;		/* Bytes to write to folio */
		size_t copied;		/* Bytes copied from user */
		void *fsdata = NULL;

		bytes = iov_iter_count(i);
retry:
		offset = pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);
		balance_dirty_pages_ratelimited(mapping);

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(iocb, mapping, pos, bytes,
						&folio, &fsdata);
		if (unlikely(status < 0))
			break;

		offset = offset_in_folio(folio, pos);
		if (bytes > folio_size(folio) - offset)
			bytes = folio_size(folio) - offset;

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		/*
		 * Faults here on mmap()s can recurse into arbitrary
		 * filesystem code. Lots of locks are held that can
		 * deadlock. Use an atomic copy to avoid deadlocking
		 * in page fault handling.
		 */
		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		flush_dcache_folio(folio);

		status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
						folio, fsdata);
		if (unlikely(status != copied)) {
			iov_iter_revert(i, copied - max(status, 0L));
			if (unlikely(status < 0))
				break;
		}
		cond_resched();

		if (unlikely(status == 0)) {
			/*
			 * A short copy made ->write_end() reject the
			 * thing entirely.  Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			if (chunk > PAGE_SIZE)
				chunk /= 2;
			if (copied) {
				bytes = copied;
				goto retry;
			}

			/*
			 * 'folio' is now unlocked and faults on it can be
			 * handled. Ensure forward progress by trying to
			 * fault it in now.
			 */
			if (fault_in_iov_iter_readable(i, bytes) == bytes) {
				status = -EFAULT;
				break;
			}
		} else {
			pos += status;
			written += status;
		}
	} while (iov_iter_count(i));

	if (!written)
		return status;
	iocb->ki_pos += written;
	return written;
}
EXPORT_SYMBOL(generic_perform_write);
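
/*
 * Illustrative sketch: generic_perform_write() drives the buffered path
 * entirely through ->write_begin()/->write_end(), so a simple in-memory
 * filesystem can reuse it by supplying the libfs helpers (compare ramfs).
 * The myfs_aops name is hypothetical.
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.read_folio	= simple_read_folio,
 *		.write_begin	= simple_write_begin,
 *		.write_end	= simple_write_end,
 *		.dirty_folio	= noop_dirty_folio,
 *	};
 */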

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_rwsem to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
			return ret;
		return direct_write_fallback(iocb, from, ret,
				generic_perform_write(iocb, from));
	}

	return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:	IO state structure
 * @from:	iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in case of O_SYNC writes
 * and acquires i_rwsem as needed.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);

/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * The address_space is trying to release any data attached to a folio
 * (presumably at folio->private).
 *
 * This will also be called if the private_2 flag is set on a page,
 * indicating that the folio has other metadata associated with it.
 *
 * The @gfp argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block
 * (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
	struct address_space * const mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	if (!folio_needs_release(folio))
		return true;
	if (folio_test_writeback(folio))
		return false;

	if (mapping && mapping->a_ops->release_folio)
		return mapping->a_ops->release_folio(folio, gfp);
	return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);

/**
 * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
 * @inode: The inode to flush
 * @flush: Set to write back rather than simply invalidate.
 * @start: First byte in range.
 * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
 *       onwards.
 *
 * Invalidate all the folios on an inode that contribute to the specified
 * range, possibly writing them back first.  Whilst the operation is
 * undertaken, the invalidate lock is held to prevent new folios from being
 * installed.
 */
int filemap_invalidate_inode(struct inode *inode, bool flush,
			     loff_t start, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t first = start >> PAGE_SHIFT;
	pgoff_t last = end >> PAGE_SHIFT;
	pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;

	if (!mapping || !mapping->nrpages || end < start)
		goto out;

	/* Prevent new folios from being added to the inode. */
	filemap_invalidate_lock(mapping);

	if (!mapping->nrpages)
		goto unlock;

	unmap_mapping_pages(mapping, first, nr, false);

	/* Write back the data if we're asked to. */
	if (flush) {
		struct writeback_control wbc = {
			.sync_mode	= WB_SYNC_ALL,
			.nr_to_write	= LONG_MAX,
			.range_start	= start,
			.range_end	= end,
		};

		filemap_fdatawrite_wbc(mapping, &wbc);
	}

	/* Wait for writeback to complete on all folios and discard. */
	invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);

unlock:
	filemap_invalidate_unlock(mapping);
out:
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL_GPL(filemap_invalidate_inode);
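
/*
 * Illustrative sketch: a filesystem that wants to drop (and optionally write
 * back) the cached data for part of an inode can call the helper above like
 * this; the surrounding error handling is hypothetical.
 *
 *	// write back and invalidate the first megabyte of the file
 *	err = filemap_invalidate_inode(inode, true, 0, SZ_1M - 1);
 *
 *	// invalidate everything from the start onwards, without writeback
 *	err = filemap_invalidate_inode(inode, false, 0, LLONG_MAX);
 */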

#ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping:	The mapping to compute the statistics for.
 * @first_index:	The starting page cache index.
 * @last_index:	The final page index (inclusive).
 * @cs:	the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of dirty pages, number of pages marked for
 * writeback, and the number of (recently) evicted pages.
 */
static void filemap_cachestat(struct address_space *mapping,
		pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
	XA_STATE(xas, &mapping->i_pages, first_index);
	struct folio *folio;

	/* Flush stats (and potentially sleep) outside the RCU read section. */
	mem_cgroup_flush_stats_ratelimited(NULL);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_index) {
		int order;
		unsigned long nr_pages;
		pgoff_t folio_first_index, folio_last_index;

		/*
		 * Don't deref the folio. It is not pinned, and might
		 * get freed (and reused) underneath us.
		 *
		 * We *could* pin it, but that would be expensive for
		 * what should be a fast and lightweight syscall.
		 *
		 * Instead, derive all information of interest from
		 * the rcu-protected xarray.
		 */

		if (xas_retry(&xas, folio))
			continue;

		order = xas_get_order(&xas);
		nr_pages = 1 << order;
		folio_first_index = round_down(xas.xa_index, 1 << order);
		folio_last_index = folio_first_index + nr_pages - 1;

		/* Folios might straddle the range boundaries, only count covered pages */
		if (folio_first_index < first_index)
			nr_pages -= first_index - folio_first_index;

		if (folio_last_index > last_index)
			nr_pages -= folio_last_index - last_index;

		if (xa_is_value(folio)) {
			/* page is evicted */
			void *shadow = (void *)folio;
			bool workingset; /* not used */

			cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
			if (shmem_mapping(mapping)) {
				/* shmem file - in swap cache */
				swp_entry_t swp = radix_to_swp_entry(folio);

				/* swapin error results in poisoned entry */
				if (non_swap_entry(swp))
					goto resched;

				/*
				 * Getting a swap entry from the shmem
				 * inode means we beat
				 * shmem_unuse(). rcu_read_lock()
				 * ensures swapoff waits for us before
				 * freeing the swapper space. However,
				 * we can race with swapping and
				 * invalidation, so there might not be
				 * a shadow in the swapcache (yet).
				 */
				shadow = get_shadow_from_swap_cache(swp);
				if (!shadow)
					goto resched;
			}
#endif
			if (workingset_test_recent(shadow, true, &workingset, false))
				cs->nr_recently_evicted += nr_pages;

			goto resched;
		}

		/* page is in cache */
		cs->nr_cache += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
			cs->nr_dirty += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
			cs->nr_writeback += nr_pages;

resched:
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}

/*
 * See mincore: reveal pagecache information only for files
 * that the calling process has write access to, or could (if
 * tried) open for writing.
 */
static inline bool can_do_cachestat(struct file *f)
{
	if (f->f_mode & FMODE_WRITE)
		return true;
	if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
		return true;
	return file_permission(f, MAY_WRITE) == 0;
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * bytes range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that is previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried range is [`off`, `off` + `len`]. If `len` == 0,
 * we will query in the range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. User should pass 0 (i.e. no flag specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
 *
 * return values:
 *  zero        - success
 *  -EFAULT     - cstat or cstat_range points to an illegal address
 *  -EINVAL     - invalid flags
 *  -EBADF      - invalid file descriptor
 *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
 */
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
		struct cachestat_range __user *, cstat_range,
		struct cachestat __user *, cstat, unsigned int, flags)
{
	CLASS(fd, f)(fd);
	struct address_space *mapping;
	struct cachestat_range csr;
	struct cachestat cs;
	pgoff_t first_index, last_index;

	if (fd_empty(f))
		return -EBADF;

	if (copy_from_user(&csr, cstat_range,
			sizeof(struct cachestat_range)))
		return -EFAULT;

	/* hugetlbfs is not supported */
	if (is_file_hugepages(fd_file(f)))
		return -EOPNOTSUPP;

	if (!can_do_cachestat(fd_file(f)))
		return -EPERM;

	if (flags != 0)
		return -EINVAL;

	first_index = csr.off >> PAGE_SHIFT;
	last_index =
		csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
	memset(&cs, 0, sizeof(struct cachestat));
	mapping = fd_file(f)->f_mapping;
	filemap_cachestat(mapping, first_index, last_index, &cs);

	if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
		return -EFAULT;

	return 0;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */
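
/*
 * Illustrative sketch of a userspace caller (not kernel code): the raw
 * syscall can be invoked through syscall(2) when no libc wrapper is
 * available. Header and wrapper availability depend on the toolchain, so
 * treat this as a rough usage sketch only.
 *
 *	#include <linux/mman.h>		// struct cachestat, cachestat_range
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct cachestat_range range = { .off = 0, .len = 0 };	// whole file
 *	struct cachestat cs;
 *
 *	if (syscall(__NR_cachestat, fd, &range, &cs, 0) == 0)
 *		printf("cached: %llu dirty: %llu writeback: %llu\n",
 *		       (unsigned long long)cs.nr_cache,
 *		       (unsigned long long)cs.nr_dirty,
 *		       (unsigned long long)cs.nr_writeback);
 */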