// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memfd.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"
#include "swap.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

static inline void sanity_check_pinned_pages(struct page **pages,
					     unsigned long npages)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;

	/*
	 * We only pin anonymous pages if they are exclusive. Once pinned, we
	 * can no longer turn them possibly shared and PageAnonExclusive() will
	 * stick around until the page is freed.
	 *
	 * We'd like to verify that our pinned anonymous pages are still mapped
	 * exclusively. The issue with anon THP is that we don't know how
	 * they are/were mapped when pinning them. However, for anon
	 * THP we can assume that either the given page (PTE-mapped THP) or
	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
	 * neither is the case, there is certainly something wrong.
	 */
	for (; npages; npages--, pages++) {
		struct page *page = *pages;
		struct folio *folio;

		if (!page)
			continue;

		folio = page_folio(page);

		if (is_zero_page(page) ||
		    !folio_test_anon(folio))
			continue;
		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
			VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
		else
			/* Either a PTE-mapped or a PMD-mapped THP. */
			VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
					     !PageAnonExclusive(page), page);
	}
}

/*
 * Return the folio with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
	struct folio *folio;

retry:
	folio = page_folio(page);
	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
		return NULL;
	if (unlikely(!folio_ref_try_add(folio, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the folio; but it
	 * could be that between calling page_folio() and the refcount
	 * increment, the folio was split, in which case we'd end up
	 * holding a reference on a folio that has nothing to do with the page
	 * we were given anymore.
	 * So now that the folio is stable, recheck that the page still
	 * belongs to this folio.
	 */
	if (unlikely(page_folio(page) != folio)) {
		folio_put_refs(folio, refs);
		goto retry;
	}

	return folio;
}

static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		if (is_zero_folio(folio))
			return;
		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
		if (folio_has_pincount(folio))
			atomic_sub(refs, &folio->_pincount);
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	folio_put_refs(folio, refs);
}

/**
 * try_grab_folio() - add a folio's refcount by a flag-dependent amount
 * @folio:	pointer to folio to be grabbed
 * @refs:	the value to (effectively) add to the folio's refcount
 * @flags:	gup flags: these are the FOLL_* flag values
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time.
 *
 * Return: 0 for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). A negative error code for failure:
 *
 *   -ENOMEM		FOLL_GET or FOLL_PIN was set, but the folio could not
 *			be grabbed.
 *
 * It is called when we have a stable reference for the folio, typically in
 * GUP slow path.
 */
int __must_check try_grab_folio(struct folio *folio, int refs,
				unsigned int flags)
{
	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
		return -ENOMEM;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page)))
		return -EREMOTEIO;

	if (flags & FOLL_GET)
		folio_ref_add(folio, refs);
	else if (flags & FOLL_PIN) {
		/*
		 * Don't take a pin on the zero page - it's not going anywhere
		 * and it is used in a *lot* of places.
		 */
		if (is_zero_folio(folio))
			return 0;

		/*
		 * Increment the normal page refcount field at least once,
		 * so that the page really is pinned.
		 */
		if (folio_has_pincount(folio)) {
			folio_ref_add(folio, refs);
			atomic_add(refs, &folio->_pincount);
		} else {
			folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS);
		}

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
	}

	return 0;
}

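/*
 * Illustrative sketch (not part of this file's API): how the two "grab"
 * modes above show up in the folio's reference counts. Assumes a folio
 * that does not use the separate _pincount field, i.e. the
 * GUP_PIN_COUNTING_BIAS scheme applies:
 *
 *	int expected_refcount(int base, int refs, unsigned int flags)
 *	{
 *		if (flags & FOLL_GET)
 *			return base + refs;			// plain get
 *		if (flags & FOLL_PIN)
 *			return base + refs * GUP_PIN_COUNTING_BIAS; // pin
 *		return base;					// neither: no-op
 *	}
 *
 * e.g. pinning a single small folio with refcount 1 raises it to
 * 1 + GUP_PIN_COUNTING_BIAS (1024 by default), which is what
 * folio_maybe_dma_pinned() later keys off.
 */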

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:	pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);

/**
 * unpin_folio() - release a dma-pinned folio
 * @folio:	pointer to folio to be released
 *
 * Folios that were pinned via memfd_pin_folios() or other similar routines
 * must be released either using unpin_folio() or unpin_folios().
 */
void unpin_folio(struct folio *folio)
{
	gup_put_folio(folio, 1, FOLL_PIN);
}
EXPORT_SYMBOL_GPL(unpin_folio);

/**
 * folio_add_pin - Try to get an additional pin on a pinned folio
 * @folio: The folio to be pinned
 *
 * Get an additional pin on a folio we already have a pin on. Makes no change
 * if the folio is a zero_page.
 */
void folio_add_pin(struct folio *folio)
{
	if (is_zero_folio(folio))
		return;

	/*
	 * Similar to try_grab_folio(): be sure to *also* increment the normal
	 * page refcount field at least once, so that the page really is
	 * pinned.
	 */
	if (folio_has_pincount(folio)) {
		WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
		folio_ref_inc(folio);
		atomic_inc(&folio->_pincount);
	} else {
		WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
		folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
	}
}

static inline struct folio *gup_folio_range_next(struct page *start,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct page *next = nth_page(start, i);
	struct folio *folio = page_folio(next);
	unsigned int nr = 1;

	if (folio_test_large(folio))
		nr = min_t(unsigned int, npages - i,
			   folio_nr_pages(folio) - folio_page_idx(folio, next));

	*ntails = nr;
	return folio;
}

static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	unsigned int nr;

	for (nr = i + 1; nr < npages; nr++) {
		if (page_folio(list[nr]) != folio)
			break;
	}

	*ntails = nr - i;
	return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * folio_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);

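/*
 * Illustrative sketch (not part of this file): the usual life cycle the
 * helper above is designed for. do_dma_to_pages() is a hypothetical
 * placeholder for the caller's own I/O step.
 *
 *	struct page *pages[NR];
 *	int nr = pin_user_pages_fast(user_addr, NR, FOLL_WRITE, pages);
 *
 *	if (nr > 0) {
 *		do_dma_to_pages(pages, nr);	// device writes into the pages
 *		unpin_user_pages_dirty_lock(pages, nr, true);
 *	}
 *
 * Passing "true" dirties each folio exactly once, under the folio lock,
 * before the pins are dropped.
 */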

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on that page.
 *
 * For the page ranges defined by [page .. page+npages], make that range (or
 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
 * page range was previously listed as clean.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
				      bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_range_next(page, npages, i, &nr);
		if (make_dirty && !folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);

static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * Don't perform any sanity checks because we might have raced with
	 * fork() and some anonymous pages might now actually be shared --
	 * which is why we're unpinning after all.
	 */
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
	 * leaving them pinned), but probably not. More likely, gup/pup returned
	 * a hard -ERRNO error to the caller, who erroneously passed it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(npages)))
		return;

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		if (!pages[i]) {
			nr = 1;
			continue;
		}
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages);

/**
 * unpin_user_folio() - release pages of a folio
 * @folio:  pointer to folio to be released
 * @npages: number of pages of same folio
 *
 * Release npages of the folio
 */
void unpin_user_folio(struct folio *folio, unsigned long npages)
{
	gup_put_folio(folio, npages, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_folio);

/**
 * unpin_folios() - release an array of gup-pinned folios.
 * @folios:  array of folios to be marked dirty and released.
 * @nfolios: number of folios in the @folios array.
 *
 * For each folio in the @folios array, release the folio using gup_put_folio.
 *
 * Please see the unpin_folio() documentation for details.
 */
void unpin_folios(struct folio **folios, unsigned long nfolios)
{
	unsigned long i = 0, j;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking folios
	 * (by leaving them pinned), but probably not. More likely, gup/pup
	 * returned a hard -ERRNO error to the caller, who erroneously passed
	 * it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(nfolios)))
		return;

	while (i < nfolios) {
		for (j = i + 1; j < nfolios; j++)
			if (folios[i] != folios[j])
				break;

		if (folios[i])
			gup_put_folio(folios[i], j - i, FOLL_PIN);
		i = j;
	}
}
EXPORT_SYMBOL_GPL(unpin_folios);

/*
 * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
 * lifecycle. Avoid setting the bit unless necessary, or it might cause write
 * cache bouncing on large SMP machines for concurrent pinned gups.
 */
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
	if (!test_bit(MMF_HAS_PINNED, mm_flags))
		set_bit(MMF_HAS_PINNED, mm_flags);
}

#ifdef CONFIG_MMU

#ifdef CONFIG_HAVE_GUP_FAST
static int record_subpages(struct page *page, unsigned long sz,
			   unsigned long addr, unsigned long end,
			   struct page **pages)
{
	struct page *start_page;
	int nr;

	start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
		pages[nr] = nth_page(start_page, nr);

	return nr;
}

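/*
 * Worked example (illustrative only): for a PMD leaf (sz == PMD_SIZE) with
 * @addr pointing three small pages into the leaf and @end one page beyond
 * @addr, record_subpages() stores exactly one entry:
 *
 *	start_page = nth_page(page, (addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
 *			// == page + 3
 *	pages[0] = start_page, and the returned nr is 1.
 *
 * i.e. @page is the first page of the leaf, and the low bits of @addr select
 * which of its subpages actually fall inside [addr, end).
 */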

/**
 * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
 * @page:  pointer to page to be grabbed
 * @refs:  the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 *
 * It uses an add-ref-unless-zero operation to elevate the folio refcount and
 * must only be called in the fast path.
 */
static struct folio *try_grab_folio_fast(struct page *page, int refs,
					 unsigned int flags)
{
	struct folio *folio;

	/* Raise warn if it is not called in fast GUP */
	VM_WARN_ON_ONCE(!irqs_disabled());

	if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
		return NULL;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
		return NULL;

	if (flags & FOLL_GET)
		return try_get_folio(page, refs);

	/* FOLL_PIN is set */

	/*
	 * Don't take a pin on the zero page - it's not going anywhere
	 * and it is used in a *lot* of places.
	 */
	if (is_zero_page(page))
		return page_folio(page);

	folio = try_get_folio(page, refs);
	if (!folio)
		return NULL;

	/*
	 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
	 * right zone, so fail and let the caller fall back to the slow
	 * path.
	 */
	if (unlikely((flags & FOLL_LONGTERM) &&
		     !folio_is_longterm_pinnable(folio))) {
		folio_put_refs(folio, refs);
		return NULL;
	}

	/*
	 * When pinning a large folio, use an exact count to track it.
	 *
	 * However, be sure to *also* increment the normal folio
	 * refcount field at least once, so that the folio really
	 * is pinned. That's why the refcount from the earlier
	 * try_get_folio() is left intact.
	 */
	if (folio_has_pincount(folio))
		atomic_add(refs, &folio->_pincount);
	else
		folio_ref_add(folio,
				refs * (GUP_PIN_COUNTING_BIAS - 1));
	/*
	 * Adjust the pincount before re-checking the PTE for changes.
	 * This is essentially a smp_mb() and is paired with a memory
	 * barrier in folio_try_share_anon_rmap_*().
	 */
	smp_mb__after_atomic();

	node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

	return folio;
}
#endif	/* CONFIG_HAVE_GUP_FAST */

/* Common code for can_follow_write_* */
static inline bool can_follow_write_common(struct page *page,
		struct vm_area_struct *vma, unsigned int flags)
{
	/* Maybe FOLL_FORCE is set to override it? */
	if (!(flags & FOLL_FORCE))
		return false;

	/* But FOLL_FORCE has no effect on shared mappings */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		return false;

	/* ... or read-only private ones */
	if (!(vma->vm_flags & VM_MAYWRITE))
		return false;

	/* ... or already writable ones that just need to take a write fault */
	if (vma->vm_flags & VM_WRITE)
		return false;

	/*
	 * See can_change_pte_writable(): we broke COW and could map the page
	 * writable if we have an exclusive anonymous page ...
	 */
	return page && PageAnon(page) && PageAnonExclusive(page);
}

static struct page *no_page_table(struct vm_area_struct *vma,
				  unsigned int flags, unsigned long address)
{
	if (!(flags & FOLL_DUMP))
		return NULL;

	/*
	 * When core dumping, we don't want to allocate unnecessary pages or
	 * page tables. Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if (is_vm_hugetlb_page(vma)) {
		struct hstate *h = hstate_vma(vma);

		if (!hugetlbfs_pagecache_present(h, vma, address))
			return ERR_PTR(-EFAULT);
	} else if (vma_is_anonymous(vma) || !vma->vm_ops->fault) {
		return ERR_PTR(-EFAULT);
	}

	return NULL;
}

#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
static inline bool can_follow_write_pud(pud_t pud, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pud is writable, we can write to the page. */
	if (pud_write(pud))
		return true;

	return can_follow_write_common(page, vma, flags);
}

static struct page *follow_huge_pud(struct vm_area_struct *vma,
				    unsigned long addr, pud_t *pudp,
				    int flags, struct follow_page_context *ctx)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	pud_t pud = *pudp;
	unsigned long pfn = pud_pfn(pud);
	int ret;

	assert_spin_locked(pud_lockptr(mm, pudp));

	if (!pud_present(pud))
		return NULL;

	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags))
		return NULL;

	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
	page = pfn_to_page(pfn);

	if (!pud_write(pud) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);

	ret = try_grab_folio(page_folio(page), 1, flags);
	if (ret)
		page = ERR_PTR(ret);
	else
		ctx->page_mask = HPAGE_PUD_NR - 1;

	return page;
}

/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pmd is writable, we can write to the page. */
	if (pmd_write(pmd))
		return true;

	if (!can_follow_write_common(page, vma, flags))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (pmd_needs_soft_dirty_wp(vma, pmd))
		return false;
	return !userfaultfd_huge_pmd_wp(vma, pmd);
}

static struct page *follow_huge_pmd(struct vm_area_struct *vma,
				    unsigned long addr, pmd_t *pmd,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t pmdval = *pmd;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	page = pmd_page(pmdval);
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pmd(pmdval, page, vma, flags))
		return NULL;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval))
		return ERR_PTR(-EFAULT);

	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
		return NULL;

	if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);

	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			     !PageAnonExclusive(page), page);

	ret = try_grab_folio(page_folio(page), 1, flags);
	if (ret)
		return ERR_PTR(ret);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH))
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
#endif	/* CONFIG_TRANSPARENT_HUGEPAGE */

	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	ctx->page_mask = HPAGE_PMD_NR - 1;

	return page;
}

#else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
				    unsigned long addr, pud_t *pudp,
				    int flags, struct follow_page_context *ctx)
{
	return NULL;
}

static struct page *follow_huge_pmd(struct vm_area_struct *vma,
				    unsigned long addr, pmd_t *pmd,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	return NULL;
}
#endif	/* CONFIG_PGTABLE_HAS_HUGE_LEAVES */

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	if (flags & FOLL_TOUCH) {
		pte_t orig_entry = ptep_get(pte);
		pte_t entry = orig_entry;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(orig_entry, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
static inline bool can_follow_write_pte(pte_t pte, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pte is writable, we can write to the page. */
	if (pte_write(pte))
		return true;

	if (!can_follow_write_common(page, vma, flags))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (pte_needs_soft_dirty_wp(vma, pte))
		return false;
	return !userfaultfd_pte_wp(vma, pte);
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int ret;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!ptep)
		return no_page_table(vma, flags, address);
	pte = ptep_get(ptep);
	if (!pte_present(pte))
		goto no_page;
	if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
		goto no_page;

	page = vm_normal_page(vma, address, pte);

	/*
	 * We only care about anon pages in can_follow_write_pte().
	 */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pte(pte, page, vma, flags)) {
		page = NULL;
		goto out;
	}

	if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	folio = page_folio(page);

	if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
		page = ERR_PTR(-EMLINK);
		goto out;
	}

	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			     !PageAnonExclusive(page), page);

	/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
	ret = try_grab_folio(folio, 1, flags);
	if (unlikely(ret)) {
		page = ERR_PTR(ret);
		goto out;
	}

	/*
	 * We need to make the page accessible if and only if we are going
	 * to access its content (the FOLL_PIN case). Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		ret = arch_make_folio_accessible(folio);
		if (ret) {
			unpin_user_page(page);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !folio_test_dirty(folio))
			folio_mark_dirty(folio);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * folio_mark_accessed().
		 */
		folio_mark_accessed(folio);
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags, address);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	pmdval = pmdp_get_lockless(pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags, address);
	if (!pmd_present(pmdval))
		return no_page_table(vma, flags, address);
	if (likely(!pmd_leaf(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
		return no_page_table(vma, flags, address);

	ptl = pmd_lock(mm, pmd);
	pmdval = *pmd;
	if (unlikely(!pmd_present(pmdval))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags, address);
	}
	if (unlikely(!pmd_leaf(pmdval))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
		spin_unlock(ptl);
		split_huge_pmd(vma, pmd, address);
		/* If pmd was left empty, stuff a page table in there quickly */
		return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_huge_pmd(vma, address, pmd, flags, ctx);
	spin_unlock(ptl);
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pudp, pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pudp = pud_offset(p4dp, address);
	pud = READ_ONCE(*pudp);
	if (!pud_present(pud))
		return no_page_table(vma, flags, address);
	if (pud_leaf(pud)) {
		ptl = pud_lock(mm, pudp);
		page = follow_huge_pud(vma, address, pudp, flags, ctx);
		spin_unlock(ptl);
		if (page)
			return page;
		return no_page_table(vma, flags, address);
	}
	if (unlikely(pud_bad(pud)))
		return no_page_table(vma, flags, address);

	return follow_pmd_mask(vma, address, pudp, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4dp, p4d;

	p4dp = p4d_offset(pgdp, address);
	p4d = READ_ONCE(*p4dp);
	BUILD_BUG_ON(p4d_leaf(p4d));

	if (!p4d_present(p4d) || p4d_bad(p4d))
		return no_page_table(vma, flags, address);

	return follow_pud_mask(vma, address, p4dp, flags, ctx);
}

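/*
 * Illustrative note (not part of this file): ctx->page_mask, as set by the
 * huge-leaf helpers above, is what lets the loop in __get_user_pages() step
 * over a whole leaf at once. With HPAGE_PMD_NR == 512 and a start address
 * that is 5 small pages into a PMD leaf:
 *
 *	ctx.page_mask = HPAGE_PMD_NR - 1;			// 511
 *	page_increm   = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
 *		      = 1 + (~5 & 511) = 1 + 506 = 507
 *
 * so one lookup accounts for the remaining 507 pages of that leaf instead
 * of walking them one by one.
 */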

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * When getting an anonymous page and the caller has to trigger unsharing
 * of a shared anonymous page first, -EMLINK is returned. The caller should
 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
 * relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	vma_pgtable_walk_begin(vma);

	ctx->page_mask = 0;
	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		page = no_page_table(vma, flags, address);
	else
		page = follow_p4d_mask(vma, address, pgd, flags, ctx);

	vma_pgtable_walk_end(vma);

	return page;
}

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t entry;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return -EFAULT;
	entry = ptep_get(pte);
	if (pte_none(entry))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, entry);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
			goto unmap;
		*page = pte_page(entry);
	}
	ret = try_grab_folio(page_folio(*page), 1, gup_flags);
	if (unlikely(ret))
		goto unmap;
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not
 * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set
 * to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags, bool unshare,
		int *locked)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	if (flags & FOLL_NOFAULT)
		return -EFAULT;
	if (flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (flags & FOLL_UNLOCKABLE) {
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
		/*
		 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
		 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
		 * That's because some callers may not be prepared to
		 * handle early exits caused by non-fatal signals.
		 */
		if (flags & FOLL_INTERRUPTIBLE)
			fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
	}
	if (flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (flags & FOLL_TRIED) {
		/*
		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
		 * can co-exist
		 */
		fault_flags |= FAULT_FLAG_TRIED;
	}
	if (unshare) {
		fault_flags |= FAULT_FLAG_UNSHARE;
		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
	}

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
		 * mmap lock in the page fault handler. Sanity check this.
		 */
		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
		*locked = 0;

		/*
		 * We should do the same as VM_FAULT_RETRY, but let's not
		 * return -EBUSY since that's not reflecting the reality of
		 * what has happened - we've just fully completed a page
		 * fault, with the mmap lock released. Use -EAGAIN to show
		 * that we want to take the mmap lock _again_.
		 */
		return -EAGAIN;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, flags);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*locked = 0;
		return -EBUSY;
	}

	return 0;
}

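/*
 * Illustrative note (not part of this file): how the unshare path above is
 * driven by the main loop in __get_user_pages(). This is a simplified
 * sketch of that loop's error handling, not a copy of it:
 *
 *	page = follow_page_mask(vma, start, gup_flags, &ctx);
 *	if (!page || PTR_ERR(page) == -EMLINK) {
 *		// -EMLINK: a shared anon page must be unshared before a
 *		// FOLL_PIN && !FOLL_WRITE request may take the pin.
 *		ret = faultin_page(vma, start, gup_flags,
 *				   PTR_ERR(page) == -EMLINK, locked);
 *		// 0: retry the lookup; -EBUSY/-EAGAIN: the lock was dropped.
 *	}
 */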

/*
 * Writing to file-backed mappings which require folio dirty tracking using GUP
 * is a fundamentally broken operation, as kernel write access to GUP mappings
 * does not adhere to the semantics expected by a file system.
 *
 * Consider the following scenario:-
 *
 * 1. A folio is written to via GUP which write-faults the memory, notifying
 *    the file system and dirtying the folio.
 * 2. Later, writeback is triggered, resulting in the folio being cleaned and
 *    the PTE being marked read-only.
 * 3. The GUP caller writes to the folio, as it is mapped read/write via the
 *    direct mapping.
 * 4. The GUP caller, now done with the page, unpins it and sets it dirty
 *    (though it does not have to).
 *
 * This results in both data being written to a folio without writenotify, and
 * the folio being dirtied unexpectedly (if the caller decides to do so).
 */
static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
					  unsigned long gup_flags)
{
	/*
	 * If we aren't pinning then no problematic write can occur. A long term
	 * pin is the most egregious case so this is the case we disallow.
	 */
	if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
	    (FOLL_PIN | FOLL_LONGTERM))
		return true;

	/*
	 * If the VMA does not require dirty tracking then no problematic write
	 * can occur either.
	 */
	return !vma_needs_dirty_tracking(vma);
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);
	bool vma_anon = vma_is_anonymous(vma);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if ((gup_flags & FOLL_ANON) && !vma_anon)
		return -EFAULT;

	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
		return -EOPNOTSUPP;

	if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
		return -EOPNOTSUPP;

	if (vma_is_secretmem(vma))
		return -EFAULT;

	if (write) {
		if (!vma_anon &&
		    !writable_file_mapping_allowed(vma, gup_flags))
			return -EFAULT;

		if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/*
 * This is "vma_lookup()", but with a warning if we would have
 * historically expanded the stack in the GUP code.
 */
static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
	 unsigned long addr)
{
#ifdef CONFIG_STACK_GROWSUP
	return vma_lookup(mm, addr);
#else
	static volatile unsigned long next_warn;
	struct vm_area_struct *vma;
	unsigned long now, next;

	vma = find_vma(mm, addr);
	if (!vma || (addr >= vma->vm_start))
		return vma;

	/* Only warn for half-way relevant accesses */
	if (!(vma->vm_flags & VM_GROWSDOWN))
		return NULL;
	if (vma->vm_start - addr > 65536)
		return NULL;

	/* Let's not warn more than once an hour.. */
	now = jiffies; next = next_warn;
	if (next && time_before(now, next))
		return NULL;
	next_warn = now + 60*60*HZ;

	/* Let people know things may have changed. */
	pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
		current->comm, task_pid_nr(current),
		vma->vm_start, vma->vm_end, addr);
	dump_stack();
	return NULL;
#endif
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @locked:     whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
 * be released. If this happens *@locked will be set to 0 on return.
 *
 * A caller using such a combination of @gup_flags must therefore hold the
 * mmap_lock for reading only, and recognize when it's been released. Otherwise,
 * it must be held for either reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		int *locked)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr_remote(mm, start);

	VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
			(FOLL_PIN | FOLL_GET));

	do {
		struct page *page;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			/*
			 * MADV_POPULATE_(READ|WRITE) wants to handle VMA
			 * lookups+error reporting differently.
			 */
			if (gup_flags & FOLL_MADV_POPULATE) {
				vma = vma_lookup(mm, start);
				if (!vma) {
					ret = -ENOMEM;
					goto out;
				}
				if (check_vma_flags(vma, gup_flags)) {
					ret = -EINVAL;
					goto out;
				}
				goto retry;
			}
			vma = gup_vma_lookup(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &page : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma) {
				ret = -EFAULT;
				goto out;
			}
			ret = check_vma_flags(vma, gup_flags);
			if (ret)
				goto out;
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, gup_flags, &ctx);
		if (!page || PTR_ERR(page) == -EMLINK) {
			ret = faultin_page(vma, start, gup_flags,
					   PTR_ERR(page) == -EMLINK, locked);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
			case -EAGAIN:
				ret = 0;
				fallthrough;
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page. If the caller expects **pages to be
			 * filled in, bail out now, because that can't be done
			 * for this page.
			 */
			if (pages) {
				ret = PTR_ERR(page);
				goto out;
			}
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
next_page:
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;

		if (pages) {
			struct page *subpage;
			unsigned int j;

			/*
			 * This must be a large folio (and doesn't need to
			 * be the whole folio; it can be part of it), do
			 * the refcount work for all the subpages too.
			 *
			 * NOTE: here the page may not be the head page
			 * e.g. when start addr is not thp-size aligned.
			 * try_grab_folio() should have taken care of tail
			 * pages.
			 */
			if (page_increm > 1) {
				struct folio *folio = page_folio(page);

				/*
				 * Since we already hold refcount on the
				 * large folio, this should never fail.
				 */
				if (try_grab_folio(folio, page_increm - 1,
						   gup_flags)) {
					/*
					 * Release the 1st page ref if the
					 * folio is problematic, fail hard.
					 */
					gup_put_folio(folio, 1, gup_flags);
					ret = -EFAULT;
					goto out;
				}
			}

			for (j = 0; j < page_increm; j++) {
				subpage = nth_page(page, j);
				pages[i + j] = subpage;
				flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
				flush_dcache_page(subpage);
			}
		}

		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}

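/*
 * Illustrative sketch (not part of this file): how a typical caller of the
 * higher-level wrappers copes with the "fewer pages than requested" return
 * convention documented above. Assumes a FOLL_GET-style wrapper, so the
 * pages are dropped with release_pages():
 *
 *	long got = 0;
 *
 *	while (got < nr_pages) {
 *		long ret = get_user_pages_fast(start + (got << PAGE_SHIFT),
 *					       nr_pages - got, FOLL_WRITE,
 *					       pages + got);
 *		if (ret <= 0)
 *			break;		// -errno or no progress: give up
 *		got += ret;
 *	}
 *	if (got < nr_pages)
 *		release_pages(pages, got);	// undo the partial batch
 */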

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/**
 * fixup_user_fault() - manually resolve a user page fault
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags: flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_lock while retrying, maybe NULL if caller
 *		does not allow retry. If NULL, the caller must guarantee
 *		that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software. On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_lock. So it does not
 * have the same semantics wrt the @mm->mmap_lock as does filemap_fault().
 */
int fixup_user_fault(struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags,
		     bool *unlocked)
{
	struct vm_area_struct *vma;
	vm_fault_t ret;

	address = untagged_addr_remote(mm, address);

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

retry:
	vma = gup_vma_lookup(mm, address);
	if (!vma)
		return -EFAULT;

	if (!vma_permits_fault(vma, fault_flags))
		return -EFAULT;

	if ((fault_flags & FAULT_FLAG_KILLABLE) &&
	    fatal_signal_pending(current))
		return -EINTR;

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * NOTE: it's a pity that we need to retake the lock here
		 * to pair with the unlock() in the callers. Ideally we
		 * could tell the callers so they do not need to unlock.
		 */
		mmap_read_lock(mm);
		*unlocked = true;
		return 0;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, 0);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		mmap_read_lock(mm);
		*unlocked = true;
		fault_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);

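/*
 * Illustrative sketch (not part of this file): the futex-style pattern the
 * kerneldoc above describes. user_ptr and do_atomic_op() are hypothetical.
 *
 *	bool unlocked = false;
 *	int err;
 *
 *	mmap_read_lock(mm);
 *	do {
 *		pagefault_disable();
 *		err = do_atomic_op(user_ptr);	// may fault -> -EFAULT
 *		pagefault_enable();
 *		if (err != -EFAULT)
 *			break;
 *		err = fixup_user_fault(mm, (unsigned long)user_ptr,
 *				       FAULT_FLAG_WRITE, &unlocked);
 *		// if "unlocked" became true, the lock was dropped and
 *		// retaken; any cached VMA pointers are now stale.
 *	} while (!err);
 *	mmap_read_unlock(mm);
 */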

/*
 * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
 * specified, it'll also respond to generic signals. The caller of GUP
 * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
 */
static bool gup_signal_pending(unsigned int flags)
{
	if (fatal_signal_pending(current))
		return true;

	if (!(flags & FOLL_INTERRUPTIBLE))
		return false;

	return signal_pending(current);
}

/*
 * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
 * the caller. This function may drop the mmap_lock. If it does so, then it will
 * set (*locked = 0).
 *
 * (*locked == 0) means that the caller expects this function to acquire and
 * drop the mmap_lock. Therefore, the value of *locked will still be zero when
 * the function returns, even though it may have changed temporarily during
 * function execution.
 *
 * Please note that this function, unlike __get_user_pages(), will not return 0
 * for nr_pages > 0, unless FOLL_NOWAIT is used.
 */
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
						unsigned long start,
						unsigned long nr_pages,
						struct page **pages,
						int *locked,
						unsigned int flags)
{
	long ret, pages_done;
	bool must_unlock = false;

	if (!nr_pages)
		return 0;

	/*
	 * The internal caller expects GUP to manage the lock internally and the
	 * lock must be released when this returns.
	 */
	if (!*locked) {
		if (mmap_read_lock_killable(mm))
			return -EAGAIN;
		must_unlock = true;
		*locked = 1;
	} else
		mmap_assert_locked(mm);

	if (flags & FOLL_PIN)
		mm_set_has_pinned_flag(&mm->flags);

	/*
	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
	 * for FOLL_GET, not for the newer FOLL_PIN.
	 *
	 * FOLL_PIN always expects pages to be non-null, but no need to assert
	 * that here, as any failures will be obvious enough.
	 */
	if (pages && !(flags & FOLL_PIN))
		flags |= FOLL_GET;

	pages_done = 0;
	for (;;) {
		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
				       locked);
		if (!(flags & FOLL_UNLOCKABLE)) {
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			pages_done = ret;
			break;
		}

		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
		VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/*
			 * VM_FAULT_RETRY didn't trigger or it was a
			 * FOLL_NOWAIT.
			 */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/*
		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
		 * For the prefault case (!pages) we only update counts.
		 */
		if (likely(pages))
			pages += ret;
		start += ret << PAGE_SHIFT;

		/* The lock was temporarily dropped, so we must unlock later */
		must_unlock = true;

retry:
		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * with both FAULT_FLAG_ALLOW_RETRY and
		 * FAULT_FLAG_TRIED. Note that GUP can be interrupted
		 * by fatal signals or even common signals, depending on
		 * the caller's request. So we need to check it before we
		 * start trying again otherwise it can loop forever.
		 */
		if (gup_signal_pending(flags)) {
			if (!pages_done)
				pages_done = -EINTR;
			break;
		}

		ret = mmap_read_lock_killable(mm);
		if (ret) {
			if (!pages_done)
				pages_done = ret;
			break;
		}

		*locked = 1;
		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
				       pages, locked);
		if (!*locked) {
			/* Continue to retry until we succeeded */
			VM_WARN_ON_ONCE(ret != 0);
			goto retry;
		}
		if (ret != 1) {
			VM_WARN_ON_ONCE(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		if (likely(pages))
			pages++;
		start += PAGE_SIZE;
	}
	if (must_unlock && *locked) {
		/*
		 * We either temporarily dropped the lock, or the caller
		 * requested that we both acquire and drop the lock. Either way,
		 * we must now unlock, and notify the caller of that state.
		 */
		mmap_read_unlock(mm);
		*locked = 0;
	}

	/*
	 * Failing to pin anything implies something has gone wrong (except when
	 * FOLL_NOWAIT is specified).
	 */
	if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
		return -EFAULT;

	return pages_done;
}

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @locked: whether the mmap_lock is still held
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Return either number of pages pinned in the vma, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_lock must be held.
 *
 * If @locked is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @locked is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@locked will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *locked)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int local_locked = 1;
	int gup_flags;
	long ret;

	VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
	VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
	VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
	VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma);
	mmap_assert_locked(mm);

	/*
	 * Rightly or wrongly, the VM_LOCKONFAULT case has never used
	 * faultin_page() to break COW, so it has no work to do here.
	 */
	if (vma->vm_flags & VM_LOCKONFAULT)
		return nr_pages;

	/* ... similarly, we've never faulted in PROT_NONE pages */
	if (!vma_is_accessible(vma))
		return -EFAULT;

	gup_flags = FOLL_TOUCH;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 *
	 * Otherwise, do a read fault, and use FOLL_FORCE in case it's not
	 * readable (ie write-only or executable).
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;
	else
		gup_flags |= FOLL_FORCE;

	if (locked)
		gup_flags |= FOLL_UNLOCKABLE;

	/*
	 * We made sure addr is within a VMA, so the following will
	 * not result in a stack expansion that recurses back here.
	 */
	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
			       NULL, locked ? locked : &local_locked);
	lru_add_drain();
	return ret;
}

/*
 * faultin_page_range() - populate (prefault) page tables inside the
 *			  given range readable/writable
 *
 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
 *
 * @mm: the mm to populate page tables in
 * @start: start address
 * @end: end address
 * @write: whether to prefault readable or writable
 * @locked: whether the mmap_lock is still held
 *
 * Returns either number of processed pages in the MM, or a negative error
 * code on error (see __get_user_pages()). Note that this function reports
 * errors related to VMAs, such as incompatible mappings, as expected by
 * MADV_POPULATE_(READ|WRITE).
 *
 * The range must be page-aligned.
 *
 * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
 */
long faultin_page_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, bool write, int *locked)
{
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;
	long ret;

	VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
	VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
	mmap_assert_locked(mm);

	/*
	 * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
	 *	       the page dirty with FOLL_WRITE -- which doesn't make a
	 *	       difference with !FOLL_FORCE, because the page is writable
	 *	       in the page table.
	 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
	 *		  a poisoned page.
	 * !FOLL_FORCE: Require proper access permissions.
	 */
	gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
		    FOLL_MADV_POPULATE;
	if (write)
		gup_flags |= FOLL_WRITE;

	ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
				      gup_flags);
	lru_add_drain();
	return ret;
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_lock must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 */
		if (!locked) {
			locked = 1;
			mmap_read_lock(mm);
			vma = find_vma_intersection(mm, nstart, end);
		} else if (nstart >= vma->vm_end)
			vma = find_vma_intersection(mm, vma->vm_end, end);

		if (!vma)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. populate_vma_page_range()
		 * double checks the vma flags, so that it won't mlock pages
		 * if the vma was already munlocked.
		 */
		ret = populate_vma_page_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {
				ret = 0;
				continue;	/* continue at next VMA */
			}
			break;
		}
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}
	if (locked)
		mmap_read_unlock(mm);
	return ret;	/* 0 or negative error code */
}
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
		unsigned long nr_pages, struct page **pages,
		int *locked, unsigned int foll_flags)
{
	struct vm_area_struct *vma;
	bool must_unlock = false;
	vm_flags_t vm_flags;
	long i;

	if (!nr_pages)
		return 0;

	/*
	 * The internal caller expects GUP to manage the lock internally and the
	 * lock must be released when this returns.
	 */
	if (!*locked) {
		if (mmap_read_lock_killable(mm))
			return -EAGAIN;
		must_unlock = true;
		*locked = 1;
	}

	/* calculate required read or write permissions.
	 * If FOLL_FORCE is set, we only require the "MAY" flags.
	 */
	vm_flags  = (foll_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (foll_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

	for (i = 0; i < nr_pages; i++) {
		vma = find_vma(mm, start);
		if (!vma)
			break;

		/* protect what we can, including chardevs */
		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			break;

		if (pages) {
			pages[i] = virt_to_page((void *)start);
			if (pages[i])
				get_page(pages[i]);
		}

		start = (start + PAGE_SIZE) & PAGE_MASK;
	}

	if (must_unlock && *locked) {
		mmap_read_unlock(mm);
		*locked = 0;
	}

	return i ? : -EFAULT;
}
#endif /* !CONFIG_MMU */

/**
 * fault_in_writeable - fault in userspace address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
	const unsigned long start = (unsigned long)uaddr;
	const unsigned long end = start + size;
	unsigned long cur;

	if (unlikely(size == 0))
		return 0;
	if (!user_write_access_begin(uaddr, size))
		return size;

	/* Stop once we overflow to 0. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		unsafe_put_user(0, (char __user *)cur, out);
out:
	user_write_access_end();
	if (size > cur - start)
		return size - (cur - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_writeable);

/**
 * fault_in_subpage_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Fault in a user address range for writing while checking for permissions at
 * sub-page granularity (e.g. arm64 MTE). This function should be used when
 * the caller cannot guarantee forward progress of a copy_to_user() loop.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
{
	size_t faulted_in;

	/*
	 * Attempt faulting in at page granularity first for page table
	 * permission checking. The arch-specific probe_subpage_writeable()
	 * functions may not check for this.
	 */
	faulted_in = size - fault_in_writeable(uaddr, size);
	if (faulted_in)
		faulted_in -= probe_subpage_writeable(uaddr, faulted_in);

	return size - faulted_in;
}
EXPORT_SYMBOL(fault_in_subpage_writeable);

/*
 * fault_in_safe_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: length of address range
 *
 * Faults in an address range for writing. This is primarily useful when we
 * already know that some or all of the pages in the address range aren't in
 * memory.
 *
 * Unlike fault_in_writeable(), this function is non-destructive.
 *
 * Note that we don't pin or otherwise hold the pages referenced that we fault
 * in. There's no guarantee that they'll stay in memory for any duration of
 * time.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 */
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
	const unsigned long start = (unsigned long)uaddr;
	const unsigned long end = start + size;
	unsigned long cur;
	struct mm_struct *mm = current->mm;
	bool unlocked = false;

	if (unlikely(size == 0))
		return 0;

	mmap_read_lock(mm);
	/* Stop once we overflow to 0. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
			break;
	mmap_read_unlock(mm);

	if (size > cur - start)
		return size - (cur - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);

/**
 * fault_in_readable - fault in userspace address range for reading
 * @uaddr: start of user address range
 * @size: size of user address range
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
	const unsigned long start = (unsigned long)uaddr;
	const unsigned long end = start + size;
	unsigned long cur;
	volatile char c;

	if (unlikely(size == 0))
		return 0;
	if (!user_read_access_begin(uaddr, size))
		return size;

	/* Stop once we overflow to 0. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		unsafe_get_user(c, (const char __user *)cur, out);
out:
	user_read_access_end();
	(void)c;
	if (size > cur - start)
		return size - (cur - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_readable);
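
/*
 * Illustrative usage sketch (not part of this file): the fault_in_*() helpers
 * are typically paired with a user copy that is not allowed to fault, e.g. one
 * performed under a lock. The caller prefaults the user range, then retries
 * the nofault copy until it succeeds. "example_copy_out" and "my_lock" are
 * hypothetical.
 */
#if 0
static int example_copy_out(char __user *ubuf, const void *kbuf, size_t len,
			    spinlock_t *my_lock)
{
	long err;

	do {
		/* Give up if part of the range could not be faulted in. */
		if (fault_in_writeable(ubuf, len))
			return -EFAULT;

		spin_lock(my_lock);
		/* Page faults must not be taken here, so use the nofault copy. */
		err = copy_to_user_nofault(ubuf, kbuf, len);
		spin_unlock(my_lock);
	} while (err);	/* the pages may have been reclaimed again; retry */

	return 0;
}
#endif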

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 * @locked: a pointer to an int denoting whether the mmap sem is held
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save disk space.
 *
 * Called without mmap_lock (takes and releases the mmap_lock by itself).
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr, int *locked)
{
	struct page *page;
	int ret;

	ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
				      FOLL_FORCE | FOLL_DUMP | FOLL_GET);
	return (ret == 1) ? page : NULL;
}
#endif /* CONFIG_ELF_CORE */

#ifdef CONFIG_MIGRATION

/*
 * An array of either pages or folios ("pofs"). Although it may seem tempting to
 * avoid this complication, by simply interpreting a list of folios as a list of
 * pages, that approach won't work in the longer term, because eventually the
 * layouts of struct page and struct folio will become completely different.
 * Furthermore, this pof approach avoids excessive page_folio() calls.
 */
struct pages_or_folios {
	union {
		struct page **pages;
		struct folio **folios;
		void **entries;
	};
	bool has_folios;
	long nr_entries;
};

static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
{
	if (pofs->has_folios)
		return pofs->folios[i];
	return page_folio(pofs->pages[i]);
}

static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
{
	pofs->entries[i] = NULL;
}

static void pofs_unpin(struct pages_or_folios *pofs)
{
	if (pofs->has_folios)
		unpin_folios(pofs->folios, pofs->nr_entries);
	else
		unpin_user_pages(pofs->pages, pofs->nr_entries);
}

static struct folio *pofs_next_folio(struct folio *folio,
		struct pages_or_folios *pofs, long *index_ptr)
{
	long i = *index_ptr + 1;

	if (!pofs->has_folios && folio_test_large(folio)) {
		const unsigned long start_pfn = folio_pfn(folio);
		const unsigned long end_pfn = start_pfn + folio_nr_pages(folio);

		for (; i < pofs->nr_entries; i++) {
			unsigned long pfn = page_to_pfn(pofs->pages[i]);

			/* Is this page part of this folio? */
			if (pfn < start_pfn || pfn >= end_pfn)
				break;
		}
	}

	if (unlikely(i == pofs->nr_entries))
		return NULL;
	*index_ptr = i;

	return pofs_get_folio(pofs, i);
}

/*
 * Returns the number of collected folios. Return value is always >= 0.
 */
static unsigned long collect_longterm_unpinnable_folios(
		struct list_head *movable_folio_list,
		struct pages_or_folios *pofs)
{
	unsigned long collected = 0;
	bool drain_allow = true;
	struct folio *folio;
	long i = 0;

	for (folio = pofs_get_folio(pofs, i); folio;
	     folio = pofs_next_folio(folio, pofs, &i)) {

		if (folio_is_longterm_pinnable(folio))
			continue;

		collected++;

		if (folio_is_device_coherent(folio))
			continue;

		if (folio_test_hugetlb(folio)) {
			folio_isolate_hugetlb(folio, movable_folio_list);
			continue;
		}

		if (!folio_test_lru(folio) && drain_allow) {
			lru_add_drain_all();
			drain_allow = false;
		}

		if (!folio_isolate_lru(folio))
			continue;

		list_add_tail(&folio->lru, movable_folio_list);
		node_stat_mod_folio(folio,
				    NR_ISOLATED_ANON + folio_is_file_lru(folio),
				    folio_nr_pages(folio));
	}

	return collected;
}

/*
 * Unpins all folios and migrates device coherent folios and folios on the
 * movable_folio_list. Returns -EAGAIN if all folios were successfully migrated
 * or -errno for failure (or partial success).
 */
static int
migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
				   struct pages_or_folios *pofs)
{
	int ret;
	unsigned long i;

	for (i = 0; i < pofs->nr_entries; i++) {
		struct folio *folio = pofs_get_folio(pofs, i);

		if (folio_is_device_coherent(folio)) {
			/*
			 * Migration will fail if the folio is pinned, so
			 * convert the pin on the source folio to a normal
			 * reference.
			 */
			pofs_clear_entry(pofs, i);
			folio_get(folio);
			gup_put_folio(folio, 1, FOLL_PIN);

			if (migrate_device_coherent_folio(folio)) {
				ret = -EBUSY;
				goto err;
			}

			continue;
		}

		/*
		 * We can't migrate folios with unexpected references, so drop
		 * the reference obtained by __get_user_pages_locked().
		 * Migrating folios have been added to movable_folio_list after
		 * calling folio_isolate_lru() which takes a reference so the
		 * folio won't be freed if it's migrating.
		 */
		unpin_folio(folio);
		pofs_clear_entry(pofs, i);
	}

	if (!list_empty(movable_folio_list)) {
		struct migration_target_control mtc = {
			.nid = NUMA_NO_NODE,
			.gfp_mask = GFP_USER | __GFP_NOWARN,
			.reason = MR_LONGTERM_PIN,
		};

		if (migrate_pages(movable_folio_list, alloc_migration_target,
				  NULL, (unsigned long)&mtc, MIGRATE_SYNC,
				  MR_LONGTERM_PIN, NULL)) {
			ret = -ENOMEM;
			goto err;
		}
	}

	putback_movable_pages(movable_folio_list);

	return -EAGAIN;

err:
	pofs_unpin(pofs);
	putback_movable_pages(movable_folio_list);

	return ret;
}

static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
	LIST_HEAD(movable_folio_list);
	unsigned long collected;

	collected = collect_longterm_unpinnable_folios(&movable_folio_list,
						       pofs);
	if (!collected)
		return 0;

	return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
}

/*
 * Check whether all folios are *allowed* to be pinned indefinitely (long term).
 * Rather confusingly, all folios in the range are required to be pinned via
 * FOLL_PIN, before calling this routine.
 *
 * Return values:
 *
 * 0: if everything is OK and all folios in the range are allowed to be pinned,
 * then this routine leaves all folios pinned and returns zero for success.
 *
 * -EAGAIN: if any folios in the range are not allowed to be pinned, then this
 * routine will migrate those folios away and unpin all the folios in the range.
 * If migration of the entire set of folios succeeds, then -EAGAIN is returned.
 * The caller should re-pin the entire range with FOLL_PIN and then call this
 * routine again.
 *
 * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this
 * indicates a migration failure. The caller should give up, and propagate the
 * error back up the call stack. The caller does not need to unpin any folios in
 * that case, because this routine will do the unpinning.
 */
static long check_and_migrate_movable_folios(unsigned long nr_folios,
					     struct folio **folios)
{
	struct pages_or_folios pofs = {
		.folios = folios,
		.has_folios = true,
		.nr_entries = nr_folios,
	};

	return check_and_migrate_movable_pages_or_folios(&pofs);
}

/*
 * Return values and behavior are the same as those for
 * check_and_migrate_movable_folios().
 */
static long check_and_migrate_movable_pages(unsigned long nr_pages,
					    struct page **pages)
{
	struct pages_or_folios pofs = {
		.pages = pages,
		.has_folios = false,
		.nr_entries = nr_pages,
	};

	return check_and_migrate_movable_pages_or_folios(&pofs);
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
					    struct page **pages)
{
	return 0;
}

static long check_and_migrate_movable_folios(unsigned long nr_folios,
					     struct folio **folios)
{
	return 0;
}
#endif /* CONFIG_MIGRATION */

/*
 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
 * allows us to process the FOLL_LONGTERM flag.
 */
static long __gup_longterm_locked(struct mm_struct *mm,
				  unsigned long start,
				  unsigned long nr_pages,
				  struct page **pages,
				  int *locked,
				  unsigned int gup_flags)
{
	unsigned int flags;
	long rc, nr_pinned_pages;

	if (!(gup_flags & FOLL_LONGTERM))
		return __get_user_pages_locked(mm, start, nr_pages, pages,
					       locked, gup_flags);

	flags = memalloc_pin_save();
	do {
		nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
							  pages, locked,
							  gup_flags);
		if (nr_pinned_pages <= 0) {
			rc = nr_pinned_pages;
			break;
		}

		/* FOLL_LONGTERM implies FOLL_PIN */
		rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
	} while (rc == -EAGAIN);
	memalloc_pin_restore(flags);
	return rc ? rc : nr_pinned_pages;
}

/*
 * Check that the given flags are valid for the exported gup/pup interface, and
 * update them with the required flags that the caller must have set.
 */
static bool is_valid_gup_args(struct page **pages, int *locked,
			      unsigned int *gup_flags_p, unsigned int to_set)
{
	unsigned int gup_flags = *gup_flags_p;

	/*
	 * These flags are not allowed to be specified externally to the gup
	 * interfaces:
	 * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
	 * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote()
	 * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
	 */
	if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))
		return false;

	gup_flags |= to_set;
	if (locked) {
		/* At the external interface locked must be set */
		if (WARN_ON_ONCE(*locked != 1))
			return false;

		gup_flags |= FOLL_UNLOCKABLE;
	}

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return false;

	/* LONGTERM can only be specified when pinning */
	if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
		return false;

	/* Pages input must be given if using GET/PIN */
	if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
		return false;

	/* We want to allow the pgmap to be hot-unplugged at all times */
	if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
			 (gup_flags & FOLL_PCI_P2PDMA)))
		return false;

	*gup_flags_p = gup_flags;
	return true;
}

#ifdef CONFIG_MMU
/**
 * get_user_pages_remote() - pin user pages in memory
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held for read or write.
 *
 * get_user_pages_remote walks a process's page tables and takes a reference
 * to each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages_remote returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages_remote is typically used for fewer-copy IO operations,
 * to get a handle on the memory by some means other than accesses
 * via the user virtual addresses. The pages may be submitted for
 * DMA to devices or accessed via their kernel linear mapping (via the
 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages_remote should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages_remote because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
long get_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	int local_locked = 1;

	if (!is_valid_gup_args(pages, locked, &gup_flags,
			       FOLL_TOUCH | FOLL_REMOTE))
		return -EINVAL;

	return __get_user_pages_locked(mm, start, nr_pages, pages,
				       locked ? locked : &local_locked,
				       gup_flags);
}
EXPORT_SYMBOL(get_user_pages_remote);

#else /* CONFIG_MMU */
long get_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	return 0;
}
#endif /* !CONFIG_MMU */

/**
 * get_user_pages() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 *
 * This is the same as get_user_pages_remote(), just with a less-flexible
 * calling convention where we assume that the mm being operated on belongs to
 * the current task, and doesn't allow passing of a locked parameter.  We also
 * obviously don't pass FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages)
{
	int locked = 1;

	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
		return -EINVAL;

	return __get_user_pages_locked(current->mm, start, nr_pages, pages,
				       &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages);

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      mmap_read_lock(mm);
 *      get_user_pages(mm, ..., pages, NULL);
 *      mmap_read_unlock(mm);
 *
 * with:
 *
 *      get_user_pages_unlocked(mm, ..., pages);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	int locked = 0;

	if (!is_valid_gup_args(pages, NULL, &gup_flags,
			       FOLL_TOUCH | FOLL_UNLOCKABLE))
		return -EINVAL;

	return __get_user_pages_locked(current->mm, start, nr_pages, pages,
				       &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
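
/*
 * Illustrative usage sketch (not part of this file): a short-term user of
 * get_user_pages_unlocked() takes page references on the current task's
 * address space, works on the pages, and drops the references again.
 * "work_on_page()" is a hypothetical helper.
 */
#if 0
static long example_gup_unlocked(unsigned long uaddr, unsigned long npages,
				 struct page **pages)
{
	long i, got;

	got = get_user_pages_unlocked(uaddr, npages, pages, FOLL_WRITE);
	if (got < 0)
		return got;

	for (i = 0; i < got; i++) {
		work_on_page(pages[i]);
		/* Pages that were written to must be dirtied before release. */
		set_page_dirty_lock(pages[i]);
		put_page(pages[i]);
	}
	return got;
}
#endif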

/*
 * GUP-fast
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the gup_fast() walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is
 *  used to free pages containing page tables or TLB flushing requires IPI
 *  broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) valid user addresses are below TASK_SIZE_MAX
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GUP_FAST
/*
 * Used in the GUP-fast path to determine whether GUP is permitted to work on
 * a specific folio.
 *
 * This call assumes the caller has pinned the folio, that the lowest page table
 * level still points to this folio, and that interrupts have been disabled.
 *
 * GUP-fast must reject all secretmem folios.
 *
 * Writing to pinned file-backed dirty tracked folios is inherently problematic
 * (see comment describing the writable_file_mapping_allowed() function). We
 * therefore try to avoid the most egregious case of a long-term mapping doing
 * so.
 *
 * This function cannot be as thorough as that one as the VMA is not available
 * in the fast path, so instead we whitelist known good cases and if in doubt,
 * fall back to the slow path.
 */
static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
{
	bool reject_file_backed = false;
	struct address_space *mapping;
	bool check_secretmem = false;
	unsigned long mapping_flags;

	/*
	 * If we aren't pinning then no problematic write can occur. A long term
	 * pin is the most egregious case so this is the one we disallow.
	 */
	if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
	    (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
		reject_file_backed = true;

	/* We hold a folio reference, so we can safely access folio fields. */

	/* secretmem folios are always order-0 folios. */
	if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
		check_secretmem = true;

	if (!reject_file_backed && !check_secretmem)
		return true;

	if (WARN_ON_ONCE(folio_test_slab(folio)))
		return false;

	/* hugetlb neither requires dirty-tracking nor can be secretmem. */
	if (folio_test_hugetlb(folio))
		return true;

	/*
	 * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
	 * cannot proceed, which means no actions performed under RCU can
	 * proceed either.
	 *
	 * inodes and thus their mappings are freed under RCU, which means the
	 * mapping cannot be freed beneath us and thus we can safely dereference
	 * it.
	 */
	lockdep_assert_irqs_disabled();

	/*
	 * However, there may be operations which _alter_ the mapping, so ensure
	 * we read it once and only once.
	 */
	mapping = READ_ONCE(folio->mapping);

	/*
	 * The mapping may have been truncated, in any case we cannot determine
	 * if this mapping is safe - fall back to slow path to determine how to
	 * proceed.
	 */
	if (!mapping)
		return false;

	/* Anonymous folios pose no problem. */
	mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
	if (mapping_flags)
		return mapping_flags & FOLIO_MAPPING_ANON;

	/*
	 * At this point, we know the mapping is non-null and points to an
	 * address_space object.
	 */
	if (check_secretmem && secretmem_mapping(mapping))
		return false;
	/* The only remaining allowed file system is shmem. */
	return !reject_file_backed || shmem_mapping(mapping);
}

static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,
		unsigned int flags, struct page **pages)
{
	while ((*nr) - nr_start) {
		struct folio *folio = page_folio(pages[--(*nr)]);

		folio_clear_referenced(folio);
		gup_put_folio(folio, 1, flags);
	}
}

#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
 * GUP-fast relies on pte change detection to avoid concurrent pgtable
 * operations.
 *
 * To pin the page, GUP-fast needs to do below in order:
 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
 *
 * For the rest of pgtable operations where pgtable updates can be racy
 * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
 * is pinned.
 *
 * Above will work for all pte-level operations, including THP split.
 *
 * For THP collapse, it's a bit more complicated because GUP-fast may be
 * walking a pgtable page that is being freed (pte is still valid but pmd
 * can be cleared already). To avoid race in such condition, we need to
 * also check pmd here to make sure pmd doesn't change (corresponds to
 * pmdp_collapse_flush() in the THP collapse code path).
 */
static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct dev_pagemap *pgmap = NULL;
	int ret = 0;
	pte_t *ptep, *ptem;

	ptem = ptep = pte_offset_map(&pmd, addr);
	if (!ptep)
		return 0;
	do {
		pte_t pte = ptep_get_lockless(ptep);
		struct page *page;
		struct folio *folio;

		/*
		 * Always fall back to ordinary GUP on PROT_NONE-mapped pages:
		 * pte_access_permitted() should reject these pages
		 * either way: otherwise, GUP-fast might succeed in
		 * cases where ordinary GUP would fail due to VMA access
		 * permissions.
		 */
		if (pte_protnone(pte))
			goto pte_unmap;

		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
			goto pte_unmap;

		if (pte_special(pte))
			goto pte_unmap;

		/* If it's not marked as special it must have a valid memmap. */
		VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);

		folio = try_grab_folio_fast(page, 1, flags);
		if (!folio)
			goto pte_unmap;

		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
		    unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
			gup_put_folio(folio, 1, flags);
			goto pte_unmap;
		}

		if (!gup_fast_folio_allowed(folio, flags)) {
			gup_put_folio(folio, 1, flags);
			goto pte_unmap;
		}

		if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
			gup_put_folio(folio, 1, flags);
			goto pte_unmap;
		}

		/*
		 * We need to make the page accessible if and only if we are
		 * going to access its content (the FOLL_PIN case). Please
		 * see Documentation/core-api/pin_user_pages.rst for
		 * details.
		 */
		if (flags & FOLL_PIN) {
			ret = arch_make_folio_accessible(folio);
			if (ret) {
				gup_put_folio(folio, 1, flags);
				goto pte_unmap;
			}
		}
		folio_set_referenced(folio);
		pages[*nr] = page;
		(*nr)++;
	} while (ptep++, addr += PAGE_SIZE, addr != end);

	ret = 1;

pte_unmap:
	if (pgmap)
		put_dev_pagemap(pgmap);
	pte_unmap(ptem);
	return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
 * useful to have gup_fast_pmd_leaf even if we can't operate on ptes.
 */
static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct page *page;
	struct folio *folio;
	int refs;

	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pmd_special(orig))
		return 0;

	page = pmd_page(orig);
	refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);

	folio = try_grab_folio_fast(page, refs, flags);
	if (!folio)
		return 0;

	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!gup_fast_folio_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}
	if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}

static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct page *page;
	struct folio *folio;
	int refs;

	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pud_special(orig))
		return 0;

	page = pud_page(orig);
	refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);

	folio = try_grab_folio_fast(page, refs, flags);
	if (!folio)
		return 0;

	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!gup_fast_folio_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}

static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset_lockless(pudp, pud, addr);
	do {
		pmd_t pmd = pmdp_get_lockless(pmdp);

		next = pmd_addr_end(addr, end);
		if (!pmd_present(pmd))
			return 0;

		if (unlikely(pmd_leaf(pmd))) {
			/* See gup_fast_pte_range() */
			if (pmd_protnone(pmd))
				return 0;

			if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags,
					       pages, nr))
				return 0;

		} else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
					       pages, nr))
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset_lockless(p4dp, p4d, addr);
	do {
		pud_t pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);
		if (unlikely(!pud_present(pud)))
			return 0;
		if (unlikely(pud_leaf(pud))) {
			if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags,
					       pages, nr))
				return 0;
		} else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
					       pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}

static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	p4d_t *p4dp;

	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
	do {
		p4d_t p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);
		if (!p4d_present(p4d))
			return 0;
		BUILD_BUG_ON(p4d_leaf(p4d));
		if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
					pages, nr))
			return 0;
	} while (p4dp++, addr = next, addr != end);

	return 1;
}

static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pgd_t *pgdp;

	pgdp = pgd_offset(current->mm, addr);
	do {
		pgd_t pgd = READ_ONCE(*pgdp);

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			return;
		BUILD_BUG_ON(pgd_leaf(pgd));
		if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
					pages, nr))
			return;
	} while (pgdp++, addr = next, addr != end);
}
#else
static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
}
#endif /* CONFIG_HAVE_GUP_FAST */

#ifndef gup_fast_permitted
/*
 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
 * we need to fall back to the slow version:
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
	return true;
}
#endif

static unsigned long gup_fast(unsigned long start, unsigned long end,
		unsigned int gup_flags, struct page **pages)
{
	unsigned long flags;
	int nr_pinned = 0;
	unsigned seq;

	if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
	    !gup_fast_permitted(start, end))
		return 0;

	if (gup_flags & FOLL_PIN) {
		if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
			return 0;
	}

	/*
	 * Disable interrupts. The nested form is used, in order to allow full,
	 * general purpose use of this routine.
	 *
	 * With interrupts disabled, we block page table pages from being freed
	 * from under us. See struct mmu_table_batch comments in
	 * include/asm-generic/tlb.h for more details.
	 *
	 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
	 * that come from callers of tlb_remove_table_sync_one().
	 */
	local_irq_save(flags);
	gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
	local_irq_restore(flags);

	/*
	 * When pinning pages for DMA there could be a concurrent write protect
	 * from fork() via copy_page_range(), in this case always fail GUP-fast.
	 */
	if (gup_flags & FOLL_PIN) {
		if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
			gup_fast_unpin_user_pages(pages, nr_pinned);
			return 0;
		} else {
			sanity_check_pinned_pages(pages, nr_pinned);
		}
	}
	return nr_pinned;
}

static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages)
{
	unsigned long len, end;
	unsigned long nr_pinned;
	int locked = 0;
	int ret;

	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
				       FOLL_FAST_ONLY | FOLL_NOFAULT |
				       FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
		return -EINVAL;

	if (gup_flags & FOLL_PIN)
		mm_set_has_pinned_flag(&current->mm->flags);

	if (!(gup_flags & FOLL_FAST_ONLY))
		might_lock_read(&current->mm->mmap_lock);

	start = untagged_addr(start) & PAGE_MASK;
	len = nr_pages << PAGE_SHIFT;
	if (check_add_overflow(start, len, &end))
		return -EOVERFLOW;
	if (end > TASK_SIZE_MAX)
		return -EFAULT;

	nr_pinned = gup_fast(start, end, gup_flags, pages);
	if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
		return nr_pinned;

	/* Slow path: try to get the remaining pages with get_user_pages */
	start += nr_pinned << PAGE_SHIFT;
	pages += nr_pinned;
	ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
				    pages, &locked,
				    gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
	if (ret < 0) {
		/*
		 * The caller has to unpin the pages we already pinned so
		 * returning -errno is not an option
		 */
		if (nr_pinned)
			return nr_pinned;
		return ret;
	}
	return ret + nr_pinned;
}

/**
 * get_user_pages_fast_only() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP.
 *
 * If the architecture does not support this function, simply return with no
 * pages pinned.
 *
 * Careful, careful! COW breaking can go either way, so a non-write
 * access can get ambiguous page results. If you call this function without
 * 'write' set, you'd better be sure that you're ok with that ambiguity.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
			     unsigned int gup_flags, struct page **pages)
{
	/*
	 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
	 * because gup fast is always a "pin with a +1 page refcount" request.
	 *
	 * FOLL_FAST_ONLY is required in order to match the API description of
	 * this routine: no fall back to regular ("slow") GUP.
	 */
	if (!is_valid_gup_args(pages, NULL, &gup_flags,
			       FOLL_GET | FOLL_FAST_ONLY))
		return -EINVAL;

	return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_lock.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number requested.
 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
 * -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	/*
	 * The caller may or may not have explicitly set FOLL_GET; either way is
	 * OK. However, internally (within mm/gup.c), gup fast variants must set
	 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
	 * request.
	 */
	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
		return -EINVAL;
	return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
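
/*
 * Illustrative usage sketch (not part of this file): callers such as the futex
 * code first try the non-blocking variant and only fall back to the blocking
 * one once it is safe to sleep. The reference obtained on success is dropped
 * with put_page() when the caller is done with the page.
 */
#if 0
static int example_get_one_page(unsigned long uaddr, struct page **pagep)
{
	int ret;

	/* Opportunistic attempt: never blocks, never falls back internally. */
	ret = get_user_pages_fast_only(uaddr, 1, FOLL_WRITE, pagep);
	if (ret == 1)
		return 0;

	/* Blocking attempt: may take mmap_lock and fault the page in. */
	ret = get_user_pages_fast(uaddr, 1, FOLL_WRITE, pagep);
	return (ret == 1) ? 0 : -EFAULT;
}
#endif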

/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 * get_user_pages_fast() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for further details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page() will not remove pins from it.
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
		return -EINVAL;
	return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);

/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	int local_locked = 1;

	if (!is_valid_gup_args(pages, locked, &gup_flags,
			       FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
		return 0;
	return __gup_longterm_locked(mm, start, nr_pages, pages,
				     locked ? locked : &local_locked,
				     gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_remote);

/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages)
{
	int locked = 1;

	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
		return 0;
	return __gup_longterm_locked(current->mm, start, nr_pages,
				     pages, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
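
/*
 * Illustrative usage sketch (not part of this file): long-term pinning of a
 * user buffer, e.g. to set up device DMA, and the matching release. Note that
 * pin_user_pages() expects the caller to hold mmap_lock. "npages" and the
 * device-programming step are hypothetical.
 */
#if 0
static long example_longterm_pin(unsigned long uaddr, unsigned long npages,
				 struct page **pages)
{
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr, npages, FOLL_WRITE | FOLL_LONGTERM,
				pages);
	mmap_read_unlock(current->mm);
	if (pinned < 0)
		return pinned;

	/* ... hand the pinned pages to the device ... */

	/* Mark the pages dirty and drop the pins in one call. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return pinned;
}
#endif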

/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	int locked = 0;

	if (!is_valid_gup_args(pages, NULL, &gup_flags,
			       FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
		return 0;

	return __gup_longterm_locked(current->mm, start, nr_pages, pages,
				     &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);

/**
 * memfd_pin_folios() - pin folios associated with a memfd
 * @memfd:      the memfd whose folios are to be pinned
 * @start:      the first memfd offset
 * @end:        the last memfd offset (inclusive)
 * @folios:     array that receives pointers to the folios pinned
 * @max_folios: maximum number of entries in @folios
 * @offset:     the offset into the first folio
 *
 * Attempt to pin folios associated with a memfd in the contiguous range
 * [start, end]. Given that a memfd is either backed by shmem or hugetlb,
 * the folios can either be found in the page cache or need to be allocated
 * if necessary. Once the folios are located, they are all pinned via
 * FOLL_PIN and @offset is populated with the offset into the first folio.
 * And, eventually, these pinned folios must be released either using
 * unpin_folios() or unpin_folio().
 *
 * It must be noted that the folios may be pinned for an indefinite amount
 * of time. And, in most cases, the duration of time they may stay pinned
 * would be controlled by userspace. This behavior is effectively the
 * same as using FOLL_LONGTERM with other GUP APIs.
 *
 * Returns number of folios pinned, which could be less than @max_folios
 * as it depends on the folio sizes that cover the range [start, end].
 * If no folios were pinned, it returns -errno.
 */
long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
		      struct folio **folios, unsigned int max_folios,
		      pgoff_t *offset)
{
	unsigned int flags, nr_folios, nr_found;
	unsigned int i, pgshift = PAGE_SHIFT;
	pgoff_t start_idx, end_idx;
	struct folio *folio = NULL;
	struct folio_batch fbatch;
	struct hstate *h;
	long ret = -EINVAL;

	if (start < 0 || start > end || !max_folios)
		return -EINVAL;

	if (!memfd)
		return -EINVAL;

	if (!shmem_file(memfd) && !is_file_hugepages(memfd))
		return -EINVAL;

	if (end >= i_size_read(file_inode(memfd)))
		return -EINVAL;

	if (is_file_hugepages(memfd)) {
		h = hstate_file(memfd);
		pgshift = huge_page_shift(h);
	}

	flags = memalloc_pin_save();
	do {
		nr_folios = 0;
		start_idx = start >> pgshift;
		end_idx = end >> pgshift;
		if (is_file_hugepages(memfd)) {
			start_idx <<= huge_page_order(h);
			end_idx <<= huge_page_order(h);
		}

		folio_batch_init(&fbatch);
		while (start_idx <= end_idx && nr_folios < max_folios) {
			/*
			 * In most cases, we should be able to find the folios
			 * in the page cache. If we cannot find them for some
			 * reason, we try to allocate them and add them to the
			 * page cache.
			 */
			nr_found = filemap_get_folios_contig(memfd->f_mapping,
							     &start_idx,
							     end_idx,
							     &fbatch);
			if (folio) {
				folio_put(folio);
				folio = NULL;
			}

			for (i = 0; i < nr_found; i++) {
				folio = fbatch.folios[i];

				if (try_grab_folio(folio, 1, FOLL_PIN)) {
					folio_batch_release(&fbatch);
					ret = -EINVAL;
					goto err;
				}

				if (nr_folios == 0)
					*offset = offset_in_folio(folio, start);

				folios[nr_folios] = folio;
				if (++nr_folios == max_folios)
					break;
			}

			folio = NULL;
			folio_batch_release(&fbatch);
			if (!nr_found) {
				folio = memfd_alloc_folio(memfd, start_idx);
				if (IS_ERR(folio)) {
					ret = PTR_ERR(folio);
					if (ret != -EEXIST)
						goto err;
					folio = NULL;
				}
			}
		}

		ret = check_and_migrate_movable_folios(nr_folios, folios);
	} while (ret == -EAGAIN);

	memalloc_pin_restore(flags);
	return ret ? ret : nr_folios;
err:
	memalloc_pin_restore(flags);
	unpin_folios(folios, nr_folios);

	return ret;
}
EXPORT_SYMBOL_GPL(memfd_pin_folios);

/**
 * folio_add_pins() - add pins to an already-pinned folio
 * @folio:	the folio to add more pins to
 * @pins:	number of pins to add
 *
 * Try to add more pins to an already-pinned folio. The semantics
 * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
 * be changed.
 *
 * This function is helpful when having obtained a pin on a large folio
 * using memfd_pin_folios(), but wanting to logically unpin parts
 * (e.g., individual pages) of the folio later, for example, using
 * unpin_user_page_range_dirty_lock().
 *
 * This is not the right interface to initially pin a folio.
 */
int folio_add_pins(struct folio *folio, unsigned int pins)
{
	VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));

	return try_grab_folio(folio, pins, FOLL_PIN);
}
EXPORT_SYMBOL_GPL(folio_add_pins);
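
/*
 * Illustrative usage sketch (not part of this file): pinning the folios that
 * back the first @size bytes of a memfd, e.g. before mapping them into a
 * device, and the matching release. The array bound and the device step are
 * hypothetical.
 */
#if 0
static long example_pin_memfd(struct file *memfd, size_t size,
			      struct folio **folios, unsigned int max_folios)
{
	pgoff_t offset_in_first;
	long nr;

	if (!size)
		return -EINVAL;

	/* @end is inclusive, hence size - 1. */
	nr = memfd_pin_folios(memfd, 0, size - 1, folios, max_folios,
			      &offset_in_first);
	if (nr < 0)
		return nr;

	/* ... hand the pinned folios to the device ... */

	unpin_folios(folios, nr);
	return nr;
}
#endif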