/*
 * Lockless get_user_pages_fast for x86
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>

#include <asm/pgtable.h>

static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
        return ACCESS_ONCE(*ptep);
#else
        /*
         * With get_user_pages_fast, we walk down the pagetables without taking
         * any locks. For this we would like to load the pointers atomically,
         * but that is not possible (without expensive cmpxchg8b) on PAE. What
         * we do have is the guarantee that a pte will only either go from not
         * present to present, or present to not present or both -- it will not
         * switch to a completely different present page without a TLB flush in
         * between; something that we are blocking by holding interrupts off.
         *
         * Setting ptes from not present to present goes:
         * ptep->pte_high = h;
         * smp_wmb();
         * ptep->pte_low = l;
         *
         * And present to not present goes:
         * ptep->pte_low = 0;
         * smp_wmb();
         * ptep->pte_high = 0;
         *
         * We must ensure here that the load of pte_low sees l iff pte_high
         * sees h. We load pte_high *after* loading pte_low, which ensures we
         * don't see an older value of pte_high. *Then* we recheck pte_low,
         * which ensures that we haven't picked up a changed pte high. We might
         * have got rubbish values from pte_low and pte_high, but we are
         * guaranteed that pte_low will not have the present bit set *unless*
         * it is 'l'. And get_user_pages_fast only operates on present ptes, so
         * we're safe.
         *
         * gup_get_pte should not be used or copied outside gup.c without being
         * very careful -- it does not atomically load the pte or anything that
         * is likely to be useful for you.
         */
        pte_t pte;

retry:
        pte.pte_low = ptep->pte_low;
        smp_rmb();
        pte.pte_high = ptep->pte_high;
        smp_rmb();
        if (unlikely(pte.pte_low != ptep->pte_low))
                goto retry;

        return pte;
#endif
}
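
/*
 * For reference, the PAE writer side that the retry loop above pairs with
 * follows the store ordering described in the comment. A minimal sketch
 * (the helper names below are illustrative; the in-tree equivalents are
 * the PAE set_pte/pte_clear implementations) would be:
 *
 *      static inline void example_set_pte(pte_t *ptep, pte_t pte)
 *      {
 *              ptep->pte_high = pte.pte_high;
 *              smp_wmb();
 *              ptep->pte_low = pte.pte_low;
 *      }
 *
 *      static inline void example_clear_pte(pte_t *ptep)
 *      {
 *              ptep->pte_low = 0;
 *              smp_wmb();
 *              ptep->pte_high = 0;
 *      }
 *
 * Because pte_low is written last when a pte becomes present and first
 * when it is torn down, a reader that loads pte_low, then pte_high, then
 * rechecks pte_low can only observe a present pte_low together with its
 * matching pte_high.
 */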

/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t *ptep;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;

        ptep = pte_offset_map(&pmd, addr);
        do {
                pte_t pte = gup_get_pte(ptep);
                struct page *page;

                if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
                        pte_unmap(ptep);
                        return 0;
                }
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
                get_page(page);
                SetPageReferenced(page);
                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);
        pte_unmap(ptep - 1);

        return 1;
}

static inline void get_head_page_multiple(struct page *page, int nr)
{
        VM_BUG_ON(page != compound_head(page));
        VM_BUG_ON(page_count(page) == 0);
        atomic_add(nr, &page->_count);
        SetPageReferenced(page);
}

static inline void get_huge_page_tail(struct page *page)
{
        /*
         * __split_huge_page_refcount() cannot run
         * from under us.
         */
        VM_BUG_ON(atomic_read(&page->_count) < 0);
        atomic_inc(&page->_count);
}

static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t pte = *(pte_t *)&pmd;
        struct page *head, *page;
        int refs;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_flags(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                if (PageTail(page))
                        get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);
        get_head_page_multiple(head, refs);

        return 1;
}

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset(&pud, addr);
        do {
                pmd_t pmd = *pmdp;

                next = pmd_addr_end(addr, end);
                /*
                 * The pmd_trans_splitting() check below explains why
                 * pmdp_splitting_flush has to flush the tlb, to stop
                 * this gup-fast code from running while we set the
                 * splitting bit in the pmd. Returning zero will take
                 * the slow path that will call wait_split_huge_page()
                 * if the pmd is still in splitting state. gup-fast
                 * can't because it has irqs disabled and
                 * wait_split_huge_page() would never return as the
                 * tlb flush IPI wouldn't run.
                 */
                if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
                                return 0;
                } else {
                        if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                                return 0;
                }
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t pte = *(pte_t *)&pud;
        struct page *head, *page;
        int refs;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_flags(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);
        get_head_page_multiple(head, refs);

        return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset(&pgd, addr);
        do {
                pud_t pud = *pudp;

                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (unlikely(pud_large(pud))) {
                        if (!gup_huge_pud(pud, addr, next, write, pages, nr))
                                return 0;
                } else {
                        if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                                return 0;
                }
        } while (pudp++, addr = next, addr != end);

        return 1;
}
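
/*
 * Worked example of the huge page indexing used by gup_huge_pmd() and
 * gup_huge_pud() above (illustrative numbers, assuming 4kB pages and a
 * 2MB PMD-sized huge page): for addr = 0x00612000 inside a huge page
 * mapped at 0x00600000,
 *
 *      addr & ~PMD_MASK                 = 0x12000
 *      (addr & ~PMD_MASK) >> PAGE_SHIFT = 18
 *
 * so the loop starts at head + 18, records one struct page pointer per
 * PAGE_SIZE step until it reaches 'end', and finally adds 'refs' to the
 * head page's count in one go via get_head_page_multiple() rather than
 * calling get_page() on every small page.
 */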

/*
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
 * back to the regular GUP.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                          struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        unsigned long flags;
        pgd_t *pgdp;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;
        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                        (void __user *)start, len)))
                return 0;

        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency
         * needs some instrumenting to determine the common sizes used by
         * important workloads (eg. DB2), and whether limiting the batch size
         * will decrease performance.
         *
         * It seems like we're in the clear for the moment. Direct-IO is
         * the main guy that batches up lots of get_user_pages, and even
         * they are limited to 64-at-a-time which is not so many.
         */
        /*
         * This doesn't prevent pagetable teardown, but does prevent
         * the pagetables and pages from being freed on x86.
         *
         * So long as we atomically load page table pointers versus teardown
         * (which we do on x86, with the above PAE exception), we can follow the
         * address down to the page and take a ref on it.
         */
        local_irq_save(flags);
        pgdp = pgd_offset(mm, addr);
        do {
                pgd_t pgd = *pgdp;

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        break;
                if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                        break;
        } while (pgdp++, addr = next, addr != end);
        local_irq_restore(flags);

        return nr;
}
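
/*
 * Minimal caller sketch for __get_user_pages_fast() (hypothetical code,
 * not an in-tree user): since it never sleeps and never falls back to
 * the regular GUP, the caller must cope with a short (or zero) pin count
 * and must drop every reference it did get with put_page():
 *
 *      got = __get_user_pages_fast(uaddr, nr_pages, 1, pages);
 *      if (got != nr_pages) {
 *              while (got--)
 *                      put_page(pages[got]);
 *              return -EFAULT;
 *      }
 *
 * A caller that is allowed to sleep would typically retry the missing
 * pages with get_user_pages_fast() or get_user_pages() instead of
 * failing outright.
 */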

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @write:      whether pages will be written to
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        pgd_t *pgdp;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;

        end = start + len;
        if (end < start)
                goto slow_irqon;

#ifdef CONFIG_X86_64
        if (end >> __VIRTUAL_MASK_SHIFT)
                goto slow_irqon;
#endif

        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency
         * needs some instrumenting to determine the common sizes used by
         * important workloads (eg. DB2), and whether limiting the batch size
         * will decrease performance.
         *
         * It seems like we're in the clear for the moment. Direct-IO is
         * the main guy that batches up lots of get_user_pages, and even
         * they are limited to 64-at-a-time which is not so many.
         */
        /*
         * This doesn't prevent pagetable teardown, but does prevent
         * the pagetables and pages from being freed on x86.
         *
         * So long as we atomically load page table pointers versus teardown
         * (which we do on x86, with the above PAE exception), we can follow the
         * address down to the page and take a ref on it.
         */
        local_irq_disable();
        pgdp = pgd_offset(mm, addr);
        do {
                pgd_t pgd = *pgdp;

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        goto slow;
                if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                        goto slow;
        } while (pgdp++, addr = next, addr != end);
        local_irq_enable();

        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
        return nr;

        {
                int ret;

slow:
                local_irq_enable();
slow_irqon:
                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                        (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /* Have to be a bit careful with return values */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }

                return ret;
        }
}
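
/*
 * Typical usage, for reference (hypothetical caller, not part of this
 * file): pin the pages backing a user buffer, operate on them, then
 * release every page that was actually pinned.
 *
 *      nr = get_user_pages_fast(start, nr_pages, 1, pages);
 *      if (nr < 0)
 *              return nr;
 *
 *      ... read or write the pinned pages, e.g. via kmap() ...
 *
 *      for (i = 0; i < nr; i++)
 *              put_page(pages[i]);
 *      if (nr < nr_pages)
 *              ... handle the partially pinned buffer ...
 *
 * Note that a positive return may still be shorter than nr_pages, so the
 * cleanup loop must run over 'nr', not over the requested count.
 */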