Path: sys/compat/linuxkpi/common/src/linux_page.c
/*-
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2016 Matthew Macy ([email protected])
 * Copyright (c) 2017 Mellanox Technologies, Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/memrange.h>

#include <machine/bus.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>

#include <vm/uma.h>
#include <vm/uma_int.h>

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/io.h>
#include <linux/io-mapping.h>

#ifdef __i386__
DEFINE_IDR(mtrr_idr);
static MALLOC_DEFINE(M_LKMTRR, "idr", "Linux MTRR compat");
extern int pat_works;
#endif

void
si_meminfo(struct sysinfo *si)
{
	si->totalram = physmem;
	si->freeram = vm_free_count();
	si->totalhigh = 0;
	si->freehigh = 0;
	si->mem_unit = PAGE_SIZE;
}

void *
linux_page_address(const struct page *page)
{

	if (page->object != kernel_object) {
		return (PMAP_HAS_DMAP ?
		    ((void *)(uintptr_t)PHYS_TO_DMAP(page_to_phys(page))) :
		    NULL);
	}
	return ((void *)(uintptr_t)(VM_MIN_KERNEL_ADDRESS +
	    IDX_TO_OFF(page->pindex)));
}

struct page *
linux_alloc_pages(gfp_t flags, unsigned int order)
{
	struct page *page;

	if (PMAP_HAS_DMAP) {
		unsigned long npages = 1UL << order;
		int req = VM_ALLOC_WIRED;

		if ((flags & M_ZERO) != 0)
			req |= VM_ALLOC_ZERO;

		if (order == 0 && (flags & GFP_DMA32) == 0) {
			page = vm_page_alloc_noobj(req);
			if (page == NULL)
				return (NULL);
		} else {
			vm_paddr_t pmax = (flags & GFP_DMA32) ?
			    BUS_SPACE_MAXADDR_32BIT : BUS_SPACE_MAXADDR;

			if ((flags & __GFP_NORETRY) != 0)
				req |= VM_ALLOC_NORECLAIM;

		retry:
			page = vm_page_alloc_noobj_contig(req, npages, 0, pmax,
			    PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
			if (page == NULL) {
				if ((flags & (M_WAITOK | __GFP_NORETRY)) ==
				    M_WAITOK) {
					int err = vm_page_reclaim_contig(req,
					    npages, 0, pmax, PAGE_SIZE, 0);
					if (err == ENOMEM)
						vm_wait(NULL);
					else if (err != 0)
						return (NULL);
					flags &= ~M_WAITOK;
					goto retry;
				}
				return (NULL);
			}
		}
	} else {
		vm_offset_t vaddr;

		vaddr = linux_alloc_kmem(flags, order);
		if (vaddr == 0)
			return (NULL);

		page = virt_to_page((void *)vaddr);

		KASSERT(vaddr == (vm_offset_t)page_address(page),
		    ("Page address mismatch"));
	}

	return (page);
}

static void
_linux_free_kmem(vm_offset_t addr, unsigned int order)
{
	size_t size = ((size_t)PAGE_SIZE) << order;

	kmem_free((void *)addr, size);
}

void
linux_free_pages(struct page *page, unsigned int order)
{
	if (PMAP_HAS_DMAP) {
		unsigned long npages = 1UL << order;
		unsigned long x;

		for (x = 0; x != npages; x++) {
			vm_page_t pgo = page + x;

			/*
			 * The "free page" function is used in several
			 * contexts.
			 *
			 * Some pages are allocated by `linux_alloc_pages()`
			 * above, but not all of them are.  For instance, in
			 * the DRM drivers, some pages come from
			 * `shmem_read_mapping_page_gfp()`.
			 *
			 * That's why we need to check if the page is managed
			 * or not here.
			 */
			if ((pgo->oflags & VPO_UNMANAGED) == 0) {
				vm_page_unwire(pgo, PQ_ACTIVE);
			} else {
				if (vm_page_unwire_noq(pgo))
					vm_page_free(pgo);
			}
		}
	} else {
		vm_offset_t vaddr;

		vaddr = (vm_offset_t)page_address(page);

		_linux_free_kmem(vaddr, order);
	}
}

void
linux_release_pages(release_pages_arg arg, int nr)
{
	int i;

	CTASSERT(offsetof(struct folio, page) == 0);

	for (i = 0; i < nr; i++)
		__free_page(arg.pages[i]);
}

vm_offset_t
linux_alloc_kmem(gfp_t flags, unsigned int order)
{
	size_t size = ((size_t)PAGE_SIZE) << order;
	void *addr;

	addr = kmem_alloc_contig(size, flags & GFP_NATIVE_MASK, 0,
	    ((flags & GFP_DMA32) == 0) ? -1UL : BUS_SPACE_MAXADDR_32BIT,
	    PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);

	return ((vm_offset_t)addr);
}

void
linux_free_kmem(vm_offset_t addr, unsigned int order)
{
	KASSERT((addr & PAGE_MASK) == 0,
	    ("%s: addr %p is not page aligned", __func__, (void *)addr));

	if (addr >= VM_MIN_KERNEL_ADDRESS && addr < VM_MAX_KERNEL_ADDRESS) {
		_linux_free_kmem(addr, order);
	} else {
		vm_page_t page;

		page = PHYS_TO_VM_PAGE(DMAP_TO_PHYS(addr));
		linux_free_pages(page, order);
	}
}

static int
linux_get_user_pages_internal(vm_map_t map, unsigned long start, int nr_pages,
    int write, struct page **pages)
{
	vm_prot_t prot;
	size_t len;
	int count;

	prot = write ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ;
	len = ptoa((vm_offset_t)nr_pages);
	count = vm_fault_quick_hold_pages(map, start, len, prot, pages,
	    nr_pages);
	return (count == -1 ? -EFAULT : nr_pages);
}

int
__get_user_pages_fast(unsigned long start, int nr_pages, int write,
    struct page **pages)
{
	vm_map_t map;
	vm_page_t *mp;
	vm_offset_t va;
	vm_offset_t end;
	vm_prot_t prot;
	int count;

	if (nr_pages == 0 || in_interrupt())
		return (0);

	MPASS(pages != NULL);
	map = &curthread->td_proc->p_vmspace->vm_map;
	end = start + ptoa((vm_offset_t)nr_pages);
	if (!vm_map_range_valid(map, start, end))
		return (-EINVAL);
	prot = write ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ;
	for (count = 0, mp = pages, va = start; va < end;
	    mp++, va += PAGE_SIZE, count++) {
		*mp = pmap_extract_and_hold(map->pmap, va, prot);
		if (*mp == NULL)
			break;

		if ((prot & VM_PROT_WRITE) != 0 &&
		    (*mp)->dirty != VM_PAGE_BITS_ALL) {
			/*
			 * Explicitly dirty the physical page.  Otherwise, the
			 * caller's changes may go unnoticed because they are
			 * performed through an unmanaged mapping or by a DMA
			 * operation.
			 *
			 * The object lock is not held here.
			 * See vm_page_clear_dirty_mask().
			 */
			vm_page_dirty(*mp);
		}
	}
	return (count);
}

long
get_user_pages_remote(struct task_struct *task, struct mm_struct *mm,
    unsigned long start, unsigned long nr_pages, unsigned int gup_flags,
    struct page **pages, struct vm_area_struct **vmas)
{
	vm_map_t map;

	map = &task->task_thread->td_proc->p_vmspace->vm_map;
	return (linux_get_user_pages_internal(map, start, nr_pages,
	    !!(gup_flags & FOLL_WRITE), pages));
}

long
lkpi_get_user_pages(unsigned long start, unsigned long nr_pages,
    unsigned int gup_flags, struct page **pages)
{
	vm_map_t map;

	map = &curthread->td_proc->p_vmspace->vm_map;
	return (linux_get_user_pages_internal(map, start, nr_pages,
	    !!(gup_flags & FOLL_WRITE), pages));
}

/*
 * Hash of vmmap addresses.  This is infrequently accessed and does not
 * need to be particularly large.  This is done because we must store the
 * caller's idea of the map size to properly unmap.
 */
struct vmmap {
	LIST_ENTRY(vmmap) vm_next;
	void *vm_addr;
	unsigned long vm_size;
};

struct vmmaphd {
	struct vmmap *lh_first;
};
#define	VMMAP_HASH_SIZE	64
#define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
#define	VM_HASH(addr)	((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK
static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
static struct mtx vmmaplock;

int
is_vmalloc_addr(const void *addr)
{
	struct vmmap *vmmap;

	mtx_lock(&vmmaplock);
	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
		if (addr == vmmap->vm_addr)
			break;
	mtx_unlock(&vmmaplock);
	if (vmmap != NULL)
		return (1);

	return (vtoslab((vm_offset_t)addr & ~UMA_SLAB_MASK) != NULL);
}

static void
vmmap_add(void *addr, unsigned long size)
{
	struct vmmap *vmmap;

	vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);
	mtx_lock(&vmmaplock);
	vmmap->vm_size = size;
	vmmap->vm_addr = addr;
	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);
	mtx_unlock(&vmmaplock);
}

static struct vmmap *
vmmap_remove(void *addr)
{
	struct vmmap *vmmap;

	mtx_lock(&vmmaplock);
	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
		if (vmmap->vm_addr == addr)
			break;
	if (vmmap)
		LIST_REMOVE(vmmap, vm_next);
	mtx_unlock(&vmmaplock);

	return (vmmap);
}

#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv)
void *
_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
{
	void *addr;

	addr = pmap_mapdev_attr(phys_addr, size, attr);
	if (addr == NULL)
		return (NULL);
	vmmap_add(addr, size);

	return (addr);
}
#endif

void
iounmap(void *addr)
{
	struct vmmap *vmmap;

	vmmap = vmmap_remove(addr);
	if (vmmap == NULL)
		return;
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv)
	pmap_unmapdev(addr, vmmap->vm_size);
#endif
	kfree(vmmap);
}

void *
vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
{
	vm_offset_t off;
	size_t size;

	size = count * PAGE_SIZE;
	off = kva_alloc(size);
	if (off == 0)
		return (NULL);
	vmmap_add((void *)off, size);
	pmap_qenter(off, pages, count);

	return ((void *)off);
}

#define	VMAP_MAX_CHUNK_SIZE	(65536U / sizeof(struct vm_page))	/* KMEM_ZMAX */

void *
linuxkpi_vmap_pfn(unsigned long *pfns, unsigned int count, int prot)
{
	vm_page_t m, *ma, fma;
	vm_offset_t off, coff;
	vm_paddr_t pa;
	vm_memattr_t attr;
	size_t size;
	unsigned int i, c, chunk;

	size = ptoa(count);
	off = kva_alloc(size);
	if (off == 0)
		return (NULL);
	vmmap_add((void *)off, size);

	chunk = MIN(count, VMAP_MAX_CHUNK_SIZE);
	attr = pgprot2cachemode(prot);
	ma = malloc(chunk * sizeof(vm_page_t), M_TEMP, M_WAITOK | M_ZERO);
	fma = NULL;
	c = 0;
	coff = off;
	for (i = 0; i < count; i++) {
		pa = IDX_TO_OFF(pfns[i]);
		m = PHYS_TO_VM_PAGE(pa);
		if (m == NULL) {
			if (fma == NULL)
				fma = malloc(chunk * sizeof(struct vm_page),
				    M_TEMP, M_WAITOK | M_ZERO);
			m = fma + c;
			vm_page_initfake(m, pa, attr);
		} else {
			pmap_page_set_memattr(m, attr);
		}
		ma[c] = m;
		c++;
		if (c == chunk || i == count - 1) {
			pmap_qenter(coff, ma, c);
			if (i == count - 1)
				break;
			coff += ptoa(c);
			c = 0;
			memset(ma, 0, chunk * sizeof(vm_page_t));
			if (fma != NULL)
				memset(fma, 0, chunk * sizeof(struct vm_page));
		}
	}
	free(fma, M_TEMP);
	free(ma, M_TEMP);

	return ((void *)off);
}

void
vunmap(void *addr)
{
	struct vmmap *vmmap;

	vmmap = vmmap_remove(addr);
	if (vmmap == NULL)
		return;
	pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
	kva_free((vm_offset_t)addr, vmmap->vm_size);
	kfree(vmmap);
}

vm_fault_t
lkpi_vmf_insert_pfn_prot_locked(struct vm_area_struct *vma, unsigned long addr,
    unsigned long pfn, pgprot_t prot)
{
	struct pctrie_iter pages;
	vm_object_t vm_obj = vma->vm_obj;
	vm_object_t tmp_obj;
	vm_page_t page;
	vm_pindex_t pindex;

	VM_OBJECT_ASSERT_WLOCKED(vm_obj);
	vm_page_iter_init(&pages, vm_obj);
	pindex = OFF_TO_IDX(addr - vma->vm_start);
	if (vma->vm_pfn_count == 0)
		vma->vm_pfn_first = pindex;
	MPASS(pindex <= OFF_TO_IDX(vma->vm_end));

retry:
	page = vm_page_grab_iter(vm_obj, pindex, VM_ALLOC_NOCREAT, &pages);
	if (page == NULL) {
		page = PHYS_TO_VM_PAGE(IDX_TO_OFF(pfn));
		if (page == NULL)
			return (VM_FAULT_SIGBUS);
		if (!vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) {
			pctrie_iter_reset(&pages);
			goto retry;
		}
		if (page->object != NULL) {
			tmp_obj = page->object;
			vm_page_xunbusy(page);
			VM_OBJECT_WUNLOCK(vm_obj);
			VM_OBJECT_WLOCK(tmp_obj);
			if (page->object == tmp_obj &&
			    vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) {
				KASSERT(page->object == tmp_obj,
				    ("page has changed identity"));
				KASSERT((page->oflags & VPO_UNMANAGED) == 0,
				    ("page does not belong to shmem"));
				vm_pager_page_unswapped(page);
				if (pmap_page_is_mapped(page)) {
					vm_page_xunbusy(page);
					VM_OBJECT_WUNLOCK(tmp_obj);
					printf("%s: page rename failed: page "
					    "is mapped\n", __func__);
					VM_OBJECT_WLOCK(vm_obj);
					return (VM_FAULT_NOPAGE);
				}
				vm_page_remove(page);
			}
			VM_OBJECT_WUNLOCK(tmp_obj);
			pctrie_iter_reset(&pages);
			VM_OBJECT_WLOCK(vm_obj);
			goto retry;
		}
		if (vm_page_iter_insert(page, vm_obj, pindex, &pages) != 0) {
			vm_page_xunbusy(page);
			return (VM_FAULT_OOM);
		}
		vm_page_valid(page);
	}
	pmap_page_set_memattr(page, pgprot2cachemode(prot));
	vma->vm_pfn_count++;

	return (VM_FAULT_NOPAGE);
}

int
lkpi_remap_pfn_range(struct vm_area_struct *vma, unsigned long start_addr,
    unsigned long start_pfn, unsigned long size, pgprot_t prot)
{
	vm_object_t vm_obj;
	unsigned long addr, pfn;
	int err = 0;

	vm_obj = vma->vm_obj;

	VM_OBJECT_WLOCK(vm_obj);
	for (addr = start_addr, pfn = start_pfn;
	    addr < start_addr + size;
	    addr += PAGE_SIZE) {
		vm_fault_t ret;
retry:
		ret = lkpi_vmf_insert_pfn_prot_locked(vma, addr, pfn, prot);

		if ((ret & VM_FAULT_OOM) != 0) {
			VM_OBJECT_WUNLOCK(vm_obj);
			vm_wait(NULL);
			VM_OBJECT_WLOCK(vm_obj);
			goto retry;
		}

		if ((ret & VM_FAULT_ERROR) != 0) {
			err = -EFAULT;
			break;
		}

		pfn++;
	}
	VM_OBJECT_WUNLOCK(vm_obj);

	if (unlikely(err)) {
		zap_vma_ptes(vma, start_addr,
		    (pfn - start_pfn) << PAGE_SHIFT);
		return (err);
	}

	return (0);
}

int
lkpi_io_mapping_map_user(struct io_mapping *iomap,
    struct vm_area_struct *vma, unsigned long addr,
    unsigned long pfn, unsigned long size)
{
	pgprot_t prot;
	int ret;

	prot = cachemode2protval(iomap->attr);
	ret = lkpi_remap_pfn_range(vma, addr, pfn, size, prot);

	return (ret);
}

/*
 * Although the FreeBSD version of unmap_mapping_range() has semantics and
 * parameter types compatible with the Linux version, the values passed in
 * are different:
 * @obj should match the vm_private_data field of the vm_area_struct
 * returned by the mmap file operation handler; see the
 * linux_file_mmap_single() sources.
 * @holelen should match the size of the area to be unmapped.
 */
void
lkpi_unmap_mapping_range(void *obj, loff_t const holebegin __unused,
    loff_t const holelen __unused, int even_cows __unused)
{
	vm_object_t devobj;

	devobj = cdev_pager_lookup(obj);
	if (devobj != NULL) {
		cdev_mgtdev_pager_free_pages(devobj);
		vm_object_deallocate(devobj);
	}
}

int
lkpi_arch_phys_wc_add(unsigned long base, unsigned long size)
{
#ifdef __i386__
	struct mem_range_desc *mrdesc;
	int error, id, act;

	/* If PAT is available, do nothing */
	if (pat_works)
		return (0);

	mrdesc = malloc(sizeof(*mrdesc), M_LKMTRR, M_WAITOK);
	mrdesc->mr_base = base;
	mrdesc->mr_len = size;
	mrdesc->mr_flags = MDF_WRITECOMBINE;
	strlcpy(mrdesc->mr_owner, "drm", sizeof(mrdesc->mr_owner));
	act = MEMRANGE_SET_UPDATE;
	error = mem_range_attr_set(mrdesc, &act);
	if (error == 0) {
		error = idr_get_new(&mtrr_idr, mrdesc, &id);
		MPASS(idr_find(&mtrr_idr, id) == mrdesc);
		if (error != 0) {
			act = MEMRANGE_SET_REMOVE;
			mem_range_attr_set(mrdesc, &act);
		}
	}
	if (error != 0) {
		free(mrdesc, M_LKMTRR);
		pr_warn(
		    "Failed to add WC MTRR for [%p-%p]: %d; "
		    "performance may suffer\n",
		    (void *)base, (void *)(base + size - 1), error);
	} else
		pr_warn("Successfully added WC MTRR for [%p-%p]\n",
		    (void *)base, (void *)(base + size - 1));

	return (error != 0 ? -error : id + __MTRR_ID_BASE);
#else
	return (0);
#endif
}

void
lkpi_arch_phys_wc_del(int reg)
{
#ifdef __i386__
	struct mem_range_desc *mrdesc;
	int act;

	/* Check if arch_phys_wc_add() failed. */
	if (reg < __MTRR_ID_BASE)
		return;

	mrdesc = idr_find(&mtrr_idr, reg - __MTRR_ID_BASE);
	MPASS(mrdesc != NULL);
	idr_remove(&mtrr_idr, reg - __MTRR_ID_BASE);
	act = MEMRANGE_SET_REMOVE;
	mem_range_attr_set(mrdesc, &act);
	free(mrdesc, M_LKMTRR);
#endif
}

/*
 * This is a highly simplified version of the Linux page_frag_cache.
 * We only support up to a single page as the fragment size and we will
 * always return a full page.
 * This may be wasteful on small objects, but the only known consumer
 * (mt76) asks for either a half page or a full page.  If this were to
 * become a problem we can implement a more elaborate version.
 */
void *
linuxkpi_page_frag_alloc(struct page_frag_cache *pfc,
    size_t fragsz, gfp_t gfp)
{
	vm_page_t pages;

	if (fragsz == 0)
		return (NULL);

	KASSERT(fragsz <= PAGE_SIZE, ("%s: fragsz %zu > PAGE_SIZE not yet "
	    "supported", __func__, fragsz));

	pages = alloc_pages(gfp, flsl(howmany(fragsz, PAGE_SIZE) - 1));
	if (pages == NULL)
		return (NULL);
	pfc->va = linux_page_address(pages);

	/* Passed in as "count" to __page_frag_cache_drain().  Unused by us. */
	pfc->pagecnt_bias = 0;

	return (pfc->va);
}

void
linuxkpi_page_frag_free(void *addr)
{
	vm_page_t page;

	page = virt_to_page(addr);
	linux_free_pages(page, 0);
}

void
linuxkpi__page_frag_cache_drain(struct page *page, size_t count __unused)
{

	linux_free_pages(page, 0);
}

static void
lkpi_page_init(void *arg)
{
	int i;

	mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
	for (i = 0; i < VMMAP_HASH_SIZE; i++)
		LIST_INIT(&vmmaphead[i]);
}
SYSINIT(lkpi_page, SI_SUB_DRIVERS, SI_ORDER_SECOND, lkpi_page_init, NULL);

static void
lkpi_page_uninit(void *arg)
{
	mtx_destroy(&vmmaplock);
}
SYSUNINIT(lkpi_page, SI_SUB_DRIVERS, SI_ORDER_SECOND, lkpi_page_uninit, NULL);