Path: sys/compat/linuxkpi/common/src/linux_page.c
/*-
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2016 Matthew Macy ([email protected])
 * Copyright (c) 2017 Mellanox Technologies, Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/memrange.h>

#include <machine/bus.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>

#include <vm/uma.h>
#include <vm/uma_int.h>

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/io.h>
#include <linux/io-mapping.h>

#ifdef __i386__
DEFINE_IDR(mtrr_idr);
static MALLOC_DEFINE(M_LKMTRR, "idr", "Linux MTRR compat");
extern int pat_works;
#endif

void
si_meminfo(struct sysinfo *si)
{
	si->totalram = physmem;
	si->freeram = vm_free_count();
	si->totalhigh = 0;
	si->freehigh = 0;
	si->mem_unit = PAGE_SIZE;
}

void *
linux_page_address(const struct page *page)
{

	if (page->object != kernel_object) {
		return (PMAP_HAS_DMAP ?
		    ((void *)(uintptr_t)PHYS_TO_DMAP(page_to_phys(page))) :
		    NULL);
	}
	return ((void *)(uintptr_t)(VM_MIN_KERNEL_ADDRESS +
	    IDX_TO_OFF(page->pindex)));
}

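/*
 * Illustrative sketch, not part of the original file: a LinuxKPI consumer
 * typically pairs alloc_page() with page_address(), which resolves through
 * the DMAP branch above on direct-map platforms and may return NULL
 * otherwise.  The flags below are only an example:
 *
 *	struct page *p = alloc_page(GFP_KERNEL | __GFP_ZERO);
 *	void *va = (p != NULL) ? page_address(p) : NULL;
 *	if (va != NULL)
 *		memset(va, 0, PAGE_SIZE);
 *	if (p != NULL)
 *		__free_page(p);
 */
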
struct page *
linux_alloc_pages(gfp_t flags, unsigned int order)
{
	struct page *page;

	if (PMAP_HAS_DMAP) {
		unsigned long npages = 1UL << order;
		int req = VM_ALLOC_WIRED;

		if ((flags & M_ZERO) != 0)
			req |= VM_ALLOC_ZERO;

		if (order == 0 && (flags & GFP_DMA32) == 0) {
			page = vm_page_alloc_noobj(req);
			if (page == NULL)
				return (NULL);
		} else {
			vm_paddr_t pmax = (flags & GFP_DMA32) ?
			    BUS_SPACE_MAXADDR_32BIT : BUS_SPACE_MAXADDR;

			if ((flags & __GFP_NORETRY) != 0)
				req |= VM_ALLOC_NORECLAIM;

		retry:
			page = vm_page_alloc_noobj_contig(req, npages, 0, pmax,
			    PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
			if (page == NULL) {
				if ((flags & (M_WAITOK | __GFP_NORETRY)) ==
				    M_WAITOK) {
					int err = vm_page_reclaim_contig(req,
					    npages, 0, pmax, PAGE_SIZE, 0);
					if (err == ENOMEM)
						vm_wait(NULL);
					else if (err != 0)
						return (NULL);
					flags &= ~M_WAITOK;
					goto retry;
				}
				return (NULL);
			}
		}
	} else {
		vm_offset_t vaddr;

		vaddr = linux_alloc_kmem(flags, order);
		if (vaddr == 0)
			return (NULL);

		page = virt_to_page((void *)vaddr);

		KASSERT(vaddr == (vm_offset_t)page_address(page),
		    ("Page address mismatch"));
	}

	return (page);
}

static void
_linux_free_kmem(vm_offset_t addr, unsigned int order)
{
	size_t size = ((size_t)PAGE_SIZE) << order;

	kmem_free((void *)addr, size);
}

void
linux_free_pages(struct page *page, unsigned int order)
{
	if (PMAP_HAS_DMAP) {
		unsigned long npages = 1UL << order;
		unsigned long x;

		for (x = 0; x != npages; x++) {
			vm_page_t pgo = page + x;

			/*
			 * The "free page" function is used in several
			 * contexts.
			 *
			 * Some pages are allocated by `linux_alloc_pages()`
			 * above, but not all of them are. For instance in the
			 * DRM drivers, some pages come from
			 * `shmem_read_mapping_page_gfp()`.
			 *
			 * That's why we need to check if the page is managed
			 * or not here.
			 */
			if ((pgo->oflags & VPO_UNMANAGED) == 0) {
				vm_page_unwire(pgo, PQ_ACTIVE);
			} else {
				if (vm_page_unwire_noq(pgo))
					vm_page_free(pgo);
			}
		}
	} else {
		vm_offset_t vaddr;

		vaddr = (vm_offset_t)page_address(page);

		_linux_free_kmem(vaddr, order);
	}
}

void
linux_release_pages(release_pages_arg arg, int nr)
{
	int i;

	CTASSERT(offsetof(struct folio, page) == 0);

	for (i = 0; i < nr; i++)
		__free_page(arg.pages[i]);
}

vm_offset_t
linux_alloc_kmem(gfp_t flags, unsigned int order)
{
	size_t size = ((size_t)PAGE_SIZE) << order;
	void *addr;

	addr = kmem_alloc_contig(size, flags & GFP_NATIVE_MASK, 0,
	    ((flags & GFP_DMA32) == 0) ? -1UL : BUS_SPACE_MAXADDR_32BIT,
	    PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);

	return ((vm_offset_t)addr);
}

void
linux_free_kmem(vm_offset_t addr, unsigned int order)
{
	KASSERT((addr & ~PAGE_MASK) == 0,
	    ("%s: addr %p is not page aligned", __func__, (void *)addr));

	if (addr >= VM_MIN_KERNEL_ADDRESS && addr < VM_MAX_KERNEL_ADDRESS) {
		_linux_free_kmem(addr, order);
	} else {
		vm_page_t page;

		page = PHYS_TO_VM_PAGE(DMAP_TO_PHYS(addr));
		linux_free_pages(page, order);
	}
}

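/*
 * Illustrative sketch, not part of the original file: linux_alloc_kmem()
 * and linux_free_kmem() hand out page-aligned, physically contiguous KVA
 * and must be called with matching order arguments, e.g. for a two-page
 * buffer:
 *
 *	vm_offset_t va = linux_alloc_kmem(GFP_KERNEL, 1);
 *	if (va != 0) {
 *		... use 2 * PAGE_SIZE bytes at va ...
 *		linux_free_kmem(va, 1);
 *	}
 */
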
static int
linux_get_user_pages_internal(vm_map_t map, unsigned long start, int nr_pages,
    int write, struct page **pages)
{
	vm_prot_t prot;
	size_t len;
	int count;

	prot = write ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ;
	len = ptoa((vm_offset_t)nr_pages);
	count = vm_fault_quick_hold_pages(map, start, len, prot, pages, nr_pages);
	return (count == -1 ? -EFAULT : nr_pages);
}

int
__get_user_pages_fast(unsigned long start, int nr_pages, int write,
    struct page **pages)
{
	vm_map_t map;
	vm_page_t *mp;
	vm_offset_t va;
	vm_offset_t end;
	vm_prot_t prot;
	int count;

	if (nr_pages == 0 || in_interrupt())
		return (0);

	MPASS(pages != NULL);
	map = &curthread->td_proc->p_vmspace->vm_map;
	end = start + ptoa((vm_offset_t)nr_pages);
	if (!vm_map_range_valid(map, start, end))
		return (-EINVAL);
	prot = write ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ;
	for (count = 0, mp = pages, va = start; va < end;
	    mp++, va += PAGE_SIZE, count++) {
		*mp = pmap_extract_and_hold(map->pmap, va, prot);
		if (*mp == NULL)
			break;

		if ((prot & VM_PROT_WRITE) != 0 &&
		    (*mp)->dirty != VM_PAGE_BITS_ALL) {
			/*
			 * Explicitly dirty the physical page. Otherwise, the
			 * caller's changes may go unnoticed because they are
			 * performed through an unmanaged mapping or by a DMA
			 * operation.
			 *
			 * The object lock is not held here.
			 * See vm_page_clear_dirty_mask().
			 */
			vm_page_dirty(*mp);
		}
	}
	return (count);
}

long
get_user_pages_remote(struct task_struct *task, struct mm_struct *mm,
    unsigned long start, unsigned long nr_pages, unsigned int gup_flags,
    struct page **pages, struct vm_area_struct **vmas)
{
	vm_map_t map;

	map = &task->task_thread->td_proc->p_vmspace->vm_map;
	return (linux_get_user_pages_internal(map, start, nr_pages,
	    !!(gup_flags & FOLL_WRITE), pages));
}

long
lkpi_get_user_pages(unsigned long start, unsigned long nr_pages,
    unsigned int gup_flags, struct page **pages)
{
	vm_map_t map;

	map = &curthread->td_proc->p_vmspace->vm_map;
	return (linux_get_user_pages_internal(map, start, nr_pages,
	    !!(gup_flags & FOLL_WRITE), pages));
}

int
is_vmalloc_addr(const void *addr)
{
	return (vtoslab((vm_offset_t)addr & ~UMA_SLAB_MASK) != NULL);
}

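/*
 * Illustrative sketch, not part of the original file: pages wired by the
 * GUP functions above must be released by the caller once the access is
 * done.  Here "uaddr" stands for a page-aligned user-space address:
 *
 *	struct page *pp[1];
 *
 *	if (lkpi_get_user_pages(uaddr, 1, FOLL_WRITE, pp) == 1) {
 *		... read or write the page ...
 *		put_page(pp[0]);
 *	}
 */
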
vm_fault_t
lkpi_vmf_insert_pfn_prot_locked(struct vm_area_struct *vma, unsigned long addr,
    unsigned long pfn, pgprot_t prot)
{
	struct pctrie_iter pages;
	vm_object_t vm_obj = vma->vm_obj;
	vm_object_t tmp_obj;
	vm_page_t page;
	vm_pindex_t pindex;

	VM_OBJECT_ASSERT_WLOCKED(vm_obj);
	vm_page_iter_init(&pages, vm_obj);
	pindex = OFF_TO_IDX(addr - vma->vm_start);
	if (vma->vm_pfn_count == 0)
		vma->vm_pfn_first = pindex;
	MPASS(pindex <= OFF_TO_IDX(vma->vm_end));

retry:
	page = vm_page_grab_iter(vm_obj, pindex, VM_ALLOC_NOCREAT, &pages);
	if (page == NULL) {
		page = PHYS_TO_VM_PAGE(IDX_TO_OFF(pfn));
		if (!vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) {
			pctrie_iter_reset(&pages);
			goto retry;
		}
		if (page->object != NULL) {
			tmp_obj = page->object;
			vm_page_xunbusy(page);
			VM_OBJECT_WUNLOCK(vm_obj);
			VM_OBJECT_WLOCK(tmp_obj);
			if (page->object == tmp_obj &&
			    vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) {
				KASSERT(page->object == tmp_obj,
				    ("page has changed identity"));
				KASSERT((page->oflags & VPO_UNMANAGED) == 0,
				    ("page does not belong to shmem"));
				vm_pager_page_unswapped(page);
				if (pmap_page_is_mapped(page)) {
					vm_page_xunbusy(page);
					VM_OBJECT_WUNLOCK(tmp_obj);
					printf("%s: page rename failed: page "
					    "is mapped\n", __func__);
					VM_OBJECT_WLOCK(vm_obj);
					return (VM_FAULT_NOPAGE);
				}
				vm_page_remove(page);
			}
			VM_OBJECT_WUNLOCK(tmp_obj);
			pctrie_iter_reset(&pages);
			VM_OBJECT_WLOCK(vm_obj);
			goto retry;
		}
		if (vm_page_iter_insert(page, vm_obj, pindex, &pages) != 0) {
			vm_page_xunbusy(page);
			return (VM_FAULT_OOM);
		}
		vm_page_valid(page);
	}
	pmap_page_set_memattr(page, pgprot2cachemode(prot));
	vma->vm_pfn_count++;

	return (VM_FAULT_NOPAGE);
}

int
lkpi_remap_pfn_range(struct vm_area_struct *vma, unsigned long start_addr,
    unsigned long start_pfn, unsigned long size, pgprot_t prot)
{
	vm_object_t vm_obj;
	unsigned long addr, pfn;
	int err = 0;

	vm_obj = vma->vm_obj;

	VM_OBJECT_WLOCK(vm_obj);
	for (addr = start_addr, pfn = start_pfn;
	    addr < start_addr + size;
	    addr += PAGE_SIZE) {
		vm_fault_t ret;
retry:
		ret = lkpi_vmf_insert_pfn_prot_locked(vma, addr, pfn, prot);

		if ((ret & VM_FAULT_OOM) != 0) {
			VM_OBJECT_WUNLOCK(vm_obj);
			vm_wait(NULL);
			VM_OBJECT_WLOCK(vm_obj);
			goto retry;
		}

		if ((ret & VM_FAULT_ERROR) != 0) {
			err = -EFAULT;
			break;
		}

		pfn++;
	}
	VM_OBJECT_WUNLOCK(vm_obj);

	if (unlikely(err)) {
		zap_vma_ptes(vma, start_addr,
		    (pfn - start_pfn) << PAGE_SHIFT);
		return (err);
	}

	return (0);
}

int
lkpi_io_mapping_map_user(struct io_mapping *iomap,
    struct vm_area_struct *vma, unsigned long addr,
    unsigned long pfn, unsigned long size)
{
	pgprot_t prot;
	int ret;

	prot = cachemode2protval(iomap->attr);
	ret = lkpi_remap_pfn_range(vma, addr, pfn, size, prot);

	return (ret);
}

/*
 * Although the FreeBSD version of unmap_mapping_range() has parameter
 * semantics and types compatible with the Linux version, the values passed
 * in differ:
 * @obj must match the vm_private_data field of the vm_area_struct returned
 *      by the mmap file operation handler, see linux_file_mmap_single().
 * @holelen must match the size of the area to be unmapped.
 */
void
lkpi_unmap_mapping_range(void *obj, loff_t const holebegin __unused,
    loff_t const holelen __unused, int even_cows __unused)
{
	vm_object_t devobj;

	devobj = cdev_pager_lookup(obj);
	if (devobj != NULL) {
		cdev_mgtdev_pager_free_pages(devobj);
		vm_object_deallocate(devobj);
	}
}

int
lkpi_arch_phys_wc_add(unsigned long base, unsigned long size)
{
#ifdef __i386__
	struct mem_range_desc *mrdesc;
	int error, id, act;

	/* If PAT is available, do nothing */
	if (pat_works)
		return (0);

	mrdesc = malloc(sizeof(*mrdesc), M_LKMTRR, M_WAITOK);
	mrdesc->mr_base = base;
	mrdesc->mr_len = size;
	mrdesc->mr_flags = MDF_WRITECOMBINE;
	strlcpy(mrdesc->mr_owner, "drm", sizeof(mrdesc->mr_owner));
	act = MEMRANGE_SET_UPDATE;
	error = mem_range_attr_set(mrdesc, &act);
	if (error == 0) {
		error = idr_get_new(&mtrr_idr, mrdesc, &id);
		MPASS(idr_find(&mtrr_idr, id) == mrdesc);
		if (error != 0) {
			act = MEMRANGE_SET_REMOVE;
			mem_range_attr_set(mrdesc, &act);
		}
	}
	if (error != 0) {
		free(mrdesc, M_LKMTRR);
		pr_warn(
		    "Failed to add WC MTRR for [%p-%p]: %d; "
		    "performance may suffer\n",
		    (void *)base, (void *)(base + size - 1), error);
	} else
		pr_warn("Successfully added WC MTRR for [%p-%p]\n",
		    (void *)base, (void *)(base + size - 1));

	return (error != 0 ? -error : id + __MTRR_ID_BASE);
#else
	return (0);
#endif
}

void
lkpi_arch_phys_wc_del(int reg)
{
#ifdef __i386__
	struct mem_range_desc *mrdesc;
	int act;

	/* Check if arch_phys_wc_add() failed. */
	if (reg < __MTRR_ID_BASE)
		return;

	mrdesc = idr_find(&mtrr_idr, reg - __MTRR_ID_BASE);
	MPASS(mrdesc != NULL);
	idr_remove(&mtrr_idr, reg - __MTRR_ID_BASE);
	act = MEMRANGE_SET_REMOVE;
	mem_range_attr_set(mrdesc, &act);
	free(mrdesc, M_LKMTRR);
#endif
}

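/*
 * Illustrative sketch, not part of the original file: a driver pairs the
 * two functions above around the lifetime of a write-combined region,
 * keeping the returned cookie for teardown.  "bar_base" and "bar_size" are
 * placeholders for a device aperture:
 *
 *	int wc_cookie = lkpi_arch_phys_wc_add(bar_base, bar_size);
 *	...
 *	lkpi_arch_phys_wc_del(wc_cookie);
 */
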
/*
 * This is a highly simplified version of the Linux page_frag_cache.
 * We only support up to a single page as fragment size and we will
 * always return a full page. This may be wasteful for small objects,
 * but the only known consumer (mt76) asks for either a half page or a
 * full page. If this were to become a problem we could implement
 * a more elaborate version.
 */
void *
linuxkpi_page_frag_alloc(struct page_frag_cache *pfc,
    size_t fragsz, gfp_t gfp)
{
	vm_page_t pages;

	if (fragsz == 0)
		return (NULL);

	KASSERT(fragsz <= PAGE_SIZE, ("%s: fragsz %zu > PAGE_SIZE not yet "
	    "supported", __func__, fragsz));

	pages = alloc_pages(gfp, flsl(howmany(fragsz, PAGE_SIZE) - 1));
	if (pages == NULL)
		return (NULL);
	pfc->va = linux_page_address(pages);

	/* Passed in as "count" to __page_frag_cache_drain(). Unused by us. */
	pfc->pagecnt_bias = 0;

	return (pfc->va);
}

void
linuxkpi_page_frag_free(void *addr)
{
	vm_page_t page;

	page = virt_to_page(addr);
	linux_free_pages(page, 0);
}

void
linuxkpi__page_frag_cache_drain(struct page *page, size_t count __unused)
{

	linux_free_pages(page, 0);
}

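/*
 * Illustrative sketch, not part of the original file: since the simplified
 * cache above always hands out a freshly allocated page, a consumer would
 * use it roughly as follows (fragment size and gfp flags are examples):
 *
 *	struct page_frag_cache pfc = { .va = NULL };
 *	void *buf = linuxkpi_page_frag_alloc(&pfc, 2048, GFP_ATOMIC);
 *	if (buf != NULL)
 *		linuxkpi_page_frag_free(buf);
 */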