Path: blob/master/arch/powerpc/mm/book3s64/radix_pgtable.c
52067 views
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/page_table_check.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/memory.h>
#include <linux/kfence.h>

#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
#include <asm/set_memory.h>
#include <asm/kfence.h>

#include <trace/events/thp.h>

#include <mm/mmu_decl.h>

unsigned int mmu_base_pid;

/*
 * Boot-time page table page allocator. Falls back to anywhere in memblock
 * unless a [region_start, region_end) placement hint is given; panics on
 * failure since the kernel cannot continue without page tables.
 */
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

/*
 * When allocating pud or pmd pointers, we allocate a complete page
 * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
 * is to ensure that the page obtained from the memblock allocator
 * can be completely used as page table page and can be freed
 * correctly when the page table entries are removed.
 */
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	if (p4d_none(*p4dp)) {
		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		p4d_populate(&init_mm, p4dp, pudp);
	}
	pudp = pud_offset(p4dp, ea);
	if (map_page_size == PUD_SIZE) {
		/* 1G leaf mapping: install the PTE directly at PUD level */
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
					   region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		/* 2M leaf mapping: install the PTE directly at PMD level */
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	/* order the PTE store before any subsequent hardware walk */
	asm volatile("ptesync": : :"memory");
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	/* before slab is up, fall back to the memblock-backed path above */
	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
					     nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	pudp = pud_alloc(&init_mm, p4dp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
/*
 * Clear the given PTE permission bits on every page of [start, end),
 * walking down to whatever leaf level each address is mapped at, then
 * flush the TLB for the whole range.
 */
static void radix__change_memory_range(unsigned long start, unsigned long end,
				       unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		p4dp = p4d_offset(pgdp, idx);
		pudp = pud_alloc(&init_mm, p4dp, idx);
		if (!pudp)
			continue;
		if (pud_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_leaf(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

/*
 * Make kernel text/rodata read-only, including any interrupt-vector
 * text pages that sit below _stext on relocatable kernels.
 */
void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__end_rodata;

	radix__change_memory_range(start, end, _PAGE_WRITE);

	for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
		end = start + PAGE_SIZE;
		if (overlaps_interrupt_vector_text(start, end))
			radix__change_memory_range(start, end, _PAGE_WRITE);
		else
			break;
	}
}

/* Strip execute permission from the __init section after boot. */
void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

/* Log one contiguous run of same-sized linear-map pages. */
static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

/*
 * Return the next physical address at which mapping permissions must
 * change (rodata/text boundaries), so large pages never straddle them.
 */
static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	unsigned long stext_phys;

	stext_phys = __pa_symbol(_stext);

	// Relocatable kernel running at non-zero real address
	if (stext_phys != 0) {
		// The end of interrupts code at zero is a rodata boundary
		unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
		if (addr < end_intr)
			return end_intr;

		// Start of relocated kernel text is a rodata boundary
		if (addr < stext_phys)
			return stext_phys;
	}

	if (addr < __pa_symbol(__srwx_boundary))
		return __pa_symbol(__srwx_boundary);
#endif
	return end;
}

/*
 * Map the physical range [start, end) into the linear mapping, using the
 * largest page size (1G/2M/base) allowed by alignment, permission
 * boundaries, and mapping_sz_limit. Text ranges get PAGE_KERNEL_X.
 */
static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid, pgprot_t _prot,
					     unsigned long mapping_sz_limit)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;
	unsigned long max_mapping_size = memory_block_size;

	if (mapping_sz_limit < max_mapping_size)
		max_mapping_size = mapping_sz_limit;

	if (debug_pagealloc_enabled())
		max_mapping_size = PAGE_SIZE;

	start = ALIGN(start, PAGE_SIZE);
	end = ALIGN_DOWN(end, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		if (gap > max_mapping_size)
			gap = max_mapping_size;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = _prot;
			exec = false;
		}

		/* page size or permissions changed: report the finished run */
		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}

#ifdef CONFIG_KFENCE
static __init phys_addr_t alloc_kfence_pool(void)
{
	phys_addr_t kfence_pool;

	/*
	 * TODO: Support to enable KFENCE after bootup depends on the ability to
	 *       split page table mappings. As such support is not currently
	 *       implemented for radix pagetables, support enabling KFENCE
	 *       only at system startup for now.
	 *
	 *       After support for splitting mappings is available on radix,
	 *       alloc_kfence_pool() & map_kfence_pool() can be dropped and
	 *       mapping for __kfence_pool memory can be
	 *       split during arch_kfence_init_pool().
	 */
	if (!kfence_early_init)
		goto no_kfence;

	kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
	if (!kfence_pool)
		goto no_kfence;

	/* keep the pool out of the linear map until map_kfence_pool() */
	memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
	return kfence_pool;

no_kfence:
	disable_kfence();
	return 0;
}

static __init void map_kfence_pool(phys_addr_t kfence_pool)
{
	if (!kfence_pool)
		return;

	/* PAGE_SIZE limit: KFENCE needs page-granular protection toggling */
	if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
				    -1, PAGE_KERNEL, PAGE_SIZE))
		goto err;

	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
	__kfence_pool = __va(kfence_pool);
	return;

err:
	memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
	disable_kfence();
}
#else
static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
#endif

/*
 * Boot-time radix setup: create the linear mapping, allocate the process
 * table, and reserve the guard PID for init_mm.
 */
static void __init radix_init_pgtable(void)
{
	phys_addr_t kfence_pool;
	unsigned long rts_field;
	phys_addr_t start, end;
	u64 i;

	/* We don't support slb for radix */
	slb_set_size(0);

	kfence_pool = alloc_kfence_pool();

	/*
	 * Create the linear mapping
	 */
	for_each_mem_range(i, &start, &end) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */

		if (end >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(start, end,
						-1, PAGE_KERNEL, ~0UL));
	}

	map_kfence_pool(kfence_pool);

	if (!cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
		/*
		 * Older versions of KVM on these machines prefer if the
		 * guest only uses the low 19 PID bits.
		 */
		mmu_pid_bits = 19;
	}
	mmu_base_pid = 1;

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

/* Install partition-table entry 0 for the host (bare-metal/HV only). */
static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0, dw1;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
	mmu_partition_table_set_entry(0, dw0, dw1, false);

	pr_info("Initializing Radix MMU\n");
}

/* Translate a page-size shift from the device tree into an MMU_PAGE_* index. */
static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

/*
 * Flat device-tree scanner: read the radix AP (actual page size)
 * encodings from the first "cpu" node into mmu_psize_defs[].
 */
static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bit is AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
		def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (!rc) {
		/*
		 * No page size details found in device tree.
		 * Let's assume we have page 4k and 64k support
		 */
		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
		mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_4K);

		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
		mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_64K);
	}
	return;
}

/*
 * Boot-CPU MMU bring-up: program the radix geometry globals, build the
 * page tables, set up the partition table (HV) or register with the
 * hypervisor (LPAR), then switch to the guard PID.
 */
void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64S_HASH_MMU
#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	radix_init_pgtable();

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		set_ptcr_when_no_uv(__pa(partition_tb) |
				    (PATB_SIZE_SHIFT - 12));
	}

	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
}

/* Called during kexec sequence with MMU off */
notrace void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		set_ptcr_when_no_uv(0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

#ifdef CONFIG_MEMORY_HOTPLUG
/* Free a PTE page and clear its PMD entry, but only if fully empty. */
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

/* Free a PMD page and clear its PUD entry, but only if fully empty. */
static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

/* Free a PUD page and clear its P4D entry, but only if fully empty. */
static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	pud_free(&init_mm, pud_start);
	p4d_clear(p4d);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* True if no vmemmap user remains in the PMD-sized block containing addr. */
static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);

	return !vmemmap_populated(start, PMD_SIZE);
}

/* True if no vmemmap user remains in the page containing addr. */
static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);

	return !vmemmap_populated(start, PAGE_SIZE);
}
#endif

/*
 * Return vmemmap backing pages to the right owner: the device altmap,
 * memblock (reserved pages), or the page allocator.
 */
static void __meminit free_vmemmap_pages(struct page *page,
					 struct vmem_altmap *altmap,
					 int order)
{
	unsigned int nr_pages = 1 << order;

	if (altmap) {
		unsigned long alt_start, alt_end;
		unsigned long base_pfn = page_to_pfn(page);

		/*
		 * with 2M vmemmap mapping we can have things set up
		 * such that even though altmap is specified we never
		 * used altmap.
		 */
		alt_start = altmap->base_pfn;
		alt_end = altmap->base_pfn + altmap->reserve + altmap->free;

		if (base_pfn >= alt_start && base_pfn < alt_end) {
			vmem_altmap_free(altmap, nr_pages);
			return;
		}
	}

	if (PageReserved(page)) {
		/* allocated from memblock */
		while (nr_pages--)
			free_reserved_page(page++);
	} else
		__free_pages(page, order);
}

/*
 * Tear down PTE-level mappings in [addr, end). For vmemmap (!direct)
 * also free the backing pages; for the direct map only account pages.
 */
static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
			if (!direct)
				free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
			pages++;
		}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
		else if (!direct && vmemmap_page_is_unused(addr, next)) {
			free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
		}
#endif
	}
	if (direct)
		update_page_count(mmu_virtual_psize, -pages);
}

/* Tear down PMD-level mappings (2M leaves or lower PTE tables) in [addr, end). */
static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_leaf(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				if (!direct)
					free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
				pages++;
			}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
			else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
				free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
			}
#endif
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct, altmap);
		free_pte_table(pte_base, pmd);
	}
	if (direct)
		update_page_count(MMU_PAGE_2M, -pages);
}

/* Tear down PUD-level mappings (1G leaves or lower PMD tables) in [addr, end). */
static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_leaf(*pud)) {
			if (!IS_ALIGNED(addr, PUD_SIZE) ||
			    !IS_ALIGNED(next, PUD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}
			pte_clear(&init_mm, addr, (pte_t *)pud);
			pages++;
			continue;
		}

		pmd_base = pud_pgtable(*pud);
		remove_pmd_table(pmd_base, addr, next, direct, altmap);
		free_pmd_table(pmd_base, pud);
	}
	if (direct)
		update_page_count(MMU_PAGE_1G, -pages);
}

/*
 * Remove all kernel mappings in [start, end), freeing emptied page
 * table pages, then flush the TLB for the range.
 */
static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct,
		 struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;
	p4d_t *p4d;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		if (!p4d_present(*p4d))
			continue;

		if (p4d_leaf(*p4d)) {
			if (!IS_ALIGNED(addr, P4D_SIZE) ||
			    !IS_ALIGNED(next, P4D_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}

			pte_clear(&init_mm, addr, (pte_t *)pgd);
			continue;
		}

		pud_base = p4d_pgtable(*p4d);
		remove_pud_table(pud_base, addr, next, direct, altmap);
		free_pud_table(pud_base, p4d);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start,
					    unsigned long end, int nid,
					    pgprot_t prot)
{
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(__pa(start), __pa(end),
				       nid, prot, ~0UL);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end, true, NULL);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
/* DAX vmemmap deduplication is only supported under the radix MMU. */
bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
	if (radix_enabled())
		return __vmemmap_can_optimize(altmap, pgmap);

	return false;
}
#endif

int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	int large = pmd_leaf(*pmdp);

	if (large)
		vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);

	return large;
}

/* Install a 2M leaf vmemmap mapping at the PMD level. */
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pte_t entry;
	pte_t *ptep = pmdp_ptep(pmdp);

	VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
	set_pte_at(&init_mm, addr, ptep, entry);
	asm volatile("ptesync": : :"memory");

	vmemmap_verify(ptep, node, addr, next);
}

/*
 * Populate one base-page vmemmap PTE. If 'reuse' is given, map that
 * existing page again (compound-page tail dedup) instead of allocating.
 */
static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
						     int node,
						     struct vmem_altmap *altmap,
						     struct page *reuse)
{
	pte_t *pte = pte_offset_kernel(pmdp, addr);

	if (pte_none(*pte)) {
		pte_t entry;
		void *p;

		if (!reuse) {
			/*
			 * make sure we don't create altmap mappings
			 * covering things outside the device.
			 */
			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
				altmap = NULL;

			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p && altmap)
				p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
			if (!p)
				return NULL;
			pr_debug("PAGE_SIZE vmemmap mapping\n");
		} else {
			/*
			 * When a PTE/PMD entry is freed from the init_mm
			 * there's a free_pages() call to this page allocated
			 * above. Thus this get_page() is paired with the
			 * put_page_testzero() on the freeing path.
			 * This can only be called by certain ZONE_DEVICE path,
			 * and through vmemmap_populate_compound_pages() when
			 * slab is available.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
			pr_debug("Tail page reuse vmemmap mapping\n");
		}

		VM_BUG_ON(!PAGE_ALIGNED(addr));
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
		asm volatile("ptesync": : :"memory");
	}
	return pte;
}

static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
				       unsigned long address)
{
	pud_t *pud;

	/* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
	if (unlikely(p4d_none(*p4dp))) {
		if (unlikely(!slab_is_available())) {
			pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			p4d_populate(&init_mm, p4dp, pud);
			/* go to the pud_offset */
		} else
			return pud_alloc(&init_mm, p4dp, address);
	}
	return pud_offset(p4dp, address);
}

static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
				       unsigned long address)
{
	pmd_t *pmd;

	/* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
	if (unlikely(pud_none(*pudp))) {
		if (unlikely(!slab_is_available())) {
			pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pud_populate(&init_mm, pudp, pmd);
		} else
			return pmd_alloc(&init_mm, pudp, address);
	}
	return pmd_offset(pudp, address);
}

static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
				       unsigned long address)
{
	pte_t *pte;

	/* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
	if (unlikely(pmd_none(*pmdp))) {
		if (unlikely(!slab_is_available())) {
			pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pmd_populate(&init_mm, pmdp, pte);
		} else
			return pte_alloc_kernel(pmdp, address);
	}
	return pte_offset_kernel(pmdp, address);
}



int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
				      struct vmem_altmap *altmap)
{
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/*
	 * If altmap is present, make sure we align the start vmemmap addr
	 * to PAGE_SIZE so that we calculate the correct start_pfn in
	 * altmap boundary check to decide whether we should use altmap or
	 * RAM based backing memory allocation. Also the address needs to be
	 * aligned for set_pte operation. If the start addr is already
	 * PMD_SIZE aligned and within the altmap boundary then we will
	 * try to use a pmd size altmap mapping else we go for page size
	 * mapping.
	 *
	 * If altmap is not present, align the vmemmap addr to PMD_SIZE and
	 * always allocate a PMD size page for vmemmap backing.
	 *
	 */

	if (altmap)
		start = ALIGN_DOWN(start, PAGE_SIZE);
	else
		start = ALIGN_DOWN(start, PMD_SIZE);

	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_none(READ_ONCE(*pmd))) {
			void *p;

			/*
			 * keep it simple by checking addr PMD_SIZE alignment
			 * and verifying the device boundary condition.
			 * For us to use a pmd mapping, both addr and pfn should
			 * be aligned. We skip if addr is not aligned and for
			 * pfn we hope we have extra area in the altmap that
			 * can help to find an aligned block. This can result
			 * in altmap block allocation failures, in which case
			 * we fallback to RAM for vmemmap allocation.
			 */
			if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
				       altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
				/*
				 * make sure we don't create altmap mappings
				 * covering things outside the device.
				 */
				goto base_mapping;
			}

			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
			if (p) {
				vmemmap_set_pmd(pmd, p, node, addr, next);
				pr_debug("PMD_SIZE vmemmap mapping\n");
				continue;
			} else {
				/*
				 * A vmemmap block allocation can fail due to
				 * alignment requirements and we trying to align
				 * things aggressively there by running out of
				 * space. Try base mapping on failure.
				 */
				goto base_mapping;
			}
		} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
			/*
			 * If a huge mapping exist due to early call to
			 * vmemmap_populate, let's try to use that.
			 */
			continue;
		}
base_mapping:
		/*
		 * Not able to allocate higher order memory to back memmap
		 * or we found a pointer to pte page. Allocate base page
		 * size vmemmap
		 */
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;

		pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
		if (!pte)
			return -ENOMEM;

		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
		next = addr + PAGE_SIZE;
	}
	return 0;
}

/*
 * Populate a single base-page vmemmap mapping at addr, allocating any
 * missing intermediate tables. Returns NULL if a hugepage already
 * covers addr (caller must then map without deduplication) or on OOM.
 */
static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
							 struct vmem_altmap *altmap,
							 struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	p4d = p4d_offset(pgd, addr);
	pud = vmemmap_pud_alloc(p4d, node, addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby request.
		 * Force our mapping to page size without deduplication
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, addr);
	if (!pte)
		return NULL;
	radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte;
}

/*
 * Find (creating if needed) the PTE of the shared tail vmemmap page for
 * the compound page containing addr; tail mappings are deduplicated
 * against this second vmemmap page.
 */
static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
						    unsigned long pfn_offset, int node)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long map_addr;

	/* the second vmemmap page which we use for duplication */
	map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
	pgd = pgd_offset_k(map_addr);
	p4d = p4d_offset(pgd, map_addr);
	pud = vmemmap_pud_alloc(p4d, node, map_addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, map_addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby request.
		 * Force our mapping to page size without deduplication
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, map_addr);
	if (!pte)
		return NULL;
	/*
	 * Check if there exist a mapping to the left
	 */
	if (pte_none(*pte)) {
		/*
		 * Populate the head page vmemmap page.
		 * It can fall in different pmd, hence
		 * vmemmap_populate_address()
		 */
		pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
		if (!pte)
			return NULL;
		/*
		 * Populate the tail pages vmemmap page
		 */
		pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
		if (!pte)
			return NULL;
		vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
		return pte;
	}
	return pte;
}

int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
					      unsigned long start,
					      unsigned long end, int node,
					      struct dev_pagemap *pgmap)
{
	/*
	 * we want to map things as base page size mapping so that
	 * we can save space in vmemmap. We could have huge mapping
	 * covering out both edges.
	 */
	unsigned long addr;
	unsigned long addr_pfn = start_pfn;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (addr = start; addr < end; addr = next) {

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_leaf(READ_ONCE(*pmd))) {
			/* existing huge mapping. Skip the range */
			addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
			next = pmd_addr_end(addr, end);
			continue;
		}
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;
		if (!pte_none(*pte)) {
			/*
			 * This could be because we already have a compound
			 * page whose VMEMMAP_RESERVE_NR pages were mapped and
			 * this request fall in those pages.
			 */
			addr_pfn += 1;
			next = addr + PAGE_SIZE;
			continue;
		} else {
			unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
			unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
			pte_t *tail_page_pte;

			/*
			 * if the address is aligned to huge page size it is the
			 * head mapping.
			 */
			if (pfn_offset == 0) {
				/* Populate the head page vmemmap page */
				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;
				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

				/*
				 * Populate the tail pages vmemmap page
				 * It can fall in different pmd, hence
				 * vmemmap_populate_address()
				 */
				pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;

				addr_pfn += 2;
				next = addr + 2 * PAGE_SIZE;
				continue;
			}
			/*
			 * get the 2nd mapping details
			 * Also create it if that doesn't exist
			 */
			tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
			if (!tail_page_pte) {

				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;
				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

				addr_pfn += 1;
				next = addr + PAGE_SIZE;
				continue;
			}

			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
			if (!pte)
				return -ENOMEM;
			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

			addr_pfn += 1;
			next = addr + PAGE_SIZE;
			continue;
		}
	}
	return 0;
}


#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit 
radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	/* Tear down the vmemmap page tables for one mapped chunk. */
	remove_pagetable(start, start + page_size, true, NULL);
}

void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
			       struct vmem_altmap *altmap)
{
	remove_pagetable(start, end, false, altmap);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * Atomically clear @clr and set @set bits in a huge-page PMD entry.
 * Returns the old pmd value. Caller must hold the pmd lock.
 */
unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pmd_t *pmdp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
	trace_hugepage_update_pmd(addr, old, clr, set);

	return old;
}

/*
 * Atomically clear @clr and set @set bits in a huge-page PUD entry.
 * Returns the old pud value. Caller must hold the pud lock.
 */
unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pud_t *pudp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pud_trans_huge(*pudp));
	assert_spin_locked(pud_lockptr(mm, pudp));
#endif

	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
	trace_hugepage_update_pud(addr, old, clr, set);

	return old;
}

/*
 * Clear a (normal, non-huge) pmd and flush the TLB, returning the old
 * value, so khugepaged can collapse the range into a huge page.
 */
pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)

{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	page_table_check_pmd_clear(vma->vm_mm, address, pmd);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	/* Zero out the list_head area we borrowed (first two pte slots). */
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

/* Clear a huge pmd entry, returning the old value. */
pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	return old_pmd;
}

/* Clear a huge pud entry, returning the old value. */
pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pud_t *pudp)
{
	pud_t old_pud;
	unsigned long old;

	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
	old_pud = __pud(old);
	return old_pud;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Relax access permissions (dirty/accessed/RW/EXEC) on an existing PTE,
 * taking care of the POWER9 nest-MMU erratum described below.
 */
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
					      _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * On POWER9, the NMMU is not able to
relax PTE access permissions
	 * for a translation with a TLB. The PTE must be invalidated, TLB
	 * flushed before the new PTE is installed.
	 *
	 * This only needs to be done for radix, because hash translation does
	 * flush when updating the linux pte (and we don't support NMMU
	 * accelerators on HPT on POWER9 anyway XXX: do we?).
	 *
	 * POWER10 (and P9P) NMMU does behave as per ISA.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
	    atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		/* Invalidate first, flush, then install the relaxed PTE. */
		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space (modulo the POWER9 nest
		 * MMU issue above) because the MMU will reload the PTE after
		 * taking an access fault, as defined by the architecture. See
		 * "Setting a Reference or Change Bit or Upgrading Access
		 * Authority (PTE Subject to Atomic Hardware Updates)" in
		 * Power ISA Version 3.1B.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

/*
 * Commit phase of ptep_modify_prot_start/commit: install the new PTE,
 * flushing the TLB first on pre-P10 when a nest MMU may hold a stale
 * translation and permissions are being relaxed.
 */
void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * POWER9 NMMU must flush the TLB after clearing the PTE before
	 * installing a PTE with more relaxed access permissions, see
	 * radix__ptep_set_access_flags.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
	    is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at_unchecked(mm, addr, ptep, pte);
}

/*
 * Install a huge (leaf) mapping at the pud level for ioremap.
 * Returns 1 on success, 0 when radix is not enabled.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pud;
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at_unchecked(&init_mm, 0 /* radix unused */, ptep, new_pud);

	return 1;
}

/* Clear a huge pud mapping; returns 1 if one was present, 0 otherwise. */
int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

/*
 * Free the pmd table (and any pte tables it references) hanging off a
 * kernel pud, after clearing the pud and flushing the TLB. Always
 * returns 1 (success).
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	int i;

	pmd = pud_pgtable(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;
			pte = (pte_t *)pmd_page_vaddr(pmd[i]);

			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);

	return 1;
}

/*
 * Install a huge (leaf) mapping at the pmd level for ioremap.
 * Returns 1 on success, 0 when radix is not enabled.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pmd;
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at_unchecked(&init_mm, 0 /* radix unused */, ptep, new_pmd);

	return 1;
}

/* Clear a huge pmd mapping; returns 1 if one was present, 0 otherwise. */
int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

/*
 * Free the pte table hanging off a kernel pmd, after clearing the pmd
 * and flushing the TLB. Always returns 1 (success).
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);

	return 1;
}