Path: blob/master/arch/powerpc/mm/book3s64/radix_pgtable.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/memory.h>
#include <linux/kfence.h>

#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
#include <asm/set_memory.h>
#include <asm/kfence.h>

#include <trace/events/thp.h>

#include <mm/mmu_decl.h>

unsigned int mmu_base_pid;

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

/*
 * When allocating pud or pmd pointers, we allocate a complete page
 * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
 * is to ensure that the page obtained from the memblock allocator
 * can be completely used as page table page and can be freed
 * correctly when the page table entries are removed.
 */
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	if (p4d_none(*p4dp)) {
		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		p4d_populate(&init_mm, p4dp, pudp);
	}
	pudp = pud_offset(p4dp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
					   region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
						nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	pudp = pud_alloc(&init_mm, p4dp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
static void radix__change_memory_range(unsigned long start, unsigned long end,
				       unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		p4dp = p4d_offset(pgdp, idx);
		pudp = pud_alloc(&init_mm, p4dp, idx);
		if (!pudp)
			continue;
		if (pud_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_leaf(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__end_rodata;

	radix__change_memory_range(start, end, _PAGE_WRITE);

	for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
		end = start + PAGE_SIZE;
		if (overlaps_interrupt_vector_text(start, end))
			radix__change_memory_range(start, end, _PAGE_WRITE);
		else
			break;
	}
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	unsigned long stext_phys;

	stext_phys = __pa_symbol(_stext);

	// Relocatable kernel running at non-zero real address
	if (stext_phys != 0) {
		// The end of interrupts code at zero is a rodata boundary
		unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
		if (addr < end_intr)
			return end_intr;

		// Start of relocated kernel text is a rodata boundary
		if (addr < stext_phys)
			return stext_phys;
	}

	if (addr < __pa_symbol(__srwx_boundary))
		return __pa_symbol(__srwx_boundary);
#endif
	return end;
}

static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid, pgprot_t _prot,
					     unsigned long mapping_sz_limit)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;
	unsigned long max_mapping_size = memory_block_size;

	if (mapping_sz_limit < max_mapping_size)
		max_mapping_size = mapping_sz_limit;

	if (debug_pagealloc_enabled())
		max_mapping_size = PAGE_SIZE;

	start = ALIGN(start, PAGE_SIZE);
	end = ALIGN_DOWN(end, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		if (gap > max_mapping_size)
			gap = max_mapping_size;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = _prot;
			exec = false;
		}

		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}

#ifdef CONFIG_KFENCE
static __init phys_addr_t alloc_kfence_pool(void)
{
	phys_addr_t kfence_pool;

	/*
	 * TODO: Support for enabling KFENCE after bootup depends on the ability
	 *       to split page table mappings. As such support is not currently
	 *       implemented for radix pagetables, support enabling KFENCE
	 *       only at system startup for now.
	 *
	 *       After support for splitting mappings is available on radix,
	 *       alloc_kfence_pool() & map_kfence_pool() can be dropped and
	 *       the mapping for __kfence_pool memory can be
	 *       split during arch_kfence_init_pool().
	 */
	if (!kfence_early_init)
		goto no_kfence;

	kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
	if (!kfence_pool)
		goto no_kfence;

	memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
	return kfence_pool;

no_kfence:
	disable_kfence();
	return 0;
}

static __init void map_kfence_pool(phys_addr_t kfence_pool)
{
	if (!kfence_pool)
		return;

	if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
				    -1, PAGE_KERNEL, PAGE_SIZE))
		goto err;

	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
	__kfence_pool = __va(kfence_pool);
	return;

err:
	memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
	disable_kfence();
}
#else
static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
#endif

static void __init radix_init_pgtable(void)
{
	phys_addr_t kfence_pool;
	unsigned long rts_field;
	phys_addr_t start, end;
	u64 i;

	/* We don't support slb for radix */
	slb_set_size(0);

	kfence_pool = alloc_kfence_pool();

	/*
	 * Create the linear mapping
	 */
	for_each_mem_range(i, &start, &end) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */

		if (end >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(start, end,
						-1, PAGE_KERNEL, ~0UL));
	}

	map_kfence_pool(kfence_pool);

	if (!cpu_has_feature(CPU_FTR_HVMODE) &&
			cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
		/*
		 * Older versions of KVM on these machines prefer if the
		 * guest only uses the low 19 PID bits.
		 */
		mmu_pid_bits = 19;
	}
	mmu_base_pid = 1;

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0, dw1;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
	mmu_partition_table_set_entry(0, dw0, dw1, false);

	pr_info("Initializing Radix MMU\n");
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
		def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (!rc) {
		/*
		 * No page size details found in device tree.
		 * Let's assume we have page 4k and 64k support
		 */
		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
		mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_4K);

		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
		mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_64K);
	}
	return;
}

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64S_HASH_MMU
#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	radix_init_pgtable();

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		set_ptcr_when_no_uv(__pa(partition_tb) |
				    (PATB_SIZE_SHIFT - 12));
	}

	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
}

/* Called during kexec sequence with MMU off */
notrace void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		set_ptcr_when_no_uv(0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	pud_free(&init_mm, pud_start);
	p4d_clear(p4d);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);

	return !vmemmap_populated(start, PMD_SIZE);
}

static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);

	return !vmemmap_populated(start, PAGE_SIZE);
}
#endif

static void __meminit free_vmemmap_pages(struct page *page,
					 struct vmem_altmap *altmap,
					 int order)
{
	unsigned int nr_pages = 1 << order;

	if (altmap) {
		unsigned long alt_start, alt_end;
		unsigned long base_pfn = page_to_pfn(page);

		/*
		 * With 2M vmemmap mapping we can have things set up
		 * such that even though an altmap is specified we never
		 * used the altmap.
		 */
		alt_start = altmap->base_pfn;
		alt_end = altmap->base_pfn + altmap->reserve + altmap->free;

		if (base_pfn >= alt_start && base_pfn < alt_end) {
			vmem_altmap_free(altmap, nr_pages);
			return;
		}
	}

	if (PageReserved(page)) {
		/* allocated from memblock */
		while (nr_pages--)
			free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
}

static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
			if (!direct)
				free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
			pages++;
		}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
		else if (!direct && vmemmap_page_is_unused(addr, next)) {
			free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
		}
#endif
	}
	if (direct)
		update_page_count(mmu_virtual_psize, -pages);
}

static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_leaf(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				if (!direct)
					free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
				pages++;
			}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
			else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
				free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
			}
#endif
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct, altmap);
		free_pte_table(pte_base, pmd);
	}
	if (direct)
		update_page_count(MMU_PAGE_2M, -pages);
}

static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_leaf(*pud)) {
			if (!IS_ALIGNED(addr, PUD_SIZE) ||
			    !IS_ALIGNED(next, PUD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}
			pte_clear(&init_mm, addr, (pte_t *)pud);
			pages++;
			continue;
		}

		pmd_base = pud_pgtable(*pud);
		remove_pmd_table(pmd_base, addr, next, direct, altmap);
		free_pmd_table(pmd_base, pud);
	}
	if (direct)
		update_page_count(MMU_PAGE_1G, -pages);
}

static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct,
		 struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;
	p4d_t *p4d;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		if (!p4d_present(*p4d))
			continue;

		if (p4d_leaf(*p4d)) {
			if (!IS_ALIGNED(addr, P4D_SIZE) ||
			    !IS_ALIGNED(next, P4D_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}

			pte_clear(&init_mm, addr, (pte_t *)pgd);
			continue;
		}

		pud_base = p4d_pgtable(*p4d);
		remove_pud_table(pud_base, addr, next, direct, altmap);
		free_pud_table(pud_base, p4d);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start,
					    unsigned long end, int nid,
					    pgprot_t prot)
{
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(__pa(start), __pa(end),
				       nid, prot, ~0UL);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end, true, NULL);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
	if (radix_enabled())
		return __vmemmap_can_optimize(altmap, pgmap);

	return false;
}
#endif

int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	int large = pmd_leaf(*pmdp);

	if (large)
		vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);

	return large;
}

void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pte_t entry;
	pte_t *ptep = pmdp_ptep(pmdp);

	VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
	set_pte_at(&init_mm, addr, ptep, entry);
	asm volatile("ptesync": : :"memory");

	vmemmap_verify(ptep, node, addr, next);
}

static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
						     int node,
						     struct vmem_altmap *altmap,
						     struct page *reuse)
{
	pte_t *pte = pte_offset_kernel(pmdp, addr);

	if (pte_none(*pte)) {
		pte_t entry;
		void *p;

		if (!reuse) {
			/*
			 * make sure we don't create altmap mappings
			 * covering things outside the device.
			 */
			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
				altmap = NULL;

			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p && altmap)
				p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
			if (!p)
				return NULL;
			pr_debug("PAGE_SIZE vmemmap mapping\n");
		} else {
			/*
			 * When a PTE/PMD entry is freed from the init_mm
			 * there's a free_pages() call to this page allocated
			 * above. Thus this get_page() is paired with the
			 * put_page_testzero() on the freeing path.
			 * This can only be called by certain ZONE_DEVICE paths,
			 * and through vmemmap_populate_compound_pages() when
			 * slab is available.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
			pr_debug("Tail page reuse vmemmap mapping\n");
		}

		VM_BUG_ON(!PAGE_ALIGNED(addr));
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
		asm volatile("ptesync": : :"memory");
	}
	return pte;
}

static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
				       unsigned long address)
{
	pud_t *pud;

	/* All early vmemmap mappings are done at PAGE_SIZE to keep it simple */
	if (unlikely(p4d_none(*p4dp))) {
		if (unlikely(!slab_is_available())) {
			pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			p4d_populate(&init_mm, p4dp, pud);
			/* go to the pud_offset */
		} else
			return pud_alloc(&init_mm, p4dp, address);
	}
	return pud_offset(p4dp, address);
}

static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
				       unsigned long address)
{
	pmd_t *pmd;

	/* All early vmemmap mappings are done at PAGE_SIZE to keep it simple */
	if (unlikely(pud_none(*pudp))) {
		if (unlikely(!slab_is_available())) {
			pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pud_populate(&init_mm, pudp, pmd);
		} else
			return pmd_alloc(&init_mm, pudp, address);
	}
	return pmd_offset(pudp, address);
}

static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
				       unsigned long address)
{
	pte_t *pte;

	/* All early vmemmap mappings are done at PAGE_SIZE to keep it simple */
	if (unlikely(pmd_none(*pmdp))) {
		if (unlikely(!slab_is_available())) {
			pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pmd_populate(&init_mm, pmdp, pte);
		} else
			return pte_alloc_kernel(pmdp, address);
	}
	return pte_offset_kernel(pmdp, address);
}

int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
				      struct vmem_altmap *altmap)
{
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/*
	 * If altmap is present, make sure we align the start vmemmap addr
	 * to PAGE_SIZE so that we calculate the correct start_pfn in the
	 * altmap boundary check, which decides whether we should use the
	 * altmap or RAM-based backing memory allocation. The address also
	 * needs to be aligned for the set_pte operation. If the start addr
	 * is already PMD_SIZE aligned and within the altmap boundary, we
	 * try to use a PMD size altmap mapping, else we go for a page size
	 * mapping.
	 *
	 * If altmap is not present, align the vmemmap addr to PMD_SIZE and
	 * always allocate a PMD size page for vmemmap backing.
	 *
	 */

	if (altmap)
		start = ALIGN_DOWN(start, PAGE_SIZE);
	else
		start = ALIGN_DOWN(start, PMD_SIZE);

	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_none(READ_ONCE(*pmd))) {
			void *p;

			/*
			 * Keep it simple by checking addr PMD_SIZE alignment
			 * and verifying the device boundary condition.
			 * For us to use a pmd mapping, both addr and pfn should
			 * be aligned. We skip if addr is not aligned, and for
			 * pfn we hope we have extra area in the altmap that
			 * can help to find an aligned block. This can result
			 * in altmap block allocation failures, in which case
			 * we fall back to RAM for the vmemmap allocation.
			 */
			if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
				       altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
				/*
				 * make sure we don't create altmap mappings
				 * covering things outside the device.
				 */
				goto base_mapping;
			}

			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
			if (p) {
				vmemmap_set_pmd(pmd, p, node, addr, next);
				pr_debug("PMD_SIZE vmemmap mapping\n");
				continue;
			} else {
				/*
				 * A vmemmap block allocation can fail due to
				 * alignment requirements, since we try to
				 * align things aggressively and thereby run
				 * out of space. Try base mapping on failure.
				 */
				goto base_mapping;
			}
		} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
			/*
			 * If a huge mapping exists due to an early call to
			 * vmemmap_populate, let's try to use that.
			 */
			continue;
		}
base_mapping:
		/*
		 * Not able to allocate higher-order memory to back the memmap,
		 * or we found a pointer to a pte page. Allocate base page
		 * size vmemmap.
		 */
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;

		pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
		if (!pte)
			return -ENOMEM;

		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
		next = addr + PAGE_SIZE;
	}
	return 0;
}

static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
							 struct vmem_altmap *altmap,
							 struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	p4d = p4d_offset(pgd, addr);
	pud = vmemmap_pud_alloc(p4d, node, addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby request.
		 * Force our mapping to page size without deduplication
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, addr);
	if (!pte)
		return NULL;
	radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte;
}

static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
						    unsigned long pfn_offset, int node)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long map_addr;

	/* the second vmemmap page which we use for duplication */
	map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
	pgd = pgd_offset_k(map_addr);
	p4d = p4d_offset(pgd, map_addr);
	pud = vmemmap_pud_alloc(p4d, node, map_addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, map_addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby request.
		 * Force our mapping to page size without deduplication
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, map_addr);
	if (!pte)
		return NULL;
	/*
	 * Check if there exists a mapping to the left
	 */
	if (pte_none(*pte)) {
		/*
		 * Populate the head page vmemmap page.
		 * It can fall in a different pmd, hence
		 * vmemmap_populate_address()
		 */
		pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
		if (!pte)
			return NULL;
		/*
		 * Populate the tail pages vmemmap page
		 */
		pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
		if (!pte)
			return NULL;
		vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
		return pte;
	}
	return pte;
}

int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
					      unsigned long start,
					      unsigned long end, int node,
					      struct dev_pagemap *pgmap)
{
	/*
	 * We want to map things as base page size mappings so that
	 * we can save space in the vmemmap. We could have huge mappings
	 * covering both edges.
	 */
	unsigned long addr;
	unsigned long addr_pfn = start_pfn;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (addr = start; addr < end; addr = next) {

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_leaf(READ_ONCE(*pmd))) {
			/* existing huge mapping. Skip the range */
			addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
			next = pmd_addr_end(addr, end);
			continue;
		}
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;
		if (!pte_none(*pte)) {
			/*
			 * This could be because we already have a compound
			 * page whose VMEMMAP_RESERVE_NR pages were mapped and
			 * this request falls in those pages.
			 */
			addr_pfn += 1;
			next = addr + PAGE_SIZE;
			continue;
		} else {
			unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
			unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
			pte_t *tail_page_pte;

			/*
			 * if the address is aligned to huge page size it is the
			 * head mapping.
			 */
			if (pfn_offset == 0) {
				/* Populate the head page vmemmap page */
				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;
				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

				/*
				 * Populate the tail pages vmemmap page.
				 * It can fall in a different pmd, hence
				 * vmemmap_populate_address()
				 */
				pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;

				addr_pfn += 2;
				next = addr + 2 * PAGE_SIZE;
				continue;
			}
			/*
			 * Get the 2nd mapping details.
			 * Also create it if it doesn't exist.
			 */
			tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
			if (!tail_page_pte) {

				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;
				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

				addr_pfn += 1;
				next = addr + PAGE_SIZE;
				continue;
			}

			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
			if (!pte)
				return -ENOMEM;
			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

			addr_pfn += 1;
			next = addr + PAGE_SIZE;
			continue;
		}
	}
	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size, true, NULL);
}

void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
			       struct vmem_altmap *altmap)
{
	remove_pagetable(start, end, false, altmap);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pmd_t *pmdp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
	trace_hugepage_update_pmd(addr, old, clr, set);

	return old;
}

unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pud_t *pudp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pud_trans_huge(*pudp));
	assert_spin_locked(pud_lockptr(mm, pudp));
#endif

	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
	trace_hugepage_update_pud(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	return old_pmd;
}

pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pud_t *pudp)
{
	pud_t old_pud;
	unsigned long old;

	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
	old_pud = __pud(old);
	return old_pud;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
					      _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * On POWER9, the NMMU is not able to relax PTE access permissions
	 * for a translation with a TLB. The PTE must be invalidated and the
	 * TLB flushed before the new PTE is installed.
	 *
	 * This only needs to be done for radix, because hash translation does
	 * flush when updating the linux pte (and we don't support NMMU
	 * accelerators on HPT on POWER9 anyway XXX: do we?).
	 *
	 * POWER10 (and P9P) NMMU does behave as per ISA.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
	    atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions (modulo the POWER9 nest MMU issue above),
		 * because the MMU will reload the PTE after taking an access
		 * fault, as defined by the architecture. See
		 * "Setting a Reference or Change Bit or Upgrading Access
		 * Authority (PTE Subject to Atomic Hardware Updates)" in
		 * Power ISA Version 3.1B.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * POWER9 NMMU must flush the TLB after clearing the PTE before
	 * installing a PTE with more relaxed access permissions, see
	 * radix__ptep_set_access_flags.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
	    is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pud;
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);

	return 1;
}

int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	int i;

	pmd = pud_pgtable(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;
			pte = (pte_t *)pmd_page_vaddr(pmd[i]);

			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);

	return 1;
}

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pmd;
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);

	return 1;
}

int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);

	return 1;
}