Path: arch/powerpc/kvm/book3s_64_mmu_radix.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <[email protected]>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include "book3s_hv.h"
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/plpar_wrappers.h>
#include <asm/firmware.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int old_pid, old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	if (kvmhv_is_nestedv2())
		return H_UNSUPPORTED;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  (to != NULL) ? __pa(to) : 0,
					  (from != NULL) ? __pa(from) : 0, n);

	if (eaddr & (0xFFFUL << 52))
		return ret;

	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
	isync();

	pagefault_disable();
	if (is_load)
		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
	else
		ret = __copy_to_user_inatomic((void __user *)to, from, n);
	pagefault_enable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}

static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid;

	/* This would cause a data segment intr so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested lpid */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;
	else
		pid = kvmppc_get_pid(vcpu);

	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
				 unsigned long n)
{
	long ret;

	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
	if (ret > 0)
		memset(to + (n - ret), 0, ret);

	return ret;
}

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
			       unsigned long n)
{
	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}

int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

	offset = rts + 31;

	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));

		kvm_vcpu_srcu_read_lock(vcpu);
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		kvm_vcpu_srcu_read_unlock(vcpu);
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	kvm_vcpu_srcu_read_lock(vcpu);
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	kvm_vcpu_srcu_read_unlock(vcpu);
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = kvmppc_get_pid(vcpu);
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (kvmppc_get_amr_hv(vcpu) & (1ul << 62))
				gpte->may_read = 0;
			if (kvmppc_get_amr_hv(vcpu) & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			     unsigned int pshift, u64 lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

	psi = shift_to_mmu_psize(pshift);

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
					lpid, rb);
	} else {
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_TLB,
					    psize_to_rpti_pgsize(psi),
					    addr, addr + psize);
	}

	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm, u64 lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
					lpid, TLBIEL_INVAL_SET_LPID);
	else
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
					    0, -1UL);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
				      unsigned long clr, unsigned long set,
				      unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
				    pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	pte_t *pte;

	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
	/* pmd_populate() will only reference _pa(pte). */
	kmemleak_ignore(pte);

	return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	pmd_t *pmd;

	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
	/* pud_populate() will only reference _pa(pmd). */
	kmemleak_ignore(pmd);

	return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift,
		      const struct kvm_memory_slot *memslot,
		      u64 lpid)

{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift) { /* 1GB or 2MB page */
		page_size = 1ul << shift;
		if (shift == PMD_SHIFT)
			kvm->stat.num_2M_pages--;
		else if (shift == PUD_SHIFT)
			kvm->stat.num_1G_pages--;
	}

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of page fault path
 * (full == false), valid ptes are generally not expected; however, there
 * is one situation where they arise, which is when dirty page logging is
 * turned off for a memslot while the VM is running.  The new memslot
 * becomes visible to page faults before the memslot commit function
 * gets to flush the memslot, which can lead to a 2MB page mapping being
 * installed for a guest physical address where there are already 64kB
 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  u64 lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  u64 lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
					 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_kernel(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  u64 lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_leaf(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, u64 lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		p4d_t *p4d = p4d_offset(pgd, 0);
		pud_t *pud;

		if (!p4d_present(*p4d))
			continue;
		pud = pud_offset(p4d, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		p4d_clear(p4d);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					      unsigned long gpa, u64 lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page no longer be cached by the MMU, so can be freed without
	 * flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					      unsigned long gpa, u64 lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any children pte pages will no longer be cached by the MMU,
	 * so can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry. RC bits, in the course of cleaning and
 * aging.  And the write bit can change, either the access could have been
 * upgraded, or a read fault could happen concurrently with a write fault
 * that sets those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, u64 lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	p4d = p4d_offset(pgd, gpa);

	pud = NULL;
	if (p4d_present(*p4d))
		pud = pud_offset(p4d, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_leaf(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_invalidate_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (p4d_none(*p4d)) {
		if (!new_pud)
			goto out_unlock;
		p4d_populate(kvm->mm, p4d, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(p4d, gpa);
	if (pud_leaf(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					      0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}

bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
			     unsigned long gpa, u64 lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;

	if (nested)
		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
	else
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);

	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;
	kvm_pfn_t pfn;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	hva = gfn_to_hva_memslot(memslot, gfn);
	pfn = __kvm_faultin_pfn(memslot, gfn, writing ? FOLL_WRITE : 0,
				&upgrade_write, &page);
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	pte = __pte(0);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!pte_present(pte)) {
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

	/* Get pte level from shift/size */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}

int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu,
				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
				ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return kvmppc_send_page_to_uv(kvm, gfn);

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
					ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
					    gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}

/* Called with kvm->mmu_lock held */
void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		     unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
		return;
	}

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
}

/* Called with kvm->mmu_lock held */
bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		   unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	bool ref = false;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					      gpa, shift);
		/* XXX need to flush tlb here? */
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		ref = true;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			unsigned long gfn)

{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	bool ref = false;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = true;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep, pte;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ret;

	/*
	 * For performance reasons we don't hold kvm->mmu_lock while walking the
	 * partition scoped table.
	 */
	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
	if (!ptep)
		return 0;

	pte = READ_ONCE(*ptep);
	if (pte_present(pte) && pte_dirty(pte)) {
		spin_lock(&kvm->mmu_lock);
		/*
		 * Recheck the pte again
		 */
		if (pte_val(pte) != pte_val(*ptep)) {
			/*
			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
			 * only find PAGE_SIZE pte entries here. We can continue
			 * to use the pte addr returned by above page table
			 * walk.
			 */
			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
				spin_unlock(&kvm->mmu_lock);
				return 0;
			}
		}

		ret = 1;
		VM_BUG_ON(shift);
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					      gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		spin_unlock(&kvm->mmu_lock);
	}
	return ret;
}

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i, j;
	int npages;

	for (i = 0; i < memslot->npages; i = j) {
		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since huge pages are only used to back the guest at guest
		 * real addresses that are a multiple of their size.
		 * Since we have at most one PTE covering any given guest
		 * real address, if npages > 1 we can skip to i + npages.
		 */
		j = i + 1;
		if (npages) {
			set_dirty_bits(map, i, npages);
			j = i + npages;
		}
	}
	return 0;
}

void kvmppc_radix_flush_memslot(struct kvm *kvm,
				const struct kvm_memory_slot *memslot)
{
	unsigned long n;
	pte_t *ptep;
	unsigned long gpa;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
		kvmppc_uvmem_drop_pages(memslot, kvm, true);

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return;

	gpa = memslot->base_gfn << PAGE_SHIFT;
	spin_lock(&kvm->mmu_lock);
	for (n = memslot->npages; n; --n) {
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
		if (ptep && pte_present(*ptep))
			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
					 kvm->arch.lpid);
		gpa += PAGE_SIZE;
	}
	/*
	 * Increase the mmu notifier sequence number to prevent any page
	 * fault that read the memslot earlier from writing a PTE.
	 */
	kvm->mmu_invalidate_seq++;
	spin_unlock(&kvm->mmu_lock);
}

static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
				 int psize, int *indexp)
{
	if (!mmu_psize_defs[psize].shift)
		return;
	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
		(mmu_psize_defs[psize].ap << 29);
	++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
	int i;

	if (!radix_enabled())
		return -EINVAL;
	memset(info, 0, sizeof(*info));

	/* 4k page size */
	info->geometries[0].page_shift = 12;
	info->geometries[0].level_bits[0] = 9;
	for (i = 1; i < 4; ++i)
		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
	/* 64k page size */
	info->geometries[1].page_shift = 16;
	for (i = 0; i < 4; ++i)
		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

	i = 0;
	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

	return 0;
}

int kvmppc_init_vm_radix(struct kvm *kvm)
{
	kvm->arch.pgtable = pgd_alloc(kvm->mm);
	if (!kvm->arch.pgtable)
		return -ENOMEM;
	return 0;
}

static void pte_ctor(void *addr)
{
	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

struct debugfs_radix_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	gpa;
	int		lpid;
	int		chars_left;
	int		buf_index;
	char		buf[128];
	u8		hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_radix_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
	struct debugfs_radix_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
				  size_t len, loff_t *ppos)
{
	struct debugfs_radix_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long n;
	struct kvm *kvm;
	unsigned long gpa;
	pgd_t *pgt;
	struct kvm_nested_guest *nested;
	pgd_t *pgdp;
	p4d_t p4d, *p4dp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ptep;
	int shift;
	unsigned long pte;

	kvm = p->kvm;
	if (!kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	gpa = p->gpa;
	nested = NULL;
	pgt = NULL;
	while (len != 0 && p->lpid >= 0) {
		if (gpa >= RADIX_PGTABLE_RANGE) {
			gpa = 0;
			pgt = NULL;
			if (nested) {
				kvmhv_put_nested(nested);
				nested = NULL;
			}
			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			p->hdr = 0;
			if (p->lpid < 0)
				break;
		}
		if (!pgt) {
			if (p->lpid == 0) {
				pgt = kvm->arch.pgtable;
			} else {
				nested = kvmhv_get_nested(kvm, p->lpid, false);
				if (!nested) {
					gpa = RADIX_PGTABLE_RANGE;
					continue;
				}
				pgt = nested->shadow_pgtable;
			}
		}
		n = 0;
		if (!p->hdr) {
			if (p->lpid > 0)
				n = scnprintf(p->buf, sizeof(p->buf),
					      "\nNested LPID %d: ", p->lpid);
			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
				      "pgdir: %lx\n", (unsigned long)pgt);
			p->hdr = 1;
			goto copy;
		}

		pgdp = pgt + pgd_index(gpa);
		p4dp = p4d_offset(pgdp, gpa);
		p4d = READ_ONCE(*p4dp);
		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
			gpa = (gpa & P4D_MASK) + P4D_SIZE;
			continue;
		}

		pudp = pud_offset(&p4d, gpa);
		pud = READ_ONCE(*pudp);
		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			continue;
		}
		if (pud_val(pud) & _PAGE_PTE) {
			pte = pud_val(pud);
			shift = PUD_SHIFT;
			goto leaf;
		}

		pmdp = pmd_offset(&pud, gpa);
		pmd = READ_ONCE(*pmdp);
		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			continue;
		}
		if (pmd_val(pmd) & _PAGE_PTE) {
			pte = pmd_val(pmd);
			shift = PMD_SHIFT;
			goto leaf;
		}

		ptep = pte_offset_kernel(&pmd, gpa);
		pte = pte_val(READ_ONCE(*ptep));
		if (!(pte & _PAGE_PRESENT)) {
			gpa += PAGE_SIZE;
			continue;
		}
		shift = PAGE_SHIFT;
	leaf:
		n = scnprintf(p->buf, sizeof(p->buf),
			      " %lx: %lx %d\n", gpa, pte, shift);
		gpa += 1ul << shift;
	copy:
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			break;
		}
	}
	p->gpa = gpa;
	if (nested)
		kvmhv_put_nested(nested);

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
				   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_radix_open,
	.release = debugfs_radix_release,
	.read	 = debugfs_radix_read,
	.write	 = debugfs_radix_write,
	.llseek	 = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
	debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
			    &debugfs_radix_fops);
}

int kvmppc_radix_init(void)
{
	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
	if (!kvm_pte_cache)
		return -ENOMEM;

	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
	if (!kvm_pmd_cache) {
		kmem_cache_destroy(kvm_pte_cache);
		return -ENOMEM;
	}

	return 0;
}

void kvmppc_radix_exit(void)
{
	kmem_cache_destroy(kvm_pte_cache);
	kmem_cache_destroy(kvm_pmd_cache);
}