// SPDX-License-Identifier: GPL-2.01/*2* Copyright (C) 2020-2023 Loongson Technology Corporation Limited3*/45#include <linux/highmem.h>6#include <linux/hugetlb.h>7#include <linux/kvm_host.h>8#include <linux/page-flags.h>9#include <linux/uaccess.h>10#include <asm/mmu_context.h>11#include <asm/pgalloc.h>12#include <asm/tlb.h>13#include <asm/kvm_mmu.h>1415static inline bool kvm_hugepage_capable(struct kvm_memory_slot *slot)16{17return slot->arch.flags & KVM_MEM_HUGEPAGE_CAPABLE;18}1920static inline bool kvm_hugepage_incapable(struct kvm_memory_slot *slot)21{22return slot->arch.flags & KVM_MEM_HUGEPAGE_INCAPABLE;23}2425static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx)26{27ctx->level = kvm->arch.root_level;28/* pte table */29ctx->invalid_ptes = kvm->arch.invalid_ptes;30ctx->pte_shifts = kvm->arch.pte_shifts;31ctx->pgtable_shift = ctx->pte_shifts[ctx->level];32ctx->invalid_entry = ctx->invalid_ptes[ctx->level];33ctx->opaque = kvm;34}3536/*37* Mark a range of guest physical address space old (all accesses fault) in the38* VM's GPA page table to allow detection of commonly used pages.39*/40static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)41{42if (kvm_pte_young(*pte)) {43*pte = kvm_pte_mkold(*pte);44return 1;45}4647return 0;48}4950/*51* Mark a range of guest physical address space clean (writes fault) in the VM's52* GPA page table to allow dirty page tracking.53*/54static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)55{56gfn_t offset;57kvm_pte_t val;5859val = *pte;60/*61* For kvm_arch_mmu_enable_log_dirty_pt_masked with mask, start and end62* may cross hugepage, for first huge page parameter addr is equal to63* start, however for the second huge page addr is base address of64* this huge page, rather than start or end address65*/66if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) {67offset = (addr >> PAGE_SHIFT) - ctx->gfn;68if (!(BIT(offset) & ctx->mask))69return 0;70}7172/*73* Need not split huge page now, just set write-proect pte bit74* Split huge page until next write fault75*/76if (kvm_pte_dirty(val)) {77*pte = kvm_pte_mkclean(val);78return 1;79}8081return 0;82}8384/*85* Clear pte entry86*/87static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)88{89struct kvm *kvm;9091kvm = ctx->opaque;92if (ctx->level)93kvm->stat.hugepages--;94else95kvm->stat.pages--;9697*pte = ctx->invalid_entry;9899return 1;100}101102/*103* kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.104*105* Allocate a blank KVM GPA page directory (PGD) for representing guest physical106* to host physical page mappings.107*108* Returns: Pointer to new KVM GPA page directory.109* NULL on allocation failure.110*/111kvm_pte_t *kvm_pgd_alloc(void)112{113kvm_pte_t *pgd;114115pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0);116if (pgd)117pgd_init((void *)pgd);118119return pgd;120}121122static void _kvm_pte_init(void *addr, unsigned long val)123{124unsigned long *p, *end;125126p = (unsigned long *)addr;127end = p + PTRS_PER_PTE;128do {129p[0] = val;130p[1] = val;131p[2] = val;132p[3] = val;133p[4] = val;134p += 8;135p[-3] = val;136p[-2] = val;137p[-1] = val;138} while (p != end);139}140141/*142* Caller must hold kvm->mm_lock143*144* Walk the page tables of kvm to find the PTE corresponding to the145* address @addr. If page tables don't exist for @addr, they will be created146* from the MMU cache if @cache is not NULL.147*/148static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm,149struct kvm_mmu_memory_cache *cache,150unsigned long addr, int level)151{152kvm_ptw_ctx ctx;153kvm_pte_t *entry, *child;154155kvm_ptw_prepare(kvm, &ctx);156child = kvm->arch.pgd;157while (ctx.level > level) {158entry = kvm_pgtable_offset(&ctx, child, addr);159if (kvm_pte_none(&ctx, entry)) {160if (!cache)161return NULL;162163child = kvm_mmu_memory_cache_alloc(cache);164_kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);165smp_wmb(); /* Make pte visible before pmd */166kvm_set_pte(entry, __pa(child));167} else if (kvm_pte_huge(*entry)) {168return entry;169} else170child = (kvm_pte_t *)__va(PHYSADDR(*entry));171kvm_ptw_enter(&ctx);172}173174entry = kvm_pgtable_offset(&ctx, child, addr);175176return entry;177}178179/*180* Page walker for VM shadow mmu at last level181* The last level is small pte page or huge pmd page182*/183static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)184{185int ret;186phys_addr_t next, start, size;187struct list_head *list;188kvm_pte_t *entry, *child;189190ret = 0;191start = addr;192child = (kvm_pte_t *)__va(PHYSADDR(*dir));193entry = kvm_pgtable_offset(ctx, child, addr);194do {195next = addr + (0x1UL << ctx->pgtable_shift);196if (!kvm_pte_present(ctx, entry))197continue;198199ret |= ctx->ops(entry, addr, ctx);200} while (entry++, addr = next, addr < end);201202if (kvm_need_flush(ctx)) {203size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);204if (start + size == end) {205list = (struct list_head *)child;206list_add_tail(list, &ctx->list);207*dir = ctx->invalid_ptes[ctx->level + 1];208}209}210211return ret;212}213214/*215* Page walker for VM shadow mmu at page table dir level216*/217static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)218{219int ret;220phys_addr_t next, start, size;221struct list_head *list;222kvm_pte_t *entry, *child;223224ret = 0;225start = addr;226child = (kvm_pte_t *)__va(PHYSADDR(*dir));227entry = kvm_pgtable_offset(ctx, child, addr);228do {229next = kvm_pgtable_addr_end(ctx, addr, end);230if (!kvm_pte_present(ctx, entry))231continue;232233if (kvm_pte_huge(*entry)) {234ret |= ctx->ops(entry, addr, ctx);235continue;236}237238kvm_ptw_enter(ctx);239if (ctx->level == 0)240ret |= kvm_ptw_leaf(entry, addr, next, ctx);241else242ret |= kvm_ptw_dir(entry, addr, next, ctx);243kvm_ptw_exit(ctx);244} while (entry++, addr = next, addr < end);245246if (kvm_need_flush(ctx)) {247size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);248if (start + size == end) {249list = (struct list_head *)child;250list_add_tail(list, &ctx->list);251*dir = ctx->invalid_ptes[ctx->level + 1];252}253}254255return ret;256}257258/*259* Page walker for VM shadow mmu at page root table260*/261static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)262{263int ret;264phys_addr_t next;265kvm_pte_t *entry;266267ret = 0;268entry = kvm_pgtable_offset(ctx, dir, addr);269do {270next = kvm_pgtable_addr_end(ctx, addr, end);271if (!kvm_pte_present(ctx, entry))272continue;273274kvm_ptw_enter(ctx);275ret |= kvm_ptw_dir(entry, addr, next, ctx);276kvm_ptw_exit(ctx);277} while (entry++, addr = next, addr < end);278279return ret;280}281282/*283* kvm_flush_range() - Flush a range of guest physical addresses.284* @kvm: KVM pointer.285* @start_gfn: Guest frame number of first page in GPA range to flush.286* @end_gfn: Guest frame number of last page in GPA range to flush.287* @lock: Whether to hold mmu_lock or not288*289* Flushes a range of GPA mappings from the GPA page tables.290*/291static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock)292{293int ret;294kvm_ptw_ctx ctx;295struct list_head *pos, *temp;296297ctx.ops = kvm_flush_pte;298ctx.flag = _KVM_FLUSH_PGTABLE;299kvm_ptw_prepare(kvm, &ctx);300INIT_LIST_HEAD(&ctx.list);301302if (lock) {303spin_lock(&kvm->mmu_lock);304ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,305end_gfn << PAGE_SHIFT, &ctx);306spin_unlock(&kvm->mmu_lock);307} else308ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,309end_gfn << PAGE_SHIFT, &ctx);310311/* Flush vpid for each vCPU individually */312if (ret)313kvm_flush_remote_tlbs(kvm);314315/*316* free pte table page after mmu_lock317* the pte table page is linked together with ctx.list318*/319list_for_each_safe(pos, temp, &ctx.list) {320list_del(pos);321free_page((unsigned long)pos);322}323}324325/*326* kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.327* @kvm: KVM pointer.328* @start_gfn: Guest frame number of first page in GPA range to flush.329* @end_gfn: Guest frame number of last page in GPA range to flush.330*331* Make a range of GPA mappings clean so that guest writes will fault and332* trigger dirty page logging.333*334* The caller must hold the @kvm->mmu_lock spinlock.335*336* Returns: Whether any GPA mappings were modified, which would require337* derived mappings (GVA page tables & TLB enties) to be338* invalidated.339*/340static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)341{342kvm_ptw_ctx ctx;343344ctx.ops = kvm_mkclean_pte;345ctx.flag = 0;346kvm_ptw_prepare(kvm, &ctx);347return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx);348}349350/*351* kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages352* @kvm: The KVM pointer353* @slot: The memory slot associated with mask354* @gfn_offset: The gfn offset in memory slot355* @mask: The mask of dirty pages at offset 'gfn_offset' in this memory356* slot to be write protected357*358* Walks bits set in mask write protects the associated pte's. Caller must359* acquire @kvm->mmu_lock.360*/361void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,362struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask)363{364kvm_ptw_ctx ctx;365gfn_t base_gfn = slot->base_gfn + gfn_offset;366gfn_t start = base_gfn + __ffs(mask);367gfn_t end = base_gfn + __fls(mask) + 1;368369ctx.ops = kvm_mkclean_pte;370ctx.flag = _KVM_HAS_PGMASK;371ctx.mask = mask;372ctx.gfn = base_gfn;373kvm_ptw_prepare(kvm, &ctx);374375kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx);376}377378int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,379struct kvm_memory_slot *new, enum kvm_mr_change change)380{381gpa_t gpa_start;382hva_t hva_start;383size_t size, gpa_offset, hva_offset;384385if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE))386return 0;387/*388* Prevent userspace from creating a memory region outside of the389* VM GPA address space390*/391if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> PAGE_SHIFT))392return -ENOMEM;393394new->arch.flags = 0;395size = new->npages * PAGE_SIZE;396gpa_start = new->base_gfn << PAGE_SHIFT;397hva_start = new->userspace_addr;398if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE)399&& IS_ALIGNED(hva_start, PMD_SIZE))400new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE;401else {402/*403* Pages belonging to memslots that don't have the same404* alignment within a PMD for userspace and GPA cannot be405* mapped with PMD entries, because we'll end up mapping406* the wrong pages.407*408* Consider a layout like the following:409*410* memslot->userspace_addr:411* +-----+--------------------+--------------------+---+412* |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|413* +-----+--------------------+--------------------+---+414*415* memslot->base_gfn << PAGE_SIZE:416* +---+--------------------+--------------------+-----+417* |abc|def Stage-2 block | Stage-2 block |tvxyz|418* +---+--------------------+--------------------+-----+419*420* If we create those stage-2 blocks, we'll end up with this421* incorrect mapping:422* d -> f423* e -> g424* f -> h425*/426gpa_offset = gpa_start & (PMD_SIZE - 1);427hva_offset = hva_start & (PMD_SIZE - 1);428if (gpa_offset != hva_offset) {429new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;430} else {431if (gpa_offset == 0)432gpa_offset = PMD_SIZE;433if ((size + gpa_offset) < (PMD_SIZE * 2))434new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;435}436}437438return 0;439}440441void kvm_arch_commit_memory_region(struct kvm *kvm,442struct kvm_memory_slot *old,443const struct kvm_memory_slot *new,444enum kvm_mr_change change)445{446int needs_flush;447u32 old_flags = old ? old->flags : 0;448u32 new_flags = new ? new->flags : 0;449bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;450451/* Only track memslot flags changed */452if (change != KVM_MR_FLAGS_ONLY)453return;454455/* Discard dirty page tracking on readonly memslot */456if ((old_flags & new_flags) & KVM_MEM_READONLY)457return;458459/*460* If dirty page logging is enabled, write protect all pages in the slot461* ready for dirty logging.462*463* There is no need to do this in any of the following cases:464* CREATE: No dirty mappings will already exist.465* MOVE/DELETE: The old mappings will already have been cleaned up by466* kvm_arch_flush_shadow_memslot()467*/468if (!(old_flags & KVM_MEM_LOG_DIRTY_PAGES) && log_dirty_pages) {469/*470* Initially-all-set does not require write protecting any page471* because they're all assumed to be dirty.472*/473if (kvm_dirty_log_manual_protect_and_init_set(kvm))474return;475476spin_lock(&kvm->mmu_lock);477/* Write protect GPA page table entries */478needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,479new->base_gfn + new->npages);480spin_unlock(&kvm->mmu_lock);481if (needs_flush)482kvm_flush_remote_tlbs(kvm);483}484}485486void kvm_arch_flush_shadow_all(struct kvm *kvm)487{488kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0);489}490491void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)492{493/*494* The slot has been made invalid (ready for moving or deletion), so we495* need to ensure that it can no longer be accessed by any guest vCPUs.496*/497kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1);498}499500bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)501{502kvm_ptw_ctx ctx;503504ctx.flag = 0;505ctx.ops = kvm_flush_pte;506kvm_ptw_prepare(kvm, &ctx);507INIT_LIST_HEAD(&ctx.list);508509return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,510range->end << PAGE_SHIFT, &ctx);511}512513bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)514{515kvm_ptw_ctx ctx;516517ctx.flag = 0;518ctx.ops = kvm_mkold_pte;519kvm_ptw_prepare(kvm, &ctx);520521return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,522range->end << PAGE_SHIFT, &ctx);523}524525bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)526{527gpa_t gpa = range->start << PAGE_SHIFT;528kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);529530if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep))531return true;532533return false;534}535536/*537* kvm_map_page_fast() - Fast path GPA fault handler.538* @vcpu: vCPU pointer.539* @gpa: Guest physical address of fault.540* @write: Whether the fault was due to a write.541*542* Perform fast path GPA fault handling, doing all that can be done without543* calling into KVM. This handles marking old pages young (for idle page544* tracking), and dirtying of clean pages (for dirty page logging).545*546* Returns: 0 on success, in which case we can update derived mappings and547* resume guest execution.548* -EFAULT on failure due to absent GPA mapping or write to549* read-only page, in which case KVM must be consulted.550*/551static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)552{553int ret = 0;554kvm_pte_t *ptep, changed, new;555gfn_t gfn = gpa >> PAGE_SHIFT;556struct kvm *kvm = vcpu->kvm;557struct kvm_memory_slot *slot;558559spin_lock(&kvm->mmu_lock);560561/* Fast path - just check GPA page table for an existing entry */562ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);563if (!ptep || !kvm_pte_present(NULL, ptep)) {564ret = -EFAULT;565goto out;566}567568/* Track access to pages marked old */569new = kvm_pte_mkyoung(*ptep);570if (write && !kvm_pte_dirty(new)) {571if (!kvm_pte_write(new)) {572ret = -EFAULT;573goto out;574}575576if (kvm_pte_huge(new)) {577/*578* Do not set write permission when dirty logging is579* enabled for HugePages580*/581slot = gfn_to_memslot(kvm, gfn);582if (kvm_slot_dirty_track_enabled(slot)) {583ret = -EFAULT;584goto out;585}586}587588/* Track dirtying of writeable pages */589new = kvm_pte_mkdirty(new);590}591592changed = new ^ (*ptep);593if (changed)594kvm_set_pte(ptep, new);595596spin_unlock(&kvm->mmu_lock);597598if (kvm_pte_dirty(changed))599mark_page_dirty(kvm, gfn);600601return ret;602out:603spin_unlock(&kvm->mmu_lock);604return ret;605}606607static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,608unsigned long hva, bool write)609{610hva_t start, end;611612/* Disable dirty logging on HugePages */613if (kvm_slot_dirty_track_enabled(memslot) && write)614return false;615616if (kvm_hugepage_capable(memslot))617return true;618619if (kvm_hugepage_incapable(memslot))620return false;621622start = memslot->userspace_addr;623end = start + memslot->npages * PAGE_SIZE;624625/*626* Next, let's make sure we're not trying to map anything not covered627* by the memslot. This means we have to prohibit block size mappings628* for the beginning and end of a non-block aligned and non-block sized629* memory slot (illustrated by the head and tail parts of the630* userspace view above containing pages 'abcde' and 'xyz',631* respectively).632*633* Note that it doesn't matter if we do the check using the634* userspace_addr or the base_gfn, as both are equally aligned (per635* the check above) and equally sized.636*/637return (hva >= ALIGN(start, PMD_SIZE)) && (hva < ALIGN_DOWN(end, PMD_SIZE));638}639640/*641* Lookup the mapping level for @gfn in the current mm.642*643* WARNING! Use of host_pfn_mapping_level() requires the caller and the end644* consumer to be tied into KVM's handlers for MMU notifier events!645*646* There are several ways to safely use this helper:647*648* - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before649* consuming it. In this case, mmu_lock doesn't need to be held during the650* lookup, but it does need to be held while checking the MMU notifier.651*652* - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation653* event for the hva. This can be done by explicit checking the MMU notifier654* or by ensuring that KVM already has a valid mapping that covers the hva.655*656* - Do not use the result to install new mappings, e.g. use the host mapping657* level only to decide whether or not to zap an entry. In this case, it's658* not required to hold mmu_lock (though it's highly likely the caller will659* want to hold mmu_lock anyways, e.g. to modify SPTEs).660*661* Note! The lookup can still race with modifications to host page tables, but662* the above "rules" ensure KVM will not _consume_ the result of the walk if a663* race with the primary MMU occurs.664*/665static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,666const struct kvm_memory_slot *slot)667{668int level = 0;669unsigned long hva;670unsigned long flags;671pgd_t pgd;672p4d_t p4d;673pud_t pud;674pmd_t pmd;675676/*677* Note, using the already-retrieved memslot and __gfn_to_hva_memslot()678* is not solely for performance, it's also necessary to avoid the679* "writable" check in __gfn_to_hva_many(), which will always fail on680* read-only memslots due to gfn_to_hva() assuming writes. Earlier681* page fault steps have already verified the guest isn't writing a682* read-only memslot.683*/684hva = __gfn_to_hva_memslot(slot, gfn);685686/*687* Disable IRQs to prevent concurrent tear down of host page tables,688* e.g. if the primary MMU promotes a P*D to a huge page and then frees689* the original page table.690*/691local_irq_save(flags);692693/*694* Read each entry once. As above, a non-leaf entry can be promoted to695* a huge page _during_ this walk. Re-reading the entry could send the696* walk into the weeks, e.g. p*d_leaf() returns false (sees the old697* value) and then p*d_offset() walks into the target huge page instead698* of the old page table (sees the new value).699*/700pgd = pgdp_get(pgd_offset(kvm->mm, hva));701if (pgd_none(pgd))702goto out;703704p4d = p4dp_get(p4d_offset(&pgd, hva));705if (p4d_none(p4d) || !p4d_present(p4d))706goto out;707708pud = pudp_get(pud_offset(&p4d, hva));709if (pud_none(pud) || !pud_present(pud))710goto out;711712pmd = pmdp_get(pmd_offset(&pud, hva));713if (pmd_none(pmd) || !pmd_present(pmd))714goto out;715716if (kvm_pte_huge(pmd_val(pmd)))717level = 1;718719out:720local_irq_restore(flags);721return level;722}723724/*725* Split huge page726*/727static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn)728{729int i;730kvm_pte_t val, *child;731struct kvm *kvm = vcpu->kvm;732struct kvm_mmu_memory_cache *memcache;733734memcache = &vcpu->arch.mmu_page_cache;735child = kvm_mmu_memory_cache_alloc(memcache);736val = kvm_pte_mksmall(*ptep);737for (i = 0; i < PTRS_PER_PTE; i++) {738kvm_set_pte(child + i, val);739val += PAGE_SIZE;740}741742smp_wmb(); /* Make pte visible before pmd */743/* The later kvm_flush_tlb_gpa() will flush hugepage tlb */744kvm_set_pte(ptep, __pa(child));745746kvm->stat.hugepages--;747kvm->stat.pages += PTRS_PER_PTE;748749return child + (gfn & (PTRS_PER_PTE - 1));750}751752/*753* kvm_map_page() - Map a guest physical page.754* @vcpu: vCPU pointer.755* @gpa: Guest physical address of fault.756* @write: Whether the fault was due to a write.757*758* Handle GPA faults by creating a new GPA mapping (or updating an existing759* one).760*761* This takes care of marking pages young or dirty (idle/dirty page tracking),762* asking KVM for the corresponding PFN, and creating a mapping in the GPA page763* tables. Derived mappings (GVA page tables and TLBs) must be handled by the764* caller.765*766* Returns: 0 on success767* -EFAULT if there is no memory region at @gpa or a write was768* attempted to a read-only memory region. This is usually handled769* as an MMIO access.770*/771static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)772{773bool writeable;774int srcu_idx, err, retry_no = 0, level;775unsigned long hva, mmu_seq, prot_bits;776kvm_pfn_t pfn;777kvm_pte_t *ptep, new_pte;778gfn_t gfn = gpa >> PAGE_SHIFT;779struct kvm *kvm = vcpu->kvm;780struct kvm_memory_slot *memslot;781struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;782struct page *page;783784/* Try the fast path to handle old / clean pages */785srcu_idx = srcu_read_lock(&kvm->srcu);786err = kvm_map_page_fast(vcpu, gpa, write);787if (!err)788goto out;789790memslot = gfn_to_memslot(kvm, gfn);791hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);792if (kvm_is_error_hva(hva) || (write && !writeable)) {793err = -EFAULT;794goto out;795}796797/* We need a minimum of cached pages ready for page table creation */798err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);799if (err)800goto out;801802retry:803/*804* Used to check for invalidations in progress, of the pfn that is805* returned by pfn_to_pfn_prot below.806*/807mmu_seq = kvm->mmu_invalidate_seq;808/*809* Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in810* kvm_faultin_pfn() (which calls get_user_pages()), so that we don't811* risk the page we get a reference to getting unmapped before we have a812* chance to grab the mmu_lock without mmu_invalidate_retry() noticing.813*814* This smp_rmb() pairs with the effective smp_wmb() of the combination815* of the pte_unmap_unlock() after the PTE is zapped, and the816* spin_lock() in kvm_mmu_invalidate_invalidate_<page|range_end>() before817* mmu_invalidate_seq is incremented.818*/819smp_rmb();820821/* Slow path - ask KVM core whether we can access this GPA */822pfn = kvm_faultin_pfn(vcpu, gfn, write, &writeable, &page);823if (is_error_noslot_pfn(pfn)) {824err = -EFAULT;825goto out;826}827828/* Check if an invalidation has taken place since we got pfn */829spin_lock(&kvm->mmu_lock);830if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {831/*832* This can happen when mappings are changed asynchronously, but833* also synchronously if a COW is triggered by834* kvm_faultin_pfn().835*/836spin_unlock(&kvm->mmu_lock);837kvm_release_page_unused(page);838if (retry_no > 100) {839retry_no = 0;840schedule();841}842retry_no++;843goto retry;844}845846/*847* For emulated devices such virtio device, actual cache attribute is848* determined by physical machine.849* For pass through physical device, it should be uncachable850*/851prot_bits = _PAGE_PRESENT | __READABLE;852if (pfn_valid(pfn))853prot_bits |= _CACHE_CC;854else855prot_bits |= _CACHE_SUC;856857if (writeable) {858prot_bits |= _PAGE_WRITE;859if (write)860prot_bits |= __WRITEABLE;861}862863/* Disable dirty logging on HugePages */864level = 0;865if (fault_supports_huge_mapping(memslot, hva, write)) {866/* Check page level about host mmu*/867level = host_pfn_mapping_level(kvm, gfn, memslot);868if (level == 1) {869/*870* Check page level about secondary mmu871* Disable hugepage if it is normal page on872* secondary mmu already873*/874ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);875if (ptep && !kvm_pte_huge(*ptep))876level = 0;877}878879if (level == 1) {880gfn = gfn & ~(PTRS_PER_PTE - 1);881pfn = pfn & ~(PTRS_PER_PTE - 1);882}883}884885/* Ensure page tables are allocated */886ptep = kvm_populate_gpa(kvm, memcache, gpa, level);887new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits));888if (level == 1) {889new_pte = kvm_pte_mkhuge(new_pte);890/*891* previous pmd entry is invalid_pte_table892* there is invalid tlb with small page893* need flush these invalid tlbs for current vcpu894*/895kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);896++kvm->stat.hugepages;897} else if (kvm_pte_huge(*ptep) && write)898ptep = kvm_split_huge(vcpu, ptep, gfn);899else900++kvm->stat.pages;901kvm_set_pte(ptep, new_pte);902903kvm_release_faultin_page(kvm, page, false, writeable);904spin_unlock(&kvm->mmu_lock);905906if (prot_bits & _PAGE_DIRTY)907mark_page_dirty_in_slot(kvm, memslot, gfn);908909out:910srcu_read_unlock(&kvm->srcu, srcu_idx);911return err;912}913914int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write, int ecode)915{916int ret;917918ret = kvm_map_page(vcpu, gpa, write);919if (ret)920return ret;921922/* Invalidate this entry in the TLB */923if (!cpu_has_ptw || (ecode == EXCCODE_TLBM)) {924/*925* With HW PTW, invalid TLB is not added when page fault. But926* for EXCCODE_TLBM exception, stale TLB may exist because of927* the last read access.928*929* With SW PTW, invalid TLB is added in TLB refill exception.930*/931vcpu->arch.flush_gpa = gpa;932kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu);933}934935return 0;936}937938void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)939{940}941942void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,943const struct kvm_memory_slot *memslot)944{945kvm_flush_remote_tlbs(kvm);946}947948949