Path: blob/master/tools/testing/selftests/kvm/lib/x86/processor.c
49428 views
// SPDX-License-Identifier: GPL-2.0-only1/*2* Copyright (C) 2018, Google LLC.3*/45#include "linux/bitmap.h"6#include "test_util.h"7#include "kvm_util.h"8#include "pmu.h"9#include "processor.h"10#include "sev.h"1112#ifndef NUM_INTERRUPTS13#define NUM_INTERRUPTS 25614#endif1516#define KERNEL_CS 0x817#define KERNEL_DS 0x1018#define KERNEL_TSS 0x181920vm_vaddr_t exception_handlers;21bool host_cpu_is_amd;22bool host_cpu_is_intel;23bool is_forced_emulation_enabled;24uint64_t guest_tsc_khz;2526const char *ex_str(int vector)27{28switch (vector) {29#define VEC_STR(v) case v##_VECTOR: return "#" #v30case DE_VECTOR: return "no exception";31case KVM_MAGIC_DE_VECTOR: return "#DE";32VEC_STR(DB);33VEC_STR(NMI);34VEC_STR(BP);35VEC_STR(OF);36VEC_STR(BR);37VEC_STR(UD);38VEC_STR(NM);39VEC_STR(DF);40VEC_STR(TS);41VEC_STR(NP);42VEC_STR(SS);43VEC_STR(GP);44VEC_STR(PF);45VEC_STR(MF);46VEC_STR(AC);47VEC_STR(MC);48VEC_STR(XM);49VEC_STR(VE);50VEC_STR(CP);51VEC_STR(HV);52VEC_STR(VC);53VEC_STR(SX);54default: return "#??";55#undef VEC_STR56}57}5859static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent)60{61fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "62"rcx: 0x%.16llx rdx: 0x%.16llx\n",63indent, "",64regs->rax, regs->rbx, regs->rcx, regs->rdx);65fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "66"rsp: 0x%.16llx rbp: 0x%.16llx\n",67indent, "",68regs->rsi, regs->rdi, regs->rsp, regs->rbp);69fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx "70"r10: 0x%.16llx r11: 0x%.16llx\n",71indent, "",72regs->r8, regs->r9, regs->r10, regs->r11);73fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "74"r14: 0x%.16llx r15: 0x%.16llx\n",75indent, "",76regs->r12, regs->r13, regs->r14, regs->r15);77fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",78indent, "",79regs->rip, regs->rflags);80}8182static void segment_dump(FILE *stream, struct kvm_segment *segment,83uint8_t indent)84{85fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "86"selector: 0x%.4x type: 0x%.2x\n",87indent, "", segment->base, segment->limit,88segment->selector, segment->type);89fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "90"db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",91indent, "", segment->present, segment->dpl,92segment->db, segment->s, segment->l);93fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "94"unusable: 0x%.2x padding: 0x%.2x\n",95indent, "", segment->g, segment->avl,96segment->unusable, segment->padding);97}9899static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,100uint8_t indent)101{102fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "103"padding: 0x%.4x 0x%.4x 0x%.4x\n",104indent, "", dtable->base, dtable->limit,105dtable->padding[0], dtable->padding[1], dtable->padding[2]);106}107108static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent)109{110unsigned int i;111112fprintf(stream, "%*scs:\n", indent, "");113segment_dump(stream, &sregs->cs, indent + 2);114fprintf(stream, "%*sds:\n", indent, "");115segment_dump(stream, &sregs->ds, indent + 2);116fprintf(stream, "%*ses:\n", indent, "");117segment_dump(stream, &sregs->es, indent + 2);118fprintf(stream, "%*sfs:\n", indent, "");119segment_dump(stream, &sregs->fs, indent + 2);120fprintf(stream, "%*sgs:\n", indent, "");121segment_dump(stream, &sregs->gs, indent + 2);122fprintf(stream, "%*sss:\n", indent, "");123segment_dump(stream, &sregs->ss, indent + 2);124fprintf(stream, "%*str:\n", indent, "");125segment_dump(stream, &sregs->tr, indent + 2);126fprintf(stream, "%*sldt:\n", indent, "");127segment_dump(stream, &sregs->ldt, indent + 2);128129fprintf(stream, "%*sgdt:\n", indent, "");130dtable_dump(stream, &sregs->gdt, indent + 2);131fprintf(stream, "%*sidt:\n", indent, "");132dtable_dump(stream, &sregs->idt, indent + 2);133134fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "135"cr3: 0x%.16llx cr4: 0x%.16llx\n",136indent, "",137sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);138fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "139"apic_base: 0x%.16llx\n",140indent, "",141sregs->cr8, sregs->efer, sregs->apic_base);142143fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");144for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {145fprintf(stream, "%*s%.16llx\n", indent + 2, "",146sregs->interrupt_bitmap[i]);147}148}149150bool kvm_is_tdp_enabled(void)151{152if (host_cpu_is_intel)153return get_kvm_intel_param_bool("ept");154else155return get_kvm_amd_param_bool("npt");156}157158void virt_arch_pgd_alloc(struct kvm_vm *vm)159{160TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,161"Unknown or unsupported guest mode: 0x%x", vm->mode);162163/* If needed, create the top-level page table. */164if (!vm->pgd_created) {165vm->pgd = vm_alloc_page_table(vm);166vm->pgd_created = true;167}168}169170static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte,171uint64_t vaddr, int level)172{173uint64_t pt_gpa = PTE_GET_PA(*parent_pte);174uint64_t *page_table = addr_gpa2hva(vm, pt_gpa);175int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;176177TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd,178"Parent PTE (level %d) not PRESENT for gva: 0x%08lx",179level + 1, vaddr);180181return &page_table[index];182}183184static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,185uint64_t *parent_pte,186uint64_t vaddr,187uint64_t paddr,188int current_level,189int target_level)190{191uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level);192193paddr = vm_untag_gpa(vm, paddr);194195if (!(*pte & PTE_PRESENT_MASK)) {196*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;197if (current_level == target_level)198*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);199else200*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;201} else {202/*203* Entry already present. Assert that the caller doesn't want204* a hugepage at this level, and that there isn't a hugepage at205* this level.206*/207TEST_ASSERT(current_level != target_level,208"Cannot create hugepage at level: %u, vaddr: 0x%lx",209current_level, vaddr);210TEST_ASSERT(!(*pte & PTE_LARGE_MASK),211"Cannot create page table at level: %u, vaddr: 0x%lx",212current_level, vaddr);213}214return pte;215}216217void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)218{219const uint64_t pg_size = PG_LEVEL_SIZE(level);220uint64_t *pte = &vm->pgd;221int current_level;222223TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,224"Unknown or unsupported guest mode: 0x%x", vm->mode);225226TEST_ASSERT((vaddr % pg_size) == 0,227"Virtual address not aligned,\n"228"vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size);229TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)),230"Invalid virtual address, vaddr: 0x%lx", vaddr);231TEST_ASSERT((paddr % pg_size) == 0,232"Physical address not aligned,\n"233" paddr: 0x%lx page size: 0x%lx", paddr, pg_size);234TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,235"Physical address beyond maximum supported,\n"236" paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",237paddr, vm->max_gfn, vm->page_size);238TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr,239"Unexpected bits in paddr: %lx", paddr);240241/*242* Allocate upper level page tables, if not already present. Return243* early if a hugepage was created.244*/245for (current_level = vm->pgtable_levels;246current_level > PG_LEVEL_4K;247current_level--) {248pte = virt_create_upper_pte(vm, pte, vaddr, paddr,249current_level, level);250if (*pte & PTE_LARGE_MASK)251return;252}253254/* Fill in page table entry. */255pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K);256TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),257"PTE already present for 4k page at vaddr: 0x%lx", vaddr);258*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);259260/*261* Neither SEV nor TDX supports shared page tables, so only the final262* leaf PTE needs manually set the C/S-bit.263*/264if (vm_is_gpa_protected(vm, paddr))265*pte |= vm->arch.c_bit;266else267*pte |= vm->arch.s_bit;268}269270void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)271{272__virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K);273}274275void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,276uint64_t nr_bytes, int level)277{278uint64_t pg_size = PG_LEVEL_SIZE(level);279uint64_t nr_pages = nr_bytes / pg_size;280int i;281282TEST_ASSERT(nr_bytes % pg_size == 0,283"Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx",284nr_bytes, pg_size);285286for (i = 0; i < nr_pages; i++) {287__virt_pg_map(vm, vaddr, paddr, level);288sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift,289nr_bytes / PAGE_SIZE);290291vaddr += pg_size;292paddr += pg_size;293}294}295296static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)297{298if (*pte & PTE_LARGE_MASK) {299TEST_ASSERT(*level == PG_LEVEL_NONE ||300*level == current_level,301"Unexpected hugepage at level %d", current_level);302*level = current_level;303}304305return *level == current_level;306}307308uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,309int *level)310{311int va_width = 12 + (vm->pgtable_levels) * 9;312uint64_t *pte = &vm->pgd;313int current_level;314315TEST_ASSERT(!vm->arch.is_pt_protected,316"Walking page tables of protected guests is impossible");317318TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels,319"Invalid PG_LEVEL_* '%d'", *level);320321TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,322"Unknown or unsupported guest mode: 0x%x", vm->mode);323TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,324(vaddr >> vm->page_shift)),325"Invalid virtual address, vaddr: 0x%lx",326vaddr);327/*328* Check that the vaddr is a sign-extended va_width value.329*/330TEST_ASSERT(vaddr ==331(((int64_t)vaddr << (64 - va_width) >> (64 - va_width))),332"Canonical check failed. The virtual address is invalid.");333334for (current_level = vm->pgtable_levels;335current_level > PG_LEVEL_4K;336current_level--) {337pte = virt_get_pte(vm, pte, vaddr, current_level);338if (vm_is_target_pte(pte, level, current_level))339return pte;340}341342return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K);343}344345uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr)346{347int level = PG_LEVEL_4K;348349return __vm_get_page_table_entry(vm, vaddr, &level);350}351352void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)353{354uint64_t *pml4e, *pml4e_start;355uint64_t *pdpe, *pdpe_start;356uint64_t *pde, *pde_start;357uint64_t *pte, *pte_start;358359if (!vm->pgd_created)360return;361362fprintf(stream, "%*s "363" no\n", indent, "");364fprintf(stream, "%*s index hvaddr gpaddr "365"addr w exec dirty\n",366indent, "");367pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);368for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {369pml4e = &pml4e_start[n1];370if (!(*pml4e & PTE_PRESENT_MASK))371continue;372fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "373" %u\n",374indent, "",375pml4e - pml4e_start, pml4e,376addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),377!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));378379pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);380for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {381pdpe = &pdpe_start[n2];382if (!(*pdpe & PTE_PRESENT_MASK))383continue;384fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx "385"%u %u\n",386indent, "",387pdpe - pdpe_start, pdpe,388addr_hva2gpa(vm, pdpe),389PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),390!!(*pdpe & PTE_NX_MASK));391392pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);393for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {394pde = &pde_start[n3];395if (!(*pde & PTE_PRESENT_MASK))396continue;397fprintf(stream, "%*spde 0x%-3zx %p "398"0x%-12lx 0x%-10llx %u %u\n",399indent, "", pde - pde_start, pde,400addr_hva2gpa(vm, pde),401PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),402!!(*pde & PTE_NX_MASK));403404pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);405for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {406pte = &pte_start[n4];407if (!(*pte & PTE_PRESENT_MASK))408continue;409fprintf(stream, "%*spte 0x%-3zx %p "410"0x%-12lx 0x%-10llx %u %u "411" %u 0x%-10lx\n",412indent, "",413pte - pte_start, pte,414addr_hva2gpa(vm, pte),415PTE_GET_PFN(*pte),416!!(*pte & PTE_WRITABLE_MASK),417!!(*pte & PTE_NX_MASK),418!!(*pte & PTE_DIRTY_MASK),419((uint64_t) n1 << 27)420| ((uint64_t) n2 << 18)421| ((uint64_t) n3 << 9)422| ((uint64_t) n4));423}424}425}426}427}428429/*430* Set Unusable Segment431*432* Input Args: None433*434* Output Args:435* segp - Pointer to segment register436*437* Return: None438*439* Sets the segment register pointed to by @segp to an unusable state.440*/441static void kvm_seg_set_unusable(struct kvm_segment *segp)442{443memset(segp, 0, sizeof(*segp));444segp->unusable = true;445}446447static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)448{449void *gdt = addr_gva2hva(vm, vm->arch.gdt);450struct desc64 *desc = gdt + (segp->selector >> 3) * 8;451452desc->limit0 = segp->limit & 0xFFFF;453desc->base0 = segp->base & 0xFFFF;454desc->base1 = segp->base >> 16;455desc->type = segp->type;456desc->s = segp->s;457desc->dpl = segp->dpl;458desc->p = segp->present;459desc->limit1 = segp->limit >> 16;460desc->avl = segp->avl;461desc->l = segp->l;462desc->db = segp->db;463desc->g = segp->g;464desc->base2 = segp->base >> 24;465if (!segp->s)466desc->base3 = segp->base >> 32;467}468469static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp)470{471memset(segp, 0, sizeof(*segp));472segp->selector = KERNEL_CS;473segp->limit = 0xFFFFFFFFu;474segp->s = 0x1; /* kTypeCodeData */475segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed476* | kFlagCodeReadable477*/478segp->g = true;479segp->l = true;480segp->present = 1;481}482483static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp)484{485memset(segp, 0, sizeof(*segp));486segp->selector = KERNEL_DS;487segp->limit = 0xFFFFFFFFu;488segp->s = 0x1; /* kTypeCodeData */489segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed490* | kFlagDataWritable491*/492segp->g = true;493segp->present = true;494}495496vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)497{498int level = PG_LEVEL_NONE;499uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level);500501TEST_ASSERT(*pte & PTE_PRESENT_MASK,502"Leaf PTE not PRESENT for gva: 0x%08lx", gva);503504/*505* No need for a hugepage mask on the PTE, x86-64 requires the "unused"506* address bits to be zero.507*/508return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level));509}510511static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp)512{513memset(segp, 0, sizeof(*segp));514segp->base = base;515segp->limit = 0x67;516segp->selector = KERNEL_TSS;517segp->type = 0xb;518segp->present = 1;519}520521static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)522{523struct kvm_sregs sregs;524525TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,526"Unknown or unsupported guest mode: 0x%x", vm->mode);527528/* Set mode specific system register values. */529vcpu_sregs_get(vcpu, &sregs);530531sregs.idt.base = vm->arch.idt;532sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;533sregs.gdt.base = vm->arch.gdt;534sregs.gdt.limit = getpagesize() - 1;535536sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;537sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;538if (kvm_cpu_has(X86_FEATURE_XSAVE))539sregs.cr4 |= X86_CR4_OSXSAVE;540if (vm->pgtable_levels == 5)541sregs.cr4 |= X86_CR4_LA57;542sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);543544kvm_seg_set_unusable(&sregs.ldt);545kvm_seg_set_kernel_code_64bit(&sregs.cs);546kvm_seg_set_kernel_data_64bit(&sregs.ds);547kvm_seg_set_kernel_data_64bit(&sregs.es);548kvm_seg_set_kernel_data_64bit(&sregs.gs);549kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr);550551sregs.cr3 = vm->pgd;552vcpu_sregs_set(vcpu, &sregs);553}554555static void vcpu_init_xcrs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)556{557struct kvm_xcrs xcrs = {558.nr_xcrs = 1,559.xcrs[0].xcr = 0,560.xcrs[0].value = kvm_cpu_supported_xcr0(),561};562563if (!kvm_cpu_has(X86_FEATURE_XSAVE))564return;565566vcpu_xcrs_set(vcpu, &xcrs);567}568569static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,570int dpl, unsigned short selector)571{572struct idt_entry *base =573(struct idt_entry *)addr_gva2hva(vm, vm->arch.idt);574struct idt_entry *e = &base[vector];575576memset(e, 0, sizeof(*e));577e->offset0 = addr;578e->selector = selector;579e->ist = 0;580e->type = 14;581e->dpl = dpl;582e->p = 1;583e->offset1 = addr >> 16;584e->offset2 = addr >> 32;585}586587static bool kvm_fixup_exception(struct ex_regs *regs)588{589if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10)590return false;591592if (regs->vector == DE_VECTOR)593regs->vector = KVM_MAGIC_DE_VECTOR;594595regs->rip = regs->r11;596regs->r9 = regs->vector;597regs->r10 = regs->error_code;598return true;599}600601void route_exception(struct ex_regs *regs)602{603typedef void(*handler)(struct ex_regs *);604handler *handlers = (handler *)exception_handlers;605606if (handlers && handlers[regs->vector]) {607handlers[regs->vector](regs);608return;609}610611if (kvm_fixup_exception(regs))612return;613614GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'",615regs->vector, regs->rip);616}617618static void vm_init_descriptor_tables(struct kvm_vm *vm)619{620extern void *idt_handlers;621struct kvm_segment seg;622int i;623624vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);625vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);626vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);627vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);628629/* Handlers have the same address in both address spaces.*/630for (i = 0; i < NUM_INTERRUPTS; i++)631set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS);632633*(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;634635kvm_seg_set_kernel_code_64bit(&seg);636kvm_seg_fill_gdt_64bit(vm, &seg);637638kvm_seg_set_kernel_data_64bit(&seg);639kvm_seg_fill_gdt_64bit(vm, &seg);640641kvm_seg_set_tss_64bit(vm->arch.tss, &seg);642kvm_seg_fill_gdt_64bit(vm, &seg);643}644645void vm_install_exception_handler(struct kvm_vm *vm, int vector,646void (*handler)(struct ex_regs *))647{648vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers);649650handlers[vector] = (vm_vaddr_t)handler;651}652653void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)654{655struct ucall uc;656657if (get_ucall(vcpu, &uc) == UCALL_ABORT)658REPORT_GUEST_ASSERT(uc);659}660661void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus)662{663int r;664665TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ),666"Require KVM_GET_TSC_KHZ to provide udelay() to guest.");667668vm_create_irqchip(vm);669vm_init_descriptor_tables(vm);670671sync_global_to_guest(vm, host_cpu_is_intel);672sync_global_to_guest(vm, host_cpu_is_amd);673sync_global_to_guest(vm, is_forced_emulation_enabled);674sync_global_to_guest(vm, pmu_errata_mask);675676if (is_sev_vm(vm)) {677struct kvm_sev_init init = { 0 };678679vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);680}681682r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);683TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");684guest_tsc_khz = r;685sync_global_to_guest(vm, guest_tsc_khz);686}687688void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)689{690struct kvm_regs regs;691692vcpu_regs_get(vcpu, ®s);693regs.rip = (unsigned long) guest_code;694vcpu_regs_set(vcpu, ®s);695}696697struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)698{699struct kvm_mp_state mp_state;700struct kvm_regs regs;701vm_vaddr_t stack_vaddr;702struct kvm_vcpu *vcpu;703704stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),705DEFAULT_GUEST_STACK_VADDR_MIN,706MEM_REGION_DATA);707708stack_vaddr += DEFAULT_STACK_PGS * getpagesize();709710/*711* Align stack to match calling sequence requirements in section "The712* Stack Frame" of the System V ABI AMD64 Architecture Processor713* Supplement, which requires the value (%rsp + 8) to be a multiple of714* 16 when control is transferred to the function entry point.715*716* If this code is ever used to launch a vCPU with 32-bit entry point it717* may need to subtract 4 bytes instead of 8 bytes.718*/719TEST_ASSERT(IS_ALIGNED(stack_vaddr, PAGE_SIZE),720"__vm_vaddr_alloc() did not provide a page-aligned address");721stack_vaddr -= 8;722723vcpu = __vm_vcpu_add(vm, vcpu_id);724vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());725vcpu_init_sregs(vm, vcpu);726vcpu_init_xcrs(vm, vcpu);727728/* Setup guest general purpose registers */729vcpu_regs_get(vcpu, ®s);730regs.rflags = regs.rflags | 0x2;731regs.rsp = stack_vaddr;732vcpu_regs_set(vcpu, ®s);733734/* Setup the MP state */735mp_state.mp_state = 0;736vcpu_mp_state_set(vcpu, &mp_state);737738/*739* Refresh CPUID after setting SREGS and XCR0, so that KVM's "runtime"740* updates to guest CPUID, e.g. for OSXSAVE and XSAVE state size, are741* reflected into selftests' vCPU CPUID cache, i.e. so that the cache742* is consistent with vCPU state.743*/744vcpu_get_cpuid(vcpu);745return vcpu;746}747748struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id)749{750struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id);751752vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());753754return vcpu;755}756757void vcpu_arch_free(struct kvm_vcpu *vcpu)758{759if (vcpu->cpuid)760free(vcpu->cpuid);761}762763/* Do not use kvm_supported_cpuid directly except for validity checks. */764static void *kvm_supported_cpuid;765766const struct kvm_cpuid2 *kvm_get_supported_cpuid(void)767{768int kvm_fd;769770if (kvm_supported_cpuid)771return kvm_supported_cpuid;772773kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);774kvm_fd = open_kvm_dev_path_or_exit();775776kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID,777(struct kvm_cpuid2 *)kvm_supported_cpuid);778779close(kvm_fd);780return kvm_supported_cpuid;781}782783static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid,784uint32_t function, uint32_t index,785uint8_t reg, uint8_t lo, uint8_t hi)786{787const struct kvm_cpuid_entry2 *entry;788int i;789790for (i = 0; i < cpuid->nent; i++) {791entry = &cpuid->entries[i];792793/*794* The output registers in kvm_cpuid_entry2 are in alphabetical795* order, but kvm_x86_cpu_feature matches that mess, so yay796* pointer shenanigans!797*/798if (entry->function == function && entry->index == index)799return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo;800}801802return 0;803}804805bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,806struct kvm_x86_cpu_feature feature)807{808return __kvm_cpu_has(cpuid, feature.function, feature.index,809feature.reg, feature.bit, feature.bit);810}811812uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid,813struct kvm_x86_cpu_property property)814{815return __kvm_cpu_has(cpuid, property.function, property.index,816property.reg, property.lo_bit, property.hi_bit);817}818819uint64_t kvm_get_feature_msr(uint64_t msr_index)820{821struct {822struct kvm_msrs header;823struct kvm_msr_entry entry;824} buffer = {};825int r, kvm_fd;826827buffer.header.nmsrs = 1;828buffer.entry.index = msr_index;829kvm_fd = open_kvm_dev_path_or_exit();830831r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);832TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r));833834close(kvm_fd);835return buffer.entry.data;836}837838void __vm_xsave_require_permission(uint64_t xfeature, const char *name)839{840int kvm_fd;841u64 bitmask;842long rc;843struct kvm_device_attr attr = {844.group = 0,845.attr = KVM_X86_XCOMP_GUEST_SUPP,846.addr = (unsigned long) &bitmask,847};848849TEST_ASSERT(!kvm_supported_cpuid,850"kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM");851852TEST_ASSERT(is_power_of_2(xfeature),853"Dynamic XFeatures must be enabled one at a time");854855kvm_fd = open_kvm_dev_path_or_exit();856rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);857close(kvm_fd);858859if (rc == -1 && (errno == ENXIO || errno == EINVAL))860__TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported");861862TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);863864__TEST_REQUIRE(bitmask & xfeature,865"Required XSAVE feature '%s' not supported", name);866867TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, ilog2(xfeature)));868869rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);870TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);871TEST_ASSERT(bitmask & xfeature,872"'%s' (0x%lx) not permitted after prctl(ARCH_REQ_XCOMP_GUEST_PERM) permitted=0x%lx",873name, xfeature, bitmask);874}875876void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid)877{878TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID");879880/* Allow overriding the default CPUID. */881if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) {882free(vcpu->cpuid);883vcpu->cpuid = NULL;884}885886if (!vcpu->cpuid)887vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent);888889memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));890vcpu_set_cpuid(vcpu);891}892893void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu,894struct kvm_x86_cpu_property property,895uint32_t value)896{897struct kvm_cpuid_entry2 *entry;898899entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index);900901(&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit);902(&entry->eax)[property.reg] |= value << property.lo_bit;903904vcpu_set_cpuid(vcpu);905906/* Sanity check that @value doesn't exceed the bounds in any way. */907TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value);908}909910void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function)911{912struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function);913914entry->eax = 0;915entry->ebx = 0;916entry->ecx = 0;917entry->edx = 0;918vcpu_set_cpuid(vcpu);919}920921void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu,922struct kvm_x86_cpu_feature feature,923bool set)924{925struct kvm_cpuid_entry2 *entry;926u32 *reg;927928entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index);929reg = (&entry->eax) + feature.reg;930931if (set)932*reg |= BIT(feature.bit);933else934*reg &= ~BIT(feature.bit);935936vcpu_set_cpuid(vcpu);937}938939uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index)940{941struct {942struct kvm_msrs header;943struct kvm_msr_entry entry;944} buffer = {};945946buffer.header.nmsrs = 1;947buffer.entry.index = msr_index;948949vcpu_msrs_get(vcpu, &buffer.header);950951return buffer.entry.data;952}953954int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value)955{956struct {957struct kvm_msrs header;958struct kvm_msr_entry entry;959} buffer = {};960961memset(&buffer, 0, sizeof(buffer));962buffer.header.nmsrs = 1;963buffer.entry.index = msr_index;964buffer.entry.data = msr_value;965966return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header);967}968969void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)970{971va_list ap;972struct kvm_regs regs;973974TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"975" num: %u",976num);977978va_start(ap, num);979vcpu_regs_get(vcpu, ®s);980981if (num >= 1)982regs.rdi = va_arg(ap, uint64_t);983984if (num >= 2)985regs.rsi = va_arg(ap, uint64_t);986987if (num >= 3)988regs.rdx = va_arg(ap, uint64_t);989990if (num >= 4)991regs.rcx = va_arg(ap, uint64_t);992993if (num >= 5)994regs.r8 = va_arg(ap, uint64_t);995996if (num >= 6)997regs.r9 = va_arg(ap, uint64_t);998999vcpu_regs_set(vcpu, ®s);1000va_end(ap);1001}10021003void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent)1004{1005struct kvm_regs regs;1006struct kvm_sregs sregs;10071008fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id);10091010fprintf(stream, "%*sregs:\n", indent + 2, "");1011vcpu_regs_get(vcpu, ®s);1012regs_dump(stream, ®s, indent + 4);10131014fprintf(stream, "%*ssregs:\n", indent + 2, "");1015vcpu_sregs_get(vcpu, &sregs);1016sregs_dump(stream, &sregs, indent + 4);1017}10181019static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs)1020{1021struct kvm_msr_list *list;1022struct kvm_msr_list nmsrs;1023int kvm_fd, r;10241025kvm_fd = open_kvm_dev_path_or_exit();10261027nmsrs.nmsrs = 0;1028if (!feature_msrs)1029r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);1030else1031r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs);10321033TEST_ASSERT(r == -1 && errno == E2BIG,1034"Expected -E2BIG, got rc: %i errno: %i (%s)",1035r, errno, strerror(errno));10361037list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0]));1038TEST_ASSERT(list, "-ENOMEM when allocating MSR index list");1039list->nmsrs = nmsrs.nmsrs;10401041if (!feature_msrs)1042kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);1043else1044kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list);1045close(kvm_fd);10461047TEST_ASSERT(list->nmsrs == nmsrs.nmsrs,1048"Number of MSRs in list changed, was %d, now %d",1049nmsrs.nmsrs, list->nmsrs);1050return list;1051}10521053const struct kvm_msr_list *kvm_get_msr_index_list(void)1054{1055static const struct kvm_msr_list *list;10561057if (!list)1058list = __kvm_get_msr_index_list(false);1059return list;1060}106110621063const struct kvm_msr_list *kvm_get_feature_msr_index_list(void)1064{1065static const struct kvm_msr_list *list;10661067if (!list)1068list = __kvm_get_msr_index_list(true);1069return list;1070}10711072bool kvm_msr_is_in_save_restore_list(uint32_t msr_index)1073{1074const struct kvm_msr_list *list = kvm_get_msr_index_list();1075int i;10761077for (i = 0; i < list->nmsrs; ++i) {1078if (list->indices[i] == msr_index)1079return true;1080}10811082return false;1083}10841085static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu,1086struct kvm_x86_state *state)1087{1088int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2);10891090if (size) {1091state->xsave = malloc(size);1092vcpu_xsave2_get(vcpu, state->xsave);1093} else {1094state->xsave = malloc(sizeof(struct kvm_xsave));1095vcpu_xsave_get(vcpu, state->xsave);1096}1097}10981099struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu)1100{1101const struct kvm_msr_list *msr_list = kvm_get_msr_index_list();1102struct kvm_x86_state *state;1103int i;11041105static int nested_size = -1;11061107if (nested_size == -1) {1108nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);1109TEST_ASSERT(nested_size <= sizeof(state->nested_),1110"Nested state size too big, %i > %zi",1111nested_size, sizeof(state->nested_));1112}11131114/*1115* When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees1116* guest state is consistent only after userspace re-enters the1117* kernel with KVM_RUN. Complete IO prior to migrating state1118* to a new VM.1119*/1120vcpu_run_complete_io(vcpu);11211122state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0]));1123TEST_ASSERT(state, "-ENOMEM when allocating kvm state");11241125vcpu_events_get(vcpu, &state->events);1126vcpu_mp_state_get(vcpu, &state->mp_state);1127vcpu_regs_get(vcpu, &state->regs);1128vcpu_save_xsave_state(vcpu, state);11291130if (kvm_has_cap(KVM_CAP_XCRS))1131vcpu_xcrs_get(vcpu, &state->xcrs);11321133vcpu_sregs_get(vcpu, &state->sregs);11341135if (nested_size) {1136state->nested.size = sizeof(state->nested_);11371138vcpu_nested_state_get(vcpu, &state->nested);1139TEST_ASSERT(state->nested.size <= nested_size,1140"Nested state size too big, %i (KVM_CHECK_CAP gave %i)",1141state->nested.size, nested_size);1142} else {1143state->nested.size = 0;1144}11451146state->msrs.nmsrs = msr_list->nmsrs;1147for (i = 0; i < msr_list->nmsrs; i++)1148state->msrs.entries[i].index = msr_list->indices[i];1149vcpu_msrs_get(vcpu, &state->msrs);11501151vcpu_debugregs_get(vcpu, &state->debugregs);11521153return state;1154}11551156void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state)1157{1158vcpu_sregs_set(vcpu, &state->sregs);1159vcpu_msrs_set(vcpu, &state->msrs);11601161if (kvm_has_cap(KVM_CAP_XCRS))1162vcpu_xcrs_set(vcpu, &state->xcrs);11631164vcpu_xsave_set(vcpu, state->xsave);1165vcpu_events_set(vcpu, &state->events);1166vcpu_mp_state_set(vcpu, &state->mp_state);1167vcpu_debugregs_set(vcpu, &state->debugregs);1168vcpu_regs_set(vcpu, &state->regs);11691170if (state->nested.size)1171vcpu_nested_state_set(vcpu, &state->nested);1172}11731174void kvm_x86_state_cleanup(struct kvm_x86_state *state)1175{1176free(state->xsave);1177free(state);1178}11791180void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)1181{1182if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) {1183*pa_bits = kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32;1184*va_bits = 32;1185} else {1186*pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);1187*va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR);1188}1189}11901191void kvm_init_vm_address_properties(struct kvm_vm *vm)1192{1193if (is_sev_vm(vm)) {1194vm->arch.sev_fd = open_sev_dev_path_or_exit();1195vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));1196vm->gpa_tag_mask = vm->arch.c_bit;1197} else {1198vm->arch.sev_fd = -1;1199}1200}12011202const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid,1203uint32_t function, uint32_t index)1204{1205int i;12061207for (i = 0; i < cpuid->nent; i++) {1208if (cpuid->entries[i].function == function &&1209cpuid->entries[i].index == index)1210return &cpuid->entries[i];1211}12121213TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);12141215return NULL;1216}12171218#define X86_HYPERCALL(inputs...) \1219({ \1220uint64_t r; \1221\1222asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t" \1223"jnz 1f\n\t" \1224"vmcall\n\t" \1225"jmp 2f\n\t" \1226"1: vmmcall\n\t" \1227"2:" \1228: "=a"(r) \1229: [use_vmmcall] "r" (host_cpu_is_amd), inputs); \1230\1231r; \1232})12331234uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,1235uint64_t a3)1236{1237return X86_HYPERCALL("a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3));1238}12391240uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1)1241{1242return X86_HYPERCALL("a"(nr), "D"(a0), "S"(a1));1243}12441245void xen_hypercall(uint64_t nr, uint64_t a0, void *a1)1246{1247GUEST_ASSERT(!__xen_hypercall(nr, a0, a1));1248}12491250unsigned long vm_compute_max_gfn(struct kvm_vm *vm)1251{1252const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */1253unsigned long ht_gfn, max_gfn, max_pfn;1254uint8_t maxphyaddr, guest_maxphyaddr;12551256/*1257* Use "guest MAXPHYADDR" from KVM if it's available. Guest MAXPHYADDR1258* enumerates the max _mappable_ GPA, which can be less than the raw1259* MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU1260* doesn't support 5-level TDP.1261*/1262guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR);1263guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits;1264TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits,1265"Guest MAXPHYADDR should never be greater than raw MAXPHYADDR");12661267max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1;12681269/* Avoid reserved HyperTransport region on AMD processors. */1270if (!host_cpu_is_amd)1271return max_gfn;12721273/* On parts with <40 physical address bits, the area is fully hidden */1274if (vm->pa_bits < 40)1275return max_gfn;12761277/* Before family 17h, the HyperTransport area is just below 1T. */1278ht_gfn = (1 << 28) - num_ht_pages;1279if (this_cpu_family() < 0x17)1280goto done;12811282/*1283* Otherwise it's at the top of the physical address space, possibly1284* reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use1285* the old conservative value if MAXPHYADDR is not enumerated.1286*/1287if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR))1288goto done;12891290maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);1291max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1;12921293if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION))1294max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION);12951296ht_gfn = max_pfn - num_ht_pages;1297done:1298return min(max_gfn, ht_gfn - 1);1299}13001301void kvm_selftest_arch_init(void)1302{1303host_cpu_is_intel = this_cpu_is_intel();1304host_cpu_is_amd = this_cpu_is_amd();1305is_forced_emulation_enabled = kvm_is_forced_emulation_enabled();13061307kvm_init_pmu_errata();1308}13091310bool sys_clocksource_is_based_on_tsc(void)1311{1312char *clk_name = sys_get_cur_clocksource();1313bool ret = !strcmp(clk_name, "tsc\n") ||1314!strcmp(clk_name, "hyperv_clocksource_tsc_page\n");13151316free(clk_name);13171318return ret;1319}13201321bool kvm_arch_has_default_irqchip(void)1322{1323return true;1324}132513261327