// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2021-2022 Intel Corporation */

#undef pr_fmt
#define pr_fmt(fmt) "tdx: " fmt

#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/paravirt_types.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/traps.h>

/* MMIO direction */
#define EPT_READ        0
#define EPT_WRITE       1

/* Port I/O direction */
#define PORT_READ       0
#define PORT_WRITE      1

/* See Exit Qualification for I/O Instructions in VMX documentation */
#define VE_IS_IO_IN(e)          ((e) & BIT(3))
#define VE_GET_IO_SIZE(e)       (((e) & GENMASK(2, 0)) + 1)
#define VE_GET_PORT_NUM(e)      ((e) >> 16)
#define VE_IS_IO_STRING(e)      ((e) & BIT(4))

/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a)   ((a) >> 32)
#define TDCALL_INVALID_OPERAND  0xc0000100
#define TDCALL_OPERAND_BUSY     0x80000200

#define TDREPORT_SUBTYPE_0      0

static atomic_long_t nr_shared;

/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __noreturn __tdx_hypercall_failed(void)
{
        instrumentation_begin();
        panic("TDVMCALL failed. TDX module bug?");
}

#ifdef CONFIG_KVM_GUEST
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
                       unsigned long p3, unsigned long p4)
{
        struct tdx_module_args args = {
                .r10 = nr,
                .r11 = p1,
                .r12 = p2,
                .r13 = p3,
                .r14 = p4,
        };

        return __tdx_hypercall(&args);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif

/*
 * Used for TDX guests to make calls directly to the TD module. This
 * should only be used for calls that have no legitimate reason to fail
 * or where the kernel can not survive the call failing.
 */
static inline void tdcall(u64 fn, struct tdx_module_args *args)
{
        if (__tdcall_ret(fn, args))
                panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}

/* Read TD-scoped metadata */
static inline u64 tdg_vm_rd(u64 field, u64 *value)
{
        struct tdx_module_args args = {
                .rdx = field,
        };
        u64 ret;

        ret = __tdcall_ret(TDG_VM_RD, &args);
        *value = args.r8;

        return ret;
}

/* Write TD-scoped metadata */
static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
{
        struct tdx_module_args args = {
                .rdx = field,
                .r8 = value,
                .r9 = mask,
        };

        return __tdcall(TDG_VM_WR, &args);
}

/**
 * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
 *                           subtype 0) using TDG.MR.REPORT TDCALL.
 * @reportdata: Address of the input buffer which contains user-defined
 *              REPORTDATA to be included into TDREPORT.
 * @tdreport: Address of the output buffer to store TDREPORT.
 *
 * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0
 * specification for more information on TDG.MR.REPORT TDCALL.
 *
 * It is used in the TDX guest driver module to get the TDREPORT0.
 *
 * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
 * or -EIO on other TDCALL failures.
 */
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
{
        struct tdx_module_args args = {
                .rcx = virt_to_phys(tdreport),
                .rdx = virt_to_phys(reportdata),
                .r8 = TDREPORT_SUBTYPE_0,
        };
        u64 ret;

        ret = __tdcall(TDG_MR_REPORT, &args);
        if (ret) {
                if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
                        return -ENXIO;
                else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
                        return -EBUSY;
                return -EIO;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);

/**
 * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using
 *                           TDG.MR.RTMR.EXTEND TDCALL.
 * @index: Index of RTMR register to be extended.
 * @data: Address of the input buffer with RTMR register extend data.
 *
 * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0
 * specification for more information on TDG.MR.RTMR.EXTEND TDCALL.
 *
 * It is used in the TDX guest driver module to allow user to extend the RTMR
 * registers.
 *
 * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
 * or -EIO on other TDCALL failures.
 */
int tdx_mcall_extend_rtmr(u8 index, u8 *data)
{
        struct tdx_module_args args = {
                .rcx = virt_to_phys(data),
                .rdx = index,
        };
        u64 ret;

        ret = __tdcall(TDG_MR_RTMR_EXTEND, &args);
        if (ret) {
                if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
                        return -ENXIO;
                if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
                        return -EBUSY;
                return -EIO;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr);

/**
 * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
 *                         hypercall.
 * @buf: Address of the directly mapped shared kernel buffer which
 *       contains TDREPORT. The same buffer will be used by VMM to
 *       store the generated TD Quote output.
 * @size: size of the tdquote buffer (4KB-aligned).
 *
 * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
 * v1.0 specification for more information on GetQuote hypercall.
 * It is used in the TDX guest driver module to get the TD Quote.
 *
 * Return 0 on success or error code on failure.
 */
u64 tdx_hcall_get_quote(u8 *buf, size_t size)
{
        /* Since buf is a shared memory, set the shared (decrypted) bits */
        return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
}
EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);

static void __noreturn tdx_panic(const char *msg)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = TDVMCALL_REPORT_FATAL_ERROR,
                .r12 = 0, /* Error code: 0 is Panic */
        };
        union {
                /* Define register order according to the GHCI */
                struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };

                char bytes[64] __nonstring;
        } message;

        /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
        strtomem_pad(message.bytes, msg, '\0');

        args.r8 = message.r8;
        args.r9 = message.r9;
        args.r14 = message.r14;
        args.r15 = message.r15;
        args.rdi = message.rdi;
        args.rsi = message.rsi;
        args.rbx = message.rbx;
        args.rdx = message.rdx;

        /*
         * This hypercall should never return and it is not safe
         * to keep the guest running. Call it forever if it
         * happens to return.
         */
        while (1)
                __tdx_hypercall(&args);
}

/*
 * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure
 * that no #VE will be delivered for accesses to TD-private memory.
 *
 * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
 * controls if the guest will receive such #VE with TD attribute
 * TDX_ATTR_SEPT_VE_DISABLE.
 *
 * Newer TDX modules allow the guest to control if it wants to receive SEPT
 * violation #VEs.
 *
 * Check if the feature is available and disable SEPT #VE if possible.
 *
 * If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE
 * attribute is no longer reliable. It reflects the initial state of the
 * control for the TD, but it will not be updated if someone (e.g. bootloader)
 * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
 * determine if SEPT #VEs are enabled or disabled.
 */
static void disable_sept_ve(u64 td_attr)
{
        const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
        bool debug = td_attr & TDX_ATTR_DEBUG;
        u64 config, controls;

        /* Is this TD allowed to disable SEPT #VE */
        tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
        if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
                /* No SEPT #VE controls for the guest: check the attribute */
                if (td_attr & TDX_ATTR_SEPT_VE_DISABLE)
                        return;

                /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
                if (debug)
                        pr_warn("%s\n", msg);
                else
                        tdx_panic(msg);
                return;
        }

        /* Check if SEPT #VE has been disabled before us */
        tdg_vm_rd(TDCS_TD_CTLS, &controls);
        if (controls & TD_CTLS_PENDING_VE_DISABLE)
                return;

        /* Keep #VEs enabled for splats in debugging environments */
        if (debug)
                return;

        /* Disable SEPT #VEs */
        tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
                  TD_CTLS_PENDING_VE_DISABLE);
}

/*
 * TDX 1.0 generates a #VE when accessing topology-related CPUID leafs (0xB and
 * 0x1F) and the X2APIC_APICID MSR. The kernel returns all zeros on CPUID #VEs.
 * In practice, this means that the kernel can only boot with a plain topology.
 * Any complications will cause problems.
 *
 * The ENUM_TOPOLOGY feature allows the VMM to provide topology information.
 * Enabling the feature eliminates topology-related #VEs: the TDX module
 * virtualizes accesses to the CPUID leafs and the MSR.
 *
 * Enable ENUM_TOPOLOGY if it is available.
 */
static void enable_cpu_topology_enumeration(void)
{
        u64 configured;

        /* Has the VMM provided a valid topology configuration? */
        tdg_vm_rd(TDCS_TOPOLOGY_ENUM_CONFIGURED, &configured);
        if (!configured) {
                pr_err("VMM did not configure X2APIC_IDs properly\n");
                return;
        }

        tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
}

static void reduce_unnecessary_ve(void)
{
        u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE);

        if (err == TDX_SUCCESS)
                return;

        /*
         * Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to
         * enable ENUM_TOPOLOGY if REDUCE_VE was not successful.
         */
        enable_cpu_topology_enumeration();
}

static void tdx_setup(u64 *cc_mask)
{
        struct tdx_module_args args = {};
        unsigned int gpa_width;
        u64 td_attr;

        /*
         * TDINFO TDX module call is used to get the TD execution environment
         * information like GPA width, number of available vcpus, debug mode
         * information, etc. More details about the ABI can be found in TDX
         * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
         * [TDG.VP.INFO].
         */
        tdcall(TDG_VP_INFO, &args);

        /*
         * The highest bit of a guest physical address is the "sharing" bit.
         * Set it for shared pages and clear it for private pages.
         *
         * The GPA width that comes out of this call is critical. TDX guests
         * can not meaningfully run without it.
         */
        gpa_width = args.rcx & GENMASK(5, 0);
        *cc_mask = BIT_ULL(gpa_width - 1);

        td_attr = args.rdx;

        /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
        tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);

        disable_sept_ve(td_attr);

        reduce_unnecessary_ve();
}

/*
 * The TDX module spec states that #VE may be injected for a limited set of
 * reasons:
 *
 *  - Emulation of the architectural #VE injection on EPT violation;
 *
 *  - As a result of guest TD execution of a disallowed instruction,
 *    a disallowed MSR access, or CPUID virtualization;
 *
 *  - A notification to the guest TD about anomalous behavior;
 *
 * The last one is opt-in and is not used by the kernel.
 *
 * The Intel Software Developer's Manual describes cases when instruction
 * length field can be used in section "Information for VM Exits Due to
 * Instruction Execution".
 *
 * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
 * information if #VE occurred due to instruction execution, but not for EPT
 * violations.
 */
static int ve_instr_len(struct ve_info *ve)
{
        switch (ve->exit_reason) {
        case EXIT_REASON_HLT:
        case EXIT_REASON_MSR_READ:
        case EXIT_REASON_MSR_WRITE:
        case EXIT_REASON_CPUID:
        case EXIT_REASON_IO_INSTRUCTION:
                /* It is safe to use ve->instr_len for #VE due to instructions */
                return ve->instr_len;
        case EXIT_REASON_EPT_VIOLATION:
                /*
                 * For EPT violations, ve->insn_len is not defined. For those,
                 * the kernel must decode instructions manually and should not
                 * be using this function.
                 */
                WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
                return 0;
        default:
                WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
                return ve->instr_len;
        }
}

static u64 __cpuidle __halt(const bool irq_disabled)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_HLT),
                .r12 = irq_disabled,
        };

        /*
         * Emulate HLT operation via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
         *
         * The VMM uses the "IRQ disabled" param to understand IRQ
         * enabled status (RFLAGS.IF) of the TD guest and to determine
         * whether or not it should schedule the halted vCPU if an
         * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
         * can keep the vCPU in virtual HLT, even if an IRQ is
         * pending, without hanging/breaking the guest.
         */
        return __tdx_hypercall(&args);
}

static int handle_halt(struct ve_info *ve)
{
        const bool irq_disabled = irqs_disabled();

        /*
         * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a
         * wake event may be consumed before requesting HLT emulation, leaving
         * the vCPU blocking indefinitely.
         */
        if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled"))
                return -EIO;

        if (__halt(irq_disabled))
                return -EIO;

        return ve_instr_len(ve);
}

void __cpuidle tdx_halt(void)
{
        const bool irq_disabled = false;

        /*
         * Use WARN_ONCE() to report the failure.
         */
        if (__halt(irq_disabled))
                WARN_ONCE(1, "HLT instruction emulation failed\n");
}

static void __cpuidle tdx_safe_halt(void)
{
        tdx_halt();
        /*
         * "__cpuidle" section doesn't support instrumentation, so stick
         * with raw_* variant that avoids tracing hooks.
         */
        raw_local_irq_enable();
}

static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_MSR_READ),
                .r12 = regs->cx,
        };

        /*
         * Emulate the MSR read via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
         */
        if (__tdx_hypercall(&args))
                return -EIO;

        regs->ax = lower_32_bits(args.r11);
        regs->dx = upper_32_bits(args.r11);
        return ve_instr_len(ve);
}

static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
                .r12 = regs->cx,
                .r13 = (u64)regs->dx << 32 | regs->ax,
        };

        /*
         * Emulate the MSR write via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
         */
        if (__tdx_hypercall(&args))
                return -EIO;

        return ve_instr_len(ve);
}

static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_CPUID),
                .r12 = regs->ax,
                .r13 = regs->cx,
        };

        /*
         * Only allow VMM to control range reserved for hypervisor
         * communication.
         *
         * Return all-zeros for any CPUID outside the range. It matches CPU
         * behaviour for non-supported leaf.
         */
        if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
                regs->ax = regs->bx = regs->cx = regs->dx = 0;
                return ve_instr_len(ve);
        }

        /*
         * Emulate the CPUID instruction via a hypercall. More info about
         * ABI can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
         */
        if (__tdx_hypercall(&args))
                return -EIO;

        /*
         * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
         * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
         * So copy the register contents back to pt_regs.
         */
        regs->ax = args.r12;
        regs->bx = args.r13;
        regs->cx = args.r14;
        regs->dx = args.r15;

        return ve_instr_len(ve);
}

static bool mmio_read(int size, unsigned long addr, unsigned long *val)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
                .r12 = size,
                .r13 = EPT_READ,
                .r14 = addr,
        };

        if (__tdx_hypercall(&args))
                return false;

        *val = args.r11;
        return true;
}

static bool mmio_write(int size, unsigned long addr, unsigned long val)
{
        return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
                               EPT_WRITE, addr, val);
}

static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
        unsigned long *reg, val, vaddr;
        char buffer[MAX_INSN_SIZE];
        enum insn_mmio_type mmio;
        struct insn insn = {};
        int size, extend_size;
        u8 extend_val = 0;

        /* Only in-kernel MMIO is supported */
        if (WARN_ON_ONCE(user_mode(regs)))
                return -EFAULT;

        if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
                return -EFAULT;

        if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
                return -EINVAL;

        mmio = insn_decode_mmio(&insn, &size);
        if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
                return -EINVAL;

        if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
                reg = insn_get_modrm_reg_ptr(&insn, regs);
                if (!reg)
                        return -EINVAL;
        }

        if (!fault_in_kernel_space(ve->gla)) {
                WARN_ONCE(1, "Access to userspace address is not supported");
                return -EINVAL;
        }

        /*
         * Reject EPT violation #VEs that split pages.
         *
         * MMIO accesses are supposed to be naturally aligned and therefore
         * never cross page boundaries. Seeing split page accesses indicates
         * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
         *
         * load_unaligned_zeropad() will recover using exception fixups.
         */
        vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
        if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
                return -EFAULT;

        /* Handle writes first */
        switch (mmio) {
        case INSN_MMIO_WRITE:
                memcpy(&val, reg, size);
                if (!mmio_write(size, ve->gpa, val))
                        return -EIO;
                return insn.length;
        case INSN_MMIO_WRITE_IMM:
                val = insn.immediate.value;
                if (!mmio_write(size, ve->gpa, val))
                        return -EIO;
                return insn.length;
        case INSN_MMIO_READ:
        case INSN_MMIO_READ_ZERO_EXTEND:
        case INSN_MMIO_READ_SIGN_EXTEND:
                /* Reads are handled below */
                break;
        case INSN_MMIO_MOVS:
        case INSN_MMIO_DECODE_FAILED:
                /*
                 * MMIO was accessed with an instruction that could not be
                 * decoded or handled properly. It was likely not using io.h
                 * helpers or accessed MMIO accidentally.
                 */
                return -EINVAL;
        default:
                WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
                return -EINVAL;
        }

        /* Handle reads */
        if (!mmio_read(size, ve->gpa, &val))
                return -EIO;

        switch (mmio) {
        case INSN_MMIO_READ:
                /* Zero-extend for 32-bit operation */
                extend_size = size == 4 ? sizeof(*reg) : 0;
                break;
        case INSN_MMIO_READ_ZERO_EXTEND:
                /* Zero extend based on operand size */
                extend_size = insn.opnd_bytes;
                break;
        case INSN_MMIO_READ_SIGN_EXTEND:
                /* Sign extend based on operand size */
                extend_size = insn.opnd_bytes;
                if (size == 1 && val & BIT(7))
                        extend_val = 0xFF;
                else if (size > 1 && val & BIT(15))
                        extend_val = 0xFF;
                break;
        default:
                /* All other cases have to be covered by the first switch() */
                WARN_ON_ONCE(1);
                return -EINVAL;
        }

        if (extend_size)
                memset(reg, extend_val, extend_size);
        memcpy(reg, &val, size);
        return insn.length;
}

static bool handle_in(struct pt_regs *regs, int size, int port)
{
        struct tdx_module_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
                .r12 = size,
                .r13 = PORT_READ,
                .r14 = port,
        };
        u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
        bool success;

        /*
         * Emulate the I/O read via hypercall. More info about ABI can be found
         * in TDX Guest-Host-Communication Interface (GHCI) section titled
         * "TDG.VP.VMCALL<Instruction.IO>".
         */
        success = !__tdx_hypercall(&args);

        /* Update part of the register affected by the emulated instruction */
        regs->ax &= ~mask;
        if (success)
                regs->ax |= args.r11 & mask;

        return success;
}

static bool handle_out(struct pt_regs *regs, int size, int port)
{
        u64 mask = GENMASK(BITS_PER_BYTE * size, 0);

        /*
         * Emulate the I/O write via hypercall. More info about ABI can be found
         * in TDX Guest-Host-Communication Interface (GHCI) section titled
         * "TDG.VP.VMCALL<Instruction.IO>".
         */
        return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
                               PORT_WRITE, port, regs->ax & mask);
}

/*
 * Emulate I/O using hypercall.
 *
 * Assumes the IO instruction was using ax, which is enforced
 * by the standard io.h macros.
 *
 * Return the instruction length on success or -errno on failure.
 */
static int handle_io(struct pt_regs *regs, struct ve_info *ve)
{
        u32 exit_qual = ve->exit_qual;
        int size, port;
        bool in, ret;

        if (VE_IS_IO_STRING(exit_qual))
                return -EIO;

        in = VE_IS_IO_IN(exit_qual);
        size = VE_GET_IO_SIZE(exit_qual);
        port = VE_GET_PORT_NUM(exit_qual);

        if (in)
                ret = handle_in(regs, size, port);
        else
                ret = handle_out(regs, size, port);
        if (!ret)
                return -EIO;

        return ve_instr_len(ve);
}

/*
 * Early #VE exception handler. Only handles a subset of port I/O.
 * Intended only for earlyprintk. If failed, return false.
 */
__init bool tdx_early_handle_ve(struct pt_regs *regs)
{
        struct ve_info ve;
        int insn_len;

        tdx_get_ve_info(&ve);

        if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
                return false;

        insn_len = handle_io(regs, &ve);
        if (insn_len < 0)
                return false;

        regs->ip += insn_len;
        return true;
}

void tdx_get_ve_info(struct ve_info *ve)
{
        struct tdx_module_args args = {};

        /*
         * Called during #VE handling to retrieve the #VE info from the
         * TDX module.
         *
         * This has to be called early in #VE handling. A "nested" #VE which
A "nested" #VE which789* occurs before this will raise a #DF and is not recoverable.790*791* The call retrieves the #VE info from the TDX module, which also792* clears the "#VE valid" flag. This must be done before anything else793* because any #VE that occurs while the valid flag is set will lead to794* #DF.795*796* Note, the TDX module treats virtual NMIs as inhibited if the #VE797* valid flag is set. It means that NMI=>#VE will not result in a #DF.798*/799tdcall(TDG_VP_VEINFO_GET, &args);800801/* Transfer the output parameters */802ve->exit_reason = args.rcx;803ve->exit_qual = args.rdx;804ve->gla = args.r8;805ve->gpa = args.r9;806ve->instr_len = lower_32_bits(args.r10);807ve->instr_info = upper_32_bits(args.r10);808}809810/*811* Handle the user initiated #VE.812*813* On success, returns the number of bytes RIP should be incremented (>=0)814* or -errno on error.815*/816static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)817{818switch (ve->exit_reason) {819case EXIT_REASON_CPUID:820return handle_cpuid(regs, ve);821default:822pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);823return -EIO;824}825}826827static inline bool is_private_gpa(u64 gpa)828{829return gpa == cc_mkenc(gpa);830}831832/*833* Handle the kernel #VE.834*835* On success, returns the number of bytes RIP should be incremented (>=0)836* or -errno on error.837*/838static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)839{840switch (ve->exit_reason) {841case EXIT_REASON_HLT:842return handle_halt(ve);843case EXIT_REASON_MSR_READ:844return read_msr(regs, ve);845case EXIT_REASON_MSR_WRITE:846return write_msr(regs, ve);847case EXIT_REASON_CPUID:848return handle_cpuid(regs, ve);849case EXIT_REASON_EPT_VIOLATION:850if (is_private_gpa(ve->gpa))851panic("Unexpected EPT-violation on private memory.");852return handle_mmio(regs, ve);853case EXIT_REASON_IO_INSTRUCTION:854return handle_io(regs, ve);855default:856pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);857return -EIO;858}859}860861bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)862{863int insn_len;864865if (user_mode(regs))866insn_len = virt_exception_user(regs, ve);867else868insn_len = virt_exception_kernel(regs, ve);869if (insn_len < 0)870return false;871872/* After successful #VE handling, move the IP */873regs->ip += insn_len;874875return true;876}877878static bool tdx_tlb_flush_required(bool private)879{880/*881* TDX guest is responsible for flushing TLB on private->shared882* transition. VMM is responsible for flushing on shared->private.883*884* The VMM _can't_ flush private addresses as it can't generate PAs885* with the guest's HKID. Shared memory isn't subject to integrity886* checking, i.e. the VMM doesn't need to flush for its own protection.887*888* There's no need to flush when converting from shared to private,889* as flushing is the VMM's responsibility in this case, e.g. it must890* flush to avoid integrity failures in the face of a buggy or891* malicious guest.892*/893return !private;894}895896static bool tdx_cache_flush_required(void)897{898/*899* AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.900* TDX doesn't have such capability.901*902* Flush cache unconditionally.903*/904return true;905}906907/*908* Notify the VMM about page mapping conversion. 
 * can be found in TDX Guest-Host-Communication Interface (GHCI),
 * section "TDG.VP.VMCALL<MapGPA>".
 */
static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
{
        /* Retrying the hypercall a second time should succeed; use 3 just in case */
        const int max_retries_per_page = 3;
        int retry_count = 0;

        if (!enc) {
                /* Set the shared (decrypted) bits: */
                start |= cc_mkdec(0);
                end |= cc_mkdec(0);
        }

        while (retry_count < max_retries_per_page) {
                struct tdx_module_args args = {
                        .r10 = TDX_HYPERCALL_STANDARD,
                        .r11 = TDVMCALL_MAP_GPA,
                        .r12 = start,
                        .r13 = end - start };

                u64 map_fail_paddr;
                u64 ret = __tdx_hypercall(&args);

                if (ret != TDVMCALL_STATUS_RETRY)
                        return !ret;
                /*
                 * The guest must retry the operation for the pages in the
                 * region starting at the GPA specified in R11. R11 comes
                 * from the untrusted VMM. Sanity check it.
                 */
                map_fail_paddr = args.r11;
                if (map_fail_paddr < start || map_fail_paddr >= end)
                        return false;

                /* "Consume" a retry without forward progress */
                if (map_fail_paddr == start) {
                        retry_count++;
                        continue;
                }

                start = map_fail_paddr;
                retry_count = 0;
        }

        return false;
}

/*
 * Inform the VMM of the guest's intent for this physical page: shared with
 * the VMM or private to the guest. The VMM is expected to change its mapping
 * of the page in response.
 */
static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
{
        phys_addr_t start = __pa(vaddr);
        phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);

        if (!tdx_map_gpa(start, end, enc))
                return false;

        /* shared->private conversion requires memory to be accepted before use */
        if (enc)
                return tdx_accept_memory(start, end);

        return true;
}

static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
                                         bool enc)
{
        /*
         * Only handle shared->private conversion here.
         * See the comment in tdx_early_init().
         */
        if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
                return -EIO;

        return 0;
}

static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
                                        bool enc)
{
        /*
         * Only handle private->shared conversion here.
         * See the comment in tdx_early_init().
         */
        if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
                return -EIO;

        if (enc)
                atomic_long_sub(numpages, &nr_shared);
        else
                atomic_long_add(numpages, &nr_shared);

        return 0;
}

/* Stop new private<->shared conversions */
static void tdx_kexec_begin(void)
{
        if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                return;

        /*
         * Crash kernel reaches here with interrupts disabled: can't wait for
         * conversions to finish.
         *
         * If race happened, just report and proceed.
         */
        if (!set_memory_enc_stop_conversion())
                pr_warn("Failed to stop shared<->private conversions\n");
}

/* Walk direct mapping and convert all shared memory back to private */
static void tdx_kexec_finish(void)
{
        unsigned long addr, end;
        long found = 0, shared;

        if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                return;

        lockdep_assert_irqs_disabled();

        addr = PAGE_OFFSET;
        end = PAGE_OFFSET + get_max_mapped();

        while (addr < end) {
                unsigned long size;
                unsigned int level;
                pte_t *pte;

                pte = lookup_address(addr, &level);
                size = page_level_size(level);

                if (pte && pte_decrypted(*pte)) {
                        int pages = size / PAGE_SIZE;

                        /*
                         * Touching memory with shared bit set triggers implicit
                         * conversion to shared.
                         *
                         * Make sure nobody touches the shared range from
                         * now on.
                         */
                        set_pte(pte, __pte(0));

                        /*
                         * Memory encryption state persists across kexec.
                         * If tdx_enc_status_changed() fails in the first
                         * kernel, it leaves memory in an unknown state.
                         *
                         * If that memory remains shared, accessing it in the
                         * *next* kernel through a private mapping will result
                         * in an unrecoverable guest shutdown.
                         *
                         * The kdump kernel boot is not impacted as it uses
                         * a pre-reserved memory range that is always private.
                         * However, gathering crash information could lead to
                         * a crash if it accesses unconverted memory through
                         * a private mapping which is possible when accessing
                         * that memory through /proc/vmcore, for example.
                         *
                         * In all cases, print error info in order to leave
                         * enough bread crumbs for debugging.
                         */
                        if (!tdx_enc_status_changed(addr, pages, true)) {
                                pr_err("Failed to unshare range %#lx-%#lx\n",
                                       addr, addr + size);
                        }

                        found += pages;
                }

                addr += size;
        }

        __flush_tlb_all();

        shared = atomic_long_read(&nr_shared);
        if (shared != found) {
                pr_err("shared page accounting is off\n");
                pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
        }
}

static __init void tdx_announce(void)
{
        struct tdx_module_args args = {};
        u64 controls;

        pr_info("Guest detected\n");

        tdcall(TDG_VP_INFO, &args);
        tdx_dump_attributes(args.rdx);

        tdg_vm_rd(TDCS_TD_CTLS, &controls);
        tdx_dump_td_ctls(controls);
}

void __init tdx_early_init(void)
{
        u64 cc_mask;
        u32 eax, sig[3];

        cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);

        if (memcmp(TDX_IDENT, sig, sizeof(sig)))
                return;

        setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);

        /* TSC is the only reliable clock in TDX guest */
        setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);

        cc_vendor = CC_VENDOR_INTEL;

        /* Configure the TD */
        tdx_setup(&cc_mask);

        cc_set_mask(cc_mask);

        /*
         * All bits above GPA width are reserved and kernel treats shared bit
         * as flag, not as part of physical address.
         *
         * Adjust physical mask to only cover valid GPA bits.
         */
        physical_mask &= cc_mask - 1;

        /*
         * The kernel mapping should match the TDX metadata for the page.
         * load_unaligned_zeropad() can touch memory *adjacent* to that which is
         * owned by the caller and can catch even _momentary_ mismatches. Bad
         * things happen on mismatch:
         *
         *   - Private mapping => Shared Page  == Guest shutdown
         *   - Shared mapping  => Private Page == Recoverable #VE
         *
         * guest.enc_status_change_prepare() converts the page from
         * shared=>private before the mapping becomes private.
         *
         * guest.enc_status_change_finish() converts the page from
         * private=>shared after the mapping becomes shared.
         *
         * In both cases there is a temporary shared mapping to a private page,
         * which can result in a #VE. But, there is never a private mapping to
         * a shared page.
         */
        x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
        x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish;

        x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
        x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;

        x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
        x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;

        /*
         * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
         * will enable interrupts before HLT TDCALL invocation if executed
         * in STI-shadow, possibly resulting in missed wakeup events.
         *
         * Modify all possible HLT execution paths to use TDX specific routines
         * that directly execute TDCALL and toggle the interrupt state as
         * needed after TDCALL completion. This also reduces HLT related #VEs
         * in addition to having a reliable halt logic execution.
         */
        pv_ops.irq.safe_halt = tdx_safe_halt;
        pv_ops.irq.halt = tdx_halt;

        /*
         * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
         * bringup low level code. That raises #VE which cannot be handled
         * there.
         *
         * Intel-TDX has a secure RDMSR hypercall, but that needs to be
         * implemented separately in the low level startup ASM code.
         * Until that is in place, disable parallel bringup for TDX.
         */
        x86_cpuinit.parallel_bringup = false;

        tdx_announce();
}