GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/vmx/tdx.c
1
// SPDX-License-Identifier: GPL-2.0
2
#include <linux/cleanup.h>
3
#include <linux/cpu.h>
4
#include <asm/cpufeature.h>
5
#include <asm/fpu/xcr.h>
6
#include <linux/misc_cgroup.h>
7
#include <linux/mmu_context.h>
8
#include <asm/tdx.h>
9
#include "capabilities.h"
10
#include "mmu.h"
11
#include "x86_ops.h"
12
#include "lapic.h"
13
#include "tdx.h"
14
#include "vmx.h"
15
#include "mmu/spte.h"
16
#include "common.h"
17
#include "posted_intr.h"
18
#include "irq.h"
19
#include <trace/events/kvm.h>
20
#include "trace.h"
21
22
#pragma GCC poison to_vmx
23
24
#undef pr_fmt
25
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27
#define pr_tdx_error(__fn, __err) \
28
pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
29
30
#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
31
pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
32
33
#define pr_tdx_error_1(__fn, __err, __rcx) \
34
__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
35
36
#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
37
__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
38
39
#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
40
__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
41
42
bool enable_tdx __ro_after_init;
43
module_param_named(tdx, enable_tdx, bool, 0444);
44
45
#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
46
#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
47
48
static enum cpuhp_state tdx_cpuhp_state;
49
50
static const struct tdx_sys_info *tdx_sysinfo;
51
52
void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
53
{
54
KVM_BUG_ON(1, tdx->vcpu.kvm);
55
pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
56
}
57
58
void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
59
u64 val, u64 err)
60
{
61
KVM_BUG_ON(1, tdx->vcpu.kvm);
62
pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
63
}
64
65
#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
66
67
static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
68
{
69
return container_of(kvm, struct kvm_tdx, kvm);
70
}
71
72
static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
73
{
74
return container_of(vcpu, struct vcpu_tdx, vcpu);
75
}
76
77
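/*
 * Compute the TD ATTRIBUTES KVM can support: start from
 * KVM_SUPPORTED_TD_ATTRS, require all of the module's fixed-1 bits, and
 * mask off anything outside its fixed-0 mask. Returns 0 if the module
 * mandates an attribute KVM doesn't support.
 */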
static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
78
{
79
u64 val = KVM_SUPPORTED_TD_ATTRS;
80
81
if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
82
return 0;
83
84
val &= td_conf->attributes_fixed0;
85
86
return val;
87
}
88
89
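/*
 * Same as above, but for XFAM: the supported XCR0/XSS features must
 * cover the module's fixed-1 bits and are clamped to its fixed-0 mask.
 */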
static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
90
{
91
u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
92
93
if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
94
return 0;
95
96
val &= td_conf->xfam_fixed0;
97
98
return val;
99
}
100
101
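/*
 * Helpers for the guest physical address width reported in
 * CPUID.0x80000008:EAX[23:16], e.g. EAX = 0x00343427 encodes a guest
 * physical address width of 0x34 = 52 bits.
 */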
static int tdx_get_guest_phys_addr_bits(const u32 eax)
102
{
103
return (eax & GENMASK(23, 16)) >> 16;
104
}
105
106
static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
107
{
108
return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
109
}
110
111
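/*
 * CPUID features that KVM does not expose to TDX guests; the helpers
 * below detect and strip TSX (HLE/RTM) and WAITPKG from the
 * CPUID.0x7.0x0 entry.
 */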
#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
112
113
static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
114
{
115
return entry->function == 7 && entry->index == 0 &&
116
(entry->ebx & TDX_FEATURE_TSX);
117
}
118
119
static void clear_tsx(struct kvm_cpuid_entry2 *entry)
120
{
121
entry->ebx &= ~TDX_FEATURE_TSX;
122
}
123
124
static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
125
{
126
return entry->function == 7 && entry->index == 0 &&
127
(entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
128
}
129
130
static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
131
{
132
entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
133
}
134
135
static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
136
{
137
if (has_tsx(entry))
138
clear_tsx(entry);
139
140
if (has_waitpkg(entry))
141
clear_waitpkg(entry);
142
}
143
144
static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
145
{
146
return has_tsx(entry) || has_waitpkg(entry);
147
}
148
149
#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
150
151
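/*
 * Fill a kvm_cpuid_entry2 from the TDX module's list of directly
 * configurable CPUID leaves (cpuid_config_leaves/values), translating
 * the module's "no subleaf" marker to index 0 and reporting the GPAW
 * bits of leaf 0x80000008 as configurable.
 */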
static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
152
{
153
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
154
155
entry->function = (u32)td_conf->cpuid_config_leaves[idx];
156
entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
157
entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
158
entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
159
entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
160
entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
161
162
if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
163
entry->index = 0;
164
165
/*
166
* The TDX module doesn't allow configuring the guest phys addr bits
167
* (EAX[23:16]). However, KVM uses it as an interface to the userspace
168
* to configure the GPAW. Report these bits as configurable.
169
*/
170
if (entry->function == 0x80000008)
171
entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
172
173
tdx_clear_unsupported_cpuid(entry);
174
}
175
176
#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
177
178
static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
179
struct kvm_tdx_capabilities *caps)
180
{
181
int i;
182
183
caps->supported_attrs = tdx_get_supported_attrs(td_conf);
184
if (!caps->supported_attrs)
185
return -EIO;
186
187
caps->supported_xfam = tdx_get_supported_xfam(td_conf);
188
if (!caps->supported_xfam)
189
return -EIO;
190
191
caps->cpuid.nent = td_conf->num_cpuid_config;
192
193
caps->user_tdvmcallinfo_1_r11 =
194
TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
195
196
for (i = 0; i < td_conf->num_cpuid_config; i++)
197
td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
198
199
return 0;
200
}
201
202
/*
203
* Some SEAMCALLs acquire the TDX module globally, and can fail with
204
* TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
205
*/
206
static DEFINE_MUTEX(tdx_lock);
207
208
static atomic_t nr_configured_hkid;
209
210
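/* True if a SEAMCALL failed with TDX_OPERAND_BUSY, i.e. a contended resource. */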
static bool tdx_operand_busy(u64 err)
211
{
212
return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
213
}
214
215
216
/*
217
* A per-CPU list of TD vCPUs associated with a given CPU.
218
* Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
219
* list.
220
* - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
221
* the old CPU during the IPI callback running on the old CPU, and then added
222
* to the per-CPU list of the new CPU.
223
* - When a TD is tearing down, all vCPUs are disassociated from their current
224
* running CPUs and removed from the per-CPU list during the IPI callback
225
* running on those CPUs.
226
* - When a CPU is brought down, traverse the per-CPU list to disassociate all
227
* associated TD vCPUs and remove them from the per-CPU list.
228
*/
229
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
230
231
static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
232
{
233
return to_tdx(vcpu)->vp_enter_args.r10;
234
}
235
236
static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
237
{
238
return to_tdx(vcpu)->vp_enter_args.r11;
239
}
240
241
static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
242
long val)
243
{
244
to_tdx(vcpu)->vp_enter_args.r10 = val;
245
}
246
247
static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
248
unsigned long val)
249
{
250
to_tdx(vcpu)->vp_enter_args.r11 = val;
251
}
252
253
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
254
{
255
tdx_guest_keyid_free(kvm_tdx->hkid);
256
kvm_tdx->hkid = -1;
257
atomic_dec(&nr_configured_hkid);
258
misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
259
put_misc_cg(kvm_tdx->misc_cg);
260
kvm_tdx->misc_cg = NULL;
261
}
262
263
static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
264
{
265
return kvm_tdx->hkid > 0;
266
}
267
268
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
269
{
270
lockdep_assert_irqs_disabled();
271
272
list_del(&to_tdx(vcpu)->cpu_list);
273
274
/*
275
* Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
276
* otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
277
* to its list before it's deleted from this CPU's list.
278
*/
279
smp_wmb();
280
281
vcpu->cpu = -1;
282
}
283
284
static void tdx_clear_page(struct page *page)
285
{
286
const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
287
void *dest = page_to_virt(page);
288
unsigned long i;
289
290
/*
291
* The page could have been poisoned. MOVDIR64B also clears
292
* the poison bit so the kernel can safely use the page again.
293
*/
294
for (i = 0; i < PAGE_SIZE; i += 64)
295
movdir64b(dest + i, zero_page);
296
/*
297
* MOVDIR64B store uses WC buffer. Prevent following memory reads
298
* from seeing potentially poisoned cache.
299
*/
300
__mb();
301
}
302
303
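/*
 * Keep vCPUs from entering the guest (tdx_vcpu_run() checks
 * wait_for_sept_zap) so that a contended SEPT-zap SEAMCALL can be
 * retried without racing against TDH.VP.ENTER.
 */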
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
304
{
305
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
306
307
lockdep_assert_held_write(&kvm->mmu_lock);
308
309
WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
310
311
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
312
}
313
314
static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
315
{
316
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
317
318
lockdep_assert_held_write(&kvm->mmu_lock);
319
320
WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
321
}
322
323
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
324
static int __tdx_reclaim_page(struct page *page)
325
{
326
u64 err, rcx, rdx, r8;
327
328
err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
329
330
/*
331
* No need to check for TDX_OPERAND_BUSY; all TD pages are freed
332
* before the HKID is released and control pages have also been
333
* released at this point, so there is no possibility of contention.
334
*/
335
if (WARN_ON_ONCE(err)) {
336
pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
337
return -EIO;
338
}
339
return 0;
340
}
341
342
static int tdx_reclaim_page(struct page *page)
343
{
344
int r;
345
346
r = __tdx_reclaim_page(page);
347
if (!r)
348
tdx_clear_page(page);
349
return r;
350
}
351
352
353
/*
354
* Reclaim the TD control page(s) which are crypto-protected by TDX guest's
355
* private KeyID. Assume the cache associated with the TDX private KeyID has
356
* been flushed.
357
*/
358
static void tdx_reclaim_control_page(struct page *ctrl_page)
359
{
360
/*
361
* Leak the page if the kernel failed to reclaim the page.
362
* The kernel cannot use it safely anymore.
363
*/
364
if (tdx_reclaim_page(ctrl_page))
365
return;
366
367
__free_page(ctrl_page);
368
}
369
370
struct tdx_flush_vp_arg {
371
struct kvm_vcpu *vcpu;
372
u64 err;
373
};
374
375
static void tdx_flush_vp(void *_arg)
376
{
377
struct tdx_flush_vp_arg *arg = _arg;
378
struct kvm_vcpu *vcpu = arg->vcpu;
379
u64 err;
380
381
arg->err = 0;
382
lockdep_assert_irqs_disabled();
383
384
/* Task migration can race with CPU offlining. */
385
if (unlikely(vcpu->cpu != raw_smp_processor_id()))
386
return;
387
388
/*
389
* No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
390
* list tracking still needs to be updated so that it's correct if/when
391
* the vCPU does get initialized.
392
*/
393
if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
394
/*
 * No need to retry. The TDX resources needed for TDH.VP.FLUSH are
 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This flush
 * is called when destroying a vCPU/TD or during vCPU migration, so no
 * other thread uses TDVPR in those cases.
 */
400
err = tdh_vp_flush(&to_tdx(vcpu)->vp);
401
if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
402
/*
 * This function is called in IPI context. Do not use printk, to
 * avoid taking the console semaphore; the caller prints the error
 * message instead.
 */
407
if (err)
408
arg->err = err;
409
}
410
}
411
412
tdx_disassociate_vp(vcpu);
413
}
414
415
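/* Flush the vCPU on the pCPU it is currently associated with, via IPI. */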
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
416
{
417
struct tdx_flush_vp_arg arg = {
418
.vcpu = vcpu,
419
};
420
int cpu = vcpu->cpu;
421
422
if (unlikely(cpu == -1))
423
return;
424
425
smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
426
if (KVM_BUG_ON(arg.err, vcpu->kvm))
427
pr_tdx_error(TDH_VP_FLUSH, arg.err);
428
}
429
430
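/*
 * Called when virtualization is disabled on a CPU: flush and
 * disassociate every TD vCPU still on this CPU's associated_tdvcpus
 * list.
 */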
void tdx_disable_virtualization_cpu(void)
431
{
432
int cpu = raw_smp_processor_id();
433
struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
434
struct tdx_flush_vp_arg arg;
435
struct vcpu_tdx *tdx, *tmp;
436
unsigned long flags;
437
438
local_irq_save(flags);
439
/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
440
list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
441
arg.vcpu = &tdx->vcpu;
442
tdx_flush_vp(&arg);
443
}
444
local_irq_restore(flags);
445
}
446
447
#define TDX_SEAMCALL_RETRIES 10000
448
449
static void smp_func_do_phymem_cache_wb(void *unused)
450
{
451
u64 err = 0;
452
bool resume;
453
int i;
454
455
/*
 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
 * KeyID on the package or core. The TDX module may not finish the
 * cache flush but return TDX_INTERRUPTED_RESUMABLE instead. The
 * kernel should retry it until it returns success w/o rescheduling.
 */
461
for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
462
resume = !!err;
463
err = tdh_phymem_cache_wb(resume);
464
switch (err) {
465
case TDX_INTERRUPTED_RESUMABLE:
466
continue;
467
case TDX_NO_HKID_READY_TO_WBCACHE:
468
err = TDX_SUCCESS; /* Already done by other thread */
469
fallthrough;
470
default:
471
goto out;
472
}
473
}
474
475
out:
476
if (WARN_ON_ONCE(err))
477
pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
478
}
479
480
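/*
 * Release the TD's private HKID: flush all vCPUs, issue
 * TDH.MNG.VPFLUSHDONE, write back caches on every package via
 * TDH.PHYMEM.CACHE.WB, and finally free the keyid with
 * TDH.MNG.KEY.FREEID. On failure the HKID is leaked.
 */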
void tdx_mmu_release_hkid(struct kvm *kvm)
481
{
482
bool packages_allocated, targets_allocated;
483
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
484
cpumask_var_t packages, targets;
485
struct kvm_vcpu *vcpu;
486
unsigned long j;
487
int i;
488
u64 err;
489
490
if (!is_hkid_assigned(kvm_tdx))
491
return;
492
493
packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
494
targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
495
cpus_read_lock();
496
497
kvm_for_each_vcpu(j, vcpu, kvm)
498
tdx_flush_vp_on_cpu(vcpu);
499
500
/*
 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
 * Multiple TDX guests can be destroyed simultaneously. Take the
 * mutex to prevent it from failing.
 */
506
mutex_lock(&tdx_lock);
507
508
/*
 * Releasing the HKID is done in vm_destroy().
 * After the vCPU flushes above, there should be no more vCPU
 * associations, as all vCPU fds have been released at this stage.
 */
513
err = tdh_mng_vpflushdone(&kvm_tdx->td);
514
if (err == TDX_FLUSHVP_NOT_DONE)
515
goto out;
516
if (KVM_BUG_ON(err, kvm)) {
517
pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
518
pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
519
kvm_tdx->hkid);
520
goto out;
521
}
522
523
for_each_online_cpu(i) {
524
if (packages_allocated &&
525
cpumask_test_and_set_cpu(topology_physical_package_id(i),
526
packages))
527
continue;
528
if (targets_allocated)
529
cpumask_set_cpu(i, targets);
530
}
531
if (targets_allocated)
532
on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
533
else
534
on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
535
/*
536
* In the case of error in smp_func_do_phymem_cache_wb(), the following
537
* tdh_mng_key_freeid() will fail.
538
*/
539
err = tdh_mng_key_freeid(&kvm_tdx->td);
540
if (KVM_BUG_ON(err, kvm)) {
541
pr_tdx_error(TDH_MNG_KEY_FREEID, err);
542
pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
543
kvm_tdx->hkid);
544
} else {
545
tdx_hkid_free(kvm_tdx);
546
}
547
548
out:
549
mutex_unlock(&tdx_lock);
550
cpus_read_unlock();
551
free_cpumask_var(targets);
552
free_cpumask_var(packages);
553
}
554
555
static void tdx_reclaim_td_control_pages(struct kvm *kvm)
556
{
557
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
558
u64 err;
559
int i;
560
561
/*
 * tdx_mmu_release_hkid() failed to reclaim the HKID. Something went
 * badly wrong with the TDX module. Give up freeing the TD pages. As
 * that function already warned, don't warn again here.
 */
566
if (is_hkid_assigned(kvm_tdx))
567
return;
568
569
if (kvm_tdx->td.tdcs_pages) {
570
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
571
if (!kvm_tdx->td.tdcs_pages[i])
572
continue;
573
574
tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
575
}
576
kfree(kvm_tdx->td.tdcs_pages);
577
kvm_tdx->td.tdcs_pages = NULL;
578
}
579
580
if (!kvm_tdx->td.tdr_page)
581
return;
582
583
if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
584
return;
585
586
/*
 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
 * KeyID. The TDX module may access the TDR while operating on the TD
 * (especially when it is reclaiming the TDCS).
 */
591
err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
592
if (KVM_BUG_ON(err, kvm)) {
593
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
594
return;
595
}
596
tdx_clear_page(kvm_tdx->td.tdr_page);
597
598
__free_page(kvm_tdx->td.tdr_page);
599
kvm_tdx->td.tdr_page = NULL;
600
}
601
602
void tdx_vm_destroy(struct kvm *kvm)
603
{
604
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
605
606
tdx_reclaim_td_control_pages(kvm);
607
608
kvm_tdx->state = TD_STATE_UNINITIALIZED;
609
}
610
611
static int tdx_do_tdh_mng_key_config(void *param)
612
{
613
struct kvm_tdx *kvm_tdx = param;
614
u64 err;
615
616
/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
617
err = tdh_mng_key_config(&kvm_tdx->td);
618
619
if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
620
pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
621
return -EIO;
622
}
623
624
return 0;
625
}
626
627
int tdx_vm_init(struct kvm *kvm)
628
{
629
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
630
631
kvm->arch.has_protected_state = true;
632
kvm->arch.has_private_mem = true;
633
kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
634
635
/*
 * Because the guest TD is protected, the VMM can't parse instructions
 * in the TD. Instead, the guest uses the MMIO hypercall. For
 * unmodified device drivers, a #VE must be injected for MMIO so that
 * the #VE handler in the TD converts the MMIO instruction into an
 * MMIO hypercall.
 *
 * The SPTE value for MMIO needs to be set up so that #VE is injected
 * into the TD instead of triggering EPT MISCONFIG:
 * - RWX=0 so that an EPT violation is triggered.
 * - the suppress-#VE bit is cleared to inject #VE.
 */
646
kvm_mmu_set_mmio_spte_value(kvm, 0);
647
648
/*
649
* TDX has its own limit of maximum vCPUs it can support for all
650
* TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
651
* such limit via the MAX_VCPU_PER_TD global metadata. In
652
* practice, it reflects the number of logical CPUs that ALL
653
* platforms that the TDX module supports can possibly have.
654
*
655
* Limit TDX guest's maximum vCPUs to the number of logical CPUs
656
* the platform has. Simply forwarding the MAX_VCPU_PER_TD to
657
* userspace would result in an unpredictable ABI.
658
*/
659
kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
660
661
kvm_tdx->state = TD_STATE_UNINITIALIZED;
662
663
return 0;
664
}
665
666
int tdx_vcpu_create(struct kvm_vcpu *vcpu)
667
{
668
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
669
struct vcpu_tdx *tdx = to_tdx(vcpu);
670
671
if (kvm_tdx->state != TD_STATE_INITIALIZED)
672
return -EIO;
673
674
/*
675
* TDX module mandates APICv, which requires an in-kernel local APIC.
676
* Disallow an in-kernel I/O APIC, because level-triggered interrupts
677
* and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
678
*/
679
if (!irqchip_split(vcpu->kvm))
680
return -EINVAL;
681
682
fpstate_set_confidential(&vcpu->arch.guest_fpu);
683
vcpu->arch.apic->guest_apic_protected = true;
684
INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
685
686
vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
687
688
vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
689
vcpu->arch.cr0_guest_owned_bits = -1ul;
690
vcpu->arch.cr4_guest_owned_bits = -1ul;
691
692
/* KVM can't change TSC offset/multiplier as TDX module manages them. */
693
vcpu->arch.guest_tsc_protected = true;
694
vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
695
vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
696
vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
697
vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
698
699
vcpu->arch.guest_state_protected =
700
!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
701
702
if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
703
vcpu->arch.xfd_no_write_intercept = true;
704
705
tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
706
__pi_set_sn(&tdx->vt.pi_desc);
707
708
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
709
710
return 0;
711
}
712
713
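/*
 * Re-associate the vCPU with the new pCPU: flush it from the old pCPU
 * (if any) and add it to the new pCPU's associated_tdvcpus list.
 */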
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
714
{
715
struct vcpu_tdx *tdx = to_tdx(vcpu);
716
717
vmx_vcpu_pi_load(vcpu, cpu);
718
if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
719
return;
720
721
tdx_flush_vp_on_cpu(vcpu);
722
723
KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
724
local_irq_disable();
725
/*
726
* Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
727
* vcpu->cpu is read before tdx->cpu_list.
728
*/
729
smp_rmb();
730
731
list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
732
local_irq_enable();
733
}
734
735
bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
736
{
737
/*
 * KVM can't get the interrupt status of a TDX guest, so it assumes
 * interrupts are always allowed unless the TDX guest calls TDVMCALL
 * with HLT, which passes the interrupt-blocked flag.
 */
742
return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
743
!to_tdx(vcpu)->vp_enter_args.r12;
744
}
745
746
static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
747
{
748
u64 vcpu_state_details;
749
750
if (pi_has_pending_interrupt(vcpu))
751
return true;
752
753
/*
754
* Only check RVI pending for HALTED case with IRQ enabled.
755
* For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
756
* interrupt was pending before TD exit, then it _must_ be blocked,
757
* otherwise the interrupt would have been serviced at the instruction
758
* boundary.
759
*/
760
if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
761
to_tdx(vcpu)->vp_enter_args.r12)
762
return false;
763
764
vcpu_state_details =
765
td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
766
767
return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
768
}
769
770
/*
771
* Compared to vmx_prepare_switch_to_guest(), there is not much to do
772
* as SEAMCALL/SEAMRET calls take care of most of save and restore.
773
*/
774
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
775
{
776
struct vcpu_vt *vt = to_vt(vcpu);
777
778
if (vt->guest_state_loaded)
779
return;
780
781
if (likely(is_64bit_mm(current->mm)))
782
vt->msr_host_kernel_gs_base = current->thread.gsbase;
783
else
784
vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
785
786
vt->guest_state_loaded = true;
787
}
788
789
struct tdx_uret_msr {
790
u32 msr;
791
unsigned int slot;
792
u64 defval;
793
};
794
795
static struct tdx_uret_msr tdx_uret_msrs[] = {
796
{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
797
{.msr = MSR_STAR,},
798
{.msr = MSR_LSTAR,},
799
{.msr = MSR_TSC_AUX,},
800
};
801
802
static void tdx_user_return_msr_update_cache(void)
803
{
804
int i;
805
806
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
807
kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
808
tdx_uret_msrs[i].defval);
809
}
810
811
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
812
{
813
struct vcpu_vt *vt = to_vt(vcpu);
814
struct vcpu_tdx *tdx = to_tdx(vcpu);
815
816
if (!vt->guest_state_loaded)
817
return;
818
819
++vcpu->stat.host_state_reload;
820
wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
821
822
if (tdx->guest_entered) {
823
tdx_user_return_msr_update_cache();
824
tdx->guest_entered = false;
825
}
826
827
vt->guest_state_loaded = false;
828
}
829
830
void tdx_vcpu_put(struct kvm_vcpu *vcpu)
831
{
832
vmx_vcpu_pi_put(vcpu);
833
tdx_prepare_switch_to_host(vcpu);
834
}
835
836
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
837
{
838
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
839
struct vcpu_tdx *tdx = to_tdx(vcpu);
840
int i;
841
842
/*
 * It is not possible to reclaim pages while the hkid is assigned. It
 * might be assigned if:
 * 1. the TD VM is being destroyed but freeing the hkid failed, in
 *    which case the pages are leaked
 * 2. TD vCPU creation failed and this is on the error path, in which
 *    case there is nothing to do anyway
 */
850
if (is_hkid_assigned(kvm_tdx))
851
return;
852
853
if (tdx->vp.tdcx_pages) {
854
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
855
if (tdx->vp.tdcx_pages[i])
856
tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
857
}
858
kfree(tdx->vp.tdcx_pages);
859
tdx->vp.tdcx_pages = NULL;
860
}
861
if (tdx->vp.tdvpr_page) {
862
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
863
tdx->vp.tdvpr_page = 0;
864
}
865
866
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
867
}
868
869
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
870
{
871
if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
872
to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
873
return -EINVAL;
874
875
return 1;
876
}
877
878
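/*
 * Map a TDVMCALL leaf to the synthetic VMX exit reason used to route
 * the exit; leaves KVM doesn't special-case fall back to
 * EXIT_REASON_TDCALL.
 */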
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
879
{
880
switch (tdvmcall_leaf(vcpu)) {
881
case EXIT_REASON_CPUID:
882
case EXIT_REASON_HLT:
883
case EXIT_REASON_IO_INSTRUCTION:
884
case EXIT_REASON_MSR_READ:
885
case EXIT_REASON_MSR_WRITE:
886
return tdvmcall_leaf(vcpu);
887
case EXIT_REASON_EPT_VIOLATION:
888
return EXIT_REASON_EPT_MISCONFIG;
889
default:
890
break;
891
}
892
893
return EXIT_REASON_TDCALL;
894
}
895
896
static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
897
{
898
struct vcpu_tdx *tdx = to_tdx(vcpu);
899
u32 exit_reason;
900
901
switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
902
case TDX_SUCCESS:
903
case TDX_NON_RECOVERABLE_VCPU:
904
case TDX_NON_RECOVERABLE_TD:
905
case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
906
case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
907
break;
908
default:
909
return -1u;
910
}
911
912
exit_reason = tdx->vp_enter_ret;
913
914
switch (exit_reason) {
915
case EXIT_REASON_TDCALL:
916
if (tdvmcall_exit_type(vcpu))
917
return EXIT_REASON_VMCALL;
918
919
return tdcall_to_vmx_exit_reason(vcpu);
920
case EXIT_REASON_EPT_MISCONFIG:
921
/*
922
* Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
923
* non-instrumentable code with interrupts disabled.
924
*/
925
return -1u;
926
default:
927
break;
928
}
929
930
return exit_reason;
931
}
932
933
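/*
 * Enter the TD via TDH.VP.ENTER and stash the exit reason,
 * qualifications and interrupt info reported back in the VP-enter
 * arguments.
 */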
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
934
{
935
struct vcpu_tdx *tdx = to_tdx(vcpu);
936
struct vcpu_vt *vt = to_vt(vcpu);
937
938
guest_state_enter_irqoff();
939
940
tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
941
942
vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
943
944
vt->exit_qualification = tdx->vp_enter_args.rcx;
945
tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
946
tdx->exit_gpa = tdx->vp_enter_args.r8;
947
vt->exit_intr_info = tdx->vp_enter_args.r9;
948
949
vmx_handle_nmi(vcpu);
950
951
guest_state_exit_irqoff();
952
}
953
954
static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
955
{
956
return vmx_get_exit_reason(vcpu).failed_vmentry &&
957
vmx_get_exit_reason(vcpu).full != -1u;
958
}
959
960
static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
961
{
962
u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
963
964
/*
 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
 *
 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires the
 * target vCPUs to leave the fastpath so that interrupts can be enabled
 * and the IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED
 * instead of EXIT_FASTPATH_REENTER_GUEST to exit the fastpath,
 * otherwise the requester may be blocked endlessly.
 */
975
if (unlikely(tdx_operand_busy(vp_enter_ret)))
976
return EXIT_FASTPATH_EXIT_HANDLED;
977
978
return EXIT_FASTPATH_NONE;
979
}
980
981
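/*
 * Register-cache bits that may remain marked available across
 * TDH.VP.ENTER; vcpu->arch.regs_avail is masked with this set after
 * every TD entry/exit.
 */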
#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
982
BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
983
BIT_ULL(VCPU_REGS_RAX) | \
984
BIT_ULL(VCPU_REGS_RBX) | \
985
BIT_ULL(VCPU_REGS_RCX) | \
986
BIT_ULL(VCPU_REGS_RDX) | \
987
BIT_ULL(VCPU_REGS_RBP) | \
988
BIT_ULL(VCPU_REGS_RSI) | \
989
BIT_ULL(VCPU_REGS_RDI) | \
990
BIT_ULL(VCPU_REGS_R8) | \
991
BIT_ULL(VCPU_REGS_R9) | \
992
BIT_ULL(VCPU_REGS_R10) | \
993
BIT_ULL(VCPU_REGS_R11) | \
994
BIT_ULL(VCPU_REGS_R12) | \
995
BIT_ULL(VCPU_REGS_R13) | \
996
BIT_ULL(VCPU_REGS_R14) | \
997
BIT_ULL(VCPU_REGS_R15))
998
999
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
1000
{
1001
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
1002
1003
/*
1004
* All TDX hosts support PKRU; but even if they didn't,
1005
* vcpu->arch.host_pkru would be 0 and the wrpkru would be
1006
* skipped.
1007
*/
1008
if (vcpu->arch.host_pkru != 0)
1009
wrpkru(vcpu->arch.host_pkru);
1010
1011
if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1012
xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1013
1014
/*
 * Likewise, even if a TDX host didn't support XSS, both arms of
 * the comparison would be 0 and the wrmsrl would be skipped.
 */
1018
if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1019
wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1020
}
1021
1022
#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1023
DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1024
DEBUGCTLMSR_FREEZE_IN_SMM)
1025
1026
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
1027
{
1028
struct vcpu_tdx *tdx = to_tdx(vcpu);
1029
struct vcpu_vt *vt = to_vt(vcpu);
1030
1031
/*
 * WARN if KVM wants to force an immediate exit, as the TDX module does
 * not guarantee entry into the guest, i.e. it's possible for KVM to
 * _think_ it completed entry to the guest and forced an immediate exit
 * without actually having done so. Luckily, KVM never needs to force
 * an immediate exit for TDX (KVM can't do direct event injection), so
 * just WARN and continue on.
 */
1039
WARN_ON_ONCE(run_flags);
1040
1041
/*
1042
* Wait until retry of SEPT-zap-related SEAMCALL completes before
1043
* allowing vCPU entry to avoid contention with tdh_vp_enter() and
1044
* TDCALLs.
1045
*/
1046
if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1047
return EXIT_FASTPATH_EXIT_HANDLED;
1048
1049
trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
1050
1051
if (pi_test_on(&vt->pi_desc)) {
1052
apic->send_IPI_self(POSTED_INTR_VECTOR);
1053
1054
if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1055
APIC_VECTOR_MASK, &vt->pi_desc))
1056
kvm_wait_lapic_expire(vcpu);
1057
}
1058
1059
tdx_vcpu_enter_exit(vcpu);
1060
1061
if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
1062
update_debugctlmsr(vcpu->arch.host_debugctl);
1063
1064
tdx_load_host_xsave_state(vcpu);
1065
tdx->guest_entered = true;
1066
1067
vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1068
1069
if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1070
return EXIT_FASTPATH_NONE;
1071
1072
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1073
return EXIT_FASTPATH_NONE;
1074
1075
if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
1076
kvm_machine_check();
1077
1078
trace_kvm_exit(vcpu, KVM_ISA_VMX);
1079
1080
if (unlikely(tdx_failed_vmentry(vcpu)))
1081
return EXIT_FASTPATH_NONE;
1082
1083
return tdx_exit_handlers_fastpath(vcpu);
1084
}
1085
1086
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1087
{
1088
++vcpu->stat.nmi_injections;
1089
td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1090
/*
1091
* From KVM's perspective, NMI injection is completed right after
1092
* writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
1093
* the TDX module or not.
1094
*/
1095
vcpu->arch.nmi_injected = false;
1096
/*
 * TDX doesn't allow KVM to request an NMI-window exit. If there is
 * still a pending vNMI, KVM is not able to inject it along with the
 * one pending in the TDX module in a back-to-back way. Since the
 * previous vNMI is still pending in the TDX module, i.e. it has not
 * been delivered to the TDX guest yet, it's OK to collapse the pending
 * vNMI into the previous one. The guest is expected to handle all the
 * NMI sources when handling the first vNMI.
 */
1105
vcpu->arch.nmi_pending = 0;
1106
}
1107
1108
static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1109
{
1110
u32 intr_info = vmx_get_intr_info(vcpu);
1111
1112
/*
1113
* Machine checks are handled by handle_exception_irqoff(), or by
1114
* tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1115
* VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
1116
*/
1117
if (is_nmi(intr_info) || is_machine_check(intr_info))
1118
return 1;
1119
1120
vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1121
vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1122
vcpu->run->ex.error_code = 0;
1123
1124
return 0;
1125
}
1126
1127
static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1128
{
1129
tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1130
return 1;
1131
}
1132
1133
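/*
 * Forward a TDVMCALL<VMCALL> to the common hypercall code, copying the
 * guest's r10-r14 into the GPRs __kvm_emulate_hypercall() expects.
 */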
static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1134
{
1135
kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1136
kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1137
kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1138
kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1139
kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1140
1141
return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1142
}
1143
1144
/*
 * Split the request into chunks and check for pending interrupts
 * between chunks. This allows timely injection of interrupts and
 * prevents issues with guest lockup detection.
 */
1149
#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1150
static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1151
1152
static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1153
{
1154
struct vcpu_tdx *tdx = to_tdx(vcpu);
1155
1156
if (vcpu->run->hypercall.ret) {
1157
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1158
tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1159
return 1;
1160
}
1161
1162
tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1163
if (tdx->map_gpa_next >= tdx->map_gpa_end)
1164
return 1;
1165
1166
/*
1167
* Stop processing the remaining part if there is a pending interrupt,
1168
* which could be qualified to deliver. Skip checking pending RVI for
1169
* TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1170
*/
1171
if (kvm_vcpu_has_events(vcpu)) {
1172
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1173
tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1174
return 1;
1175
}
1176
1177
__tdx_map_gpa(tdx);
1178
return 0;
1179
}
1180
1181
static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1182
{
1183
u64 gpa = tdx->map_gpa_next;
1184
u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1185
1186
if (size > TDX_MAP_GPA_MAX_LEN)
1187
size = TDX_MAP_GPA_MAX_LEN;
1188
1189
tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
1190
tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
1191
/*
 * In principle this should have been -KVM_ENOSYS, but userspace
 * (QEMU <= 9.2) assumed that vcpu->run->hypercall.ret is never changed
 * by KVM and thus that it was always zero on KVM_EXIT_HYPERCALL.
 * Since KVM is now overwriting vcpu->run->hypercall.ret, ensure that
 * it is zero so as not to break QEMU.
 */
1197
tdx->vcpu.run->hypercall.ret = 0;
1198
tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1199
tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1200
tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1201
KVM_MAP_GPA_RANGE_ENCRYPTED :
1202
KVM_MAP_GPA_RANGE_DECRYPTED;
1203
tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
1204
1205
tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1206
}
1207
1208
static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1209
{
1210
struct vcpu_tdx *tdx = to_tdx(vcpu);
1211
u64 gpa = tdx->vp_enter_args.r12;
1212
u64 size = tdx->vp_enter_args.r13;
1213
u64 ret;
1214
1215
/*
1216
* Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1217
* userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1218
* bit set. This is a base call so it should always be supported, but
1219
* KVM has no way to ensure that userspace implements the GHCI correctly.
1220
* So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1221
* to the guest.
1222
*/
1223
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1224
ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1225
goto error;
1226
}
1227
1228
if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1229
!kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1230
(vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1231
vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1232
ret = TDVMCALL_STATUS_INVALID_OPERAND;
1233
goto error;
1234
}
1235
1236
if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1237
ret = TDVMCALL_STATUS_ALIGN_ERROR;
1238
goto error;
1239
}
1240
1241
tdx->map_gpa_end = gpa + size;
1242
tdx->map_gpa_next = gpa;
1243
1244
__tdx_map_gpa(tdx);
1245
return 0;
1246
1247
error:
1248
tdvmcall_set_return_code(vcpu, ret);
1249
tdx->vp_enter_args.r11 = gpa;
1250
return 1;
1251
}
1252
1253
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1254
{
1255
struct vcpu_tdx *tdx = to_tdx(vcpu);
1256
u64 *regs = vcpu->run->system_event.data;
1257
u64 *module_regs = &tdx->vp_enter_args.r8;
1258
int index = VCPU_REGS_RAX;
1259
1260
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1261
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1262
vcpu->run->system_event.ndata = 16;
1263
1264
/* Dump 16 general-purpose registers to userspace in ascending order. */
1265
regs[index++] = tdx->vp_enter_ret;
1266
regs[index++] = tdx->vp_enter_args.rcx;
1267
regs[index++] = tdx->vp_enter_args.rdx;
1268
regs[index++] = tdx->vp_enter_args.rbx;
1269
regs[index++] = 0;
1270
regs[index++] = 0;
1271
regs[index++] = tdx->vp_enter_args.rsi;
1272
regs[index] = tdx->vp_enter_args.rdi;
1273
for (index = 0; index < 8; index++)
1274
regs[VCPU_REGS_R8 + index] = module_regs[index];
1275
1276
return 0;
1277
}
1278
1279
static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1280
{
1281
u32 eax, ebx, ecx, edx;
1282
struct vcpu_tdx *tdx = to_tdx(vcpu);
1283
1284
/* EAX and ECX for cpuid is stored in R12 and R13. */
1285
eax = tdx->vp_enter_args.r12;
1286
ecx = tdx->vp_enter_args.r13;
1287
1288
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1289
1290
tdx->vp_enter_args.r12 = eax;
1291
tdx->vp_enter_args.r13 = ebx;
1292
tdx->vp_enter_args.r14 = ecx;
1293
tdx->vp_enter_args.r15 = edx;
1294
1295
return 1;
1296
}
1297
1298
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1299
{
1300
vcpu->arch.pio.count = 0;
1301
return 1;
1302
}
1303
1304
static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1305
{
1306
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1307
unsigned long val = 0;
1308
int ret;
1309
1310
ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1311
vcpu->arch.pio.port, &val, 1);
1312
1313
WARN_ON_ONCE(!ret);
1314
1315
tdvmcall_set_return_val(vcpu, val);
1316
1317
return 1;
1318
}
1319
1320
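/*
 * Emulate TDVMCALL<Instruction.IO>: r12 = size, r13 = direction
 * (1 = write), r14 = port, r15 = data for writes.
 */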
static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1321
{
1322
struct vcpu_tdx *tdx = to_tdx(vcpu);
1323
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1324
unsigned long val = 0;
1325
unsigned int port;
1326
u64 size, write;
1327
int ret;
1328
1329
++vcpu->stat.io_exits;
1330
1331
size = tdx->vp_enter_args.r12;
1332
write = tdx->vp_enter_args.r13;
1333
port = tdx->vp_enter_args.r14;
1334
1335
if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1336
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1337
return 1;
1338
}
1339
1340
if (write) {
1341
val = tdx->vp_enter_args.r15;
1342
ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1343
} else {
1344
ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1345
}
1346
1347
if (!ret)
1348
vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1349
tdx_complete_pio_in;
1350
else if (!write)
1351
tdvmcall_set_return_val(vcpu, val);
1352
1353
return ret;
1354
}
1355
1356
static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1357
{
1358
unsigned long val = 0;
1359
gpa_t gpa;
1360
int size;
1361
1362
gpa = vcpu->mmio_fragments[0].gpa;
1363
size = vcpu->mmio_fragments[0].len;
1364
1365
memcpy(&val, vcpu->run->mmio.data, size);
1366
tdvmcall_set_return_val(vcpu, val);
1367
trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1368
return 1;
1369
}
1370
1371
static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1372
unsigned long val)
1373
{
1374
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1375
trace_kvm_fast_mmio(gpa);
1376
return 0;
1377
}
1378
1379
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1380
if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1381
return -EOPNOTSUPP;
1382
1383
return 0;
1384
}
1385
1386
static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1387
{
1388
unsigned long val;
1389
1390
if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1391
return -EOPNOTSUPP;
1392
1393
tdvmcall_set_return_val(vcpu, val);
1394
trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1395
return 0;
1396
}
1397
1398
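/*
 * Emulate TDVMCALL<#VE.RequestMMIO>: r12 = size, r13 = write, r14 = GPA,
 * r15 = data for writes. Only shared GPAs are accepted; emulation is
 * forwarded to userspace if no in-kernel device claims the access.
 */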
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1399
{
1400
struct vcpu_tdx *tdx = to_tdx(vcpu);
1401
int size, write, r;
1402
unsigned long val;
1403
gpa_t gpa;
1404
1405
size = tdx->vp_enter_args.r12;
1406
write = tdx->vp_enter_args.r13;
1407
gpa = tdx->vp_enter_args.r14;
1408
val = write ? tdx->vp_enter_args.r15 : 0;
1409
1410
if (size != 1 && size != 2 && size != 4 && size != 8)
1411
goto error;
1412
if (write != 0 && write != 1)
1413
goto error;
1414
1415
/*
 * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
 * do MMIO emulation for a private GPA.
 */
1419
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1420
vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1421
goto error;
1422
1423
gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1424
1425
if (write)
1426
r = tdx_mmio_write(vcpu, gpa, size, val);
1427
else
1428
r = tdx_mmio_read(vcpu, gpa, size);
1429
if (!r)
1430
/* Kernel completed device emulation. */
1431
return 1;
1432
1433
/* Request the device emulation to userspace device model. */
1434
vcpu->mmio_is_write = write;
1435
if (!write)
1436
vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1437
1438
vcpu->run->mmio.phys_addr = gpa;
1439
vcpu->run->mmio.len = size;
1440
vcpu->run->mmio.is_write = write;
1441
vcpu->run->exit_reason = KVM_EXIT_MMIO;
1442
1443
if (write) {
1444
memcpy(vcpu->run->mmio.data, &val, size);
1445
} else {
1446
vcpu->mmio_fragments[0].gpa = gpa;
1447
vcpu->mmio_fragments[0].len = size;
1448
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1449
}
1450
return 0;
1451
1452
error:
1453
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1454
return 1;
1455
}
1456
1457
static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1458
{
1459
struct vcpu_tdx *tdx = to_tdx(vcpu);
1460
1461
tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1462
1463
/*
 * For now, KVM supports no TDVMCALL beyond the GHCI base API directly,
 * without support from userspace; just set the values returned from
 * userspace.
 */
1468
tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1469
tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1470
tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1471
tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1472
1473
return 1;
1474
}
1475
1476
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1477
{
1478
struct vcpu_tdx *tdx = to_tdx(vcpu);
1479
1480
switch (tdx->vp_enter_args.r12) {
1481
case 0:
1482
tdx->vp_enter_args.r11 = 0;
1483
tdx->vp_enter_args.r12 = 0;
1484
tdx->vp_enter_args.r13 = 0;
1485
tdx->vp_enter_args.r14 = 0;
1486
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1487
return 1;
1488
case 1:
1489
vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1490
vcpu->run->exit_reason = KVM_EXIT_TDX;
1491
vcpu->run->tdx.flags = 0;
1492
vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1493
vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1494
vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1495
vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1496
vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1497
vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1498
vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1499
return 0;
1500
default:
1501
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1502
return 1;
1503
}
1504
}
1505
1506
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1507
{
1508
tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1509
return 1;
1510
}
1511
1512
static int tdx_get_quote(struct kvm_vcpu *vcpu)
1513
{
1514
struct vcpu_tdx *tdx = to_tdx(vcpu);
1515
u64 gpa = tdx->vp_enter_args.r12;
1516
u64 size = tdx->vp_enter_args.r13;
1517
1518
/* The gpa of buffer must have shared bit set. */
1519
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1520
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1521
return 1;
1522
}
1523
1524
vcpu->run->exit_reason = KVM_EXIT_TDX;
1525
vcpu->run->tdx.flags = 0;
1526
vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1527
vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1528
vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1529
vcpu->run->tdx.get_quote.size = size;
1530
1531
vcpu->arch.complete_userspace_io = tdx_complete_simple;
1532
1533
return 0;
1534
}
1535
1536
static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1537
{
1538
struct vcpu_tdx *tdx = to_tdx(vcpu);
1539
u64 vector = tdx->vp_enter_args.r12;
1540
1541
if (vector < 32 || vector > 255) {
1542
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1543
return 1;
1544
}
1545
1546
vcpu->run->exit_reason = KVM_EXIT_TDX;
1547
vcpu->run->tdx.flags = 0;
1548
vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1549
vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1550
vcpu->run->tdx.setup_event_notify.vector = vector;
1551
1552
vcpu->arch.complete_userspace_io = tdx_complete_simple;
1553
1554
return 0;
1555
}
1556
1557
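/*
 * Dispatch the TDVMCALL leaves KVM handles; unknown leaves get
 * TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED.
 */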
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1558
{
1559
switch (tdvmcall_leaf(vcpu)) {
1560
case TDVMCALL_MAP_GPA:
1561
return tdx_map_gpa(vcpu);
1562
case TDVMCALL_REPORT_FATAL_ERROR:
1563
return tdx_report_fatal_error(vcpu);
1564
case TDVMCALL_GET_TD_VM_CALL_INFO:
1565
return tdx_get_td_vm_call_info(vcpu);
1566
case TDVMCALL_GET_QUOTE:
1567
return tdx_get_quote(vcpu);
1568
case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
1569
return tdx_setup_event_notify_interrupt(vcpu);
1570
default:
1571
break;
1572
}
1573
1574
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1575
return 1;
1576
}
1577
1578
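/*
 * Load the shared-EPT root into the TD VMCS; the shared bit implied by
 * the page-walk level must match the VM's direct (shared) GFN bits.
 */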
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1579
{
1580
u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1581
TDX_SHARED_BIT_PWL_4;
1582
1583
if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1584
return;
1585
1586
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1587
}
1588
1589
static void tdx_unpin(struct kvm *kvm, struct page *page)
1590
{
1591
put_page(page);
1592
}
1593
1594
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
1595
enum pg_level level, struct page *page)
1596
{
1597
int tdx_level = pg_level_to_tdx_sept_level(level);
1598
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1599
gpa_t gpa = gfn_to_gpa(gfn);
1600
u64 entry, level_state;
1601
u64 err;
1602
1603
err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1604
if (unlikely(tdx_operand_busy(err))) {
1605
tdx_unpin(kvm, page);
1606
return -EBUSY;
1607
}
1608
1609
if (KVM_BUG_ON(err, kvm)) {
1610
pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
1611
tdx_unpin(kvm, page);
1612
return -EIO;
1613
}
1614
1615
return 0;
1616
}
1617
1618
/*
 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages;
 * the callback tdx_gmem_post_populate() then maps pages into private
 * memory through the SEAMCALL TDH.MEM.PAGE.ADD(). The SEAMCALL also
 * requires the private EPT structures for the page to have been built
 * beforehand, which is done via kvm_tdp_map_page(). nr_premapped counts
 * the number of pages that were added to the EPT structures but not yet
 * added with TDH.MEM.PAGE.ADD(). The counter has to be zero on
 * KVM_TDX_FINALIZE_VM, to ensure that there are no half-initialized
 * shared EPT pages.
 */
1628
static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
1629
enum pg_level level, kvm_pfn_t pfn)
1630
{
1631
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1632
1633
if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
1634
return -EINVAL;
1635
1636
/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
1637
atomic64_inc(&kvm_tdx->nr_premapped);
1638
return 0;
1639
}
1640
1641
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
1642
enum pg_level level, kvm_pfn_t pfn)
1643
{
1644
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1645
struct page *page = pfn_to_page(pfn);
1646
1647
/* TODO: handle large pages. */
1648
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1649
return -EINVAL;
1650
1651
/*
1652
* Because guest_memfd doesn't support page migration with
1653
* a_ops->migrate_folio (yet), no callback is triggered for KVM on page
1654
* migration. Until guest_memfd supports page migration, prevent page
1655
* migration.
1656
* TODO: Once guest_memfd introduces callback on page migration,
1657
* implement it and remove get_page/put_page().
1658
*/
1659
get_page(page);
1660
1661
/*
1662
* Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
1663
* barrier in tdx_td_finalize().
1664
*/
1665
smp_rmb();
1666
if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
1667
return tdx_mem_page_aug(kvm, gfn, level, page);
1668
1669
return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
1670
}
1671
1672
static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
1673
enum pg_level level, struct page *page)
1674
{
1675
int tdx_level = pg_level_to_tdx_sept_level(level);
1676
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1677
gpa_t gpa = gfn_to_gpa(gfn);
1678
u64 err, entry, level_state;
1679
1680
/* TODO: handle large pages. */
1681
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1682
return -EINVAL;
1683
1684
if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
1685
return -EINVAL;
1686
1687
/*
 * When zapping a private page, the write lock is held, so there is no
 * race with other vCPU SEPT operations. Races with TDH.VP.ENTER (due
 * to 0-step mitigation) and guest TDCALLs are still possible.
 */
1692
err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1693
&level_state);
1694
1695
if (unlikely(tdx_operand_busy(err))) {
1696
/*
1697
* The second retry is expected to succeed after kicking off all
1698
* other vCPUs and prevent them from invoking TDH.VP.ENTER.
1699
*/
1700
tdx_no_vcpus_enter_start(kvm);
1701
err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1702
&level_state);
1703
tdx_no_vcpus_enter_stop(kvm);
1704
}
1705
1706
if (KVM_BUG_ON(err, kvm)) {
1707
pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
1708
return -EIO;
1709
}
1710
1711
err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1712
1713
if (KVM_BUG_ON(err, kvm)) {
1714
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
1715
return -EIO;
1716
}
1717
tdx_clear_page(page);
1718
tdx_unpin(kvm, page);
1719
return 0;
1720
}
1721
1722
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
1723
enum pg_level level, void *private_spt)
1724
{
1725
int tdx_level = pg_level_to_tdx_sept_level(level);
1726
gpa_t gpa = gfn_to_gpa(gfn);
1727
struct page *page = virt_to_page(private_spt);
1728
u64 err, entry, level_state;
1729
1730
err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1731
&level_state);
1732
if (unlikely(tdx_operand_busy(err)))
1733
return -EBUSY;
1734
1735
if (KVM_BUG_ON(err, kvm)) {
1736
pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
1737
return -EIO;
1738
}
1739
1740
return 0;
1741
}
1742
1743
/*
 * Check if the error returned from a SEPT-zap SEAMCALL is due to a page
 * being mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add()
 * having been called successfully.
 *
 * Since tdh_mem_sept_add() must have been invoked successfully before a
 * non-leaf entry is present in the mirrored page table, the SEPT-zap
 * related SEAMCALLs should not encounter TDX_EPT_WALK_FAILED. They
 * should instead find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty
 * leaf entry found in the SEPT.
 *
 * Further check whether the entry returned from the SEPT walk has RWX
 * permissions, to filter out anything unexpected.
 *
 * Note: @level is the pg_level, not the tdx_level. The tdx_level
 * extracted from the level_state returned on a SEAMCALL error is the
 * same as the one passed into the SEAMCALL.
 */
static bool tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
					      u64 entry, int level)
1763
{
1764
if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1765
return false;
1766
1767
if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1768
return false;
1769
1770
if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1771
return false;
1772
1773
return true;
1774
}
1775
1776
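/*
 * Block a private mapping with TDH.MEM.RANGE.BLOCK, retrying once with
 * all vCPUs kicked out of the guest if the SEAMCALL is contended.
 * Returns 1 if the entry was blocked, 0 if the page was only premapped
 * (nothing to block), or -EIO on error.
 */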
static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
1777
enum pg_level level, struct page *page)
1778
{
1779
int tdx_level = pg_level_to_tdx_sept_level(level);
1780
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1781
gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
1782
u64 err, entry, level_state;
1783
1784
/* For now large page isn't supported yet. */
1785
WARN_ON_ONCE(level != PG_LEVEL_4K);
1786
1787
err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1788
1789
if (unlikely(tdx_operand_busy(err))) {
1790
/* After no vCPUs enter, the second retry is expected to succeed */
1791
tdx_no_vcpus_enter_start(kvm);
1792
err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1793
tdx_no_vcpus_enter_stop(kvm);
1794
}
1795
if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
1796
!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
1797
atomic64_dec(&kvm_tdx->nr_premapped);
1798
tdx_unpin(kvm, page);
1799
return 0;
1800
}
1801
1802
if (KVM_BUG_ON(err, kvm)) {
1803
pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
1804
return -EIO;
1805
}
1806
return 1;
1807
}
1808
1809
/*
 * Ensure shared and private EPTs are flushed on all vCPUs.
 * tdh_mem_track() is the only caller that increases the TD epoch. An
 * increase in the TD epoch (e.g., to value "N + 1") is successful only
 * if no vCPUs are running in guest mode with the value "N - 1".
 *
 * A successful execution of tdh_mem_track() ensures that vCPUs can only
 * run in guest mode with TD epoch value "N" if no TD exit occurs after
 * the TD epoch has been increased to "N + 1".
 *
 * Kicking off all vCPUs after that further ensures that no vCPU can run
 * in guest mode with TD epoch value "N", which unblocks the next
 * tdh_mem_track() (e.g. to increase the TD epoch to "N + 2").
 *
 * The TDX module will flush the EPT on the next TD enter and make vCPUs
 * run in guest mode with TD epoch value "N + 1".
 *
 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest
 * mode by waiting for the empty IPI handler ack_kick().
 *
 * No action is required for the vCPUs being kicked off, since the kick
 * certainly occurs after the TD epoch increment and before the next
 * tdh_mem_track().
 */
1833
static void tdx_track(struct kvm *kvm)
1834
{
1835
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1836
u64 err;
1837
1838
/* If the TD isn't finalized, no vCPU has run yet. */
1839
if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1840
return;
1841
1842
lockdep_assert_held_write(&kvm->mmu_lock);
1843
1844
err = tdh_mem_track(&kvm_tdx->td);
1845
if (unlikely(tdx_operand_busy(err))) {
1846
/* With no vCPUs entering the TD, the second retry is expected to succeed */
1847
tdx_no_vcpus_enter_start(kvm);
1848
err = tdh_mem_track(&kvm_tdx->td);
1849
tdx_no_vcpus_enter_stop(kvm);
1850
}
1851
1852
if (KVM_BUG_ON(err, kvm))
1853
pr_tdx_error(TDH_MEM_TRACK, err);
1854
1855
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
1856
}
1857
1858
static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1859
enum pg_level level, void *private_spt)
1860
{
1861
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1862
1863
/*
1864
* free_external_spt() is only called after the hkid has been freed, i.e. when
1865
* the TD is being torn down.
1866
* KVM doesn't (yet) zap page table pages in the mirror page table while the
1867
* TD is active, though guest pages mapped in the mirror page table can be
1868
* zapped while the TD is active, e.g. for shared <-> private conversion
1869
* and slot move/deletion.
1870
*/
1871
if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1872
return -EINVAL;
1873
1874
/*
1875
* The HKID assigned to this TD was already freed and cache was
1876
* already flushed. We don't have to flush again.
1877
*/
1878
return tdx_reclaim_page(virt_to_page(private_spt));
1879
}
1880
1881
static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
1882
enum pg_level level, kvm_pfn_t pfn)
1883
{
1884
struct page *page = pfn_to_page(pfn);
1885
int ret;
1886
1887
/*
1888
* HKID is released after all private pages have been removed, and set
1889
* before any might be populated. Warn if zapping is attempted when
1890
* there can't be anything populated in the private EPT.
1891
*/
1892
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1893
return -EINVAL;
1894
1895
ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
1896
if (ret <= 0)
1897
return ret;
1898
1899
/*
1900
* TDX requires TLB tracking before dropping private page. Do
1901
* it here, although it is also done later.
1902
*/
1903
tdx_track(kvm);
1904
1905
return tdx_sept_drop_private_spte(kvm, gfn, level, page);
1906
}
1907
1908
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1909
int trig_mode, int vector)
1910
{
1911
struct kvm_vcpu *vcpu = apic->vcpu;
1912
struct vcpu_tdx *tdx = to_tdx(vcpu);
1913
1914
/* TDX supports only posted interrupts. No local APIC emulation. */
1915
__vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1916
1917
trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1918
}
1919
1920
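/*
 * Check whether the EPT violation hit a private page that is still in the
 * PENDING state, i.e. the guest touched a private GPA it has not yet
 * accepted.  Such an access is unexpected and is treated as fatal by the
 * caller.
 */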
static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1921
{
1922
u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1923
u64 eq = vmx_get_exit_qual(vcpu);
1924
1925
if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1926
return false;
1927
1928
return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1929
}
1930
1931
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1932
{
1933
unsigned long exit_qual;
1934
gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1935
bool local_retry = false;
1936
int ret;
1937
1938
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1939
if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1940
pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1941
gpa, vcpu->vcpu_id);
1942
kvm_vm_dead(vcpu->kvm);
1943
return -EIO;
1944
}
1945
/*
1946
* Always treat SEPT violations as write faults. Ignore the
1947
* EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1948
* TD private pages are always RWX in the SEPT tables,
1949
* i.e. they're always mapped writable. Just as importantly,
1950
* treating SEPT violations as write faults is necessary to
1951
* avoid COW allocations, which will cause TDAUGPAGE failures
1952
* due to aliasing a single HPA to multiple GPAs.
1953
*/
1954
exit_qual = EPT_VIOLATION_ACC_WRITE;
1955
1956
/* Only private GPA triggers zero-step mitigation */
1957
local_retry = true;
1958
} else {
1959
exit_qual = vmx_get_exit_qual(vcpu);
1960
/*
1961
* EPT violation due to instruction fetch should never be
1962
* triggered from shared memory in TDX guest. If such EPT
1963
* violation occurs, treat it as broken hardware.
1964
*/
1965
if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1966
return -EIO;
1967
}
1968
1969
trace_kvm_page_fault(vcpu, gpa, exit_qual);
1970
1971
/*
1972
* To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1973
* mapping in TDX.
1974
*
1975
* KVM may return RET_PF_RETRY for private GPA due to
1976
* - contentions when atomically updating SPTEs of the mirror page table
1977
* - in-progress GFN invalidation or memslot removal.
1978
* - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1979
* caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1980
* or certain TDCALLs.
1981
*
1982
* If TDH.VP.ENTER is invoked more times than the threshold set by the
1983
* TDX module before KVM resolves the private GPA mapping, the TDX
1984
* module will activate zero-step mitigation during TDH.VP.ENTER. This
1985
* process acquires an SEPT tree lock in the TDX module, leading to
1986
* further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1987
* operations on other vCPUs.
1988
*
1989
* Breaking out of local retries for kvm_vcpu_has_events() is for
1990
* interrupt injection. kvm_vcpu_has_events() should not see pending
1991
* events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1992
* blocked by TDs, false positives are inevitable i.e., KVM may re-enter
1993
* the guest even if the IRQ/NMI can't be delivered.
1994
*
1995
* Note: even without breaking out of local retries, zero-step
1996
* mitigation may still occur due to
1997
* - invocation of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1998
* - a single RIP causing EPT violations for more GFNs than the
1999
* threshold count.
2000
* This is safe, as triggering zero-step mitigation only introduces
2001
* contentions to page installation SEAMCALLs on other vCPUs, which will
2002
* handle retries locally in their EPT violation handlers.
2003
*/
2004
while (1) {
2005
ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
2006
2007
if (ret != RET_PF_RETRY || !local_retry)
2008
break;
2009
2010
if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
2011
break;
2012
2013
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
2014
ret = -EIO;
2015
break;
2016
}
2017
2018
cond_resched();
2019
}
2020
return ret;
2021
}
2022
2023
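/*
 * Completion callback for emulated MSR accesses requested via TDVMCALL.
 * On failure, the error is reflected back to the guest as an INVALID_OPERAND
 * TDVMCALL status; on a successful read, the value is returned via the
 * TDVMCALL result register.
 */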
int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2024
{
2025
if (err) {
2026
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2027
return 1;
2028
}
2029
2030
if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2031
tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2032
2033
return 1;
2034
}
2035
2036
2037
int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
2038
{
2039
struct vcpu_tdx *tdx = to_tdx(vcpu);
2040
u64 vp_enter_ret = tdx->vp_enter_ret;
2041
union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
2042
2043
if (fastpath != EXIT_FASTPATH_NONE)
2044
return 1;
2045
2046
if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
2047
KVM_BUG_ON(1, vcpu->kvm);
2048
return -EIO;
2049
}
2050
2051
/*
2052
* Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
2053
* TDX_SEAMCALL_VMFAILINVALID.
2054
*/
2055
if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
2056
KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
2057
goto unhandled_exit;
2058
}
2059
2060
if (unlikely(tdx_failed_vmentry(vcpu))) {
2061
/*
2062
* If the guest state is protected, that means off-TD debug is
2063
* not enabled, TDX_NON_RECOVERABLE must be set.
2064
*/
2065
WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2066
!(vp_enter_ret & TDX_NON_RECOVERABLE));
2067
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2068
vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2069
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2070
return 0;
2071
}
2072
2073
if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2074
exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2075
kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2076
goto unhandled_exit;
2077
}
2078
2079
WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2080
(vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2081
2082
switch (exit_reason.basic) {
2083
case EXIT_REASON_TRIPLE_FAULT:
2084
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2085
vcpu->mmio_needed = 0;
2086
return 0;
2087
case EXIT_REASON_EXCEPTION_NMI:
2088
return tdx_handle_exception_nmi(vcpu);
2089
case EXIT_REASON_EXTERNAL_INTERRUPT:
2090
++vcpu->stat.irq_exits;
2091
return 1;
2092
case EXIT_REASON_CPUID:
2093
return tdx_emulate_cpuid(vcpu);
2094
case EXIT_REASON_HLT:
2095
return kvm_emulate_halt_noskip(vcpu);
2096
case EXIT_REASON_TDCALL:
2097
return handle_tdvmcall(vcpu);
2098
case EXIT_REASON_VMCALL:
2099
return tdx_emulate_vmcall(vcpu);
2100
case EXIT_REASON_IO_INSTRUCTION:
2101
return tdx_emulate_io(vcpu);
2102
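/*
 * The MSR TDVMCALLs pass the MSR index in r12 and, for writes, the 64-bit
 * value in r13.  Mirror them into RCX and EDX:EAX so that the common
 * kvm_emulate_rdmsr()/kvm_emulate_wrmsr() helpers see the usual
 * RDMSR/WRMSR register layout.
 */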
case EXIT_REASON_MSR_READ:
2103
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2104
return kvm_emulate_rdmsr(vcpu);
2105
case EXIT_REASON_MSR_WRITE:
2106
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2107
kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2108
kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2109
return kvm_emulate_wrmsr(vcpu);
2110
case EXIT_REASON_EPT_MISCONFIG:
2111
return tdx_emulate_mmio(vcpu);
2112
case EXIT_REASON_EPT_VIOLATION:
2113
return tdx_handle_ept_violation(vcpu);
2114
case EXIT_REASON_OTHER_SMI:
2115
/*
2116
* Unlike VMX, an SMI in SEAM non-root mode (i.e. while a
2117
* TD guest vCPU is running) causes a VM exit to the TDX module,
2118
* then a SEAMRET to KVM. Once it exits to KVM, the SMI is delivered
2119
* and handled by the kernel handler right away.
2120
*
2121
* The Other SMI exit can also be caused by a SEAM non-root
2122
* machine check delivered via Machine Check System Management
2123
* Interrupt (MSMI), but it has already been handled by the
2124
* kernel machine check handler, i.e., the memory page has been
2125
* marked as poisoned and it won't be freed to the free list
2126
* when the TDX guest is terminated (the TDX module marks the
2127
* guest as dead and prevents it from running further when a
2128
* machine check happens in SEAM non-root).
2129
*
2130
* - An MSMI will not reach here; it's handled as the
2131
* non-recoverable case above.
2132
* - If it's not an MSMI, no need to do anything here.
2133
*/
2134
return 1;
2135
default:
2136
break;
2137
}
2138
2139
unhandled_exit:
2140
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2141
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
2142
vcpu->run->internal.ndata = 2;
2143
vcpu->run->internal.data[0] = vp_enter_ret;
2144
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
2145
return 0;
2146
}
2147
2148
void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2149
u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2150
{
2151
struct vcpu_tdx *tdx = to_tdx(vcpu);
2152
2153
*reason = tdx->vt.exit_reason.full;
2154
if (*reason != -1u) {
2155
*info1 = vmx_get_exit_qual(vcpu);
2156
*info2 = tdx->ext_exit_qualification;
2157
*intr_info = vmx_get_intr_info(vcpu);
2158
} else {
2159
*info1 = 0;
2160
*info2 = 0;
2161
*intr_info = 0;
2162
}
2163
2164
*error_code = 0;
2165
}
2166
2167
bool tdx_has_emulated_msr(u32 index)
2168
{
2169
switch (index) {
2170
case MSR_IA32_UCODE_REV:
2171
case MSR_IA32_ARCH_CAPABILITIES:
2172
case MSR_IA32_POWER_CTL:
2173
case MSR_IA32_CR_PAT:
2174
case MSR_MTRRcap:
2175
case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2176
case MSR_MTRRdefType:
2177
case MSR_IA32_TSC_DEADLINE:
2178
case MSR_IA32_MISC_ENABLE:
2179
case MSR_PLATFORM_INFO:
2180
case MSR_MISC_FEATURES_ENABLES:
2181
case MSR_IA32_APICBASE:
2182
case MSR_EFER:
2183
case MSR_IA32_FEAT_CTL:
2184
case MSR_IA32_MCG_CAP:
2185
case MSR_IA32_MCG_STATUS:
2186
case MSR_IA32_MCG_CTL:
2187
case MSR_IA32_MCG_EXT_CTL:
2188
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2189
case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2190
/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2191
case MSR_KVM_POLL_CONTROL:
2192
return true;
2193
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2194
/*
2195
* x2APIC registers that are virtualized by the CPU can't be
2196
* emulated, KVM doesn't have access to the virtual APIC page.
2197
*/
2198
switch (index) {
2199
case X2APIC_MSR(APIC_TASKPRI):
2200
case X2APIC_MSR(APIC_PROCPRI):
2201
case X2APIC_MSR(APIC_EOI):
2202
case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2203
case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2204
case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2205
return false;
2206
default:
2207
return true;
2208
}
2209
default:
2210
return false;
2211
}
2212
}
2213
2214
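/* MSRs whose values are fixed for a TD; tdx_set_msr() rejects writes to them. */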
static bool tdx_is_read_only_msr(u32 index)
2215
{
2216
return index == MSR_IA32_APICBASE || index == MSR_EFER ||
2217
index == MSR_IA32_FEAT_CTL;
2218
}
2219
2220
int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2221
{
2222
switch (msr->index) {
2223
case MSR_IA32_FEAT_CTL:
2224
/*
2225
* MCE and MCA are advertised via cpuid. Guest kernel could
2226
* check if LMCE is enabled or not.
2227
*/
2228
msr->data = FEAT_CTL_LOCKED;
2229
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2230
msr->data |= FEAT_CTL_LMCE_ENABLED;
2231
return 0;
2232
case MSR_IA32_MCG_EXT_CTL:
2233
if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2234
return 1;
2235
msr->data = vcpu->arch.mcg_ext_ctl;
2236
return 0;
2237
default:
2238
if (!tdx_has_emulated_msr(msr->index))
2239
return 1;
2240
2241
return kvm_get_msr_common(vcpu, msr);
2242
}
2243
}
2244
2245
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2246
{
2247
switch (msr->index) {
2248
case MSR_IA32_MCG_EXT_CTL:
2249
if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2250
(msr->data & ~MCG_EXT_CTL_LMCE_EN))
2251
return 1;
2252
vcpu->arch.mcg_ext_ctl = msr->data;
2253
return 0;
2254
default:
2255
if (tdx_is_read_only_msr(msr->index))
2256
return 1;
2257
2258
if (!tdx_has_emulated_msr(msr->index))
2259
return 1;
2260
2261
return kvm_set_msr_common(vcpu, msr);
2262
}
2263
}
2264
2265
static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2266
{
2267
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2268
struct kvm_tdx_capabilities __user *user_caps;
2269
struct kvm_tdx_capabilities *caps = NULL;
2270
u32 nr_user_entries;
2271
int ret = 0;
2272
2273
/* flags is reserved for future use */
2274
if (cmd->flags)
2275
return -EINVAL;
2276
2277
caps = kzalloc(sizeof(*caps) +
2278
sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2279
GFP_KERNEL);
2280
if (!caps)
2281
return -ENOMEM;
2282
2283
user_caps = u64_to_user_ptr(cmd->data);
2284
if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
2285
ret = -EFAULT;
2286
goto out;
2287
}
2288
2289
if (nr_user_entries < td_conf->num_cpuid_config) {
2290
ret = -E2BIG;
2291
goto out;
2292
}
2293
2294
ret = init_kvm_tdx_caps(td_conf, caps);
2295
if (ret)
2296
goto out;
2297
2298
if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2299
ret = -EFAULT;
2300
goto out;
2301
}
2302
2303
if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2304
caps->cpuid.nent *
2305
sizeof(caps->cpuid.entries[0])))
2306
ret = -EFAULT;
2307
2308
out:
2309
/* kfree() accepts NULL. */
2310
kfree(caps);
2311
return ret;
2312
}
2313
2314
/*
2315
* KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
2316
* similar to TDX's GPAW. Use this field as the interface for userspace to
2317
* configure the GPAW and EPT level for TDs.
2318
*
2319
* Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2320
* 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2321
* supported. Value 52 is only supported when the platform supports 5 level
2322
* EPT.
2323
*/
2324
static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2325
struct td_params *td_params)
2326
{
2327
const struct kvm_cpuid_entry2 *entry;
2328
int guest_pa;
2329
2330
entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2331
if (!entry)
2332
return -EINVAL;
2333
2334
guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2335
2336
if (guest_pa != 48 && guest_pa != 52)
2337
return -EINVAL;
2338
2339
if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2340
return -EINVAL;
2341
2342
td_params->eptp_controls = VMX_EPTP_MT_WB;
2343
if (guest_pa == 52) {
2344
td_params->eptp_controls |= VMX_EPTP_PWL_5;
2345
td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2346
} else {
2347
td_params->eptp_controls |= VMX_EPTP_PWL_4;
2348
}
2349
2350
return 0;
2351
}
2352
2353
static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2354
struct td_params *td_params)
2355
{
2356
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2357
const struct kvm_cpuid_entry2 *entry;
2358
struct tdx_cpuid_value *value;
2359
int i, copy_cnt = 0;
2360
2361
/*
2362
* td_params.cpuid_values: the number and order of cpuid_value entries must
2363
* match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
2364
* It's assumed that td_params was zeroed.
2365
*/
2366
for (i = 0; i < td_conf->num_cpuid_config; i++) {
2367
struct kvm_cpuid_entry2 tmp;
2368
2369
td_init_cpuid_entry2(&tmp, i);
2370
2371
entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2372
tmp.function, tmp.index);
2373
if (!entry)
2374
continue;
2375
2376
if (tdx_unsupported_cpuid(entry))
2377
return -EINVAL;
2378
2379
copy_cnt++;
2380
2381
value = &td_params->cpuid_values[i];
2382
value->eax = entry->eax;
2383
value->ebx = entry->ebx;
2384
value->ecx = entry->ecx;
2385
value->edx = entry->edx;
2386
2387
/*
2388
* TDX module does not accept nonzero bits 16..23 for the
2389
* CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2390
*/
2391
if (tmp.function == 0x80000008)
2392
value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2393
}
2394
2395
/*
2396
* Rely on the TDX module to reject an invalid configuration, but it can't
2397
* check leafs that don't have a proper slot in td_params->cpuid_values to
2398
* stick them in. So fail if there were entries that didn't get copied to
2399
* td_params.
2400
*/
2401
if (copy_cnt != cpuid->nent)
2402
return -EINVAL;
2403
2404
return 0;
2405
}
2406
2407
static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2408
struct kvm_tdx_init_vm *init_vm)
2409
{
2410
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2411
struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2412
int ret;
2413
2414
if (kvm->created_vcpus)
2415
return -EBUSY;
2416
2417
if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2418
return -EINVAL;
2419
2420
if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2421
return -EINVAL;
2422
2423
td_params->max_vcpus = kvm->max_vcpus;
2424
td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2425
td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2426
2427
td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2428
td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2429
2430
ret = setup_tdparams_eptp_controls(cpuid, td_params);
2431
if (ret)
2432
return ret;
2433
2434
ret = setup_tdparams_cpuids(cpuid, td_params);
2435
if (ret)
2436
return ret;
2437
2438
#define MEMCPY_SAME_SIZE(dst, src) \
2439
do { \
2440
BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
2441
memcpy((dst), (src), sizeof(dst)); \
2442
} while (0)
2443
2444
MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2445
MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2446
MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2447
2448
return 0;
2449
}
2450
2451
static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2452
u64 *seamcall_err)
2453
{
2454
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2455
cpumask_var_t packages;
2456
struct page **tdcs_pages = NULL;
2457
struct page *tdr_page;
2458
int ret, i;
2459
u64 err, rcx;
2460
2461
*seamcall_err = 0;
2462
ret = tdx_guest_keyid_alloc();
2463
if (ret < 0)
2464
return ret;
2465
kvm_tdx->hkid = ret;
2466
kvm_tdx->misc_cg = get_current_misc_cg();
2467
ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2468
if (ret)
2469
goto free_hkid;
2470
2471
ret = -ENOMEM;
2472
2473
atomic_inc(&nr_configured_hkid);
2474
2475
tdr_page = alloc_page(GFP_KERNEL);
2476
if (!tdr_page)
2477
goto free_hkid;
2478
2479
kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2480
/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2481
kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2482
tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2483
GFP_KERNEL | __GFP_ZERO);
2484
if (!tdcs_pages)
2485
goto free_tdr;
2486
2487
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2488
tdcs_pages[i] = alloc_page(GFP_KERNEL);
2489
if (!tdcs_pages[i])
2490
goto free_tdcs;
2491
}
2492
2493
if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2494
goto free_tdcs;
2495
2496
cpus_read_lock();
2497
2498
/*
2499
* Need at least one CPU of each package to be online in order to
2500
* program all packages for the host key id. Check it.
2501
*/
2502
for_each_present_cpu(i)
2503
cpumask_set_cpu(topology_physical_package_id(i), packages);
2504
for_each_online_cpu(i)
2505
cpumask_clear_cpu(topology_physical_package_id(i), packages);
2506
if (!cpumask_empty(packages)) {
2507
ret = -EIO;
2508
/*
2509
* Because it's hard for a human operator to figure out the
2510
* reason, print a warning.
2511
*/
2512
#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
2513
pr_warn_ratelimited(MSG_ALLPKG);
2514
goto free_packages;
2515
}
2516
2517
/*
2518
* TDH.MNG.CREATE tries to grab a global lock inside the TDX module and fails
2519
* with TDX_OPERAND_BUSY when it can't. Take the KVM-side global tdx_lock
2520
* to prevent that failure.
2521
*/
2522
mutex_lock(&tdx_lock);
2523
kvm_tdx->td.tdr_page = tdr_page;
2524
err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2525
mutex_unlock(&tdx_lock);
2526
2527
if (err == TDX_RND_NO_ENTROPY) {
2528
ret = -EAGAIN;
2529
goto free_packages;
2530
}
2531
2532
if (WARN_ON_ONCE(err)) {
2533
pr_tdx_error(TDH_MNG_CREATE, err);
2534
ret = -EIO;
2535
goto free_packages;
2536
}
2537
2538
for_each_online_cpu(i) {
2539
int pkg = topology_physical_package_id(i);
2540
2541
if (cpumask_test_and_set_cpu(pkg, packages))
2542
continue;
2543
2544
/*
2545
* Program the memory controller in the package with an
2546
* encryption key associated to a TDX private host key id
2547
* assigned to this TDR. Concurrent operations on same memory
2548
* controller results in TDX_OPERAND_BUSY. No locking needed
2549
* beyond the cpus_read_lock() above as it serializes against
2550
* hotplug and the first online CPU of the package is always
2551
* used. We never have two CPUs in the same socket trying to
2552
* program the key.
2553
*/
2554
ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2555
kvm_tdx, true);
2556
if (ret)
2557
break;
2558
}
2559
cpus_read_unlock();
2560
free_cpumask_var(packages);
2561
if (ret) {
2562
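/* No TDCS pages have been handed to the TDX module yet; free them all. */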
i = 0;
2563
goto teardown;
2564
}
2565
2566
kvm_tdx->td.tdcs_pages = tdcs_pages;
2567
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2568
err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2569
if (err == TDX_RND_NO_ENTROPY) {
2570
/* Here it's hard to allow userspace to retry. */
2571
ret = -EAGAIN;
2572
goto teardown;
2573
}
2574
if (WARN_ON_ONCE(err)) {
2575
pr_tdx_error(TDH_MNG_ADDCX, err);
2576
ret = -EIO;
2577
goto teardown;
2578
}
2579
}
2580
2581
err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2582
if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2583
/*
2584
* Because the operands come from the user, don't warn.
2585
* Return a hint to the user because it's sometimes hard for the
2586
* user to figure out which operand is invalid. The SEAMCALL status
2587
* code includes which operand caused the invalid operand error.
2588
*/
2589
*seamcall_err = err;
2590
ret = -EINVAL;
2591
goto teardown;
2592
} else if (WARN_ON_ONCE(err)) {
2593
pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
2594
ret = -EIO;
2595
goto teardown;
2596
}
2597
2598
return 0;
2599
2600
/*
2601
* The sequence for freeing resources from a partially initialized TD
2602
* varies based on where in the initialization flow failure occurred.
2603
* Simply use the full teardown and destroy, which naturally play nice
2604
* with partial initialization.
2605
*/
2606
teardown:
2607
/* Only free pages not yet added, so start at 'i' */
2608
for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2609
if (tdcs_pages[i]) {
2610
__free_page(tdcs_pages[i]);
2611
tdcs_pages[i] = NULL;
2612
}
2613
}
2614
if (!kvm_tdx->td.tdcs_pages)
2615
kfree(tdcs_pages);
2616
2617
tdx_mmu_release_hkid(kvm);
2618
tdx_reclaim_td_control_pages(kvm);
2619
2620
return ret;
2621
2622
free_packages:
2623
cpus_read_unlock();
2624
free_cpumask_var(packages);
2625
2626
free_tdcs:
2627
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2628
if (tdcs_pages[i])
2629
__free_page(tdcs_pages[i]);
2630
}
2631
kfree(tdcs_pages);
2632
kvm_tdx->td.tdcs_pages = NULL;
2633
2634
free_tdr:
2635
if (tdr_page)
2636
__free_page(tdr_page);
2637
kvm_tdx->td.tdr_page = 0;
2638
2639
free_hkid:
2640
tdx_hkid_free(kvm_tdx);
2641
2642
return ret;
2643
}
2644
2645
static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2646
u64 *data)
2647
{
2648
u64 err;
2649
2650
err = tdh_mng_rd(&tdx->td, field_id, data);
2651
2652
return err;
2653
}
2654
2655
#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2656
#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2657
2658
static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2659
bool sub_leaf_set, int *entry_index,
2660
struct kvm_cpuid_entry2 *out)
2661
{
2662
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2663
u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2664
u64 ebx_eax, edx_ecx;
2665
u64 err = 0;
2666
2667
if (sub_leaf > 0b1111111)
2668
return -EINVAL;
2669
2670
if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2671
return -EINVAL;
2672
2673
if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2674
sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2675
return -EINVAL;
2676
2677
/*
2678
* bit 23:17, RESERVED: reserved, must be 0;
2679
* bit 16, LEAF_31: leaf number bit 31;
2680
* bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2681
* implicitly 0;
2682
* bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
2683
* bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2684
* the SUBLEAF_6_0 is all-1.
2685
* sub-leaf bits 31:7 are implicitly 0;
2686
* bit 0, ELEMENT_I: Element index within field;
2687
*/
2688
field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2689
field_id |= (leaf & 0x7f) << 9;
2690
if (sub_leaf_set)
2691
field_id |= (sub_leaf & 0x7f) << 1;
2692
else
2693
field_id |= 0x1fe;
2694
2695
err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2696
if (err) //TODO check for specific errors
2697
goto err_out;
2698
2699
out->eax = (u32) ebx_eax;
2700
out->ebx = (u32) (ebx_eax >> 32);
2701
2702
field_id++;
2703
err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2704
/*
2705
* It would be odd for reading edx_ecx to fail when reading ebx_eax
2706
* succeeded.
2707
*/
2708
if (WARN_ON_ONCE(err))
2709
goto err_out;
2710
2711
out->ecx = (u32) edx_ecx;
2712
out->edx = (u32) (edx_ecx >> 32);
2713
2714
out->function = leaf;
2715
out->index = sub_leaf;
2716
out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2717
2718
/*
2719
* Work around missing support on old TDX modules, fetch
2720
* guest maxpa from gfn_direct_bits.
2721
*/
2722
if (leaf == 0x80000008) {
2723
gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2724
unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2725
2726
out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2727
}
2728
2729
(*entry_index)++;
2730
2731
return 0;
2732
2733
err_out:
2734
out->eax = 0;
2735
out->ebx = 0;
2736
out->ecx = 0;
2737
out->edx = 0;
2738
2739
return -EIO;
2740
}
2741
2742
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2743
{
2744
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2745
struct kvm_tdx_init_vm *init_vm;
2746
struct td_params *td_params = NULL;
2747
int ret;
2748
2749
BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2750
BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2751
2752
if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2753
return -EINVAL;
2754
2755
if (cmd->flags)
2756
return -EINVAL;
2757
2758
init_vm = kmalloc(sizeof(*init_vm) +
2759
sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2760
GFP_KERNEL);
2761
if (!init_vm)
2762
return -ENOMEM;
2763
2764
if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2765
ret = -EFAULT;
2766
goto out;
2767
}
2768
2769
if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2770
ret = -E2BIG;
2771
goto out;
2772
}
2773
2774
if (copy_from_user(init_vm->cpuid.entries,
2775
u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2776
flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2777
ret = -EFAULT;
2778
goto out;
2779
}
2780
2781
if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2782
ret = -EINVAL;
2783
goto out;
2784
}
2785
2786
if (init_vm->cpuid.padding) {
2787
ret = -EINVAL;
2788
goto out;
2789
}
2790
2791
td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2792
if (!td_params) {
2793
ret = -ENOMEM;
2794
goto out;
2795
}
2796
2797
ret = setup_tdparams(kvm, td_params, init_vm);
2798
if (ret)
2799
goto out;
2800
2801
ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2802
if (ret)
2803
goto out;
2804
2805
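/*
 * Cache the TSC offset and multiplier chosen by the TDX module at
 * TDH.MNG.INIT; they are fixed for the lifetime of the TD.
 */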
kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2806
kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2807
kvm_tdx->attributes = td_params->attributes;
2808
kvm_tdx->xfam = td_params->xfam;
2809
2810
if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2811
kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2812
else
2813
kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2814
2815
kvm_tdx->state = TD_STATE_INITIALIZED;
2816
out:
2817
/* kfree() accepts NULL. */
2818
kfree(init_vm);
2819
kfree(td_params);
2820
2821
return ret;
2822
}
2823
2824
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2825
{
2826
/*
2827
* flush_tlb_current() is invoked the first time the vCPU runs or when the
2828
* root of the shared EPT is invalidated.
2829
* KVM only needs to flush the shared EPT because the TDX module handles TLB
2830
* invalidation for the private EPT in tdh_vp_enter().
2831
*
2832
* A single context invalidation for shared EPT can be performed here.
2833
* However, this single context invalidation requires the private EPTP
2834
* rather than the shared EPTP to flush shared EPT, as shared EPT uses
2835
* private EPTP as its ASID for TLB invalidation.
2836
*
2837
* To avoid reading back private EPTP, perform a global invalidation for
2838
* shared EPT instead to keep this function simple.
2839
*/
2840
ept_sync_global();
2841
}
2842
2843
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2844
{
2845
/*
2846
* TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2847
* ensure that private EPT will be flushed on the next TD enter. No need
2848
* to call tdx_track() here again even when this callback is a result of
2849
* zapping private EPT.
2850
*
2851
* Due to the lack of the context to determine which EPT has been
2852
* affected by zapping, invoke invept() directly here for both shared
2853
* EPT and private EPT for simplicity, though it's not necessary for
2854
* private EPT.
2855
*/
2856
ept_sync_global();
2857
}
2858
2859
static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2860
{
2861
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2862
2863
guard(mutex)(&kvm->slots_lock);
2864
2865
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2866
return -EINVAL;
2867
/*
2868
* Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
2869
* TDH.MEM.PAGE.ADD().
2870
*/
2871
if (atomic64_read(&kvm_tdx->nr_premapped))
2872
return -EINVAL;
2873
2874
cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2875
if (tdx_operand_busy(cmd->hw_error))
2876
return -EBUSY;
2877
if (KVM_BUG_ON(cmd->hw_error, kvm)) {
2878
pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
2879
return -EIO;
2880
}
2881
2882
kvm_tdx->state = TD_STATE_RUNNABLE;
2883
/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2884
smp_wmb();
2885
kvm->arch.pre_fault_allowed = true;
2886
return 0;
2887
}
2888
2889
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2890
{
2891
struct kvm_tdx_cmd tdx_cmd;
2892
int r;
2893
2894
if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
2895
return -EFAULT;
2896
2897
/*
2898
* Userspace should never set hw_error. It is used to fill
2899
* hardware-defined error by the kernel.
2900
*/
2901
if (tdx_cmd.hw_error)
2902
return -EINVAL;
2903
2904
mutex_lock(&kvm->lock);
2905
2906
switch (tdx_cmd.id) {
2907
case KVM_TDX_CAPABILITIES:
2908
r = tdx_get_capabilities(&tdx_cmd);
2909
break;
2910
case KVM_TDX_INIT_VM:
2911
r = tdx_td_init(kvm, &tdx_cmd);
2912
break;
2913
case KVM_TDX_FINALIZE_VM:
2914
r = tdx_td_finalize(kvm, &tdx_cmd);
2915
break;
2916
default:
2917
r = -EINVAL;
2918
goto out;
2919
}
2920
2921
if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2922
r = -EFAULT;
2923
2924
out:
2925
mutex_unlock(&kvm->lock);
2926
return r;
2927
}
2928
2929
/* The VMM can pass one 64-bit auxiliary value to the vCPU via RCX for the guest BIOS. */
2930
static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2931
{
2932
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2933
struct vcpu_tdx *tdx = to_tdx(vcpu);
2934
struct page *page;
2935
int ret, i;
2936
u64 err;
2937
2938
page = alloc_page(GFP_KERNEL);
2939
if (!page)
2940
return -ENOMEM;
2941
tdx->vp.tdvpr_page = page;
2942
2943
tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2944
GFP_KERNEL);
2945
if (!tdx->vp.tdcx_pages) {
2946
ret = -ENOMEM;
2947
goto free_tdvpr;
2948
}
2949
2950
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2951
page = alloc_page(GFP_KERNEL);
2952
if (!page) {
2953
ret = -ENOMEM;
2954
goto free_tdcx;
2955
}
2956
tdx->vp.tdcx_pages[i] = page;
2957
}
2958
2959
err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2960
if (KVM_BUG_ON(err, vcpu->kvm)) {
2961
ret = -EIO;
2962
pr_tdx_error(TDH_VP_CREATE, err);
2963
goto free_tdcx;
2964
}
2965
2966
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2967
err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2968
if (KVM_BUG_ON(err, vcpu->kvm)) {
2969
pr_tdx_error(TDH_VP_ADDCX, err);
2970
/*
2971
* Pages already added are reclaimed by the vcpu_free
2972
* method, but the rest are freed here.
2973
*/
2974
for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2975
__free_page(tdx->vp.tdcx_pages[i]);
2976
tdx->vp.tdcx_pages[i] = NULL;
2977
}
2978
return -EIO;
2979
}
2980
}
2981
2982
err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2983
if (KVM_BUG_ON(err, vcpu->kvm)) {
2984
pr_tdx_error(TDH_VP_INIT, err);
2985
return -EIO;
2986
}
2987
2988
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2989
2990
return 0;
2991
2992
free_tdcx:
2993
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2994
if (tdx->vp.tdcx_pages[i])
2995
__free_page(tdx->vp.tdcx_pages[i]);
2996
tdx->vp.tdcx_pages[i] = NULL;
2997
}
2998
kfree(tdx->vp.tdcx_pages);
2999
tdx->vp.tdcx_pages = NULL;
3000
3001
free_tdvpr:
3002
if (tdx->vp.tdvpr_page)
3003
__free_page(tdx->vp.tdvpr_page);
3004
tdx->vp.tdvpr_page = 0;
3005
3006
return ret;
3007
}
3008
3009
/* Sometimes reads multiple subleafs. Return how many entries were written. */
3010
static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
3011
struct kvm_cpuid_entry2 *output_e)
3012
{
3013
int sub_leaf = 0;
3014
int ret;
3015
3016
/* First try without a subleaf */
3017
ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
3018
3019
/* If success, or invalid leaf, just give up */
3020
if (ret != -EIO)
3021
return ret;
3022
3023
/*
3024
* If the try without a subleaf failed, try reading subleafs until
3025
* failure. The TDX module only supports 6 bits of subleaf index.
3026
*/
3027
while (1) {
3028
/* Keep reading subleafs until there is a failure. */
3029
if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3030
return !sub_leaf;
3031
3032
sub_leaf++;
3033
output_e++;
3034
}
3035
3036
return 0;
3037
}
3038
3039
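/*
 * KVM_TDX_GET_CPUID: report the CPUID values the TDX module virtualizes for
 * this vCPU.  Walk the basic range (leaf 0 up to its reported maximum) and
 * the extended range (0x80000000 up to its reported maximum), then copy the
 * collected entries back to userspace.
 */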
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3040
{
3041
struct kvm_cpuid2 __user *output, *td_cpuid;
3042
int r = 0, i = 0, leaf;
3043
u32 level;
3044
3045
output = u64_to_user_ptr(cmd->data);
3046
td_cpuid = kzalloc(sizeof(*td_cpuid) +
3047
sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3048
GFP_KERNEL);
3049
if (!td_cpuid)
3050
return -ENOMEM;
3051
3052
if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3053
r = -EFAULT;
3054
goto out;
3055
}
3056
3057
/* Read max CPUID for normal range */
3058
if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3059
r = -EIO;
3060
goto out;
3061
}
3062
level = td_cpuid->entries[0].eax;
3063
3064
for (leaf = 1; leaf <= level; leaf++)
3065
tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3066
3067
/* Read max CPUID for extended range */
3068
if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3069
r = -EIO;
3070
goto out;
3071
}
3072
level = td_cpuid->entries[i - 1].eax;
3073
3074
for (leaf = 0x80000001; leaf <= level; leaf++)
3075
tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3076
3077
if (td_cpuid->nent < i)
3078
r = -E2BIG;
3079
td_cpuid->nent = i;
3080
3081
if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3082
r = -EFAULT;
3083
goto out;
3084
}
3085
3086
if (r == -E2BIG)
3087
goto out;
3088
3089
if (copy_to_user(output->entries, td_cpuid->entries,
3090
td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3091
r = -EFAULT;
3092
3093
out:
3094
kfree(td_cpuid);
3095
3096
return r;
3097
}
3098
3099
static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3100
{
3101
u64 apic_base;
3102
struct vcpu_tdx *tdx = to_tdx(vcpu);
3103
int ret;
3104
3105
if (cmd->flags)
3106
return -EINVAL;
3107
3108
if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3109
return -EINVAL;
3110
3111
/*
3112
* TDX requires X2APIC; userspace is responsible for configuring guest
3113
* CPUID accordingly.
3114
*/
3115
apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3116
(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3117
if (kvm_apic_set_base(vcpu, apic_base, true))
3118
return -EINVAL;
3119
3120
ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3121
if (ret)
3122
return ret;
3123
3124
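/*
 * Enable posted interrupts for this vCPU: program the notification vector
 * and the PI descriptor address into the TD VMCS via the TDX module.
 */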
td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3125
td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3126
td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3127
3128
tdx->state = VCPU_TD_STATE_INITIALIZED;
3129
3130
return 0;
3131
}
3132
3133
void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3134
{
3135
/*
3136
* Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3137
* INIT events.
3138
*
3139
* Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3140
* userspace needs to define the vCPU model before KVM can initialize
3141
* vCPU state, e.g. to enable x2APIC.
3142
*/
3143
WARN_ON_ONCE(init_event);
3144
}
3145
3146
struct tdx_gmem_post_populate_arg {
3147
struct kvm_vcpu *vcpu;
3148
__u32 flags;
3149
};
3150
3151
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3152
void __user *src, int order, void *_arg)
3153
{
3154
u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
3155
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3156
struct tdx_gmem_post_populate_arg *arg = _arg;
3157
struct kvm_vcpu *vcpu = arg->vcpu;
3158
gpa_t gpa = gfn_to_gpa(gfn);
3159
u8 level = PG_LEVEL_4K;
3160
struct page *src_page;
3161
int ret, i;
3162
u64 err, entry, level_state;
3163
3164
/*
3165
* Get the source page if it has been faulted in. Return failure if the
3166
* source page has been swapped out or unmapped in primary memory.
3167
*/
3168
ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3169
if (ret < 0)
3170
return ret;
3171
if (ret != 1)
3172
return -ENOMEM;
3173
3174
ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
3175
if (ret < 0)
3176
goto out;
3177
3178
/*
3179
* The private mem cannot be zapped after kvm_tdp_map_page()
3180
* because all paths are covered by slots_lock and the
3181
* filemap invalidate lock. Check that they are indeed enough.
3182
*/
3183
if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
3184
scoped_guard(read_lock, &kvm->mmu_lock) {
3185
if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
3186
ret = -EIO;
3187
goto out;
3188
}
3189
}
3190
}
3191
3192
ret = 0;
3193
err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
3194
src_page, &entry, &level_state);
3195
if (err) {
3196
ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
3197
goto out;
3198
}
3199
3200
if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
3201
atomic64_dec(&kvm_tdx->nr_premapped);
3202
3203
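/*
 * If requested, extend the TD measurement (MRTD) with the contents of the
 * newly added page, in TDX_EXTENDMR_CHUNKSIZE chunks.
 */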
if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
3204
for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3205
err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
3206
&level_state);
3207
if (err) {
3208
ret = -EIO;
3209
break;
3210
}
3211
}
3212
}
3213
3214
out:
3215
put_page(src_page);
3216
return ret;
3217
}
3218
3219
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3220
{
3221
struct vcpu_tdx *tdx = to_tdx(vcpu);
3222
struct kvm *kvm = vcpu->kvm;
3223
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3224
struct kvm_tdx_init_mem_region region;
3225
struct tdx_gmem_post_populate_arg arg;
3226
long gmem_ret;
3227
int ret;
3228
3229
if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3230
return -EINVAL;
3231
3232
guard(mutex)(&kvm->slots_lock);
3233
3234
/* Once TD is finalized, the initial guest memory is fixed. */
3235
if (kvm_tdx->state == TD_STATE_RUNNABLE)
3236
return -EINVAL;
3237
3238
if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3239
return -EINVAL;
3240
3241
if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3242
return -EFAULT;
3243
3244
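/*
 * The region must be page aligned, non-empty, must not wrap, and must lie
 * entirely within the TD's private GPA space.
 */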
if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3245
!region.nr_pages ||
3246
region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3247
!vt_is_tdx_private_gpa(kvm, region.gpa) ||
3248
!vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3249
return -EINVAL;
3250
3251
kvm_mmu_reload(vcpu);
3252
ret = 0;
3253
while (region.nr_pages) {
3254
if (signal_pending(current)) {
3255
ret = -EINTR;
3256
break;
3257
}
3258
3259
arg = (struct tdx_gmem_post_populate_arg) {
3260
.vcpu = vcpu,
3261
.flags = cmd->flags,
3262
};
3263
gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3264
u64_to_user_ptr(region.source_addr),
3265
1, tdx_gmem_post_populate, &arg);
3266
if (gmem_ret < 0) {
3267
ret = gmem_ret;
3268
break;
3269
}
3270
3271
if (gmem_ret != 1) {
3272
ret = -EIO;
3273
break;
3274
}
3275
3276
region.source_addr += PAGE_SIZE;
3277
region.gpa += PAGE_SIZE;
3278
region.nr_pages--;
3279
3280
cond_resched();
3281
}
3282
3283
if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3284
ret = -EFAULT;
3285
return ret;
3286
}
3287
3288
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3289
{
3290
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3291
struct kvm_tdx_cmd cmd;
3292
int ret;
3293
3294
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3295
return -EINVAL;
3296
3297
if (copy_from_user(&cmd, argp, sizeof(cmd)))
3298
return -EFAULT;
3299
3300
if (cmd.hw_error)
3301
return -EINVAL;
3302
3303
switch (cmd.id) {
3304
case KVM_TDX_INIT_VCPU:
3305
ret = tdx_vcpu_init(vcpu, &cmd);
3306
break;
3307
case KVM_TDX_INIT_MEM_REGION:
3308
ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3309
break;
3310
case KVM_TDX_GET_CPUID:
3311
ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3312
break;
3313
default:
3314
ret = -EINVAL;
3315
break;
3316
}
3317
3318
return ret;
3319
}
3320
3321
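/* Huge pages are not yet supported for private memory; map at 4K only. */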
int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
3322
{
3323
return PG_LEVEL_4K;
3324
}
3325
3326
static int tdx_online_cpu(unsigned int cpu)
3327
{
3328
unsigned long flags;
3329
int r;
3330
3331
/* Sanity check CPU is already in post-VMXON */
3332
WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3333
3334
local_irq_save(flags);
3335
r = tdx_cpu_enable();
3336
local_irq_restore(flags);
3337
3338
return r;
3339
}
3340
3341
static int tdx_offline_cpu(unsigned int cpu)
3342
{
3343
int i;
3344
3345
/* No TD is running. Allow any cpu to be offline. */
3346
if (!atomic_read(&nr_configured_hkid))
3347
return 0;
3348
3349
/*
3350
* In order to reclaim a TDX HKID (i.e. when deleting a guest TD), KVM needs to
3351
* call TDH.PHYMEM.PAGE.WBINVD on all packages to program every memory
3352
* controller with pconfig. If there are active TDX HKIDs, refuse to
3353
* offline the last online cpu of a package.
3354
*/
3355
for_each_online_cpu(i) {
3356
/*
3357
* Found another online cpu on the same package.
3358
* Allow to offline.
3359
*/
3360
if (i != cpu && topology_physical_package_id(i) ==
3361
topology_physical_package_id(cpu))
3362
return 0;
3363
}
3364
3365
/*
3366
* This is the last cpu of this package. Don't offline it.
3367
*
3368
* Because it's hard for a human operator to understand the
3369
* reason, print a warning.
3370
*/
3371
#define MSG_ALLPKG_ONLINE \
3372
"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3373
pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3374
return -EBUSY;
3375
}
3376
3377
static void __do_tdx_cleanup(void)
3378
{
3379
/*
3380
* Once TDX module is initialized, it cannot be disabled and
3381
* re-initialized again w/o runtime update (which isn't
3382
* supported by kernel). Only need to remove the cpuhp here.
3383
* The TDX host core code tracks TDX status and can handle
3384
* 'multiple enabling' scenario.
3385
*/
3386
WARN_ON_ONCE(!tdx_cpuhp_state);
3387
cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3388
tdx_cpuhp_state = 0;
3389
}
3390
3391
static void __tdx_cleanup(void)
3392
{
3393
cpus_read_lock();
3394
__do_tdx_cleanup();
3395
cpus_read_unlock();
3396
}
3397
3398
static int __init __do_tdx_bringup(void)
3399
{
3400
int r;
3401
3402
/*
3403
* TDX-specific cpuhp callback to call tdx_cpu_enable() on all
3404
* online CPUs before calling tdx_enable(), and on any CPU that
3405
* subsequently comes online, to make sure it is ready to run TDX guests.
3406
*/
3407
r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3408
"kvm/cpu/tdx:online",
3409
tdx_online_cpu, tdx_offline_cpu);
3410
if (r < 0)
3411
return r;
3412
3413
tdx_cpuhp_state = r;
3414
3415
r = tdx_enable();
3416
if (r)
3417
__do_tdx_cleanup();
3418
3419
return r;
3420
}
3421
3422
static int __init __tdx_bringup(void)
3423
{
3424
const struct tdx_sys_info_td_conf *td_conf;
3425
int r, i;
3426
3427
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3428
/*
3429
* Check if MSRs (tdx_uret_msrs) can be saved/restored
3430
* before returning to user space.
3431
*
3432
* this_cpu_ptr(user_return_msrs)->registered isn't checked
3433
* because the registration is done at vcpu runtime by
3434
* tdx_user_return_msr_update_cache().
3435
*/
3436
tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3437
if (tdx_uret_msrs[i].slot == -1) {
3438
/* If any MSR isn't supported, it is a KVM bug */
3439
pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3440
tdx_uret_msrs[i].msr);
3441
return -EIO;
3442
}
3443
}
3444
3445
/*
3446
* Enabling TDX requires enabling hardware virtualization first,
3447
* as making SEAMCALLs requires CPU being in post-VMXON state.
3448
*/
3449
r = kvm_enable_virtualization();
3450
if (r)
3451
return r;
3452
3453
cpus_read_lock();
3454
r = __do_tdx_bringup();
3455
cpus_read_unlock();
3456
3457
if (r)
3458
goto tdx_bringup_err;
3459
3460
/* Get TDX global information for later use */
3461
tdx_sysinfo = tdx_get_sysinfo();
3462
if (WARN_ON_ONCE(!tdx_sysinfo)) {
3463
r = -EINVAL;
3464
goto get_sysinfo_err;
3465
}
3466
3467
/* Check TDX module and KVM capabilities */
3468
if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3469
!tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3470
goto get_sysinfo_err;
3471
3472
if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3473
goto get_sysinfo_err;
3474
3475
/*
3476
* TDX has its own limit of maximum vCPUs it can support for all
3477
* TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
3478
* query a TDX guest's maximum vCPUs by checking the KVM_CAP_MAX_VCPUS
3479
* extension on a per-VM basis.
3480
*
3481
* TDX module reports such limit via the MAX_VCPU_PER_TD global
3482
* metadata. Different modules may report different values.
3483
* Some old modules may also not support this metadata (in which
3484
* case this limit is U16_MAX).
3485
*
3486
* In practice, the reported value reflects the maximum logical
3487
* CPUs that ALL the platforms that the module supports can
3488
* possibly have.
3489
*
3490
* Simply forwarding the MAX_VCPU_PER_TD to userspace could
3491
* result in an unpredictable ABI. KVM instead always advertises
3492
* the number of logical CPUs the platform has as the maximum
3493
* vCPUs for TDX guests.
3494
*
3495
* Make sure MAX_VCPU_PER_TD reported by TDX module is not
3496
* smaller than the number of logical CPUs, otherwise KVM will
3497
* report an unsupported value to userspace.
3498
*
3499
* Note, a platform with TDX enabled in the BIOS cannot support
3500
* physical CPU hotplug, and TDX requires the BIOS has marked
3501
* all logical CPUs in MADT table as enabled. Just use
3502
* num_present_cpus() for the number of logical CPUs.
3503
*/
3504
td_conf = &tdx_sysinfo->td_conf;
3505
if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3506
pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3507
td_conf->max_vcpus_per_td, num_present_cpus());
3508
r = -EINVAL;
3509
goto get_sysinfo_err;
3510
}
3511
3512
if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
3513
r = -EINVAL;
3514
goto get_sysinfo_err;
3515
}
3516
3517
/*
3518
* Leave hardware virtualization enabled after TDX is enabled
3519
* successfully. TDX CPU hotplug depends on this.
3520
*/
3521
return 0;
3522
3523
get_sysinfo_err:
3524
__tdx_cleanup();
3525
tdx_bringup_err:
3526
kvm_disable_virtualization();
3527
return r;
3528
}
3529
3530
void tdx_cleanup(void)
3531
{
3532
if (enable_tdx) {
3533
misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3534
__tdx_cleanup();
3535
kvm_disable_virtualization();
3536
}
3537
}
3538
3539
int __init tdx_bringup(void)
3540
{
3541
int r, i;
3542
3543
/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3544
for_each_possible_cpu(i)
3545
INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3546
3547
if (!enable_tdx)
3548
return 0;
3549
3550
if (!enable_ept) {
3551
pr_err("EPT is required for TDX\n");
3552
goto success_disable_tdx;
3553
}
3554
3555
if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3556
pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
3557
goto success_disable_tdx;
3558
}
3559
3560
if (!enable_apicv) {
3561
pr_err("APICv is required for TDX\n");
3562
goto success_disable_tdx;
3563
}
3564
3565
if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
3566
pr_err("tdx: OSXSAVE is required for TDX\n");
3567
goto success_disable_tdx;
3568
}
3569
3570
if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
3571
pr_err("tdx: MOVDIR64B is required for TDX\n");
3572
goto success_disable_tdx;
3573
}
3574
3575
if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
3576
pr_err("Self-snoop is required for TDX\n");
3577
goto success_disable_tdx;
3578
}
3579
3580
if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3581
pr_err("tdx: no TDX private KeyIDs available\n");
3582
goto success_disable_tdx;
3583
}
3584
3585
if (!enable_virt_at_load) {
3586
pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
3587
goto success_disable_tdx;
3588
}
3589
3590
/*
3591
* Ideally KVM should probe whether the TDX module has been loaded
3592
* first and then try to bring it up. But TDX needs to use SEAMCALLs
3593
* to probe whether the module is loaded (there is no CPUID or MSR
3594
* for that), and making a SEAMCALL requires enabling virtualization
3595
* first, just like the rest of the steps of bringing up the TDX module.
3596
*
3597
* So, for simplicity do everything in __tdx_bringup(); the first
3598
* SEAMCALL will return -ENODEV when the module is not loaded. The
3599
* only complication is having to make sure that initialization
3600
* SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
3601
* cases.
3602
*/
3603
r = __tdx_bringup();
3604
if (r) {
3605
/*
3606
* Disable TDX, but don't fail loading the KVM module, if the TDX
3607
* module could not be loaded. No need to print a message saying
3608
* "module is not loaded" because it was printed when the first
3609
* SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
3610
* vm_size, as kvm_x86_ops have already been finalized (and are
3611
* intentionally not exported). The S-EPT code is unreachable,
3612
* and allocating a few more bytes per VM in a should-be-rare
3613
* failure scenario is a non-issue.
3614
*/
3615
if (r == -ENODEV)
3616
goto success_disable_tdx;
3617
3618
enable_tdx = 0;
3619
}
3620
3621
return r;
3622
3623
success_disable_tdx:
3624
enable_tdx = 0;
3625
return 0;
3626
}
3627
3628
void __init tdx_hardware_setup(void)
3629
{
3630
KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
3631
3632
/*
3633
* Note, if the TDX module can't be loaded, KVM TDX support will be
3634
* disabled but KVM will continue loading (see tdx_bringup()).
3635
*/
3636
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
3637
3638
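/*
 * Hook the external (S-EPT) page table operations so the TDP MMU manages
 * private memory through the TDX module.
 */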
vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
3639
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
3640
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
3641
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
3642
vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
3643
}
3644
3645