Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/vmx/tdx.c
52358 views
1
// SPDX-License-Identifier: GPL-2.0
2
#include <linux/cleanup.h>
3
#include <linux/cpu.h>
4
#include <asm/cpufeature.h>
5
#include <asm/fpu/xcr.h>
6
#include <linux/misc_cgroup.h>
7
#include <linux/mmu_context.h>
8
#include <asm/tdx.h>
9
#include "capabilities.h"
10
#include "mmu.h"
11
#include "x86_ops.h"
12
#include "lapic.h"
13
#include "tdx.h"
14
#include "vmx.h"
15
#include "mmu/spte.h"
16
#include "common.h"
17
#include "posted_intr.h"
18
#include "irq.h"
19
#include <trace/events/kvm.h>
20
#include "trace.h"
21
22
#pragma GCC poison to_vmx
23
24
#undef pr_fmt
25
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27
#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \
28
({ \
29
struct kvm *_kvm = (__kvm); \
30
bool __ret = !!(__err); \
31
\
32
if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \
33
if (_kvm) \
34
kvm_vm_bugged(_kvm); \
35
pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
36
__err, __args); \
37
} \
38
unlikely(__ret); \
39
})
40
41
#define TDX_BUG_ON(__err, __fn, __kvm) \
42
__TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")
43
44
#define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \
45
__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)
46
47
#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \
48
__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)
49
50
#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \
51
__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 ", 0x%llx, " #a3 " 0x%llx", \
52
a1, a2, a3)
53
54
55
bool enable_tdx __ro_after_init;
56
module_param_named(tdx, enable_tdx, bool, 0444);
57
58
#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
59
#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
60
61
static enum cpuhp_state tdx_cpuhp_state;
62
63
static const struct tdx_sys_info *tdx_sysinfo;
64
65
void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
66
{
67
KVM_BUG_ON(1, tdx->vcpu.kvm);
68
pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
69
}
70
71
void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
72
u64 val, u64 err)
73
{
74
KVM_BUG_ON(1, tdx->vcpu.kvm);
75
pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
76
}
77
78
#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
79
80
static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
81
{
82
return container_of(kvm, struct kvm_tdx, kvm);
83
}
84
85
static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
86
{
87
return container_of(vcpu, struct vcpu_tdx, vcpu);
88
}
89
90
static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
91
{
92
u64 val = KVM_SUPPORTED_TD_ATTRS;
93
94
if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
95
return 0;
96
97
val &= td_conf->attributes_fixed0;
98
99
return val;
100
}
101
102
static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
103
{
104
u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
105
106
if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
107
return 0;
108
109
val &= td_conf->xfam_fixed0;
110
111
return val;
112
}
113
114
static int tdx_get_guest_phys_addr_bits(const u32 eax)
115
{
116
return (eax & GENMASK(23, 16)) >> 16;
117
}
118
119
static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
120
{
121
return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
122
}
123
124
#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
125
126
static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
127
{
128
return entry->function == 7 && entry->index == 0 &&
129
(entry->ebx & TDX_FEATURE_TSX);
130
}
131
132
static void clear_tsx(struct kvm_cpuid_entry2 *entry)
133
{
134
entry->ebx &= ~TDX_FEATURE_TSX;
135
}
136
137
static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
138
{
139
return entry->function == 7 && entry->index == 0 &&
140
(entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
141
}
142
143
static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
144
{
145
entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
146
}
147
148
static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
149
{
150
if (has_tsx(entry))
151
clear_tsx(entry);
152
153
if (has_waitpkg(entry))
154
clear_waitpkg(entry);
155
}
156
157
static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
158
{
159
return has_tsx(entry) || has_waitpkg(entry);
160
}
161
162
#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
163
164
static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
165
{
166
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
167
168
entry->function = (u32)td_conf->cpuid_config_leaves[idx];
169
entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
170
entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
171
entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
172
entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
173
entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
174
175
if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
176
entry->index = 0;
177
178
/*
179
* The TDX module doesn't allow configuring the guest phys addr bits
180
* (EAX[23:16]). However, KVM uses it as an interface to the userspace
181
* to configure the GPAW. Report these bits as configurable.
182
*/
183
if (entry->function == 0x80000008)
184
entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
185
186
tdx_clear_unsupported_cpuid(entry);
187
}
188
189
#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
190
191
static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
192
struct kvm_tdx_capabilities *caps)
193
{
194
int i;
195
196
caps->supported_attrs = tdx_get_supported_attrs(td_conf);
197
if (!caps->supported_attrs)
198
return -EIO;
199
200
caps->supported_xfam = tdx_get_supported_xfam(td_conf);
201
if (!caps->supported_xfam)
202
return -EIO;
203
204
caps->cpuid.nent = td_conf->num_cpuid_config;
205
206
caps->user_tdvmcallinfo_1_r11 =
207
TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
208
209
for (i = 0; i < td_conf->num_cpuid_config; i++)
210
td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
211
212
return 0;
213
}
214
215
/*
216
* Some SEAMCALLs acquire the TDX module globally, and can fail with
217
* TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
218
*/
219
static DEFINE_MUTEX(tdx_lock);
220
221
static atomic_t nr_configured_hkid;
222
223
static bool tdx_operand_busy(u64 err)
224
{
225
return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
226
}
227
228
229
/*
230
* A per-CPU list of TD vCPUs associated with a given CPU.
231
* Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
232
* list.
233
* - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
234
* the old CPU during the IPI callback running on the old CPU, and then added
235
* to the per-CPU list of the new CPU.
236
* - When a TD is tearing down, all vCPUs are disassociated from their current
237
* running CPUs and removed from the per-CPU list during the IPI callback
238
* running on those CPUs.
239
* - When a CPU is brought down, traverse the per-CPU list to disassociate all
240
* associated TD vCPUs and remove them from the per-CPU list.
241
*/
242
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
243
244
static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
245
{
246
return to_tdx(vcpu)->vp_enter_args.r10;
247
}
248
249
static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
250
{
251
return to_tdx(vcpu)->vp_enter_args.r11;
252
}
253
254
static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
255
long val)
256
{
257
to_tdx(vcpu)->vp_enter_args.r10 = val;
258
}
259
260
static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
261
unsigned long val)
262
{
263
to_tdx(vcpu)->vp_enter_args.r11 = val;
264
}
265
266
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
267
{
268
tdx_guest_keyid_free(kvm_tdx->hkid);
269
kvm_tdx->hkid = -1;
270
atomic_dec(&nr_configured_hkid);
271
misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
272
put_misc_cg(kvm_tdx->misc_cg);
273
kvm_tdx->misc_cg = NULL;
274
}
275
276
static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
277
{
278
return kvm_tdx->hkid > 0;
279
}
280
281
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
282
{
283
lockdep_assert_irqs_disabled();
284
285
list_del(&to_tdx(vcpu)->cpu_list);
286
287
/*
288
* Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
289
* otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
290
* to its list before it's deleted from this CPU's list.
291
*/
292
smp_wmb();
293
294
vcpu->cpu = -1;
295
}
296
297
/*
298
* Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
299
* retry (if necessary) after forcing vCPUs to exit and wait for the operation
300
* to complete. All flows that remove/block S-EPT entries run with mmu_lock
301
* held for write, i.e. are mutually exclusive with each other, but they aren't
302
* mutually exclusive with running vCPUs, and so can fail with "operand busy"
303
* if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
304
*
305
* Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
306
*/
307
#define tdh_do_no_vcpus(tdh_func, kvm, args...) \
308
({ \
309
struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \
310
u64 __err; \
311
\
312
lockdep_assert_held_write(&kvm->mmu_lock); \
313
\
314
__err = tdh_func(args); \
315
if (unlikely(tdx_operand_busy(__err))) { \
316
WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \
317
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \
318
\
319
__err = tdh_func(args); \
320
\
321
WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \
322
} \
323
__err; \
324
})
325
326
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
327
static int __tdx_reclaim_page(struct page *page)
328
{
329
u64 err, rcx, rdx, r8;
330
331
err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
332
333
/*
334
* No need to check for TDX_OPERAND_BUSY; all TD pages are freed
335
* before the HKID is released and control pages have also been
336
* released at this point, so there is no possibility of contention.
337
*/
338
if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
339
return -EIO;
340
341
return 0;
342
}
343
344
static int tdx_reclaim_page(struct page *page)
345
{
346
int r;
347
348
r = __tdx_reclaim_page(page);
349
if (!r)
350
tdx_quirk_reset_page(page);
351
return r;
352
}
353
354
355
/*
356
* Reclaim the TD control page(s) which are crypto-protected by TDX guest's
357
* private KeyID. Assume the cache associated with the TDX private KeyID has
358
* been flushed.
359
*/
360
static void tdx_reclaim_control_page(struct page *ctrl_page)
361
{
362
/*
363
* Leak the page if the kernel failed to reclaim the page.
364
* The kernel cannot use it safely anymore.
365
*/
366
if (tdx_reclaim_page(ctrl_page))
367
return;
368
369
__free_page(ctrl_page);
370
}
371
372
struct tdx_flush_vp_arg {
373
struct kvm_vcpu *vcpu;
374
u64 err;
375
};
376
377
static void tdx_flush_vp(void *_arg)
378
{
379
struct tdx_flush_vp_arg *arg = _arg;
380
struct kvm_vcpu *vcpu = arg->vcpu;
381
u64 err;
382
383
arg->err = 0;
384
lockdep_assert_irqs_disabled();
385
386
/* Task migration can race with CPU offlining. */
387
if (unlikely(vcpu->cpu != raw_smp_processor_id()))
388
return;
389
390
/*
391
* No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
392
* list tracking still needs to be updated so that it's correct if/when
393
* the vCPU does get initialized.
394
*/
395
if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
396
/*
397
* No need to retry. TDX Resources needed for TDH.VP.FLUSH are:
398
* TDVPR as exclusive, TDR as shared, and TDCS as shared. This
399
* vp flush function is called when destructing vCPU/TD or vCPU
400
* migration. No other thread uses TDVPR in those cases.
401
*/
402
err = tdh_vp_flush(&to_tdx(vcpu)->vp);
403
if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
404
/*
405
* This function is called in IPI context. Do not use
406
* printk to avoid console semaphore.
407
* The caller prints out the error message, instead.
408
*/
409
if (err)
410
arg->err = err;
411
}
412
}
413
414
tdx_disassociate_vp(vcpu);
415
}
416
417
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
418
{
419
struct tdx_flush_vp_arg arg = {
420
.vcpu = vcpu,
421
};
422
int cpu = vcpu->cpu;
423
424
if (unlikely(cpu == -1))
425
return;
426
427
smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
428
429
TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
430
}
431
432
void tdx_disable_virtualization_cpu(void)
433
{
434
int cpu = raw_smp_processor_id();
435
struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
436
struct tdx_flush_vp_arg arg;
437
struct vcpu_tdx *tdx, *tmp;
438
unsigned long flags;
439
440
local_irq_save(flags);
441
/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
442
list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
443
arg.vcpu = &tdx->vcpu;
444
tdx_flush_vp(&arg);
445
}
446
local_irq_restore(flags);
447
448
/*
449
* Flush cache now if kexec is possible: this is necessary to avoid
450
* having dirty private memory cachelines when the new kernel boots,
451
* but WBINVD is a relatively expensive operation and doing it during
452
* kexec can exacerbate races in native_stop_other_cpus(). Do it
453
* now, since this is a safe moment and there is going to be no more
454
* TDX activity on this CPU from this point on.
455
*/
456
tdx_cpu_flush_cache_for_kexec();
457
}
458
459
#define TDX_SEAMCALL_RETRIES 10000
460
461
static void smp_func_do_phymem_cache_wb(void *unused)
462
{
463
u64 err = 0;
464
bool resume;
465
int i;
466
467
/*
468
* TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
469
* KeyID on the package or core. The TDX module may not finish the
470
* cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The
471
* kernel should retry it until it returns success w/o rescheduling.
472
*/
473
for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
474
resume = !!err;
475
err = tdh_phymem_cache_wb(resume);
476
switch (err) {
477
case TDX_INTERRUPTED_RESUMABLE:
478
continue;
479
case TDX_NO_HKID_READY_TO_WBCACHE:
480
err = TDX_SUCCESS; /* Already done by other thread */
481
fallthrough;
482
default:
483
goto out;
484
}
485
}
486
487
out:
488
TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
489
}
490
491
void tdx_mmu_release_hkid(struct kvm *kvm)
492
{
493
bool packages_allocated, targets_allocated;
494
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
495
cpumask_var_t packages, targets;
496
struct kvm_vcpu *vcpu;
497
unsigned long j;
498
int i;
499
u64 err;
500
501
if (!is_hkid_assigned(kvm_tdx))
502
return;
503
504
packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
505
targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
506
cpus_read_lock();
507
508
kvm_for_each_vcpu(j, vcpu, kvm)
509
tdx_flush_vp_on_cpu(vcpu);
510
511
/*
512
* TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
513
* and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
514
* Multiple TDX guests can be destroyed simultaneously. Take the
515
* mutex to prevent it from getting error.
516
*/
517
mutex_lock(&tdx_lock);
518
519
/*
520
* Releasing HKID is in vm_destroy().
521
* After the above flushing vps, there should be no more vCPU
522
* associations, as all vCPU fds have been released at this stage.
523
*/
524
err = tdh_mng_vpflushdone(&kvm_tdx->td);
525
if (err == TDX_FLUSHVP_NOT_DONE)
526
goto out;
527
if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
528
pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
529
kvm_tdx->hkid);
530
goto out;
531
}
532
533
for_each_online_cpu(i) {
534
if (packages_allocated &&
535
cpumask_test_and_set_cpu(topology_physical_package_id(i),
536
packages))
537
continue;
538
if (targets_allocated)
539
cpumask_set_cpu(i, targets);
540
}
541
if (targets_allocated)
542
on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
543
else
544
on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
545
/*
546
* In the case of error in smp_func_do_phymem_cache_wb(), the following
547
* tdh_mng_key_freeid() will fail.
548
*/
549
err = tdh_mng_key_freeid(&kvm_tdx->td);
550
if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
551
pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
552
kvm_tdx->hkid);
553
} else {
554
tdx_hkid_free(kvm_tdx);
555
}
556
557
out:
558
mutex_unlock(&tdx_lock);
559
cpus_read_unlock();
560
free_cpumask_var(targets);
561
free_cpumask_var(packages);
562
}
563
564
static void tdx_reclaim_td_control_pages(struct kvm *kvm)
565
{
566
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
567
u64 err;
568
int i;
569
570
/*
571
* tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong
572
* heavily with TDX module. Give up freeing TD pages. As the function
573
* already warned, don't warn it again.
574
*/
575
if (is_hkid_assigned(kvm_tdx))
576
return;
577
578
if (kvm_tdx->td.tdcs_pages) {
579
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
580
if (!kvm_tdx->td.tdcs_pages[i])
581
continue;
582
583
tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
584
}
585
kfree(kvm_tdx->td.tdcs_pages);
586
kvm_tdx->td.tdcs_pages = NULL;
587
}
588
589
if (!kvm_tdx->td.tdr_page)
590
return;
591
592
if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
593
return;
594
595
/*
596
* Use a SEAMCALL to ask the TDX module to flush the cache based on the
597
* KeyID. TDX module may access TDR while operating on TD (Especially
598
* when it is reclaiming TDCS).
599
*/
600
err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
601
if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
602
return;
603
604
tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
605
606
__free_page(kvm_tdx->td.tdr_page);
607
kvm_tdx->td.tdr_page = NULL;
608
}
609
610
void tdx_vm_destroy(struct kvm *kvm)
611
{
612
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
613
614
tdx_reclaim_td_control_pages(kvm);
615
616
kvm_tdx->state = TD_STATE_UNINITIALIZED;
617
}
618
619
static int tdx_do_tdh_mng_key_config(void *param)
620
{
621
struct kvm_tdx *kvm_tdx = param;
622
u64 err;
623
624
/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
625
err = tdh_mng_key_config(&kvm_tdx->td);
626
if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
627
return -EIO;
628
629
return 0;
630
}
631
632
int tdx_vm_init(struct kvm *kvm)
633
{
634
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
635
636
kvm->arch.has_protected_state = true;
637
/*
638
* TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
639
* i.e. all EOIs are accelerated and never trigger exits.
640
*/
641
kvm->arch.has_protected_eoi = true;
642
kvm->arch.has_private_mem = true;
643
kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
644
645
/*
646
* Because guest TD is protected, VMM can't parse the instruction in TD.
647
* Instead, guest uses MMIO hypercall. For unmodified device driver,
648
* #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
649
* instruction into MMIO hypercall.
650
*
651
* SPTE value for MMIO needs to be setup so that #VE is injected into
652
* TD instead of triggering EPT MISCONFIG.
653
* - RWX=0 so that EPT violation is triggered.
654
* - suppress #VE bit is cleared to inject #VE.
655
*/
656
kvm_mmu_set_mmio_spte_value(kvm, 0);
657
658
/*
659
* TDX has its own limit of maximum vCPUs it can support for all
660
* TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
661
* such limit via the MAX_VCPU_PER_TD global metadata. In
662
* practice, it reflects the number of logical CPUs that ALL
663
* platforms that the TDX module supports can possibly have.
664
*
665
* Limit TDX guest's maximum vCPUs to the number of logical CPUs
666
* the platform has. Simply forwarding the MAX_VCPU_PER_TD to
667
* userspace would result in an unpredictable ABI.
668
*/
669
kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
670
671
kvm_tdx->state = TD_STATE_UNINITIALIZED;
672
673
return 0;
674
}
675
676
int tdx_vcpu_create(struct kvm_vcpu *vcpu)
677
{
678
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
679
struct vcpu_tdx *tdx = to_tdx(vcpu);
680
681
if (kvm_tdx->state != TD_STATE_INITIALIZED)
682
return -EIO;
683
684
/*
685
* TDX module mandates APICv, which requires an in-kernel local APIC.
686
* Disallow an in-kernel I/O APIC, because level-triggered interrupts
687
* and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
688
*/
689
if (!irqchip_split(vcpu->kvm))
690
return -EINVAL;
691
692
fpstate_set_confidential(&vcpu->arch.guest_fpu);
693
vcpu->arch.apic->guest_apic_protected = true;
694
INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
695
696
vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
697
698
vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
699
vcpu->arch.cr0_guest_owned_bits = -1ul;
700
vcpu->arch.cr4_guest_owned_bits = -1ul;
701
702
/* KVM can't change TSC offset/multiplier as TDX module manages them. */
703
vcpu->arch.guest_tsc_protected = true;
704
vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
705
vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
706
vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
707
vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
708
709
vcpu->arch.guest_state_protected =
710
!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
711
712
if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
713
vcpu->arch.xfd_no_write_intercept = true;
714
715
tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
716
__pi_set_sn(&tdx->vt.pi_desc);
717
718
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
719
720
return 0;
721
}
722
723
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
724
{
725
struct vcpu_tdx *tdx = to_tdx(vcpu);
726
727
vmx_vcpu_pi_load(vcpu, cpu);
728
if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
729
return;
730
731
tdx_flush_vp_on_cpu(vcpu);
732
733
KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
734
local_irq_disable();
735
/*
736
* Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
737
* vcpu->cpu is read before tdx->cpu_list.
738
*/
739
smp_rmb();
740
741
list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
742
local_irq_enable();
743
}
744
745
bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
746
{
747
/*
748
* KVM can't get the interrupt status of TDX guest and it assumes
749
* interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
750
* which passes the interrupt blocked flag.
751
*/
752
return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
753
!to_tdx(vcpu)->vp_enter_args.r12;
754
}
755
756
static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
757
{
758
u64 vcpu_state_details;
759
760
if (pi_has_pending_interrupt(vcpu))
761
return true;
762
763
/*
764
* Only check RVI pending for HALTED case with IRQ enabled.
765
* For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
766
* interrupt was pending before TD exit, then it _must_ be blocked,
767
* otherwise the interrupt would have been serviced at the instruction
768
* boundary.
769
*/
770
if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
771
to_tdx(vcpu)->vp_enter_args.r12)
772
return false;
773
774
vcpu_state_details =
775
td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
776
777
return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
778
}
779
780
struct tdx_uret_msr {
781
u32 msr;
782
unsigned int slot;
783
u64 defval;
784
};
785
786
static struct tdx_uret_msr tdx_uret_msrs[] = {
787
{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
788
{.msr = MSR_STAR,},
789
{.msr = MSR_LSTAR,},
790
{.msr = MSR_TSC_AUX,},
791
};
792
793
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
794
{
795
struct vcpu_vt *vt = to_vt(vcpu);
796
int i;
797
798
if (vt->guest_state_loaded)
799
return;
800
801
if (likely(is_64bit_mm(current->mm)))
802
vt->msr_host_kernel_gs_base = current->thread.gsbase;
803
else
804
vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
805
806
vt->guest_state_loaded = true;
807
808
/*
809
* Explicitly set user-return MSRs that are clobbered by the TDX-Module
810
* if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
811
* written by the TDX-Module. Don't rely on the TDX-Module to actually
812
* clobber the MSRs, as the contract is poorly defined and not upheld.
813
* E.g. the TDX-Module will synthesize an EPT Violation without doing
814
* VM-Enter if it suspects a zero-step attack, and never "restore" VMM
815
* state.
816
*/
817
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
818
kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
819
tdx_uret_msrs[i].defval, -1ull);
820
}
821
822
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
823
{
824
struct vcpu_vt *vt = to_vt(vcpu);
825
826
if (!vt->guest_state_loaded)
827
return;
828
829
++vcpu->stat.host_state_reload;
830
wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
831
832
vt->guest_state_loaded = false;
833
}
834
835
void tdx_vcpu_put(struct kvm_vcpu *vcpu)
836
{
837
vmx_vcpu_pi_put(vcpu);
838
tdx_prepare_switch_to_host(vcpu);
839
}
840
841
/*
842
* Life cycles for a TD and a vCPU:
843
* 1. KVM_CREATE_VM ioctl.
844
* TD state is TD_STATE_UNINITIALIZED.
845
* hkid is not assigned at this stage.
846
* 2. KVM_TDX_INIT_VM ioctl.
847
* TD transitions to TD_STATE_INITIALIZED.
848
* hkid is assigned after this stage.
849
* 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
850
* 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
851
* 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
852
* 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
853
* kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
854
* 4. KVM_TDX_INIT_VCPU ioctl.
855
* tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
856
* vCPU control structures are allocated at this stage.
857
* 5. kvm_destroy_vm().
858
* 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
859
* (2) puts hkid to !assigned state.
860
* 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
861
* transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
862
* 5.3 tdx_vm_destroy()
863
* transitions TD to TD_STATE_UNINITIALIZED state.
864
*
865
* tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
866
* - If at 3.3, hkid is still assigned, but the vCPU must be in
867
* VCPU_TD_STATE_UNINITIALIZED state.
868
* - if at 5.2, hkid must be !assigned and all vCPUs must be in
869
* VCPU_TD_STATE_INITIALIZED state and have been dissociated.
870
*/
871
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
872
{
873
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
874
struct vcpu_tdx *tdx = to_tdx(vcpu);
875
int i;
876
877
if (vcpu->cpu != -1) {
878
KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
879
tdx_flush_vp_on_cpu(vcpu);
880
return;
881
}
882
883
/*
884
* It is not possible to reclaim pages while hkid is assigned. It might
885
* be assigned if the TD VM is being destroyed but freeing hkid failed,
886
* in which case the pages are leaked.
887
*/
888
if (is_hkid_assigned(kvm_tdx))
889
return;
890
891
if (tdx->vp.tdcx_pages) {
892
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
893
if (tdx->vp.tdcx_pages[i])
894
tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
895
}
896
kfree(tdx->vp.tdcx_pages);
897
tdx->vp.tdcx_pages = NULL;
898
}
899
if (tdx->vp.tdvpr_page) {
900
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
901
tdx->vp.tdvpr_page = NULL;
902
tdx->vp.tdvpr_pa = 0;
903
}
904
905
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
906
}
907
908
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
909
{
910
if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
911
to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
912
return -EINVAL;
913
914
return 1;
915
}
916
917
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
918
{
919
switch (tdvmcall_leaf(vcpu)) {
920
case EXIT_REASON_CPUID:
921
case EXIT_REASON_HLT:
922
case EXIT_REASON_IO_INSTRUCTION:
923
case EXIT_REASON_MSR_READ:
924
case EXIT_REASON_MSR_WRITE:
925
return tdvmcall_leaf(vcpu);
926
case EXIT_REASON_EPT_VIOLATION:
927
return EXIT_REASON_EPT_MISCONFIG;
928
default:
929
break;
930
}
931
932
return EXIT_REASON_TDCALL;
933
}
934
935
static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
936
{
937
struct vcpu_tdx *tdx = to_tdx(vcpu);
938
u32 exit_reason;
939
940
switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
941
case TDX_SUCCESS:
942
case TDX_NON_RECOVERABLE_VCPU:
943
case TDX_NON_RECOVERABLE_TD:
944
case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
945
case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
946
break;
947
default:
948
return -1u;
949
}
950
951
exit_reason = tdx->vp_enter_ret;
952
953
switch (exit_reason) {
954
case EXIT_REASON_TDCALL:
955
if (tdvmcall_exit_type(vcpu))
956
return EXIT_REASON_VMCALL;
957
958
return tdcall_to_vmx_exit_reason(vcpu);
959
case EXIT_REASON_EPT_MISCONFIG:
960
/*
961
* Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
962
* non-instrumentable code with interrupts disabled.
963
*/
964
return -1u;
965
default:
966
break;
967
}
968
969
return exit_reason;
970
}
971
972
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
973
{
974
struct vcpu_tdx *tdx = to_tdx(vcpu);
975
struct vcpu_vt *vt = to_vt(vcpu);
976
977
guest_state_enter_irqoff();
978
979
tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
980
981
vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
982
983
vt->exit_qualification = tdx->vp_enter_args.rcx;
984
tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
985
tdx->exit_gpa = tdx->vp_enter_args.r8;
986
vt->exit_intr_info = tdx->vp_enter_args.r9;
987
988
vmx_handle_nmi(vcpu);
989
990
guest_state_exit_irqoff();
991
}
992
993
static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
994
{
995
return vmx_get_exit_reason(vcpu).failed_vmentry &&
996
vmx_get_exit_reason(vcpu).full != -1u;
997
}
998
999
static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
1000
{
1001
u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
1002
1003
/*
1004
* TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
1005
* or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
1006
*
1007
* When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
1008
* KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
1009
* vCPUs leaving fastpath so that interrupt can be enabled to ensure the
1010
* IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
1011
* EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
1012
* requester may be blocked endlessly.
1013
*/
1014
if (unlikely(tdx_operand_busy(vp_enter_ret)))
1015
return EXIT_FASTPATH_EXIT_HANDLED;
1016
1017
return EXIT_FASTPATH_NONE;
1018
}
1019
1020
#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
1021
BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
1022
BIT_ULL(VCPU_REGS_RAX) | \
1023
BIT_ULL(VCPU_REGS_RBX) | \
1024
BIT_ULL(VCPU_REGS_RCX) | \
1025
BIT_ULL(VCPU_REGS_RDX) | \
1026
BIT_ULL(VCPU_REGS_RBP) | \
1027
BIT_ULL(VCPU_REGS_RSI) | \
1028
BIT_ULL(VCPU_REGS_RDI) | \
1029
BIT_ULL(VCPU_REGS_R8) | \
1030
BIT_ULL(VCPU_REGS_R9) | \
1031
BIT_ULL(VCPU_REGS_R10) | \
1032
BIT_ULL(VCPU_REGS_R11) | \
1033
BIT_ULL(VCPU_REGS_R12) | \
1034
BIT_ULL(VCPU_REGS_R13) | \
1035
BIT_ULL(VCPU_REGS_R14) | \
1036
BIT_ULL(VCPU_REGS_R15))
1037
1038
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
1039
{
1040
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
1041
1042
/*
1043
* All TDX hosts support PKRU; but even if they didn't,
1044
* vcpu->arch.host_pkru would be 0 and the wrpkru would be
1045
* skipped.
1046
*/
1047
if (vcpu->arch.host_pkru != 0)
1048
wrpkru(vcpu->arch.host_pkru);
1049
1050
if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1051
xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1052
1053
/*
1054
* Likewise, even if a TDX hosts didn't support XSS both arms of
1055
* the comparison would be 0 and the wrmsrl would be skipped.
1056
*/
1057
if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1058
wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1059
}
1060
1061
#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1062
DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1063
DEBUGCTLMSR_FREEZE_IN_SMM)
1064
1065
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
1066
{
1067
struct vcpu_tdx *tdx = to_tdx(vcpu);
1068
struct vcpu_vt *vt = to_vt(vcpu);
1069
1070
/*
1071
* WARN if KVM wants to force an immediate exit, as the TDX module does
1072
* not guarantee entry into the guest, i.e. it's possible for KVM to
1073
* _think_ it completed entry to the guest and forced an immediate exit
1074
* without actually having done so. Luckily, KVM never needs to force
1075
* an immediate exit for TDX (KVM can't do direct event injection, so
1076
* just WARN and continue on.
1077
*/
1078
WARN_ON_ONCE(run_flags);
1079
1080
/*
1081
* Wait until retry of SEPT-zap-related SEAMCALL completes before
1082
* allowing vCPU entry to avoid contention with tdh_vp_enter() and
1083
* TDCALLs.
1084
*/
1085
if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1086
return EXIT_FASTPATH_EXIT_HANDLED;
1087
1088
trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
1089
1090
if (pi_test_on(&vt->pi_desc)) {
1091
apic->send_IPI_self(POSTED_INTR_VECTOR);
1092
1093
if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1094
APIC_VECTOR_MASK, &vt->pi_desc))
1095
kvm_wait_lapic_expire(vcpu);
1096
}
1097
1098
tdx_vcpu_enter_exit(vcpu);
1099
1100
if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
1101
update_debugctlmsr(vcpu->arch.host_debugctl);
1102
1103
tdx_load_host_xsave_state(vcpu);
1104
1105
vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1106
1107
if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1108
return EXIT_FASTPATH_NONE;
1109
1110
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1111
return EXIT_FASTPATH_NONE;
1112
1113
trace_kvm_exit(vcpu, KVM_ISA_VMX);
1114
1115
if (unlikely(tdx_failed_vmentry(vcpu)))
1116
return EXIT_FASTPATH_NONE;
1117
1118
return tdx_exit_handlers_fastpath(vcpu);
1119
}
1120
1121
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1122
{
1123
++vcpu->stat.nmi_injections;
1124
td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1125
/*
1126
* From KVM's perspective, NMI injection is completed right after
1127
* writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
1128
* the TDX module or not.
1129
*/
1130
vcpu->arch.nmi_injected = false;
1131
/*
1132
* TDX doesn't support KVM to request NMI window exit. If there is
1133
* still a pending vNMI, KVM is not able to inject it along with the
1134
* one pending in TDX module in a back-to-back way. Since the previous
1135
* vNMI is still pending in TDX module, i.e. it has not been delivered
1136
* to TDX guest yet, it's OK to collapse the pending vNMI into the
1137
* previous one. The guest is expected to handle all the NMI sources
1138
* when handling the first vNMI.
1139
*/
1140
vcpu->arch.nmi_pending = 0;
1141
}
1142
1143
static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1144
{
1145
u32 intr_info = vmx_get_intr_info(vcpu);
1146
1147
/*
1148
* Machine checks are handled by handle_exception_irqoff(), or by
1149
* tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1150
* VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
1151
*/
1152
if (is_nmi(intr_info) || is_machine_check(intr_info))
1153
return 1;
1154
1155
vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1156
vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1157
vcpu->run->ex.error_code = 0;
1158
1159
return 0;
1160
}
1161
1162
static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1163
{
1164
tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1165
return 1;
1166
}
1167
1168
static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1169
{
1170
kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1171
kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1172
kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1173
kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1174
kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1175
1176
return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1177
}
1178
1179
/*
1180
* Split into chunks and check interrupt pending between chunks. This allows
1181
* for timely injection of interrupts to prevent issues with guest lockup
1182
* detection.
1183
*/
1184
#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1185
static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1186
1187
static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1188
{
1189
struct vcpu_tdx *tdx = to_tdx(vcpu);
1190
1191
if (vcpu->run->hypercall.ret) {
1192
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1193
tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1194
return 1;
1195
}
1196
1197
tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1198
if (tdx->map_gpa_next >= tdx->map_gpa_end)
1199
return 1;
1200
1201
/*
1202
* Stop processing the remaining part if there is a pending interrupt,
1203
* which could be qualified to deliver. Skip checking pending RVI for
1204
* TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1205
*/
1206
if (kvm_vcpu_has_events(vcpu)) {
1207
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1208
tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1209
return 1;
1210
}
1211
1212
__tdx_map_gpa(tdx);
1213
return 0;
1214
}
1215
1216
static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1217
{
1218
u64 gpa = tdx->map_gpa_next;
1219
u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1220
1221
if (size > TDX_MAP_GPA_MAX_LEN)
1222
size = TDX_MAP_GPA_MAX_LEN;
1223
1224
tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
1225
tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
1226
/*
1227
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
1228
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
1229
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
1230
* vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
1231
*/
1232
tdx->vcpu.run->hypercall.ret = 0;
1233
tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1234
tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1235
tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1236
KVM_MAP_GPA_RANGE_ENCRYPTED :
1237
KVM_MAP_GPA_RANGE_DECRYPTED;
1238
tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
1239
1240
tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1241
}
1242
1243
static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1244
{
1245
struct vcpu_tdx *tdx = to_tdx(vcpu);
1246
u64 gpa = tdx->vp_enter_args.r12;
1247
u64 size = tdx->vp_enter_args.r13;
1248
u64 ret;
1249
1250
/*
1251
* Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1252
* userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1253
* bit set. This is a base call so it should always be supported, but
1254
* KVM has no way to ensure that userspace implements the GHCI correctly.
1255
* So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1256
* to the guest.
1257
*/
1258
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1259
ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1260
goto error;
1261
}
1262
1263
if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1264
!kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1265
(vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1266
vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1267
ret = TDVMCALL_STATUS_INVALID_OPERAND;
1268
goto error;
1269
}
1270
1271
if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1272
ret = TDVMCALL_STATUS_ALIGN_ERROR;
1273
goto error;
1274
}
1275
1276
tdx->map_gpa_end = gpa + size;
1277
tdx->map_gpa_next = gpa;
1278
1279
__tdx_map_gpa(tdx);
1280
return 0;
1281
1282
error:
1283
tdvmcall_set_return_code(vcpu, ret);
1284
tdx->vp_enter_args.r11 = gpa;
1285
return 1;
1286
}
1287
1288
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1289
{
1290
struct vcpu_tdx *tdx = to_tdx(vcpu);
1291
u64 *regs = vcpu->run->system_event.data;
1292
u64 *module_regs = &tdx->vp_enter_args.r8;
1293
int index = VCPU_REGS_RAX;
1294
1295
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1296
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1297
vcpu->run->system_event.ndata = 16;
1298
1299
/* Dump 16 general-purpose registers to userspace in ascending order. */
1300
regs[index++] = tdx->vp_enter_ret;
1301
regs[index++] = tdx->vp_enter_args.rcx;
1302
regs[index++] = tdx->vp_enter_args.rdx;
1303
regs[index++] = tdx->vp_enter_args.rbx;
1304
regs[index++] = 0;
1305
regs[index++] = 0;
1306
regs[index++] = tdx->vp_enter_args.rsi;
1307
regs[index] = tdx->vp_enter_args.rdi;
1308
for (index = 0; index < 8; index++)
1309
regs[VCPU_REGS_R8 + index] = module_regs[index];
1310
1311
return 0;
1312
}
1313
1314
static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1315
{
1316
u32 eax, ebx, ecx, edx;
1317
struct vcpu_tdx *tdx = to_tdx(vcpu);
1318
1319
/* EAX and ECX for cpuid is stored in R12 and R13. */
1320
eax = tdx->vp_enter_args.r12;
1321
ecx = tdx->vp_enter_args.r13;
1322
1323
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1324
1325
tdx->vp_enter_args.r12 = eax;
1326
tdx->vp_enter_args.r13 = ebx;
1327
tdx->vp_enter_args.r14 = ecx;
1328
tdx->vp_enter_args.r15 = edx;
1329
1330
return 1;
1331
}
1332
1333
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1334
{
1335
vcpu->arch.pio.count = 0;
1336
return 1;
1337
}
1338
1339
static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1340
{
1341
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1342
unsigned long val = 0;
1343
int ret;
1344
1345
ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1346
vcpu->arch.pio.port, &val, 1);
1347
1348
WARN_ON_ONCE(!ret);
1349
1350
tdvmcall_set_return_val(vcpu, val);
1351
1352
return 1;
1353
}
1354
1355
static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1356
{
1357
struct vcpu_tdx *tdx = to_tdx(vcpu);
1358
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1359
unsigned long val = 0;
1360
unsigned int port;
1361
u64 size, write;
1362
int ret;
1363
1364
++vcpu->stat.io_exits;
1365
1366
size = tdx->vp_enter_args.r12;
1367
write = tdx->vp_enter_args.r13;
1368
port = tdx->vp_enter_args.r14;
1369
1370
if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1371
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1372
return 1;
1373
}
1374
1375
if (write) {
1376
val = tdx->vp_enter_args.r15;
1377
ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1378
} else {
1379
ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1380
}
1381
1382
if (!ret)
1383
vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1384
tdx_complete_pio_in;
1385
else if (!write)
1386
tdvmcall_set_return_val(vcpu, val);
1387
1388
return ret;
1389
}
1390
1391
static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1392
{
1393
unsigned long val = 0;
1394
gpa_t gpa;
1395
int size;
1396
1397
gpa = vcpu->mmio_fragments[0].gpa;
1398
size = vcpu->mmio_fragments[0].len;
1399
1400
memcpy(&val, vcpu->run->mmio.data, size);
1401
tdvmcall_set_return_val(vcpu, val);
1402
trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1403
return 1;
1404
}
1405
1406
static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1407
unsigned long val)
1408
{
1409
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1410
trace_kvm_fast_mmio(gpa);
1411
return 0;
1412
}
1413
1414
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1415
if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1416
return -EOPNOTSUPP;
1417
1418
return 0;
1419
}
1420
1421
static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1422
{
1423
unsigned long val;
1424
1425
if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1426
return -EOPNOTSUPP;
1427
1428
tdvmcall_set_return_val(vcpu, val);
1429
trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1430
return 0;
1431
}
1432
1433
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1434
{
1435
struct vcpu_tdx *tdx = to_tdx(vcpu);
1436
int size, write, r;
1437
unsigned long val;
1438
gpa_t gpa;
1439
1440
size = tdx->vp_enter_args.r12;
1441
write = tdx->vp_enter_args.r13;
1442
gpa = tdx->vp_enter_args.r14;
1443
val = write ? tdx->vp_enter_args.r15 : 0;
1444
1445
if (size != 1 && size != 2 && size != 4 && size != 8)
1446
goto error;
1447
if (write != 0 && write != 1)
1448
goto error;
1449
1450
/*
1451
* TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
1452
* do MMIO emulation for private GPA.
1453
*/
1454
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1455
vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1456
goto error;
1457
1458
gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1459
1460
if (write)
1461
r = tdx_mmio_write(vcpu, gpa, size, val);
1462
else
1463
r = tdx_mmio_read(vcpu, gpa, size);
1464
if (!r)
1465
/* Kernel completed device emulation. */
1466
return 1;
1467
1468
/* Request the device emulation to userspace device model. */
1469
vcpu->mmio_is_write = write;
1470
if (!write)
1471
vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1472
1473
vcpu->run->mmio.phys_addr = gpa;
1474
vcpu->run->mmio.len = size;
1475
vcpu->run->mmio.is_write = write;
1476
vcpu->run->exit_reason = KVM_EXIT_MMIO;
1477
1478
if (write) {
1479
memcpy(vcpu->run->mmio.data, &val, size);
1480
} else {
1481
vcpu->mmio_fragments[0].gpa = gpa;
1482
vcpu->mmio_fragments[0].len = size;
1483
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1484
}
1485
return 0;
1486
1487
error:
1488
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1489
return 1;
1490
}
1491
1492
static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1493
{
1494
struct vcpu_tdx *tdx = to_tdx(vcpu);
1495
1496
tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1497
1498
/*
1499
* For now, there is no TDVMCALL beyond GHCI base API supported by KVM
1500
* directly without the support from userspace, just set the value
1501
* returned from userspace.
1502
*/
1503
tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1504
tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1505
tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1506
tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1507
1508
return 1;
1509
}
1510
1511
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1512
{
1513
struct vcpu_tdx *tdx = to_tdx(vcpu);
1514
1515
switch (tdx->vp_enter_args.r12) {
1516
case 0:
1517
tdx->vp_enter_args.r11 = 0;
1518
tdx->vp_enter_args.r12 = 0;
1519
tdx->vp_enter_args.r13 = 0;
1520
tdx->vp_enter_args.r14 = 0;
1521
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1522
return 1;
1523
case 1:
1524
vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1525
vcpu->run->exit_reason = KVM_EXIT_TDX;
1526
vcpu->run->tdx.flags = 0;
1527
vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1528
vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1529
vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1530
vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1531
vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1532
vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1533
vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1534
return 0;
1535
default:
1536
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1537
return 1;
1538
}
1539
}
1540
1541
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1542
{
1543
tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1544
return 1;
1545
}
1546
1547
static int tdx_get_quote(struct kvm_vcpu *vcpu)
1548
{
1549
struct vcpu_tdx *tdx = to_tdx(vcpu);
1550
u64 gpa = tdx->vp_enter_args.r12;
1551
u64 size = tdx->vp_enter_args.r13;
1552
1553
/* The gpa of buffer must have shared bit set. */
1554
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1555
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1556
return 1;
1557
}
1558
1559
vcpu->run->exit_reason = KVM_EXIT_TDX;
1560
vcpu->run->tdx.flags = 0;
1561
vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1562
vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1563
vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1564
vcpu->run->tdx.get_quote.size = size;
1565
1566
vcpu->arch.complete_userspace_io = tdx_complete_simple;
1567
1568
return 0;
1569
}
1570
1571
static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1572
{
1573
struct vcpu_tdx *tdx = to_tdx(vcpu);
1574
u64 vector = tdx->vp_enter_args.r12;
1575
1576
if (vector < 32 || vector > 255) {
1577
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1578
return 1;
1579
}
1580
1581
vcpu->run->exit_reason = KVM_EXIT_TDX;
1582
vcpu->run->tdx.flags = 0;
1583
vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1584
vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1585
vcpu->run->tdx.setup_event_notify.vector = vector;
1586
1587
vcpu->arch.complete_userspace_io = tdx_complete_simple;
1588
1589
return 0;
1590
}
1591
1592
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1593
{
1594
switch (tdvmcall_leaf(vcpu)) {
1595
case TDVMCALL_MAP_GPA:
1596
return tdx_map_gpa(vcpu);
1597
case TDVMCALL_REPORT_FATAL_ERROR:
1598
return tdx_report_fatal_error(vcpu);
1599
case TDVMCALL_GET_TD_VM_CALL_INFO:
1600
return tdx_get_td_vm_call_info(vcpu);
1601
case TDVMCALL_GET_QUOTE:
1602
return tdx_get_quote(vcpu);
1603
case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
1604
return tdx_setup_event_notify_interrupt(vcpu);
1605
default:
1606
break;
1607
}
1608
1609
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1610
return 1;
1611
}
1612
1613
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1614
{
1615
u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1616
TDX_SHARED_BIT_PWL_4;
1617
1618
if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1619
return;
1620
1621
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1622
}
1623
1624
static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
1625
kvm_pfn_t pfn)
1626
{
1627
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1628
u64 err, entry, level_state;
1629
gpa_t gpa = gfn_to_gpa(gfn);
1630
1631
lockdep_assert_held(&kvm->slots_lock);
1632
1633
if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
1634
KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
1635
return -EIO;
1636
1637
err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
1638
kvm_tdx->page_add_src, &entry, &level_state);
1639
if (unlikely(tdx_operand_busy(err)))
1640
return -EBUSY;
1641
1642
if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
1643
return -EIO;
1644
1645
return 0;
1646
}
1647
1648
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
1649
enum pg_level level, kvm_pfn_t pfn)
1650
{
1651
int tdx_level = pg_level_to_tdx_sept_level(level);
1652
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1653
struct page *page = pfn_to_page(pfn);
1654
gpa_t gpa = gfn_to_gpa(gfn);
1655
u64 entry, level_state;
1656
u64 err;
1657
1658
err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1659
if (unlikely(tdx_operand_busy(err)))
1660
return -EBUSY;
1661
1662
if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
1663
return -EIO;
1664
1665
return 0;
1666
}
1667
1668
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
1669
enum pg_level level, u64 mirror_spte)
1670
{
1671
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1672
kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
1673
1674
/* TODO: handle large pages. */
1675
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1676
return -EIO;
1677
1678
WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
1679
(mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
1680
1681
/*
1682
* Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
1683
* before kvm_tdx->state. Userspace must not be allowed to pre-fault
1684
* arbitrary memory until the initial memory image is finalized. Pairs
1685
* with the smp_wmb() in tdx_td_finalize().
1686
*/
1687
smp_rmb();
1688
1689
/*
1690
* If the TD isn't finalized/runnable, then userspace is initializing
1691
* the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
1692
*/
1693
if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1694
return tdx_mem_page_add(kvm, gfn, level, pfn);
1695
1696
return tdx_mem_page_aug(kvm, gfn, level, pfn);
1697
}
1698
1699
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
1700
enum pg_level level, void *private_spt)
1701
{
1702
int tdx_level = pg_level_to_tdx_sept_level(level);
1703
gpa_t gpa = gfn_to_gpa(gfn);
1704
struct page *page = virt_to_page(private_spt);
1705
u64 err, entry, level_state;
1706
1707
err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1708
&level_state);
1709
if (unlikely(tdx_operand_busy(err)))
1710
return -EBUSY;
1711
1712
if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
1713
return -EIO;
1714
1715
return 0;
1716
}
1717
1718
/*
1719
* Ensure shared and private EPTs to be flushed on all vCPUs.
1720
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
1721
* the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
1722
* running in guest mode with the value "N - 1".
1723
*
1724
* A successful execution of tdh_mem_track() ensures that vCPUs can only run in
1725
* guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
1726
* being increased to "N + 1".
1727
*
1728
* Kicking off all vCPUs after that further results in no vCPUs can run in guest
1729
* mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
1730
* to increase TD epoch to "N + 2").
1731
*
1732
* TDX module will flush EPT on the next TD enter and make vCPUs to run in
1733
* guest mode with TD epoch value "N + 1".
1734
*
1735
* kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
1736
* waiting empty IPI handler ack_kick().
1737
*
1738
* No action is required to the vCPUs being kicked off since the kicking off
1739
* occurs certainly after TD epoch increment and before the next
1740
* tdh_mem_track().
1741
*/
1742
static void tdx_track(struct kvm *kvm)
1743
{
1744
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1745
u64 err;
1746
1747
/* If TD isn't finalized, it's before any vcpu running. */
1748
if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1749
return;
1750
1751
/*
1752
* The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
1753
* mode must be serialized, as TDH.MEM.TRACK will fail if the previous
1754
* tracking epoch hasn't completed.
1755
*/
1756
lockdep_assert_held_write(&kvm->mmu_lock);
1757
1758
err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
1759
TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
1760
1761
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
1762
}
1763
1764
static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1765
enum pg_level level, void *private_spt)
1766
{
1767
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1768
1769
/*
1770
* free_external_spt() is only called after hkid is freed when TD is
1771
* tearing down.
1772
* KVM doesn't (yet) zap page table pages in mirror page table while
1773
* TD is active, though guest pages mapped in mirror page table could be
1774
* zapped during TD is active, e.g. for shared <-> private conversion
1775
* and slot move/deletion.
1776
*/
1777
if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1778
return -EIO;
1779
1780
/*
1781
* The HKID assigned to this TD was already freed and cache was
1782
* already flushed. We don't have to flush again.
1783
*/
1784
return tdx_reclaim_page(virt_to_page(private_spt));
1785
}
1786
1787
static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
1788
enum pg_level level, u64 mirror_spte)
1789
{
1790
struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
1791
int tdx_level = pg_level_to_tdx_sept_level(level);
1792
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1793
gpa_t gpa = gfn_to_gpa(gfn);
1794
u64 err, entry, level_state;
1795
1796
lockdep_assert_held_write(&kvm->mmu_lock);
1797
1798
/*
1799
* HKID is released after all private pages have been removed, and set
1800
* before any might be populated. Warn if zapping is attempted when
1801
* there can't be anything populated in the private EPT.
1802
*/
1803
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1804
return;
1805
1806
/* TODO: handle large pages. */
1807
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1808
return;
1809
1810
err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
1811
tdx_level, &entry, &level_state);
1812
if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
1813
return;
1814
1815
/*
* TDX requires TLB tracking before dropping a private page. Do it
* here, although it is also done later.
*/
tdx_track(kvm);

/*
* When zapping a private page, the write lock is held, so there is no
* race with other vCPUs' SEPT operations. Races with TDH.VP.ENTER (due
* to zero-step mitigation) and guest TDCALLs are still possible.
*/
1826
err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
1827
tdx_level, &entry, &level_state);
1828
if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
1829
return;
1830
1831
err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1832
if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
1833
return;
1834
1835
tdx_quirk_reset_page(page);
1836
}
1837
1838
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1839
int trig_mode, int vector)
1840
{
1841
struct kvm_vcpu *vcpu = apic->vcpu;
1842
struct vcpu_tdx *tdx = to_tdx(vcpu);
1843
1844
/* TDX supports only posted interrupts; there is no local APIC emulation. */
1845
__vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1846
1847
trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1848
}
1849
1850
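/*
* Check whether the EPT violation was against a PENDING (not yet ACCEPTed)
* private page, i.e. the guest accessed a page before accepting it, which
* the caller treats as a fatal guest error.
*/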
static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1851
{
1852
u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1853
u64 eq = vmx_get_exit_qual(vcpu);
1854
1855
if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1856
return false;
1857
1858
return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1859
}
1860
1861
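/*
* Handle an EPT violation TD exit. Private GPAs are always treated as write
* faults and retried locally to minimize TDH.VP.ENTER invocations (see the
* comment below); shared GPAs use the reported exit qualification.
*/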
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1862
{
1863
unsigned long exit_qual;
1864
gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1865
bool local_retry = false;
1866
int ret;
1867
1868
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1869
if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1870
pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1871
gpa, vcpu->vcpu_id);
1872
kvm_vm_dead(vcpu->kvm);
1873
return -EIO;
1874
}
1875
/*
1876
* Always treat SEPT violations as write faults. Ignore the
1877
* EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1878
* TD private pages are always RWX in the SEPT tables,
1879
* i.e. they're always mapped writable. Just as importantly,
1880
* treating SEPT violations as write faults is necessary to
1881
* avoid COW allocations, which will cause TDAUGPAGE failures
1882
* due to aliasing a single HPA to multiple GPAs.
1883
*/
1884
exit_qual = EPT_VIOLATION_ACC_WRITE;
1885
1886
/* Only private GPA triggers zero-step mitigation */
1887
local_retry = true;
1888
} else {
1889
exit_qual = vmx_get_exit_qual(vcpu);
1890
/*
* An EPT violation due to an instruction fetch should never be
* triggered from shared memory in a TDX guest. If such an EPT
* violation occurs, treat it as broken hardware.
*/
1895
if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1896
return -EIO;
1897
}
1898
1899
trace_kvm_page_fault(vcpu, gpa, exit_qual);
1900
1901
/*
1902
* To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1903
* mapping in TDX.
1904
*
1905
* KVM may return RET_PF_RETRY for private GPA due to
1906
* - contentions when atomically updating SPTEs of the mirror page table
1907
* - in-progress GFN invalidation or memslot removal.
1908
* - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1909
* caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1910
* or certain TDCALLs.
1911
*
1912
* If TDH.VP.ENTER is invoked more times than the threshold set by the
1913
* TDX module before KVM resolves the private GPA mapping, the TDX
1914
* module will activate zero-step mitigation during TDH.VP.ENTER. This
1915
* process acquires an SEPT tree lock in the TDX module, leading to
1916
* further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1917
* operations on other vCPUs.
1918
*
1919
* Breaking out of local retries for kvm_vcpu_has_events() is for
1920
* interrupt injection. kvm_vcpu_has_events() should not see pending
1921
* events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1922
* blocked by TDs, false positives are inevitable i.e., KVM may re-enter
1923
* the guest even if the IRQ/NMI can't be delivered.
1924
*
1925
* Note: even without breaking out of local retries, zero-step
1926
* mitigation may still occur due to
1927
* - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1928
* - a single RIP causing EPT violations for more GFNs than the
1929
* threshold count.
1930
* This is safe, as triggering zero-step mitigation only introduces
1931
* contentions to page installation SEAMCALLs on other vCPUs, which will
1932
* handle retries locally in their EPT violation handlers.
1933
*/
1934
while (1) {
1935
struct kvm_memory_slot *slot;
1936
1937
ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
1938
1939
if (ret != RET_PF_RETRY || !local_retry)
1940
break;
1941
1942
if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
1943
break;
1944
1945
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
1946
ret = -EIO;
1947
break;
1948
}
1949
1950
/*
1951
* Bail if the memslot is invalid, i.e. is being deleted, as
1952
* faulting in will never succeed and this task needs to drop
1953
* SRCU in order to let memslot deletion complete.
1954
*/
1955
slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
1956
if (slot && slot->flags & KVM_MEMSLOT_INVALID)
1957
break;
1958
1959
cond_resched();
1960
}
1961
return ret;
1962
}
1963
1964
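/*
* Completion callback for MSR accesses requested via TDVMCALL. On failure,
* tell the guest the operand was invalid; for RDMSR, return the value read
* (EDX:EAX) to the guest via the TDVMCALL return value.
*/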
int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
1965
{
1966
if (err) {
1967
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1968
return 1;
1969
}
1970
1971
if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
1972
tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
1973
1974
return 1;
1975
}
1976
1977
1978
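/*
* Main TD-exit handler: sanity check the TDH.VP.ENTER return code, report
* failed VM-entries and non-recoverable errors to userspace, and dispatch
* the remaining exit reasons to their handlers.
*/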
int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
1979
{
1980
struct vcpu_tdx *tdx = to_tdx(vcpu);
1981
u64 vp_enter_ret = tdx->vp_enter_ret;
1982
union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
1983
1984
if (fastpath != EXIT_FASTPATH_NONE)
1985
return 1;
1986
1987
if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
1988
KVM_BUG_ON(1, vcpu->kvm);
1989
return -EIO;
1990
}
1991
1992
/*
1993
* Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
1994
* TDX_SEAMCALL_VMFAILINVALID.
1995
*/
1996
if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
1997
KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
1998
goto unhandled_exit;
1999
}
2000
2001
if (unlikely(tdx_failed_vmentry(vcpu))) {
2002
/*
* If the guest state is protected, off-TD debug is not enabled,
* and thus TDX_NON_RECOVERABLE must be set.
*/
2006
WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2007
!(vp_enter_ret & TDX_NON_RECOVERABLE));
2008
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2009
vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2010
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2011
return 0;
2012
}
2013
2014
if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2015
exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2016
kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2017
goto unhandled_exit;
2018
}
2019
2020
WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2021
(vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2022
2023
switch (exit_reason.basic) {
2024
case EXIT_REASON_TRIPLE_FAULT:
2025
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2026
vcpu->mmio_needed = 0;
2027
return 0;
2028
case EXIT_REASON_EXCEPTION_NMI:
2029
return tdx_handle_exception_nmi(vcpu);
2030
case EXIT_REASON_EXTERNAL_INTERRUPT:
2031
++vcpu->stat.irq_exits;
2032
return 1;
2033
case EXIT_REASON_CPUID:
2034
return tdx_emulate_cpuid(vcpu);
2035
case EXIT_REASON_HLT:
2036
return kvm_emulate_halt_noskip(vcpu);
2037
case EXIT_REASON_TDCALL:
2038
return handle_tdvmcall(vcpu);
2039
case EXIT_REASON_VMCALL:
2040
return tdx_emulate_vmcall(vcpu);
2041
case EXIT_REASON_IO_INSTRUCTION:
2042
return tdx_emulate_io(vcpu);
2043
case EXIT_REASON_MSR_READ:
2044
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2045
return kvm_emulate_rdmsr(vcpu);
2046
case EXIT_REASON_MSR_WRITE:
2047
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2048
kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2049
kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2050
return kvm_emulate_wrmsr(vcpu);
2051
case EXIT_REASON_EPT_MISCONFIG:
2052
return tdx_emulate_mmio(vcpu);
2053
case EXIT_REASON_EPT_VIOLATION:
2054
return tdx_handle_ept_violation(vcpu);
2055
case EXIT_REASON_OTHER_SMI:
2056
/*
* Unlike VMX, an SMI in SEAM non-root mode (i.e. while a TD guest
* vCPU is running) causes a VM exit to the TDX module, followed by a
* SEAMRET to KVM. Once it exits to KVM, the SMI is delivered and
* handled by the kernel handler right away.
*
* The Other SMI exit can also be caused by a SEAM non-root machine
* check delivered via a Machine Check System Management Interrupt
* (MSMI), but that has already been handled by the kernel machine
* check handler, i.e., the memory page has been marked as poisoned
* and won't be freed to the free list when the TDX guest is
* terminated (the TDX module marks the guest as dead and prevents it
* from running further when a machine check happens in SEAM non-root).
*
* - An MSMI will not reach here; it's handled as the non-recoverable
*   case above.
* - If it's not an MSMI, nothing needs to be done here.
*/
2075
return 1;
2076
default:
2077
break;
2078
}
2079
2080
unhandled_exit:
2081
kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
2082
return 0;
2083
}
2084
2085
void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2086
u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2087
{
2088
struct vcpu_tdx *tdx = to_tdx(vcpu);
2089
2090
*reason = tdx->vt.exit_reason.full;
2091
if (*reason != -1u) {
2092
*info1 = vmx_get_exit_qual(vcpu);
2093
*info2 = tdx->ext_exit_qualification;
2094
*intr_info = vmx_get_intr_info(vcpu);
2095
} else {
2096
*info1 = 0;
2097
*info2 = 0;
2098
*intr_info = 0;
2099
}
2100
2101
*error_code = 0;
2102
}
2103
2104
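/* Return true if KVM emulates accesses to the given MSR for TDX guests. */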
bool tdx_has_emulated_msr(u32 index)
2105
{
2106
switch (index) {
2107
case MSR_IA32_UCODE_REV:
2108
case MSR_IA32_ARCH_CAPABILITIES:
2109
case MSR_IA32_POWER_CTL:
2110
case MSR_IA32_CR_PAT:
2111
case MSR_MTRRcap:
2112
case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2113
case MSR_MTRRdefType:
2114
case MSR_IA32_TSC_DEADLINE:
2115
case MSR_IA32_MISC_ENABLE:
2116
case MSR_PLATFORM_INFO:
2117
case MSR_MISC_FEATURES_ENABLES:
2118
case MSR_IA32_APICBASE:
2119
case MSR_EFER:
2120
case MSR_IA32_FEAT_CTL:
2121
case MSR_IA32_MCG_CAP:
2122
case MSR_IA32_MCG_STATUS:
2123
case MSR_IA32_MCG_CTL:
2124
case MSR_IA32_MCG_EXT_CTL:
2125
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2126
case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2127
/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2128
case MSR_KVM_POLL_CONTROL:
2129
return true;
2130
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2131
/*
* x2APIC registers that are virtualized by the CPU can't be
* emulated; KVM doesn't have access to the virtual APIC page.
*/
2135
switch (index) {
2136
case X2APIC_MSR(APIC_TASKPRI):
2137
case X2APIC_MSR(APIC_PROCPRI):
2138
case X2APIC_MSR(APIC_EOI):
2139
case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2140
case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2141
case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2142
return false;
2143
default:
2144
return true;
2145
}
2146
default:
2147
return false;
2148
}
2149
}
2150
2151
static bool tdx_is_read_only_msr(u32 index)
2152
{
2153
return index == MSR_IA32_APICBASE || index == MSR_EFER ||
2154
index == MSR_IA32_FEAT_CTL;
2155
}
2156
2157
int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2158
{
2159
switch (msr->index) {
2160
case MSR_IA32_FEAT_CTL:
2161
/*
* MCE and MCA are advertised via CPUID. The guest kernel could
* check whether LMCE is enabled.
*/
2165
msr->data = FEAT_CTL_LOCKED;
2166
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2167
msr->data |= FEAT_CTL_LMCE_ENABLED;
2168
return 0;
2169
case MSR_IA32_MCG_EXT_CTL:
2170
if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2171
return 1;
2172
msr->data = vcpu->arch.mcg_ext_ctl;
2173
return 0;
2174
default:
2175
if (!tdx_has_emulated_msr(msr->index))
2176
return 1;
2177
2178
return kvm_get_msr_common(vcpu, msr);
2179
}
2180
}
2181
2182
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2183
{
2184
switch (msr->index) {
2185
case MSR_IA32_MCG_EXT_CTL:
2186
if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2187
(msr->data & ~MCG_EXT_CTL_LMCE_EN))
2188
return 1;
2189
vcpu->arch.mcg_ext_ctl = msr->data;
2190
return 0;
2191
default:
2192
if (tdx_is_read_only_msr(msr->index))
2193
return 1;
2194
2195
if (!tdx_has_emulated_msr(msr->index))
2196
return 1;
2197
2198
return kvm_set_msr_common(vcpu, msr);
2199
}
2200
}
2201
2202
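/*
* Handle KVM_TDX_CAPABILITIES: populate a kvm_tdx_capabilities structure
* from the TDX module's TD configuration (see init_kvm_tdx_caps()) and copy
* it to userspace.
*/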
static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2203
{
2204
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2205
struct kvm_tdx_capabilities __user *user_caps;
2206
struct kvm_tdx_capabilities *caps = NULL;
2207
u32 nr_user_entries;
2208
int ret = 0;
2209
2210
/* flags is reserved for future use */
2211
if (cmd->flags)
2212
return -EINVAL;
2213
2214
user_caps = u64_to_user_ptr(cmd->data);
2215
if (get_user(nr_user_entries, &user_caps->cpuid.nent))
2216
return -EFAULT;
2217
2218
if (nr_user_entries < td_conf->num_cpuid_config)
2219
return -E2BIG;
2220
2221
caps = kzalloc(struct_size(caps, cpuid.entries,
2222
td_conf->num_cpuid_config), GFP_KERNEL);
2223
if (!caps)
2224
return -ENOMEM;
2225
2226
ret = init_kvm_tdx_caps(td_conf, caps);
2227
if (ret)
2228
goto out;
2229
2230
if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
2231
caps->cpuid.nent))) {
2232
ret = -EFAULT;
2233
goto out;
2234
}
2235
2236
out:
2237
/* kfree() accepts NULL. */
2238
kfree(caps);
2239
return ret;
2240
}
2241
2242
/*
2243
* KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
2244
* similar to TDX's GPAW. Use this field as the interface for userspace to
2245
* configure the GPAW and EPT level for TDs.
2246
*
2247
* Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2248
* 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2249
* supported. Value 52 is only supported when the platform supports 5 level
2250
* EPT.
2251
*/
2252
static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2253
struct td_params *td_params)
2254
{
2255
const struct kvm_cpuid_entry2 *entry;
2256
int guest_pa;
2257
2258
entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2259
if (!entry)
2260
return -EINVAL;
2261
2262
guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2263
2264
if (guest_pa != 48 && guest_pa != 52)
2265
return -EINVAL;
2266
2267
if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2268
return -EINVAL;
2269
2270
td_params->eptp_controls = VMX_EPTP_MT_WB;
2271
if (guest_pa == 52) {
2272
td_params->eptp_controls |= VMX_EPTP_PWL_5;
2273
td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2274
} else {
2275
td_params->eptp_controls |= VMX_EPTP_PWL_4;
2276
}
2277
2278
return 0;
2279
}
2280
2281
static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2282
struct td_params *td_params)
2283
{
2284
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2285
const struct kvm_cpuid_entry2 *entry;
2286
struct tdx_cpuid_value *value;
2287
int i, copy_cnt = 0;
2288
2289
/*
* td_params.cpuid_values: the number and order of the cpuid_values
* entries must match those of struct tdsysinfo.{num_cpuid_config,
* cpuid_configs}. td_params is assumed to have been zeroed.
*/
2294
for (i = 0; i < td_conf->num_cpuid_config; i++) {
2295
struct kvm_cpuid_entry2 tmp;
2296
2297
td_init_cpuid_entry2(&tmp, i);
2298
2299
entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2300
tmp.function, tmp.index);
2301
if (!entry)
2302
continue;
2303
2304
if (tdx_unsupported_cpuid(entry))
2305
return -EINVAL;
2306
2307
copy_cnt++;
2308
2309
value = &td_params->cpuid_values[i];
2310
value->eax = entry->eax;
2311
value->ebx = entry->ebx;
2312
value->ecx = entry->ecx;
2313
value->edx = entry->edx;
2314
2315
/*
2316
* TDX module does not accept nonzero bits 16..23 for the
2317
* CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2318
*/
2319
if (tmp.function == 0x80000008)
2320
value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2321
}
2322
2323
/*
* Rely on the TDX module to reject invalid configurations, but it
* can't check leaves that don't have a proper slot in
* td_params->cpuid_values to stick to. So fail if there were entries
* that didn't get copied to td_params.
*/
2329
if (copy_cnt != cpuid->nent)
2330
return -EINVAL;
2331
2332
return 0;
2333
}
2334
2335
static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2336
struct kvm_tdx_init_vm *init_vm)
2337
{
2338
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2339
struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2340
int ret;
2341
2342
if (kvm->created_vcpus)
2343
return -EBUSY;
2344
2345
if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2346
return -EINVAL;
2347
2348
if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2349
return -EINVAL;
2350
2351
td_params->max_vcpus = kvm->max_vcpus;
2352
td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2353
td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2354
2355
td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2356
td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2357
2358
ret = setup_tdparams_eptp_controls(cpuid, td_params);
2359
if (ret)
2360
return ret;
2361
2362
ret = setup_tdparams_cpuids(cpuid, td_params);
2363
if (ret)
2364
return ret;
2365
2366
#define MEMCPY_SAME_SIZE(dst, src) \
2367
do { \
2368
BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
2369
memcpy((dst), (src), sizeof(dst)); \
2370
} while (0)
2371
2372
MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2373
MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2374
MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2375
2376
return 0;
2377
}
2378
2379
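/*
* Create the TD control structures: allocate an HKID and the TDR/TDCS pages,
* create the TD with TDH.MNG.CREATE, program the key on every package via
* TDH.MNG.KEY.CONFIG, add the TDCS pages, and initialize the TD with the
* given TD_PARAMS via TDH.MNG.INIT.
*/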
static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2380
u64 *seamcall_err)
2381
{
2382
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2383
cpumask_var_t packages;
2384
struct page **tdcs_pages = NULL;
2385
struct page *tdr_page;
2386
int ret, i;
2387
u64 err, rcx;
2388
2389
*seamcall_err = 0;
2390
ret = tdx_guest_keyid_alloc();
2391
if (ret < 0)
2392
return ret;
2393
kvm_tdx->hkid = ret;
2394
kvm_tdx->misc_cg = get_current_misc_cg();
2395
ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2396
if (ret)
2397
goto free_hkid;
2398
2399
ret = -ENOMEM;
2400
2401
atomic_inc(&nr_configured_hkid);
2402
2403
tdr_page = alloc_page(GFP_KERNEL);
2404
if (!tdr_page)
2405
goto free_hkid;
2406
2407
kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2408
/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2409
kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2410
tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2411
GFP_KERNEL);
2412
if (!tdcs_pages)
2413
goto free_tdr;
2414
2415
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2416
tdcs_pages[i] = alloc_page(GFP_KERNEL);
2417
if (!tdcs_pages[i])
2418
goto free_tdcs;
2419
}
2420
2421
if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2422
goto free_tdcs;
2423
2424
cpus_read_lock();
2425
2426
/*
* At least one CPU of each package must be online in order to program
* all packages with the host key ID. Check it.
*/
2430
for_each_present_cpu(i)
2431
cpumask_set_cpu(topology_physical_package_id(i), packages);
2432
for_each_online_cpu(i)
2433
cpumask_clear_cpu(topology_physical_package_id(i), packages);
2434
if (!cpumask_empty(packages)) {
2435
ret = -EIO;
2436
/*
* Because it's hard for a human operator to figure out the reason,
* warn about it.
*/
2440
#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
2441
pr_warn_ratelimited(MSG_ALLPKG);
2442
goto free_packages;
2443
}
2444
2445
/*
* TDH.MNG.CREATE tries to grab the global TDX module lock and fails
* with TDX_OPERAND_BUSY when it can't. Take the global lock to
* prevent that failure.
*/
2450
mutex_lock(&tdx_lock);
2451
kvm_tdx->td.tdr_page = tdr_page;
2452
err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2453
mutex_unlock(&tdx_lock);
2454
2455
if (err == TDX_RND_NO_ENTROPY) {
2456
ret = -EAGAIN;
2457
goto free_packages;
2458
}
2459
2460
if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
2461
ret = -EIO;
2462
goto free_packages;
2463
}
2464
2465
for_each_online_cpu(i) {
2466
int pkg = topology_physical_package_id(i);
2467
2468
if (cpumask_test_and_set_cpu(pkg, packages))
2469
continue;
2470
2471
/*
* Program the memory controller in the package with an encryption
* key associated with the TDX private host key ID assigned to this
* TDR. Concurrent operations on the same memory controller result in
* TDX_OPERAND_BUSY. No locking is needed beyond the cpus_read_lock()
* above, as it serializes against hotplug and the first online CPU
* of the package is always used. We never have two CPUs in the same
* socket trying to program the key.
*/
2481
ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2482
kvm_tdx, true);
2483
if (ret)
2484
break;
2485
}
2486
cpus_read_unlock();
2487
free_cpumask_var(packages);
2488
if (ret) {
2489
i = 0;
2490
goto teardown;
2491
}
2492
2493
kvm_tdx->td.tdcs_pages = tdcs_pages;
2494
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2495
err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2496
if (err == TDX_RND_NO_ENTROPY) {
2497
/* Here it's hard to allow userspace to retry. */
2498
ret = -EAGAIN;
2499
goto teardown;
2500
}
2501
if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
2502
ret = -EIO;
2503
goto teardown;
2504
}
2505
}
2506
2507
err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2508
if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2509
/*
* Because the operands come from the user, don't warn.
* Return a hint to the user because it's sometimes hard for the
* user to figure out which operand is invalid. The SEAMCALL status
* code encodes which operand caused the invalid-operand error.
*/
2515
*seamcall_err = err;
2516
ret = -EINVAL;
2517
goto teardown;
2518
} else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
2519
ret = -EIO;
2520
goto teardown;
2521
}
2522
2523
return 0;
2524
2525
/*
* The sequence for freeing resources from a partially initialized TD
* varies based on where in the initialization flow the failure
* occurred. Simply use the full teardown and destroy, which naturally
* plays nicely with partial initialization.
*/
2531
teardown:
2532
/* Only free pages not yet added, so start at 'i' */
2533
for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2534
if (tdcs_pages[i]) {
2535
__free_page(tdcs_pages[i]);
2536
tdcs_pages[i] = NULL;
2537
}
2538
}
2539
if (!kvm_tdx->td.tdcs_pages)
2540
kfree(tdcs_pages);
2541
2542
tdx_mmu_release_hkid(kvm);
2543
tdx_reclaim_td_control_pages(kvm);
2544
2545
return ret;
2546
2547
free_packages:
2548
cpus_read_unlock();
2549
free_cpumask_var(packages);
2550
2551
free_tdcs:
2552
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2553
if (tdcs_pages[i])
2554
__free_page(tdcs_pages[i]);
2555
}
2556
kfree(tdcs_pages);
2557
kvm_tdx->td.tdcs_pages = NULL;
2558
2559
free_tdr:
2560
if (tdr_page)
2561
__free_page(tdr_page);
2562
kvm_tdx->td.tdr_page = NULL;
2563
2564
free_hkid:
2565
tdx_hkid_free(kvm_tdx);
2566
2567
return ret;
2568
}
2569
2570
static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2571
u64 *data)
2572
{
2573
u64 err;
2574
2575
err = tdh_mng_rd(&tdx->td, field_id, data);
2576
2577
return err;
2578
}
2579
2580
#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2581
#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2582
2583
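/*
* Read one virtual CPUID leaf/sub-leaf of the TD from the CPUID_VALUES
* TD-scope metadata and convert it into a kvm_cpuid_entry2. The metadata is
* read 64 bits at a time: EBX:EAX first, then EDX:ECX.
*/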
static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2584
bool sub_leaf_set, int *entry_index,
2585
struct kvm_cpuid_entry2 *out)
2586
{
2587
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2588
u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2589
u64 ebx_eax, edx_ecx;
2590
u64 err = 0;
2591
2592
if (sub_leaf > 0b1111111)
2593
return -EINVAL;
2594
2595
if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2596
return -EINVAL;
2597
2598
if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2599
sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2600
return -EINVAL;
2601
2602
/*
* bit 23:17, RESERVED: reserved, must be 0;
* bit 16, LEAF_31: leaf number bit 31;
* bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
* implicitly 0;
* bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
* bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
* SUBLEAF_6_0 is all ones.
* Sub-leaf bits 31:7 are implicitly 0;
* bit 0, ELEMENT_I: element index within the field.
*/
2613
field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2614
field_id |= (leaf & 0x7f) << 9;
2615
if (sub_leaf_set)
2616
field_id |= (sub_leaf & 0x7f) << 1;
2617
else
2618
field_id |= 0x1fe;
2619
2620
err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2621
if (err) //TODO check for specific errors
2622
goto err_out;
2623
2624
out->eax = (u32) ebx_eax;
2625
out->ebx = (u32) (ebx_eax >> 32);
2626
2627
field_id++;
2628
err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2629
/*
2630
* It's weird that reading edx_ecx fails while reading ebx_eax
2631
* succeeded.
2632
*/
2633
if (WARN_ON_ONCE(err))
2634
goto err_out;
2635
2636
out->ecx = (u32) edx_ecx;
2637
out->edx = (u32) (edx_ecx >> 32);
2638
2639
out->function = leaf;
2640
out->index = sub_leaf;
2641
out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2642
2643
/*
* Work around missing support on old TDX modules: fetch the
* guest's maxpa from gfn_direct_bits.
*/
2647
if (leaf == 0x80000008) {
2648
gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2649
unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2650
2651
out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2652
}
2653
2654
(*entry_index)++;
2655
2656
return 0;
2657
2658
err_out:
2659
out->eax = 0;
2660
out->ebx = 0;
2661
out->ecx = 0;
2662
out->edx = 0;
2663
2664
return -EIO;
2665
}
2666
2667
typedef void *tdx_vm_state_guard_t;
2668
2669
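/*
* Take all locks needed to safely mutate VM-scope TD state: kvm->lock, every
* vcpu->mutex, and kvm->slots_lock. Fails with -EBUSY if vCPU creation is
* in-flight. Paired with tdx_release_vm_state_locks().
*/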
static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
2670
{
2671
int r;
2672
2673
mutex_lock(&kvm->lock);
2674
2675
if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
2676
r = -EBUSY;
2677
goto out_err;
2678
}
2679
2680
r = kvm_lock_all_vcpus(kvm);
2681
if (r)
2682
goto out_err;
2683
2684
/*
2685
* Note the unintuitive ordering! vcpu->mutex must be taken outside
2686
* kvm->slots_lock!
2687
*/
2688
mutex_lock(&kvm->slots_lock);
2689
return kvm;
2690
2691
out_err:
2692
mutex_unlock(&kvm->lock);
2693
return ERR_PTR(r);
2694
}
2695
2696
static void tdx_release_vm_state_locks(struct kvm *kvm)
2697
{
2698
mutex_unlock(&kvm->slots_lock);
2699
kvm_unlock_all_vcpus(kvm);
2700
mutex_unlock(&kvm->lock);
2701
}
2702
2703
DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
2704
if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
2705
tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
2706
2707
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2708
{
2709
struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
2710
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2711
struct kvm_tdx_init_vm *init_vm;
2712
struct td_params *td_params = NULL;
2713
u32 nr_user_entries;
2714
int ret;
2715
2716
BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2717
BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2718
2719
if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2720
return -EINVAL;
2721
2722
if (cmd->flags)
2723
return -EINVAL;
2724
2725
if (get_user(nr_user_entries, &user_data->cpuid.nent))
2726
return -EFAULT;
2727
2728
if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
2729
return -E2BIG;
2730
2731
init_vm = memdup_user(user_data,
2732
struct_size(user_data, cpuid.entries, nr_user_entries));
2733
if (IS_ERR(init_vm))
2734
return PTR_ERR(init_vm);
2735
2736
if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2737
ret = -EINVAL;
2738
goto out;
2739
}
2740
2741
if (init_vm->cpuid.padding) {
2742
ret = -EINVAL;
2743
goto out;
2744
}
2745
2746
td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2747
if (!td_params) {
2748
ret = -ENOMEM;
2749
goto out;
2750
}
2751
2752
ret = setup_tdparams(kvm, td_params, init_vm);
2753
if (ret)
2754
goto out;
2755
2756
ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2757
if (ret)
2758
goto out;
2759
2760
kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2761
kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2762
kvm_tdx->attributes = td_params->attributes;
2763
kvm_tdx->xfam = td_params->xfam;
2764
2765
if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2766
kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2767
else
2768
kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2769
2770
kvm_tdx->state = TD_STATE_INITIALIZED;
2771
out:
2772
/* kfree() accepts NULL. */
2773
kfree(init_vm);
2774
kfree(td_params);
2775
2776
return ret;
2777
}
2778
2779
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2780
{
2781
/*
* flush_tlb_current() is invoked the first time the vCPU runs or when
* the root of the shared EPT is invalidated.
* KVM only needs to flush the shared EPT because the TDX module handles
* TLB invalidation for the private EPT in tdh_vp_enter().
*
* A single-context invalidation for the shared EPT could be performed
* here. However, that single-context invalidation requires the private
* EPTP rather than the shared EPTP to flush the shared EPT, as the
* shared EPT uses the private EPTP as its ASID for TLB invalidation.
*
* To avoid reading back the private EPTP, perform a global invalidation
* for the shared EPT instead to keep this function simple.
*/
2795
ept_sync_global();
2796
}
2797
2798
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2799
{
2800
/*
2801
* TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2802
* ensure that private EPT will be flushed on the next TD enter. No need
2803
* to call tdx_track() here again even when this callback is a result of
2804
* zapping private EPT.
2805
*
2806
* Due to the lack of the context to determine which EPT has been
2807
* affected by zapping, invoke invept() directly here for both shared
2808
* EPT and private EPT for simplicity, though it's not necessary for
2809
* private EPT.
2810
*/
2811
ept_sync_global();
2812
}
2813
2814
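/*
* Handle KVM_TDX_FINALIZE_VM: finalize the TD's measurement with
* TDH.MR.FINALIZE and transition the TD to the runnable state.
*/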
static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2815
{
2816
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2817
2818
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2819
return -EINVAL;
2820
2821
cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2822
if (tdx_operand_busy(cmd->hw_error))
2823
return -EBUSY;
2824
if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
2825
return -EIO;
2826
2827
kvm_tdx->state = TD_STATE_RUNNABLE;
2828
/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2829
smp_wmb();
2830
kvm->arch.pre_fault_allowed = true;
2831
return 0;
2832
}
2833
2834
static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
2835
{
2836
if (copy_from_user(cmd, argp, sizeof(*cmd)))
2837
return -EFAULT;
2838
2839
/*
2840
* Userspace should never set hw_error. KVM writes hw_error to report
2841
* hardware-defined error back to userspace.
2842
*/
2843
if (cmd->hw_error)
2844
return -EINVAL;
2845
2846
return 0;
2847
}
2848
2849
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2850
{
2851
struct kvm_tdx_cmd tdx_cmd;
2852
int r;
2853
2854
r = tdx_get_cmd(argp, &tdx_cmd);
2855
if (r)
2856
return r;
2857
2858
if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
2859
return tdx_get_capabilities(&tdx_cmd);
2860
2861
CLASS(tdx_vm_state_guard, guard)(kvm);
2862
if (IS_ERR(guard))
2863
return PTR_ERR(guard);
2864
2865
switch (tdx_cmd.id) {
2866
case KVM_TDX_INIT_VM:
2867
r = tdx_td_init(kvm, &tdx_cmd);
2868
break;
2869
case KVM_TDX_FINALIZE_VM:
2870
r = tdx_td_finalize(kvm, &tdx_cmd);
2871
break;
2872
default:
2873
return -EINVAL;
2874
}
2875
2876
if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2877
return -EFAULT;
2878
2879
return r;
2880
}
2881
2882
/* The VMM can pass one 64-bit auxiliary value to the vCPU via RCX for the guest BIOS. */
2883
static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2884
{
2885
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2886
struct vcpu_tdx *tdx = to_tdx(vcpu);
2887
struct page *page;
2888
int ret, i;
2889
u64 err;
2890
2891
page = alloc_page(GFP_KERNEL);
2892
if (!page)
2893
return -ENOMEM;
2894
tdx->vp.tdvpr_page = page;
2895
2896
/*
2897
* page_to_phys() does not work in 'noinstr' code, like guest
2898
* entry via tdh_vp_enter(). Precalculate and store it instead
2899
* of doing it at runtime later.
2900
*/
2901
tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
2902
2903
tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2904
GFP_KERNEL);
2905
if (!tdx->vp.tdcx_pages) {
2906
ret = -ENOMEM;
2907
goto free_tdvpr;
2908
}
2909
2910
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2911
page = alloc_page(GFP_KERNEL);
2912
if (!page) {
2913
ret = -ENOMEM;
2914
goto free_tdcx;
2915
}
2916
tdx->vp.tdcx_pages[i] = page;
2917
}
2918
2919
err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2920
if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
2921
ret = -EIO;
2922
goto free_tdcx;
2923
}
2924
2925
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2926
err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2927
if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
2928
/*
2929
* Pages already added are reclaimed by the vcpu_free
2930
* method, but the rest are freed here.
2931
*/
2932
for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2933
__free_page(tdx->vp.tdcx_pages[i]);
2934
tdx->vp.tdcx_pages[i] = NULL;
2935
}
2936
return -EIO;
2937
}
2938
}
2939
2940
/*
2941
* tdh_vp_init() can take an exclusive lock of the TDR resource inside
2942
* the TDX-Module. The TDR resource is also taken as shared in several
2943
* no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
2944
* (TDX-Module locks are try-lock implementations with no slow path).
2945
* Take mmu_lock for write to reflect the nature of the lock taken by
2946
* the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
2947
* a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
2948
*/
2949
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
2950
err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2951
if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
2952
return -EIO;
2953
}
2954
2955
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2956
2957
return 0;
2958
2959
free_tdcx:
2960
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2961
if (tdx->vp.tdcx_pages[i])
2962
__free_page(tdx->vp.tdcx_pages[i]);
2963
tdx->vp.tdcx_pages[i] = NULL;
2964
}
2965
kfree(tdx->vp.tdcx_pages);
2966
tdx->vp.tdcx_pages = NULL;
2967
2968
free_tdvpr:
2969
if (tdx->vp.tdvpr_page)
2970
__free_page(tdx->vp.tdvpr_page);
2971
tdx->vp.tdvpr_page = NULL;
2972
tdx->vp.tdvpr_pa = 0;
2973
2974
return ret;
2975
}
2976
2977
/* Sometimes reads multiple subleafs. Return how many entries were written. */
2978
static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
2979
struct kvm_cpuid_entry2 *output_e)
2980
{
2981
int sub_leaf = 0;
2982
int ret;
2983
2984
/* First try without a subleaf */
2985
ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
2986
2987
/* If success, or invalid leaf, just give up */
2988
if (ret != -EIO)
2989
return ret;
2990
2991
/*
2992
* If the try without a subleaf failed, try reading subleafs until
2993
* failure. The TDX module only supports 6 bits of subleaf index.
2994
*/
2995
while (1) {
2996
/* Keep reading subleafs until there is a failure. */
2997
if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
2998
return !sub_leaf;
2999
3000
sub_leaf++;
3001
output_e++;
3002
}
3003
3004
return 0;
3005
}
3006
3007
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3008
{
3009
struct kvm_cpuid2 __user *output;
3010
struct kvm_cpuid2 *td_cpuid;
3011
int r = 0, i = 0, leaf;
3012
u32 level;
3013
3014
output = u64_to_user_ptr(cmd->data);
3015
td_cpuid = kzalloc(sizeof(*td_cpuid) +
3016
sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3017
GFP_KERNEL);
3018
if (!td_cpuid)
3019
return -ENOMEM;
3020
3021
if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3022
r = -EFAULT;
3023
goto out;
3024
}
3025
3026
/* Read max CPUID for normal range */
3027
if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3028
r = -EIO;
3029
goto out;
3030
}
3031
level = td_cpuid->entries[0].eax;
3032
3033
for (leaf = 1; leaf <= level; leaf++)
3034
tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3035
3036
/* Read max CPUID for extended range */
3037
if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3038
r = -EIO;
3039
goto out;
3040
}
3041
level = td_cpuid->entries[i - 1].eax;
3042
3043
for (leaf = 0x80000001; leaf <= level; leaf++)
3044
tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3045
3046
if (td_cpuid->nent < i)
3047
r = -E2BIG;
3048
td_cpuid->nent = i;
3049
3050
if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3051
r = -EFAULT;
3052
goto out;
3053
}
3054
3055
if (r == -E2BIG)
3056
goto out;
3057
3058
if (copy_to_user(output->entries, td_cpuid->entries,
3059
td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3060
r = -EFAULT;
3061
3062
out:
3063
kfree(td_cpuid);
3064
3065
return r;
3066
}
3067
3068
static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3069
{
3070
u64 apic_base;
3071
struct vcpu_tdx *tdx = to_tdx(vcpu);
3072
int ret;
3073
3074
if (cmd->flags)
3075
return -EINVAL;
3076
3077
if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3078
return -EINVAL;
3079
3080
/*
* TDX requires x2APIC; userspace is responsible for configuring the
* guest CPUID accordingly.
*/
3084
apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3085
(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3086
if (kvm_apic_set_base(vcpu, apic_base, true))
3087
return -EINVAL;
3088
3089
ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3090
if (ret)
3091
return ret;
3092
3093
td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3094
td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3095
td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3096
3097
tdx->state = VCPU_TD_STATE_INITIALIZED;
3098
3099
return 0;
3100
}
3101
3102
void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3103
{
3104
/*
3105
* Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3106
* INIT events.
3107
*
3108
* Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3109
* userspace needs to define the vCPU model before KVM can initialize
3110
* vCPU state, e.g. to enable x2APIC.
3111
*/
3112
WARN_ON_ONCE(init_event);
3113
}
3114
3115
struct tdx_gmem_post_populate_arg {
3116
struct kvm_vcpu *vcpu;
3117
__u32 flags;
3118
};
3119
3120
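/*
* kvm_gmem_populate() callback: stash the source page so the S-EPT mapping
* path can add it to the TD as initial memory, then optionally extend the
* TD measurement over the page with TDH.MR.EXTEND.
*/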
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3121
struct page *src_page, void *_arg)
3122
{
3123
struct tdx_gmem_post_populate_arg *arg = _arg;
3124
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3125
u64 err, entry, level_state;
3126
gpa_t gpa = gfn_to_gpa(gfn);
3127
int ret, i;
3128
3129
if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
3130
return -EIO;
3131
3132
if (!src_page)
3133
return -EOPNOTSUPP;
3134
3135
kvm_tdx->page_add_src = src_page;
3136
ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
3137
kvm_tdx->page_add_src = NULL;
3138
3139
if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
3140
return ret;
3141
3142
/*
3143
* Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
3144
* between mapping the pfn and now, but slots_lock prevents memslot
3145
* updates, filemap_invalidate_lock() prevents guest_memfd updates,
3146
* mmu_notifier events can't reach S-EPT entries, and KVM's internal
3147
* zapping flows are mutually exclusive with S-EPT mappings.
3148
*/
3149
for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3150
err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
3151
if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
3152
return -EIO;
3153
}
3154
3155
return 0;
3156
}
3157
3158
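/*
* Handle KVM_TDX_INIT_MEM_REGION: populate the TD's initial private memory
* from userspace, one page at a time, via kvm_gmem_populate() and
* tdx_gmem_post_populate(). Only allowed before the TD is finalized.
*/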
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3159
{
3160
struct vcpu_tdx *tdx = to_tdx(vcpu);
3161
struct kvm *kvm = vcpu->kvm;
3162
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3163
struct kvm_tdx_init_mem_region region;
3164
struct tdx_gmem_post_populate_arg arg;
3165
long gmem_ret;
3166
int ret;
3167
3168
if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3169
return -EINVAL;
3170
3171
/* Once TD is finalized, the initial guest memory is fixed. */
3172
if (kvm_tdx->state == TD_STATE_RUNNABLE)
3173
return -EINVAL;
3174
3175
if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3176
return -EINVAL;
3177
3178
if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3179
return -EFAULT;
3180
3181
if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3182
!region.nr_pages ||
3183
region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3184
!vt_is_tdx_private_gpa(kvm, region.gpa) ||
3185
!vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3186
return -EINVAL;
3187
3188
ret = 0;
3189
while (region.nr_pages) {
3190
if (signal_pending(current)) {
3191
ret = -EINTR;
3192
break;
3193
}
3194
3195
arg = (struct tdx_gmem_post_populate_arg) {
3196
.vcpu = vcpu,
3197
.flags = cmd->flags,
3198
};
3199
gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3200
u64_to_user_ptr(region.source_addr),
3201
1, tdx_gmem_post_populate, &arg);
3202
if (gmem_ret < 0) {
3203
ret = gmem_ret;
3204
break;
3205
}
3206
3207
if (gmem_ret != 1) {
3208
ret = -EIO;
3209
break;
3210
}
3211
3212
region.source_addr += PAGE_SIZE;
3213
region.gpa += PAGE_SIZE;
3214
region.nr_pages--;
3215
3216
cond_resched();
3217
}
3218
3219
if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3220
ret = -EFAULT;
3221
return ret;
3222
}
3223
3224
int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3225
{
3226
struct kvm *kvm = vcpu->kvm;
3227
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3228
struct kvm_tdx_cmd cmd;
3229
int r;
3230
3231
r = tdx_get_cmd(argp, &cmd);
3232
if (r)
3233
return r;
3234
3235
CLASS(tdx_vm_state_guard, guard)(kvm);
3236
if (IS_ERR(guard))
3237
return PTR_ERR(guard);
3238
3239
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3240
return -EINVAL;
3241
3242
vcpu_load(vcpu);
3243
3244
switch (cmd.id) {
3245
case KVM_TDX_INIT_MEM_REGION:
3246
r = tdx_vcpu_init_mem_region(vcpu, &cmd);
3247
break;
3248
case KVM_TDX_INIT_VCPU:
3249
r = tdx_vcpu_init(vcpu, &cmd);
3250
break;
3251
default:
3252
r = -ENOIOCTLCMD;
3253
break;
3254
}
3255
3256
vcpu_put(vcpu);
3257
3258
return r;
3259
}
3260
3261
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3262
{
3263
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3264
struct kvm_tdx_cmd cmd;
3265
int ret;
3266
3267
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3268
return -EINVAL;
3269
3270
ret = tdx_get_cmd(argp, &cmd);
3271
if (ret)
3272
return ret;
3273
3274
switch (cmd.id) {
3275
case KVM_TDX_GET_CPUID:
3276
ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3277
break;
3278
default:
3279
ret = -EINVAL;
3280
break;
3281
}
3282
3283
return ret;
3284
}
3285
3286
int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
3287
{
3288
if (!is_private)
3289
return 0;
3290
3291
return PG_LEVEL_4K;
3292
}
3293
3294
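/*
* CPU hotplug online callback: make the CPU ready to execute SEAMCALLs by
* running tdx_cpu_enable() with interrupts disabled. The CPU is expected to
* already be in post-VMXON state.
*/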
static int tdx_online_cpu(unsigned int cpu)
3295
{
3296
unsigned long flags;
3297
int r;
3298
3299
/* Sanity check CPU is already in post-VMXON */
3300
WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3301
3302
local_irq_save(flags);
3303
r = tdx_cpu_enable();
3304
local_irq_restore(flags);
3305
3306
return r;
3307
}
3308
3309
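/*
* CPU hotplug offline callback: refuse to offline the last online CPU of a
* package while any TDX HKID is configured, since reclaiming an HKID
* requires a CPU on every package.
*/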
static int tdx_offline_cpu(unsigned int cpu)
3310
{
3311
int i;
3312
3313
/* No TD is running. Allow any cpu to be offline. */
3314
if (!atomic_read(&nr_configured_hkid))
3315
return 0;
3316
3317
/*
* In order to reclaim a TDX HKID (i.e. when deleting a guest TD),
* TDH.PHYMEM.PAGE.WBINVD must be called on all packages to program all
* memory controllers with pconfig. If there is an active TDX HKID,
* refuse to offline the last online CPU of a package.
*/
3323
for_each_online_cpu(i) {
3324
/*
3325
* Found another online cpu on the same package.
3326
* Allow to offline.
3327
*/
3328
if (i != cpu && topology_physical_package_id(i) ==
3329
topology_physical_package_id(cpu))
3330
return 0;
3331
}
3332
3333
/*
* This is the last online CPU of this package. Don't offline it.
*
* Because it's hard for a human operator to understand the reason,
* warn about it.
*/
3339
#define MSG_ALLPKG_ONLINE \
3340
"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3341
pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3342
return -EBUSY;
3343
}
3344
3345
static void __do_tdx_cleanup(void)
3346
{
3347
/*
* Once the TDX module is initialized, it cannot be disabled and
* re-initialized without a runtime update (which isn't supported by
* the kernel). Only the cpuhp state needs to be removed here.
* The TDX host core code tracks the TDX status and can handle the
* 'multiple enabling' scenario.
*/
3354
WARN_ON_ONCE(!tdx_cpuhp_state);
3355
cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3356
tdx_cpuhp_state = 0;
3357
}
3358
3359
static void __tdx_cleanup(void)
3360
{
3361
cpus_read_lock();
3362
__do_tdx_cleanup();
3363
cpus_read_unlock();
3364
}
3365
3366
static int __init __do_tdx_bringup(void)
3367
{
3368
int r;
3369
3370
/*
* TDX-specific cpuhp callback to call tdx_cpu_enable() on all
* online CPUs before calling tdx_enable(), and on any CPU that is
* going online, to make sure it is ready to run TDX guests.
*/
3375
r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3376
"kvm/cpu/tdx:online",
3377
tdx_online_cpu, tdx_offline_cpu);
3378
if (r < 0)
3379
return r;
3380
3381
tdx_cpuhp_state = r;
3382
3383
r = tdx_enable();
3384
if (r)
3385
__do_tdx_cleanup();
3386
3387
return r;
3388
}
3389
3390
static int __init __tdx_bringup(void)
3391
{
3392
const struct tdx_sys_info_td_conf *td_conf;
3393
int r, i;
3394
3395
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3396
/*
3397
* Check if MSRs (tdx_uret_msrs) can be saved/restored
3398
* before returning to user space.
3399
*/
3400
tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3401
if (tdx_uret_msrs[i].slot == -1) {
3402
/* If any MSR isn't supported, it is a KVM bug */
3403
pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3404
tdx_uret_msrs[i].msr);
3405
return -EIO;
3406
}
3407
}
3408
3409
/*
* Enabling TDX requires enabling hardware virtualization first,
* as making SEAMCALLs requires the CPU to be in post-VMXON state.
*/
3413
r = kvm_enable_virtualization();
3414
if (r)
3415
return r;
3416
3417
cpus_read_lock();
3418
r = __do_tdx_bringup();
3419
cpus_read_unlock();
3420
3421
if (r)
3422
goto tdx_bringup_err;
3423
3424
r = -EINVAL;
3425
/* Get TDX global information for later use */
3426
tdx_sysinfo = tdx_get_sysinfo();
3427
if (WARN_ON_ONCE(!tdx_sysinfo))
3428
goto get_sysinfo_err;
3429
3430
/* Check TDX module and KVM capabilities */
3431
if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3432
!tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3433
goto get_sysinfo_err;
3434
3435
if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3436
goto get_sysinfo_err;
3437
3438
/*
3439
* TDX has its own limit of maximum vCPUs it can support for all
3440
* TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
3441
* query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
3442
* extension on per-VM basis.
3443
*
3444
* TDX module reports such limit via the MAX_VCPU_PER_TD global
3445
* metadata. Different modules may report different values.
* Some old modules may also not support this metadata (in which
* case this limit is U16_MAX).
3448
*
3449
* In practice, the reported value reflects the maximum logical
3450
* CPUs that ALL the platforms that the module supports can
3451
* possibly have.
3452
*
3453
* Simply forwarding MAX_VCPU_PER_TD to userspace could
* result in an unpredictable ABI. KVM instead always advertises
* the number of logical CPUs the platform has as the maximum
3456
* vCPUs for TDX guests.
3457
*
3458
* Make sure MAX_VCPU_PER_TD reported by TDX module is not
3459
* smaller than the number of logical CPUs, otherwise KVM will
3460
* report an unsupported value to userspace.
3461
*
3462
* Note, a platform with TDX enabled in the BIOS cannot support
* physical CPU hotplug, and TDX requires that the BIOS has marked
* all logical CPUs in the MADT table as enabled. Just use
* num_present_cpus() for the number of logical CPUs.
3466
*/
3467
td_conf = &tdx_sysinfo->td_conf;
3468
if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3469
pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3470
td_conf->max_vcpus_per_td, num_present_cpus());
3471
goto get_sysinfo_err;
3472
}
3473
3474
if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
3475
goto get_sysinfo_err;
3476
3477
/*
3478
* Leave hardware virtualization enabled after TDX is enabled
3479
* successfully. TDX CPU hotplug depends on this.
3480
*/
3481
return 0;
3482
3483
get_sysinfo_err:
3484
__tdx_cleanup();
3485
tdx_bringup_err:
3486
kvm_disable_virtualization();
3487
return r;
3488
}
3489
3490
void tdx_cleanup(void)
3491
{
3492
if (enable_tdx) {
3493
misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3494
__tdx_cleanup();
3495
kvm_disable_virtualization();
3496
}
3497
}
3498
3499
int __init tdx_bringup(void)
3500
{
3501
int r, i;
3502
3503
/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3504
for_each_possible_cpu(i)
3505
INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3506
3507
if (!enable_tdx)
3508
return 0;
3509
3510
if (!enable_ept) {
3511
pr_err("EPT is required for TDX\n");
3512
goto success_disable_tdx;
3513
}
3514
3515
if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3516
pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
3517
goto success_disable_tdx;
3518
}
3519
3520
if (!enable_apicv) {
3521
pr_err("APICv is required for TDX\n");
3522
goto success_disable_tdx;
3523
}
3524
3525
if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
3526
pr_err("tdx: OSXSAVE is required for TDX\n");
3527
goto success_disable_tdx;
3528
}
3529
3530
if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
3531
pr_err("tdx: MOVDIR64B is required for TDX\n");
3532
goto success_disable_tdx;
3533
}
3534
3535
if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
3536
pr_err("Self-snoop is required for TDX\n");
3537
goto success_disable_tdx;
3538
}
3539
3540
if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3541
pr_err("tdx: no TDX private KeyIDs available\n");
3542
goto success_disable_tdx;
3543
}
3544
3545
if (!enable_virt_at_load) {
3546
pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
3547
goto success_disable_tdx;
3548
}
3549
3550
/*
* Ideally KVM should probe whether the TDX module has been loaded
* first and then try to bring it up. But TDX needs to use a SEAMCALL
* to probe whether the module is loaded (there is no CPUID or MSR
* for that), and making a SEAMCALL requires enabling virtualization
* first, just like the rest of the steps of bringing up the TDX
* module.
*
* So, for simplicity, do everything in __tdx_bringup(); the first
* SEAMCALL will return -ENODEV when the module is not loaded. The
* only complication is having to make sure that initialization
* SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
* cases.
*/
3563
r = __tdx_bringup();
3564
if (r) {
3565
/*
3566
* Disable TDX only but don't fail to load module if the TDX
3567
* module could not be loaded. No need to print message saying
3568
* "module is not loaded" because it was printed when the first
3569
* SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
3570
* vm_size, as kvm_x86_ops have already been finalized (and are
3571
* intentionally not exported). The S-EPT code is unreachable,
3572
* and allocating a few more bytes per VM in a should-be-rare
3573
* failure scenario is a non-issue.
3574
*/
3575
if (r == -ENODEV)
3576
goto success_disable_tdx;
3577
3578
enable_tdx = 0;
3579
}
3580
3581
return r;
3582
3583
success_disable_tdx:
3584
enable_tdx = 0;
3585
return 0;
3586
}
3587
3588
void __init tdx_hardware_setup(void)
3589
{
3590
KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
3591
3592
/*
3593
* Note, if the TDX module can't be loaded, KVM TDX support will be
3594
* disabled but KVM will continue loading (see tdx_bringup()).
3595
*/
3596
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
3597
3598
vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
3599
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
3600
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
3601
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
3602
vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
3603
}
3604
3605