GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/virt/vmx/tdx/tdx.c
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright(c) 2023 Intel Corporation.
4
*
5
* Intel Trusted Domain Extensions (TDX) support
6
*/
7
8
#include "asm/page_types.h"
9
#define pr_fmt(fmt) "virt/tdx: " fmt
10
11
#include <linux/types.h>
12
#include <linux/cache.h>
13
#include <linux/init.h>
14
#include <linux/errno.h>
15
#include <linux/printk.h>
16
#include <linux/cpu.h>
17
#include <linux/spinlock.h>
18
#include <linux/percpu-defs.h>
19
#include <linux/mutex.h>
20
#include <linux/list.h>
21
#include <linux/memblock.h>
22
#include <linux/memory.h>
23
#include <linux/minmax.h>
24
#include <linux/sizes.h>
25
#include <linux/pfn.h>
26
#include <linux/align.h>
27
#include <linux/sort.h>
28
#include <linux/log2.h>
29
#include <linux/acpi.h>
30
#include <linux/suspend.h>
31
#include <linux/idr.h>
32
#include <linux/kvm_types.h>
33
#include <asm/page.h>
34
#include <asm/special_insns.h>
35
#include <asm/msr-index.h>
36
#include <asm/msr.h>
37
#include <asm/cpufeature.h>
38
#include <asm/tdx.h>
39
#include <asm/cpu_device_id.h>
40
#include <asm/processor.h>
41
#include <asm/mce.h>
42
#include "tdx.h"
43
44
static u32 tdx_global_keyid __ro_after_init;
45
static u32 tdx_guest_keyid_start __ro_after_init;
46
static u32 tdx_nr_guest_keyids __ro_after_init;
47
48
static DEFINE_IDA(tdx_guest_keyid_pool);
49
50
static DEFINE_PER_CPU(bool, tdx_lp_initialized);
51
52
static struct tdmr_info_list tdx_tdmr_list;
53
54
static enum tdx_module_status_t tdx_module_status;
55
static DEFINE_MUTEX(tdx_module_lock);
56
57
/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
58
static LIST_HEAD(tdx_memlist);
59
60
static struct tdx_sys_info tdx_sysinfo;
61
62
typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
63
64
static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
65
{
66
pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
67
}
68
69
static inline void seamcall_err_ret(u64 fn, u64 err,
70
struct tdx_module_args *args)
71
{
72
seamcall_err(fn, err, args);
73
pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
74
args->rcx, args->rdx, args->r8);
75
pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
76
args->r9, args->r10, args->r11);
77
}
78
79
static __always_inline int sc_retry_prerr(sc_func_t func,
80
sc_err_func_t err_func,
81
u64 fn, struct tdx_module_args *args)
82
{
83
u64 sret = sc_retry(func, fn, args);
84
85
if (sret == TDX_SUCCESS)
86
return 0;
87
88
if (sret == TDX_SEAMCALL_VMFAILINVALID)
89
return -ENODEV;
90
91
if (sret == TDX_SEAMCALL_GP)
92
return -EOPNOTSUPP;
93
94
if (sret == TDX_SEAMCALL_UD)
95
return -EACCES;
96
97
err_func(fn, sret, args);
98
return -EIO;
99
}
100
101
#define seamcall_prerr(__fn, __args) \
102
sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
103
104
#define seamcall_prerr_ret(__fn, __args) \
105
sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
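/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * how a SEAMCALL leaf is typically issued through these wrappers. Inputs
 * go in the tdx_module_args registers; the *_ret variant also copies the
 * output registers back into @args. TDH_SYS_RD and the field ID below are
 * only stand-ins for whatever leaf a caller actually needs.
 *
 *	struct tdx_module_args args = {};
 *	int ret;
 *
 *	args.rdx = field_id;			// leaf-specific input
 *	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
 *	if (ret)
 *		return ret;			// error already logged
 *	use(args.r8);				// leaf-specific output
 */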
106
107
/*
108
* Do the module global initialization once and return its result.
109
* It can be done on any cpu. It's always called with interrupts
110
* disabled.
111
*/
112
static int try_init_module_global(void)
113
{
114
struct tdx_module_args args = {};
115
static DEFINE_RAW_SPINLOCK(sysinit_lock);
116
static bool sysinit_done;
117
static int sysinit_ret;
118
119
lockdep_assert_irqs_disabled();
120
121
raw_spin_lock(&sysinit_lock);
122
123
if (sysinit_done)
124
goto out;
125
126
/* RCX is module attributes and all bits are reserved */
127
args.rcx = 0;
128
sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
129
130
/*
131
* The first SEAMCALL also detects the TDX module, thus
132
* it can fail due to the TDX module is not loaded.
133
* Dump message to let the user know.
134
*/
135
if (sysinit_ret == -ENODEV)
136
pr_err("module not loaded\n");
137
138
sysinit_done = true;
139
out:
140
raw_spin_unlock(&sysinit_lock);
141
return sysinit_ret;
142
}
143
144
/**
145
* tdx_cpu_enable - Enable TDX on local cpu
146
*
147
* Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
148
* global initialization SEAMCALL if not done) on local cpu to make this
149
* cpu ready to run any other SEAMCALLs.
150
*
151
* Always call this function via IPI function calls.
152
*
153
* Return 0 on success, otherwise errors.
154
*/
155
int tdx_cpu_enable(void)
156
{
157
struct tdx_module_args args = {};
158
int ret;
159
160
if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
161
return -ENODEV;
162
163
lockdep_assert_irqs_disabled();
164
165
if (__this_cpu_read(tdx_lp_initialized))
166
return 0;
167
168
/*
169
* The TDX module global initialization is the very first step
170
* to enable TDX. Need to do it first (if hasn't been done)
171
* before the per-cpu initialization.
172
*/
173
ret = try_init_module_global();
174
if (ret)
175
return ret;
176
177
ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
178
if (ret)
179
return ret;
180
181
__this_cpu_write(tdx_lp_initialized, true);
182
183
return 0;
184
}
185
EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable);
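/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * because tdx_cpu_enable() must run on the target CPU with interrupts
 * disabled, a caller such as KVM typically invokes it from an IPI
 * callback. The helper name below is hypothetical.
 *
 *	static void example_enable_cpu(void *failed)
 *	{
 *		if (tdx_cpu_enable())
 *			*(int *)failed = -EIO;
 *	}
 *
 *	// ... with CPU hotplug locked out by the caller:
 *	int failed = 0;
 *	on_each_cpu(example_enable_cpu, &failed, 1);
 */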
186
187
/*
188
* Add a memory region as a TDX memory block. The caller must make sure
189
* all memory regions are added in address ascending order and don't
190
* overlap.
191
*/
192
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
193
unsigned long end_pfn, int nid)
194
{
195
struct tdx_memblock *tmb;
196
197
tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
198
if (!tmb)
199
return -ENOMEM;
200
201
INIT_LIST_HEAD(&tmb->list);
202
tmb->start_pfn = start_pfn;
203
tmb->end_pfn = end_pfn;
204
tmb->nid = nid;
205
206
/* @tmb_list is protected by mem_hotplug_lock */
207
list_add_tail(&tmb->list, tmb_list);
208
return 0;
209
}
210
211
static void free_tdx_memlist(struct list_head *tmb_list)
212
{
213
/* @tmb_list is protected by mem_hotplug_lock */
214
while (!list_empty(tmb_list)) {
215
struct tdx_memblock *tmb = list_first_entry(tmb_list,
216
struct tdx_memblock, list);
217
218
list_del(&tmb->list);
219
kfree(tmb);
220
}
221
}
222
223
/*
224
* Ensure that all memblock memory regions are convertible to TDX
225
* memory. Once this has been established, stash the memblock
226
* ranges off in a secondary structure because memblock is modified
227
* in memory hotplug while TDX memory regions are fixed.
228
*/
229
static int build_tdx_memlist(struct list_head *tmb_list)
230
{
231
unsigned long start_pfn, end_pfn;
232
int i, nid, ret;
233
234
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
235
/*
236
* The first 1MB is not reported as TDX convertible memory.
237
* Although the first 1MB is always reserved and won't end up
238
* in the page allocator, it is still in memblock's memory
239
* regions. Skip it manually so it is not included as TDX memory.
240
*/
241
start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
242
if (start_pfn >= end_pfn)
243
continue;
244
245
/*
246
* Add the memory regions as TDX memory. The regions in
247
* memblock are already guaranteed to be in address
248
* ascending order and not to overlap.
249
*/
250
ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
251
if (ret)
252
goto err;
253
}
254
255
return 0;
256
err:
257
free_tdx_memlist(tmb_list);
258
return ret;
259
}
260
261
static int read_sys_metadata_field(u64 field_id, u64 *data)
262
{
263
struct tdx_module_args args = {};
264
int ret;
265
266
/*
267
* TDH.SYS.RD -- reads one global metadata field
268
* - RDX (in): the field to read
269
* - R8 (out): the field data
270
*/
271
args.rdx = field_id;
272
ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
273
if (ret)
274
return ret;
275
276
*data = args.r8;
277
278
return 0;
279
}
280
281
#include "tdx_global_metadata.c"
282
283
static int check_features(struct tdx_sys_info *sysinfo)
284
{
285
u64 tdx_features0 = sysinfo->features.tdx_features0;
286
287
if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
288
pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
289
return -EINVAL;
290
}
291
292
return 0;
293
}
294
295
/* Calculate the actual TDMR size */
296
static int tdmr_size_single(u16 max_reserved_per_tdmr)
297
{
298
int tdmr_sz;
299
300
/*
301
* The actual size of TDMR depends on the maximum
302
* number of reserved areas.
303
*/
304
tdmr_sz = sizeof(struct tdmr_info);
305
tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
306
307
return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
308
}
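/*
 * Worked example (editor's addition, with made-up sizes for illustration
 * only): if sizeof(struct tdmr_info) were 64 bytes, each reserved area
 * 16 bytes, and max_reserved_per_tdmr 16, the raw size would be
 * 64 + 16 * 16 = 320 bytes, which ALIGN() then rounds up to the next
 * multiple of TDMR_INFO_ALIGNMENT (e.g. 512 -> 512 bytes per TDMR).
 */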
309
310
static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
311
struct tdx_sys_info_tdmr *sysinfo_tdmr)
312
{
313
size_t tdmr_sz, tdmr_array_sz;
314
void *tdmr_array;
315
316
tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
317
tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;
318
319
/*
320
* To keep things simple, allocate all TDMRs together.
321
* The buffer needs to be physically contiguous to make
322
* sure each TDMR is physically contiguous.
323
*/
324
tdmr_array = alloc_pages_exact(tdmr_array_sz,
325
GFP_KERNEL | __GFP_ZERO);
326
if (!tdmr_array)
327
return -ENOMEM;
328
329
tdmr_list->tdmrs = tdmr_array;
330
331
/*
332
* Keep the size of TDMR to find the target TDMR
333
* at a given index in the TDMR list.
334
*/
335
tdmr_list->tdmr_sz = tdmr_sz;
336
tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
337
tdmr_list->nr_consumed_tdmrs = 0;
338
339
return 0;
340
}
341
342
static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
343
{
344
free_pages_exact(tdmr_list->tdmrs,
345
tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
346
}
347
348
/* Get the TDMR from the list at the given index. */
349
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
350
int idx)
351
{
352
int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
353
354
return (void *)tdmr_list->tdmrs + tdmr_info_offset;
355
}
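/*
 * Editor's note: the entries are tdmr_list->tdmr_sz bytes apart (a
 * runtime value), so plain array indexing on 'struct tdmr_info *' would
 * be wrong. For example, tdmr_entry(list, 2) resolves to
 * list->tdmrs + 2 * list->tdmr_sz.
 */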
356
357
#define TDMR_ALIGNMENT SZ_1G
358
#define TDMR_ALIGN_DOWN(_addr) ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
359
#define TDMR_ALIGN_UP(_addr) ALIGN((_addr), TDMR_ALIGNMENT)
360
361
static inline u64 tdmr_end(struct tdmr_info *tdmr)
362
{
363
return tdmr->base + tdmr->size;
364
}
365
366
/*
367
* Take the memory referenced in @tmb_list and populate the
368
* preallocated @tdmr_list, following all the special alignment
369
* and size rules for TDMR.
370
*/
371
static int fill_out_tdmrs(struct list_head *tmb_list,
372
struct tdmr_info_list *tdmr_list)
373
{
374
struct tdx_memblock *tmb;
375
int tdmr_idx = 0;
376
377
/*
378
* Loop over TDX memory regions and fill out TDMRs to cover them.
379
* To keep it simple, always try to use one TDMR to cover one
380
* memory region.
381
*
382
* In practice TDX supports at least 64 TDMRs. A 2-socket system
383
* typically consumes fewer than 10 of those. This code is
384
* dumb and simple and may use more TDMRs than is strictly
385
* required.
386
*/
387
list_for_each_entry(tmb, tmb_list, list) {
388
struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
389
u64 start, end;
390
391
start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
392
end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
393
394
/*
395
* A valid size indicates the current TDMR has already
396
* been filled out to cover the previous memory region(s).
397
*/
398
if (tdmr->size) {
399
/*
400
* Loop to the next if the current memory region
401
* has already been fully covered.
402
*/
403
if (end <= tdmr_end(tdmr))
404
continue;
405
406
/* Otherwise, skip the already covered part. */
407
if (start < tdmr_end(tdmr))
408
start = tdmr_end(tdmr);
409
410
/*
411
* Create a new TDMR to cover the current memory
412
* region, or the remaining part of it.
413
*/
414
tdmr_idx++;
415
if (tdmr_idx >= tdmr_list->max_tdmrs) {
416
pr_warn("initialization failed: TDMRs exhausted.\n");
417
return -ENOSPC;
418
}
419
420
tdmr = tdmr_entry(tdmr_list, tdmr_idx);
421
}
422
423
tdmr->base = start;
424
tdmr->size = end - start;
425
}
426
427
/* @tdmr_idx is always the index of the last valid TDMR. */
428
tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
429
430
/*
431
* Warn early that kernel is about to run out of TDMRs.
432
*
433
* This is an indication that TDMR allocation has to be
434
* reworked to be smarter to not run into an issue.
435
*/
436
if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
437
pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
438
tdmr_list->nr_consumed_tdmrs,
439
tdmr_list->max_tdmrs);
440
441
return 0;
442
}
443
444
/*
445
* Calculate PAMT size given a TDMR and a page size. The returned
446
* PAMT size is always aligned up to 4K page boundary.
447
*/
448
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
449
u16 pamt_entry_size)
450
{
451
unsigned long pamt_sz, nr_pamt_entries;
452
453
switch (pgsz) {
454
case TDX_PS_4K:
455
nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
456
break;
457
case TDX_PS_2M:
458
nr_pamt_entries = tdmr->size >> PMD_SHIFT;
459
break;
460
case TDX_PS_1G:
461
nr_pamt_entries = tdmr->size >> PUD_SHIFT;
462
break;
463
default:
464
WARN_ON_ONCE(1);
465
return 0;
466
}
467
468
pamt_sz = nr_pamt_entries * pamt_entry_size;
469
/* TDX requires the PAMT size to be 4K aligned */
470
pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
471
472
return pamt_sz;
473
}
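/*
 * Worked example (editor's addition, illustrative entry size only): for a
 * 1GB TDMR and 4K pages, nr_pamt_entries = 1GB >> 12 = 262144. With a
 * hypothetical 16-byte PAMT entry that is 262144 * 16 = 4MB, already
 * 4K-aligned, i.e. roughly 1/256th of the covered range.
 */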
474
475
/*
476
* Locate a NUMA node which should hold the allocation of the @tdmr
477
* PAMT. This node will have some memory covered by the TDMR. The
478
* relative amount of memory covered is not considered.
479
*/
480
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
481
{
482
struct tdx_memblock *tmb;
483
484
/*
485
* A TDMR must cover at least part of one TMB. That TMB will end
486
* after the TDMR begins. But, that TMB may have started before
487
* the TDMR. Find the next 'tmb' that _ends_ after this TDMR
488
* begins. Ignore 'tmb' start addresses. They are irrelevant.
489
*/
490
list_for_each_entry(tmb, tmb_list, list) {
491
if (tmb->end_pfn > PHYS_PFN(tdmr->base))
492
return tmb->nid;
493
}
494
495
/*
496
* Fall back to allocating the TDMR's metadata from node 0 when
497
* no TDX memory block can be found. This should never happen
498
* since TDMRs originate from TDX memory blocks.
499
*/
500
pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
501
tdmr->base, tdmr_end(tdmr));
502
return 0;
503
}
504
505
/*
506
* Allocate PAMTs from the local NUMA node of some memory in @tmb_list
507
* within @tdmr, and set up PAMTs for @tdmr.
508
*/
509
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
510
struct list_head *tmb_list,
511
u16 pamt_entry_size[])
512
{
513
unsigned long pamt_base[TDX_PS_NR];
514
unsigned long pamt_size[TDX_PS_NR];
515
unsigned long tdmr_pamt_base;
516
unsigned long tdmr_pamt_size;
517
struct page *pamt;
518
int pgsz, nid;
519
520
nid = tdmr_get_nid(tdmr, tmb_list);
521
522
/*
523
* Calculate the PAMT size for each TDX supported page size
524
* and the total PAMT size.
525
*/
526
tdmr_pamt_size = 0;
527
for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
528
pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
529
pamt_entry_size[pgsz]);
530
tdmr_pamt_size += pamt_size[pgsz];
531
}
532
533
/*
534
* Allocate one chunk of physically contiguous memory for all
535
* PAMTs. This helps minimize the PAMT's use of reserved areas
536
* in overlapped TDMRs.
537
*/
538
pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
539
nid, &node_online_map);
540
if (!pamt)
541
return -ENOMEM;
542
543
/*
544
* Break the contiguous allocation back up into the
545
* individual PAMTs for each page size.
546
*/
547
tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
548
for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
549
pamt_base[pgsz] = tdmr_pamt_base;
550
tdmr_pamt_base += pamt_size[pgsz];
551
}
552
553
tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
554
tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
555
tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
556
tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
557
tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
558
tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
559
560
return 0;
561
}
562
563
static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
564
unsigned long *pamt_size)
565
{
566
unsigned long pamt_bs, pamt_sz;
567
568
/*
569
* The PAMT was allocated in one contiguous unit. The 4K PAMT
570
* should always point to the beginning of that allocation.
571
*/
572
pamt_bs = tdmr->pamt_4k_base;
573
pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
574
575
WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
576
577
*pamt_base = pamt_bs;
578
*pamt_size = pamt_sz;
579
}
580
581
static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
582
void (*pamt_func)(unsigned long base, unsigned long size))
583
{
584
unsigned long pamt_base, pamt_size;
585
586
tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
587
588
/* Do nothing if PAMT hasn't been allocated for this TDMR */
589
if (!pamt_size)
590
return;
591
592
if (WARN_ON_ONCE(!pamt_base))
593
return;
594
595
pamt_func(pamt_base, pamt_size);
596
}
597
598
static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
599
{
600
free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
601
}
602
603
static void tdmr_free_pamt(struct tdmr_info *tdmr)
604
{
605
tdmr_do_pamt_func(tdmr, free_pamt);
606
}
607
608
static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
609
{
610
int i;
611
612
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
613
tdmr_free_pamt(tdmr_entry(tdmr_list, i));
614
}
615
616
/* Allocate and set up PAMTs for all TDMRs */
617
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
618
struct list_head *tmb_list,
619
u16 pamt_entry_size[])
620
{
621
int i, ret = 0;
622
623
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
624
ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
625
pamt_entry_size);
626
if (ret)
627
goto err;
628
}
629
630
return 0;
631
err:
632
tdmrs_free_pamt_all(tdmr_list);
633
return ret;
634
}
635
636
/*
637
* Convert TDX private pages back to normal by using MOVDIR64B to clear these
638
* pages. Typically, any write to the page will convert it from TDX private back
639
* to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to
640
* do the conversion explicitly via MOVDIR64B.
641
*/
642
static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
643
{
644
const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
645
unsigned long phys, end;
646
647
if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
648
return;
649
650
end = base + size;
651
for (phys = base; phys < end; phys += 64)
652
movdir64b(__va(phys), zero_page);
653
654
/*
655
* MOVDIR64B uses WC protocol. Use memory barrier to
656
* make sure any later user of these pages sees the
657
* updated data.
658
*/
659
mb();
660
}
661
662
void tdx_quirk_reset_page(struct page *page)
663
{
664
tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
665
}
666
EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page);
667
668
static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
669
{
670
tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr);
671
}
672
673
static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list)
674
{
675
int i;
676
677
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
678
tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i));
679
}
680
681
static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
682
{
683
unsigned long pamt_size = 0;
684
int i;
685
686
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
687
unsigned long base, size;
688
689
tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
690
pamt_size += size;
691
}
692
693
return pamt_size / 1024;
694
}
695
696
static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
697
u64 size, u16 max_reserved_per_tdmr)
698
{
699
struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
700
int idx = *p_idx;
701
702
/* Reserved area must be 4K aligned in offset and size */
703
if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
704
return -EINVAL;
705
706
if (idx >= max_reserved_per_tdmr) {
707
pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
708
tdmr->base, tdmr_end(tdmr));
709
return -ENOSPC;
710
}
711
712
/*
713
* Consume one reserved area per call. Make no effort to
714
* optimize or reduce the number of reserved areas which are
715
* consumed by contiguous reserved areas, for instance.
716
*/
717
rsvd_areas[idx].offset = addr - tdmr->base;
718
rsvd_areas[idx].size = size;
719
720
*p_idx = idx + 1;
721
722
return 0;
723
}
724
725
/*
726
* Go through @tmb_list to find holes between memory areas. If any of
727
* those holes fall within @tdmr, set up a TDMR reserved area to cover
728
* the hole.
729
*/
730
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
731
struct tdmr_info *tdmr,
732
int *rsvd_idx,
733
u16 max_reserved_per_tdmr)
734
{
735
struct tdx_memblock *tmb;
736
u64 prev_end;
737
int ret;
738
739
/*
740
* Start looking for reserved blocks at the
741
* beginning of the TDMR.
742
*/
743
prev_end = tdmr->base;
744
list_for_each_entry(tmb, tmb_list, list) {
745
u64 start, end;
746
747
start = PFN_PHYS(tmb->start_pfn);
748
end = PFN_PHYS(tmb->end_pfn);
749
750
/* Break if this region is after the TDMR */
751
if (start >= tdmr_end(tdmr))
752
break;
753
754
/* Exclude regions before this TDMR */
755
if (end < tdmr->base)
756
continue;
757
758
/*
759
* Skip over memory areas that
760
* have already been dealt with.
761
*/
762
if (start <= prev_end) {
763
prev_end = end;
764
continue;
765
}
766
767
/* Add the hole before this region */
768
ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
769
start - prev_end,
770
max_reserved_per_tdmr);
771
if (ret)
772
return ret;
773
774
prev_end = end;
775
}
776
777
/* Add the hole after the last region if it exists. */
778
if (prev_end < tdmr_end(tdmr)) {
779
ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
780
tdmr_end(tdmr) - prev_end,
781
max_reserved_per_tdmr);
782
if (ret)
783
return ret;
784
}
785
786
return 0;
787
}
788
789
/*
790
* Go through @tdmr_list to find all PAMTs. If any of those PAMTs
791
* overlaps with @tdmr, set up a TDMR reserved area to cover the
792
* overlapping part.
793
*/
794
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
795
struct tdmr_info *tdmr,
796
int *rsvd_idx,
797
u16 max_reserved_per_tdmr)
798
{
799
int i, ret;
800
801
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
802
struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
803
unsigned long pamt_base, pamt_size, pamt_end;
804
805
tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
806
/* Each TDMR must already have PAMT allocated */
807
WARN_ON_ONCE(!pamt_size || !pamt_base);
808
809
pamt_end = pamt_base + pamt_size;
810
/* Skip PAMTs outside of the given TDMR */
811
if ((pamt_end <= tdmr->base) ||
812
(pamt_base >= tdmr_end(tdmr)))
813
continue;
814
815
/* Only mark the part within the TDMR as reserved */
816
if (pamt_base < tdmr->base)
817
pamt_base = tdmr->base;
818
if (pamt_end > tdmr_end(tdmr))
819
pamt_end = tdmr_end(tdmr);
820
821
ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
822
pamt_end - pamt_base,
823
max_reserved_per_tdmr);
824
if (ret)
825
return ret;
826
}
827
828
return 0;
829
}
830
831
/* Compare function called by sort() for TDMR reserved areas */
832
static int rsvd_area_cmp_func(const void *a, const void *b)
833
{
834
struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
835
struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
836
837
if (r1->offset + r1->size <= r2->offset)
838
return -1;
839
if (r1->offset >= r2->offset + r2->size)
840
return 1;
841
842
/* Reserved areas cannot overlap. The caller must guarantee this. */
843
WARN_ON_ONCE(1);
844
return -1;
845
}
846
847
/*
848
* Populate reserved areas for the given @tdmr, including memory holes
849
* (via @tmb_list) and PAMTs (via @tdmr_list).
850
*/
851
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
852
struct list_head *tmb_list,
853
struct tdmr_info_list *tdmr_list,
854
u16 max_reserved_per_tdmr)
855
{
856
int ret, rsvd_idx = 0;
857
858
ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
859
max_reserved_per_tdmr);
860
if (ret)
861
return ret;
862
863
ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
864
max_reserved_per_tdmr);
865
if (ret)
866
return ret;
867
868
/* TDX requires reserved areas listed in address ascending order */
869
sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
870
rsvd_area_cmp_func, NULL);
871
872
return 0;
873
}
874
875
/*
876
* Populate reserved areas for all TDMRs in @tdmr_list, including memory
877
* holes (via @tmb_list) and PAMTs.
878
*/
879
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
880
struct list_head *tmb_list,
881
u16 max_reserved_per_tdmr)
882
{
883
int i;
884
885
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
886
int ret;
887
888
ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
889
tmb_list, tdmr_list, max_reserved_per_tdmr);
890
if (ret)
891
return ret;
892
}
893
894
return 0;
895
}
896
897
/*
898
* Construct a list of TDMRs on the preallocated space in @tdmr_list
899
* to cover all TDX memory regions in @tmb_list based on the TDX module
900
* TDMR global information in @sysinfo_tdmr.
901
*/
902
static int construct_tdmrs(struct list_head *tmb_list,
903
struct tdmr_info_list *tdmr_list,
904
struct tdx_sys_info_tdmr *sysinfo_tdmr)
905
{
906
u16 pamt_entry_size[TDX_PS_NR] = {
907
sysinfo_tdmr->pamt_4k_entry_size,
908
sysinfo_tdmr->pamt_2m_entry_size,
909
sysinfo_tdmr->pamt_1g_entry_size,
910
};
911
int ret;
912
913
ret = fill_out_tdmrs(tmb_list, tdmr_list);
914
if (ret)
915
return ret;
916
917
ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
918
if (ret)
919
return ret;
920
921
ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
922
sysinfo_tdmr->max_reserved_per_tdmr);
923
if (ret)
924
tdmrs_free_pamt_all(tdmr_list);
925
926
/*
927
* The tdmr_info_list is read-only from here on out.
928
* Ensure that these writes are seen by other CPUs.
929
* Pairs with a smp_rmb() in is_pamt_page().
930
*/
931
smp_wmb();
932
933
return ret;
934
}
935
936
static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
937
{
938
struct tdx_module_args args = {};
939
u64 *tdmr_pa_array;
940
size_t array_sz;
941
int i, ret;
942
943
/*
944
* TDMRs are passed to the TDX module via an array of physical
945
* addresses of each TDMR. The array itself also has certain
946
* alignment requirement.
947
*/
948
array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
949
array_sz = roundup_pow_of_two(array_sz);
950
if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
951
array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
952
953
tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
954
if (!tdmr_pa_array)
955
return -ENOMEM;
956
957
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
958
tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
959
960
args.rcx = __pa(tdmr_pa_array);
961
args.rdx = tdmr_list->nr_consumed_tdmrs;
962
args.r8 = global_keyid;
963
ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
964
965
/* Free the array as it is not required anymore. */
966
kfree(tdmr_pa_array);
967
968
return ret;
969
}
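/*
 * Worked example (editor's addition): with 6 consumed TDMRs the array
 * needs 6 * 8 = 48 bytes; roundup_pow_of_two() makes that 64, and the
 * result is then raised to TDMR_INFO_PA_ARRAY_ALIGNMENT if that is
 * larger. Power-of-two kmalloc() sizes are naturally aligned in recent
 * kernels, which is how the array meets the alignment requirement
 * without extra work.
 */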
970
971
static int do_global_key_config(void *unused)
972
{
973
struct tdx_module_args args = {};
974
975
return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
976
}
977
978
/*
979
* Attempt to configure the global KeyID on all physical packages.
980
*
981
* This requires running code on at least one CPU in each package.
982
* TDMR initialization will fail if any package in the
983
* system has no online CPUs.
984
*
985
* This code takes no affirmative steps to online CPUs. Callers (aka.
986
* KVM) can ensure success by ensuring sufficient CPUs are online and
987
* can run SEAMCALLs.
988
*/
989
static int config_global_keyid(void)
990
{
991
cpumask_var_t packages;
992
int cpu, ret = -EINVAL;
993
994
if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
995
return -ENOMEM;
996
997
/*
998
* Hardware doesn't guarantee cache coherency across different
999
* KeyIDs. The kernel needs to flush PAMT's dirty cachelines
1000
* (associated with KeyID 0) before the TDX module can use the
1001
* global KeyID to access the PAMT. Given PAMTs are potentially
1002
* large (~1/256th of system RAM), just use WBINVD.
1003
*/
1004
wbinvd_on_all_cpus();
1005
1006
for_each_online_cpu(cpu) {
1007
/*
1008
* The key configuration only needs to be done once per
1009
* package and will return an error if configured more
1010
* than once. Avoid doing it multiple times per package.
1011
*/
1012
if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
1013
packages))
1014
continue;
1015
1016
/*
1017
* TDH.SYS.KEY.CONFIG cannot run concurrently on
1018
* different cpus. Do it one by one.
1019
*/
1020
ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1021
if (ret)
1022
break;
1023
}
1024
1025
free_cpumask_var(packages);
1026
return ret;
1027
}
1028
1029
static int init_tdmr(struct tdmr_info *tdmr)
1030
{
1031
u64 next;
1032
1033
/*
1034
* Initializing a TDMR can be time consuming. To avoid long
1035
* SEAMCALLs, the TDX module may only initialize a part of the
1036
* TDMR in each call.
1037
*/
1038
do {
1039
struct tdx_module_args args = {
1040
.rcx = tdmr->base,
1041
};
1042
int ret;
1043
1044
ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1045
if (ret)
1046
return ret;
1047
/*
1048
* RDX contains 'next-to-initialize' address if
1049
* TDH.SYS.TDMR.INIT did not fully complete and
1050
* should be retried.
1051
*/
1052
next = args.rdx;
1053
cond_resched();
1054
/* Keep making SEAMCALLs until the TDMR is done */
1055
} while (next < tdmr->base + tdmr->size);
1056
1057
return 0;
1058
}
1059
1060
static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1061
{
1062
int i;
1063
1064
/*
1065
* This operation is costly. It can be parallelized,
1066
* but keep it simple for now.
1067
*/
1068
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1069
int ret;
1070
1071
ret = init_tdmr(tdmr_entry(tdmr_list, i));
1072
if (ret)
1073
return ret;
1074
}
1075
1076
return 0;
1077
}
1078
1079
static int init_tdx_module(void)
1080
{
1081
int ret;
1082
1083
ret = get_tdx_sys_info(&tdx_sysinfo);
1084
if (ret)
1085
return ret;
1086
1087
/* Check whether the kernel can support this module */
1088
ret = check_features(&tdx_sysinfo);
1089
if (ret)
1090
return ret;
1091
1092
/*
1093
* To keep things simple, assume that all TDX-protected memory
1094
* will come from the page allocator. Make sure all pages in the
1095
* page allocator are TDX-usable memory.
1096
*
1097
* Build the list of "TDX-usable" memory regions which cover all
1098
* pages in the page allocator to guarantee that. Do it while
1099
* holding mem_hotplug_lock read-lock as the memory hotplug code
1100
* path reads the @tdx_memlist to reject any new memory.
1101
*/
1102
get_online_mems();
1103
1104
ret = build_tdx_memlist(&tdx_memlist);
1105
if (ret)
1106
goto out_put_tdxmem;
1107
1108
/* Allocate enough space for constructing TDMRs */
1109
ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
1110
if (ret)
1111
goto err_free_tdxmem;
1112
1113
/* Cover all TDX-usable memory regions in TDMRs */
1114
ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
1115
if (ret)
1116
goto err_free_tdmrs;
1117
1118
/* Pass the TDMRs and the global KeyID to the TDX module */
1119
ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1120
if (ret)
1121
goto err_free_pamts;
1122
1123
/* Config the key of global KeyID on all packages */
1124
ret = config_global_keyid();
1125
if (ret)
1126
goto err_reset_pamts;
1127
1128
/* Initialize TDMRs to complete the TDX module initialization */
1129
ret = init_tdmrs(&tdx_tdmr_list);
1130
if (ret)
1131
goto err_reset_pamts;
1132
1133
pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1134
1135
out_put_tdxmem:
1136
/*
1137
* @tdx_memlist is written here and read at memory hotplug time.
1138
* Lock out memory hotplug code while building it.
1139
*/
1140
put_online_mems();
1141
return ret;
1142
1143
err_reset_pamts:
1144
/*
1145
* Part of PAMTs may already have been initialized by the
1146
* TDX module. Flush cache before returning PAMTs back
1147
* to the kernel.
1148
*/
1149
wbinvd_on_all_cpus();
1150
tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list);
1151
err_free_pamts:
1152
tdmrs_free_pamt_all(&tdx_tdmr_list);
1153
err_free_tdmrs:
1154
free_tdmr_list(&tdx_tdmr_list);
1155
err_free_tdxmem:
1156
free_tdx_memlist(&tdx_memlist);
1157
goto out_put_tdxmem;
1158
}
1159
1160
static int __tdx_enable(void)
1161
{
1162
int ret;
1163
1164
ret = init_tdx_module();
1165
if (ret) {
1166
pr_err("module initialization failed (%d)\n", ret);
1167
tdx_module_status = TDX_MODULE_ERROR;
1168
return ret;
1169
}
1170
1171
pr_info("module initialized\n");
1172
tdx_module_status = TDX_MODULE_INITIALIZED;
1173
1174
return 0;
1175
}
1176
1177
/**
1178
* tdx_enable - Enable TDX module to make it ready to run TDX guests
1179
*
1180
* This function assumes the caller has: 1) held read lock of CPU hotplug
1181
* lock to prevent any new cpu from becoming online; 2) done both VMXON
1182
* and tdx_cpu_enable() on all online cpus.
1183
*
1184
* This function requires there's at least one online cpu for each CPU
1185
* package to succeed.
1186
*
1187
* This function can be called in parallel by multiple callers.
1188
*
1189
* Return 0 if TDX is enabled successfully, otherwise error.
1190
*/
1191
int tdx_enable(void)
1192
{
1193
int ret;
1194
1195
if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1196
return -ENODEV;
1197
1198
lockdep_assert_cpus_held();
1199
1200
mutex_lock(&tdx_module_lock);
1201
1202
switch (tdx_module_status) {
1203
case TDX_MODULE_UNINITIALIZED:
1204
ret = __tdx_enable();
1205
break;
1206
case TDX_MODULE_INITIALIZED:
1207
/* Already initialized, great, tell the caller. */
1208
ret = 0;
1209
break;
1210
default:
1211
/* Failed to initialize in the previous attempts */
1212
ret = -EINVAL;
1213
break;
1214
}
1215
1216
mutex_unlock(&tdx_module_lock);
1217
1218
return ret;
1219
}
1220
EXPORT_SYMBOL_FOR_KVM(tdx_enable);
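/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * the calling convention documented above, as a caller such as KVM might
 * follow it. VMXON handling is left out; example_enable_cpu() is the
 * hypothetical IPI callback sketched after tdx_cpu_enable().
 *
 *	int failed = 0;
 *
 *	cpus_read_lock();
 *	// VMXON + tdx_cpu_enable() on every online CPU first:
 *	on_each_cpu(example_enable_cpu, &failed, 1);
 *	if (!failed)
 *		failed = tdx_enable();
 *	cpus_read_unlock();
 */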
1221
1222
static bool is_pamt_page(unsigned long phys)
1223
{
1224
struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1225
int i;
1226
1227
/* Ensure that all remote 'tdmr_list' writes are visible: */
1228
smp_rmb();
1229
1230
/*
1231
* The TDX module is no longer returning TDX_SYS_NOT_READY and
1232
* is initialized. The 'tdmr_list' was initialized long ago
1233
* and is now read-only.
1234
*/
1235
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1236
unsigned long base, size;
1237
1238
tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1239
1240
if (phys >= base && phys < (base + size))
1241
return true;
1242
}
1243
1244
return false;
1245
}
1246
1247
/*
1248
* Return whether the memory page at the given physical address is TDX
1249
* private memory or not.
1250
*
1251
* This can be imprecise for two known reasons:
1252
* 1. PAMTs are private memory and exist before the TDX module is
1253
* ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively
1254
* short window that occurs once per boot.
1255
* 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1256
* page. However, the page can still cause #MC until it has been
1257
* fully converted to shared using 64-byte writes like MOVDIR64B.
1258
* Buggy hosts might still leave #MC-causing memory in place which
1259
* this function can not detect.
1260
*/
1261
static bool paddr_is_tdx_private(unsigned long phys)
1262
{
1263
struct tdx_module_args args = {
1264
.rcx = phys & PAGE_MASK,
1265
};
1266
u64 sret;
1267
1268
if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1269
return false;
1270
1271
/* Get page type from the TDX module */
1272
sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args);
1273
1274
/*
1275
* The SEAMCALL will not return success unless there is a
1276
* working, "ready" TDX module. Assume an absence of TDX
1277
* private pages until SEAMCALL is working.
1278
*/
1279
if (sret)
1280
return false;
1281
1282
/*
1283
* SEAMCALL was successful -- read page type (via RCX):
1284
*
1285
* - PT_NDA: Page is not used by the TDX module
1286
* - PT_RSVD: Reserved for Non-TDX use
1287
* - Others: Page is used by the TDX module
1288
*
1289
* Note PAMT pages are marked as PT_RSVD but they are also TDX
1290
* private memory.
1291
*/
1292
switch (args.rcx) {
1293
case PT_NDA:
1294
return false;
1295
case PT_RSVD:
1296
return is_pamt_page(phys);
1297
default:
1298
return true;
1299
}
1300
}
1301
1302
/*
1303
* Some TDX-capable CPUs have an erratum. A write to TDX private
1304
* memory poisons that memory, and a subsequent read of that memory
1305
* triggers #MC.
1306
*
1307
* Help distinguish erratum-triggered #MCs from a normal hardware one.
1308
* Just print additional message to show such #MC may be result of the
1309
* erratum.
1310
*/
1311
const char *tdx_dump_mce_info(struct mce *m)
1312
{
1313
if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1314
return NULL;
1315
1316
if (!paddr_is_tdx_private(m->addr))
1317
return NULL;
1318
1319
return "TDX private memory error. Possible kernel bug.";
1320
}
1321
1322
static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1323
u32 *nr_tdx_keyids)
1324
{
1325
u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1326
int ret;
1327
1328
/*
1329
* IA32_MKTME_KEYID_PARTITIONING:
1330
* Bit [31:0]: Number of MKTME KeyIDs.
1331
* Bit [63:32]: Number of TDX private KeyIDs.
1332
*/
1333
ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1334
&_nr_tdx_keyids);
1335
if (ret || !_nr_tdx_keyids)
1336
return -EINVAL;
1337
1338
/* TDX KeyIDs start after the last MKTME KeyID. */
1339
_tdx_keyid_start = _nr_mktme_keyids + 1;
1340
1341
*tdx_keyid_start = _tdx_keyid_start;
1342
*nr_tdx_keyids = _nr_tdx_keyids;
1343
1344
return 0;
1345
}
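/*
 * Worked example (editor's addition, illustrative values only): if the
 * MSR reported 31 MKTME KeyIDs and 32 TDX private KeyIDs, the TDX KeyIDs
 * would occupy [32, 64), i.e. tdx_keyid_start = 31 + 1 = 32 and
 * nr_tdx_keyids = 32. tdx_init() then takes KeyID 32 as the global KeyID
 * and leaves [33, 64) for guests.
 */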
1346
1347
static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1348
{
1349
struct tdx_memblock *tmb;
1350
1351
/*
1352
* This check assumes that the start_pfn<->end_pfn range does not
1353
* cross multiple @tdx_memlist entries. A single memory online
1354
* event across multiple memblocks (from which @tdx_memlist
1355
* entries are derived at the time of module initialization) is
1356
* not possible. This is because memory offline/online is done
1357
* on granularity of 'struct memory_block', and the hotpluggable
1358
* memory region (one memblock) must be a multiple of memory_block.
1359
*/
1360
list_for_each_entry(tmb, &tdx_memlist, list) {
1361
if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1362
return true;
1363
}
1364
return false;
1365
}
1366
1367
static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1368
void *v)
1369
{
1370
struct memory_notify *mn = v;
1371
1372
if (action != MEM_GOING_ONLINE)
1373
return NOTIFY_OK;
1374
1375
/*
1376
* Empty list means TDX isn't enabled. Allow any memory
1377
* to go online.
1378
*/
1379
if (list_empty(&tdx_memlist))
1380
return NOTIFY_OK;
1381
1382
/*
1383
* The TDX memory configuration is static and can not be
1384
* changed. Reject onlining any memory which is outside of
1385
* the static configuration whether it supports TDX or not.
1386
*/
1387
if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1388
return NOTIFY_OK;
1389
1390
return NOTIFY_BAD;
1391
}
1392
1393
static struct notifier_block tdx_memory_nb = {
1394
.notifier_call = tdx_memory_notifier,
1395
};
1396
1397
static void __init check_tdx_erratum(void)
1398
{
1399
/*
1400
* These CPUs have an erratum. A partial write from non-TD
1401
* software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1402
* private memory poisons that memory, and a subsequent read of
1403
* that memory triggers #MC.
1404
*/
1405
switch (boot_cpu_data.x86_vfm) {
1406
case INTEL_SAPPHIRERAPIDS_X:
1407
case INTEL_EMERALDRAPIDS_X:
1408
setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1409
}
1410
}
1411
1412
void __init tdx_init(void)
1413
{
1414
u32 tdx_keyid_start, nr_tdx_keyids;
1415
int err;
1416
1417
err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1418
if (err)
1419
return;
1420
1421
pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1422
tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1423
1424
/*
1425
* The TDX module itself requires one 'global KeyID' to protect
1426
* its metadata. If there's only one TDX KeyID, there won't be
1427
* any left for TDX guests, thus there's no point in enabling TDX
1428
* at all.
1429
*/
1430
if (nr_tdx_keyids < 2) {
1431
pr_err("initialization failed: too few private KeyIDs available.\n");
1432
return;
1433
}
1434
1435
/*
1436
* At this point, hibernation_available() indicates whether or
1437
* not hibernation support has been permanently disabled.
1438
*/
1439
if (hibernation_available()) {
1440
pr_err("initialization failed: Hibernation support is enabled\n");
1441
return;
1442
}
1443
1444
err = register_memory_notifier(&tdx_memory_nb);
1445
if (err) {
1446
pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1447
err);
1448
return;
1449
}
1450
1451
#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1452
pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1453
acpi_suspend_lowlevel = NULL;
1454
#endif
1455
1456
/*
1457
* Just use the first TDX KeyID as the 'global KeyID' and
1458
* leave the rest for TDX guests.
1459
*/
1460
tdx_global_keyid = tdx_keyid_start;
1461
tdx_guest_keyid_start = tdx_keyid_start + 1;
1462
tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1463
1464
setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1465
1466
check_tdx_erratum();
1467
}
1468
1469
const struct tdx_sys_info *tdx_get_sysinfo(void)
1470
{
1471
const struct tdx_sys_info *p = NULL;
1472
1473
/* Make sure all fields in @tdx_sysinfo have been populated */
1474
mutex_lock(&tdx_module_lock);
1475
if (tdx_module_status == TDX_MODULE_INITIALIZED)
1476
p = (const struct tdx_sys_info *)&tdx_sysinfo;
1477
mutex_unlock(&tdx_module_lock);
1478
1479
return p;
1480
}
1481
EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo);
1482
1483
u32 tdx_get_nr_guest_keyids(void)
1484
{
1485
return tdx_nr_guest_keyids;
1486
}
1487
EXPORT_SYMBOL_FOR_KVM(tdx_get_nr_guest_keyids);
1488
1489
int tdx_guest_keyid_alloc(void)
1490
{
1491
return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
1492
tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
1493
GFP_KERNEL);
1494
}
1495
EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_alloc);
1496
1497
void tdx_guest_keyid_free(unsigned int keyid)
1498
{
1499
ida_free(&tdx_guest_keyid_pool, keyid);
1500
}
1501
EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_free);
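/*
 * Illustrative sketch (editor's addition, not part of the upstream file):
 * a guest lifecycle pairs these two helpers. The error handling below is
 * only a sketch.
 *
 *	int keyid = tdx_guest_keyid_alloc();
 *
 *	if (keyid < 0)
 *		return keyid;		// pool exhausted or no TDX KeyIDs
 *	// ... bind keyid to the TD via TDH_MNG_CREATE ...
 *	tdx_guest_keyid_free(keyid);	// on teardown
 */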
1502
1503
static inline u64 tdx_tdr_pa(struct tdx_td *td)
1504
{
1505
return page_to_phys(td->tdr_page);
1506
}
1507
1508
/*
1509
* The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
1510
* a CLFLUSH of pages is required before handing them to the TDX module.
1511
* Be conservative and make the code simpler by doing the CLFLUSH
1512
* unconditionally.
1513
*/
1514
static void tdx_clflush_page(struct page *page)
1515
{
1516
clflush_cache_range(page_to_virt(page), PAGE_SIZE);
1517
}
1518
1519
noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
1520
{
1521
args->rcx = td->tdvpr_pa;
1522
1523
return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
1524
}
1525
EXPORT_SYMBOL_FOR_KVM(tdh_vp_enter);
1526
1527
u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
1528
{
1529
struct tdx_module_args args = {
1530
.rcx = page_to_phys(tdcs_page),
1531
.rdx = tdx_tdr_pa(td),
1532
};
1533
1534
tdx_clflush_page(tdcs_page);
1535
return seamcall(TDH_MNG_ADDCX, &args);
1536
}
1537
EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx);
1538
1539
u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
1540
{
1541
struct tdx_module_args args = {
1542
.rcx = gpa,
1543
.rdx = tdx_tdr_pa(td),
1544
.r8 = page_to_phys(page),
1545
.r9 = page_to_phys(source),
1546
};
1547
u64 ret;
1548
1549
tdx_clflush_page(page);
1550
ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
1551
1552
*ext_err1 = args.rcx;
1553
*ext_err2 = args.rdx;
1554
1555
return ret;
1556
}
1557
EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add);
1558
1559
u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1560
{
1561
struct tdx_module_args args = {
1562
.rcx = gpa | level,
1563
.rdx = tdx_tdr_pa(td),
1564
.r8 = page_to_phys(page),
1565
};
1566
u64 ret;
1567
1568
tdx_clflush_page(page);
1569
ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args);
1570
1571
*ext_err1 = args.rcx;
1572
*ext_err2 = args.rdx;
1573
1574
return ret;
1575
}
1576
EXPORT_SYMBOL_FOR_KVM(tdh_mem_sept_add);
1577
1578
u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
1579
{
1580
struct tdx_module_args args = {
1581
.rcx = page_to_phys(tdcx_page),
1582
.rdx = vp->tdvpr_pa,
1583
};
1584
1585
tdx_clflush_page(tdcx_page);
1586
return seamcall(TDH_VP_ADDCX, &args);
1587
}
1588
EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
1589
1590
u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1591
{
1592
struct tdx_module_args args = {
1593
.rcx = gpa | level,
1594
.rdx = tdx_tdr_pa(td),
1595
.r8 = page_to_phys(page),
1596
};
1597
u64 ret;
1598
1599
tdx_clflush_page(page);
1600
ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
1601
1602
*ext_err1 = args.rcx;
1603
*ext_err2 = args.rdx;
1604
1605
return ret;
1606
}
1607
EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug);
1608
1609
u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
1610
{
1611
struct tdx_module_args args = {
1612
.rcx = gpa | level,
1613
.rdx = tdx_tdr_pa(td),
1614
};
1615
u64 ret;
1616
1617
ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args);
1618
1619
*ext_err1 = args.rcx;
1620
*ext_err2 = args.rdx;
1621
1622
return ret;
1623
}
1624
EXPORT_SYMBOL_FOR_KVM(tdh_mem_range_block);
1625
1626
u64 tdh_mng_key_config(struct tdx_td *td)
1627
{
1628
struct tdx_module_args args = {
1629
.rcx = tdx_tdr_pa(td),
1630
};
1631
1632
return seamcall(TDH_MNG_KEY_CONFIG, &args);
1633
}
1634
EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_config);
1635
1636
u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
1637
{
1638
struct tdx_module_args args = {
1639
.rcx = tdx_tdr_pa(td),
1640
.rdx = hkid,
1641
};
1642
1643
tdx_clflush_page(td->tdr_page);
1644
return seamcall(TDH_MNG_CREATE, &args);
1645
}
1646
EXPORT_SYMBOL_FOR_KVM(tdh_mng_create);
1647
1648
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
1649
{
1650
struct tdx_module_args args = {
1651
.rcx = vp->tdvpr_pa,
1652
.rdx = tdx_tdr_pa(td),
1653
};
1654
1655
tdx_clflush_page(vp->tdvpr_page);
1656
return seamcall(TDH_VP_CREATE, &args);
1657
}
1658
EXPORT_SYMBOL_FOR_KVM(tdh_vp_create);
1659
1660
u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
1661
{
1662
struct tdx_module_args args = {
1663
.rcx = tdx_tdr_pa(td),
1664
.rdx = field,
1665
};
1666
u64 ret;
1667
1668
ret = seamcall_ret(TDH_MNG_RD, &args);
1669
1670
/* R8: Content of the field, or 0 in case of error. */
1671
*data = args.r8;
1672
1673
return ret;
1674
}
1675
EXPORT_SYMBOL_FOR_KVM(tdh_mng_rd);
1676
1677
u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
1678
{
1679
struct tdx_module_args args = {
1680
.rcx = gpa,
1681
.rdx = tdx_tdr_pa(td),
1682
};
1683
u64 ret;
1684
1685
ret = seamcall_ret(TDH_MR_EXTEND, &args);
1686
1687
*ext_err1 = args.rcx;
1688
*ext_err2 = args.rdx;
1689
1690
return ret;
1691
}
1692
EXPORT_SYMBOL_FOR_KVM(tdh_mr_extend);
1693
1694
u64 tdh_mr_finalize(struct tdx_td *td)
1695
{
1696
struct tdx_module_args args = {
1697
.rcx = tdx_tdr_pa(td),
1698
};
1699
1700
return seamcall(TDH_MR_FINALIZE, &args);
1701
}
1702
EXPORT_SYMBOL_FOR_KVM(tdh_mr_finalize);
1703
1704
u64 tdh_vp_flush(struct tdx_vp *vp)
1705
{
1706
struct tdx_module_args args = {
1707
.rcx = vp->tdvpr_pa,
1708
};
1709
1710
return seamcall(TDH_VP_FLUSH, &args);
1711
}
1712
EXPORT_SYMBOL_FOR_KVM(tdh_vp_flush);
1713
1714
u64 tdh_mng_vpflushdone(struct tdx_td *td)
1715
{
1716
struct tdx_module_args args = {
1717
.rcx = tdx_tdr_pa(td),
1718
};
1719
1720
return seamcall(TDH_MNG_VPFLUSHDONE, &args);
1721
}
1722
EXPORT_SYMBOL_FOR_KVM(tdh_mng_vpflushdone);
1723
1724
u64 tdh_mng_key_freeid(struct tdx_td *td)
1725
{
1726
struct tdx_module_args args = {
1727
.rcx = tdx_tdr_pa(td),
1728
};
1729
1730
return seamcall(TDH_MNG_KEY_FREEID, &args);
1731
}
1732
EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_freeid);
1733
1734
u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
1735
{
1736
struct tdx_module_args args = {
1737
.rcx = tdx_tdr_pa(td),
1738
.rdx = td_params,
1739
};
1740
u64 ret;
1741
1742
ret = seamcall_ret(TDH_MNG_INIT, &args);
1743
1744
*extended_err = args.rcx;
1745
1746
return ret;
1747
}
1748
EXPORT_SYMBOL_FOR_KVM(tdh_mng_init);
1749
1750
u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
1751
{
1752
struct tdx_module_args args = {
1753
.rcx = vp->tdvpr_pa,
1754
.rdx = field,
1755
};
1756
u64 ret;
1757
1758
ret = seamcall_ret(TDH_VP_RD, &args);
1759
1760
/* R8: Content of the field, or 0 in case of error. */
1761
*data = args.r8;
1762
1763
return ret;
1764
}
1765
EXPORT_SYMBOL_FOR_KVM(tdh_vp_rd);
1766
1767
u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
1768
{
1769
struct tdx_module_args args = {
1770
.rcx = vp->tdvpr_pa,
1771
.rdx = field,
1772
.r8 = data,
1773
.r9 = mask,
1774
};
1775
1776
return seamcall(TDH_VP_WR, &args);
1777
}
1778
EXPORT_SYMBOL_FOR_KVM(tdh_vp_wr);
1779
1780
u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
1781
{
1782
struct tdx_module_args args = {
1783
.rcx = vp->tdvpr_pa,
1784
.rdx = initial_rcx,
1785
.r8 = x2apicid,
1786
};
1787
1788
/* apicid requires version == 1. */
1789
return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
1790
}
1791
EXPORT_SYMBOL_FOR_KVM(tdh_vp_init);
1792
1793
/*
1794
* The TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX-defined formats.
1795
* So despite the names, they must be interpreted specially as described by the spec. Return
1796
* them only for error reporting purposes.
1797
*/
1798
u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size)
1799
{
1800
struct tdx_module_args args = {
1801
.rcx = page_to_phys(page),
1802
};
1803
u64 ret;
1804
1805
ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args);
1806
1807
*tdx_pt = args.rcx;
1808
*tdx_owner = args.rdx;
1809
*tdx_size = args.r8;
1810
1811
return ret;
1812
}
1813
EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_reclaim);
1814
1815
u64 tdh_mem_track(struct tdx_td *td)
1816
{
1817
struct tdx_module_args args = {
1818
.rcx = tdx_tdr_pa(td),
1819
};
1820
1821
return seamcall(TDH_MEM_TRACK, &args);
1822
}
1823
EXPORT_SYMBOL_FOR_KVM(tdh_mem_track);
1824
1825
u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
1826
{
1827
struct tdx_module_args args = {
1828
.rcx = gpa | level,
1829
.rdx = tdx_tdr_pa(td),
1830
};
1831
u64 ret;
1832
1833
ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args);
1834
1835
*ext_err1 = args.rcx;
1836
*ext_err2 = args.rdx;
1837
1838
return ret;
1839
}
1840
EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_remove);
1841
1842
u64 tdh_phymem_cache_wb(bool resume)
1843
{
1844
struct tdx_module_args args = {
1845
.rcx = resume ? 1 : 0,
1846
};
1847
1848
return seamcall(TDH_PHYMEM_CACHE_WB, &args);
1849
}
1850
EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb);
1851
1852
u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
1853
{
1854
struct tdx_module_args args = {};
1855
1856
args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page);
1857
1858
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1859
}
1860
EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr);
1861
1862
u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
1863
{
1864
struct tdx_module_args args = {};
1865
1866
args.rcx = mk_keyed_paddr(hkid, page);
1867
1868
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1869
}
1870
EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid);
1871
1872
#ifdef CONFIG_KEXEC_CORE
1873
void tdx_cpu_flush_cache_for_kexec(void)
1874
{
1875
lockdep_assert_preemption_disabled();
1876
1877
if (!this_cpu_read(cache_state_incoherent))
1878
return;
1879
1880
/*
1881
* Private memory cachelines need to be clean at the time of
1882
* kexec. Write them back now, as the caller promises that
1883
* there should be no more SEAMCALLs on this CPU.
1884
*/
1885
wbinvd();
1886
this_cpu_write(cache_state_incoherent, false);
1887
}
1888
EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec);
1889
#endif
1890
1891