CoCalc -- sev.c

GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/virt/svm/sev.c
²⁶⁴⁸¹ views
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
 * AMD SVM-SEV Host Support.
4
 *
5
 * Copyright (C) 2023 Advanced Micro Devices, Inc.
6
 *
7
 * Author: Ashish Kalra <ashish.kalra@amd.com>
8
 *
9
 */
10

11
#include <linux/cc_platform.h>
12
#include <linux/printk.h>
13
#include <linux/mm_types.h>
14
#include <linux/set_memory.h>
15
#include <linux/memblock.h>
16
#include <linux/kernel.h>
17
#include <linux/mm.h>
18
#include <linux/cpumask.h>
19
#include <linux/iommu.h>
20
#include <linux/amd-iommu.h>
21
#include <linux/nospec.h>
22

23
#include <asm/sev.h>
24
#include <asm/processor.h>
25
#include <asm/setup.h>
26
#include <asm/svm.h>
27
#include <asm/smp.h>
28
#include <asm/cpu.h>
29
#include <asm/apic.h>
30
#include <asm/cpuid/api.h>
31
#include <asm/cmdline.h>
32
#include <asm/iommu.h>
33
#include <asm/msr.h>
34

35
/*
36
 * The RMP entry information as returned by the RMPREAD instruction.
37
 */
38
struct rmpentry {
39
	u64 gpa;
40
	u8  assigned		:1,
41
	    rsvd1		:7;
42
	u8  pagesize		:1,
43
	    hpage_region_status	:1,
44
	    rsvd2		:6;
45
	u8  immutable		:1,
46
	    rsvd3		:7;
47
	u8  rsvd4;
48
	u32 asid;
49
} __packed;
50

51
/*
52
 * The raw RMP entry format is not architectural. The format is defined in PPR
53
 * Family 19h Model 01h, Rev B1 processor. This format represents the actual
54
 * entry in the RMP table memory. The bitfield definitions are used for machines
55
 * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo"
56
 * fields are only used for dumping the raw data.
57
 */
58
struct rmpentry_raw {
59
	union {
60
		struct {
61
			u64 assigned	: 1,
62
			    pagesize	: 1,
63
			    immutable	: 1,
64
			    rsvd1	: 9,
65
			    gpa		: 39,
66
			    asid	: 10,
67
			    vmsa	: 1,
68
			    validated	: 1,
69
			    rsvd2	: 1;
70
		};
71
		u64 lo;
72
	};
73
	u64 hi;
74
} __packed;
75

76
/*
 * The processor keeps its own bookkeeping data in the first 16KB starting at
 * RMP_BASE. The RMP entries begin after that area, so its size has to be
 * added whenever an RMP entry is located.
 */
#define RMPTABLE_CPU_BOOKKEEPING_SZ	0x4000

/*
 * A non-segmented RMP table is handled as one single segment. Using the
 * maximum physical address width as the segment shift makes every physical
 * address resolve to segment index 0.
 */
#define RMPTABLE_NON_SEGMENTED_SHIFT	52
87

88
struct rmp_segment_desc {
89
	struct rmpentry_raw *rmp_entry;
90
	u64 max_index;
91
	u64 size;
92
};
93

94
/*
95
 * Segmented RMP Table support.
96
 *   - The segment size is used for two purposes:
97
 *     - Identify the amount of memory covered by an RMP segment
98
 *     - Quickly locate an RMP segment table entry for a physical address
99
 *
100
 *   - The RMP segment table contains pointers to an RMP table that covers
101
 *     a specific portion of memory. There can be up to 512 8-byte entries,
102
 *     one page's worth.
103
 */
104
#define RST_ENTRY_MAPPED_SIZE(x)	((x) & GENMASK_ULL(19, 0))
105
#define RST_ENTRY_SEGMENT_BASE(x)	((x) & GENMASK_ULL(51, 20))
106

107
#define RST_SIZE SZ_4K
108
static struct rmp_segment_desc **rmp_segment_table __ro_after_init;
109
static unsigned int rst_max_index __ro_after_init = 512;
110

111
static unsigned int rmp_segment_shift;
112
static u64 rmp_segment_size;
113
static u64 rmp_segment_mask;
114

115
#define RST_ENTRY_INDEX(x)	((x) >> rmp_segment_shift)
116
#define RMP_ENTRY_INDEX(x)	((u64)(PHYS_PFN((x) & rmp_segment_mask)))
117

118
static u64 rmp_cfg;
119

120
/* Mask to apply to a PFN to get the first PFN of a 2MB page */
121
#define PFN_PMD_MASK	GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
122

123
static u64 probed_rmp_base, probed_rmp_size;
124

125
static LIST_HEAD(snp_leaked_pages_list);
126
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
127

128
static unsigned long snp_nr_leaked_pages;
129

130
#undef pr_fmt
131
#define pr_fmt(fmt)	"SEV-SNP: " fmt
132

133
static int __mfd_enable(unsigned int cpu)
134
{
135
	u64 val;
136

137
	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
138
		return 0;
139

140
	rdmsrq(MSR_AMD64_SYSCFG, val);
141

142
	val |= MSR_AMD64_SYSCFG_MFDM;
143

144
	wrmsrq(MSR_AMD64_SYSCFG, val);
145

146
	return 0;
147
}
148

149
/* on_each_cpu() callback wrapping __mfd_enable(); @arg is unused. */
static __init void mfd_enable(void *arg)
{
	unsigned int this_cpu = smp_processor_id();

	__mfd_enable(this_cpu);
}
153

154
static int __snp_enable(unsigned int cpu)
155
{
156
	u64 val;
157

158
	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
159
		return 0;
160

161
	rdmsrq(MSR_AMD64_SYSCFG, val);
162

163
	val |= MSR_AMD64_SYSCFG_SNP_EN;
164
	val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
165

166
	wrmsrq(MSR_AMD64_SYSCFG, val);
167

168
	return 0;
169
}
170

171
/* on_each_cpu() callback wrapping __snp_enable(); @arg is unused. */
static __init void snp_enable(void *arg)
{
	unsigned int this_cpu = smp_processor_id();

	__snp_enable(this_cpu);
}
175

176
/*
 * Reserve the 2MB chunk that contains @pa when @pa is not 2MB aligned.
 *
 * The BIOS may place the RMP table so that it does not start or end on a 2MB
 * boundary. A kexec kernel could then allocate from the remainder of that
 * 2MB chunk, which causes a fatal RMP fault. To prevent this, any RAM found
 * in the chunk is turned into reserved memory:
 *
 *  - in e820_table, as it is converted to kernel memory resources and used
 *    by the KEXEC_FILE_LOAD syscall to load kexec segments;
 *  - in e820_table_kexec, as it is passed to the kexec-ed kernel;
 *  - in memblock, unless the region is reserved there already.
 */
static void __init __snp_fixup_e820_tables(u64 pa)
{
	u64 chunk;

	/* An aligned boundary leaves no partially used 2MB chunk behind */
	if (IS_ALIGNED(pa, PMD_SIZE))
		return;

	chunk = ALIGN_DOWN(pa, PMD_SIZE);

	/* Nothing to do when the chunk holds no RAM at all */
	if (!e820__mapped_any(chunk, chunk + PMD_SIZE, E820_TYPE_RAM))
		return;

	pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", chunk);
	e820__range_update(chunk, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
	e820__range_update_table(e820_table_kexec, chunk, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
	if (!memblock_is_region_reserved(chunk, PMD_SIZE))
		memblock_reserve(chunk, PMD_SIZE);
}
206

207
/*
 * Apply the 2MB boundary fixup to every piece of a segmented RMP: the start
 * of the RMP base area, the end of the RMP segment table (RST), and the start
 * and end of each RMP segment that the RST describes.
 */
static void __init fixup_e820_tables_for_segmented_rmp(void)
{
	u64 rst_pa, *rst_map;
	unsigned int idx;

	__snp_fixup_e820_tables(probed_rmp_base);

	/* The RST directly follows the CPU bookkeeping area */
	rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;

	__snp_fixup_e820_tables(rst_pa + RST_SIZE);

	rst_map = early_memremap(rst_pa, RST_SIZE);
	if (!rst_map)
		return;

	for (idx = 0; idx < rst_max_index; idx++) {
		u64 entry = rst_map[idx];
		u64 seg_base = RST_ENTRY_SEGMENT_BASE(entry);
		u64 covered = RST_ENTRY_MAPPED_SIZE(entry);
		u64 seg_bytes;

		/* Unused RST slot */
		if (!covered)
			continue;

		__snp_fixup_e820_tables(seg_base);

		/*
		 * The mapped size is given in GB. It is allowed to exceed
		 * the segment coverage size, in which case it is reduced to
		 * the segment coverage size.
		 */
		covered <<= 30;
		if (covered > rmp_segment_size)
			covered = rmp_segment_size;

		/* 16 bytes of RMP are needed per page mapped */
		seg_bytes = PHYS_PFN(covered) << 4;

		__snp_fixup_e820_tables(seg_base + seg_bytes);
	}

	early_memunmap(rst_map, RST_SIZE);
}
247

248
/* Apply the 2MB boundary fixup to both ends of a contiguous RMP table. */
static void __init fixup_e820_tables_for_contiguous_rmp(void)
{
	u64 rmp_start = probed_rmp_base;
	u64 rmp_stop = probed_rmp_base + probed_rmp_size;

	__snp_fixup_e820_tables(rmp_start);
	__snp_fixup_e820_tables(rmp_stop);
}
253

254
/*
 * Reserve the partially used 2MB chunks around the RMP table(s), choosing the
 * segmented or the contiguous variant based on the cached RMP_CFG value.
 */
void __init snp_fixup_e820_tables(void)
{
	if (!(rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)) {
		fixup_e820_tables_for_contiguous_rmp();
		return;
	}

	fixup_e820_tables_for_segmented_rmp();
}
262

263
/*
 * Zero the CPU bookkeeping area located at the start of the RMP.
 *
 * Returns true on success, false if the area could not be mapped.
 */
static bool __init clear_rmptable_bookkeeping(void)
{
	void *area = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB);

	if (area) {
		memset(area, 0, RMPTABLE_CPU_BOOKKEEPING_SZ);
		memunmap(area);
		return true;
	}

	pr_err("Failed to map RMP bookkeeping area\n");
	return false;
}
279

280
/*
 * Map one RMP segment and register a descriptor for it.
 *
 * @segment_pa:   physical address of the RMP segment (the RMP entries)
 * @segment_size: size of the RMP segment in bytes
 * @pa:           physical address of the memory the segment covers; selects
 *                the slot in rmp_segment_table
 *
 * Returns true on success. Returns false if the size or slot is invalid, the
 * slot is already taken, or mapping/allocation fails; nothing stays mapped or
 * allocated in that case.
 */
static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa)
{
	struct rmp_segment_desc *desc;
	u64 slot, max_bytes;
	void *mapping;

	/* Largest RMP a single segment can need (16 bytes/page mapped) */
	max_bytes = PHYS_PFN(rmp_segment_size) << 4;
	if (segment_size > max_bytes) {
		pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n",
		       segment_size, max_bytes);
		return false;
	}

	/* The covered address has to land inside the segment table */
	slot = RST_ENTRY_INDEX(pa);
	if (slot >= rst_max_index) {
		pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n",
		       pa, rmp_segment_size);
		return false;
	}

	if (rmp_segment_table[slot]) {
		pr_err("RMP segment descriptor already exists at index %llu\n", slot);
		return false;
	}

	mapping = memremap(segment_pa, segment_size, MEMREMAP_WB);
	if (!mapping) {
		pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n",
		       segment_pa, segment_size);
		return false;
	}

	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
	if (!desc) {
		/* Do not leave the mapping behind */
		memunmap(mapping);
		return false;
	}

	desc->size = segment_size;
	desc->max_index = segment_size / sizeof(*desc->rmp_entry);
	desc->rmp_entry = mapping;

	rmp_segment_table[slot] = desc;

	return true;
}
330

331
/*
 * Tear down everything set up by alloc_rmp_segment_table() and
 * alloc_rmp_segment_desc(): unmap each registered segment, free its
 * descriptor, then free the table page and clear the table pointer.
 */
static void __init free_rmp_segment_table(void)
{
	unsigned int slot;

	for (slot = 0; slot < rst_max_index; slot++) {
		struct rmp_segment_desc *desc = rmp_segment_table[slot];

		if (desc) {
			memunmap(desc->rmp_entry);
			kfree(desc);
		}
	}

	free_page((unsigned long)rmp_segment_table);

	rmp_segment_table = NULL;
}
351

352
/* Allocate the table used to index into the RMP segments */
353
static bool __init alloc_rmp_segment_table(void)
354
{
355
	struct page *page;
356

357
	page = alloc_page(__GFP_ZERO);
358
	if (!page)
359
		return false;
360

361
	rmp_segment_table = page_address(page);
362

363
	return true;
364
}
365

366
/*
 * Set up the lookup structures for a contiguous (non-segmented) RMP table,
 * which is handled as one single segment at index 0.
 *
 * Returns false if no RMP was probed, if the reserved RMP is too small for
 * the system, or if allocating/mapping fails.
 */
static bool __init setup_contiguous_rmptable(void)
{
	u64 last_rmp_byte, top_pfn, needed_sz, entries_pa, entries_sz;

	if (!probed_rmp_size)
		return false;

	last_rmp_byte = probed_rmp_base + probed_rmp_size - 1;

	/*
	 * Work out how much memory the BIOS must have reserved in order to
	 * address the whole RAM, including the bookkeeping area. The RMP
	 * itself has to be covered as well, so use whichever of max_pfn and
	 * the RMP's end reaches higher.
	 */
	if (PFN_UP(last_rmp_byte) > max_pfn)
		top_pfn = PFN_UP(last_rmp_byte);
	else
		top_pfn = max_pfn;

	needed_sz = (top_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
	if (needed_sz > probed_rmp_size) {
		pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
		       needed_sz, probed_rmp_size);
		return false;
	}

	if (!alloc_rmp_segment_table())
		return false;

	/* Only the RMP entries get mapped, not the bookkeeping area */
	entries_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
	entries_sz = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;

	if (alloc_rmp_segment_desc(entries_pa, entries_sz, 0))
		return true;

	free_rmp_segment_table();
	return false;
}
405

406
/*
 * Set up the lookup structures for a segmented RMP table.
 *
 * Walks the RMP segment table (RST), maps every populated segment and
 * registers a descriptor for it. Afterwards rst_max_index is lowered to one
 * past the highest populated RST slot.
 *
 * Returns false if no RMP base was probed, if mapping/allocation fails, or if
 * the segments do not cover all of system RAM; everything set up so far is
 * released in that case.
 */
static bool __init setup_segmented_rmptable(void)
{
	u64 rst_pa, *rst, ram_limit, covered_end = 0;
	unsigned int idx, last_used = 0;

	if (!probed_rmp_base)
		return false;

	if (!alloc_rmp_segment_table())
		return false;

	/* The RST directly follows the CPU bookkeeping area */
	rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
	rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB);
	if (!rst) {
		pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa);
		goto err_free_table;
	}

	pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30);

	ram_limit = max_pfn << PAGE_SHIFT;

	for (idx = 0; idx < rst_max_index; idx++) {
		u64 seg_pa, seg_bytes, covered, cover_start;

		covered = RST_ENTRY_MAPPED_SIZE(rst[idx]);
		if (!covered)
			continue;

		last_used = idx;

		/*
		 * The mapped size is given in GB. It is allowed to exceed the
		 * segment coverage size, in which case it is reduced to the
		 * segment coverage size.
		 */
		covered <<= 30;
		if (covered > rmp_segment_size) {
			pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n",
				idx, covered, rmp_segment_size);
			covered = rmp_segment_size;
		}

		seg_pa = RST_ENTRY_SEGMENT_BASE(rst[idx]);

		/* 16 bytes of RMP are needed per page mapped */
		seg_bytes = PHYS_PFN(covered) << 4;

		/* Start of the physical range this segment covers */
		cover_start = (u64)idx << rmp_segment_shift;

		/*
		 * Segments above system RAM may exist for MMIO (used for
		 * Trusted I/O); those must not count towards RAM coverage.
		 */
		if (cover_start < ram_limit)
			covered_end = cover_start + covered;

		if (!alloc_rmp_segment_desc(seg_pa, seg_bytes, cover_start))
			goto err_unmap_rst;

		pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n",
			idx, seg_pa, seg_pa + seg_bytes - 1, cover_start, cover_start + covered - 1);
	}

	if (ram_limit > covered_end) {
		pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
		       ram_limit, covered_end);
		goto err_unmap_rst;
	}

	/* No slot beyond the last populated one needs to be looked at again */
	rst_max_index = last_used + 1;

	memunmap(rst);

	return true;

err_unmap_rst:
	memunmap(rst);

err_free_table:
	free_rmp_segment_table();

	return false;
}
493

494
/* Set up the RMP lookup structures for whichever RMP layout is in use. */
static bool __init setup_rmptable(void)
{
	bool segmented = rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED;

	return segmented ? setup_segmented_rmptable()
			 : setup_contiguous_rmptable();
}
502

503
/*
504
 * Do the necessary preparations which are verified by the firmware as
505
 * described in the SNP_INIT_EX firmware command description in the SNP
506
 * firmware ABI spec.
507
 */
508
int __init snp_rmptable_init(void)
509
{
510
	unsigned int i;
511
	u64 val;
512

513
	if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP)))
514
		return -ENOSYS;
515

516
	if (WARN_ON_ONCE(!amd_iommu_snp_en))
517
		return -ENOSYS;
518

519
	if (!setup_rmptable())
520
		return -ENOSYS;
521

522
	/*
523
	 * Check if SEV-SNP is already enabled, this can happen in case of
524
	 * kexec boot.
525
	 */
526
	rdmsrq(MSR_AMD64_SYSCFG, val);
527
	if (val & MSR_AMD64_SYSCFG_SNP_EN)
528
		goto skip_enable;
529

530
	/* Zero out the RMP bookkeeping area */
531
	if (!clear_rmptable_bookkeeping()) {
532
		free_rmp_segment_table();
533
		return -ENOSYS;
534
	}
535

536
	/* Zero out the RMP entries */
537
	for (i = 0; i < rst_max_index; i++) {
538
		struct rmp_segment_desc *desc;
539

540
		desc = rmp_segment_table[i];
541
		if (!desc)
542
			continue;
543

544
		memset(desc->rmp_entry, 0, desc->size);
545
	}
546

547
	/* Flush the caches to ensure that data is written before SNP is enabled. */
548
	wbinvd_on_all_cpus();
549

550
	/* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
551
	on_each_cpu(mfd_enable, NULL, 1);
552

553
	on_each_cpu(snp_enable, NULL, 1);
554

555
skip_enable:
556
	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
557

558
	/*
559
	 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
560
	 * notifier is invoked to do SNP IOMMU shutdown before kdump.
561
	 */
562
	crash_kexec_post_notifiers = true;
563

564
	return 0;
565
}
566

567
static void set_rmp_segment_info(unsigned int segment_shift)
568
{
569
	rmp_segment_shift = segment_shift;
570
	rmp_segment_size  = 1ULL << rmp_segment_shift;
571
	rmp_segment_mask  = rmp_segment_size - 1;
572
}
573

574
#define RMP_ADDR_MASK GENMASK_ULL(51, 13)
575

576
static bool probe_contiguous_rmptable_info(void)
577
{
578
	u64 rmp_sz, rmp_base, rmp_end;
579

580
	rdmsrq(MSR_AMD64_RMP_BASE, rmp_base);
581
	rdmsrq(MSR_AMD64_RMP_END, rmp_end);
582

583
	if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
584
		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
585
		return false;
586
	}
587

588
	if (rmp_base > rmp_end) {
589
		pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
590
		return false;
591
	}
592

593
	rmp_sz = rmp_end - rmp_base + 1;
594

595
	/* Treat the contiguous RMP table as a single segment */
596
	rst_max_index = 1;
597

598
	set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT);
599

600
	probed_rmp_base = rmp_base;
601
	probed_rmp_size = rmp_sz;
602

603
	pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
604
		rmp_base, rmp_end);
605

606
	return true;
607
}
608

609
static bool probe_segmented_rmptable_info(void)
610
{
611
	unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max;
612
	u64 rmp_base, rmp_end;
613

614
	rdmsrq(MSR_AMD64_RMP_BASE, rmp_base);
615
	if (!(rmp_base & RMP_ADDR_MASK)) {
616
		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
617
		return false;
618
	}
619

620
	rdmsrq(MSR_AMD64_RMP_END, rmp_end);
621
	WARN_ONCE(rmp_end & RMP_ADDR_MASK,
622
		  "Segmented RMP enabled but RMP_END MSR is non-zero\n");
623

624
	/* Obtain the min and max supported RMP segment size */
625
	eax = cpuid_eax(0x80000025);
626
	segment_shift_min = eax & GENMASK(5, 0);
627
	segment_shift_max = (eax & GENMASK(11, 6)) >> 6;
628

629
	/* Verify the segment size is within the supported limits */
630
	segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg);
631
	if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) {
632
		pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n",
633
		       segment_shift, segment_shift_min, segment_shift_max);
634
		return false;
635
	}
636

637
	/* Override the max supported RST index if a hardware limit exists */
638
	ebx = cpuid_ebx(0x80000025);
639
	if (ebx & BIT(10))
640
		rst_max_index = ebx & GENMASK(9, 0);
641

642
	set_rmp_segment_info(segment_shift);
643

644
	probed_rmp_base = rmp_base;
645
	probed_rmp_size = 0;
646

647
	pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n",
648
		rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE);
649

650
	return true;
651
}
652

653
bool snp_probe_rmptable_info(void)
654
{
655
	if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP))
656
		rdmsrq(MSR_AMD64_RMP_CFG, rmp_cfg);
657

658
	if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)
659
		return probe_segmented_rmptable_info();
660
	else
661
		return probe_contiguous_rmptable_info();
662
}
663

664
/*
665
 * About the array_index_nospec() usage below:
666
 *
667
 * This function can get called by exported functions like
668
 * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among
669
 * others, and since the @pfn passed in cannot always be trusted,
670
 * speculation should be stopped as a protective measure.
671
 */
672
static struct rmpentry_raw *get_raw_rmpentry(u64 pfn)
673
{
674
	u64 paddr, rst_index, segment_index;
675
	struct rmp_segment_desc *desc;
676

677
	if (!rmp_segment_table)
678
		return ERR_PTR(-ENODEV);
679

680
	paddr = pfn << PAGE_SHIFT;
681

682
	rst_index = RST_ENTRY_INDEX(paddr);
683
	if (unlikely(rst_index >= rst_max_index))
684
		return ERR_PTR(-EFAULT);
685

686
	rst_index = array_index_nospec(rst_index, rst_max_index);
687

688
	desc = rmp_segment_table[rst_index];
689
	if (unlikely(!desc))
690
		return ERR_PTR(-EFAULT);
691

692
	segment_index = RMP_ENTRY_INDEX(paddr);
693
	if (unlikely(segment_index >= desc->max_index))
694
		return ERR_PTR(-EFAULT);
695

696
	segment_index = array_index_nospec(segment_index, desc->max_index);
697

698
	return desc->rmp_entry + segment_index;
699
}
700

701
/*
 * Read the RMP entry for @pfn into @e.
 *
 * With RMPREAD support the instruction fills in @e and its result is
 * returned as is. Without it, the raw table entry is looked up and converted
 * to the RMPREAD output format; 0 is returned on success and a negative errno
 * if the raw entry cannot be located.
 */
static int get_rmpentry(u64 pfn, struct rmpentry *e)
{
	struct rmpentry_raw *raw;

	if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) {
		int ret;

		/* Binutils version 2.44 supports the RMPREAD mnemonic. */
		asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd"
			     : "=a" (ret)
			     : "a" (pfn << PAGE_SHIFT), "c" (e)
			     : "memory", "cc");

		return ret;
	}

	raw = get_raw_rmpentry(pfn);
	if (IS_ERR(raw))
		return PTR_ERR(raw);

	/*
	 * Convert the raw RMP table entry to the RMPREAD output format.
	 * hpage_region_status (the 2MB region status indicator) stays zero:
	 * computing it could be expensive and nothing uses the field.
	 */
	memset(e, 0, sizeof(*e));
	e->assigned  = raw->assigned;
	e->pagesize  = raw->pagesize;
	e->immutable = raw->immutable;
	e->asid      = raw->asid;
	e->gpa       = raw->gpa << PAGE_SHIFT;

	return 0;
}
736

737
/*
 * Look up the RMP entry for @pfn and the page level it is tracked at.
 *
 * @e receives the entry of @pfn itself. @level is derived from the entry of
 * the first PFN of the surrounding 2MB region, and is written only on success.
 *
 * Returns 0 on success, -ENODEV if the platform is not an SNP host, or the
 * error from reading either RMP entry.
 */
static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level)
{
	struct rmpentry region_entry;
	int err;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return -ENODEV;

	err = get_rmpentry(pfn, e);
	if (err)
		return err;

	/*
	 * The authoritative RMP entry for a PFN is either its own 4K entry or
	 * a special large entry that is authoritative for the whole 2M area.
	 * The page size therefore comes from the entry of the first PFN of
	 * the 2M area.
	 */
	err = get_rmpentry(pfn & PFN_PMD_MASK, &region_entry);
	if (err)
		return err;

	*level = RMP_TO_PG_LEVEL(region_entry.pagesize);

	return 0;
}
762

763
/*
 * Report whether @pfn is assigned in the RMP and at which page level it is
 * tracked.
 *
 * Returns 0 on success with @assigned and @level filled in, or a negative
 * errno from the lookup; @assigned is not written on failure.
 */
int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
{
	struct rmpentry entry;
	int err = __snp_lookup_rmpentry(pfn, &entry, level);

	if (!err)
		*assigned = !!entry.assigned;

	return err;
}
EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
776

777
/*
778
 * Dump the raw RMP entry for a particular PFN. These bits are documented in the
779
 * PPR for a particular CPU model and provide useful information about how a
780
 * particular PFN is being utilized by the kernel/firmware at the time certain
781
 * unexpected events occur, such as RMP faults.
782
 */
783
static void dump_rmpentry(u64 pfn)
784
{
785
	struct rmpentry_raw *e_raw;
786
	u64 pfn_i, pfn_end;
787
	struct rmpentry e;
788
	int level, ret;
789

790
	ret = __snp_lookup_rmpentry(pfn, &e, &level);
791
	if (ret) {
792
		pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n",
793
		       pfn, ret);
794
		return;
795
	}
796

797
	if (e.assigned) {
798
		e_raw = get_raw_rmpentry(pfn);
799
		if (IS_ERR(e_raw)) {
800
			pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n",
801
			       pfn, PTR_ERR(e_raw));
802
			return;
803
		}
804

805
		pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
806
			pfn, e_raw->lo, e_raw->hi);
807
		return;
808
	}
809

810
	/*
811
	 * If the RMP entry for a particular PFN is not in an assigned state,
812
	 * then it is sometimes useful to get an idea of whether or not any RMP
813
	 * entries for other PFNs within the same 2MB region are assigned, since
814
	 * those too can affect the ability to access a particular PFN in
815
	 * certain situations, such as when the PFN is being accessed via a 2MB
816
	 * mapping in the host page table.
817
	 */
818
	pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
819
	pfn_end = pfn_i + PTRS_PER_PMD;
820

821
	pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
822
		pfn, pfn_i, pfn_end);
823

824
	while (pfn_i < pfn_end) {
825
		e_raw = get_raw_rmpentry(pfn_i);
826
		if (IS_ERR(e_raw)) {
827
			pr_err("Error %ld reading RMP contents for PFN 0x%llx\n",
828
			       PTR_ERR(e_raw), pfn_i);
829
			pfn_i++;
830
			continue;
831
		}
832

833
		if (e_raw->lo || e_raw->hi)
834
			pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi);
835
		pfn_i++;
836
	}
837
}
838

839
void snp_dump_hva_rmpentry(unsigned long hva)
840
{
841
	unsigned long paddr;
842
	unsigned int level;
843
	pgd_t *pgd;
844
	pte_t *pte;
845

846
	pgd = __va(read_cr3_pa());
847
	pgd += pgd_index(hva);
848
	pte = lookup_address_in_pgd(pgd, hva, &level);
849

850
	if (!pte) {
851
		pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
852
		return;
853
	}
854

855
	paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
856
	dump_rmpentry(PHYS_PFN(paddr));
857
}
858

859
/*
860
 * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
861
 * Validated bit.
862
 */
863
int psmash(u64 pfn)
864
{
865
	unsigned long paddr = pfn << PAGE_SHIFT;
866
	int ret;
867

868
	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
869
		return -ENODEV;
870

871
	if (!pfn_valid(pfn))
872
		return -EINVAL;
873

874
	/* Binutils version 2.36 supports the PSMASH mnemonic. */
875
	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
876
		      : "=a" (ret)
877
		      : "a" (paddr)
878
		      : "memory", "cc");
879

880
	return ret;
881
}
882
EXPORT_SYMBOL_GPL(psmash);
883

884
/*
885
 * If the kernel uses a 2MB or larger directmap mapping to write to an address,
886
 * and that mapping contains any 4KB pages that are set to private in the RMP
887
 * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
888
 * owns the PFNs being transitioned will never attempt such a write, but other
889
 * kernel tasks writing to other PFNs in the range may trigger these checks
890
 * inadvertently due to a large directmap mapping that happens to overlap such a
891
 * PFN.
892
 *
893
 * Prevent this by splitting any 2MB+ mappings that might end up containing a
894
 * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
895
 * PFN/rmp_level passed in.
896
 *
897
 * Note that there is no attempt here to scan all the RMP entries for the 2MB
898
 * physical range, since it would only be worthwhile in determining if a
899
 * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
900
 * the same shared/private state, thus avoiding the need to split the mapping.
901
 * But that would mean the entries are currently in a mixed state, and so the
902
 * mapping would have already been split as a result of prior transitions.
903
 * And since the 4K split is only done if the mapping is 2MB+, and there isn't
904
 * currently a mechanism in place to restore 2MB+ mappings, such a check would
905
 * not provide any usable benefit.
906
 *
907
 * More specifics on how these checks are carried out can be found in APM
908
 * Volume 2, "RMP and VMPL Access Checks".
909
 */
910
/*
 * Split the direct map mapping of @pfn down to 4K if needed before its RMP
 * entry is updated at @rmp_level.
 *
 * Returns 0 if no split is needed or the split succeeded, -EINVAL for an
 * unsupported level or an invalid/misaligned PFN, or the error returned by
 * set_memory_4k().
 */
static int adjust_direct_map(u64 pfn, int rmp_level)
{
	unsigned long vaddr;
	unsigned int map_level;
	int npages, ret;
	pte_t *pte;

	/* Only 4KB/2MB RMP entries are supported by current hardware. */
	if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
		return -EINVAL;

	if (!pfn_valid(pfn))
		return -EINVAL;

	if (rmp_level == PG_LEVEL_2M) {
		/* The whole 2MB range must be aligned and valid */
		if (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1))
			return -EINVAL;

		/*
		 * When an entire 2MB physical range is transitioned, write
		 * accesses from overlapping mappings cannot cause RMP #PFs:
		 * even accesses from 1GB mappings are treated as 2MB accesses
		 * as far as RMP table checks are concerned.
		 */
		return 0;
	}

	/*
	 * pfn_to_kaddr() will return a vaddr only within the direct
	 * map range.
	 */
	vaddr = (unsigned long)pfn_to_kaddr(pfn);

	pte = lookup_address(vaddr, &map_level);
	if (!pte || pte_none(*pte))
		return 0;

	/* Already mapped with a 4K page, nothing to split */
	if (map_level == PG_LEVEL_4K)
		return 0;

	npages = page_level_size(rmp_level) / PAGE_SIZE;
	ret = set_memory_4k(vaddr, npages);
	if (ret)
		pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
			pfn, ret);

	return ret;
}
958

959
/*
960
 * It is expected that those operations are seldom enough so that no mutual
961
 * exclusion of updaters is needed and thus the overlap error condition below
962
 * should happen very rarely and would get resolved relatively quickly by
963
 * the firmware.
964
 *
965
 * If not, one could consider introducing a mutex or so here to sync concurrent
966
 * RMP updates and thus diminish the amount of cases where firmware needs to
967
 * lock 2M ranges to protect against concurrent updates.
968
 *
969
 * The optimal solution would be range locking to avoid locking disjoint
970
 * regions unnecessarily but there's no support for that yet.
971
 */
972
/*
 * Write @state into the RMP entry of @pfn using the RMPUPDATE instruction.
 *
 * The direct map is adjusted first. The instruction is retried for as long as
 * it reports RMPUPDATE_FAIL_OVERLAP. On any other failure the RMP entry and a
 * stack trace are dumped.
 *
 * Returns 0 on success, -ENODEV if the platform is not an SNP host, and
 * -EFAULT if the direct map adjustment or the RMPUPDATE fails.
 */
static int rmpupdate(u64 pfn, struct rmp_state *state)
{
	unsigned long paddr = pfn << PAGE_SHIFT;
	int level, ret;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return -ENODEV;

	level = RMP_TO_PG_LEVEL(state->pagesize);

	if (adjust_direct_map(pfn, level))
		return -EFAULT;

	for (;;) {
		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
			     : "=a" (ret)
			     : "a" (paddr), "c" ((unsigned long)state)
			     : "memory", "cc");

		if (ret != RMPUPDATE_FAIL_OVERLAP)
			break;
	}

	if (!ret)
		return 0;

	pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
	       pfn, level, ret);
	dump_rmpentry(pfn);
	dump_stack();

	return -EFAULT;
}
1003

1004
/* Transition a page to guest-owned/private state in the RMP table. */
1005
int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
1006
{
1007
	struct rmp_state state;
1008

1009
	memset(&state, 0, sizeof(state));
1010
	state.assigned = 1;
1011
	state.asid = asid;
1012
	state.immutable = immutable;
1013
	state.gpa = gpa;
1014
	state.pagesize = PG_LEVEL_TO_RMP(level);
1015

1016
	return rmpupdate(pfn, &state);
1017
}
1018
EXPORT_SYMBOL_GPL(rmp_make_private);
1019

1020
/* Transition a page to hypervisor-owned/shared state in the RMP table. */
1021
int rmp_make_shared(u64 pfn, enum pg_level level)
1022
{
1023
	struct rmp_state state;
1024

1025
	memset(&state, 0, sizeof(state));
1026
	state.pagesize = PG_LEVEL_TO_RMP(level);
1027

1028
	return rmpupdate(pfn, &state);
1029
}
1030
EXPORT_SYMBOL_GPL(rmp_make_shared);
1031

1032
/*
 * Record @npages pages starting at @pfn as leaked.
 *
 * Eligible pages are chained onto snp_leaked_pages_list, the RMP entry of
 * every page is dumped, and the leaked page counter is incremented once per
 * page.
 */
void snp_leak_pages(u64 pfn, unsigned int npages)
{
	struct page *pg = pfn_to_page(pfn);

	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);

	spin_lock(&snp_leaked_pages_list_lock);
	for (; npages--; pfn++, pg++) {
		/*
		 * The page's buddy list is reused for chaining into the
		 * leaked pages list. This page should not be on a free list
		 * currently and is also unsafe to be added to a free list.
		 *
		 * Tail pages of a compound page are never inserted, since
		 * page->buddy_list of tail pages is not usable.
		 *
		 * NOTE(review): npages has already been decremented here, so
		 * a head page is compared against the number of pages left
		 * after it; confirm this is the intended bound.
		 */
		if (likely(!PageCompound(pg)))
			list_add_tail(&pg->buddy_list, &snp_leaked_pages_list);
		else if (PageHead(pg) && compound_nr(pg) <= npages)
			list_add_tail(&pg->buddy_list, &snp_leaked_pages_list);

		dump_rmpentry(pfn);
		snp_nr_leaked_pages++;
	}
	spin_unlock(&snp_leaked_pages_list_lock);
}
EXPORT_SYMBOL_GPL(snp_leak_pages);
1063

1064
void kdump_sev_callback(void)
1065
{
1066
	/*
1067
	 * Do wbinvd() on remote CPUs when SNP is enabled in order to
1068
	 * safely do SNP_SHUTDOWN on the local CPU.
1069
	 */
1070
	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
1071
		wbinvd();
1072
}
1073

1074
Product

Resources

Company