GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/svm/sev.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Kernel-based Virtual Machine driver for Linux
4
*
5
* AMD SVM-SEV support
6
*
7
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
8
*/
9
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11
#include <linux/kvm_types.h>
12
#include <linux/kvm_host.h>
13
#include <linux/kernel.h>
14
#include <linux/highmem.h>
15
#include <linux/psp.h>
16
#include <linux/psp-sev.h>
17
#include <linux/pagemap.h>
18
#include <linux/swap.h>
19
#include <linux/misc_cgroup.h>
20
#include <linux/processor.h>
21
#include <linux/trace_events.h>
22
#include <uapi/linux/sev-guest.h>
23
24
#include <asm/pkru.h>
25
#include <asm/trapnr.h>
26
#include <asm/fpu/xcr.h>
27
#include <asm/fpu/xstate.h>
28
#include <asm/debugreg.h>
29
#include <asm/msr.h>
30
#include <asm/sev.h>
31
32
#include "mmu.h"
33
#include "x86.h"
34
#include "svm.h"
35
#include "svm_ops.h"
36
#include "cpuid.h"
37
#include "trace.h"
38
39
#define GHCB_VERSION_MAX 2ULL
40
#define GHCB_VERSION_DEFAULT 2ULL
41
#define GHCB_VERSION_MIN 1ULL
42
43
#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
44
45
/* enable/disable SEV support */
46
static bool sev_enabled = true;
47
module_param_named(sev, sev_enabled, bool, 0444);
48
49
/* enable/disable SEV-ES support */
50
static bool sev_es_enabled = true;
51
module_param_named(sev_es, sev_es_enabled, bool, 0444);
52
53
/* enable/disable SEV-SNP support */
54
static bool sev_snp_enabled = true;
55
module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
56
57
/* enable/disable SEV-ES DebugSwap support */
58
static bool sev_es_debug_swap_enabled = true;
59
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
60
static u64 sev_supported_vmsa_features;
61
62
#define AP_RESET_HOLD_NONE 0
63
#define AP_RESET_HOLD_NAE_EVENT 1
64
#define AP_RESET_HOLD_MSR_PROTO 2
65
66
/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
67
#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
68
#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
69
#define SNP_POLICY_MASK_SMT BIT_ULL(16)
70
#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
71
#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
72
#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
73
74
#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
75
SNP_POLICY_MASK_API_MAJOR | \
76
SNP_POLICY_MASK_SMT | \
77
SNP_POLICY_MASK_RSVD_MBO | \
78
SNP_POLICY_MASK_DEBUG | \
79
SNP_POLICY_MASK_SINGLE_SOCKET)
80
81
#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
82
83
static u8 sev_enc_bit;
84
static DECLARE_RWSEM(sev_deactivate_lock);
85
static DEFINE_MUTEX(sev_bitmap_lock);
86
unsigned int max_sev_asid;
87
static unsigned int min_sev_asid;
88
static unsigned long sev_me_mask;
89
static unsigned int nr_asids;
90
static unsigned long *sev_asid_bitmap;
91
static unsigned long *sev_reclaim_asid_bitmap;
92
93
static int snp_decommission_context(struct kvm *kvm);
94
95
struct enc_region {
96
struct list_head list;
97
unsigned long npages;
98
struct page **pages;
99
unsigned long uaddr;
100
unsigned long size;
101
};
102
103
/* Called with the sev_bitmap_lock held, or on shutdown */
104
static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
105
{
106
int ret, error = 0;
107
unsigned int asid;
108
109
/* Check if there are any ASIDs to reclaim before performing a flush */
110
asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
111
if (asid > max_asid)
112
return -EBUSY;
113
114
/*
115
* DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
116
* so it must be guarded.
117
*/
118
down_write(&sev_deactivate_lock);
119
120
/* SNP firmware requires use of WBINVD for ASID recycling. */
121
wbinvd_on_all_cpus();
122
123
if (sev_snp_enabled)
124
ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
125
else
126
ret = sev_guest_df_flush(&error);
127
128
up_write(&sev_deactivate_lock);
129
130
if (ret)
131
pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
132
sev_snp_enabled ? "-SNP" : "", ret, error);
133
134
return ret;
135
}
136
137
static inline bool is_mirroring_enc_context(struct kvm *kvm)
138
{
139
return !!to_kvm_sev_info(kvm)->enc_context_owner;
140
}
141
142
static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
143
{
144
struct kvm_vcpu *vcpu = &svm->vcpu;
145
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
146
147
return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
148
}
149
150
/* Must be called with the sev_bitmap_lock held */
151
static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
152
{
153
if (sev_flush_asids(min_asid, max_asid))
154
return false;
155
156
/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
157
bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
158
nr_asids);
159
bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
160
161
return true;
162
}
163
164
static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
165
{
166
enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
167
return misc_cg_try_charge(type, sev->misc_cg, 1);
168
}
169
170
static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
171
{
172
enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
173
misc_cg_uncharge(type, sev->misc_cg, 1);
174
}
175
176
static int sev_asid_new(struct kvm_sev_info *sev)
177
{
178
/*
179
* SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
180
* SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
181
* Note: min ASID can end up larger than the max if basic SEV support is
182
* effectively disabled by disallowing use of ASIDs for SEV guests.
183
*/
184
unsigned int min_asid = sev->es_active ? 1 : min_sev_asid;
185
unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
186
unsigned int asid;
187
bool retry = true;
188
int ret;
189
190
if (min_asid > max_asid)
191
return -ENOTTY;
192
193
WARN_ON(sev->misc_cg);
194
sev->misc_cg = get_current_misc_cg();
195
ret = sev_misc_cg_try_charge(sev);
196
if (ret) {
197
put_misc_cg(sev->misc_cg);
198
sev->misc_cg = NULL;
199
return ret;
200
}
201
202
mutex_lock(&sev_bitmap_lock);
203
204
again:
205
asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
206
if (asid > max_asid) {
207
if (retry && __sev_recycle_asids(min_asid, max_asid)) {
208
retry = false;
209
goto again;
210
}
211
mutex_unlock(&sev_bitmap_lock);
212
ret = -EBUSY;
213
goto e_uncharge;
214
}
215
216
__set_bit(asid, sev_asid_bitmap);
217
218
mutex_unlock(&sev_bitmap_lock);
219
220
sev->asid = asid;
221
return 0;
222
e_uncharge:
223
sev_misc_cg_uncharge(sev);
224
put_misc_cg(sev->misc_cg);
225
sev->misc_cg = NULL;
226
return ret;
227
}
228
229
static unsigned int sev_get_asid(struct kvm *kvm)
230
{
231
return to_kvm_sev_info(kvm)->asid;
232
}
233
234
static void sev_asid_free(struct kvm_sev_info *sev)
235
{
236
struct svm_cpu_data *sd;
237
int cpu;
238
239
mutex_lock(&sev_bitmap_lock);
240
241
__set_bit(sev->asid, sev_reclaim_asid_bitmap);
242
243
for_each_possible_cpu(cpu) {
244
sd = per_cpu_ptr(&svm_data, cpu);
245
sd->sev_vmcbs[sev->asid] = NULL;
246
}
247
248
mutex_unlock(&sev_bitmap_lock);
249
250
sev_misc_cg_uncharge(sev);
251
put_misc_cg(sev->misc_cg);
252
sev->misc_cg = NULL;
253
}
254
255
static void sev_decommission(unsigned int handle)
256
{
257
struct sev_data_decommission decommission;
258
259
if (!handle)
260
return;
261
262
decommission.handle = handle;
263
sev_guest_decommission(&decommission, NULL);
264
}
265
266
/*
267
* Transition a page to hypervisor-owned/shared state in the RMP table. This
268
* should not fail under normal conditions, but leak the page should that
269
* happen since it will no longer be usable by the host due to RMP protections.
270
*/
271
static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
272
{
273
if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
274
snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
275
return -EIO;
276
}
277
278
return 0;
279
}
280
281
/*
282
* Certain page-states, such as Pre-Guest and Firmware pages (as documented
283
* in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
284
* directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
285
* unless they are reclaimed first.
286
*
287
* Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
288
* might not be usable by the host due to being set as immutable or still
289
* being associated with a guest ASID.
290
*
291
* Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
292
* converted back to shared, as the page is no longer usable due to RMP
293
* protections, and it's infeasible for the guest to continue on.
294
*/
295
static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
296
{
297
struct sev_data_snp_page_reclaim data = {0};
298
int fw_err, rc;
299
300
data.paddr = __sme_set(pfn << PAGE_SHIFT);
301
rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
302
if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
303
snp_leak_pages(pfn, 1);
304
return -EIO;
305
}
306
307
if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
308
return -EIO;
309
310
return rc;
311
}
312
313
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
314
{
315
struct sev_data_deactivate deactivate;
316
317
if (!handle)
318
return;
319
320
deactivate.handle = handle;
321
322
/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
323
down_read(&sev_deactivate_lock);
324
sev_guest_deactivate(&deactivate, NULL);
325
up_read(&sev_deactivate_lock);
326
327
sev_decommission(handle);
328
}
329
330
/*
331
* This sets up bounce buffers/firmware pages to handle SNP Guest Request
332
* messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
333
* 2.0 specification for more details.
334
*
335
* Technically, when an SNP Guest Request is issued, the guest will provide its
336
* own request/response pages, which could in theory be passed along directly
337
* to firmware rather than using bounce pages. However, these pages would need
338
* special care:
339
*
340
* - Both pages are from shared guest memory, so they need to be protected
341
* from migration/etc. occurring while firmware reads/writes to them. At a
342
* minimum, this requires elevating the ref counts and potentially needing
343
* an explicit pinning of the memory. This places additional restrictions
344
* on what type of memory backends userspace can use for shared guest
345
* memory since there is some reliance on using refcounted pages.
346
*
347
* - The response page needs to be switched to Firmware-owned[1] state
348
* before the firmware can write to it, which can lead to potential
349
* host RMP #PFs if the guest is misbehaved and hands the host a
350
* guest page that KVM might write to for other reasons (e.g. virtio
351
* buffers/etc.).
352
*
353
* Both of these issues can be avoided completely by using separately-allocated
354
* bounce pages for both the request/response pages and passing those to
355
* firmware instead. So that's what is being set up here.
356
*
357
* Guest requests rely on message sequence numbers to ensure requests are
358
* issued to firmware in the order the guest issues them, so concurrent guest
359
* requests generally shouldn't happen. But a misbehaved guest could issue
360
* concurrent guest requests in theory, so a mutex is used to serialize
361
* access to the bounce buffers.
362
*
363
* [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
364
* details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
365
* in the APM for details on the related RMP restrictions.
366
*/
367
static int snp_guest_req_init(struct kvm *kvm)
368
{
369
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
370
struct page *req_page;
371
372
req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
373
if (!req_page)
374
return -ENOMEM;
375
376
sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
377
if (!sev->guest_resp_buf) {
378
__free_page(req_page);
379
return -EIO;
380
}
381
382
sev->guest_req_buf = page_address(req_page);
383
mutex_init(&sev->guest_req_mutex);
384
385
return 0;
386
}
387
388
static void snp_guest_req_cleanup(struct kvm *kvm)
389
{
390
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
391
392
if (sev->guest_resp_buf)
393
snp_free_firmware_page(sev->guest_resp_buf);
394
395
if (sev->guest_req_buf)
396
__free_page(virt_to_page(sev->guest_req_buf));
397
398
sev->guest_req_buf = NULL;
399
sev->guest_resp_buf = NULL;
400
}
401
402
static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
403
struct kvm_sev_init *data,
404
unsigned long vm_type)
405
{
406
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
407
struct sev_platform_init_args init_args = {0};
408
bool es_active = vm_type != KVM_X86_SEV_VM;
409
u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
410
int ret;
411
412
if (kvm->created_vcpus)
413
return -EINVAL;
414
415
if (data->flags)
416
return -EINVAL;
417
418
if (data->vmsa_features & ~valid_vmsa_features)
419
return -EINVAL;
420
421
if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
422
return -EINVAL;
423
424
if (unlikely(sev->active))
425
return -EINVAL;
426
427
sev->active = true;
428
sev->es_active = es_active;
429
sev->vmsa_features = data->vmsa_features;
430
sev->ghcb_version = data->ghcb_version;
431
432
/*
433
* Currently KVM supports the full range of mandatory features defined
434
* by version 2 of the GHCB protocol, so default to that for SEV-ES
435
* guests created via KVM_SEV_INIT2.
436
*/
437
if (sev->es_active && !sev->ghcb_version)
438
sev->ghcb_version = GHCB_VERSION_DEFAULT;
439
440
if (vm_type == KVM_X86_SNP_VM)
441
sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
442
443
ret = sev_asid_new(sev);
444
if (ret)
445
goto e_no_asid;
446
447
init_args.probe = false;
448
ret = sev_platform_init(&init_args);
449
if (ret)
450
goto e_free_asid;
451
452
if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
453
ret = -ENOMEM;
454
goto e_free_asid;
455
}
456
457
/* This needs to happen after SEV/SNP firmware initialization. */
458
if (vm_type == KVM_X86_SNP_VM) {
459
ret = snp_guest_req_init(kvm);
460
if (ret)
461
goto e_free;
462
}
463
464
INIT_LIST_HEAD(&sev->regions_list);
465
INIT_LIST_HEAD(&sev->mirror_vms);
466
sev->need_init = false;
467
468
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
469
470
return 0;
471
472
e_free:
473
free_cpumask_var(sev->have_run_cpus);
474
e_free_asid:
475
argp->error = init_args.error;
476
sev_asid_free(sev);
477
sev->asid = 0;
478
e_no_asid:
479
sev->vmsa_features = 0;
480
sev->es_active = false;
481
sev->active = false;
482
return ret;
483
}
484
485
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
486
{
487
struct kvm_sev_init data = {
488
.vmsa_features = 0,
489
.ghcb_version = 0,
490
};
491
unsigned long vm_type;
492
493
if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
494
return -EINVAL;
495
496
vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
497
498
/*
499
* KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
500
* continue to only ever support the minimal GHCB protocol version.
501
*/
502
if (vm_type == KVM_X86_SEV_ES_VM)
503
data.ghcb_version = GHCB_VERSION_MIN;
504
505
return __sev_guest_init(kvm, argp, &data, vm_type);
506
}
507
508
static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
509
{
510
struct kvm_sev_init data;
511
512
if (!to_kvm_sev_info(kvm)->need_init)
513
return -EINVAL;
514
515
if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
516
kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
517
kvm->arch.vm_type != KVM_X86_SNP_VM)
518
return -EINVAL;
519
520
if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
521
return -EFAULT;
522
523
return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
524
}
525
526
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
527
{
528
unsigned int asid = sev_get_asid(kvm);
529
struct sev_data_activate activate;
530
int ret;
531
532
/* activate ASID on the given handle */
533
activate.handle = handle;
534
activate.asid = asid;
535
ret = sev_guest_activate(&activate, error);
536
537
return ret;
538
}
539
540
static int __sev_issue_cmd(int fd, int id, void *data, int *error)
541
{
542
CLASS(fd, f)(fd);
543
544
if (fd_empty(f))
545
return -EBADF;
546
547
return sev_issue_cmd_external_user(fd_file(f), id, data, error);
548
}
549
550
static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
551
{
552
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
553
554
return __sev_issue_cmd(sev->fd, id, data, error);
555
}
556
557
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
558
{
559
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
560
struct sev_data_launch_start start;
561
struct kvm_sev_launch_start params;
562
void *dh_blob, *session_blob;
563
int *error = &argp->error;
564
int ret;
565
566
if (!sev_guest(kvm))
567
return -ENOTTY;
568
569
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
570
return -EFAULT;
571
572
sev->policy = params.policy;
573
574
memset(&start, 0, sizeof(start));
575
576
dh_blob = NULL;
577
if (params.dh_uaddr) {
578
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
579
if (IS_ERR(dh_blob))
580
return PTR_ERR(dh_blob);
581
582
start.dh_cert_address = __sme_set(__pa(dh_blob));
583
start.dh_cert_len = params.dh_len;
584
}
585
586
session_blob = NULL;
587
if (params.session_uaddr) {
588
session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
589
if (IS_ERR(session_blob)) {
590
ret = PTR_ERR(session_blob);
591
goto e_free_dh;
592
}
593
594
start.session_address = __sme_set(__pa(session_blob));
595
start.session_len = params.session_len;
596
}
597
598
start.handle = params.handle;
599
start.policy = params.policy;
600
601
/* create memory encryption context */
602
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
603
if (ret)
604
goto e_free_session;
605
606
/* Bind ASID to this guest */
607
ret = sev_bind_asid(kvm, start.handle, error);
608
if (ret) {
609
sev_decommission(start.handle);
610
goto e_free_session;
611
}
612
613
/* return handle to userspace */
614
params.handle = start.handle;
615
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
616
sev_unbind_asid(kvm, start.handle);
617
ret = -EFAULT;
618
goto e_free_session;
619
}
620
621
sev->handle = start.handle;
622
sev->fd = argp->sev_fd;
623
624
e_free_session:
625
kfree(session_blob);
626
e_free_dh:
627
kfree(dh_blob);
628
return ret;
629
}
630
631
static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
632
unsigned long ulen, unsigned long *n,
633
unsigned int flags)
634
{
635
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
636
unsigned long npages, size;
637
int npinned;
638
unsigned long locked, lock_limit;
639
struct page **pages;
640
unsigned long first, last;
641
int ret;
642
643
lockdep_assert_held(&kvm->lock);
644
645
if (ulen == 0 || uaddr + ulen < uaddr)
646
return ERR_PTR(-EINVAL);
647
648
/* Calculate number of pages. */
649
first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
650
last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
651
npages = (last - first + 1);
652
653
locked = sev->pages_locked + npages;
654
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
655
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
656
pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
657
return ERR_PTR(-ENOMEM);
658
}
659
660
if (WARN_ON_ONCE(npages > INT_MAX))
661
return ERR_PTR(-EINVAL);
662
663
/* Avoid using vmalloc for smaller buffers. */
664
size = npages * sizeof(struct page *);
665
if (size > PAGE_SIZE)
666
pages = __vmalloc(size, GFP_KERNEL_ACCOUNT);
667
else
668
pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
669
670
if (!pages)
671
return ERR_PTR(-ENOMEM);
672
673
/* Pin the user virtual address. */
674
npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
675
if (npinned != npages) {
676
pr_err("SEV: Failure locking %lu pages.\n", npages);
677
ret = -ENOMEM;
678
goto err;
679
}
680
681
*n = npages;
682
sev->pages_locked = locked;
683
684
return pages;
685
686
err:
687
if (npinned > 0)
688
unpin_user_pages(pages, npinned);
689
690
kvfree(pages);
691
return ERR_PTR(ret);
692
}
693
694
static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
695
unsigned long npages)
696
{
697
unpin_user_pages(pages, npages);
698
kvfree(pages);
699
to_kvm_sev_info(kvm)->pages_locked -= npages;
700
}
701
702
static void sev_clflush_pages(struct page *pages[], unsigned long npages)
703
{
704
uint8_t *page_virtual;
705
unsigned long i;
706
707
if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
708
pages == NULL)
709
return;
710
711
for (i = 0; i < npages; i++) {
712
page_virtual = kmap_local_page(pages[i]);
713
clflush_cache_range(page_virtual, PAGE_SIZE);
714
kunmap_local(page_virtual);
715
cond_resched();
716
}
717
}
718
719
static void sev_writeback_caches(struct kvm *kvm)
720
{
721
/*
722
* Ensure that all dirty guest tagged cache entries are written back
723
* before releasing the pages back to the system for use. CLFLUSH will
724
* not do this without SME_COHERENT, and flushing many cache lines
725
* individually is slower than blasting WBINVD for large VMs, so issue
726
* WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
727
* on CPUs that have done VMRUN, i.e. may have dirtied data using the
728
* VM's ASID.
729
*
730
* For simplicity, never remove CPUs from the bitmap. Ideally, KVM
731
* would clear the mask when flushing caches, but doing so requires
732
* serializing multiple calls and having responding CPUs (to the IPI)
733
* mark themselves as still running if they are running (or about to
734
* run) a vCPU for the VM.
735
*
736
* Note, the caller is responsible for ensuring correctness if the mask
737
* can be modified, e.g. if a CPU could be doing VMRUN.
738
*/
739
wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
740
}
741
742
static unsigned long get_num_contig_pages(unsigned long idx,
743
struct page **inpages, unsigned long npages)
744
{
745
unsigned long paddr, next_paddr;
746
unsigned long i = idx + 1, pages = 1;
747
748
/* find the number of contiguous pages starting from idx */
749
paddr = __sme_page_pa(inpages[idx]);
750
while (i < npages) {
751
next_paddr = __sme_page_pa(inpages[i++]);
752
if ((paddr + PAGE_SIZE) == next_paddr) {
753
pages++;
754
paddr = next_paddr;
755
continue;
756
}
757
break;
758
}
759
760
return pages;
761
}
762
763
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
764
{
765
unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
766
struct kvm_sev_launch_update_data params;
767
struct sev_data_launch_update_data data;
768
struct page **inpages;
769
int ret;
770
771
if (!sev_guest(kvm))
772
return -ENOTTY;
773
774
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
775
return -EFAULT;
776
777
vaddr = params.uaddr;
778
size = params.len;
779
vaddr_end = vaddr + size;
780
781
/* Lock the user memory. */
782
inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
783
if (IS_ERR(inpages))
784
return PTR_ERR(inpages);
785
786
/*
787
* Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
788
* place; the cache may contain the data that was written unencrypted.
789
*/
790
sev_clflush_pages(inpages, npages);
791
792
data.reserved = 0;
793
data.handle = to_kvm_sev_info(kvm)->handle;
794
795
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
796
int offset, len;
797
798
/*
799
* If the user buffer is not page-aligned, calculate the offset
800
* within the page.
801
*/
802
offset = vaddr & (PAGE_SIZE - 1);
803
804
/* Calculate the number of pages that can be encrypted in one go. */
805
pages = get_num_contig_pages(i, inpages, npages);
806
807
len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
808
809
data.len = len;
810
data.address = __sme_page_pa(inpages[i]) + offset;
811
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
812
if (ret)
813
goto e_unpin;
814
815
size -= len;
816
next_vaddr = vaddr + len;
817
}
818
819
e_unpin:
820
/* content of memory is updated, mark pages dirty */
821
for (i = 0; i < npages; i++) {
822
set_page_dirty_lock(inpages[i]);
823
mark_page_accessed(inpages[i]);
824
}
825
/* unlock the user pages */
826
sev_unpin_memory(kvm, inpages, npages);
827
return ret;
828
}
829
830
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
831
{
832
struct kvm_vcpu *vcpu = &svm->vcpu;
833
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
834
struct sev_es_save_area *save = svm->sev_es.vmsa;
835
struct xregs_state *xsave;
836
const u8 *s;
837
u8 *d;
838
int i;
839
840
/* Check some debug related fields before encrypting the VMSA */
841
if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
842
return -EINVAL;
843
844
/*
845
* SEV-ES will use a VMSA that is pointed to by the VMCB, not
846
* the traditional VMSA that is part of the VMCB. Copy the
847
* traditional VMSA as it has been built so far (in prep
848
* for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
849
*/
850
memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
851
852
/* Sync registgers */
853
save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
854
save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
855
save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
856
save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
857
save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
858
save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
859
save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
860
save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
861
#ifdef CONFIG_X86_64
862
save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
863
save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
864
save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
865
save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
866
save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
867
save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
868
save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
869
save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
870
#endif
871
save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
872
873
/* Sync some non-GPR registers before encrypting */
874
save->xcr0 = svm->vcpu.arch.xcr0;
875
save->pkru = svm->vcpu.arch.pkru;
876
save->xss = svm->vcpu.arch.ia32_xss;
877
save->dr6 = svm->vcpu.arch.dr6;
878
879
save->sev_features = sev->vmsa_features;
880
881
/*
882
* Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
883
* breaking older measurements.
884
*/
885
if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
886
xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
887
save->x87_dp = xsave->i387.rdp;
888
save->mxcsr = xsave->i387.mxcsr;
889
save->x87_ftw = xsave->i387.twd;
890
save->x87_fsw = xsave->i387.swd;
891
save->x87_fcw = xsave->i387.cwd;
892
save->x87_fop = xsave->i387.fop;
893
save->x87_ds = 0;
894
save->x87_cs = 0;
895
save->x87_rip = xsave->i387.rip;
896
897
for (i = 0; i < 8; i++) {
898
/*
899
* The format of the x87 save area is undocumented and
900
* definitely not what you would expect. It consists of
901
* an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
902
* area with bytes 8-9 of each register.
903
*/
904
d = save->fpreg_x87 + i * 8;
905
s = ((u8 *)xsave->i387.st_space) + i * 16;
906
memcpy(d, s, 8);
907
save->fpreg_x87[64 + i * 2] = s[8];
908
save->fpreg_x87[64 + i * 2 + 1] = s[9];
909
}
910
memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
911
912
s = get_xsave_addr(xsave, XFEATURE_YMM);
913
if (s)
914
memcpy(save->fpreg_ymm, s, 256);
915
else
916
memset(save->fpreg_ymm, 0, 256);
917
}
918
919
pr_debug("Virtual Machine Save Area (VMSA):\n");
920
print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
921
922
return 0;
923
}
924
925
static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
926
int *error)
927
{
928
struct sev_data_launch_update_vmsa vmsa;
929
struct vcpu_svm *svm = to_svm(vcpu);
930
int ret;
931
932
if (vcpu->guest_debug) {
933
pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
934
return -EINVAL;
935
}
936
937
/* Perform some pre-encryption checks against the VMSA */
938
ret = sev_es_sync_vmsa(svm);
939
if (ret)
940
return ret;
941
942
/*
943
* The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
944
* the VMSA memory content (i.e it will write the same memory region
945
* with the guest's key), so invalidate it first.
946
*/
947
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
948
949
vmsa.reserved = 0;
950
vmsa.handle = to_kvm_sev_info(kvm)->handle;
951
vmsa.address = __sme_pa(svm->sev_es.vmsa);
952
vmsa.len = PAGE_SIZE;
953
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
954
if (ret)
955
return ret;
956
957
/*
958
* SEV-ES guests maintain an encrypted version of their FPU
959
* state which is restored and saved on VMRUN and VMEXIT.
960
* Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
961
* do xsave/xrstor on it.
962
*/
963
fpstate_set_confidential(&vcpu->arch.guest_fpu);
964
vcpu->arch.guest_state_protected = true;
965
966
/*
967
* SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it
968
* only after setting guest_state_protected because KVM_SET_MSRS allows
969
* dynamic toggling of LBRV (for performance reason) on write access to
970
* MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
971
*/
972
svm_enable_lbrv(vcpu);
973
return 0;
974
}
975
976
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
977
{
978
struct kvm_vcpu *vcpu;
979
unsigned long i;
980
int ret;
981
982
if (!sev_es_guest(kvm))
983
return -ENOTTY;
984
985
kvm_for_each_vcpu(i, vcpu, kvm) {
986
ret = mutex_lock_killable(&vcpu->mutex);
987
if (ret)
988
return ret;
989
990
ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
991
992
mutex_unlock(&vcpu->mutex);
993
if (ret)
994
return ret;
995
}
996
997
return 0;
998
}
999
1000
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
1001
{
1002
void __user *measure = u64_to_user_ptr(argp->data);
1003
struct sev_data_launch_measure data;
1004
struct kvm_sev_launch_measure params;
1005
void __user *p = NULL;
1006
void *blob = NULL;
1007
int ret;
1008
1009
if (!sev_guest(kvm))
1010
return -ENOTTY;
1011
1012
if (copy_from_user(&params, measure, sizeof(params)))
1013
return -EFAULT;
1014
1015
memset(&data, 0, sizeof(data));
1016
1017
/* User wants to query the blob length */
1018
if (!params.len)
1019
goto cmd;
1020
1021
p = u64_to_user_ptr(params.uaddr);
1022
if (p) {
1023
if (params.len > SEV_FW_BLOB_MAX_SIZE)
1024
return -EINVAL;
1025
1026
blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
1027
if (!blob)
1028
return -ENOMEM;
1029
1030
data.address = __psp_pa(blob);
1031
data.len = params.len;
1032
}
1033
1034
cmd:
1035
data.handle = to_kvm_sev_info(kvm)->handle;
1036
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
1037
1038
/*
1039
* If we query the session length, FW responded with expected data.
1040
*/
1041
if (!params.len)
1042
goto done;
1043
1044
if (ret)
1045
goto e_free_blob;
1046
1047
if (blob) {
1048
if (copy_to_user(p, blob, params.len))
1049
ret = -EFAULT;
1050
}
1051
1052
done:
1053
params.len = data.len;
1054
if (copy_to_user(measure, &params, sizeof(params)))
1055
ret = -EFAULT;
1056
e_free_blob:
1057
kfree(blob);
1058
return ret;
1059
}
1060
1061
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
1062
{
1063
struct sev_data_launch_finish data;
1064
1065
if (!sev_guest(kvm))
1066
return -ENOTTY;
1067
1068
data.handle = to_kvm_sev_info(kvm)->handle;
1069
return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
1070
}
1071
1072
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
1073
{
1074
struct kvm_sev_guest_status params;
1075
struct sev_data_guest_status data;
1076
int ret;
1077
1078
if (!sev_guest(kvm))
1079
return -ENOTTY;
1080
1081
memset(&data, 0, sizeof(data));
1082
1083
data.handle = to_kvm_sev_info(kvm)->handle;
1084
ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
1085
if (ret)
1086
return ret;
1087
1088
params.policy = data.policy;
1089
params.state = data.state;
1090
params.handle = data.handle;
1091
1092
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
1093
ret = -EFAULT;
1094
1095
return ret;
1096
}
1097
1098
static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
1099
unsigned long dst, int size,
1100
int *error, bool enc)
1101
{
1102
struct sev_data_dbg data;
1103
1104
data.reserved = 0;
1105
data.handle = to_kvm_sev_info(kvm)->handle;
1106
data.dst_addr = dst;
1107
data.src_addr = src;
1108
data.len = size;
1109
1110
return sev_issue_cmd(kvm,
1111
enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
1112
&data, error);
1113
}
1114
1115
static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
1116
unsigned long dst_paddr, int sz, int *err)
1117
{
1118
int offset;
1119
1120
/*
1121
* Its safe to read more than we are asked, caller should ensure that
1122
* destination has enough space.
1123
*/
1124
offset = src_paddr & 15;
1125
src_paddr = round_down(src_paddr, 16);
1126
sz = round_up(sz + offset, 16);
1127
1128
return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
1129
}
1130
1131
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
1132
void __user *dst_uaddr,
1133
unsigned long dst_paddr,
1134
int size, int *err)
1135
{
1136
struct page *tpage = NULL;
1137
int ret, offset;
1138
1139
/* if inputs are not 16-byte then use intermediate buffer */
1140
if (!IS_ALIGNED(dst_paddr, 16) ||
1141
!IS_ALIGNED(paddr, 16) ||
1142
!IS_ALIGNED(size, 16)) {
1143
tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1144
if (!tpage)
1145
return -ENOMEM;
1146
1147
dst_paddr = __sme_page_pa(tpage);
1148
}
1149
1150
ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
1151
if (ret)
1152
goto e_free;
1153
1154
if (tpage) {
1155
offset = paddr & 15;
1156
if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
1157
ret = -EFAULT;
1158
}
1159
1160
e_free:
1161
if (tpage)
1162
__free_page(tpage);
1163
1164
return ret;
1165
}
1166
1167
static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
1168
void __user *vaddr,
1169
unsigned long dst_paddr,
1170
void __user *dst_vaddr,
1171
int size, int *error)
1172
{
1173
struct page *src_tpage = NULL;
1174
struct page *dst_tpage = NULL;
1175
int ret, len = size;
1176
1177
/* If source buffer is not aligned then use an intermediate buffer */
1178
if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
1179
src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
1180
if (!src_tpage)
1181
return -ENOMEM;
1182
1183
if (copy_from_user(page_address(src_tpage), vaddr, size)) {
1184
__free_page(src_tpage);
1185
return -EFAULT;
1186
}
1187
1188
paddr = __sme_page_pa(src_tpage);
1189
}
1190
1191
/*
1192
* If destination buffer or length is not aligned then do read-modify-write:
1193
* - decrypt destination in an intermediate buffer
1194
* - copy the source buffer in an intermediate buffer
1195
* - use the intermediate buffer as source buffer
1196
*/
1197
if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
1198
int dst_offset;
1199
1200
dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
1201
if (!dst_tpage) {
1202
ret = -ENOMEM;
1203
goto e_free;
1204
}
1205
1206
ret = __sev_dbg_decrypt(kvm, dst_paddr,
1207
__sme_page_pa(dst_tpage), size, error);
1208
if (ret)
1209
goto e_free;
1210
1211
/*
1212
* If source is kernel buffer then use memcpy() otherwise
1213
* copy_from_user().
1214
*/
1215
dst_offset = dst_paddr & 15;
1216
1217
if (src_tpage)
1218
memcpy(page_address(dst_tpage) + dst_offset,
1219
page_address(src_tpage), size);
1220
else {
1221
if (copy_from_user(page_address(dst_tpage) + dst_offset,
1222
vaddr, size)) {
1223
ret = -EFAULT;
1224
goto e_free;
1225
}
1226
}
1227
1228
paddr = __sme_page_pa(dst_tpage);
1229
dst_paddr = round_down(dst_paddr, 16);
1230
len = round_up(size, 16);
1231
}
1232
1233
ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
1234
1235
e_free:
1236
if (src_tpage)
1237
__free_page(src_tpage);
1238
if (dst_tpage)
1239
__free_page(dst_tpage);
1240
return ret;
1241
}
1242
1243
static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
1244
{
1245
unsigned long vaddr, vaddr_end, next_vaddr;
1246
unsigned long dst_vaddr;
1247
struct page **src_p, **dst_p;
1248
struct kvm_sev_dbg debug;
1249
unsigned long n;
1250
unsigned int size;
1251
int ret;
1252
1253
if (!sev_guest(kvm))
1254
return -ENOTTY;
1255
1256
if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
1257
return -EFAULT;
1258
1259
if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
1260
return -EINVAL;
1261
if (!debug.dst_uaddr)
1262
return -EINVAL;
1263
1264
vaddr = debug.src_uaddr;
1265
size = debug.len;
1266
vaddr_end = vaddr + size;
1267
dst_vaddr = debug.dst_uaddr;
1268
1269
for (; vaddr < vaddr_end; vaddr = next_vaddr) {
1270
int len, s_off, d_off;
1271
1272
/* lock userspace source and destination page */
1273
src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
1274
if (IS_ERR(src_p))
1275
return PTR_ERR(src_p);
1276
1277
dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
1278
if (IS_ERR(dst_p)) {
1279
sev_unpin_memory(kvm, src_p, n);
1280
return PTR_ERR(dst_p);
1281
}
1282
1283
/*
1284
* Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
1285
* the pages; flush the destination too so that future accesses do not
1286
* see stale data.
1287
*/
1288
sev_clflush_pages(src_p, 1);
1289
sev_clflush_pages(dst_p, 1);
1290
1291
/*
1292
* Since user buffer may not be page aligned, calculate the
1293
* offset within the page.
1294
*/
1295
s_off = vaddr & ~PAGE_MASK;
1296
d_off = dst_vaddr & ~PAGE_MASK;
1297
len = min_t(size_t, (PAGE_SIZE - s_off), size);
1298
1299
if (dec)
1300
ret = __sev_dbg_decrypt_user(kvm,
1301
__sme_page_pa(src_p[0]) + s_off,
1302
(void __user *)dst_vaddr,
1303
__sme_page_pa(dst_p[0]) + d_off,
1304
len, &argp->error);
1305
else
1306
ret = __sev_dbg_encrypt_user(kvm,
1307
__sme_page_pa(src_p[0]) + s_off,
1308
(void __user *)vaddr,
1309
__sme_page_pa(dst_p[0]) + d_off,
1310
(void __user *)dst_vaddr,
1311
len, &argp->error);
1312
1313
sev_unpin_memory(kvm, src_p, n);
1314
sev_unpin_memory(kvm, dst_p, n);
1315
1316
if (ret)
1317
goto err;
1318
1319
next_vaddr = vaddr + len;
1320
dst_vaddr = dst_vaddr + len;
1321
size -= len;
1322
}
1323
err:
1324
return ret;
1325
}
1326
1327
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
1328
{
1329
struct sev_data_launch_secret data;
1330
struct kvm_sev_launch_secret params;
1331
struct page **pages;
1332
void *blob, *hdr;
1333
unsigned long n, i;
1334
int ret, offset;
1335
1336
if (!sev_guest(kvm))
1337
return -ENOTTY;
1338
1339
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
1340
return -EFAULT;
1341
1342
pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
1343
if (IS_ERR(pages))
1344
return PTR_ERR(pages);
1345
1346
/*
1347
* Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
1348
* place; the cache may contain the data that was written unencrypted.
1349
*/
1350
sev_clflush_pages(pages, n);
1351
1352
/*
1353
* The secret must be copied into contiguous memory region, lets verify
1354
* that userspace memory pages are contiguous before we issue command.
1355
*/
1356
if (get_num_contig_pages(0, pages, n) != n) {
1357
ret = -EINVAL;
1358
goto e_unpin_memory;
1359
}
1360
1361
memset(&data, 0, sizeof(data));
1362
1363
offset = params.guest_uaddr & (PAGE_SIZE - 1);
1364
data.guest_address = __sme_page_pa(pages[0]) + offset;
1365
data.guest_len = params.guest_len;
1366
1367
blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
1368
if (IS_ERR(blob)) {
1369
ret = PTR_ERR(blob);
1370
goto e_unpin_memory;
1371
}
1372
1373
data.trans_address = __psp_pa(blob);
1374
data.trans_len = params.trans_len;
1375
1376
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
1377
if (IS_ERR(hdr)) {
1378
ret = PTR_ERR(hdr);
1379
goto e_free_blob;
1380
}
1381
data.hdr_address = __psp_pa(hdr);
1382
data.hdr_len = params.hdr_len;
1383
1384
data.handle = to_kvm_sev_info(kvm)->handle;
1385
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
1386
1387
kfree(hdr);
1388
1389
e_free_blob:
1390
kfree(blob);
1391
e_unpin_memory:
1392
/* content of memory is updated, mark pages dirty */
1393
for (i = 0; i < n; i++) {
1394
set_page_dirty_lock(pages[i]);
1395
mark_page_accessed(pages[i]);
1396
}
1397
sev_unpin_memory(kvm, pages, n);
1398
return ret;
1399
}
1400
1401
static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
1402
{
1403
void __user *report = u64_to_user_ptr(argp->data);
1404
struct sev_data_attestation_report data;
1405
struct kvm_sev_attestation_report params;
1406
void __user *p;
1407
void *blob = NULL;
1408
int ret;
1409
1410
if (!sev_guest(kvm))
1411
return -ENOTTY;
1412
1413
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
1414
return -EFAULT;
1415
1416
memset(&data, 0, sizeof(data));
1417
1418
/* User wants to query the blob length */
1419
if (!params.len)
1420
goto cmd;
1421
1422
p = u64_to_user_ptr(params.uaddr);
1423
if (p) {
1424
if (params.len > SEV_FW_BLOB_MAX_SIZE)
1425
return -EINVAL;
1426
1427
blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
1428
if (!blob)
1429
return -ENOMEM;
1430
1431
data.address = __psp_pa(blob);
1432
data.len = params.len;
1433
memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
1434
}
1435
cmd:
1436
data.handle = to_kvm_sev_info(kvm)->handle;
1437
ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
1438
/*
1439
* If we query the session length, FW responded with expected data.
1440
*/
1441
if (!params.len)
1442
goto done;
1443
1444
if (ret)
1445
goto e_free_blob;
1446
1447
if (blob) {
1448
if (copy_to_user(p, blob, params.len))
1449
ret = -EFAULT;
1450
}
1451
1452
done:
1453
params.len = data.len;
1454
if (copy_to_user(report, &params, sizeof(params)))
1455
ret = -EFAULT;
1456
e_free_blob:
1457
kfree(blob);
1458
return ret;
1459
}
1460
1461
/* Userspace wants to query session length. */
1462
static int
1463
__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
1464
struct kvm_sev_send_start *params)
1465
{
1466
struct sev_data_send_start data;
1467
int ret;
1468
1469
memset(&data, 0, sizeof(data));
1470
data.handle = to_kvm_sev_info(kvm)->handle;
1471
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
1472
1473
params->session_len = data.session_len;
1474
if (copy_to_user(u64_to_user_ptr(argp->data), params,
1475
sizeof(struct kvm_sev_send_start)))
1476
ret = -EFAULT;
1477
1478
return ret;
1479
}
1480
1481
static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
1482
{
1483
struct sev_data_send_start data;
1484
struct kvm_sev_send_start params;
1485
void *amd_certs, *session_data;
1486
void *pdh_cert, *plat_certs;
1487
int ret;
1488
1489
if (!sev_guest(kvm))
1490
return -ENOTTY;
1491
1492
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
1493
sizeof(struct kvm_sev_send_start)))
1494
return -EFAULT;
1495
1496
/* if session_len is zero, userspace wants to query the session length */
1497
if (!params.session_len)
1498
return __sev_send_start_query_session_length(kvm, argp,
1499
&params);
1500
1501
/* some sanity checks */
1502
if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
1503
!params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
1504
return -EINVAL;
1505
1506
/* allocate the memory to hold the session data blob */
1507
session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
1508
if (!session_data)
1509
return -ENOMEM;
1510
1511
/* copy the certificate blobs from userspace */
1512
pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
1513
params.pdh_cert_len);
1514
if (IS_ERR(pdh_cert)) {
1515
ret = PTR_ERR(pdh_cert);
1516
goto e_free_session;
1517
}
1518
1519
plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
1520
params.plat_certs_len);
1521
if (IS_ERR(plat_certs)) {
1522
ret = PTR_ERR(plat_certs);
1523
goto e_free_pdh;
1524
}
1525
1526
amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
1527
params.amd_certs_len);
1528
if (IS_ERR(amd_certs)) {
1529
ret = PTR_ERR(amd_certs);
1530
goto e_free_plat_cert;
1531
}
1532
1533
/* populate the FW SEND_START field with system physical address */
1534
memset(&data, 0, sizeof(data));
1535
data.pdh_cert_address = __psp_pa(pdh_cert);
1536
data.pdh_cert_len = params.pdh_cert_len;
1537
data.plat_certs_address = __psp_pa(plat_certs);
1538
data.plat_certs_len = params.plat_certs_len;
1539
data.amd_certs_address = __psp_pa(amd_certs);
1540
data.amd_certs_len = params.amd_certs_len;
1541
data.session_address = __psp_pa(session_data);
1542
data.session_len = params.session_len;
1543
data.handle = to_kvm_sev_info(kvm)->handle;
1544
1545
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
1546
1547
if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
1548
session_data, params.session_len)) {
1549
ret = -EFAULT;
1550
goto e_free_amd_cert;
1551
}
1552
1553
params.policy = data.policy;
1554
params.session_len = data.session_len;
1555
if (copy_to_user(u64_to_user_ptr(argp->data), &params,
1556
sizeof(struct kvm_sev_send_start)))
1557
ret = -EFAULT;
1558
1559
e_free_amd_cert:
1560
kfree(amd_certs);
1561
e_free_plat_cert:
1562
kfree(plat_certs);
1563
e_free_pdh:
1564
kfree(pdh_cert);
1565
e_free_session:
1566
kfree(session_data);
1567
return ret;
1568
}
1569
1570
/* Userspace wants to query either header or trans length. */
1571
static int
1572
__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
1573
struct kvm_sev_send_update_data *params)
1574
{
1575
struct sev_data_send_update_data data;
1576
int ret;
1577
1578
memset(&data, 0, sizeof(data));
1579
data.handle = to_kvm_sev_info(kvm)->handle;
1580
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
1581
1582
params->hdr_len = data.hdr_len;
1583
params->trans_len = data.trans_len;
1584
1585
if (copy_to_user(u64_to_user_ptr(argp->data), params,
1586
sizeof(struct kvm_sev_send_update_data)))
1587
ret = -EFAULT;
1588
1589
return ret;
1590
}
1591
1592
static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
1593
{
1594
struct sev_data_send_update_data data;
1595
struct kvm_sev_send_update_data params;
1596
void *hdr, *trans_data;
1597
struct page **guest_page;
1598
unsigned long n;
1599
int ret, offset;
1600
1601
if (!sev_guest(kvm))
1602
return -ENOTTY;
1603
1604
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
1605
sizeof(struct kvm_sev_send_update_data)))
1606
return -EFAULT;
1607
1608
/* userspace wants to query either header or trans length */
1609
if (!params.trans_len || !params.hdr_len)
1610
return __sev_send_update_data_query_lengths(kvm, argp, &params);
1611
1612
if (!params.trans_uaddr || !params.guest_uaddr ||
1613
!params.guest_len || !params.hdr_uaddr)
1614
return -EINVAL;
1615
1616
/* Check if we are crossing the page boundary */
1617
offset = params.guest_uaddr & (PAGE_SIZE - 1);
1618
if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
1619
return -EINVAL;
1620
1621
/* Pin guest memory */
1622
guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
1623
PAGE_SIZE, &n, 0);
1624
if (IS_ERR(guest_page))
1625
return PTR_ERR(guest_page);
1626
1627
/* allocate memory for header and transport buffer */
1628
ret = -ENOMEM;
1629
hdr = kzalloc(params.hdr_len, GFP_KERNEL);
1630
if (!hdr)
1631
goto e_unpin;
1632
1633
trans_data = kzalloc(params.trans_len, GFP_KERNEL);
1634
if (!trans_data)
1635
goto e_free_hdr;
1636
1637
memset(&data, 0, sizeof(data));
1638
data.hdr_address = __psp_pa(hdr);
1639
data.hdr_len = params.hdr_len;
1640
data.trans_address = __psp_pa(trans_data);
1641
data.trans_len = params.trans_len;
1642
1643
/* The SEND_UPDATE_DATA command requires C-bit to be always set. */
1644
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
1645
data.guest_address |= sev_me_mask;
1646
data.guest_len = params.guest_len;
1647
data.handle = to_kvm_sev_info(kvm)->handle;
1648
1649
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
1650
1651
if (ret)
1652
goto e_free_trans_data;
1653
1654
/* copy transport buffer to user space */
1655
if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
1656
trans_data, params.trans_len)) {
1657
ret = -EFAULT;
1658
goto e_free_trans_data;
1659
}
1660
1661
/* Copy packet header to userspace. */
1662
if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
1663
params.hdr_len))
1664
ret = -EFAULT;
1665
1666
e_free_trans_data:
1667
kfree(trans_data);
1668
e_free_hdr:
1669
kfree(hdr);
1670
e_unpin:
1671
sev_unpin_memory(kvm, guest_page, n);
1672
1673
return ret;
1674
}
1675
1676
static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
1677
{
1678
struct sev_data_send_finish data;
1679
1680
if (!sev_guest(kvm))
1681
return -ENOTTY;
1682
1683
data.handle = to_kvm_sev_info(kvm)->handle;
1684
return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
1685
}
1686
1687
static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
1688
{
1689
struct sev_data_send_cancel data;
1690
1691
if (!sev_guest(kvm))
1692
return -ENOTTY;
1693
1694
data.handle = to_kvm_sev_info(kvm)->handle;
1695
return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
1696
}
1697
1698
static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
1699
{
1700
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
1701
struct sev_data_receive_start start;
1702
struct kvm_sev_receive_start params;
1703
int *error = &argp->error;
1704
void *session_data;
1705
void *pdh_data;
1706
int ret;
1707
1708
if (!sev_guest(kvm))
1709
return -ENOTTY;
1710
1711
/* Get parameter from the userspace */
1712
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
1713
sizeof(struct kvm_sev_receive_start)))
1714
return -EFAULT;
1715
1716
/* some sanity checks */
1717
if (!params.pdh_uaddr || !params.pdh_len ||
1718
!params.session_uaddr || !params.session_len)
1719
return -EINVAL;
1720
1721
pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
1722
if (IS_ERR(pdh_data))
1723
return PTR_ERR(pdh_data);
1724
1725
session_data = psp_copy_user_blob(params.session_uaddr,
1726
params.session_len);
1727
if (IS_ERR(session_data)) {
1728
ret = PTR_ERR(session_data);
1729
goto e_free_pdh;
1730
}
1731
1732
memset(&start, 0, sizeof(start));
1733
start.handle = params.handle;
1734
start.policy = params.policy;
1735
start.pdh_cert_address = __psp_pa(pdh_data);
1736
start.pdh_cert_len = params.pdh_len;
1737
start.session_address = __psp_pa(session_data);
1738
start.session_len = params.session_len;
1739
1740
/* create memory encryption context */
1741
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
1742
error);
1743
if (ret)
1744
goto e_free_session;
1745
1746
/* Bind ASID to this guest */
1747
ret = sev_bind_asid(kvm, start.handle, error);
1748
if (ret) {
1749
sev_decommission(start.handle);
1750
goto e_free_session;
1751
}
1752
1753
params.handle = start.handle;
1754
if (copy_to_user(u64_to_user_ptr(argp->data),
1755
&params, sizeof(struct kvm_sev_receive_start))) {
1756
ret = -EFAULT;
1757
sev_unbind_asid(kvm, start.handle);
1758
goto e_free_session;
1759
}
1760
1761
sev->handle = start.handle;
1762
sev->fd = argp->sev_fd;
1763
1764
e_free_session:
1765
kfree(session_data);
1766
e_free_pdh:
1767
kfree(pdh_data);
1768
1769
return ret;
1770
}
1771
1772
static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
1773
{
1774
struct kvm_sev_receive_update_data params;
1775
struct sev_data_receive_update_data data;
1776
void *hdr = NULL, *trans = NULL;
1777
struct page **guest_page;
1778
unsigned long n;
1779
int ret, offset;
1780
1781
if (!sev_guest(kvm))
1782
return -EINVAL;
1783
1784
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
1785
sizeof(struct kvm_sev_receive_update_data)))
1786
return -EFAULT;
1787
1788
if (!params.hdr_uaddr || !params.hdr_len ||
1789
!params.guest_uaddr || !params.guest_len ||
1790
!params.trans_uaddr || !params.trans_len)
1791
return -EINVAL;
1792
1793
/* Check if we are crossing the page boundary */
1794
offset = params.guest_uaddr & (PAGE_SIZE - 1);
1795
if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
1796
return -EINVAL;
1797
1798
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
1799
if (IS_ERR(hdr))
1800
return PTR_ERR(hdr);
1801
1802
trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
1803
if (IS_ERR(trans)) {
1804
ret = PTR_ERR(trans);
1805
goto e_free_hdr;
1806
}
1807
1808
memset(&data, 0, sizeof(data));
1809
data.hdr_address = __psp_pa(hdr);
1810
data.hdr_len = params.hdr_len;
1811
data.trans_address = __psp_pa(trans);
1812
data.trans_len = params.trans_len;
1813
1814
/* Pin guest memory */
1815
guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
1816
PAGE_SIZE, &n, FOLL_WRITE);
1817
if (IS_ERR(guest_page)) {
1818
ret = PTR_ERR(guest_page);
1819
goto e_free_trans;
1820
}
1821
1822
/*
1823
* Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
1824
* encrypts the written data with the guest's key, and the cache may
1825
* contain dirty, unencrypted data.
1826
*/
1827
sev_clflush_pages(guest_page, n);
1828
1829
/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
1830
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
1831
data.guest_address |= sev_me_mask;
1832
data.guest_len = params.guest_len;
1833
data.handle = to_kvm_sev_info(kvm)->handle;
1834
1835
ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
1836
&argp->error);
1837
1838
sev_unpin_memory(kvm, guest_page, n);
1839
1840
e_free_trans:
1841
kfree(trans);
1842
e_free_hdr:
1843
kfree(hdr);
1844
1845
return ret;
1846
}
1847
1848
static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
1849
{
1850
struct sev_data_receive_finish data;
1851
1852
if (!sev_guest(kvm))
1853
return -ENOTTY;
1854
1855
data.handle = to_kvm_sev_info(kvm)->handle;
1856
return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
1857
}
1858
1859
static bool is_cmd_allowed_from_mirror(u32 cmd_id)
1860
{
1861
/*
1862
* Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
1863
* active mirror VMs. Also allow the debugging and status commands.
1864
*/
1865
if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
1866
cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
1867
cmd_id == KVM_SEV_DBG_ENCRYPT)
1868
return true;
1869
1870
return false;
1871
}
1872
1873
static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
1874
{
1875
struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
1876
struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
1877
int r = -EBUSY;
1878
1879
if (dst_kvm == src_kvm)
1880
return -EINVAL;
1881
1882
/*
1883
* Bail if these VMs are already involved in a migration to avoid
1884
* deadlock between two VMs trying to migrate to/from each other.
1885
*/
1886
if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
1887
return -EBUSY;
1888
1889
if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
1890
goto release_dst;
1891
1892
r = -EINTR;
1893
if (mutex_lock_killable(&dst_kvm->lock))
1894
goto release_src;
1895
if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
1896
goto unlock_dst;
1897
return 0;
1898
1899
unlock_dst:
1900
mutex_unlock(&dst_kvm->lock);
1901
release_src:
1902
atomic_set_release(&src_sev->migration_in_progress, 0);
1903
release_dst:
1904
atomic_set_release(&dst_sev->migration_in_progress, 0);
1905
return r;
1906
}
1907
1908
static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
1909
{
1910
struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
1911
struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
1912
1913
mutex_unlock(&dst_kvm->lock);
1914
mutex_unlock(&src_kvm->lock);
1915
atomic_set_release(&dst_sev->migration_in_progress, 0);
1916
atomic_set_release(&src_sev->migration_in_progress, 0);
1917
}
1918
1919
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
1920
{
1921
struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
1922
struct kvm_sev_info *src = to_kvm_sev_info(src_kvm);
1923
struct kvm_vcpu *dst_vcpu, *src_vcpu;
1924
struct vcpu_svm *dst_svm, *src_svm;
1925
struct kvm_sev_info *mirror;
1926
unsigned long i;
1927
1928
dst->active = true;
1929
dst->asid = src->asid;
1930
dst->handle = src->handle;
1931
dst->pages_locked = src->pages_locked;
1932
dst->enc_context_owner = src->enc_context_owner;
1933
dst->es_active = src->es_active;
1934
dst->vmsa_features = src->vmsa_features;
1935
1936
src->asid = 0;
1937
src->active = false;
1938
src->handle = 0;
1939
src->pages_locked = 0;
1940
src->enc_context_owner = NULL;
1941
src->es_active = false;
1942
1943
list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
1944
1945
/*
1946
* If this VM has mirrors, "transfer" each mirror's refcount of the
1947
* source to the destination (this KVM). The caller holds a reference
1948
* to the source, so there's no danger of use-after-free.
1949
*/
1950
list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
1951
list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
1952
kvm_get_kvm(dst_kvm);
1953
kvm_put_kvm(src_kvm);
1954
mirror->enc_context_owner = dst_kvm;
1955
}
1956
1957
/*
1958
* If this VM is a mirror, remove the old mirror from the owners list
1959
* and add the new mirror to the list.
1960
*/
1961
if (is_mirroring_enc_context(dst_kvm)) {
1962
struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner);
1963
1964
list_del(&src->mirror_entry);
1965
list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
1966
}
1967
1968
kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) {
1969
dst_svm = to_svm(dst_vcpu);
1970
1971
sev_init_vmcb(dst_svm);
1972
1973
if (!dst->es_active)
1974
continue;
1975
1976
/*
1977
* Note, the source is not required to have the same number of
1978
* vCPUs as the destination when migrating a vanilla SEV VM.
1979
*/
1980
src_vcpu = kvm_get_vcpu(src_kvm, i);
1981
src_svm = to_svm(src_vcpu);
1982
1983
/*
1984
* Transfer VMSA and GHCB state to the destination. Nullify and
1985
* clear source fields as appropriate, as the state now belongs to
1986
* the destination.
1987
*/
1988
memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
1989
dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
1990
dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
1991
dst_vcpu->arch.guest_state_protected = true;
1992
1993
memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
1994
src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
1995
src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
1996
src_vcpu->arch.guest_state_protected = false;
1997
}
1998
}
1999
2000
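/*
* Intra-host migration sanity check: all vCPUs on both VMs must be fully
* created, and for SEV-ES the source must have the same number of vCPUs as
* the destination, with every source vCPU's guest state already protected.
*/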
static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
2001
{
2002
struct kvm_vcpu *src_vcpu;
2003
unsigned long i;
2004
2005
if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
2006
dst->created_vcpus != atomic_read(&dst->online_vcpus))
2007
return -EBUSY;
2008
2009
if (!sev_es_guest(src))
2010
return 0;
2011
2012
if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
2013
return -EINVAL;
2014
2015
kvm_for_each_vcpu(i, src_vcpu, src) {
2016
if (!src_vcpu->arch.guest_state_protected)
2017
return -EINVAL;
2018
}
2019
2020
return 0;
2021
}
2022
2023
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
2024
{
2025
struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm);
2026
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
2027
CLASS(fd, f)(source_fd);
2028
struct kvm *source_kvm;
2029
bool charged = false;
2030
int ret;
2031
2032
if (fd_empty(f))
2033
return -EBADF;
2034
2035
if (!file_is_kvm(fd_file(f)))
2036
return -EBADF;
2037
2038
source_kvm = fd_file(f)->private_data;
2039
ret = sev_lock_two_vms(kvm, source_kvm);
2040
if (ret)
2041
return ret;
2042
2043
if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
2044
sev_guest(kvm) || !sev_guest(source_kvm)) {
2045
ret = -EINVAL;
2046
goto out_unlock;
2047
}
2048
2049
src_sev = to_kvm_sev_info(source_kvm);
2050
2051
dst_sev->misc_cg = get_current_misc_cg();
2052
cg_cleanup_sev = dst_sev;
2053
if (dst_sev->misc_cg != src_sev->misc_cg) {
2054
ret = sev_misc_cg_try_charge(dst_sev);
2055
if (ret)
2056
goto out_dst_cgroup;
2057
charged = true;
2058
}
2059
2060
ret = kvm_lock_all_vcpus(kvm);
2061
if (ret)
2062
goto out_dst_cgroup;
2063
ret = kvm_lock_all_vcpus(source_kvm);
2064
if (ret)
2065
goto out_dst_vcpu;
2066
2067
ret = sev_check_source_vcpus(kvm, source_kvm);
2068
if (ret)
2069
goto out_source_vcpu;
2070
2071
/*
2072
* Allocate a new have_run_cpus for the destination, i.e. don't copy
2073
* the set of CPUs from the source. If a CPU was used to run a vCPU in
2074
* the source VM but is never used for the destination VM, then the CPU
2075
* can only have cached memory that was accessible to the source VM.
2076
*/
2077
if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
2078
ret = -ENOMEM;
2079
goto out_source_vcpu;
2080
}
2081
2082
sev_migrate_from(kvm, source_kvm);
2083
kvm_vm_dead(source_kvm);
2084
cg_cleanup_sev = src_sev;
2085
ret = 0;
2086
2087
out_source_vcpu:
2088
kvm_unlock_all_vcpus(source_kvm);
2089
out_dst_vcpu:
2090
kvm_unlock_all_vcpus(kvm);
2091
out_dst_cgroup:
2092
/* Operates on the source on success, on the destination on failure. */
2093
if (charged)
2094
sev_misc_cg_uncharge(cg_cleanup_sev);
2095
put_misc_cg(cg_cleanup_sev->misc_cg);
2096
cg_cleanup_sev->misc_cg = NULL;
2097
out_unlock:
2098
sev_unlock_two_vms(kvm, source_kvm);
2099
return ret;
2100
}
2101
2102
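/*
* Report SEV device attributes; only the KVM_X86_GRP_SEV group is handled,
* and currently only the set of supported VMSA features is exposed.
*/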
int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
2103
{
2104
if (group != KVM_X86_GRP_SEV)
2105
return -ENXIO;
2106
2107
switch (attr) {
2108
case KVM_X86_SEV_VMSA_FEATURES:
2109
*val = sev_supported_vmsa_features;
2110
return 0;
2111
2112
default:
2113
return -ENXIO;
2114
}
2115
}
2116
2117
/*
2118
* The guest context contains all the information, keys and metadata
2119
* associated with the guest that the firmware tracks to implement SEV
2120
* and SNP features. The firmware stores the guest context in a
* hypervisor-provided page via the SNP_GCTX_CREATE command.
2122
*/
2123
static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
2124
{
2125
struct sev_data_snp_addr data = {};
2126
void *context;
2127
int rc;
2128
2129
/* Allocate memory for context page */
2130
context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
2131
if (!context)
2132
return NULL;
2133
2134
data.address = __psp_pa(context);
2135
rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
2136
if (rc) {
2137
pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
2138
rc, argp->error);
2139
snp_free_firmware_page(context);
2140
return NULL;
2141
}
2142
2143
return context;
2144
}
2145
2146
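/* Bind (activate) the guest's SEV ASID to its SNP guest context. */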
static int snp_bind_asid(struct kvm *kvm, int *error)
2147
{
2148
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2149
struct sev_data_snp_activate data = {0};
2150
2151
data.gctx_paddr = __psp_pa(sev->snp_context);
2152
data.asid = sev_get_asid(kvm);
2153
return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
2154
}
2155
2156
static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
2157
{
2158
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2159
struct sev_data_snp_launch_start start = {0};
2160
struct kvm_sev_snp_launch_start params;
2161
int rc;
2162
2163
if (!sev_snp_guest(kvm))
2164
return -ENOTTY;
2165
2166
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2167
return -EFAULT;
2168
2169
/* Don't allow userspace to allocate memory for more than 1 SNP context. */
2170
if (sev->snp_context)
2171
return -EINVAL;
2172
2173
if (params.flags)
2174
return -EINVAL;
2175
2176
if (params.policy & ~SNP_POLICY_MASK_VALID)
2177
return -EINVAL;
2178
2179
/* Check for policy bits that must be set */
2180
if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
2181
return -EINVAL;
2182
2183
sev->policy = params.policy;
2184
2185
sev->snp_context = snp_context_create(kvm, argp);
2186
if (!sev->snp_context)
2187
return -ENOTTY;
2188
2189
start.gctx_paddr = __psp_pa(sev->snp_context);
2190
start.policy = params.policy;
2191
memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
2192
rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
2193
if (rc) {
2194
pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
2195
__func__, rc);
2196
goto e_free_context;
2197
}
2198
2199
sev->fd = argp->sev_fd;
2200
rc = snp_bind_asid(kvm, &argp->error);
2201
if (rc) {
2202
pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
2203
__func__, rc);
2204
goto e_free_context;
2205
}
2206
2207
return 0;
2208
2209
e_free_context:
2210
snp_decommission_context(kvm);
2211
2212
return rc;
2213
}
2214
2215
struct sev_gmem_populate_args {
2216
__u8 type;
2217
int sev_fd;
2218
int fw_error;
2219
};
2220
2221
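/*
* Callback for kvm_gmem_populate(): copy the initial payload (if any) into
* each guest_memfd page, transition the page to a private (guest-owned) RMP
* entry, and issue SNP_LAUNCH_UPDATE to add it to the initial guest image.
* On failure, pages that were already made private are restored to shared.
*/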
static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
2222
void __user *src, int order, void *opaque)
2223
{
2224
struct sev_gmem_populate_args *sev_populate_args = opaque;
2225
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2226
int n_private = 0, ret, i;
2227
int npages = (1 << order);
2228
gfn_t gfn;
2229
2230
if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
2231
return -EINVAL;
2232
2233
for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
2234
struct sev_data_snp_launch_update fw_args = {0};
2235
bool assigned = false;
2236
int level;
2237
2238
ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
2239
if (ret || assigned) {
2240
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
2241
__func__, gfn, ret, assigned);
2242
ret = ret ? -EINVAL : -EEXIST;
2243
goto err;
2244
}
2245
2246
if (src) {
2247
void *vaddr = kmap_local_pfn(pfn + i);
2248
2249
if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
2250
ret = -EFAULT;
2251
goto err;
2252
}
2253
kunmap_local(vaddr);
2254
}
2255
2256
ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
2257
sev_get_asid(kvm), true);
2258
if (ret)
2259
goto err;
2260
2261
n_private++;
2262
2263
fw_args.gctx_paddr = __psp_pa(sev->snp_context);
2264
fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
2265
fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
2266
fw_args.page_type = sev_populate_args->type;
2267
2268
ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
2269
&fw_args, &sev_populate_args->fw_error);
2270
if (ret)
2271
goto fw_err;
2272
}
2273
2274
return 0;
2275
2276
fw_err:
2277
/*
2278
* If the firmware command failed, handle the reclaim and cleanup of that
* PFN specially vs. prior pages, which can be cleaned up below without
2280
* needing to reclaim in advance.
2281
*
2282
* Additionally, when invalid CPUID function entries are detected,
2283
* firmware writes the expected values into the page and leaves it
2284
* unencrypted so it can be used for debugging and error-reporting.
2285
*
2286
* Copy this page back into the source buffer so userspace can use it to
* determine which CPUID leaves/fields failed CPUID validation.
2289
*/
2290
if (!snp_page_reclaim(kvm, pfn + i) &&
2291
sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
2292
sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
2293
void *vaddr = kmap_local_pfn(pfn + i);
2294
2295
if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
2296
pr_debug("Failed to write CPUID page back to userspace\n");
2297
2298
kunmap_local(vaddr);
2299
}
2300
2301
/* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
2302
n_private--;
2303
2304
err:
2305
pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
2306
__func__, ret, sev_populate_args->fw_error, n_private);
2307
for (i = 0; i < n_private; i++)
2308
kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
2309
2310
return ret;
2311
}
2312
2313
static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
2314
{
2315
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2316
struct sev_gmem_populate_args sev_populate_args = {0};
2317
struct kvm_sev_snp_launch_update params;
2318
struct kvm_memory_slot *memslot;
2319
long npages, count;
2320
void __user *src;
2321
int ret = 0;
2322
2323
if (!sev_snp_guest(kvm) || !sev->snp_context)
2324
return -EINVAL;
2325
2326
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2327
return -EFAULT;
2328
2329
pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
2330
params.gfn_start, params.len, params.type, params.flags);
2331
2332
if (!PAGE_ALIGNED(params.len) || params.flags ||
2333
(params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
2334
params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
2335
params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
2336
params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
2337
params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
2338
return -EINVAL;
2339
2340
npages = params.len / PAGE_SIZE;
2341
2342
/*
2343
* For each GFN that's being prepared as part of the initial guest
2344
* state, the following pre-conditions are verified:
2345
*
2346
* 1) The backing memslot is a valid private memslot.
2347
* 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
2348
* beforehand.
2349
* 3) The PFN of the guest_memfd has not already been set to private
2350
* in the RMP table.
2351
*
2352
* The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
2353
* faults if there's a race between a fault and an attribute update via
2354
* KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
2355
* here. However, kvm->slots_lock guards against both this as well as
2356
* concurrent memslot updates occurring while these checks are being
2357
* performed, so use that here to make it easier to reason about the
2358
* initial expected state and better guard against unexpected
2359
* situations.
2360
*/
2361
mutex_lock(&kvm->slots_lock);
2362
2363
memslot = gfn_to_memslot(kvm, params.gfn_start);
2364
if (!kvm_slot_can_be_private(memslot)) {
2365
ret = -EINVAL;
2366
goto out;
2367
}
2368
2369
sev_populate_args.sev_fd = argp->sev_fd;
2370
sev_populate_args.type = params.type;
2371
src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
2372
2373
count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
2374
sev_gmem_post_populate, &sev_populate_args);
2375
if (count < 0) {
2376
argp->error = sev_populate_args.fw_error;
2377
pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
2378
__func__, count, argp->error);
2379
ret = -EIO;
2380
} else {
2381
params.gfn_start += count;
2382
params.len -= count * PAGE_SIZE;
2383
if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
2384
params.uaddr += count * PAGE_SIZE;
2385
2386
ret = 0;
2387
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
2388
ret = -EFAULT;
2389
}
2390
2391
out:
2392
mutex_unlock(&kvm->slots_lock);
2393
2394
return ret;
2395
}
2396
2397
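/*
* For each vCPU, sync the current register state into its VMSA, transition
* the VMSA page to a firmware page in the RMP table, and issue
* SNP_LAUNCH_UPDATE with the VMSA page type to encrypt and measure it.
*/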
static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
2398
{
2399
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2400
struct sev_data_snp_launch_update data = {};
2401
struct kvm_vcpu *vcpu;
2402
unsigned long i;
2403
int ret;
2404
2405
data.gctx_paddr = __psp_pa(sev->snp_context);
2406
data.page_type = SNP_PAGE_TYPE_VMSA;
2407
2408
kvm_for_each_vcpu(i, vcpu, kvm) {
2409
struct vcpu_svm *svm = to_svm(vcpu);
2410
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
2411
2412
ret = sev_es_sync_vmsa(svm);
2413
if (ret)
2414
return ret;
2415
2416
/* Transition the VMSA page to a firmware state. */
2417
ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
2418
if (ret)
2419
return ret;
2420
2421
/* Issue the SNP command to encrypt the VMSA */
2422
data.address = __sme_pa(svm->sev_es.vmsa);
2423
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
2424
&data, &argp->error);
2425
if (ret) {
2426
snp_page_reclaim(kvm, pfn);
2427
2428
return ret;
2429
}
2430
2431
svm->vcpu.arch.guest_state_protected = true;
2432
/*
2433
* SEV-ES (and thus SNP) guest mandates LBR Virtualization to
2434
* be _always_ ON. Enable it only after setting
2435
* guest_state_protected because KVM_SET_MSRS allows dynamic
2436
* toggling of LBRV (for performance reasons) on write access to
2437
* MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
2438
*/
2439
svm_enable_lbrv(vcpu);
2440
}
2441
2442
return 0;
2443
}
2444
2445
static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
2446
{
2447
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2448
struct kvm_sev_snp_launch_finish params;
2449
struct sev_data_snp_launch_finish *data;
2450
void *id_block = NULL, *id_auth = NULL;
2451
int ret;
2452
2453
if (!sev_snp_guest(kvm))
2454
return -ENOTTY;
2455
2456
if (!sev->snp_context)
2457
return -EINVAL;
2458
2459
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2460
return -EFAULT;
2461
2462
if (params.flags)
2463
return -EINVAL;
2464
2465
/* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
2466
ret = snp_launch_update_vmsa(kvm, argp);
2467
if (ret)
2468
return ret;
2469
2470
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
2471
if (!data)
2472
return -ENOMEM;
2473
2474
if (params.id_block_en) {
2475
id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
2476
if (IS_ERR(id_block)) {
2477
ret = PTR_ERR(id_block);
2478
goto e_free;
2479
}
2480
2481
data->id_block_en = 1;
2482
data->id_block_paddr = __sme_pa(id_block);
2483
2484
id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
2485
if (IS_ERR(id_auth)) {
2486
ret = PTR_ERR(id_auth);
2487
goto e_free_id_block;
2488
}
2489
2490
data->id_auth_paddr = __sme_pa(id_auth);
2491
2492
if (params.auth_key_en)
2493
data->auth_key_en = 1;
2494
}
2495
2496
data->vcek_disabled = params.vcek_disabled;
2497
2498
memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
2499
data->gctx_paddr = __psp_pa(sev->snp_context);
2500
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
2501
2502
/*
2503
* Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
2504
* can be given to the guest simply by marking the RMP entry as private.
2505
* This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
2506
*/
2507
if (!ret)
2508
kvm->arch.pre_fault_allowed = true;
2509
2510
kfree(id_auth);
2511
2512
e_free_id_block:
2513
kfree(id_block);
2514
2515
e_free:
2516
kfree(data);
2517
2518
return ret;
2519
}
2520
2521
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
2522
{
2523
struct kvm_sev_cmd sev_cmd;
2524
int r;
2525
2526
if (!sev_enabled)
2527
return -ENOTTY;
2528
2529
if (!argp)
2530
return 0;
2531
2532
if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
2533
return -EFAULT;
2534
2535
mutex_lock(&kvm->lock);
2536
2537
/* Only the enc_context_owner handles some memory enc operations. */
2538
if (is_mirroring_enc_context(kvm) &&
2539
!is_cmd_allowed_from_mirror(sev_cmd.id)) {
2540
r = -EINVAL;
2541
goto out;
2542
}
2543
2544
/*
2545
* Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
2546
* allow the use of SNP-specific commands.
2547
*/
2548
if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
2549
r = -EPERM;
2550
goto out;
2551
}
2552
2553
switch (sev_cmd.id) {
2554
case KVM_SEV_ES_INIT:
2555
if (!sev_es_enabled) {
2556
r = -ENOTTY;
2557
goto out;
2558
}
2559
fallthrough;
2560
case KVM_SEV_INIT:
2561
r = sev_guest_init(kvm, &sev_cmd);
2562
break;
2563
case KVM_SEV_INIT2:
2564
r = sev_guest_init2(kvm, &sev_cmd);
2565
break;
2566
case KVM_SEV_LAUNCH_START:
2567
r = sev_launch_start(kvm, &sev_cmd);
2568
break;
2569
case KVM_SEV_LAUNCH_UPDATE_DATA:
2570
r = sev_launch_update_data(kvm, &sev_cmd);
2571
break;
2572
case KVM_SEV_LAUNCH_UPDATE_VMSA:
2573
r = sev_launch_update_vmsa(kvm, &sev_cmd);
2574
break;
2575
case KVM_SEV_LAUNCH_MEASURE:
2576
r = sev_launch_measure(kvm, &sev_cmd);
2577
break;
2578
case KVM_SEV_LAUNCH_FINISH:
2579
r = sev_launch_finish(kvm, &sev_cmd);
2580
break;
2581
case KVM_SEV_GUEST_STATUS:
2582
r = sev_guest_status(kvm, &sev_cmd);
2583
break;
2584
case KVM_SEV_DBG_DECRYPT:
2585
r = sev_dbg_crypt(kvm, &sev_cmd, true);
2586
break;
2587
case KVM_SEV_DBG_ENCRYPT:
2588
r = sev_dbg_crypt(kvm, &sev_cmd, false);
2589
break;
2590
case KVM_SEV_LAUNCH_SECRET:
2591
r = sev_launch_secret(kvm, &sev_cmd);
2592
break;
2593
case KVM_SEV_GET_ATTESTATION_REPORT:
2594
r = sev_get_attestation_report(kvm, &sev_cmd);
2595
break;
2596
case KVM_SEV_SEND_START:
2597
r = sev_send_start(kvm, &sev_cmd);
2598
break;
2599
case KVM_SEV_SEND_UPDATE_DATA:
2600
r = sev_send_update_data(kvm, &sev_cmd);
2601
break;
2602
case KVM_SEV_SEND_FINISH:
2603
r = sev_send_finish(kvm, &sev_cmd);
2604
break;
2605
case KVM_SEV_SEND_CANCEL:
2606
r = sev_send_cancel(kvm, &sev_cmd);
2607
break;
2608
case KVM_SEV_RECEIVE_START:
2609
r = sev_receive_start(kvm, &sev_cmd);
2610
break;
2611
case KVM_SEV_RECEIVE_UPDATE_DATA:
2612
r = sev_receive_update_data(kvm, &sev_cmd);
2613
break;
2614
case KVM_SEV_RECEIVE_FINISH:
2615
r = sev_receive_finish(kvm, &sev_cmd);
2616
break;
2617
case KVM_SEV_SNP_LAUNCH_START:
2618
r = snp_launch_start(kvm, &sev_cmd);
2619
break;
2620
case KVM_SEV_SNP_LAUNCH_UPDATE:
2621
r = snp_launch_update(kvm, &sev_cmd);
2622
break;
2623
case KVM_SEV_SNP_LAUNCH_FINISH:
2624
r = snp_launch_finish(kvm, &sev_cmd);
2625
break;
2626
default:
2627
r = -EINVAL;
2628
goto out;
2629
}
2630
2631
if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
2632
r = -EFAULT;
2633
2634
out:
2635
mutex_unlock(&kvm->lock);
2636
return r;
2637
}
2638
2639
int sev_mem_enc_register_region(struct kvm *kvm,
2640
struct kvm_enc_region *range)
2641
{
2642
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2643
struct enc_region *region;
2644
int ret = 0;
2645
2646
if (!sev_guest(kvm))
2647
return -ENOTTY;
2648
2649
/* If kvm is mirroring encryption context it isn't responsible for it */
2650
if (is_mirroring_enc_context(kvm))
2651
return -EINVAL;
2652
2653
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
2654
return -EINVAL;
2655
2656
region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
2657
if (!region)
2658
return -ENOMEM;
2659
2660
mutex_lock(&kvm->lock);
2661
region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
2662
FOLL_WRITE | FOLL_LONGTERM);
2663
if (IS_ERR(region->pages)) {
2664
ret = PTR_ERR(region->pages);
2665
mutex_unlock(&kvm->lock);
2666
goto e_free;
2667
}
2668
2669
/*
2670
* The guest may change the memory encryption attribute from C=0 -> C=1
2671
* or vice versa for this memory range. Let's make sure caches are
* flushed to ensure that guest data gets written into memory with the
* correct C-bit. Note, this must be done before dropping kvm->lock,
2674
* as region and its array of pages can be freed by a different task
2675
* once kvm->lock is released.
2676
*/
2677
sev_clflush_pages(region->pages, region->npages);
2678
2679
region->uaddr = range->addr;
2680
region->size = range->size;
2681
2682
list_add_tail(&region->list, &sev->regions_list);
2683
mutex_unlock(&kvm->lock);
2684
2685
return ret;
2686
2687
e_free:
2688
kfree(region);
2689
return ret;
2690
}
2691
2692
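/* Find the registered region that exactly matches the given address range. */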
static struct enc_region *
2693
find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
2694
{
2695
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2696
struct list_head *head = &sev->regions_list;
2697
struct enc_region *i;
2698
2699
list_for_each_entry(i, head, list) {
2700
if (i->uaddr == range->addr &&
2701
i->size == range->size)
2702
return i;
2703
}
2704
2705
return NULL;
2706
}
2707
2708
static void __unregister_enc_region_locked(struct kvm *kvm,
2709
struct enc_region *region)
2710
{
2711
sev_unpin_memory(kvm, region->pages, region->npages);
2712
list_del(&region->list);
2713
kfree(region);
2714
}
2715
2716
int sev_mem_enc_unregister_region(struct kvm *kvm,
2717
struct kvm_enc_region *range)
2718
{
2719
struct enc_region *region;
2720
int ret;
2721
2722
/* If kvm is mirroring encryption context it isn't responsible for it */
2723
if (is_mirroring_enc_context(kvm))
2724
return -EINVAL;
2725
2726
mutex_lock(&kvm->lock);
2727
2728
if (!sev_guest(kvm)) {
2729
ret = -ENOTTY;
2730
goto failed;
2731
}
2732
2733
region = find_enc_region(kvm, range);
2734
if (!region) {
2735
ret = -EINVAL;
2736
goto failed;
2737
}
2738
2739
sev_writeback_caches(kvm);
2740
2741
__unregister_enc_region_locked(kvm, region);
2742
2743
mutex_unlock(&kvm->lock);
2744
return 0;
2745
2746
failed:
2747
mutex_unlock(&kvm->lock);
2748
return ret;
2749
}
2750
2751
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
2752
{
2753
CLASS(fd, f)(source_fd);
2754
struct kvm *source_kvm;
2755
struct kvm_sev_info *source_sev, *mirror_sev;
2756
int ret;
2757
2758
if (fd_empty(f))
2759
return -EBADF;
2760
2761
if (!file_is_kvm(fd_file(f)))
2762
return -EBADF;
2763
2764
source_kvm = fd_file(f)->private_data;
2765
ret = sev_lock_two_vms(kvm, source_kvm);
2766
if (ret)
2767
return ret;
2768
2769
/*
2770
* Mirrors of mirrors should work, but let's not get silly. Also
2771
* disallow out-of-band SEV/SEV-ES init if the target is already an
2772
* SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
2773
* created after SEV/SEV-ES initialization, e.g. to init intercepts.
2774
*/
2775
if (sev_guest(kvm) || !sev_guest(source_kvm) ||
2776
is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
2777
ret = -EINVAL;
2778
goto e_unlock;
2779
}
2780
2781
mirror_sev = to_kvm_sev_info(kvm);
2782
if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
2783
ret = -ENOMEM;
2784
goto e_unlock;
2785
}
2786
2787
/*
2788
* The mirror kvm holds an enc_context_owner ref so its asid can't
2789
* disappear until we're done with it.
2790
*/
2791
source_sev = to_kvm_sev_info(source_kvm);
2792
kvm_get_kvm(source_kvm);
2793
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
2794
2795
/* Set enc_context_owner and copy its encryption context over */
2796
mirror_sev->enc_context_owner = source_kvm;
2797
mirror_sev->active = true;
2798
mirror_sev->asid = source_sev->asid;
2799
mirror_sev->fd = source_sev->fd;
2800
mirror_sev->es_active = source_sev->es_active;
2801
mirror_sev->need_init = false;
2802
mirror_sev->handle = source_sev->handle;
2803
INIT_LIST_HEAD(&mirror_sev->regions_list);
2804
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
2805
ret = 0;
2806
2807
/*
2808
* Do not copy ap_jump_table: the mirror does not share the same KVM
* contexts as the original, and the two may have different memory views.
2811
*/
2812
2813
e_unlock:
2814
sev_unlock_two_vms(kvm, source_kvm);
2815
return ret;
2816
}
2817
2818
static int snp_decommission_context(struct kvm *kvm)
2819
{
2820
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2821
struct sev_data_snp_addr data = {};
2822
int ret;
2823
2824
/* If context is not created then do nothing */
2825
if (!sev->snp_context)
2826
return 0;
2827
2828
/* Do the decommission, which will unbind the ASID from the SNP context */
2829
data.address = __sme_pa(sev->snp_context);
2830
down_write(&sev_deactivate_lock);
2831
ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
2832
up_write(&sev_deactivate_lock);
2833
2834
if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
2835
return ret;
2836
2837
snp_free_firmware_page(sev->snp_context);
2838
sev->snp_context = NULL;
2839
2840
return 0;
2841
}
2842
2843
void sev_vm_destroy(struct kvm *kvm)
2844
{
2845
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2846
struct list_head *head = &sev->regions_list;
2847
struct list_head *pos, *q;
2848
2849
if (!sev_guest(kvm))
2850
return;
2851
2852
WARN_ON(!list_empty(&sev->mirror_vms));
2853
2854
free_cpumask_var(sev->have_run_cpus);
2855
2856
/*
2857
* If this is a mirror VM, remove it from the owner's list of mirrors
2858
* and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
2859
* Note, mirror VMs don't support registering encrypted regions.
2860
*/
2861
if (is_mirroring_enc_context(kvm)) {
2862
struct kvm *owner_kvm = sev->enc_context_owner;
2863
2864
mutex_lock(&owner_kvm->lock);
2865
list_del(&sev->mirror_entry);
2866
mutex_unlock(&owner_kvm->lock);
2867
kvm_put_kvm(owner_kvm);
2868
return;
2869
}
2870
2871
2872
/*
2873
* If userspace was terminated before unregistering the memory regions,
* unpin all of the registered memory.
2875
*/
2876
if (!list_empty(head)) {
2877
list_for_each_safe(pos, q, head) {
2878
__unregister_enc_region_locked(kvm,
2879
list_entry(pos, struct enc_region, list));
2880
cond_resched();
2881
}
2882
}
2883
2884
if (sev_snp_guest(kvm)) {
2885
snp_guest_req_cleanup(kvm);
2886
2887
/*
2888
* Decommission handles unbinding of the ASID. If it fails for
2889
* some unexpected reason, just leak the ASID.
2890
*/
2891
if (snp_decommission_context(kvm))
2892
return;
2893
} else {
2894
sev_unbind_asid(kvm, sev->handle);
2895
}
2896
2897
sev_asid_free(sev);
2898
}
2899
2900
void __init sev_set_cpu_caps(void)
2901
{
2902
if (sev_enabled) {
2903
kvm_cpu_cap_set(X86_FEATURE_SEV);
2904
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
2905
}
2906
if (sev_es_enabled) {
2907
kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
2908
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
2909
}
2910
if (sev_snp_enabled) {
2911
kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
2912
kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
2913
}
2914
}
2915
2916
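/*
* Query SNP_PLATFORM_STATUS to check whether SNP has actually been
* initialized in the firmware; used to gate KVM's SNP support.
*/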
static bool is_sev_snp_initialized(void)
2917
{
2918
struct sev_user_data_snp_status *status;
2919
struct sev_data_snp_addr buf;
2920
bool initialized = false;
2921
int ret, error = 0;
2922
2923
status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
2924
if (!status)
2925
return false;
2926
2927
buf.address = __psp_pa(status);
2928
ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
2929
if (ret) {
2930
pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
2931
ret, error, error);
2932
goto out;
2933
}
2934
2935
initialized = !!status->state;
2936
2937
out:
2938
snp_free_firmware_page(status);
2939
2940
return initialized;
2941
}
2942
2943
void __init sev_hardware_setup(void)
2944
{
2945
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
2946
struct sev_platform_init_args init_args = {0};
2947
bool sev_snp_supported = false;
2948
bool sev_es_supported = false;
2949
bool sev_supported = false;
2950
2951
if (!sev_enabled || !npt_enabled || !nrips)
2952
goto out;
2953
2954
/*
2955
* SEV must obviously be supported in hardware. Sanity check that the
2956
* CPU supports decode assists, which is mandatory for SEV guests to
2957
* support instruction emulation. Ditto for flushing by ASID, as SEV
2958
* guests are bound to a single ASID, i.e. KVM can't rotate to a new
2959
* ASID to effect a TLB flush.
2960
*/
2961
if (!boot_cpu_has(X86_FEATURE_SEV) ||
2962
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
2963
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
2964
goto out;
2965
2966
/*
2967
* The kernel's initcall infrastructure lacks the ability to express
2968
* dependencies between initcalls, whereas the modules infrastructure
2969
* automatically handles dependencies via symbol loading. Ensure the
2970
* PSP SEV driver is initialized before proceeding if KVM is built-in,
2971
* as the dependency isn't handled by the initcall infrastructure.
2972
*/
2973
if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init())
2974
goto out;
2975
2976
/* Retrieve SEV CPUID information */
2977
cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
2978
2979
/* Set encryption bit location for SEV-ES guests */
2980
sev_enc_bit = ebx & 0x3f;
2981
2982
/* Maximum number of encrypted guests supported simultaneously */
2983
max_sev_asid = ecx;
2984
if (!max_sev_asid)
2985
goto out;
2986
2987
/* Minimum ASID value that should be used for SEV guest */
2988
min_sev_asid = edx;
2989
sev_me_mask = 1UL << (ebx & 0x3f);
2990
2991
/*
2992
* Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
2993
* even though it's never used, so that the bitmap is indexed by the
2994
* actual ASID.
2995
*/
2996
nr_asids = max_sev_asid + 1;
2997
sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
2998
if (!sev_asid_bitmap)
2999
goto out;
3000
3001
sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
3002
if (!sev_reclaim_asid_bitmap) {
3003
bitmap_free(sev_asid_bitmap);
3004
sev_asid_bitmap = NULL;
3005
goto out;
3006
}
3007
3008
if (min_sev_asid <= max_sev_asid) {
3009
sev_asid_count = max_sev_asid - min_sev_asid + 1;
3010
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
3011
}
3012
sev_supported = true;
3013
3014
/* SEV-ES support requested? */
3015
if (!sev_es_enabled)
3016
goto out;
3017
3018
/*
3019
* SEV-ES requires MMIO caching as KVM doesn't have access to the guest
3020
* instruction stream, i.e. can't emulate in response to a #NPF and
3021
* instead relies on #NPF(RSVD) being reflected into the guest as #VC
3022
* (the guest can then do a #VMGEXIT to request MMIO emulation).
3023
*/
3024
if (!enable_mmio_caching)
3025
goto out;
3026
3027
/* Does the CPU support SEV-ES? */
3028
if (!boot_cpu_has(X86_FEATURE_SEV_ES))
3029
goto out;
3030
3031
if (!lbrv) {
3032
WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
3033
"LBRV must be present for SEV-ES support");
3034
goto out;
3035
}
3036
3037
/* Has the system been allocated ASIDs for SEV-ES? */
3038
if (min_sev_asid == 1)
3039
goto out;
3040
3041
sev_es_asid_count = min_sev_asid - 1;
3042
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
3043
sev_es_supported = true;
3044
sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
3045
3046
out:
3047
if (sev_enabled) {
3048
init_args.probe = true;
3049
if (sev_platform_init(&init_args))
3050
sev_supported = sev_es_supported = sev_snp_supported = false;
3051
else if (sev_snp_supported)
3052
sev_snp_supported = is_sev_snp_initialized();
3053
}
3054
3055
if (boot_cpu_has(X86_FEATURE_SEV))
3056
pr_info("SEV %s (ASIDs %u - %u)\n",
3057
sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
3058
"unusable" :
3059
"disabled",
3060
min_sev_asid, max_sev_asid);
3061
if (boot_cpu_has(X86_FEATURE_SEV_ES))
3062
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
3063
str_enabled_disabled(sev_es_supported),
3064
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
3065
if (boot_cpu_has(X86_FEATURE_SEV_SNP))
3066
pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
3067
str_enabled_disabled(sev_snp_supported),
3068
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
3069
3070
sev_enabled = sev_supported;
3071
sev_es_enabled = sev_es_supported;
3072
sev_snp_enabled = sev_snp_supported;
3073
3074
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
3075
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
3076
sev_es_debug_swap_enabled = false;
3077
3078
sev_supported_vmsa_features = 0;
3079
if (sev_es_debug_swap_enabled)
3080
sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
3081
}
3082
3083
void sev_hardware_unsetup(void)
3084
{
3085
if (!sev_enabled)
3086
return;
3087
3088
/* No need to take sev_bitmap_lock, all VMs have been destroyed. */
3089
sev_flush_asids(1, max_sev_asid);
3090
3091
bitmap_free(sev_asid_bitmap);
3092
bitmap_free(sev_reclaim_asid_bitmap);
3093
3094
misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
3095
misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
3096
3097
sev_platform_shutdown();
3098
}
3099
3100
int sev_cpu_init(struct svm_cpu_data *sd)
3101
{
3102
if (!sev_enabled)
3103
return 0;
3104
3105
sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
3106
if (!sd->sev_vmcbs)
3107
return -ENOMEM;
3108
3109
return 0;
3110
}
3111
3112
/*
3113
* Pages used by hardware to hold guest encrypted state must be flushed before
3114
* returning them to the system.
3115
*/
3116
static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
3117
{
3118
unsigned int asid = sev_get_asid(vcpu->kvm);
3119
3120
/*
3121
* Note! The address must be a kernel address, as regular page walk
3122
* checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
3123
* address is non-deterministic and unsafe. This function deliberately
3124
* takes a pointer to deter passing in a user address.
3125
*/
3126
unsigned long addr = (unsigned long)va;
3127
3128
/*
3129
* If the CPU enforces cache coherency for encrypted mappings of the
* same physical page, use CLFLUSHOPT instead. NOTE: a cache flush is
* still needed in order to work properly with DMA devices.
3132
*/
3133
if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
3134
clflush_cache_range(va, PAGE_SIZE);
3135
return;
3136
}
3137
3138
/*
3139
* VM Page Flush takes a host virtual address and a guest ASID. Fall
3140
* back to full writeback of caches if this faults so as not to make
3141
* any problems worse by leaving stale encrypted data in the cache.
3142
*/
3143
if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
3144
goto do_sev_writeback_caches;
3145
3146
return;
3147
3148
do_sev_writeback_caches:
3149
sev_writeback_caches(vcpu->kvm);
3150
}
3151
3152
void sev_guest_memory_reclaimed(struct kvm *kvm)
3153
{
3154
/*
3155
* With SNP+gmem, private/encrypted memory is unreachable via the
3156
* hva-based mmu notifiers, i.e. these events are explicitly scoped to
3157
* shared pages, where there's no need to flush caches.
3158
*/
3159
if (!sev_guest(kvm) || sev_snp_guest(kvm))
3160
return;
3161
3162
sev_writeback_caches(kvm);
3163
}
3164
3165
void sev_free_vcpu(struct kvm_vcpu *vcpu)
3166
{
3167
struct vcpu_svm *svm;
3168
3169
if (!sev_es_guest(vcpu->kvm))
3170
return;
3171
3172
svm = to_svm(vcpu);
3173
3174
/*
3175
* If it's an SNP guest, then the VMSA was marked in the RMP table as
3176
* a guest-owned page. Transition the page to hypervisor state before
3177
* releasing it back to the system.
3178
*/
3179
if (sev_snp_guest(vcpu->kvm)) {
3180
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
3181
3182
if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
3183
goto skip_vmsa_free;
3184
}
3185
3186
if (vcpu->arch.guest_state_protected)
3187
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
3188
3189
__free_page(virt_to_page(svm->sev_es.vmsa));
3190
3191
skip_vmsa_free:
3192
if (svm->sev_es.ghcb_sa_free)
3193
kvfree(svm->sev_es.ghcb_sa);
3194
}
3195
3196
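/*
* Reassemble the 64-bit GHCB sw_exit_code from the split hi/lo exit code
* fields stashed in the VMCB control area.
*/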
static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
3197
{
3198
return (((u64)control->exit_code_hi) << 32) | control->exit_code;
3199
}
3200
3201
static void dump_ghcb(struct vcpu_svm *svm)
3202
{
3203
struct vmcb_control_area *control = &svm->vmcb->control;
3204
unsigned int nbits;
3205
3206
/* Re-use the dump_invalid_vmcb module parameter */
3207
if (!dump_invalid_vmcb) {
3208
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3209
return;
3210
}
3211
3212
nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
3213
3214
/*
3215
* Print KVM's snapshot of the GHCB values that were (unsuccessfully)
3216
* used to handle the exit. If the guest has since modified the GHCB
3217
* itself, dumping the raw GHCB won't help debug why KVM was unable to
3218
* handle the VMGEXIT that KVM observed.
3219
*/
3220
pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
3221
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
3222
kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
3223
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
3224
control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
3225
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
3226
control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
3227
pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
3228
svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
3229
pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
3230
}
3231
3232
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
3233
{
3234
struct kvm_vcpu *vcpu = &svm->vcpu;
3235
struct ghcb *ghcb = svm->sev_es.ghcb;
3236
3237
/*
3238
* The GHCB protocol so far allows for the following data
3239
* to be returned:
3240
* GPRs RAX, RBX, RCX, RDX
3241
*
3242
* Copy their values, even if they may not have been written during the
3243
* VM-Exit. It's the guest's responsibility to not consume random data.
3244
*/
3245
ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
3246
ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
3247
ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
3248
ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
3249
}
3250
3251
static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
3252
{
3253
struct vmcb_control_area *control = &svm->vmcb->control;
3254
struct kvm_vcpu *vcpu = &svm->vcpu;
3255
struct ghcb *ghcb = svm->sev_es.ghcb;
3256
u64 exit_code;
3257
3258
/*
3259
* The GHCB protocol so far allows for the following data
3260
* to be supplied:
3261
* GPRs RAX, RBX, RCX, RDX
3262
* XCR0
3263
* CPL
3264
*
3265
* VMMCALL allows the guest to provide extra registers. KVM also
3266
* expects RSI for hypercalls, so include that, too.
3267
*
3268
* Copy their values to the appropriate location if supplied.
3269
*/
3270
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
3271
3272
BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
3273
memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
3274
3275
vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
3276
vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
3277
vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
3278
vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
3279
vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);
3280
3281
svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);
3282
3283
if (kvm_ghcb_xcr0_is_valid(svm)) {
3284
vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
3285
vcpu->arch.cpuid_dynamic_bits_dirty = true;
3286
}
3287
3288
/* Copy the GHCB exit information into the VMCB fields */
3289
exit_code = ghcb_get_sw_exit_code(ghcb);
3290
control->exit_code = lower_32_bits(exit_code);
3291
control->exit_code_hi = upper_32_bits(exit_code);
3292
control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
3293
control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
3294
svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);
3295
3296
/* Clear the valid entries fields */
3297
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
3298
}
3299
3300
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
3301
{
3302
struct vmcb_control_area *control = &svm->vmcb->control;
3303
struct kvm_vcpu *vcpu = &svm->vcpu;
3304
u64 exit_code;
3305
u64 reason;
3306
3307
/*
3308
* Retrieve the exit code now even though it may not be marked valid
3309
* as it could help with debugging.
3310
*/
3311
exit_code = kvm_ghcb_get_sw_exit_code(control);
3312
3313
/* Only GHCB Usage code 0 is supported */
3314
if (svm->sev_es.ghcb->ghcb_usage) {
3315
reason = GHCB_ERR_INVALID_USAGE;
3316
goto vmgexit_err;
3317
}
3318
3319
reason = GHCB_ERR_MISSING_INPUT;
3320
3321
if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
3322
!kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
3323
!kvm_ghcb_sw_exit_info_2_is_valid(svm))
3324
goto vmgexit_err;
3325
3326
switch (exit_code) {
3327
case SVM_EXIT_READ_DR7:
3328
break;
3329
case SVM_EXIT_WRITE_DR7:
3330
if (!kvm_ghcb_rax_is_valid(svm))
3331
goto vmgexit_err;
3332
break;
3333
case SVM_EXIT_RDTSC:
3334
break;
3335
case SVM_EXIT_RDPMC:
3336
if (!kvm_ghcb_rcx_is_valid(svm))
3337
goto vmgexit_err;
3338
break;
3339
case SVM_EXIT_CPUID:
3340
if (!kvm_ghcb_rax_is_valid(svm) ||
3341
!kvm_ghcb_rcx_is_valid(svm))
3342
goto vmgexit_err;
3343
if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
3344
if (!kvm_ghcb_xcr0_is_valid(svm))
3345
goto vmgexit_err;
3346
break;
3347
case SVM_EXIT_INVD:
3348
break;
3349
case SVM_EXIT_IOIO:
3350
if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
3351
if (!kvm_ghcb_sw_scratch_is_valid(svm))
3352
goto vmgexit_err;
3353
} else {
3354
if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
3355
if (!kvm_ghcb_rax_is_valid(svm))
3356
goto vmgexit_err;
3357
}
3358
break;
3359
case SVM_EXIT_MSR:
3360
if (!kvm_ghcb_rcx_is_valid(svm))
3361
goto vmgexit_err;
3362
if (control->exit_info_1) {
3363
if (!kvm_ghcb_rax_is_valid(svm) ||
3364
!kvm_ghcb_rdx_is_valid(svm))
3365
goto vmgexit_err;
3366
}
3367
break;
3368
case SVM_EXIT_VMMCALL:
3369
if (!kvm_ghcb_rax_is_valid(svm) ||
3370
!kvm_ghcb_cpl_is_valid(svm))
3371
goto vmgexit_err;
3372
break;
3373
case SVM_EXIT_RDTSCP:
3374
break;
3375
case SVM_EXIT_WBINVD:
3376
break;
3377
case SVM_EXIT_MONITOR:
3378
if (!kvm_ghcb_rax_is_valid(svm) ||
3379
!kvm_ghcb_rcx_is_valid(svm) ||
3380
!kvm_ghcb_rdx_is_valid(svm))
3381
goto vmgexit_err;
3382
break;
3383
case SVM_EXIT_MWAIT:
3384
if (!kvm_ghcb_rax_is_valid(svm) ||
3385
!kvm_ghcb_rcx_is_valid(svm))
3386
goto vmgexit_err;
3387
break;
3388
case SVM_VMGEXIT_MMIO_READ:
3389
case SVM_VMGEXIT_MMIO_WRITE:
3390
if (!kvm_ghcb_sw_scratch_is_valid(svm))
3391
goto vmgexit_err;
3392
break;
3393
case SVM_VMGEXIT_AP_CREATION:
3394
if (!sev_snp_guest(vcpu->kvm))
3395
goto vmgexit_err;
3396
if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
3397
if (!kvm_ghcb_rax_is_valid(svm))
3398
goto vmgexit_err;
3399
break;
3400
case SVM_VMGEXIT_NMI_COMPLETE:
3401
case SVM_VMGEXIT_AP_HLT_LOOP:
3402
case SVM_VMGEXIT_AP_JUMP_TABLE:
3403
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
3404
case SVM_VMGEXIT_HV_FEATURES:
3405
case SVM_VMGEXIT_TERM_REQUEST:
3406
break;
3407
case SVM_VMGEXIT_PSC:
3408
if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
3409
goto vmgexit_err;
3410
break;
3411
case SVM_VMGEXIT_GUEST_REQUEST:
3412
case SVM_VMGEXIT_EXT_GUEST_REQUEST:
3413
if (!sev_snp_guest(vcpu->kvm) ||
3414
!PAGE_ALIGNED(control->exit_info_1) ||
3415
!PAGE_ALIGNED(control->exit_info_2) ||
3416
control->exit_info_1 == control->exit_info_2)
3417
goto vmgexit_err;
3418
break;
3419
default:
3420
reason = GHCB_ERR_INVALID_EVENT;
3421
goto vmgexit_err;
3422
}
3423
3424
return 0;
3425
3426
vmgexit_err:
3427
if (reason == GHCB_ERR_INVALID_USAGE) {
3428
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
3429
svm->sev_es.ghcb->ghcb_usage);
3430
} else if (reason == GHCB_ERR_INVALID_EVENT) {
3431
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
3432
exit_code);
3433
} else {
3434
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
3435
exit_code);
3436
dump_ghcb(svm);
3437
}
3438
3439
svm_vmgexit_bad_input(svm, reason);
3440
3441
/* Resume the guest to "return" the error code. */
3442
return 1;
3443
}
3444
3445
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
3446
{
3447
/* Clear any indication that the vCPU is in a type of AP Reset Hold */
3448
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
3449
3450
if (!svm->sev_es.ghcb)
3451
return;
3452
3453
if (svm->sev_es.ghcb_sa_free) {
3454
/*
3455
* The scratch area lives outside the GHCB, so there is a
3456
* buffer that, depending on the operation performed, may
3457
* need to be synced, then freed.
3458
*/
3459
if (svm->sev_es.ghcb_sa_sync) {
3460
kvm_write_guest(svm->vcpu.kvm,
3461
svm->sev_es.sw_scratch,
3462
svm->sev_es.ghcb_sa,
3463
svm->sev_es.ghcb_sa_len);
3464
svm->sev_es.ghcb_sa_sync = false;
3465
}
3466
3467
kvfree(svm->sev_es.ghcb_sa);
3468
svm->sev_es.ghcb_sa = NULL;
3469
svm->sev_es.ghcb_sa_free = false;
3470
}
3471
3472
trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
3473
3474
sev_es_sync_to_ghcb(svm);
3475
3476
kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
3477
svm->sev_es.ghcb = NULL;
3478
}
3479
3480
int pre_sev_run(struct vcpu_svm *svm, int cpu)
3481
{
3482
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
3483
struct kvm *kvm = svm->vcpu.kvm;
3484
unsigned int asid = sev_get_asid(kvm);
3485
3486
/*
3487
* Reject KVM_RUN if userspace attempts to run the vCPU with an invalid
3488
* VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP
3489
* AP Destroy event.
3490
*/
3491
if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
3492
return -EINVAL;
3493
3494
/*
3495
* To optimize cache flushes when memory is reclaimed from an SEV VM,
3496
* track physical CPUs that enter the guest for SEV VMs and thus can
3497
* have encrypted, dirty data in the cache, and flush caches only for
3498
* CPUs that have entered the guest.
3499
*/
3500
if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
3501
cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
3502
3503
/* Assign the asid allocated with this SEV guest */
3504
svm->asid = asid;
3505
3506
/*
3507
* Flush guest TLB:
3508
*
3509
* 1) when a different VMCB for the same ASID is to be run on the same host CPU, or
* 2) when this VMCB was executed on a different host CPU in previous VMRUNs.
3511
*/
3512
if (sd->sev_vmcbs[asid] == svm->vmcb &&
3513
svm->vcpu.arch.last_vmentry_cpu == cpu)
3514
return 0;
3515
3516
sd->sev_vmcbs[asid] = svm->vmcb;
3517
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3518
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3519
return 0;
3520
}
3521
3522
#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
3523
static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
3524
{
3525
struct vmcb_control_area *control = &svm->vmcb->control;
3526
u64 ghcb_scratch_beg, ghcb_scratch_end;
3527
u64 scratch_gpa_beg, scratch_gpa_end;
3528
void *scratch_va;
3529
3530
scratch_gpa_beg = svm->sev_es.sw_scratch;
3531
if (!scratch_gpa_beg) {
3532
pr_err("vmgexit: scratch gpa not provided\n");
3533
goto e_scratch;
3534
}
3535
3536
scratch_gpa_end = scratch_gpa_beg + len;
3537
if (scratch_gpa_end < scratch_gpa_beg) {
3538
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
3539
len, scratch_gpa_beg);
3540
goto e_scratch;
3541
}
3542
3543
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
3544
/* Scratch area begins within GHCB */
3545
ghcb_scratch_beg = control->ghcb_gpa +
3546
offsetof(struct ghcb, shared_buffer);
3547
ghcb_scratch_end = control->ghcb_gpa +
3548
offsetof(struct ghcb, reserved_0xff0);
3549
3550
/*
3551
* If the scratch area begins within the GHCB, it must be
3552
* completely contained in the GHCB shared buffer area.
3553
*/
3554
if (scratch_gpa_beg < ghcb_scratch_beg ||
3555
scratch_gpa_end > ghcb_scratch_end) {
3556
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
3557
scratch_gpa_beg, scratch_gpa_end);
3558
goto e_scratch;
3559
}
3560
3561
scratch_va = (void *)svm->sev_es.ghcb;
3562
scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
3563
} else {
3564
/*
3565
* The guest memory must be read into a kernel buffer, so
3566
* limit the size.
3567
*/
3568
if (len > GHCB_SCRATCH_AREA_LIMIT) {
3569
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
3570
len, GHCB_SCRATCH_AREA_LIMIT);
3571
goto e_scratch;
3572
}
3573
scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
3574
if (!scratch_va)
3575
return -ENOMEM;
3576
3577
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
3578
/* Unable to copy scratch area from guest */
3579
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
3580
3581
kvfree(scratch_va);
3582
return -EFAULT;
3583
}
3584
3585
/*
3586
* The scratch area is outside the GHCB. The operation will
3587
* dictate whether the buffer needs to be synced before running
3588
* the vCPU next time (i.e. a read was requested so the data
3589
* must be written back to the guest memory).
3590
*/
3591
svm->sev_es.ghcb_sa_sync = sync;
3592
svm->sev_es.ghcb_sa_free = true;
3593
}
3594
3595
svm->sev_es.ghcb_sa = scratch_va;
3596
svm->sev_es.ghcb_sa_len = len;
3597
3598
return 0;
3599
3600
e_scratch:
3601
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA);
3602
3603
return 1;
3604
}
3605
3606
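/*
* Helpers for the GHCB MSR protocol: the GHCB GPA field in the VMCB control
* area doubles as the GHCB MSR value, so requests/responses are read and
* written as bitfields within it.
*/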
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
3607
unsigned int pos)
3608
{
3609
svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
3610
svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
3611
}
3612
3613
static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
3614
{
3615
return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
3616
}
3617
3618
static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
3619
{
3620
svm->vmcb->control.ghcb_gpa = value;
3621
}
3622
3623
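/*
* Split the 2M RMP entry covering @pfn into 4K entries, retrying while
* another processor is concurrently modifying the entry (PSMASH_FAIL_INUSE).
*/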
static int snp_rmptable_psmash(kvm_pfn_t pfn)
3624
{
3625
int ret;
3626
3627
pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
3628
3629
/*
3630
* PSMASH_FAIL_INUSE indicates another processor is modifying the
3631
* entry, so retry until that's no longer the case.
3632
*/
3633
do {
3634
ret = psmash(pfn);
3635
} while (ret == PSMASH_FAIL_INUSE);
3636
3637
return ret;
3638
}
3639
3640
static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
3641
{
3642
struct vcpu_svm *svm = to_svm(vcpu);
3643
3644
if (vcpu->run->hypercall.ret)
3645
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3646
else
3647
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
3648
3649
return 1; /* resume guest */
3650
}
3651
3652
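/*
* Handle a Page State Change request issued via the GHCB MSR protocol:
* validate the requested operation, then forward it to userspace as a
* KVM_HC_MAP_GPA_RANGE hypercall exit covering a single 4K page.
*/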
static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
3653
{
3654
u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
3655
u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
3656
struct kvm_vcpu *vcpu = &svm->vcpu;
3657
3658
if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
3659
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3660
return 1; /* resume guest */
3661
}
3662
3663
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
3664
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3665
return 1; /* resume guest */
3666
}
3667
3668
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
3669
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
3670
/*
3671
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
3672
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
3673
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
3674
* vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
3675
*/
3676
vcpu->run->hypercall.ret = 0;
3677
vcpu->run->hypercall.args[0] = gpa;
3678
vcpu->run->hypercall.args[1] = 1;
3679
vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
3680
? KVM_MAP_GPA_RANGE_ENCRYPTED
3681
: KVM_MAP_GPA_RANGE_DECRYPTED;
3682
vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
3683
3684
vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
3685
3686
return 0; /* forward request to userspace */
3687
}
3688
3689
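/* Guest-provided Page State Change buffer: a header followed by PSC entries. */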
struct psc_buffer {
3690
struct psc_hdr hdr;
3691
struct psc_entry entries[];
3692
} __packed;
3693
3694
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
3695
3696
static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
3697
{
3698
svm->sev_es.psc_inflight = 0;
3699
svm->sev_es.psc_idx = 0;
3700
svm->sev_es.psc_2m = false;
3701
3702
/*
3703
* PSC requests always get a "no action" response in SW_EXITINFO1, with
3704
* a PSC-specific return code in SW_EXITINFO2 that provides the "real"
3705
* return code. E.g. if the PSC request was interrupted, the need to
3706
* retry is communicated via SW_EXITINFO2, not SW_EXITINFO1.
3707
*/
3708
svm_vmgexit_no_action(svm, psc_ret);
3709
}
3710
3711
static void __snp_complete_one_psc(struct vcpu_svm *svm)
3712
{
3713
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
3714
struct psc_entry *entries = psc->entries;
3715
struct psc_hdr *hdr = &psc->hdr;
3716
__u16 idx;
3717
3718
/*
3719
* Everything in-flight has been processed successfully. Update the
3720
* corresponding entries in the guest's PSC buffer and zero out the
3721
* count of in-flight PSC entries.
3722
*/
3723
for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
3724
svm->sev_es.psc_inflight--, idx++) {
3725
struct psc_entry *entry = &entries[idx];
3726
3727
entry->cur_page = entry->pagesize ? 512 : 1;
3728
}
3729
3730
hdr->cur_entry = idx;
3731
}
3732
3733
static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
3734
{
3735
struct vcpu_svm *svm = to_svm(vcpu);
3736
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
3737
3738
if (vcpu->run->hypercall.ret) {
3739
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3740
return 1; /* resume guest */
3741
}
3742
3743
__snp_complete_one_psc(svm);
3744
3745
/* Handle the next range (if any). */
3746
return snp_begin_psc(svm, psc);
3747
}
3748
3749
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
3750
{
3751
struct psc_entry *entries = psc->entries;
3752
struct kvm_vcpu *vcpu = &svm->vcpu;
3753
struct psc_hdr *hdr = &psc->hdr;
3754
struct psc_entry entry_start;
3755
u16 idx, idx_start, idx_end;
3756
int npages;
3757
bool huge;
3758
u64 gfn;
3759
3760
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
3761
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3762
return 1;
3763
}
3764
3765
next_range:
3766
/* There should be no other PSCs in-flight at this point. */
3767
if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
3768
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3769
return 1;
3770
}
3771
3772
/*
3773
* The PSC descriptor buffer can be modified by a misbehaving guest after
3774
* validation, so take care to only use validated copies of values used
3775
* for things like array indexing.
3776
*/
3777
idx_start = hdr->cur_entry;
3778
idx_end = hdr->end_entry;
3779
3780
if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
3781
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
3782
return 1;
3783
}
3784
3785
/* Find the start of the next range which needs processing. */
3786
for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
3787
entry_start = entries[idx];
3788
3789
gfn = entry_start.gfn;
3790
huge = entry_start.pagesize;
3791
npages = huge ? 512 : 1;
3792
3793
if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
3794
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
3795
return 1;
3796
}
3797
3798
if (entry_start.cur_page) {
3799
/*
3800
* If this is a partially-completed 2M range, force 4K handling
3801
* for the remaining pages since they're effectively split at
3802
* this point. Subsequent code should ensure this doesn't get
3803
* combined with adjacent PSC entries where 2M handling is still
3804
* possible.
3805
*/
3806
npages -= entry_start.cur_page;
3807
gfn += entry_start.cur_page;
3808
huge = false;
3809
}
3810
3811
if (npages)
3812
break;
3813
}
3814
3815
if (idx > idx_end) {
3816
/* Nothing more to process. */
3817
snp_complete_psc(svm, 0);
3818
return 1;
3819
}
3820
3821
svm->sev_es.psc_2m = huge;
3822
svm->sev_es.psc_idx = idx;
3823
svm->sev_es.psc_inflight = 1;
3824
3825
/*
3826
* Find all subsequent PSC entries that contain adjacent GPA
3827
* ranges/operations and can be combined into a single
3828
* KVM_HC_MAP_GPA_RANGE exit.
3829
*/
3830
while (++idx <= idx_end) {
3831
struct psc_entry entry = entries[idx];
3832
3833
if (entry.operation != entry_start.operation ||
3834
entry.gfn != entry_start.gfn + npages ||
3835
entry.cur_page || !!entry.pagesize != huge)
3836
break;
3837
3838
svm->sev_es.psc_inflight++;
3839
npages += huge ? 512 : 1;
3840
}
3841
3842
switch (entry_start.operation) {
3843
case VMGEXIT_PSC_OP_PRIVATE:
3844
case VMGEXIT_PSC_OP_SHARED:
3845
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
3846
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
3847
/*
3848
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
3849
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
3850
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM now overwrites
3851
* vcpu->run->hypercall.ret, explicitly zero it here so as not to break QEMU.
3852
*/
3853
vcpu->run->hypercall.ret = 0;
3854
vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
3855
vcpu->run->hypercall.args[1] = npages;
3856
vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
3857
? KVM_MAP_GPA_RANGE_ENCRYPTED
3858
: KVM_MAP_GPA_RANGE_DECRYPTED;
3859
vcpu->run->hypercall.args[2] |= entry_start.pagesize
3860
? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
3861
: KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
3862
vcpu->arch.complete_userspace_io = snp_complete_one_psc;
3863
return 0; /* forward request to userspace */
3864
default:
3865
/*
3866
* Only shared/private PSC operations are currently supported, so if the
3867
* entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
3868
* then consider the entire range completed and avoid exiting to
3869
* userspace. In theory snp_complete_psc() can always be called directly
3870
* at this point to complete the current range and start the next one,
3871
* but that could lead to unexpected levels of recursion.
3872
*/
3873
__snp_complete_one_psc(svm);
3874
goto next_range;
3875
}
3876
3877
BUG();
3878
}
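
/*
 * Editor's sketch of the userspace side, not part of KVM: the
 * KVM_HC_MAP_GPA_RANGE exits generated above report the starting GPA, the
 * number of 4K pages and the attribute flags in args[0..2]. A hypothetical
 * VMM handler (guarded out since it is userspace code) would flip the
 * private/shared attribute and leave run->hypercall.ret at 0 on success:
 */
#if 0	/* illustrative userspace sketch only */
static void example_handle_map_gpa_range(int vm_fd, struct kvm_run *run)
{
	struct kvm_memory_attributes attrs = {
		.address    = run->hypercall.args[0],
		.size       = run->hypercall.args[1] * 4096,
		.attributes = (run->hypercall.args[2] & KVM_MAP_GPA_RANGE_ENCRYPTED) ?
			      KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
	};

	run->hypercall.ret = ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs) ? -errno : 0;
}
#endif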
3879
3880
/*
3881
* Invoked as part of svm_vcpu_reset() processing of an init event.
3882
*/
3883
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
3884
{
3885
struct vcpu_svm *svm = to_svm(vcpu);
3886
struct kvm_memory_slot *slot;
3887
struct page *page;
3888
kvm_pfn_t pfn;
3889
gfn_t gfn;
3890
3891
if (!sev_snp_guest(vcpu->kvm))
3892
return;
3893
3894
guard(mutex)(&svm->sev_es.snp_vmsa_mutex);
3895
3896
if (!svm->sev_es.snp_ap_waiting_for_reset)
3897
return;
3898
3899
svm->sev_es.snp_ap_waiting_for_reset = false;
3900
3901
/* Mark the vCPU as offline and not runnable */
3902
vcpu->arch.pv.pv_unhalted = false;
3903
kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
3904
3905
/* Clear use of the VMSA */
3906
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
3907
3908
/*
3909
* When replacing the VMSA during SEV-SNP AP creation,
3910
* mark the VMCB dirty so that full state is always reloaded.
3911
*/
3912
vmcb_mark_all_dirty(svm->vmcb);
3913
3914
if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
3915
return;
3916
3917
gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
3918
svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
3919
3920
slot = gfn_to_memslot(vcpu->kvm, gfn);
3921
if (!slot)
3922
return;
3923
3924
/*
3925
* The new VMSA will be backed by private guest memory, so retrieve the
3926
* PFN from the gmem backend.
3927
*/
3928
if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
3929
return;
3930
3931
/*
3932
* From this point forward, the VMSA will always be a guest-mapped page
3933
* rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
3934
* theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
3935
* that involves cleanups like flushing caches, which would ideally be
3936
* handled during teardown rather than guest boot. Deferring that also
3937
* allows the existing logic for SEV-ES VMSAs to be re-used with
3938
* minimal SNP-specific changes.
3939
*/
3940
svm->sev_es.snp_has_guest_vmsa = true;
3941
3942
/* Use the new VMSA */
3943
svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
3944
3945
/* Mark the vCPU as runnable */
3946
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
3947
3948
/*
3949
* gmem pages aren't currently migratable, but if this ever changes
3950
* then care should be taken to ensure svm->sev_es.vmsa is pinned
3951
* through some other means.
3952
*/
3953
kvm_release_page_clean(page);
3954
}
3955
3956
static int sev_snp_ap_creation(struct vcpu_svm *svm)
3957
{
3958
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
3959
struct kvm_vcpu *vcpu = &svm->vcpu;
3960
struct kvm_vcpu *target_vcpu;
3961
struct vcpu_svm *target_svm;
3962
unsigned int request;
3963
unsigned int apic_id;
3964
3965
request = lower_32_bits(svm->vmcb->control.exit_info_1);
3966
apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
3967
3968
/* Validate the APIC ID */
3969
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
3970
if (!target_vcpu) {
3971
vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
3972
apic_id);
3973
return -EINVAL;
3974
}
3975
3976
target_svm = to_svm(target_vcpu);
3977
3978
guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex);
3979
3980
switch (request) {
3981
case SVM_VMGEXIT_AP_CREATE_ON_INIT:
3982
case SVM_VMGEXIT_AP_CREATE:
3983
if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
3984
vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
3985
vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features);
3986
return -EINVAL;
3987
}
3988
3989
if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
3990
vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
3991
svm->vmcb->control.exit_info_2);
3992
return -EINVAL;
3993
}
3994
3995
/*
3996
* A malicious guest can RMPADJUST a large page into a VMSA, which
3997
* will hit the SNP erratum where the CPU incorrectly signals an RMP
3998
* violation #PF if a hugepage collides with the RMP entry of the VMSA
3999
* page. Reject the AP CREATE request if the VMSA address from the
4000
* guest is 2M aligned.
4001
*/
4002
if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
4003
vcpu_unimpl(vcpu,
4004
"vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
4005
svm->vmcb->control.exit_info_2);
4006
return -EINVAL;
4007
}
4008
4009
target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
4010
break;
4011
case SVM_VMGEXIT_AP_DESTROY:
4012
target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
4013
break;
4014
default:
4015
vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
4016
request);
4017
return -EINVAL;
4018
}
4019
4020
target_svm->sev_es.snp_ap_waiting_for_reset = true;
4021
4022
/*
4023
* Unless Creation is deferred until INIT, signal the vCPU to update
4024
* its state.
4025
*/
4026
if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
4027
kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
4028
4029
return 0;
4030
}
4031
4032
static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
4033
{
4034
struct sev_data_snp_guest_request data = {0};
4035
struct kvm *kvm = svm->vcpu.kvm;
4036
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
4037
sev_ret_code fw_err = 0;
4038
int ret;
4039
4040
if (!sev_snp_guest(kvm))
4041
return -EINVAL;
4042
4043
mutex_lock(&sev->guest_req_mutex);
4044
4045
if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
4046
ret = -EIO;
4047
goto out_unlock;
4048
}
4049
4050
data.gctx_paddr = __psp_pa(sev->snp_context);
4051
data.req_paddr = __psp_pa(sev->guest_req_buf);
4052
data.res_paddr = __psp_pa(sev->guest_resp_buf);
4053
4054
/*
4055
* Firmware failures are propagated on to the guest, but any other failure
4056
* condition along the way should be reported to userspace. E.g. if
4057
* the PSP is dead and commands are timing out.
4058
*/
4059
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
4060
if (ret && !fw_err)
4061
goto out_unlock;
4062
4063
if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
4064
ret = -EIO;
4065
goto out_unlock;
4066
}
4067
4068
/* No action is requested *from KVM* if there was a firmware error. */
4069
svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));
4070
4071
ret = 1; /* resume guest */
4072
4073
out_unlock:
4074
mutex_unlock(&sev->guest_req_mutex);
4075
return ret;
4076
}
4077
4078
static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
4079
{
4080
struct kvm *kvm = svm->vcpu.kvm;
4081
u8 msg_type;
4082
4083
if (!sev_snp_guest(kvm))
4084
return -EINVAL;
4085
4086
if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
4087
&msg_type, 1))
4088
return -EIO;
4089
4090
/*
4091
* As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
4092
* additional certificate data to be provided alongside the attestation
4093
* report via the guest-provided data pages indicated by RAX/RBX. The
4094
* certificate data is optional and requires additional KVM enablement
4095
* to provide an interface for userspace to provide it, but KVM still
4096
* needs to be able to handle extended guest requests either way. So
4097
* provide a stub implementation that will always return an empty
4098
* certificate table in the guest-provided data pages.
4099
*/
4100
if (msg_type == SNP_MSG_REPORT_REQ) {
4101
struct kvm_vcpu *vcpu = &svm->vcpu;
4102
u64 data_npages;
4103
gpa_t data_gpa;
4104
4105
if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
4106
goto request_invalid;
4107
4108
data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
4109
data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
4110
4111
if (!PAGE_ALIGNED(data_gpa))
4112
goto request_invalid;
4113
4114
/*
4115
* As per GHCB spec (see "SNP Extended Guest Request"), the
4116
* certificate table is terminated by 24-bytes of zeroes.
4117
*/
4118
if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
4119
return -EIO;
4120
}
4121
4122
return snp_handle_guest_req(svm, req_gpa, resp_gpa);
4123
4124
request_invalid:
4125
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4126
return 1; /* resume guest */
4127
}
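
/*
 * Editor's sketch, not part of the driver: the 24 bytes cleared above
 * correspond to one all-zero certificate table entry, which terminates the
 * table per the GHCB spec ("SNP Extended Guest Request"). Roughly, and only
 * for illustration:
 */
struct example_snp_cert_table_entry {
	u8  guid[16];	/* certificate type GUID, all-zero in the terminator */
	u32 offset;	/* offset of the certificate blob from table start */
	u32 length;	/* length of the blob, zero in the terminator */
} __packed;		/* 16 + 4 + 4 = 24 bytes */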
4128
4129
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
4130
{
4131
struct vmcb_control_area *control = &svm->vmcb->control;
4132
struct kvm_vcpu *vcpu = &svm->vcpu;
4133
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
4134
u64 ghcb_info;
4135
int ret = 1;
4136
4137
ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
4138
4139
trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
4140
control->ghcb_gpa);
4141
4142
switch (ghcb_info) {
4143
case GHCB_MSR_SEV_INFO_REQ:
4144
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
4145
GHCB_VERSION_MIN,
4146
sev_enc_bit));
4147
break;
4148
case GHCB_MSR_CPUID_REQ: {
4149
u64 cpuid_fn, cpuid_reg, cpuid_value;
4150
4151
cpuid_fn = get_ghcb_msr_bits(svm,
4152
GHCB_MSR_CPUID_FUNC_MASK,
4153
GHCB_MSR_CPUID_FUNC_POS);
4154
4155
/* Initialize the registers needed by the CPUID intercept */
4156
vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
4157
vcpu->arch.regs[VCPU_REGS_RCX] = 0;
4158
4159
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
4160
if (!ret) {
4161
/* Error, keep GHCB MSR value as-is */
4162
break;
4163
}
4164
4165
cpuid_reg = get_ghcb_msr_bits(svm,
4166
GHCB_MSR_CPUID_REG_MASK,
4167
GHCB_MSR_CPUID_REG_POS);
4168
if (cpuid_reg == 0)
4169
cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
4170
else if (cpuid_reg == 1)
4171
cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
4172
else if (cpuid_reg == 2)
4173
cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
4174
else
4175
cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
4176
4177
set_ghcb_msr_bits(svm, cpuid_value,
4178
GHCB_MSR_CPUID_VALUE_MASK,
4179
GHCB_MSR_CPUID_VALUE_POS);
4180
4181
set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
4182
GHCB_MSR_INFO_MASK,
4183
GHCB_MSR_INFO_POS);
4184
break;
4185
}
4186
case GHCB_MSR_AP_RESET_HOLD_REQ:
4187
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
4188
ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
4189
4190
/*
4191
* Preset the result to a non-SIPI return and then only set
4192
* the result to non-zero when delivering a SIPI.
4193
*/
4194
set_ghcb_msr_bits(svm, 0,
4195
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
4196
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
4197
4198
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
4199
GHCB_MSR_INFO_MASK,
4200
GHCB_MSR_INFO_POS);
4201
break;
4202
case GHCB_MSR_HV_FT_REQ:
4203
set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
4204
GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
4205
set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
4206
GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
4207
break;
4208
case GHCB_MSR_PREF_GPA_REQ:
4209
if (!sev_snp_guest(vcpu->kvm))
4210
goto out_terminate;
4211
4212
set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
4213
GHCB_MSR_GPA_VALUE_POS);
4214
set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
4215
GHCB_MSR_INFO_POS);
4216
break;
4217
case GHCB_MSR_REG_GPA_REQ: {
4218
u64 gfn;
4219
4220
if (!sev_snp_guest(vcpu->kvm))
4221
goto out_terminate;
4222
4223
gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
4224
GHCB_MSR_GPA_VALUE_POS);
4225
4226
svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
4227
4228
set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
4229
GHCB_MSR_GPA_VALUE_POS);
4230
set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
4231
GHCB_MSR_INFO_POS);
4232
break;
4233
}
4234
case GHCB_MSR_PSC_REQ:
4235
if (!sev_snp_guest(vcpu->kvm))
4236
goto out_terminate;
4237
4238
ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
4239
break;
4240
case GHCB_MSR_TERM_REQ: {
4241
u64 reason_set, reason_code;
4242
4243
reason_set = get_ghcb_msr_bits(svm,
4244
GHCB_MSR_TERM_REASON_SET_MASK,
4245
GHCB_MSR_TERM_REASON_SET_POS);
4246
reason_code = get_ghcb_msr_bits(svm,
4247
GHCB_MSR_TERM_REASON_MASK,
4248
GHCB_MSR_TERM_REASON_POS);
4249
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
4250
reason_set, reason_code);
4251
4252
goto out_terminate;
4253
}
4254
default:
4255
/* Error, keep GHCB MSR value as-is */
4256
break;
4257
}
4258
4259
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
4260
control->ghcb_gpa, ret);
4261
4262
return ret;
4263
4264
out_terminate:
4265
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
4266
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
4267
vcpu->run->system_event.ndata = 1;
4268
vcpu->run->system_event.data[0] = control->ghcb_gpa;
4269
4270
return 0;
4271
}
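
/*
 * Editor's sketch, not part of the driver: the MSR protocol handled above
 * keeps the request/response code in the low 12 bits of the GHCB MSR
 * (GHCB_MSR_INFO_MASK) and event-specific data in the upper bits. The driver
 * updates individual fields via set_ghcb_msr_bits(); assembling a response
 * by hand would look roughly like this hypothetical helper:
 */
static inline u64 example_ghcb_msr_resp(u64 resp_code, u64 data, unsigned int data_pos)
{
	return (resp_code & GENMASK_ULL(11, 0)) | (data << data_pos);
}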
4272
4273
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
4274
{
4275
struct vcpu_svm *svm = to_svm(vcpu);
4276
struct vmcb_control_area *control = &svm->vmcb->control;
4277
u64 ghcb_gpa, exit_code;
4278
int ret;
4279
4280
/* Validate the GHCB */
4281
ghcb_gpa = control->ghcb_gpa;
4282
if (ghcb_gpa & GHCB_MSR_INFO_MASK)
4283
return sev_handle_vmgexit_msr_protocol(svm);
4284
4285
if (!ghcb_gpa) {
4286
vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
4287
4288
/* Without a GHCB, just return right back to the guest */
4289
return 1;
4290
}
4291
4292
if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
4293
/* Unable to map GHCB from guest */
4294
vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
4295
ghcb_gpa);
4296
4297
/* Without a GHCB, just return right back to the guest */
4298
return 1;
4299
}
4300
4301
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
4302
4303
trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
4304
4305
sev_es_sync_from_ghcb(svm);
4306
4307
/* SEV-SNP guest requires that the GHCB GPA must be registered */
4308
if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
4309
vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
4310
return -EINVAL;
4311
}
4312
4313
ret = sev_es_validate_vmgexit(svm);
4314
if (ret)
4315
return ret;
4316
4317
svm_vmgexit_success(svm, 0);
4318
4319
exit_code = kvm_ghcb_get_sw_exit_code(control);
4320
switch (exit_code) {
4321
case SVM_VMGEXIT_MMIO_READ:
4322
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
4323
if (ret)
4324
break;
4325
4326
ret = kvm_sev_es_mmio_read(vcpu,
4327
control->exit_info_1,
4328
control->exit_info_2,
4329
svm->sev_es.ghcb_sa);
4330
break;
4331
case SVM_VMGEXIT_MMIO_WRITE:
4332
ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
4333
if (ret)
4334
break;
4335
4336
ret = kvm_sev_es_mmio_write(vcpu,
4337
control->exit_info_1,
4338
control->exit_info_2,
4339
svm->sev_es.ghcb_sa);
4340
break;
4341
case SVM_VMGEXIT_NMI_COMPLETE:
4342
++vcpu->stat.nmi_window_exits;
4343
svm->nmi_masked = false;
4344
kvm_make_request(KVM_REQ_EVENT, vcpu);
4345
ret = 1;
4346
break;
4347
case SVM_VMGEXIT_AP_HLT_LOOP:
4348
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
4349
ret = kvm_emulate_ap_reset_hold(vcpu);
4350
break;
4351
case SVM_VMGEXIT_AP_JUMP_TABLE: {
4352
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
4353
4354
switch (control->exit_info_1) {
4355
case 0:
4356
/* Set AP jump table address */
4357
sev->ap_jump_table = control->exit_info_2;
4358
break;
4359
case 1:
4360
/* Get AP jump table address */
4361
svm_vmgexit_success(svm, sev->ap_jump_table);
4362
break;
4363
default:
4364
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
4365
control->exit_info_1);
4366
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4367
}
4368
4369
ret = 1;
4370
break;
4371
}
4372
case SVM_VMGEXIT_HV_FEATURES:
4373
svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
4374
ret = 1;
4375
break;
4376
case SVM_VMGEXIT_TERM_REQUEST:
4377
pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
4378
control->exit_info_1, control->exit_info_2);
4379
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
4380
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
4381
vcpu->run->system_event.ndata = 1;
4382
vcpu->run->system_event.data[0] = control->ghcb_gpa;
4383
break;
4384
case SVM_VMGEXIT_PSC:
4385
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
4386
if (ret)
4387
break;
4388
4389
ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
4390
break;
4391
case SVM_VMGEXIT_AP_CREATION:
4392
ret = sev_snp_ap_creation(svm);
4393
if (ret)
4394
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4395
4396
4397
ret = 1;
4398
break;
4399
case SVM_VMGEXIT_GUEST_REQUEST:
4400
ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
4401
break;
4402
case SVM_VMGEXIT_EXT_GUEST_REQUEST:
4403
ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
4404
break;
4405
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
4406
vcpu_unimpl(vcpu,
4407
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
4408
control->exit_info_1, control->exit_info_2);
4409
ret = -EINVAL;
4410
break;
4411
default:
4412
ret = svm_invoke_exit_handler(vcpu, exit_code);
4413
}
4414
4415
return ret;
4416
}
4417
4418
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
4419
{
4420
int count;
4421
int bytes;
4422
int r;
4423
4424
if (svm->vmcb->control.exit_info_2 > INT_MAX)
4425
return -EINVAL;
4426
4427
count = svm->vmcb->control.exit_info_2;
4428
if (unlikely(check_mul_overflow(count, size, &bytes)))
4429
return -EINVAL;
4430
4431
r = setup_vmgexit_scratch(svm, in, bytes);
4432
if (r)
4433
return r;
4434
4435
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
4436
count, in);
4437
}
4438
4439
void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
4440
{
4441
/* Clear intercepts on MSRs that are context switched by hardware. */
4442
svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
4443
svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
4444
svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
4445
4446
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
4447
svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
4448
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
4449
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
4450
4451
/*
4452
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
4453
* the host/guest supports its use.
4454
*
4455
* KVM treats the guest as being capable of using XSAVES even if XSAVES
4456
* isn't enabled in guest CPUID as there is no intercept for XSAVES,
4457
* i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is
4458
* exposed to the guest and XSAVES is supported in hardware. Condition
4459
* full XSS passthrough on the guest being able to use XSAVES *and*
4460
* XSAVES being exposed to the guest so that KVM can at least honor
4461
* guest CPUID for RDMSR and WRMSR.
4462
*/
4463
svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
4464
!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
4465
!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
4466
}
4467
4468
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
4469
{
4470
struct kvm_vcpu *vcpu = &svm->vcpu;
4471
struct kvm_cpuid_entry2 *best;
4472
4473
/* For sev guests, the memory encryption bit is not reserved in CR3. */
4474
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4475
if (best)
4476
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4477
}
4478
4479
static void sev_es_init_vmcb(struct vcpu_svm *svm)
4480
{
4481
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
4482
struct vmcb *vmcb = svm->vmcb01.ptr;
4483
4484
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
4485
4486
/*
4487
* An SEV-ES guest requires a VMSA area that is separate from the
4488
* VMCB page. Do not include the encryption mask on the VMSA physical
4489
* address since hardware will access it using the guest key. Note,
4490
* the VMSA will be NULL if this vCPU is the destination for intrahost
4491
* migration, and will be copied later.
4492
*/
4493
if (!svm->sev_es.snp_has_guest_vmsa) {
4494
if (svm->sev_es.vmsa)
4495
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
4496
else
4497
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
4498
}
4499
4500
if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
4501
svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
4502
VMCB_ALLOWED_SEV_FEATURES_VALID;
4503
4504
/* Can't intercept CR register access, HV can't modify CR registers */
4505
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
4506
svm_clr_intercept(svm, INTERCEPT_CR4_READ);
4507
svm_clr_intercept(svm, INTERCEPT_CR8_READ);
4508
svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
4509
svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
4510
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
4511
4512
svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
4513
4514
/* Track EFER/CR register changes */
4515
svm_set_intercept(svm, TRAP_EFER_WRITE);
4516
svm_set_intercept(svm, TRAP_CR0_WRITE);
4517
svm_set_intercept(svm, TRAP_CR4_WRITE);
4518
svm_set_intercept(svm, TRAP_CR8_WRITE);
4519
4520
vmcb->control.intercepts[INTERCEPT_DR] = 0;
4521
if (!sev_vcpu_has_debug_swap(svm)) {
4522
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
4523
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
4524
recalc_intercepts(svm);
4525
} else {
4526
/*
4527
* Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
4528
* allow debugging SEV-ES guests, and enables DebugSwap iff
4529
* NO_NESTED_DATA_BP is supported, so there's no reason to
4530
* intercept #DB when DebugSwap is enabled. For simplicity
4531
* with respect to guest debug, intercept #DB for other VMs
4532
* even if NO_NESTED_DATA_BP is supported, i.e. even if the
4533
* guest can't DoS the CPU with infinite #DB vectoring.
4534
*/
4535
clr_exception_intercept(svm, DB_VECTOR);
4536
}
4537
4538
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
4539
svm_clr_intercept(svm, INTERCEPT_XSETBV);
4540
}
4541
4542
void sev_init_vmcb(struct vcpu_svm *svm)
4543
{
4544
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
4545
clr_exception_intercept(svm, UD_VECTOR);
4546
4547
/*
4548
* Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
4549
* KVM can't decrypt guest memory to decode the faulting instruction.
4550
*/
4551
clr_exception_intercept(svm, GP_VECTOR);
4552
4553
if (sev_es_guest(svm->vcpu.kvm))
4554
sev_es_init_vmcb(svm);
4555
}
4556
4557
void sev_es_vcpu_reset(struct vcpu_svm *svm)
4558
{
4559
struct kvm_vcpu *vcpu = &svm->vcpu;
4560
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
4561
4562
/*
4563
* Set the GHCB MSR value as per the GHCB specification when emulating
4564
* vCPU RESET for an SEV-ES guest.
4565
*/
4566
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
4567
GHCB_VERSION_MIN,
4568
sev_enc_bit));
4569
4570
mutex_init(&svm->sev_es.snp_vmsa_mutex);
4571
}
4572
4573
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
4574
{
4575
struct kvm *kvm = svm->vcpu.kvm;
4576
4577
/*
4578
* All host state for SEV-ES guests is categorized into three swap types
4579
* based on how it is handled by hardware during a world switch:
4580
*
4581
* A: VMRUN: Host state saved in host save area
4582
* VMEXIT: Host state loaded from host save area
4583
*
4584
* B: VMRUN: Host state _NOT_ saved in host save area
4585
* VMEXIT: Host state loaded from host save area
4586
*
4587
* C: VMRUN: Host state _NOT_ saved in host save area
4588
* VMEXIT: Host state initialized to default(reset) values
4589
*
4590
* Manually save type-B state, i.e. state that is loaded by VMEXIT but
4591
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
4592
* by common SVM code).
4593
*/
4594
hostsa->xcr0 = kvm_host.xcr0;
4595
hostsa->pkru = read_pkru();
4596
hostsa->xss = kvm_host.xss;
4597
4598
/*
4599
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
4600
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
4601
* not save or load debug registers. Sadly, KVM can't prevent SNP
4602
* guests from lying about DebugSwap on secondary vCPUs, i.e. the
4603
* SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
4604
* the guest has actually enabled (or not!) in the VMSA.
4605
*
4606
* If DebugSwap is *possible*, save the masks so that they're restored
4607
* if the guest enables DebugSwap. But for the DRs themselves, do NOT
4608
* rely on the CPU to restore the host values; KVM will restore them as
4609
* needed in common code, via hw_breakpoint_restore(). Note, KVM does
4610
* NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
4611
* don't need to be restored per se, KVM just needs to ensure they are
4612
* loaded with the correct values *if* the CPU writes the MSRs.
4613
*/
4614
if (sev_vcpu_has_debug_swap(svm) ||
4615
(sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
4616
hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
4617
hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
4618
hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
4619
hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
4620
}
4621
}
4622
4623
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4624
{
4625
struct vcpu_svm *svm = to_svm(vcpu);
4626
4627
/* First SIPI: Use the values as initially set by the VMM */
4628
if (!svm->sev_es.received_first_sipi) {
4629
svm->sev_es.received_first_sipi = true;
4630
return;
4631
}
4632
4633
/* Subsequent SIPI */
4634
switch (svm->sev_es.ap_reset_hold_type) {
4635
case AP_RESET_HOLD_NAE_EVENT:
4636
/*
4637
* Return from an AP Reset Hold VMGEXIT, where the guest will
4638
* set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
4639
*/
4640
svm_vmgexit_success(svm, 1);
4641
break;
4642
case AP_RESET_HOLD_MSR_PROTO:
4643
/*
4644
* Return from an AP Reset Hold VMGEXIT, where the guest will
4645
* set the CS and RIP. Set GHCB data field to a non-zero value.
4646
*/
4647
set_ghcb_msr_bits(svm, 1,
4648
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
4649
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
4650
4651
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
4652
GHCB_MSR_INFO_MASK,
4653
GHCB_MSR_INFO_POS);
4654
break;
4655
default:
4656
break;
4657
}
4658
}
4659
4660
struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
4661
{
4662
unsigned long pfn;
4663
struct page *p;
4664
4665
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
4666
return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
4667
4668
/*
4669
* Allocate an SNP-safe page to work around the SNP erratum where
4670
* the CPU will incorrectly signal an RMP violation #PF if a
4671
* hugepage (2MB or 1GB) collides with the RMP entry of a
4672
* 2MB-aligned VMCB, VMSA, or AVIC backing page.
4673
*
4674
* Allocate one extra page, choose a page which is not
4675
* 2MB-aligned, and free the other.
4676
*/
4677
p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
4678
if (!p)
4679
return NULL;
4680
4681
split_page(p, 1);
4682
4683
pfn = page_to_pfn(p);
4684
if (IS_ALIGNED(pfn, PTRS_PER_PMD))
4685
__free_page(p++);
4686
else
4687
__free_page(p + 1);
4688
4689
return p;
4690
}
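
/*
 * Editor's sketch (hypothetical caller, not part of the driver): a VMCB,
 * VMSA, or AVIC backing page is typically obtained on the local node like
 * so, relying on snp_safe_alloc_page_node() to dodge the 2MB-alignment
 * erratum described above.
 */
static inline struct page *example_alloc_backing_page(void)
{
	return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
}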
4691
4692
void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
4693
{
4694
struct kvm_memory_slot *slot;
4695
struct kvm *kvm = vcpu->kvm;
4696
int order, rmp_level, ret;
4697
struct page *page;
4698
bool assigned;
4699
kvm_pfn_t pfn;
4700
gfn_t gfn;
4701
4702
gfn = gpa >> PAGE_SHIFT;
4703
4704
/*
4705
* The only time RMP faults occur for shared pages is when the guest is
4706
* triggering an RMP fault for an implicit page-state change from
4707
* shared->private. Implicit page-state changes are forwarded to
4708
* userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
4709
* for shared pages should not end up here.
4710
*/
4711
if (!kvm_mem_is_private(kvm, gfn)) {
4712
pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
4713
gpa);
4714
return;
4715
}
4716
4717
slot = gfn_to_memslot(kvm, gfn);
4718
if (!kvm_slot_can_be_private(slot)) {
4719
pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
4720
gpa);
4721
return;
4722
}
4723
4724
ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
4725
if (ret) {
4726
pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
4727
gpa);
4728
return;
4729
}
4730
4731
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
4732
if (ret || !assigned) {
4733
pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
4734
gpa, pfn, ret);
4735
goto out_no_trace;
4736
}
4737
4738
/*
4739
* There are 2 cases where a PSMASH may be needed to resolve an #NPF
4740
* with PFERR_GUEST_RMP_BIT set:
4741
*
4742
* 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
4743
* bit set if the guest issues them with a smaller granularity than
4744
* what is indicated by the page-size bit in the 2MB RMP entry for
4745
* the PFN that backs the GPA.
4746
*
4747
* 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
4748
* smaller than what is indicated by the 2MB RMP entry for the PFN
4749
* that backs the GPA.
4750
*
4751
* In both these cases, the corresponding 2M RMP entry needs to
4752
* be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
4753
* split into 4K RMP entries, then this is likely a spurious case which
4754
* can occur when there are concurrent accesses by the guest to a 2MB
4755
* GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in
4756
* the process of being PSMASH'd into 4K entries. These cases should
4757
* resolve automatically on subsequent accesses, so just ignore them
4758
* here.
4759
*/
4760
if (rmp_level == PG_LEVEL_4K)
4761
goto out;
4762
4763
ret = snp_rmptable_psmash(pfn);
4764
if (ret) {
4765
/*
4766
* Look it up again. If it's 4K now then the PSMASH may have
4767
* raced with another process and the issue has already resolved
4768
* itself.
4769
*/
4770
if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
4771
assigned && rmp_level == PG_LEVEL_4K)
4772
goto out;
4773
4774
pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
4775
gpa, pfn, ret);
4776
}
4777
4778
kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
4779
out:
4780
trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
4781
out_no_trace:
4782
kvm_release_page_unused(page);
4783
}
4784
4785
static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
4786
{
4787
kvm_pfn_t pfn = start;
4788
4789
while (pfn < end) {
4790
int ret, rmp_level;
4791
bool assigned;
4792
4793
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
4794
if (ret) {
4795
pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
4796
pfn, start, end, rmp_level, ret);
4797
return false;
4798
}
4799
4800
if (assigned) {
4801
pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
4802
__func__, pfn, start, end, rmp_level);
4803
return false;
4804
}
4805
4806
pfn++;
4807
}
4808
4809
return true;
4810
}
4811
4812
static u8 max_level_for_order(int order)
4813
{
4814
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
4815
return PG_LEVEL_2M;
4816
4817
return PG_LEVEL_4K;
4818
}
4819
4820
static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
4821
{
4822
kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
4823
4824
/*
4825
* If this is a large folio, and the entire 2M range containing the
4826
* PFN is currently shared, then the entire 2M-aligned range can be
4827
* set to private via a single 2M RMP entry.
4828
*/
4829
if (max_level_for_order(order) > PG_LEVEL_4K &&
4830
is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
4831
return true;
4832
4833
return false;
4834
}
4835
4836
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
4837
{
4838
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
4839
kvm_pfn_t pfn_aligned;
4840
gfn_t gfn_aligned;
4841
int level, rc;
4842
bool assigned;
4843
4844
if (!sev_snp_guest(kvm))
4845
return 0;
4846
4847
rc = snp_lookup_rmpentry(pfn, &assigned, &level);
4848
if (rc) {
4849
pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
4850
gfn, pfn, rc);
4851
return -ENOENT;
4852
}
4853
4854
if (assigned) {
4855
pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
4856
__func__, gfn, pfn, max_order, level);
4857
return 0;
4858
}
4859
4860
if (is_large_rmp_possible(kvm, pfn, max_order)) {
4861
level = PG_LEVEL_2M;
4862
pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
4863
gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
4864
} else {
4865
level = PG_LEVEL_4K;
4866
pfn_aligned = pfn;
4867
gfn_aligned = gfn;
4868
}
4869
4870
rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
4871
if (rc) {
4872
pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
4873
gfn, pfn, level, rc);
4874
return -EINVAL;
4875
}
4876
4877
pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
4878
__func__, gfn, pfn, pfn_aligned, max_order, level);
4879
4880
return 0;
4881
}
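
/*
 * Editor's worked example, not part of the driver: with PTRS_PER_PMD == 512,
 * a 2M-capable request for gfn 0x5634f backed by pfn 0x1234f is installed
 * via the aligned pair gfn_aligned 0x56200 / pfn_aligned 0x12200, i.e.
 * ALIGN_DOWN(x, 512) clears the low 9 bits before rmp_make_private().
 */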
4882
4883
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
4884
{
4885
kvm_pfn_t pfn;
4886
4887
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
4888
return;
4889
4890
pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
4891
4892
for (pfn = start; pfn < end;) {
4893
bool use_2m_update = false;
4894
int rc, rmp_level;
4895
bool assigned;
4896
4897
rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
4898
if (rc || !assigned)
4899
goto next_pfn;
4900
4901
use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
4902
end >= (pfn + PTRS_PER_PMD) &&
4903
rmp_level > PG_LEVEL_4K;
4904
4905
/*
4906
* If an unaligned PFN corresponds to a 2M region assigned as a
4907
* large page in the RMP table, PSMASH the region into individual
4908
* 4K RMP entries before attempting to convert a 4K sub-page.
4909
*/
4910
if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
4911
/*
4912
* This shouldn't fail, but if it does, report it, but
4913
* still try to update RMP entry to shared and pray this
4914
* was a spurious error that can be addressed later.
4915
*/
4916
rc = snp_rmptable_psmash(pfn);
4917
WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
4918
pfn, rc);
4919
}
4920
4921
rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
4922
if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
4923
pfn, rc))
4924
goto next_pfn;
4925
4926
/*
4927
* SEV-ES avoids host/guest cache coherency issues through
4928
* WBNOINVD hooks issued via MMU notifiers during run-time, and
4929
* KVM's VM destroy path at shutdown. Those MMU notifier events
4930
* don't cover gmem since there is no requirement to map pages
4931
* to a HVA in order to use them for a running guest. While the
4932
* shutdown path would still likely cover things for SNP guests,
4933
* userspace may also free gmem pages during run-time via
4934
* hole-punching operations on the guest_memfd, so flush the
4935
* cache entries for these pages before free'ing them back to
4936
* the host.
4937
*/
4938
clflush_cache_range(__va(pfn_to_hpa(pfn)),
4939
use_2m_update ? PMD_SIZE : PAGE_SIZE);
4940
next_pfn:
4941
pfn += use_2m_update ? PTRS_PER_PMD : 1;
4942
cond_resched();
4943
}
4944
}
4945
4946
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
4947
{
4948
int level, rc;
4949
bool assigned;
4950
4951
if (!sev_snp_guest(kvm))
4952
return 0;
4953
4954
rc = snp_lookup_rmpentry(pfn, &assigned, &level);
4955
if (rc || !assigned)
4956
return PG_LEVEL_4K;
4957
4958
return level;
4959
}
4960
4961
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
4962
{
4963
struct vcpu_svm *svm = to_svm(vcpu);
4964
struct vmcb_save_area *vmsa;
4965
struct kvm_sev_info *sev;
4966
int error = 0;
4967
int ret;
4968
4969
if (!sev_es_guest(vcpu->kvm))
4970
return NULL;
4971
4972
/*
4973
* If the VMSA has not yet been encrypted, return a pointer to the
4974
* current un-encrypted VMSA.
4975
*/
4976
if (!vcpu->arch.guest_state_protected)
4977
return (struct vmcb_save_area *)svm->sev_es.vmsa;
4978
4979
sev = to_kvm_sev_info(vcpu->kvm);
4980
4981
/* Check if the SEV policy allows debugging */
4982
if (sev_snp_guest(vcpu->kvm)) {
4983
if (!(sev->policy & SNP_POLICY_DEBUG))
4984
return NULL;
4985
} else {
4986
if (sev->policy & SEV_POLICY_NODBG)
4987
return NULL;
4988
}
4989
4990
if (sev_snp_guest(vcpu->kvm)) {
4991
struct sev_data_snp_dbg dbg = {0};
4992
4993
vmsa = snp_alloc_firmware_page(__GFP_ZERO);
4994
if (!vmsa)
4995
return NULL;
4996
4997
dbg.gctx_paddr = __psp_pa(sev->snp_context);
4998
dbg.src_addr = svm->vmcb->control.vmsa_pa;
4999
dbg.dst_addr = __psp_pa(vmsa);
5000
5001
ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
5002
5003
/*
5004
* Return the target page to hypervisor ownership no matter what.
5005
* If this fails, the page can't be used, so leak it and don't
5006
* try to use it.
5007
*/
5008
if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
5009
return NULL;
5010
5011
if (ret) {
5012
pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
5013
ret, error, error);
5014
free_page((unsigned long)vmsa);
5015
5016
return NULL;
5017
}
5018
} else {
5019
struct sev_data_dbg dbg = {0};
5020
struct page *vmsa_page;
5021
5022
vmsa_page = alloc_page(GFP_KERNEL);
5023
if (!vmsa_page)
5024
return NULL;
5025
5026
vmsa = page_address(vmsa_page);
5027
5028
dbg.handle = sev->handle;
5029
dbg.src_addr = svm->vmcb->control.vmsa_pa;
5030
dbg.dst_addr = __psp_pa(vmsa);
5031
dbg.len = PAGE_SIZE;
5032
5033
ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
5034
if (ret) {
5035
pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
5036
ret, error, error);
5037
__free_page(vmsa_page);
5038
5039
return NULL;
5040
}
5041
}
5042
5043
return vmsa;
5044
}
5045
5046
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
5047
{
5048
/* If the VMSA has not yet been encrypted, nothing was allocated */
5049
if (!vcpu->arch.guest_state_protected || !vmsa)
5050
return;
5051
5052
free_page((unsigned long)vmsa);
5053
}
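
/*
 * Editor's sketch (hypothetical debug-only caller, not part of the driver):
 * sev_decrypt_vmsa() and sev_free_decrypted_vmsa() are intended to be used
 * as a pair, e.g. when dumping register state of a guest whose policy
 * permits debugging.
 */
static inline void example_dump_vmsa_rip(struct kvm_vcpu *vcpu)
{
	struct vmcb_save_area *vmsa = sev_decrypt_vmsa(vcpu);

	if (vmsa)
		pr_info("SEV: vCPU%d RIP=%#llx\n", vcpu->vcpu_id, vmsa->rip);

	sev_free_decrypted_vmsa(vcpu, vmsa);
}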
5054
5055