GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/svm/sev.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM-SEV support
 *
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/psp.h>
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
#include <uapi/linux/sev-guest.h>

#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/sev.h>

#include "mmu.h"
#include "x86.h"
#include "svm.h"
#include "svm_ops.h"
#include "cpuid.h"
#include "trace.h"

#define GHCB_VERSION_MAX	2ULL
#define GHCB_VERSION_MIN	1ULL

#define GHCB_HV_FT_SUPPORTED	(GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)

/*
 * The GHCB spec essentially states that all non-zero error codes other than
 * those explicitly defined above should be treated as an error by the guest.
 * Define a generic error to cover that case, and choose a value that is not
 * likely to overlap with new explicit error codes should more be added to
 * the GHCB spec later. KVM will use this to report generic errors when
 * handling SNP guest requests.
 */
#define SNP_GUEST_VMM_ERR_GENERIC	(~0U)

/* enable/disable SEV support */
static bool sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);

/* enable/disable SEV-ES support */
static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);

/* enable/disable SEV-SNP support */
static bool sev_snp_enabled = true;
module_param_named(sev_snp, sev_snp_enabled, bool, 0444);

static unsigned int nr_ciphertext_hiding_asids;
module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444);
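
/*
 * Editorial note (not part of the upstream file): the knobs above are 0444
 * module parameters, i.e. read-only at runtime, so they are normally set when
 * the module is loaded. An illustrative invocation (assuming the kvm_amd
 * module) would be: "modprobe kvm_amd sev=1 sev_es=1 sev_snp=0".
 */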

#define AP_RESET_HOLD_NONE		0
#define AP_RESET_HOLD_NAE_EVENT		1
#define AP_RESET_HOLD_MSR_PROTO		2

/*
 * SEV-SNP policy bits that can be supported by KVM. These include policy bits
 * that have implementation support within KVM or policy bits that do not
 * require implementation support within KVM to enforce the policy.
 */
#define KVM_SNP_POLICY_MASK_VALID	(SNP_POLICY_MASK_API_MINOR | \
					 SNP_POLICY_MASK_API_MAJOR | \
					 SNP_POLICY_MASK_SMT | \
					 SNP_POLICY_MASK_RSVD_MBO | \
					 SNP_POLICY_MASK_DEBUG | \
					 SNP_POLICY_MASK_SINGLE_SOCKET | \
					 SNP_POLICY_MASK_CXL_ALLOW | \
					 SNP_POLICY_MASK_MEM_AES_256_XTS | \
					 SNP_POLICY_MASK_RAPL_DIS | \
					 SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \
					 SNP_POLICY_MASK_PAGE_SWAP_DISABLE)

static u64 snp_supported_policy_bits __ro_after_init;

static u64 sev_supported_vmsa_features __ro_after_init;

#define INITIAL_VMSA_GPA 0xFFFFFFFFF000

static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned int max_sev_es_asid;
static unsigned int min_sev_es_asid;
static unsigned int max_snp_asid;
static unsigned int min_snp_asid;
static unsigned long sev_me_mask;
static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;

static int snp_decommission_context(struct kvm *kvm);

struct enc_region {
	struct list_head list;
	unsigned long npages;
	struct page **pages;
	unsigned long uaddr;
	unsigned long size;
};

/* Called with the sev_bitmap_lock held, or on shutdown */
static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
{
	int ret, error = 0;
	unsigned int asid;

	/* Check if there are any ASIDs to reclaim before performing a flush */
	asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
	if (asid > max_asid)
		return -EBUSY;

	/*
	 * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
	 * so it must be guarded.
	 */
	down_write(&sev_deactivate_lock);

	/* SNP firmware requires use of WBINVD for ASID recycling. */
	wbinvd_on_all_cpus();

	if (sev_snp_enabled)
		ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
	else
		ret = sev_guest_df_flush(&error);

	up_write(&sev_deactivate_lock);

	if (ret)
		pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
		       sev_snp_enabled ? "-SNP" : "", ret, error);

	return ret;
}

static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
	return !!to_kvm_sev_info(kvm)->enc_context_owner;
}

static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);

	return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}

static bool snp_is_secure_tsc_enabled(struct kvm *kvm)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);

	return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) &&
	       !WARN_ON_ONCE(!sev_snp_guest(kvm));
}

/* Must be called with the sev_bitmap_lock held */
static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
{
	if (sev_flush_asids(min_asid, max_asid))
		return false;

	/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
	bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
		   nr_asids);
	bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);

	return true;
}
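
/*
 * Editorial note on the recycling above: every bit set in
 * sev_reclaim_asid_bitmap is also set in sev_asid_bitmap (an ASID is marked
 * for reclaim in sev_asid_free() only after it was allocated), so the
 * bitmap_xor() clears exactly the reclaimed ASIDs from the in-use bitmap,
 * making them available for reallocation.
 */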

static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
{
	enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
	return misc_cg_try_charge(type, sev->misc_cg, 1);
}

static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
{
	enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
	misc_cg_uncharge(type, sev->misc_cg, 1);
}

static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type)
{
	/*
	 * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
	 * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
	 */
	unsigned int min_asid, max_asid, asid;
	bool retry = true;
	int ret;

	if (vm_type == KVM_X86_SNP_VM) {
		min_asid = min_snp_asid;
		max_asid = max_snp_asid;
	} else if (sev->es_active) {
		min_asid = min_sev_es_asid;
		max_asid = max_sev_es_asid;
	} else {
		min_asid = min_sev_asid;
		max_asid = max_sev_asid;
	}

	/*
	 * The min ASID can end up larger than the max if basic SEV support is
	 * effectively disabled by disallowing use of ASIDs for SEV guests.
	 * Similarly for SEV-ES guests the min ASID can end up larger than the
	 * max when ciphertext hiding is enabled, effectively disabling SEV-ES
	 * support.
	 */
	if (min_asid > max_asid)
		return -ENOTTY;

	WARN_ON(sev->misc_cg);
	sev->misc_cg = get_current_misc_cg();
	ret = sev_misc_cg_try_charge(sev);
	if (ret) {
		put_misc_cg(sev->misc_cg);
		sev->misc_cg = NULL;
		return ret;
	}

	mutex_lock(&sev_bitmap_lock);

again:
	asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
	if (asid > max_asid) {
		if (retry && __sev_recycle_asids(min_asid, max_asid)) {
			retry = false;
			goto again;
		}
		mutex_unlock(&sev_bitmap_lock);
		ret = -EBUSY;
		goto e_uncharge;
	}

	__set_bit(asid, sev_asid_bitmap);

	mutex_unlock(&sev_bitmap_lock);

	sev->asid = asid;
	return 0;
e_uncharge:
	sev_misc_cg_uncharge(sev);
	put_misc_cg(sev->misc_cg);
	sev->misc_cg = NULL;
	return ret;
}

static unsigned int sev_get_asid(struct kvm *kvm)
{
	return to_kvm_sev_info(kvm)->asid;
}

static void sev_asid_free(struct kvm_sev_info *sev)
{
	struct svm_cpu_data *sd;
	int cpu;

	mutex_lock(&sev_bitmap_lock);

	__set_bit(sev->asid, sev_reclaim_asid_bitmap);

	for_each_possible_cpu(cpu) {
		sd = per_cpu_ptr(&svm_data, cpu);
		sd->sev_vmcbs[sev->asid] = NULL;
	}

	mutex_unlock(&sev_bitmap_lock);

	sev_misc_cg_uncharge(sev);
	put_misc_cg(sev->misc_cg);
	sev->misc_cg = NULL;
}

static void sev_decommission(unsigned int handle)
{
	struct sev_data_decommission decommission;

	if (!handle)
		return;

	decommission.handle = handle;
	sev_guest_decommission(&decommission, NULL);
}

/*
 * Transition a page to hypervisor-owned/shared state in the RMP table. This
 * should not fail under normal conditions, but leak the page should that
 * happen since it will no longer be usable by the host due to RMP protections.
 */
static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
{
	if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
		snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
		return -EIO;
	}

	return 0;
}

/*
 * Certain page-states, such as Pre-Guest and Firmware pages (as documented
 * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
 * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
 * unless they are reclaimed first.
 *
 * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
 * might not be usable by the host due to being set as immutable or still
 * being associated with a guest ASID.
 *
 * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
 * converted back to shared, as the page is no longer usable due to RMP
 * protections, and it's infeasible for the guest to continue on.
 */
static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
{
	struct sev_data_snp_page_reclaim data = {0};
	int fw_err, rc;

	data.paddr = __sme_set(pfn << PAGE_SHIFT);
	rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
	if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
		snp_leak_pages(pfn, 1);
		return -EIO;
	}

	if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
		return -EIO;

	return rc;
}

static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
	struct sev_data_deactivate deactivate;

	if (!handle)
		return;

	deactivate.handle = handle;

	/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
	down_read(&sev_deactivate_lock);
	sev_guest_deactivate(&deactivate, NULL);
	up_read(&sev_deactivate_lock);

	sev_decommission(handle);
}

/*
 * This sets up bounce buffers/firmware pages to handle SNP Guest Request
 * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
 * 2.0 specification for more details.
 *
 * Technically, when an SNP Guest Request is issued, the guest will provide its
 * own request/response pages, which could in theory be passed along directly
 * to firmware rather than using bounce pages. However, these pages would need
 * special care:
 *
 *   - Both pages are from shared guest memory, so they need to be protected
 *     from migration/etc. occurring while firmware reads/writes to them. At a
 *     minimum, this requires elevating the ref counts and potentially needing
 *     an explicit pinning of the memory. This places additional restrictions
 *     on what type of memory backends userspace can use for shared guest
 *     memory since there is some reliance on using refcounted pages.
 *
 *   - The response page needs to be switched to Firmware-owned[1] state
 *     before the firmware can write to it, which can lead to potential
 *     host RMP #PFs if the guest is misbehaved and hands the host a
 *     guest page that KVM might write to for other reasons (e.g. virtio
 *     buffers/etc.).
 *
 * Both of these issues can be avoided completely by using separately-allocated
 * bounce pages for both the request/response pages and passing those to
 * firmware instead. So that's what is being set up here.
 *
 * Guest requests rely on message sequence numbers to ensure requests are
 * issued to firmware in the order the guest issues them, so concurrent guest
 * requests generally shouldn't happen. But a misbehaved guest could issue
 * concurrent guest requests in theory, so a mutex is used to serialize
 * access to the bounce buffers.
 *
 * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
 *     details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
 *     in the APM for details on the related RMP restrictions.
 */
static int snp_guest_req_init(struct kvm *kvm)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	struct page *req_page;

	req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!req_page)
		return -ENOMEM;

	sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!sev->guest_resp_buf) {
		__free_page(req_page);
		return -EIO;
	}

	sev->guest_req_buf = page_address(req_page);
	mutex_init(&sev->guest_req_mutex);

	return 0;
}

static void snp_guest_req_cleanup(struct kvm *kvm)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);

	if (sev->guest_resp_buf)
		snp_free_firmware_page(sev->guest_resp_buf);

	if (sev->guest_req_buf)
		__free_page(virt_to_page(sev->guest_req_buf));

	sev->guest_req_buf = NULL;
	sev->guest_resp_buf = NULL;
}

static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
			    struct kvm_sev_init *data,
			    unsigned long vm_type)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	struct sev_platform_init_args init_args = {0};
	bool es_active = vm_type != KVM_X86_SEV_VM;
	bool snp_active = vm_type == KVM_X86_SNP_VM;
	u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
	int ret;

	if (kvm->created_vcpus)
		return -EINVAL;

	if (data->flags)
		return -EINVAL;

	if (!snp_active)
		valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC;

	if (data->vmsa_features & ~valid_vmsa_features)
		return -EINVAL;

	if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
		return -EINVAL;

	/*
	 * KVM supports the full range of mandatory features defined by version
	 * 2 of the GHCB protocol, so default to that for SEV-ES guests created
	 * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1).
	 */
	if (es_active && !data->ghcb_version)
		data->ghcb_version = 2;

	if (snp_active && data->ghcb_version < 2)
		return -EINVAL;

	if (unlikely(sev->active))
		return -EINVAL;

	sev->active = true;
	sev->es_active = es_active;
	sev->vmsa_features = data->vmsa_features;
	sev->ghcb_version = data->ghcb_version;

	if (snp_active)
		sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;

	ret = sev_asid_new(sev, vm_type);
	if (ret)
		goto e_no_asid;

	init_args.probe = false;
	ret = sev_platform_init(&init_args);
	if (ret)
		goto e_free_asid;

	if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
		ret = -ENOMEM;
		goto e_free_asid;
	}

	/* This needs to happen after SEV/SNP firmware initialization. */
	if (snp_active) {
		ret = snp_guest_req_init(kvm);
		if (ret)
			goto e_free;
	}

	INIT_LIST_HEAD(&sev->regions_list);
	INIT_LIST_HEAD(&sev->mirror_vms);
	sev->need_init = false;

	kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);

	return 0;

e_free:
	free_cpumask_var(sev->have_run_cpus);
e_free_asid:
	argp->error = init_args.error;
	sev_asid_free(sev);
	sev->asid = 0;
e_no_asid:
	sev->vmsa_features = 0;
	sev->es_active = false;
	sev->active = false;
	return ret;
}

static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_init data = {
		.vmsa_features = 0,
		.ghcb_version = 0,
	};
	unsigned long vm_type;

	if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
		return -EINVAL;

	vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);

	/*
	 * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
	 * continue to only ever support the minimal GHCB protocol version.
	 */
	if (vm_type == KVM_X86_SEV_ES_VM)
		data.ghcb_version = GHCB_VERSION_MIN;

	return __sev_guest_init(kvm, argp, &data, vm_type);
}

static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_init data;

	if (!to_kvm_sev_info(kvm)->need_init)
		return -EINVAL;

	if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
	    kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
	    kvm->arch.vm_type != KVM_X86_SNP_VM)
		return -EINVAL;

	if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
		return -EFAULT;

	return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
}

static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
	unsigned int asid = sev_get_asid(kvm);
	struct sev_data_activate activate;
	int ret;

	/* activate ASID on the given handle */
	activate.handle = handle;
	activate.asid = asid;
	ret = sev_guest_activate(&activate, error);

	return ret;
}

static int __sev_issue_cmd(int fd, int id, void *data, int *error)
{
	CLASS(fd, f)(fd);

	if (fd_empty(f))
		return -EBADF;

	return sev_issue_cmd_external_user(fd_file(f), id, data, error);
}

static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);

	return __sev_issue_cmd(sev->fd, id, data, error);
}

static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	struct sev_data_launch_start start;
	struct kvm_sev_launch_start params;
	void *dh_blob, *session_blob;
	int *error = &argp->error;
	int ret;

	if (!sev_guest(kvm))
		return -ENOTTY;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
		return -EFAULT;

	memset(&start, 0, sizeof(start));

	dh_blob = NULL;
	if (params.dh_uaddr) {
		dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
		if (IS_ERR(dh_blob))
			return PTR_ERR(dh_blob);

		start.dh_cert_address = __sme_set(__pa(dh_blob));
		start.dh_cert_len = params.dh_len;
	}

	session_blob = NULL;
	if (params.session_uaddr) {
		session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
		if (IS_ERR(session_blob)) {
			ret = PTR_ERR(session_blob);
			goto e_free_dh;
		}

		start.session_address = __sme_set(__pa(session_blob));
		start.session_len = params.session_len;
	}

	start.handle = params.handle;
	start.policy = params.policy;

	/* create memory encryption context */
	ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
	if (ret)
		goto e_free_session;

	/* Bind ASID to this guest */
	ret = sev_bind_asid(kvm, start.handle, error);
	if (ret) {
		sev_decommission(start.handle);
		goto e_free_session;
	}

	/* return handle to userspace */
	params.handle = start.handle;
	if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
		sev_unbind_asid(kvm, start.handle);
		ret = -EFAULT;
		goto e_free_session;
	}

	sev->policy = params.policy;
	sev->handle = start.handle;
	sev->fd = argp->sev_fd;

e_free_session:
	kfree(session_blob);
e_free_dh:
	kfree(dh_blob);
	return ret;
}

static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
				    unsigned long ulen, unsigned long *n,
				    unsigned int flags)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	unsigned long npages, size;
	int npinned;
	unsigned long locked, lock_limit;
	struct page **pages;
	unsigned long first, last;
	int ret;

	lockdep_assert_held(&kvm->lock);

	if (ulen == 0 || uaddr + ulen < uaddr)
		return ERR_PTR(-EINVAL);

	/* Calculate number of pages. */
	first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
	last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
	npages = (last - first + 1);

	locked = sev->pages_locked + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
		pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
		return ERR_PTR(-ENOMEM);
	}

	if (WARN_ON_ONCE(npages > INT_MAX))
		return ERR_PTR(-EINVAL);

	/* Avoid using vmalloc for smaller buffers. */
	size = npages * sizeof(struct page *);
	if (size > PAGE_SIZE)
		pages = __vmalloc(size, GFP_KERNEL_ACCOUNT);
	else
		pages = kmalloc(size, GFP_KERNEL_ACCOUNT);

	if (!pages)
		return ERR_PTR(-ENOMEM);

	/* Pin the user virtual address. */
	npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
	if (npinned != npages) {
		pr_err("SEV: Failure locking %lu pages.\n", npages);
		ret = -ENOMEM;
		goto err;
	}

	*n = npages;
	sev->pages_locked = locked;

	return pages;

err:
	if (npinned > 0)
		unpin_user_pages(pages, npinned);

	kvfree(pages);
	return ERR_PTR(ret);
}

static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
			     unsigned long npages)
{
	unpin_user_pages(pages, npages);
	kvfree(pages);
	to_kvm_sev_info(kvm)->pages_locked -= npages;
}

static void sev_clflush_pages(struct page *pages[], unsigned long npages)
{
	uint8_t *page_virtual;
	unsigned long i;

	if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
	    pages == NULL)
		return;

	for (i = 0; i < npages; i++) {
		page_virtual = kmap_local_page(pages[i]);
		clflush_cache_range(page_virtual, PAGE_SIZE);
		kunmap_local(page_virtual);
		cond_resched();
	}
}

static void sev_writeback_caches(struct kvm *kvm)
{
	/*
	 * Ensure that all dirty guest tagged cache entries are written back
	 * before releasing the pages back to the system for use. CLFLUSH will
	 * not do this without SME_COHERENT, and flushing many cache lines
	 * individually is slower than blasting WBINVD for large VMs, so issue
	 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
	 * on CPUs that have done VMRUN, i.e. may have dirtied data using the
	 * VM's ASID.
	 *
	 * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
	 * would clear the mask when flushing caches, but doing so requires
	 * serializing multiple calls and having responding CPUs (to the IPI)
	 * mark themselves as still running if they are running (or about to
	 * run) a vCPU for the VM.
	 *
	 * Note, the caller is responsible for ensuring correctness if the mask
	 * can be modified, e.g. if a CPU could be doing VMRUN.
	 */
	wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
}

static unsigned long get_num_contig_pages(unsigned long idx,
					  struct page **inpages, unsigned long npages)
{
	unsigned long paddr, next_paddr;
	unsigned long i = idx + 1, pages = 1;

	/* find the number of contiguous pages starting from idx */
	paddr = __sme_page_pa(inpages[idx]);
	while (i < npages) {
		next_paddr = __sme_page_pa(inpages[i++]);
		if ((paddr + PAGE_SIZE) == next_paddr) {
			pages++;
			paddr = next_paddr;
			continue;
		}
		break;
	}

	return pages;
}

static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
	struct kvm_sev_launch_update_data params;
	struct sev_data_launch_update_data data;
	struct page **inpages;
	int ret;

	if (!sev_guest(kvm))
		return -ENOTTY;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
		return -EFAULT;

	vaddr = params.uaddr;
	size = params.len;
	vaddr_end = vaddr + size;

	/* Lock the user memory. */
	inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
	if (IS_ERR(inpages))
		return PTR_ERR(inpages);

	/*
	 * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
	 * place; the cache may contain the data that was written unencrypted.
	 */
	sev_clflush_pages(inpages, npages);

	data.reserved = 0;
	data.handle = to_kvm_sev_info(kvm)->handle;

	for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
		int offset, len;

		/*
		 * If the user buffer is not page-aligned, calculate the offset
		 * within the page.
		 */
		offset = vaddr & (PAGE_SIZE - 1);

		/* Calculate the number of pages that can be encrypted in one go. */
		pages = get_num_contig_pages(i, inpages, npages);

		len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);

		data.len = len;
		data.address = __sme_page_pa(inpages[i]) + offset;
		ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
		if (ret)
			goto e_unpin;

		size -= len;
		next_vaddr = vaddr + len;
	}

e_unpin:
	/* content of memory is updated, mark pages dirty */
	for (i = 0; i < npages; i++) {
		set_page_dirty_lock(inpages[i]);
		mark_page_accessed(inpages[i]);
	}
	/* unlock the user pages */
	sev_unpin_memory(kvm, inpages, npages);
	return ret;
}

static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
	struct sev_es_save_area *save = svm->sev_es.vmsa;
	struct xregs_state *xsave;
	const u8 *s;
	u8 *d;
	int i;

	/* Check some debug related fields before encrypting the VMSA */
	if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
		return -EINVAL;

	/*
	 * SEV-ES will use a VMSA that is pointed to by the VMCB, not
	 * the traditional VMSA that is part of the VMCB. Copy the
	 * traditional VMSA as it has been built so far (in prep
	 * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
	 */
	memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));

	/* Sync registers */
	save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
	save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
	save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
	save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
	save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
	save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
	save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
#ifdef CONFIG_X86_64
	save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
	save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
	save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
	save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
	save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
	save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
	save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
	save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
#endif
	save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];

	/* Sync some non-GPR registers before encrypting */
	save->xcr0 = svm->vcpu.arch.xcr0;
	save->pkru = svm->vcpu.arch.pkru;
	save->xss = svm->vcpu.arch.ia32_xss;
	save->dr6 = svm->vcpu.arch.dr6;

	save->sev_features = sev->vmsa_features;

	/*
	 * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
	 * breaking older measurements.
	 */
	if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
		xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
		save->x87_dp = xsave->i387.rdp;
		save->mxcsr = xsave->i387.mxcsr;
		save->x87_ftw = xsave->i387.twd;
		save->x87_fsw = xsave->i387.swd;
		save->x87_fcw = xsave->i387.cwd;
		save->x87_fop = xsave->i387.fop;
		save->x87_ds = 0;
		save->x87_cs = 0;
		save->x87_rip = xsave->i387.rip;

		for (i = 0; i < 8; i++) {
			/*
			 * The format of the x87 save area is undocumented and
			 * definitely not what you would expect. It consists of
			 * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
			 * area with bytes 8-9 of each register.
			 */
			d = save->fpreg_x87 + i * 8;
			s = ((u8 *)xsave->i387.st_space) + i * 16;
			memcpy(d, s, 8);
			save->fpreg_x87[64 + i * 2] = s[8];
			save->fpreg_x87[64 + i * 2 + 1] = s[9];
		}
		memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);

		s = get_xsave_addr(xsave, XFEATURE_YMM);
		if (s)
			memcpy(save->fpreg_ymm, s, 256);
		else
			memset(save->fpreg_ymm, 0, 256);
	}

	pr_debug("Virtual Machine Save Area (VMSA):\n");
	print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);

	return 0;
}

static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
				    int *error)
{
	struct sev_data_launch_update_vmsa vmsa;
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;

	if (vcpu->guest_debug) {
		pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
		return -EINVAL;
	}

	/* Perform some pre-encryption checks against the VMSA */
	ret = sev_es_sync_vmsa(svm);
	if (ret)
		return ret;

	/*
	 * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
	 * the VMSA memory content (i.e. it will write the same memory region
	 * with the guest's key), so invalidate it first.
	 */
	clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);

	vmsa.reserved = 0;
	vmsa.handle = to_kvm_sev_info(kvm)->handle;
	vmsa.address = __sme_pa(svm->sev_es.vmsa);
	vmsa.len = PAGE_SIZE;
	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
	if (ret)
		return ret;

	/*
	 * SEV-ES guests maintain an encrypted version of their FPU
	 * state which is restored and saved on VMRUN and VMEXIT.
	 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
	 * do xsave/xrstor on it.
	 */
	fpstate_set_confidential(&vcpu->arch.guest_fpu);
	vcpu->arch.guest_state_protected = true;

	/*
	 * SEV-ES guests mandate LBR Virtualization to be _always_ ON. Enable it
	 * only after setting guest_state_protected because KVM_SET_MSRS allows
	 * dynamic toggling of LBRV (for performance reasons) on write access to
	 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
	 */
	svm_enable_lbrv(vcpu);
	return 0;
}

static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_vcpu *vcpu;
	unsigned long i;
	int ret;

	if (!sev_es_guest(kvm))
		return -ENOTTY;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		ret = mutex_lock_killable(&vcpu->mutex);
		if (ret)
			return ret;

		ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);

		mutex_unlock(&vcpu->mutex);
		if (ret)
			return ret;
	}

	return 0;
}

static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	void __user *measure = u64_to_user_ptr(argp->data);
	struct sev_data_launch_measure data;
	struct kvm_sev_launch_measure params;
	void __user *p = NULL;
	void *blob = NULL;
	int ret;

	if (!sev_guest(kvm))
		return -ENOTTY;

	if (copy_from_user(&params, measure, sizeof(params)))
		return -EFAULT;

	memset(&data, 0, sizeof(data));

	/* User wants to query the blob length */
	if (!params.len)
		goto cmd;

	p = u64_to_user_ptr(params.uaddr);
	if (p) {
		if (params.len > SEV_FW_BLOB_MAX_SIZE)
			return -EINVAL;

		blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
		if (!blob)
			return -ENOMEM;

		data.address = __psp_pa(blob);
		data.len = params.len;
	}

cmd:
	data.handle = to_kvm_sev_info(kvm)->handle;
	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);

	/*
	 * If we query the session length, FW responded with expected data.
	 */
	if (!params.len)
		goto done;

	if (ret)
		goto e_free_blob;

	if (blob) {
		if (copy_to_user(p, blob, params.len))
			ret = -EFAULT;
	}

done:
	params.len = data.len;
	if (copy_to_user(measure, &params, sizeof(params)))
		ret = -EFAULT;
e_free_blob:
	kfree(blob);
	return ret;
}

static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct sev_data_launch_finish data;

	if (!sev_guest(kvm))
		return -ENOTTY;

	data.handle = to_kvm_sev_info(kvm)->handle;
	return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
}

static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_guest_status params;
	struct sev_data_guest_status data;
	int ret;

	if (!sev_guest(kvm))
		return -ENOTTY;

	memset(&data, 0, sizeof(data));

	data.handle = to_kvm_sev_info(kvm)->handle;
	ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
	if (ret)
		return ret;

	params.policy = data.policy;
	params.state = data.state;
	params.handle = data.handle;

	if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
		ret = -EFAULT;

	return ret;
}

static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
			       unsigned long dst, int size,
			       int *error, bool enc)
{
	struct sev_data_dbg data;

	data.reserved = 0;
	data.handle = to_kvm_sev_info(kvm)->handle;
	data.dst_addr = dst;
	data.src_addr = src;
	data.len = size;

	return sev_issue_cmd(kvm,
			     enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
			     &data, error);
}

static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
			     unsigned long dst_paddr, int sz, int *err)
{
	int offset;

	/*
	 * It's safe to read more than we are asked; the caller should ensure
	 * that the destination has enough space.
	 */
	offset = src_paddr & 15;
	src_paddr = round_down(src_paddr, 16);
	sz = round_up(sz + offset, 16);

	return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
}
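
/*
 * Editorial note (not part of the upstream file): the rounding above widens
 * the decrypt window to the 16-byte granularity the debug commands operate
 * on; callers then copy out only the bytes that were actually requested,
 * using the saved offset (see the offset handling in __sev_dbg_decrypt_user()
 * below).
 */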

static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
				  void __user *dst_uaddr,
				  unsigned long dst_paddr,
				  int size, int *err)
{
	struct page *tpage = NULL;
	int ret, offset;

	/* if inputs are not 16-byte aligned then use an intermediate buffer */
	if (!IS_ALIGNED(dst_paddr, 16) ||
	    !IS_ALIGNED(paddr,     16) ||
	    !IS_ALIGNED(size,      16)) {
		tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!tpage)
			return -ENOMEM;

		dst_paddr = __sme_page_pa(tpage);
	}

	ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
	if (ret)
		goto e_free;

	if (tpage) {
		offset = paddr & 15;
		if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
			ret = -EFAULT;
	}

e_free:
	if (tpage)
		__free_page(tpage);

	return ret;
}

static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
				  void __user *vaddr,
				  unsigned long dst_paddr,
				  void __user *dst_vaddr,
				  int size, int *error)
{
	struct page *src_tpage = NULL;
	struct page *dst_tpage = NULL;
	int ret, len = size;

	/* If source buffer is not aligned then use an intermediate buffer */
	if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
		src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
		if (!src_tpage)
			return -ENOMEM;

		if (copy_from_user(page_address(src_tpage), vaddr, size)) {
			__free_page(src_tpage);
			return -EFAULT;
		}

		paddr = __sme_page_pa(src_tpage);
	}

	/*
	 * If destination buffer or length is not aligned then do read-modify-write:
	 * - decrypt destination in an intermediate buffer
	 * - copy the source buffer in an intermediate buffer
	 * - use the intermediate buffer as source buffer
	 */
	if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
		int dst_offset;

		dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
		if (!dst_tpage) {
			ret = -ENOMEM;
			goto e_free;
		}

		ret = __sev_dbg_decrypt(kvm, dst_paddr,
					__sme_page_pa(dst_tpage), size, error);
		if (ret)
			goto e_free;

		/*
		 * If source is kernel buffer then use memcpy() otherwise
		 * copy_from_user().
		 */
		dst_offset = dst_paddr & 15;

		if (src_tpage)
			memcpy(page_address(dst_tpage) + dst_offset,
			       page_address(src_tpage), size);
		else {
			if (copy_from_user(page_address(dst_tpage) + dst_offset,
					   vaddr, size)) {
				ret = -EFAULT;
				goto e_free;
			}
		}

		paddr = __sme_page_pa(dst_tpage);
		dst_paddr = round_down(dst_paddr, 16);
		len = round_up(size, 16);
	}

	ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);

e_free:
	if (src_tpage)
		__free_page(src_tpage);
	if (dst_tpage)
		__free_page(dst_tpage);
	return ret;
}

static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
{
	unsigned long vaddr, vaddr_end, next_vaddr;
	unsigned long dst_vaddr;
	struct page **src_p, **dst_p;
	struct kvm_sev_dbg debug;
	unsigned long n;
	unsigned int size;
	int ret;

	if (!sev_guest(kvm))
		return -ENOTTY;

	if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
		return -EFAULT;

	if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
		return -EINVAL;
	if (!debug.dst_uaddr)
		return -EINVAL;

	vaddr = debug.src_uaddr;
	size = debug.len;
	vaddr_end = vaddr + size;
	dst_vaddr = debug.dst_uaddr;

	for (; vaddr < vaddr_end; vaddr = next_vaddr) {
		int len, s_off, d_off;

		/* lock userspace source and destination page */
		src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
		if (IS_ERR(src_p))
			return PTR_ERR(src_p);

		dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
		if (IS_ERR(dst_p)) {
			sev_unpin_memory(kvm, src_p, n);
			return PTR_ERR(dst_p);
		}

		/*
		 * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
		 * the pages; flush the destination too so that future accesses do not
		 * see stale data.
		 */
		sev_clflush_pages(src_p, 1);
		sev_clflush_pages(dst_p, 1);

		/*
		 * Since user buffer may not be page aligned, calculate the
		 * offset within the page.
		 */
		s_off = vaddr & ~PAGE_MASK;
		d_off = dst_vaddr & ~PAGE_MASK;
		len = min_t(size_t, (PAGE_SIZE - s_off), size);

		if (dec)
			ret = __sev_dbg_decrypt_user(kvm,
						     __sme_page_pa(src_p[0]) + s_off,
						     (void __user *)dst_vaddr,
						     __sme_page_pa(dst_p[0]) + d_off,
						     len, &argp->error);
		else
			ret = __sev_dbg_encrypt_user(kvm,
						     __sme_page_pa(src_p[0]) + s_off,
						     (void __user *)vaddr,
						     __sme_page_pa(dst_p[0]) + d_off,
						     (void __user *)dst_vaddr,
						     len, &argp->error);

		sev_unpin_memory(kvm, src_p, n);
		sev_unpin_memory(kvm, dst_p, n);

		if (ret)
			goto err;

		next_vaddr = vaddr + len;
		dst_vaddr = dst_vaddr + len;
		size -= len;
	}
err:
	return ret;
}

static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct sev_data_launch_secret data;
	struct kvm_sev_launch_secret params;
	struct page **pages;
	void *blob, *hdr;
	unsigned long n, i;
	int ret, offset;

	if (!sev_guest(kvm))
		return -ENOTTY;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
		return -EFAULT;

	pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/*
	 * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
	 * place; the cache may contain the data that was written unencrypted.
	 */
	sev_clflush_pages(pages, n);

	/*
	 * The secret must be copied into a contiguous memory region, so verify
	 * that the userspace memory pages are contiguous before issuing the
	 * command.
	 */
	if (get_num_contig_pages(0, pages, n) != n) {
		ret = -EINVAL;
		goto e_unpin_memory;
	}

	memset(&data, 0, sizeof(data));

	offset = params.guest_uaddr & (PAGE_SIZE - 1);
	data.guest_address = __sme_page_pa(pages[0]) + offset;
	data.guest_len = params.guest_len;

	blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
	if (IS_ERR(blob)) {
		ret = PTR_ERR(blob);
		goto e_unpin_memory;
	}

	data.trans_address = __psp_pa(blob);
	data.trans_len = params.trans_len;

	hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
	if (IS_ERR(hdr)) {
		ret = PTR_ERR(hdr);
		goto e_free_blob;
	}
	data.hdr_address = __psp_pa(hdr);
	data.hdr_len = params.hdr_len;

	data.handle = to_kvm_sev_info(kvm)->handle;
	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);

	kfree(hdr);

e_free_blob:
	kfree(blob);
e_unpin_memory:
	/* content of memory is updated, mark pages dirty */
	for (i = 0; i < n; i++) {
		set_page_dirty_lock(pages[i]);
		mark_page_accessed(pages[i]);
	}
	sev_unpin_memory(kvm, pages, n);
	return ret;
}

static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	void __user *report = u64_to_user_ptr(argp->data);
	struct sev_data_attestation_report data;
	struct kvm_sev_attestation_report params;
	void __user *p;
	void *blob = NULL;
	int ret;

	if (!sev_guest(kvm))
		return -ENOTTY;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
		return -EFAULT;

	memset(&data, 0, sizeof(data));

	/* User wants to query the blob length */
	if (!params.len)
		goto cmd;

	p = u64_to_user_ptr(params.uaddr);
	if (p) {
		if (params.len > SEV_FW_BLOB_MAX_SIZE)
			return -EINVAL;

		blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
		if (!blob)
			return -ENOMEM;

		data.address = __psp_pa(blob);
		data.len = params.len;
		memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
	}
cmd:
	data.handle = to_kvm_sev_info(kvm)->handle;
	ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
	/*
	 * If we query the session length, FW responded with expected data.
	 */
	if (!params.len)
		goto done;

	if (ret)
		goto e_free_blob;

	if (blob) {
		if (copy_to_user(p, blob, params.len))
			ret = -EFAULT;
	}

done:
	params.len = data.len;
	if (copy_to_user(report, &params, sizeof(params)))
		ret = -EFAULT;
e_free_blob:
	kfree(blob);
	return ret;
}

/* Userspace wants to query session length. */
static int
__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
				      struct kvm_sev_send_start *params)
{
	struct sev_data_send_start data;
	int ret;

	memset(&data, 0, sizeof(data));
	data.handle = to_kvm_sev_info(kvm)->handle;
	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);

	params->session_len = data.session_len;
	if (copy_to_user(u64_to_user_ptr(argp->data), params,
			 sizeof(struct kvm_sev_send_start)))
		ret = -EFAULT;

	return ret;
}

static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct sev_data_send_start data;
	struct kvm_sev_send_start params;
	void *amd_certs, *session_data;
	void *pdh_cert, *plat_certs;
	int ret;

	if (!sev_guest(kvm))
		return -ENOTTY;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data),
			   sizeof(struct kvm_sev_send_start)))
		return -EFAULT;

	/* if session_len is zero, userspace wants to query the session length */
	if (!params.session_len)
		return __sev_send_start_query_session_length(kvm, argp,
							     &params);

	/* some sanity checks */
	if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
	    !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
		return -EINVAL;

	/* allocate the memory to hold the session data blob */
	session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
	if (!session_data)
		return -ENOMEM;

	/* copy the certificate blobs from userspace */
	pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
				      params.pdh_cert_len);
	if (IS_ERR(pdh_cert)) {
		ret = PTR_ERR(pdh_cert);
		goto e_free_session;
	}

	plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
					params.plat_certs_len);
	if (IS_ERR(plat_certs)) {
		ret = PTR_ERR(plat_certs);
		goto e_free_pdh;
	}

	amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
				       params.amd_certs_len);
	if (IS_ERR(amd_certs)) {
		ret = PTR_ERR(amd_certs);
		goto e_free_plat_cert;
	}

	/* populate the FW SEND_START field with system physical address */
	memset(&data, 0, sizeof(data));
	data.pdh_cert_address = __psp_pa(pdh_cert);
	data.pdh_cert_len = params.pdh_cert_len;
	data.plat_certs_address = __psp_pa(plat_certs);
	data.plat_certs_len = params.plat_certs_len;
	data.amd_certs_address = __psp_pa(amd_certs);
	data.amd_certs_len = params.amd_certs_len;
	data.session_address = __psp_pa(session_data);
	data.session_len = params.session_len;
	data.handle = to_kvm_sev_info(kvm)->handle;

	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);

	if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
				 session_data, params.session_len)) {
		ret = -EFAULT;
		goto e_free_amd_cert;
	}

	params.policy = data.policy;
	params.session_len = data.session_len;
	if (copy_to_user(u64_to_user_ptr(argp->data), &params,
			 sizeof(struct kvm_sev_send_start)))
		ret = -EFAULT;

e_free_amd_cert:
	kfree(amd_certs);
e_free_plat_cert:
	kfree(plat_certs);
e_free_pdh:
	kfree(pdh_cert);
e_free_session:
	kfree(session_data);
	return ret;
}

/* Userspace wants to query either header or trans length. */
static int
__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
				     struct kvm_sev_send_update_data *params)
{
	struct sev_data_send_update_data data;
	int ret;

	memset(&data, 0, sizeof(data));
	data.handle = to_kvm_sev_info(kvm)->handle;
	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);

	params->hdr_len = data.hdr_len;
	params->trans_len = data.trans_len;

	if (copy_to_user(u64_to_user_ptr(argp->data), params,
			 sizeof(struct kvm_sev_send_update_data)))
		ret = -EFAULT;

	return ret;
}

static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct sev_data_send_update_data data;
	struct kvm_sev_send_update_data params;
	void *hdr, *trans_data;
	struct page **guest_page;
	unsigned long n;
	int ret, offset;

	if (!sev_guest(kvm))
		return -ENOTTY;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data),
			   sizeof(struct kvm_sev_send_update_data)))
		return -EFAULT;

	/* userspace wants to query either header or trans length */
	if (!params.trans_len || !params.hdr_len)
		return __sev_send_update_data_query_lengths(kvm, argp, &params);

	if (!params.trans_uaddr || !params.guest_uaddr ||
	    !params.guest_len || !params.hdr_uaddr)
		return -EINVAL;

	/* Check if we are crossing the page boundary */
	offset = params.guest_uaddr & (PAGE_SIZE - 1);
	if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
		return -EINVAL;

	/* Pin guest memory */
	guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
				    PAGE_SIZE, &n, 0);
	if (IS_ERR(guest_page))
		return PTR_ERR(guest_page);

	/* allocate memory for header and transport buffer */
	ret = -ENOMEM;
	hdr = kzalloc(params.hdr_len, GFP_KERNEL);
	if (!hdr)
		goto e_unpin;

	trans_data = kzalloc(params.trans_len, GFP_KERNEL);
	if (!trans_data)
		goto e_free_hdr;

	memset(&data, 0, sizeof(data));
	data.hdr_address = __psp_pa(hdr);
	data.hdr_len = params.hdr_len;
	data.trans_address = __psp_pa(trans_data);
	data.trans_len = params.trans_len;

	/* The SEND_UPDATE_DATA command requires C-bit to be always set. */
	data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
	data.guest_address |= sev_me_mask;
	data.guest_len = params.guest_len;
	data.handle = to_kvm_sev_info(kvm)->handle;

	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);

	if (ret)
		goto e_free_trans_data;

	/* copy transport buffer to user space */
	if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
			 trans_data, params.trans_len)) {
		ret = -EFAULT;
		goto e_free_trans_data;
	}

	/* Copy packet header to userspace. */
	if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
			 params.hdr_len))
		ret = -EFAULT;

e_free_trans_data:
	kfree(trans_data);
e_free_hdr:
	kfree(hdr);
e_unpin:
	sev_unpin_memory(kvm, guest_page, n);

	return ret;
}

static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct sev_data_send_finish data;

	if (!sev_guest(kvm))
		return -ENOTTY;

	data.handle = to_kvm_sev_info(kvm)->handle;
	return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
}

static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct sev_data_send_cancel data;

	if (!sev_guest(kvm))
		return -ENOTTY;

	data.handle = to_kvm_sev_info(kvm)->handle;
	return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
}

static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	struct sev_data_receive_start start;
	struct kvm_sev_receive_start params;
	int *error = &argp->error;
	void *session_data;
	void *pdh_data;
	int ret;

	if (!sev_guest(kvm))
		return -ENOTTY;

	/* Get parameter from the userspace */
	if (copy_from_user(&params, u64_to_user_ptr(argp->data),
			   sizeof(struct kvm_sev_receive_start)))
		return -EFAULT;

	/* some sanity checks */
	if (!params.pdh_uaddr || !params.pdh_len ||
	    !params.session_uaddr || !params.session_len)
		return -EINVAL;

	pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
	if (IS_ERR(pdh_data))
		return PTR_ERR(pdh_data);

	session_data = psp_copy_user_blob(params.session_uaddr,
					  params.session_len);
	if (IS_ERR(session_data)) {
		ret = PTR_ERR(session_data);
		goto e_free_pdh;
	}

	memset(&start, 0, sizeof(start));
	start.handle = params.handle;
	start.policy = params.policy;
	start.pdh_cert_address = __psp_pa(pdh_data);
	start.pdh_cert_len = params.pdh_len;
	start.session_address = __psp_pa(session_data);
	start.session_len = params.session_len;

	/* create memory encryption context */
	ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
			      error);
	if (ret)
		goto e_free_session;

	/* Bind ASID to this guest */
	ret = sev_bind_asid(kvm, start.handle, error);
	if (ret) {
		sev_decommission(start.handle);
		goto e_free_session;
	}

	params.handle = start.handle;
	if (copy_to_user(u64_to_user_ptr(argp->data),
			 &params, sizeof(struct kvm_sev_receive_start))) {
		ret = -EFAULT;
		sev_unbind_asid(kvm, start.handle);
		goto e_free_session;
	}

	sev->handle = start.handle;
	sev->fd = argp->sev_fd;

e_free_session:
	kfree(session_data);
e_free_pdh:
	kfree(pdh_data);

	return ret;
}

static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_receive_update_data params;
	struct sev_data_receive_update_data data;
	void *hdr = NULL, *trans = NULL;
	struct page **guest_page;
	unsigned long n;
	int ret, offset;

	if (!sev_guest(kvm))
		return -EINVAL;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data),
			   sizeof(struct kvm_sev_receive_update_data)))
		return -EFAULT;

	if (!params.hdr_uaddr || !params.hdr_len ||
	    !params.guest_uaddr || !params.guest_len ||
	    !params.trans_uaddr || !params.trans_len)
		return -EINVAL;

	/* Check if we are crossing the page boundary */
	offset = params.guest_uaddr & (PAGE_SIZE - 1);
	if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
		return -EINVAL;

	hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
	if (IS_ERR(hdr))
		return PTR_ERR(hdr);

	trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto e_free_hdr;
	}

	memset(&data, 0, sizeof(data));
	data.hdr_address = __psp_pa(hdr);
	data.hdr_len = params.hdr_len;
	data.trans_address = __psp_pa(trans);
	data.trans_len = params.trans_len;

	/* Pin guest memory */
	guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
				    PAGE_SIZE, &n, FOLL_WRITE);
	if (IS_ERR(guest_page)) {
		ret = PTR_ERR(guest_page);
		goto e_free_trans;
	}

	/*
	 * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
	 * encrypts the written data with the guest's key, and the cache may
	 * contain dirty, unencrypted data.
	 */
	sev_clflush_pages(guest_page, n);

	/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
	data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
	data.guest_address |= sev_me_mask;
	data.guest_len = params.guest_len;
	data.handle = to_kvm_sev_info(kvm)->handle;

	ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
			    &argp->error);

	sev_unpin_memory(kvm, guest_page, n);

e_free_trans:
	kfree(trans);
e_free_hdr:
	kfree(hdr);

	return ret;
}

static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct sev_data_receive_finish data;

	if (!sev_guest(kvm))
		return -ENOTTY;

	data.handle = to_kvm_sev_info(kvm)->handle;
	return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
}

static bool is_cmd_allowed_from_mirror(u32 cmd_id)
{
	/*
	 * Allow mirror VMs to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
	 * on active mirror VMs. Also allow the debugging and status commands.
	 */
	if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
	    cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
	    cmd_id == KVM_SEV_DBG_ENCRYPT)
		return true;

	return false;
}

static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
	struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
	struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
	int r = -EBUSY;

	if (dst_kvm == src_kvm)
		return -EINVAL;

	/*
	 * Bail if these VMs are already involved in a migration to avoid
	 * deadlock between two VMs trying to migrate to/from each other.
	 */
	if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
		return -EBUSY;

	if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
		goto release_dst;

	r = -EINTR;
	if (mutex_lock_killable(&dst_kvm->lock))
		goto release_src;
	if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
		goto unlock_dst;
	return 0;

unlock_dst:
	mutex_unlock(&dst_kvm->lock);
release_src:
	atomic_set_release(&src_sev->migration_in_progress, 0);
release_dst:
	atomic_set_release(&dst_sev->migration_in_progress, 0);
	return r;
}

static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
	struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
	struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);

	mutex_unlock(&dst_kvm->lock);
	mutex_unlock(&src_kvm->lock);
	atomic_set_release(&dst_sev->migration_in_progress, 0);
	atomic_set_release(&src_sev->migration_in_progress, 0);
}
1963
1964
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
1965
{
1966
struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
1967
struct kvm_sev_info *src = to_kvm_sev_info(src_kvm);
1968
struct kvm_vcpu *dst_vcpu, *src_vcpu;
1969
struct vcpu_svm *dst_svm, *src_svm;
1970
struct kvm_sev_info *mirror;
1971
unsigned long i;
1972
1973
dst->active = true;
1974
dst->asid = src->asid;
1975
dst->handle = src->handle;
1976
dst->pages_locked = src->pages_locked;
1977
dst->enc_context_owner = src->enc_context_owner;
1978
dst->es_active = src->es_active;
1979
dst->vmsa_features = src->vmsa_features;
1980
1981
src->asid = 0;
1982
src->active = false;
1983
src->handle = 0;
1984
src->pages_locked = 0;
1985
src->enc_context_owner = NULL;
1986
src->es_active = false;
1987
1988
list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
1989
1990
/*
1991
* If this VM has mirrors, "transfer" each mirror's refcount of the
1992
* source to the destination (this KVM). The caller holds a reference
1993
* to the source, so there's no danger of use-after-free.
1994
*/
1995
list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
1996
list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
1997
kvm_get_kvm(dst_kvm);
1998
kvm_put_kvm(src_kvm);
1999
mirror->enc_context_owner = dst_kvm;
2000
}
2001
2002
/*
2003
* If this VM is a mirror, remove the old mirror from the owners list
2004
* and add the new mirror to the list.
2005
*/
2006
if (is_mirroring_enc_context(dst_kvm)) {
2007
struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner);
2008
2009
list_del(&src->mirror_entry);
2010
list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
2011
}
2012
2013
kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) {
2014
dst_svm = to_svm(dst_vcpu);
2015
2016
sev_init_vmcb(dst_svm, false);
2017
2018
if (!dst->es_active)
2019
continue;
2020
2021
/*
2022
* Note, the source is not required to have the same number of vCPUs
2023
* as the destination when migrating a vanilla SEV VM. For SEV-ES,
* matching vCPU counts are enforced by sev_check_source_vcpus(), so
* indexing the source by 'i' is safe here.
2024
*/
2025
src_vcpu = kvm_get_vcpu(src_kvm, i);
2026
src_svm = to_svm(src_vcpu);
2027
2028
/*
2029
* Transfer VMSA and GHCB state to the destination. Nullify and
2030
* clear source fields as appropriate, the state now belongs to
2031
* the destination.
2032
*/
2033
memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
2034
dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
2035
dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
2036
dst_vcpu->arch.guest_state_protected = true;
2037
2038
memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
2039
src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
2040
src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
2041
src_vcpu->arch.guest_state_protected = false;
2042
}
2043
}
2044
2045
static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
2046
{
2047
struct kvm_vcpu *src_vcpu;
2048
unsigned long i;
2049
2050
if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
2051
dst->created_vcpus != atomic_read(&dst->online_vcpus))
2052
return -EBUSY;
2053
2054
if (!sev_es_guest(src))
2055
return 0;
2056
2057
if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
2058
return -EINVAL;
2059
2060
kvm_for_each_vcpu(i, src_vcpu, src) {
2061
if (!src_vcpu->arch.guest_state_protected)
2062
return -EINVAL;
2063
}
2064
2065
return 0;
2066
}
2067
2068
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
2069
{
2070
struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm);
2071
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
2072
CLASS(fd, f)(source_fd);
2073
struct kvm *source_kvm;
2074
bool charged = false;
2075
int ret;
2076
2077
if (fd_empty(f))
2078
return -EBADF;
2079
2080
if (!file_is_kvm(fd_file(f)))
2081
return -EBADF;
2082
2083
source_kvm = fd_file(f)->private_data;
2084
ret = sev_lock_two_vms(kvm, source_kvm);
2085
if (ret)
2086
return ret;
2087
2088
if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
2089
sev_guest(kvm) || !sev_guest(source_kvm)) {
2090
ret = -EINVAL;
2091
goto out_unlock;
2092
}
2093
2094
src_sev = to_kvm_sev_info(source_kvm);
2095
2096
dst_sev->misc_cg = get_current_misc_cg();
2097
cg_cleanup_sev = dst_sev;
2098
if (dst_sev->misc_cg != src_sev->misc_cg) {
2099
ret = sev_misc_cg_try_charge(dst_sev);
2100
if (ret)
2101
goto out_dst_cgroup;
2102
charged = true;
2103
}
2104
2105
ret = kvm_lock_all_vcpus(kvm);
2106
if (ret)
2107
goto out_dst_cgroup;
2108
ret = kvm_lock_all_vcpus(source_kvm);
2109
if (ret)
2110
goto out_dst_vcpu;
2111
2112
ret = sev_check_source_vcpus(kvm, source_kvm);
2113
if (ret)
2114
goto out_source_vcpu;
2115
2116
/*
2117
* Allocate a new have_run_cpus for the destination, i.e. don't copy
2118
* the set of CPUs from the source. If a CPU was used to run a vCPU in
2119
* the source VM but is never used for the destination VM, then the CPU
2120
* can only have cached memory that was accessible to the source VM.
2121
*/
2122
if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
2123
ret = -ENOMEM;
2124
goto out_source_vcpu;
2125
}
2126
2127
sev_migrate_from(kvm, source_kvm);
2128
kvm_vm_dead(source_kvm);
2129
cg_cleanup_sev = src_sev;
2130
ret = 0;
2131
2132
out_source_vcpu:
2133
kvm_unlock_all_vcpus(source_kvm);
2134
out_dst_vcpu:
2135
kvm_unlock_all_vcpus(kvm);
2136
out_dst_cgroup:
2137
/* Operates on the source on success, on the destination on failure. */
2138
if (charged)
2139
sev_misc_cg_uncharge(cg_cleanup_sev);
2140
put_misc_cg(cg_cleanup_sev->misc_cg);
2141
cg_cleanup_sev->misc_cg = NULL;
2142
out_unlock:
2143
sev_unlock_two_vms(kvm, source_kvm);
2144
return ret;
2145
}
2146
2147
int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
2148
{
2149
if (group != KVM_X86_GRP_SEV)
2150
return -ENXIO;
2151
2152
switch (attr) {
2153
case KVM_X86_SEV_VMSA_FEATURES:
2154
*val = sev_supported_vmsa_features;
2155
return 0;
2156
2157
case KVM_X86_SNP_POLICY_BITS:
2158
*val = snp_supported_policy_bits;
2159
return 0;
2160
2161
case KVM_X86_SEV_SNP_REQ_CERTS:
2162
*val = sev_snp_enabled ? 1 : 0;
2163
return 0;
2164
default:
2165
return -ENXIO;
2166
}
2167
}
2168
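/*
 * For illustration only, a plausible userspace-side sketch of querying the
 * attributes handled above. It assumes the group is exposed through the
 * standard KVM_GET_DEVICE_ATTR ioctl and that "kvm_fd" is an already-open
 * /dev/kvm fd; both are assumptions, not something this file guarantees:
 *
 *	__u64 vmsa_features = 0;
 *	struct kvm_device_attr attr = {
 *		.group = KVM_X86_GRP_SEV,
 *		.attr  = KVM_X86_SEV_VMSA_FEATURES,
 *		.addr  = (__u64)(unsigned long)&vmsa_features,
 *	};
 *
 *	if (!ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr))
 *		printf("supported VMSA features: 0x%llx\n", vmsa_features);
 */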
2169
/*
2170
* The guest context contains all the information, keys and metadata
2171
* associated with the guest that the firmware tracks to implement SEV
2172
* and SNP features. The firmware stores the guest context in a
2173
* hypervisor-provided page via the SNP_GCTX_CREATE command.
2174
*/
2175
static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
2176
{
2177
struct sev_data_snp_addr data = {};
2178
void *context;
2179
int rc;
2180
2181
/* Allocate memory for context page */
2182
context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
2183
if (!context)
2184
return NULL;
2185
2186
data.address = __psp_pa(context);
2187
rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
2188
if (rc) {
2189
pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
2190
rc, argp->error);
2191
snp_free_firmware_page(context);
2192
return NULL;
2193
}
2194
2195
return context;
2196
}
2197
2198
static int snp_bind_asid(struct kvm *kvm, int *error)
2199
{
2200
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2201
struct sev_data_snp_activate data = {0};
2202
2203
data.gctx_paddr = __psp_pa(sev->snp_context);
2204
data.asid = sev_get_asid(kvm);
2205
return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
2206
}
2207
2208
static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
2209
{
2210
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2211
struct sev_data_snp_launch_start start = {0};
2212
struct kvm_sev_snp_launch_start params;
2213
int rc;
2214
2215
if (!sev_snp_guest(kvm))
2216
return -ENOTTY;
2217
2218
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2219
return -EFAULT;
2220
2221
/* Don't allow userspace to allocate memory for more than 1 SNP context. */
2222
if (sev->snp_context)
2223
return -EINVAL;
2224
2225
if (params.flags)
2226
return -EINVAL;
2227
2228
if (params.policy & ~snp_supported_policy_bits)
2229
return -EINVAL;
2230
2231
/* Check for policy bits that must be set */
2232
if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
2233
return -EINVAL;
2234
2235
if (snp_is_secure_tsc_enabled(kvm)) {
2236
if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz))
2237
return -EINVAL;
2238
2239
start.desired_tsc_khz = kvm->arch.default_tsc_khz;
2240
}
2241
2242
sev->snp_context = snp_context_create(kvm, argp);
2243
if (!sev->snp_context)
2244
return -ENOTTY;
2245
2246
start.gctx_paddr = __psp_pa(sev->snp_context);
2247
start.policy = params.policy;
2248
2249
memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
2250
rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
2251
if (rc) {
2252
pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
2253
__func__, rc);
2254
goto e_free_context;
2255
}
2256
2257
sev->policy = params.policy;
2258
sev->fd = argp->sev_fd;
2259
rc = snp_bind_asid(kvm, &argp->error);
2260
if (rc) {
2261
pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
2262
__func__, rc);
2263
goto e_free_context;
2264
}
2265
2266
return 0;
2267
2268
e_free_context:
2269
snp_decommission_context(kvm);
2270
2271
return rc;
2272
}
2273
2274
struct sev_gmem_populate_args {
2275
__u8 type;
2276
int sev_fd;
2277
int fw_error;
2278
};
2279
2280
static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
2281
struct page *src_page, void *opaque)
2282
{
2283
struct sev_gmem_populate_args *sev_populate_args = opaque;
2284
struct sev_data_snp_launch_update fw_args = {0};
2285
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2286
bool assigned = false;
2287
int level;
2288
int ret;
2289
2290
if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page))
2291
return -EINVAL;
2292
2293
ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level);
2294
if (ret || assigned) {
2295
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
2296
__func__, gfn, ret, assigned);
2297
ret = ret ? -EINVAL : -EEXIST;
2298
goto out;
2299
}
2300
2301
if (src_page) {
2302
void *src_vaddr = kmap_local_page(src_page);
2303
void *dst_vaddr = kmap_local_pfn(pfn);
2304
2305
memcpy(dst_vaddr, src_vaddr, PAGE_SIZE);
2306
2307
kunmap_local(src_vaddr);
2308
kunmap_local(dst_vaddr);
2309
}
2310
2311
ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K,
2312
sev_get_asid(kvm), true);
2313
if (ret)
2314
goto out;
2315
2316
fw_args.gctx_paddr = __psp_pa(sev->snp_context);
2317
fw_args.address = __sme_set(pfn_to_hpa(pfn));
2318
fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
2319
fw_args.page_type = sev_populate_args->type;
2320
2321
ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
2322
&fw_args, &sev_populate_args->fw_error);
2323
/*
2324
* If the firmware command failed, handle the reclaim and cleanup of that
2325
* PFN before reporting an error.
2326
*
2327
* Additionally, when invalid CPUID function entries are detected,
2328
* firmware writes the expected values into the page and leaves it
2329
* unencrypted so it can be used for debugging and error-reporting.
2330
*
2331
* Copy this page back into the source buffer so userspace can use this
2332
* information to determine which CPUID leaves/fields failed CPUID
2333
* validation.
2334
*/
2335
if (ret && !snp_page_reclaim(kvm, pfn) &&
2336
sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
2337
sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
2338
void *src_vaddr = kmap_local_page(src_page);
2339
void *dst_vaddr = kmap_local_pfn(pfn);
2340
2341
memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);
2342
2343
kunmap_local(src_vaddr);
2344
kunmap_local(dst_vaddr);
2345
}
2346
2347
out:
2348
if (ret)
2349
pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n",
2350
__func__, gfn, ret, sev_populate_args->fw_error);
2351
return ret;
2352
}
2353
2354
static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
2355
{
2356
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2357
struct sev_gmem_populate_args sev_populate_args = {0};
2358
struct kvm_sev_snp_launch_update params;
2359
struct kvm_memory_slot *memslot;
2360
long npages, count;
2361
void __user *src;
2362
int ret = 0;
2363
2364
if (!sev_snp_guest(kvm) || !sev->snp_context)
2365
return -EINVAL;
2366
2367
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2368
return -EFAULT;
2369
2370
pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
2371
params.gfn_start, params.len, params.type, params.flags);
2372
2373
if (!params.len || !PAGE_ALIGNED(params.len) || params.flags ||
2374
(params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
2375
params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
2376
params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
2377
params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
2378
params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
2379
return -EINVAL;
2380
2381
src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
2382
2383
if (!PAGE_ALIGNED(src))
2384
return -EINVAL;
2385
2386
npages = params.len / PAGE_SIZE;
2387
2388
/*
2389
* For each GFN that's being prepared as part of the initial guest
2390
* state, the following pre-conditions are verified:
2391
*
2392
* 1) The backing memslot is a valid private memslot.
2393
* 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
2394
* beforehand.
2395
* 3) The PFN of the guest_memfd has not already been set to private
2396
* in the RMP table.
2397
*
2398
* The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
2399
* faults if there's a race between a fault and an attribute update via
2400
* KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
2401
* here. However, kvm->slots_lock guards against both this as well as
2402
* concurrent memslot updates occurring while these checks are being
2403
* performed, so use that here to make it easier to reason about the
2404
* initial expected state and better guard against unexpected
2405
* situations.
2406
*/
2407
mutex_lock(&kvm->slots_lock);
2408
2409
memslot = gfn_to_memslot(kvm, params.gfn_start);
2410
if (!kvm_slot_has_gmem(memslot)) {
2411
ret = -EINVAL;
2412
goto out;
2413
}
2414
2415
sev_populate_args.sev_fd = argp->sev_fd;
2416
sev_populate_args.type = params.type;
2417
2418
count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
2419
sev_gmem_post_populate, &sev_populate_args);
2420
if (count < 0) {
2421
argp->error = sev_populate_args.fw_error;
2422
pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
2423
__func__, count, argp->error);
2424
ret = -EIO;
2425
} else {
2426
params.gfn_start += count;
2427
params.len -= count * PAGE_SIZE;
2428
if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
2429
params.uaddr += count * PAGE_SIZE;
2430
2431
ret = 0;
2432
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
2433
ret = -EFAULT;
2434
}
2435
2436
out:
2437
mutex_unlock(&kvm->slots_lock);
2438
2439
return ret;
2440
}
2441
2442
static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
2443
{
2444
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2445
struct sev_data_snp_launch_update data = {};
2446
struct kvm_vcpu *vcpu;
2447
unsigned long i;
2448
int ret;
2449
2450
data.gctx_paddr = __psp_pa(sev->snp_context);
2451
data.page_type = SNP_PAGE_TYPE_VMSA;
2452
2453
kvm_for_each_vcpu(i, vcpu, kvm) {
2454
struct vcpu_svm *svm = to_svm(vcpu);
2455
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
2456
2457
ret = sev_es_sync_vmsa(svm);
2458
if (ret)
2459
return ret;
2460
2461
/* Transition the VMSA page to a firmware state. */
2462
ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
2463
if (ret)
2464
return ret;
2465
2466
/* Issue the SNP command to encrypt the VMSA */
2467
data.address = __sme_pa(svm->sev_es.vmsa);
2468
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
2469
&data, &argp->error);
2470
if (ret) {
2471
snp_page_reclaim(kvm, pfn);
2472
2473
return ret;
2474
}
2475
2476
svm->vcpu.arch.guest_state_protected = true;
2477
/*
2478
* SEV-ES (and thus SNP) guests mandate LBR Virtualization to
2479
* be _always_ ON. Enable it only after setting
2480
* guest_state_protected because KVM_SET_MSRS allows dynamic
2481
* toggling of LBRV (for performance reasons) on write access to
2482
* MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
2483
*/
2484
svm_enable_lbrv(vcpu);
2485
}
2486
2487
return 0;
2488
}
2489
2490
static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
2491
{
2492
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2493
struct kvm_sev_snp_launch_finish params;
2494
struct sev_data_snp_launch_finish *data;
2495
void *id_block = NULL, *id_auth = NULL;
2496
int ret;
2497
2498
if (!sev_snp_guest(kvm))
2499
return -ENOTTY;
2500
2501
if (!sev->snp_context)
2502
return -EINVAL;
2503
2504
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2505
return -EFAULT;
2506
2507
if (params.flags)
2508
return -EINVAL;
2509
2510
/* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
2511
ret = snp_launch_update_vmsa(kvm, argp);
2512
if (ret)
2513
return ret;
2514
2515
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
2516
if (!data)
2517
return -ENOMEM;
2518
2519
if (params.id_block_en) {
2520
id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
2521
if (IS_ERR(id_block)) {
2522
ret = PTR_ERR(id_block);
2523
goto e_free;
2524
}
2525
2526
data->id_block_en = 1;
2527
data->id_block_paddr = __sme_pa(id_block);
2528
2529
id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
2530
if (IS_ERR(id_auth)) {
2531
ret = PTR_ERR(id_auth);
2532
goto e_free_id_block;
2533
}
2534
2535
data->id_auth_paddr = __sme_pa(id_auth);
2536
2537
if (params.auth_key_en)
2538
data->auth_key_en = 1;
2539
}
2540
2541
data->vcek_disabled = params.vcek_disabled;
2542
2543
memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
2544
data->gctx_paddr = __psp_pa(sev->snp_context);
2545
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
2546
2547
/*
2548
* Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
2549
* can be given to the guest simply by marking the RMP entry as private.
2550
* This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
2551
*/
2552
if (!ret)
2553
kvm->arch.pre_fault_allowed = true;
2554
2555
kfree(id_auth);
2556
2557
e_free_id_block:
2558
kfree(id_block);
2559
2560
e_free:
2561
kfree(data);
2562
2563
return ret;
2564
}
2565
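/*
 * Putting the pieces above together, the expected launch ordering is:
 * KVM_SEV_SNP_LAUNCH_START creates the guest context and binds an ASID,
 * one or more KVM_SEV_SNP_LAUNCH_UPDATE calls insert and measure the
 * initial guest memory, and KVM_SEV_SNP_LAUNCH_FINISH measures each
 * vCPU's VMSA (via snp_launch_update_vmsa()) before asking firmware to
 * finalize the launch measurement.
 */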
2566
static int snp_enable_certs(struct kvm *kvm)
2567
{
2568
if (kvm->created_vcpus || !sev_snp_guest(kvm))
2569
return -EINVAL;
2570
2571
to_kvm_sev_info(kvm)->snp_certs_enabled = true;
2572
2573
return 0;
2574
}
2575
2576
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
2577
{
2578
struct kvm_sev_cmd sev_cmd;
2579
int r;
2580
2581
if (!sev_enabled)
2582
return -ENOTTY;
2583
2584
if (!argp)
2585
return 0;
2586
2587
if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
2588
return -EFAULT;
2589
2590
mutex_lock(&kvm->lock);
2591
2592
/* Only the enc_context_owner handles some memory enc operations. */
2593
if (is_mirroring_enc_context(kvm) &&
2594
!is_cmd_allowed_from_mirror(sev_cmd.id)) {
2595
r = -EINVAL;
2596
goto out;
2597
}
2598
2599
/*
2600
* Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
2601
* allow the use of SNP-specific commands.
2602
*/
2603
if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
2604
r = -EPERM;
2605
goto out;
2606
}
2607
2608
switch (sev_cmd.id) {
2609
case KVM_SEV_ES_INIT:
2610
if (!sev_es_enabled) {
2611
r = -ENOTTY;
2612
goto out;
2613
}
2614
fallthrough;
2615
case KVM_SEV_INIT:
2616
r = sev_guest_init(kvm, &sev_cmd);
2617
break;
2618
case KVM_SEV_INIT2:
2619
r = sev_guest_init2(kvm, &sev_cmd);
2620
break;
2621
case KVM_SEV_LAUNCH_START:
2622
r = sev_launch_start(kvm, &sev_cmd);
2623
break;
2624
case KVM_SEV_LAUNCH_UPDATE_DATA:
2625
r = sev_launch_update_data(kvm, &sev_cmd);
2626
break;
2627
case KVM_SEV_LAUNCH_UPDATE_VMSA:
2628
r = sev_launch_update_vmsa(kvm, &sev_cmd);
2629
break;
2630
case KVM_SEV_LAUNCH_MEASURE:
2631
r = sev_launch_measure(kvm, &sev_cmd);
2632
break;
2633
case KVM_SEV_LAUNCH_FINISH:
2634
r = sev_launch_finish(kvm, &sev_cmd);
2635
break;
2636
case KVM_SEV_GUEST_STATUS:
2637
r = sev_guest_status(kvm, &sev_cmd);
2638
break;
2639
case KVM_SEV_DBG_DECRYPT:
2640
r = sev_dbg_crypt(kvm, &sev_cmd, true);
2641
break;
2642
case KVM_SEV_DBG_ENCRYPT:
2643
r = sev_dbg_crypt(kvm, &sev_cmd, false);
2644
break;
2645
case KVM_SEV_LAUNCH_SECRET:
2646
r = sev_launch_secret(kvm, &sev_cmd);
2647
break;
2648
case KVM_SEV_GET_ATTESTATION_REPORT:
2649
r = sev_get_attestation_report(kvm, &sev_cmd);
2650
break;
2651
case KVM_SEV_SEND_START:
2652
r = sev_send_start(kvm, &sev_cmd);
2653
break;
2654
case KVM_SEV_SEND_UPDATE_DATA:
2655
r = sev_send_update_data(kvm, &sev_cmd);
2656
break;
2657
case KVM_SEV_SEND_FINISH:
2658
r = sev_send_finish(kvm, &sev_cmd);
2659
break;
2660
case KVM_SEV_SEND_CANCEL:
2661
r = sev_send_cancel(kvm, &sev_cmd);
2662
break;
2663
case KVM_SEV_RECEIVE_START:
2664
r = sev_receive_start(kvm, &sev_cmd);
2665
break;
2666
case KVM_SEV_RECEIVE_UPDATE_DATA:
2667
r = sev_receive_update_data(kvm, &sev_cmd);
2668
break;
2669
case KVM_SEV_RECEIVE_FINISH:
2670
r = sev_receive_finish(kvm, &sev_cmd);
2671
break;
2672
case KVM_SEV_SNP_LAUNCH_START:
2673
r = snp_launch_start(kvm, &sev_cmd);
2674
break;
2675
case KVM_SEV_SNP_LAUNCH_UPDATE:
2676
r = snp_launch_update(kvm, &sev_cmd);
2677
break;
2678
case KVM_SEV_SNP_LAUNCH_FINISH:
2679
r = snp_launch_finish(kvm, &sev_cmd);
2680
break;
2681
case KVM_SEV_SNP_ENABLE_REQ_CERTS:
2682
r = snp_enable_certs(kvm);
2683
break;
2684
default:
2685
r = -EINVAL;
2686
goto out;
2687
}
2688
2689
if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
2690
r = -EFAULT;
2691
2692
out:
2693
mutex_unlock(&kvm->lock);
2694
return r;
2695
}
2696
2697
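/*
 * For illustration only, a minimal userspace-side sketch of driving the
 * dispatcher above via the KVM_MEMORY_ENCRYPT_OP VM ioctl; the variable
 * names (vm_fd, sev_dev_fd, init_params) are hypothetical:
 *
 *	struct kvm_sev_cmd cmd = {
 *		.id     = KVM_SEV_INIT2,
 *		.data   = (__u64)(unsigned long)&init_params,
 *		.sev_fd = sev_dev_fd,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
 *		fprintf(stderr, "SEV cmd failed: errno %d, fw error %u\n",
 *			errno, cmd.error);
 */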
int sev_mem_enc_register_region(struct kvm *kvm,
2698
struct kvm_enc_region *range)
2699
{
2700
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2701
struct enc_region *region;
2702
int ret = 0;
2703
2704
if (!sev_guest(kvm))
2705
return -ENOTTY;
2706
2707
/* If kvm is mirroring encryption context it isn't responsible for it */
2708
if (is_mirroring_enc_context(kvm))
2709
return -EINVAL;
2710
2711
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
2712
return -EINVAL;
2713
2714
region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
2715
if (!region)
2716
return -ENOMEM;
2717
2718
mutex_lock(&kvm->lock);
2719
region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
2720
FOLL_WRITE | FOLL_LONGTERM);
2721
if (IS_ERR(region->pages)) {
2722
ret = PTR_ERR(region->pages);
2723
mutex_unlock(&kvm->lock);
2724
goto e_free;
2725
}
2726
2727
/*
2728
* The guest may change the memory encryption attribute from C=0 -> C=1
2729
* or vice versa for this memory range. Let's make sure caches are
2730
* flushed to ensure that guest data gets written into memory with
2731
* correct C-bit. Note, this must be done before dropping kvm->lock,
2732
* as region and its array of pages can be freed by a different task
2733
* once kvm->lock is released.
2734
*/
2735
sev_clflush_pages(region->pages, region->npages);
2736
2737
region->uaddr = range->addr;
2738
region->size = range->size;
2739
2740
list_add_tail(&region->list, &sev->regions_list);
2741
mutex_unlock(&kvm->lock);
2742
2743
return ret;
2744
2745
e_free:
2746
kfree(region);
2747
return ret;
2748
}
2749
2750
static struct enc_region *
2751
find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
2752
{
2753
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2754
struct list_head *head = &sev->regions_list;
2755
struct enc_region *i;
2756
2757
list_for_each_entry(i, head, list) {
2758
if (i->uaddr == range->addr &&
2759
i->size == range->size)
2760
return i;
2761
}
2762
2763
return NULL;
2764
}
2765
2766
static void __unregister_enc_region_locked(struct kvm *kvm,
2767
struct enc_region *region)
2768
{
2769
sev_unpin_memory(kvm, region->pages, region->npages);
2770
list_del(&region->list);
2771
kfree(region);
2772
}
2773
2774
int sev_mem_enc_unregister_region(struct kvm *kvm,
2775
struct kvm_enc_region *range)
2776
{
2777
struct enc_region *region;
2778
int ret;
2779
2780
/* If kvm is mirroring encryption context it isn't responsible for it */
2781
if (is_mirroring_enc_context(kvm))
2782
return -EINVAL;
2783
2784
mutex_lock(&kvm->lock);
2785
2786
if (!sev_guest(kvm)) {
2787
ret = -ENOTTY;
2788
goto failed;
2789
}
2790
2791
region = find_enc_region(kvm, range);
2792
if (!region) {
2793
ret = -EINVAL;
2794
goto failed;
2795
}
2796
2797
sev_writeback_caches(kvm);
2798
2799
__unregister_enc_region_locked(kvm, region);
2800
2801
mutex_unlock(&kvm->lock);
2802
return 0;
2803
2804
failed:
2805
mutex_unlock(&kvm->lock);
2806
return ret;
2807
}
2808
2809
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
2810
{
2811
CLASS(fd, f)(source_fd);
2812
struct kvm *source_kvm;
2813
struct kvm_sev_info *source_sev, *mirror_sev;
2814
int ret;
2815
2816
if (fd_empty(f))
2817
return -EBADF;
2818
2819
if (!file_is_kvm(fd_file(f)))
2820
return -EBADF;
2821
2822
source_kvm = fd_file(f)->private_data;
2823
ret = sev_lock_two_vms(kvm, source_kvm);
2824
if (ret)
2825
return ret;
2826
2827
/*
2828
* Mirrors of mirrors should work, but let's not get silly. Also
2829
* disallow out-of-band SEV/SEV-ES init if the target is already an
2830
* SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
2831
* created after SEV/SEV-ES initialization, e.g. to init intercepts.
2832
*/
2833
if (sev_guest(kvm) || !sev_guest(source_kvm) ||
2834
is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
2835
ret = -EINVAL;
2836
goto e_unlock;
2837
}
2838
2839
mirror_sev = to_kvm_sev_info(kvm);
2840
if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
2841
ret = -ENOMEM;
2842
goto e_unlock;
2843
}
2844
2845
/*
2846
* The mirror kvm holds an enc_context_owner ref so its asid can't
2847
* disappear until we're done with it
2848
*/
2849
source_sev = to_kvm_sev_info(source_kvm);
2850
kvm_get_kvm(source_kvm);
2851
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
2852
2853
/* Set enc_context_owner and copy its encryption context over */
2854
mirror_sev->enc_context_owner = source_kvm;
2855
mirror_sev->active = true;
2856
mirror_sev->asid = source_sev->asid;
2857
mirror_sev->fd = source_sev->fd;
2858
mirror_sev->es_active = source_sev->es_active;
2859
mirror_sev->need_init = false;
2860
mirror_sev->handle = source_sev->handle;
2861
INIT_LIST_HEAD(&mirror_sev->regions_list);
2862
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
2863
ret = 0;
2864
2865
/*
2866
* Do not copy ap_jump_table, since the mirror does not share the same
2867
* KVM context as the original and the two may have different
2868
* memory views.
2869
*/
2870
2871
e_unlock:
2872
sev_unlock_two_vms(kvm, source_kvm);
2873
return ret;
2874
}
2875
2876
static int snp_decommission_context(struct kvm *kvm)
2877
{
2878
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2879
struct sev_data_snp_addr data = {};
2880
int ret;
2881
2882
/* If context is not created then do nothing */
2883
if (!sev->snp_context)
2884
return 0;
2885
2886
/* Do the decommission, which will unbind the ASID from the SNP context */
2887
data.address = __sme_pa(sev->snp_context);
2888
down_write(&sev_deactivate_lock);
2889
ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
2890
up_write(&sev_deactivate_lock);
2891
2892
if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
2893
return ret;
2894
2895
snp_free_firmware_page(sev->snp_context);
2896
sev->snp_context = NULL;
2897
2898
return 0;
2899
}
2900
2901
void sev_vm_destroy(struct kvm *kvm)
2902
{
2903
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2904
struct list_head *head = &sev->regions_list;
2905
struct list_head *pos, *q;
2906
2907
if (!sev_guest(kvm))
2908
return;
2909
2910
WARN_ON(!list_empty(&sev->mirror_vms));
2911
2912
free_cpumask_var(sev->have_run_cpus);
2913
2914
/*
2915
* If this is a mirror VM, remove it from the owner's list of mirrors
2916
* and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
2917
* Note, mirror VMs don't support registering encrypted regions.
2918
*/
2919
if (is_mirroring_enc_context(kvm)) {
2920
struct kvm *owner_kvm = sev->enc_context_owner;
2921
2922
mutex_lock(&owner_kvm->lock);
2923
list_del(&sev->mirror_entry);
2924
mutex_unlock(&owner_kvm->lock);
2925
kvm_put_kvm(owner_kvm);
2926
return;
2927
}
2928
2929
2930
/*
2931
* If userspace was terminated before unregistering the memory regions,
2932
* then unpin all the registered memory.
2933
*/
2934
if (!list_empty(head)) {
2935
list_for_each_safe(pos, q, head) {
2936
__unregister_enc_region_locked(kvm,
2937
list_entry(pos, struct enc_region, list));
2938
cond_resched();
2939
}
2940
}
2941
2942
if (sev_snp_guest(kvm)) {
2943
snp_guest_req_cleanup(kvm);
2944
2945
/*
2946
* Decommission handles unbinding of the ASID. If it fails for
2947
* some unexpected reason, just leak the ASID.
2948
*/
2949
if (snp_decommission_context(kvm))
2950
return;
2951
} else {
2952
sev_unbind_asid(kvm, sev->handle);
2953
}
2954
2955
sev_asid_free(sev);
2956
}
2957
2958
void __init sev_set_cpu_caps(void)
2959
{
2960
if (sev_enabled) {
2961
kvm_cpu_cap_set(X86_FEATURE_SEV);
2962
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
2963
}
2964
if (sev_es_enabled) {
2965
kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
2966
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
2967
}
2968
if (sev_snp_enabled) {
2969
kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
2970
kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
2971
}
2972
}
2973
2974
static bool is_sev_snp_initialized(void)
2975
{
2976
struct sev_user_data_snp_status *status;
2977
struct sev_data_snp_addr buf;
2978
bool initialized = false;
2979
int ret, error = 0;
2980
2981
status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
2982
if (!status)
2983
return false;
2984
2985
buf.address = __psp_pa(status);
2986
ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
2987
if (ret) {
2988
pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
2989
ret, error, error);
2990
goto out;
2991
}
2992
2993
initialized = !!status->state;
2994
2995
out:
2996
snp_free_firmware_page(status);
2997
2998
return initialized;
2999
}
3000
3001
void __init sev_hardware_setup(void)
3002
{
3003
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
3004
struct sev_platform_init_args init_args = {0};
3005
bool sev_snp_supported = false;
3006
bool sev_es_supported = false;
3007
bool sev_supported = false;
3008
3009
if (!sev_enabled || !npt_enabled || !nrips)
3010
goto out;
3011
3012
/*
3013
* SEV must obviously be supported in hardware. Sanity check that the
3014
* CPU supports decode assists, which is mandatory for SEV guests to
3015
* support instruction emulation. Ditto for flushing by ASID, as SEV
3016
* guests are bound to a single ASID, i.e. KVM can't rotate to a new
3017
* ASID to effect a TLB flush.
3018
*/
3019
if (!boot_cpu_has(X86_FEATURE_SEV) ||
3020
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
3021
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
3022
goto out;
3023
3024
/*
3025
* The kernel's initcall infrastructure lacks the ability to express
3026
* dependencies between initcalls, whereas the modules infrastructure
3027
* automatically handles dependencies via symbol loading. Ensure the
3028
* PSP SEV driver is initialized before proceeding if KVM is built-in,
3029
* as the dependency isn't handled by the initcall infrastructure.
3030
*/
3031
if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init())
3032
goto out;
3033
3034
/* Retrieve SEV CPUID information */
3035
cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
3036
3037
/* Set encryption bit location for SEV-ES guests */
3038
sev_enc_bit = ebx & 0x3f;
3039
3040
/* Maximum number of encrypted guests supported simultaneously */
3041
max_sev_asid = ecx;
3042
if (!max_sev_asid)
3043
goto out;
3044
3045
/* Minimum ASID value that should be used for SEV guest */
3046
min_sev_asid = edx;
3047
sev_me_mask = 1UL << (ebx & 0x3f);
3048
3049
/*
3050
* Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
3051
* even though it's never used, so that the bitmap is indexed by the
3052
* actual ASID.
3053
*/
3054
nr_asids = max_sev_asid + 1;
3055
sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
3056
if (!sev_asid_bitmap)
3057
goto out;
3058
3059
sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
3060
if (!sev_reclaim_asid_bitmap) {
3061
bitmap_free(sev_asid_bitmap);
3062
sev_asid_bitmap = NULL;
3063
goto out;
3064
}
3065
3066
if (min_sev_asid <= max_sev_asid) {
3067
sev_asid_count = max_sev_asid - min_sev_asid + 1;
3068
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
3069
}
3070
sev_supported = true;
3071
3072
/* SEV-ES support requested? */
3073
if (!sev_es_enabled)
3074
goto out;
3075
3076
/*
3077
* SEV-ES requires MMIO caching as KVM doesn't have access to the guest
3078
* instruction stream, i.e. can't emulate in response to a #NPF and
3079
* instead relies on #NPF(RSVD) being reflected into the guest as #VC
3080
* (the guest can then do a #VMGEXIT to request MMIO emulation).
3081
*/
3082
if (!enable_mmio_caching)
3083
goto out;
3084
3085
/* Does the CPU support SEV-ES? */
3086
if (!boot_cpu_has(X86_FEATURE_SEV_ES))
3087
goto out;
3088
3089
if (!lbrv) {
3090
WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
3091
"LBRV must be present for SEV-ES support");
3092
goto out;
3093
}
3094
3095
/* Has the system been allocated ASIDs for SEV-ES? */
3096
if (min_sev_asid == 1)
3097
goto out;
3098
3099
min_sev_es_asid = min_snp_asid = 1;
3100
max_sev_es_asid = max_snp_asid = min_sev_asid - 1;
3101
3102
sev_es_asid_count = min_sev_asid - 1;
3103
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
3104
sev_es_supported = true;
3105
sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
3106
3107
out:
3108
if (sev_enabled) {
3109
init_args.probe = true;
3110
3111
if (sev_is_snp_ciphertext_hiding_supported())
3112
init_args.max_snp_asid = min(nr_ciphertext_hiding_asids,
3113
min_sev_asid - 1);
3114
3115
if (sev_platform_init(&init_args))
3116
sev_supported = sev_es_supported = sev_snp_supported = false;
3117
else if (sev_snp_supported)
3118
sev_snp_supported = is_sev_snp_initialized();
3119
3120
if (sev_snp_supported) {
3121
snp_supported_policy_bits = sev_get_snp_policy_bits() &
3122
KVM_SNP_POLICY_MASK_VALID;
3123
nr_ciphertext_hiding_asids = init_args.max_snp_asid;
3124
}
3125
3126
/*
3127
* If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP
3128
* ASID range is partitioned into separate SEV-ES and SEV-SNP
3129
* ASID ranges, with the SEV-SNP range being [1..max_snp_asid]
3130
* and the SEV-ES range being (max_snp_asid..max_sev_es_asid].
3131
* Note, SEV-ES may effectively be disabled if all ASIDs from
3132
* the joint range are assigned to SEV-SNP.
3133
*/
3134
if (nr_ciphertext_hiding_asids) {
3135
max_snp_asid = nr_ciphertext_hiding_asids;
3136
min_sev_es_asid = max_snp_asid + 1;
3137
pr_info("SEV-SNP ciphertext hiding enabled\n");
3138
}
3139
}
3140
3141
if (boot_cpu_has(X86_FEATURE_SEV))
3142
pr_info("SEV %s (ASIDs %u - %u)\n",
3143
sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
3144
"unusable" :
3145
"disabled",
3146
min_sev_asid, max_sev_asid);
3147
if (boot_cpu_has(X86_FEATURE_SEV_ES))
3148
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
3149
sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" :
3150
"unusable" :
3151
"disabled",
3152
min_sev_es_asid, max_sev_es_asid);
3153
if (boot_cpu_has(X86_FEATURE_SEV_SNP))
3154
pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
3155
str_enabled_disabled(sev_snp_supported),
3156
min_snp_asid, max_snp_asid);
3157
3158
sev_enabled = sev_supported;
3159
sev_es_enabled = sev_es_supported;
3160
sev_snp_enabled = sev_snp_supported;
3161
3162
sev_supported_vmsa_features = 0;
3163
3164
if (sev_es_enabled && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) &&
3165
cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
3166
sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
3167
3168
if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC))
3169
sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC;
3170
}
3171
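/*
 * Worked example of the ASID carve-up computed above, using hypothetical
 * CPUID values: with max_sev_asid = 509 and min_sev_asid = 100, ASIDs
 * [100..509] are plain SEV ASIDs and [1..99] form the joint
 * SEV-ES/SEV-SNP range. If ciphertext hiding is enabled with
 * nr_ciphertext_hiding_asids = 16, that joint range is further split
 * into SEV-SNP ASIDs [1..16] and SEV-ES ASIDs [17..99].
 */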
3172
void sev_hardware_unsetup(void)
3173
{
3174
if (!sev_enabled)
3175
return;
3176
3177
/* No need to take sev_bitmap_lock, all VMs have been destroyed. */
3178
sev_flush_asids(1, max_sev_asid);
3179
3180
bitmap_free(sev_asid_bitmap);
3181
bitmap_free(sev_reclaim_asid_bitmap);
3182
3183
misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
3184
misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
3185
3186
sev_platform_shutdown();
3187
}
3188
3189
int sev_cpu_init(struct svm_cpu_data *sd)
3190
{
3191
if (!sev_enabled)
3192
return 0;
3193
3194
sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
3195
if (!sd->sev_vmcbs)
3196
return -ENOMEM;
3197
3198
return 0;
3199
}
3200
3201
/*
3202
* Pages used by hardware to hold guest encrypted state must be flushed before
3203
* returning them to the system.
3204
*/
3205
static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
3206
{
3207
unsigned int asid = sev_get_asid(vcpu->kvm);
3208
3209
/*
3210
* Note! The address must be a kernel address, as regular page walk
3211
* checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
3212
* address is non-deterministic and unsafe. This function deliberately
3213
* takes a pointer to deter passing in a user address.
3214
*/
3215
unsigned long addr = (unsigned long)va;
3216
3217
/*
3218
* If the CPU enforces cache coherency for encrypted mappings of the
3219
* same physical page, use CLFLUSHOPT instead. NOTE: cache
3220
* flush is still needed in order to work properly with DMA devices.
3221
*/
3222
if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
3223
clflush_cache_range(va, PAGE_SIZE);
3224
return;
3225
}
3226
3227
/*
3228
* VM Page Flush takes a host virtual address and a guest ASID. Fall
3229
* back to full writeback of caches if this faults so as not to make
3230
* any problems worse by leaving stale encrypted data in the cache.
3231
*/
3232
if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
3233
goto do_sev_writeback_caches;
3234
3235
return;
3236
3237
do_sev_writeback_caches:
3238
sev_writeback_caches(vcpu->kvm);
3239
}
3240
3241
void sev_guest_memory_reclaimed(struct kvm *kvm)
3242
{
3243
/*
3244
* With SNP+gmem, private/encrypted memory is unreachable via the
3245
* hva-based mmu notifiers, i.e. these events are explicitly scoped to
3246
* shared pages, where there's no need to flush caches.
3247
*/
3248
if (!sev_guest(kvm) || sev_snp_guest(kvm))
3249
return;
3250
3251
sev_writeback_caches(kvm);
3252
}
3253
3254
void sev_free_vcpu(struct kvm_vcpu *vcpu)
3255
{
3256
struct vcpu_svm *svm;
3257
3258
if (!sev_es_guest(vcpu->kvm))
3259
return;
3260
3261
svm = to_svm(vcpu);
3262
3263
/*
3264
* If it's an SNP guest, then the VMSA was marked in the RMP table as
3265
* a guest-owned page. Transition the page to hypervisor state before
3266
* releasing it back to the system.
3267
*/
3268
if (sev_snp_guest(vcpu->kvm)) {
3269
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
3270
3271
if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
3272
goto skip_vmsa_free;
3273
}
3274
3275
if (vcpu->arch.guest_state_protected)
3276
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
3277
3278
__free_page(virt_to_page(svm->sev_es.vmsa));
3279
3280
skip_vmsa_free:
3281
if (svm->sev_es.ghcb_sa_free)
3282
kvfree(svm->sev_es.ghcb_sa);
3283
}
3284
3285
static void dump_ghcb(struct vcpu_svm *svm)
3286
{
3287
struct vmcb_control_area *control = &svm->vmcb->control;
3288
unsigned int nbits;
3289
3290
/* Re-use the dump_invalid_vmcb module parameter */
3291
if (!dump_invalid_vmcb) {
3292
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3293
return;
3294
}
3295
3296
nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
3297
3298
/*
3299
* Print KVM's snapshot of the GHCB values that were (unsuccessfully)
3300
* used to handle the exit. If the guest has since modified the GHCB
3301
* itself, dumping the raw GHCB won't help debug why KVM was unable to
3302
* handle the VMGEXIT that KVM observed.
3303
*/
3304
pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
3305
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
3306
control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm));
3307
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
3308
control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
3309
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
3310
control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
3311
pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
3312
svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
3313
pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
3314
}
3315
3316
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
3317
{
3318
struct kvm_vcpu *vcpu = &svm->vcpu;
3319
struct ghcb *ghcb = svm->sev_es.ghcb;
3320
3321
/*
3322
* The GHCB protocol so far allows for the following data
3323
* to be returned:
3324
* GPRs RAX, RBX, RCX, RDX
3325
*
3326
* Copy their values, even if they may not have been written during the
3327
* VM-Exit. It's the guest's responsibility to not consume random data.
3328
*/
3329
ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
3330
ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
3331
ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
3332
ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
3333
}
3334
3335
static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
3336
{
3337
struct vmcb_control_area *control = &svm->vmcb->control;
3338
struct kvm_vcpu *vcpu = &svm->vcpu;
3339
struct ghcb *ghcb = svm->sev_es.ghcb;
3340
3341
/*
3342
* The GHCB protocol so far allows for the following data
3343
* to be supplied:
3344
* GPRs RAX, RBX, RCX, RDX
3345
* XCR0
3346
* CPL
3347
*
3348
* VMMCALL allows the guest to provide extra registers. KVM also
3349
* expects RSI for hypercalls, so include that, too.
3350
*
3351
* Copy their values to the appropriate location if supplied.
3352
*/
3353
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
3354
3355
BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
3356
memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
3357
3358
vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm);
3359
vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm);
3360
vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm);
3361
vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm);
3362
vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm);
3363
3364
svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm);
3365
3366
if (kvm_ghcb_xcr0_is_valid(svm))
3367
__kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm));
3368
3369
if (kvm_ghcb_xss_is_valid(svm))
3370
__kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm));
3371
3372
/* Copy the GHCB exit information into the VMCB fields */
3373
control->exit_code = kvm_ghcb_get_sw_exit_code(svm);
3374
control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm);
3375
control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm);
3376
svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm);
3377
3378
/* Clear the valid entries fields */
3379
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
3380
}
3381
3382
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
3383
{
3384
struct vmcb_control_area *control = &svm->vmcb->control;
3385
struct kvm_vcpu *vcpu = &svm->vcpu;
3386
u64 reason;
3387
3388
/* Only GHCB Usage code 0 is supported */
3389
if (svm->sev_es.ghcb->ghcb_usage) {
3390
reason = GHCB_ERR_INVALID_USAGE;
3391
goto vmgexit_err;
3392
}
3393
3394
reason = GHCB_ERR_MISSING_INPUT;
3395
3396
if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
3397
!kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
3398
!kvm_ghcb_sw_exit_info_2_is_valid(svm))
3399
goto vmgexit_err;
3400
3401
switch (control->exit_code) {
3402
case SVM_EXIT_READ_DR7:
3403
break;
3404
case SVM_EXIT_WRITE_DR7:
3405
if (!kvm_ghcb_rax_is_valid(svm))
3406
goto vmgexit_err;
3407
break;
3408
case SVM_EXIT_RDTSC:
3409
break;
3410
case SVM_EXIT_RDPMC:
3411
if (!kvm_ghcb_rcx_is_valid(svm))
3412
goto vmgexit_err;
3413
break;
3414
case SVM_EXIT_CPUID:
3415
if (!kvm_ghcb_rax_is_valid(svm) ||
3416
!kvm_ghcb_rcx_is_valid(svm))
3417
goto vmgexit_err;
3418
if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
3419
if (!kvm_ghcb_xcr0_is_valid(svm))
3420
goto vmgexit_err;
3421
break;
3422
case SVM_EXIT_INVD:
3423
break;
3424
case SVM_EXIT_IOIO:
3425
if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
3426
if (!kvm_ghcb_sw_scratch_is_valid(svm))
3427
goto vmgexit_err;
3428
} else {
3429
if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
3430
if (!kvm_ghcb_rax_is_valid(svm))
3431
goto vmgexit_err;
3432
}
3433
break;
3434
case SVM_EXIT_MSR:
3435
if (!kvm_ghcb_rcx_is_valid(svm))
3436
goto vmgexit_err;
3437
if (control->exit_info_1) {
3438
if (!kvm_ghcb_rax_is_valid(svm) ||
3439
!kvm_ghcb_rdx_is_valid(svm))
3440
goto vmgexit_err;
3441
}
3442
break;
3443
case SVM_EXIT_VMMCALL:
3444
if (!kvm_ghcb_rax_is_valid(svm) ||
3445
!kvm_ghcb_cpl_is_valid(svm))
3446
goto vmgexit_err;
3447
break;
3448
case SVM_EXIT_RDTSCP:
3449
break;
3450
case SVM_EXIT_WBINVD:
3451
break;
3452
case SVM_EXIT_MONITOR:
3453
if (!kvm_ghcb_rax_is_valid(svm) ||
3454
!kvm_ghcb_rcx_is_valid(svm) ||
3455
!kvm_ghcb_rdx_is_valid(svm))
3456
goto vmgexit_err;
3457
break;
3458
case SVM_EXIT_MWAIT:
3459
if (!kvm_ghcb_rax_is_valid(svm) ||
3460
!kvm_ghcb_rcx_is_valid(svm))
3461
goto vmgexit_err;
3462
break;
3463
case SVM_VMGEXIT_MMIO_READ:
3464
case SVM_VMGEXIT_MMIO_WRITE:
3465
if (!kvm_ghcb_sw_scratch_is_valid(svm))
3466
goto vmgexit_err;
3467
break;
3468
case SVM_VMGEXIT_AP_CREATION:
3469
if (!sev_snp_guest(vcpu->kvm))
3470
goto vmgexit_err;
3471
if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
3472
if (!kvm_ghcb_rax_is_valid(svm))
3473
goto vmgexit_err;
3474
break;
3475
case SVM_VMGEXIT_NMI_COMPLETE:
3476
case SVM_VMGEXIT_AP_HLT_LOOP:
3477
case SVM_VMGEXIT_AP_JUMP_TABLE:
3478
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
3479
case SVM_VMGEXIT_HV_FEATURES:
3480
case SVM_VMGEXIT_TERM_REQUEST:
3481
break;
3482
case SVM_VMGEXIT_PSC:
3483
if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
3484
goto vmgexit_err;
3485
break;
3486
case SVM_VMGEXIT_GUEST_REQUEST:
3487
case SVM_VMGEXIT_EXT_GUEST_REQUEST:
3488
if (!sev_snp_guest(vcpu->kvm) ||
3489
!PAGE_ALIGNED(control->exit_info_1) ||
3490
!PAGE_ALIGNED(control->exit_info_2) ||
3491
control->exit_info_1 == control->exit_info_2)
3492
goto vmgexit_err;
3493
break;
3494
default:
3495
reason = GHCB_ERR_INVALID_EVENT;
3496
goto vmgexit_err;
3497
}
3498
3499
return 0;
3500
3501
vmgexit_err:
3502
/*
3503
* Print the exit code even though it may not be marked valid as it
3504
* could help with debugging.
3505
*/
3506
if (reason == GHCB_ERR_INVALID_USAGE) {
3507
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
3508
svm->sev_es.ghcb->ghcb_usage);
3509
} else if (reason == GHCB_ERR_INVALID_EVENT) {
3510
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
3511
control->exit_code);
3512
} else {
3513
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
3514
control->exit_code);
3515
dump_ghcb(svm);
3516
}
3517
3518
svm_vmgexit_bad_input(svm, reason);
3519
3520
/* Resume the guest to "return" the error code. */
3521
return 1;
3522
}
3523
3524
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
3525
{
3526
/* Clear any indication that the vCPU is in a type of AP Reset Hold */
3527
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
3528
3529
if (!svm->sev_es.ghcb)
3530
return;
3531
3532
if (svm->sev_es.ghcb_sa_free) {
3533
/*
3534
* The scratch area lives outside the GHCB, so there is a
3535
* buffer that, depending on the operation performed, may
3536
* need to be synced, then freed.
3537
*/
3538
if (svm->sev_es.ghcb_sa_sync) {
3539
kvm_write_guest(svm->vcpu.kvm,
3540
svm->sev_es.sw_scratch,
3541
svm->sev_es.ghcb_sa,
3542
svm->sev_es.ghcb_sa_len);
3543
svm->sev_es.ghcb_sa_sync = false;
3544
}
3545
3546
kvfree(svm->sev_es.ghcb_sa);
3547
svm->sev_es.ghcb_sa = NULL;
3548
svm->sev_es.ghcb_sa_free = false;
3549
}
3550
3551
trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
3552
3553
sev_es_sync_to_ghcb(svm);
3554
3555
kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
3556
svm->sev_es.ghcb = NULL;
3557
}
3558
3559
int pre_sev_run(struct vcpu_svm *svm, int cpu)
3560
{
3561
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
3562
struct kvm *kvm = svm->vcpu.kvm;
3563
unsigned int asid = sev_get_asid(kvm);
3564
3565
/*
3566
* Reject KVM_RUN if userspace attempts to run the vCPU with an invalid
3567
* VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP
3568
* AP Destroy event.
3569
*/
3570
if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
3571
return -EINVAL;
3572
3573
/*
3574
* To optimize cache flushes when memory is reclaimed from an SEV VM,
3575
* track physical CPUs that enter the guest for SEV VMs and thus can
3576
* have encrypted, dirty data in the cache, and flush caches only for
3577
* CPUs that have entered the guest.
3578
*/
3579
if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
3580
cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
3581
3582
/* Assign the asid allocated with this SEV guest */
3583
svm->asid = asid;
3584
3585
/*
3586
* Flush guest TLB:
3587
*
3588
* 1) when a different VMCB for the same ASID is to be run on the same host CPU.
3589
* 2) or this VMCB was executed on a different host CPU in previous VMRUNs.
3590
*/
3591
if (sd->sev_vmcbs[asid] == svm->vmcb &&
3592
svm->vcpu.arch.last_vmentry_cpu == cpu)
3593
return 0;
3594
3595
sd->sev_vmcbs[asid] = svm->vmcb;
3596
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3597
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3598
return 0;
3599
}
3600
3601
#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
3602
static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
3603
{
3604
struct vmcb_control_area *control = &svm->vmcb->control;
3605
u64 ghcb_scratch_beg, ghcb_scratch_end;
3606
u64 scratch_gpa_beg, scratch_gpa_end;
3607
void *scratch_va;
3608
3609
scratch_gpa_beg = svm->sev_es.sw_scratch;
3610
if (!scratch_gpa_beg) {
3611
pr_err("vmgexit: scratch gpa not provided\n");
3612
goto e_scratch;
3613
}
3614
3615
scratch_gpa_end = scratch_gpa_beg + len;
3616
if (scratch_gpa_end < scratch_gpa_beg) {
3617
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
3618
len, scratch_gpa_beg);
3619
goto e_scratch;
3620
}
3621
3622
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
3623
/* Scratch area begins within GHCB */
3624
ghcb_scratch_beg = control->ghcb_gpa +
3625
offsetof(struct ghcb, shared_buffer);
3626
ghcb_scratch_end = control->ghcb_gpa +
3627
offsetof(struct ghcb, reserved_0xff0);
3628
3629
/*
3630
* If the scratch area begins within the GHCB, it must be
3631
* completely contained in the GHCB shared buffer area.
3632
*/
3633
if (scratch_gpa_beg < ghcb_scratch_beg ||
3634
scratch_gpa_end > ghcb_scratch_end) {
3635
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
3636
scratch_gpa_beg, scratch_gpa_end);
3637
goto e_scratch;
3638
}
3639
3640
scratch_va = (void *)svm->sev_es.ghcb;
3641
scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
3642
} else {
3643
/*
3644
* The guest memory must be read into a kernel buffer, so
3645
* limit the size.
3646
*/
3647
if (len > GHCB_SCRATCH_AREA_LIMIT) {
3648
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
3649
len, GHCB_SCRATCH_AREA_LIMIT);
3650
goto e_scratch;
3651
}
3652
scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
3653
if (!scratch_va)
3654
return -ENOMEM;
3655
3656
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
3657
/* Unable to copy scratch area from guest */
3658
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
3659
3660
kvfree(scratch_va);
3661
return -EFAULT;
3662
}
3663
3664
/*
3665
* The scratch area is outside the GHCB. The operation will
3666
* dictate whether the buffer needs to be synced before running
3667
* the vCPU next time (i.e. a read was requested so the data
3668
* must be written back to the guest memory).
3669
*/
3670
svm->sev_es.ghcb_sa_sync = sync;
3671
svm->sev_es.ghcb_sa_free = true;
3672
}
3673
3674
svm->sev_es.ghcb_sa = scratch_va;
3675
svm->sev_es.ghcb_sa_len = len;
3676
3677
return 0;
3678
3679
e_scratch:
3680
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA);
3681
3682
return 1;
3683
}
3684
3685
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
3686
unsigned int pos)
3687
{
3688
svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
3689
svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
3690
}
3691
3692
static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
3693
{
3694
return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
3695
}
3696
3697
static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
3698
{
3699
svm->vmcb->control.ghcb_gpa = value;
3700
}
3701
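/*
 * Worked example for the bit helpers above, with a hypothetical field
 * layout: for a 12-bit field at bit position 32,
 * get_ghcb_msr_bits(svm, GENMASK_ULL(11, 0), 32) extracts bits [43:32]
 * of the GHCB MSR value, while set_ghcb_msr_bits() with the same mask
 * and position overwrites only those bits and leaves the rest of
 * control.ghcb_gpa untouched.
 */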
3702
static int snp_rmptable_psmash(kvm_pfn_t pfn)
3703
{
3704
int ret;
3705
3706
pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
3707
3708
/*
3709
* PSMASH_FAIL_INUSE indicates another processor is modifying the
3710
* entry, so retry until that's no longer the case.
3711
*/
3712
do {
3713
ret = psmash(pfn);
3714
} while (ret == PSMASH_FAIL_INUSE);
3715
3716
return ret;
3717
}
3718
3719
static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
3720
{
3721
struct vcpu_svm *svm = to_svm(vcpu);
3722
3723
if (vcpu->run->hypercall.ret)
3724
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3725
else
3726
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
3727
3728
return 1; /* resume guest */
3729
}
3730
3731
static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
3732
{
3733
u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
3734
u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
3735
struct kvm_vcpu *vcpu = &svm->vcpu;
3736
3737
if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
3738
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3739
return 1; /* resume guest */
3740
}
3741
3742
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
3743
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3744
return 1; /* resume guest */
3745
}
3746
3747
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
3748
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
3749
/*
3750
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
3751
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
3752
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
3753
* vcpu->run->hypercall.ret, ensure that it is zero so as to not break QEMU.
3754
*/
3755
vcpu->run->hypercall.ret = 0;
3756
vcpu->run->hypercall.args[0] = gpa;
3757
vcpu->run->hypercall.args[1] = 1;
3758
vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
3759
? KVM_MAP_GPA_RANGE_ENCRYPTED
3760
: KVM_MAP_GPA_RANGE_DECRYPTED;
3761
vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
3762
3763
vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
3764
3765
return 0; /* forward request to userspace */
3766
}
3767
3768
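/*
 * Layout of the guest's Page State Change buffer as defined by the GHCB
 * specification: a header followed by a variable number of entries.
 */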
struct psc_buffer {
3769
struct psc_hdr hdr;
3770
struct psc_entry entries[];
3771
} __packed;
3772
3773
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
3774
3775
static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
3776
{
3777
svm->sev_es.psc_inflight = 0;
3778
svm->sev_es.psc_idx = 0;
3779
svm->sev_es.psc_2m = false;
3780
3781
/*
3782
* PSC requests always get a "no action" response in SW_EXITINFO1, with
3783
* a PSC-specific return code in SW_EXITINFO2 that provides the "real"
3784
* return code. E.g. if the PSC request was interrupted, the need to
3785
* retry is communicated via SW_EXITINFO2, not SW_EXITINFO1.
3786
*/
3787
svm_vmgexit_no_action(svm, psc_ret);
3788
}
3789
3790
static void __snp_complete_one_psc(struct vcpu_svm *svm)
3791
{
3792
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
3793
struct psc_entry *entries = psc->entries;
3794
struct psc_hdr *hdr = &psc->hdr;
3795
__u16 idx;
3796
3797
/*
3798
* Everything in-flight has been processed successfully. Update the
3799
* corresponding entries in the guest's PSC buffer and zero out the
3800
* count of in-flight PSC entries.
3801
*/
3802
for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
3803
svm->sev_es.psc_inflight--, idx++) {
3804
struct psc_entry *entry = &entries[idx];
3805
3806
entry->cur_page = entry->pagesize ? 512 : 1;
3807
}
3808
3809
hdr->cur_entry = idx;
3810
}
3811
3812
static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
3813
{
3814
struct vcpu_svm *svm = to_svm(vcpu);
3815
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
3816
3817
if (vcpu->run->hypercall.ret) {
3818
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3819
return 1; /* resume guest */
3820
}
3821
3822
__snp_complete_one_psc(svm);
3823
3824
/* Handle the next range (if any). */
3825
return snp_begin_psc(svm, psc);
3826
}
3827
3828
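/*
 * Process the next range in a GHCB-based Page State Change buffer: validate
 * the header and entries, coalesce adjacent entries with the same operation
 * and page size, and forward the combined range to userspace as a
 * KVM_HC_MAP_GPA_RANGE hypercall exit.
 */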
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
3829
{
3830
struct psc_entry *entries = psc->entries;
3831
struct kvm_vcpu *vcpu = &svm->vcpu;
3832
struct psc_hdr *hdr = &psc->hdr;
3833
struct psc_entry entry_start;
3834
u16 idx, idx_start, idx_end;
3835
int npages;
3836
bool huge;
3837
u64 gfn;
3838
3839
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
3840
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3841
return 1;
3842
}
3843
3844
next_range:
3845
/* There should be no other PSCs in-flight at this point. */
3846
if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
3847
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3848
return 1;
3849
}
3850
3851
/*
3852
* The PSC descriptor buffer can be modified by a misbehaved guest after
3853
* validation, so take care to only use validated copies of values used
3854
* for things like array indexing.
3855
*/
3856
idx_start = hdr->cur_entry;
3857
idx_end = hdr->end_entry;
3858
3859
if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
3860
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
3861
return 1;
3862
}
3863
3864
/* Find the start of the next range which needs processing. */
3865
for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
3866
entry_start = entries[idx];
3867
3868
gfn = entry_start.gfn;
3869
huge = entry_start.pagesize;
3870
npages = huge ? 512 : 1;
3871
3872
if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
3873
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
3874
return 1;
3875
}
3876
3877
if (entry_start.cur_page) {
3878
/*
3879
* If this is a partially-completed 2M range, force 4K handling
3880
* for the remaining pages since they're effectively split at
3881
* this point. Subsequent code should ensure this doesn't get
3882
* combined with adjacent PSC entries where 2M handling is still
3883
* possible.
3884
*/
3885
npages -= entry_start.cur_page;
3886
gfn += entry_start.cur_page;
3887
huge = false;
3888
}
3889
3890
if (npages)
3891
break;
3892
}
3893
3894
if (idx > idx_end) {
3895
/* Nothing more to process. */
3896
snp_complete_psc(svm, 0);
3897
return 1;
3898
}
3899
3900
svm->sev_es.psc_2m = huge;
3901
svm->sev_es.psc_idx = idx;
3902
svm->sev_es.psc_inflight = 1;
3903
3904
/*
3905
* Find all subsequent PSC entries that contain adjacent GPA
3906
* ranges/operations and can be combined into a single
3907
* KVM_HC_MAP_GPA_RANGE exit.
3908
*/
3909
while (++idx <= idx_end) {
3910
struct psc_entry entry = entries[idx];
3911
3912
if (entry.operation != entry_start.operation ||
3913
entry.gfn != entry_start.gfn + npages ||
3914
entry.cur_page || !!entry.pagesize != huge)
3915
break;
3916
3917
svm->sev_es.psc_inflight++;
3918
npages += huge ? 512 : 1;
3919
}
3920
3921
switch (entry_start.operation) {
3922
case VMGEXIT_PSC_OP_PRIVATE:
3923
case VMGEXIT_PSC_OP_SHARED:
3924
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
3925
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
3926
/*
3927
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
3928
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
3929
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM now overwrites
3930
* vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
3931
*/
3932
vcpu->run->hypercall.ret = 0;
3933
vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
3934
vcpu->run->hypercall.args[1] = npages;
3935
vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
3936
? KVM_MAP_GPA_RANGE_ENCRYPTED
3937
: KVM_MAP_GPA_RANGE_DECRYPTED;
3938
vcpu->run->hypercall.args[2] |= entry_start.pagesize
3939
? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
3940
: KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
3941
vcpu->arch.complete_userspace_io = snp_complete_one_psc;
3942
return 0; /* forward request to userspace */
3943
default:
3944
/*
3945
* Only shared/private PSC operations are currently supported, so if the
3946
* entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
3947
* then consider the entire range completed and avoid exiting to
3948
* userspace. In theory snp_complete_psc() can always be called directly
3949
* at this point to complete the current range and start the next one,
3950
* but that could lead to unexpected levels of recursion.
3951
*/
3952
__snp_complete_one_psc(svm);
3953
goto next_range;
3954
}
3955
3956
BUG();
3957
}
3958
3959
/*
3960
* Invoked as part of svm_vcpu_reset() processing of an init event.
3961
*/
3962
static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
3963
{
3964
struct vcpu_svm *svm = to_svm(vcpu);
3965
struct kvm_memory_slot *slot;
3966
struct page *page;
3967
kvm_pfn_t pfn;
3968
gfn_t gfn;
3969
3970
guard(mutex)(&svm->sev_es.snp_vmsa_mutex);
3971
3972
if (!svm->sev_es.snp_ap_waiting_for_reset)
3973
return;
3974
3975
svm->sev_es.snp_ap_waiting_for_reset = false;
3976
3977
/* Mark the vCPU as offline and not runnable */
3978
vcpu->arch.pv.pv_unhalted = false;
3979
kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
3980
3981
/* Clear use of the VMSA */
3982
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
3983
3984
/*
3985
* When replacing the VMSA during SEV-SNP AP creation,
3986
* mark the VMCB dirty so that full state is always reloaded.
3987
*/
3988
vmcb_mark_all_dirty(svm->vmcb);
3989
3990
if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
3991
return;
3992
3993
gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
3994
svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
3995
3996
slot = gfn_to_memslot(vcpu->kvm, gfn);
3997
if (!slot)
3998
return;
3999
4000
/*
4001
* The new VMSA will be private guest memory, so retrieve the
4002
* PFN from the gmem backend.
4003
*/
4004
if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
4005
return;
4006
4007
/*
4008
* From this point forward, the VMSA will always be a guest-mapped page
4009
* rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
4010
* theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
4011
* that involves cleanups like flushing caches, which would ideally be
4012
* handled during teardown rather than guest boot. Deferring that also
4013
* allows the existing logic for SEV-ES VMSAs to be re-used with
4014
* minimal SNP-specific changes.
4015
*/
4016
svm->sev_es.snp_has_guest_vmsa = true;
4017
4018
/* Use the new VMSA */
4019
svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
4020
4021
/* Mark the vCPU as runnable */
4022
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
4023
4024
/*
4025
* gmem pages aren't currently migratable, but if this ever changes
4026
* then care should be taken to ensure svm->sev_es.vmsa is pinned
4027
* through some other means.
4028
*/
4029
kvm_release_page_clean(page);
4030
}
4031
4032
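/*
 * Handle an AP Creation NAE event: exit_info_1 encodes the request type in
 * its lower 32 bits and the target APIC ID in its upper 32 bits, while
 * exit_info_2 provides the GPA of the new VMSA for the CREATE requests.
 */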
static int sev_snp_ap_creation(struct vcpu_svm *svm)
4033
{
4034
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
4035
struct kvm_vcpu *vcpu = &svm->vcpu;
4036
struct kvm_vcpu *target_vcpu;
4037
struct vcpu_svm *target_svm;
4038
unsigned int request;
4039
unsigned int apic_id;
4040
4041
request = lower_32_bits(svm->vmcb->control.exit_info_1);
4042
apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
4043
4044
/* Validate the APIC ID */
4045
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
4046
if (!target_vcpu) {
4047
vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
4048
apic_id);
4049
return -EINVAL;
4050
}
4051
4052
target_svm = to_svm(target_vcpu);
4053
4054
guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex);
4055
4056
switch (request) {
4057
case SVM_VMGEXIT_AP_CREATE_ON_INIT:
4058
case SVM_VMGEXIT_AP_CREATE:
4059
if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
4060
vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
4061
vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features);
4062
return -EINVAL;
4063
}
4064
4065
if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
4066
vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
4067
svm->vmcb->control.exit_info_2);
4068
return -EINVAL;
4069
}
4070
4071
/*
4072
* A malicious guest can RMPADJUST a large page into a VMSA, which
4073
* would hit the SNP erratum where the CPU incorrectly signals an
4074
* RMP violation #PF if a hugepage collides with the RMP entry of
4075
* the VMSA page. Reject the AP CREATE request if the VMSA address
4076
* provided by the guest is 2M-aligned.
4077
*/
4078
if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
4079
vcpu_unimpl(vcpu,
4080
"vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
4081
svm->vmcb->control.exit_info_2);
4082
return -EINVAL;
4083
}
4084
4085
target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
4086
break;
4087
case SVM_VMGEXIT_AP_DESTROY:
4088
target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
4089
break;
4090
default:
4091
vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
4092
request);
4093
return -EINVAL;
4094
}
4095
4096
target_svm->sev_es.snp_ap_waiting_for_reset = true;
4097
4098
/*
4099
* Unless Creation is deferred until INIT, signal the vCPU to update
4100
* its state.
4101
*/
4102
if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
4103
kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
4104
4105
return 0;
4106
}
4107
4108
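/*
 * Forward an SNP Guest Request to the PSP firmware: the guest's request
 * page is copied into a per-VM intermediary buffer, passed to firmware
 * along with the SNP guest context, and the firmware's response is copied
 * back into the guest-supplied response page.
 */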
static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
4109
{
4110
struct sev_data_snp_guest_request data = {0};
4111
struct kvm *kvm = svm->vcpu.kvm;
4112
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
4113
sev_ret_code fw_err = 0;
4114
int ret;
4115
4116
if (!sev_snp_guest(kvm))
4117
return -EINVAL;
4118
4119
mutex_lock(&sev->guest_req_mutex);
4120
4121
if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
4122
ret = -EIO;
4123
goto out_unlock;
4124
}
4125
4126
data.gctx_paddr = __psp_pa(sev->snp_context);
4127
data.req_paddr = __psp_pa(sev->guest_req_buf);
4128
data.res_paddr = __psp_pa(sev->guest_resp_buf);
4129
4130
/*
4131
* Firmware failures are propagated on to the guest, but any other failure
4132
* condition along the way should be reported to userspace. E.g. if
4133
* the PSP is dead and commands are timing out.
4134
*/
4135
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
4136
if (ret && !fw_err)
4137
goto out_unlock;
4138
4139
if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
4140
ret = -EIO;
4141
goto out_unlock;
4142
}
4143
4144
/* No action is requested *from KVM* if there was a firmware error. */
4145
svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));
4146
4147
ret = 1; /* resume guest */
4148
4149
out_unlock:
4150
mutex_unlock(&sev->guest_req_mutex);
4151
return ret;
4152
}
4153
4154
static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error)
4155
{
4156
ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0));
4157
4158
return 1; /* resume guest */
4159
}
4160
4161
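/*
 * Completion callback for KVM_EXIT_SNP_REQ_CERTS: a zero status from
 * userspace lets the underlying guest request proceed, otherwise the status
 * is translated into a GHCB-defined VMM error code for the guest.
 */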
static int snp_complete_req_certs(struct kvm_vcpu *vcpu)
4162
{
4163
struct vcpu_svm *svm = to_svm(vcpu);
4164
struct vmcb_control_area *control = &svm->vmcb->control;
4165
4166
switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) {
4167
case 0:
4168
return snp_handle_guest_req(svm, control->exit_info_1,
4169
control->exit_info_2);
4170
case ENOSPC:
4171
vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages;
4172
return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN);
4173
case EAGAIN:
4174
return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY);
4175
case EIO:
4176
return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC);
4177
default:
4178
break;
4179
}
4180
4181
return -EINVAL;
4182
}
4183
4184
static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
4185
{
4186
struct kvm *kvm = svm->vcpu.kvm;
4187
u8 msg_type;
4188
4189
if (!sev_snp_guest(kvm))
4190
return -EINVAL;
4191
4192
if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
4193
&msg_type, 1))
4194
return -EIO;
4195
4196
/*
4197
* As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
4198
* additional certificate data to be provided alongside the attestation
4199
* report via the guest-provided data pages indicated by RAX/RBX. If
4200
* userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace
4201
* to give userspace an opportunity to provide the certificate data
4202
* before issuing/completing the attestation request. Otherwise, return
4203
* an empty certificate table in the guest-provided data pages and
4204
* handle the attestation request immediately.
4205
*/
4206
if (msg_type == SNP_MSG_REPORT_REQ) {
4207
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
4208
struct kvm_vcpu *vcpu = &svm->vcpu;
4209
u64 data_npages;
4210
gpa_t data_gpa;
4211
4212
if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
4213
goto request_invalid;
4214
4215
data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
4216
data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
4217
4218
if (!PAGE_ALIGNED(data_gpa))
4219
goto request_invalid;
4220
4221
if (sev->snp_certs_enabled) {
4222
vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS;
4223
vcpu->run->snp_req_certs.gpa = data_gpa;
4224
vcpu->run->snp_req_certs.npages = data_npages;
4225
vcpu->run->snp_req_certs.ret = 0;
4226
vcpu->arch.complete_userspace_io = snp_complete_req_certs;
4227
return 0;
4228
}
4229
4230
/*
4231
* As per GHCB spec (see "SNP Extended Guest Request"), the
4232
* certificate table is terminated by 24-bytes of zeroes.
4233
*/
4234
if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
4235
return -EIO;
4236
}
4237
4238
return snp_handle_guest_req(svm, req_gpa, resp_gpa);
4239
4240
request_invalid:
4241
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4242
return 1; /* resume guest */
4243
}
4244
4245
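/*
 * Handle VMGEXITs issued via the GHCB MSR protocol, i.e. requests where the
 * GHCB MSR carries the request/response itself rather than pointing at a
 * shared GHCB page.
 */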
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
4246
{
4247
struct vmcb_control_area *control = &svm->vmcb->control;
4248
struct kvm_vcpu *vcpu = &svm->vcpu;
4249
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
4250
u64 ghcb_info;
4251
int ret = 1;
4252
4253
ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
4254
4255
trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
4256
control->ghcb_gpa);
4257
4258
switch (ghcb_info) {
4259
case GHCB_MSR_SEV_INFO_REQ:
4260
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
4261
GHCB_VERSION_MIN,
4262
sev_enc_bit));
4263
break;
4264
case GHCB_MSR_CPUID_REQ: {
4265
u64 cpuid_fn, cpuid_reg, cpuid_value;
4266
4267
cpuid_fn = get_ghcb_msr_bits(svm,
4268
GHCB_MSR_CPUID_FUNC_MASK,
4269
GHCB_MSR_CPUID_FUNC_POS);
4270
4271
/* Initialize the registers needed by the CPUID intercept */
4272
vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
4273
vcpu->arch.regs[VCPU_REGS_RCX] = 0;
4274
4275
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
4276
if (!ret) {
4277
/* Error, keep GHCB MSR value as-is */
4278
break;
4279
}
4280
4281
cpuid_reg = get_ghcb_msr_bits(svm,
4282
GHCB_MSR_CPUID_REG_MASK,
4283
GHCB_MSR_CPUID_REG_POS);
4284
if (cpuid_reg == 0)
4285
cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
4286
else if (cpuid_reg == 1)
4287
cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
4288
else if (cpuid_reg == 2)
4289
cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
4290
else
4291
cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
4292
4293
set_ghcb_msr_bits(svm, cpuid_value,
4294
GHCB_MSR_CPUID_VALUE_MASK,
4295
GHCB_MSR_CPUID_VALUE_POS);
4296
4297
set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
4298
GHCB_MSR_INFO_MASK,
4299
GHCB_MSR_INFO_POS);
4300
break;
4301
}
4302
case GHCB_MSR_AP_RESET_HOLD_REQ:
4303
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
4304
ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
4305
4306
/*
4307
* Preset the result to a non-SIPI return and then only set
4308
* the result to non-zero when delivering a SIPI.
4309
*/
4310
set_ghcb_msr_bits(svm, 0,
4311
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
4312
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
4313
4314
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
4315
GHCB_MSR_INFO_MASK,
4316
GHCB_MSR_INFO_POS);
4317
break;
4318
case GHCB_MSR_HV_FT_REQ:
4319
set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
4320
GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
4321
set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
4322
GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
4323
break;
4324
case GHCB_MSR_PREF_GPA_REQ:
4325
if (!sev_snp_guest(vcpu->kvm))
4326
goto out_terminate;
4327
4328
set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
4329
GHCB_MSR_GPA_VALUE_POS);
4330
set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
4331
GHCB_MSR_INFO_POS);
4332
break;
4333
case GHCB_MSR_REG_GPA_REQ: {
4334
u64 gfn;
4335
4336
if (!sev_snp_guest(vcpu->kvm))
4337
goto out_terminate;
4338
4339
gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
4340
GHCB_MSR_GPA_VALUE_POS);
4341
4342
svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
4343
4344
set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
4345
GHCB_MSR_GPA_VALUE_POS);
4346
set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
4347
GHCB_MSR_INFO_POS);
4348
break;
4349
}
4350
case GHCB_MSR_PSC_REQ:
4351
if (!sev_snp_guest(vcpu->kvm))
4352
goto out_terminate;
4353
4354
ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
4355
break;
4356
case GHCB_MSR_TERM_REQ: {
4357
u64 reason_set, reason_code;
4358
4359
reason_set = get_ghcb_msr_bits(svm,
4360
GHCB_MSR_TERM_REASON_SET_MASK,
4361
GHCB_MSR_TERM_REASON_SET_POS);
4362
reason_code = get_ghcb_msr_bits(svm,
4363
GHCB_MSR_TERM_REASON_MASK,
4364
GHCB_MSR_TERM_REASON_POS);
4365
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
4366
reason_set, reason_code);
4367
4368
goto out_terminate;
4369
}
4370
default:
4371
/* Error, keep GHCB MSR value as-is */
4372
break;
4373
}
4374
4375
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
4376
control->ghcb_gpa, ret);
4377
4378
return ret;
4379
4380
out_terminate:
4381
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
4382
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
4383
vcpu->run->system_event.ndata = 1;
4384
vcpu->run->system_event.data[0] = control->ghcb_gpa;
4385
4386
return 0;
4387
}
4388
4389
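/*
 * Top-level VMGEXIT handler: dispatch GHCB MSR protocol requests, otherwise
 * map and validate the guest's GHCB and handle the NAE event it describes.
 */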
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
4390
{
4391
struct vcpu_svm *svm = to_svm(vcpu);
4392
struct vmcb_control_area *control = &svm->vmcb->control;
4393
u64 ghcb_gpa;
4394
int ret;
4395
4396
/* Validate the GHCB */
4397
ghcb_gpa = control->ghcb_gpa;
4398
if (ghcb_gpa & GHCB_MSR_INFO_MASK)
4399
return sev_handle_vmgexit_msr_protocol(svm);
4400
4401
if (!ghcb_gpa) {
4402
vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
4403
4404
/* Without a GHCB, just return right back to the guest */
4405
return 1;
4406
}
4407
4408
if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
4409
/* Unable to map GHCB from guest */
4410
vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
4411
ghcb_gpa);
4412
4413
/* Without a GHCB, just return right back to the guest */
4414
return 1;
4415
}
4416
4417
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
4418
4419
trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
4420
4421
sev_es_sync_from_ghcb(svm);
4422
4423
/* An SEV-SNP guest requires that the GHCB GPA be registered */
4424
if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
4425
vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
4426
return -EINVAL;
4427
}
4428
4429
ret = sev_es_validate_vmgexit(svm);
4430
if (ret)
4431
return ret;
4432
4433
svm_vmgexit_success(svm, 0);
4434
4435
switch (control->exit_code) {
4436
case SVM_VMGEXIT_MMIO_READ:
4437
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
4438
if (ret)
4439
break;
4440
4441
ret = kvm_sev_es_mmio_read(vcpu,
4442
control->exit_info_1,
4443
control->exit_info_2,
4444
svm->sev_es.ghcb_sa);
4445
break;
4446
case SVM_VMGEXIT_MMIO_WRITE:
4447
ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
4448
if (ret)
4449
break;
4450
4451
ret = kvm_sev_es_mmio_write(vcpu,
4452
control->exit_info_1,
4453
control->exit_info_2,
4454
svm->sev_es.ghcb_sa);
4455
break;
4456
case SVM_VMGEXIT_NMI_COMPLETE:
4457
++vcpu->stat.nmi_window_exits;
4458
svm->nmi_masked = false;
4459
kvm_make_request(KVM_REQ_EVENT, vcpu);
4460
ret = 1;
4461
break;
4462
case SVM_VMGEXIT_AP_HLT_LOOP:
4463
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
4464
ret = kvm_emulate_ap_reset_hold(vcpu);
4465
break;
4466
case SVM_VMGEXIT_AP_JUMP_TABLE: {
4467
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
4468
4469
switch (control->exit_info_1) {
4470
case 0:
4471
/* Set AP jump table address */
4472
sev->ap_jump_table = control->exit_info_2;
4473
break;
4474
case 1:
4475
/* Get AP jump table address */
4476
svm_vmgexit_success(svm, sev->ap_jump_table);
4477
break;
4478
default:
4479
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
4480
control->exit_info_1);
4481
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4482
}
4483
4484
ret = 1;
4485
break;
4486
}
4487
case SVM_VMGEXIT_HV_FEATURES:
4488
svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
4489
ret = 1;
4490
break;
4491
case SVM_VMGEXIT_TERM_REQUEST:
4492
pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
4493
control->exit_info_1, control->exit_info_2);
4494
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
4495
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
4496
vcpu->run->system_event.ndata = 1;
4497
vcpu->run->system_event.data[0] = control->ghcb_gpa;
4498
break;
4499
case SVM_VMGEXIT_PSC:
4500
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
4501
if (ret)
4502
break;
4503
4504
ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
4505
break;
4506
case SVM_VMGEXIT_AP_CREATION:
4507
ret = sev_snp_ap_creation(svm);
4508
if (ret) {
4509
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4510
}
4511
4512
ret = 1;
4513
break;
4514
case SVM_VMGEXIT_GUEST_REQUEST:
4515
ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
4516
break;
4517
case SVM_VMGEXIT_EXT_GUEST_REQUEST:
4518
ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
4519
break;
4520
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
4521
vcpu_unimpl(vcpu,
4522
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
4523
control->exit_info_1, control->exit_info_2);
4524
ret = -EINVAL;
4525
break;
4526
default:
4527
ret = svm_invoke_exit_handler(vcpu, control->exit_code);
4528
}
4529
4530
return ret;
4531
}
4532
4533
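/*
 * Handle a string I/O exit for an SEV-ES guest: the repeat count comes from
 * exit_info_2, the total size is checked for overflow, and the data is
 * staged through the GHCB scratch area.
 */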
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
4534
{
4535
int count;
4536
int bytes;
4537
int r;
4538
4539
if (svm->vmcb->control.exit_info_2 > INT_MAX)
4540
return -EINVAL;
4541
4542
count = svm->vmcb->control.exit_info_2;
4543
if (unlikely(check_mul_overflow(count, size, &bytes)))
4544
return -EINVAL;
4545
4546
r = setup_vmgexit_scratch(svm, in, bytes);
4547
if (r)
4548
return r;
4549
4550
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
4551
count, in);
4552
}
4553
4554
void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
4555
{
4556
/* Clear intercepts on MSRs that are context switched by hardware. */
4557
svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
4558
svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
4559
svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
4560
4561
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
4562
svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
4563
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
4564
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
4565
4566
svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R,
4567
!snp_is_secure_tsc_enabled(vcpu->kvm));
4568
4569
/*
4570
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
4571
* the host/guest supports its use.
4572
*
4573
* KVM treats the guest as being capable of using XSAVES even if XSAVES
4574
* isn't enabled in guest CPUID as there is no intercept for XSAVES,
4575
* i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is
4576
* exposed to the guest and XSAVES is supported in hardware. Condition
4577
* full XSS passthrough on the guest being able to use XSAVES *and*
4578
* XSAVES being exposed to the guest so that KVM can at least honor
4579
* guest CPUID for RDMSR and WRMSR.
4580
*/
4581
svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
4582
!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
4583
!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
4584
}
4585
4586
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
4587
{
4588
struct kvm_vcpu *vcpu = &svm->vcpu;
4589
struct kvm_cpuid_entry2 *best;
4590
4591
/* For sev guests, the memory encryption bit is not reserved in CR3. */
4592
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4593
if (best)
4594
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4595
}
4596
4597
static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
4598
{
4599
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
4600
struct vmcb *vmcb = svm->vmcb01.ptr;
4601
4602
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
4603
4604
/*
4605
* An SEV-ES guest requires a VMSA area that is separate from the
4606
* VMCB page. Do not include the encryption mask on the VMSA physical
4607
* address since hardware will access it using the guest key. Note,
4608
* the VMSA will be NULL if this vCPU is the destination for intrahost
4609
* migration, and will be copied later.
4610
*/
4611
if (!svm->sev_es.snp_has_guest_vmsa) {
4612
if (svm->sev_es.vmsa)
4613
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
4614
else
4615
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
4616
}
4617
4618
if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
4619
svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
4620
VMCB_ALLOWED_SEV_FEATURES_VALID;
4621
4622
/* Can't intercept CR register access, HV can't modify CR registers */
4623
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
4624
svm_clr_intercept(svm, INTERCEPT_CR4_READ);
4625
svm_clr_intercept(svm, INTERCEPT_CR8_READ);
4626
svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
4627
svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
4628
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
4629
4630
svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
4631
4632
/* Track EFER/CR register changes */
4633
svm_set_intercept(svm, TRAP_EFER_WRITE);
4634
svm_set_intercept(svm, TRAP_CR0_WRITE);
4635
svm_set_intercept(svm, TRAP_CR4_WRITE);
4636
svm_set_intercept(svm, TRAP_CR8_WRITE);
4637
4638
vmcb->control.intercepts[INTERCEPT_DR] = 0;
4639
if (!sev_vcpu_has_debug_swap(svm)) {
4640
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
4641
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
4642
recalc_intercepts(svm);
4643
} else {
4644
/*
4645
* Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
4646
* allow debugging SEV-ES guests, and enables DebugSwap iff
4647
* NO_NESTED_DATA_BP is supported, so there's no reason to
4648
* intercept #DB when DebugSwap is enabled. For simplicity
4649
* with respect to guest debug, intercept #DB for other VMs
4650
* even if NO_NESTED_DATA_BP is supported, i.e. even if the
4651
* guest can't DoS the CPU with infinite #DB vectoring.
4652
*/
4653
clr_exception_intercept(svm, DB_VECTOR);
4654
}
4655
4656
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
4657
svm_clr_intercept(svm, INTERCEPT_XSETBV);
4658
4659
/*
4660
* Set the GHCB MSR value as per the GHCB specification when emulating
4661
* vCPU RESET for an SEV-ES guest.
4662
*/
4663
if (!init_event)
4664
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
4665
GHCB_VERSION_MIN,
4666
sev_enc_bit));
4667
}
4668
4669
void sev_init_vmcb(struct vcpu_svm *svm, bool init_event)
4670
{
4671
struct kvm_vcpu *vcpu = &svm->vcpu;
4672
4673
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
4674
clr_exception_intercept(svm, UD_VECTOR);
4675
4676
/*
4677
* Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
4678
* KVM can't decrypt guest memory to decode the faulting instruction.
4679
*/
4680
clr_exception_intercept(svm, GP_VECTOR);
4681
4682
if (init_event && sev_snp_guest(vcpu->kvm))
4683
sev_snp_init_protected_guest_state(vcpu);
4684
4685
if (sev_es_guest(vcpu->kvm))
4686
sev_es_init_vmcb(svm, init_event);
4687
}
4688
4689
int sev_vcpu_create(struct kvm_vcpu *vcpu)
4690
{
4691
struct vcpu_svm *svm = to_svm(vcpu);
4692
struct page *vmsa_page;
4693
4694
mutex_init(&svm->sev_es.snp_vmsa_mutex);
4695
4696
if (!sev_es_guest(vcpu->kvm))
4697
return 0;
4698
4699
/*
4700
* SEV-ES guests require a separate (from the VMCB) VMSA page used to
4701
* contain the encrypted register state of the guest.
4702
*/
4703
vmsa_page = snp_safe_alloc_page();
4704
if (!vmsa_page)
4705
return -ENOMEM;
4706
4707
svm->sev_es.vmsa = page_address(vmsa_page);
4708
4709
vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm);
4710
4711
return 0;
4712
}
4713
4714
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
4715
{
4716
struct kvm *kvm = svm->vcpu.kvm;
4717
4718
/*
4719
* All host state for SEV-ES guests is categorized into three swap types
4720
* based on how it is handled by hardware during a world switch:
4721
*
4722
* A: VMRUN: Host state saved in host save area
4723
* VMEXIT: Host state loaded from host save area
4724
*
4725
* B: VMRUN: Host state _NOT_ saved in host save area
4726
* VMEXIT: Host state loaded from host save area
4727
*
4728
* C: VMRUN: Host state _NOT_ saved in host save area
4729
* VMEXIT: Host state initialized to default(reset) values
4730
*
4731
* Manually save type-B state, i.e. state that is loaded by VMEXIT but
4732
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
4733
* by common SVM code).
4734
*/
4735
hostsa->xcr0 = kvm_host.xcr0;
4736
hostsa->pkru = read_pkru();
4737
hostsa->xss = kvm_host.xss;
4738
4739
/*
4740
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
4741
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
4742
* not save or load debug registers. Sadly, KVM can't prevent SNP
4743
* guests from lying about DebugSwap on secondary vCPUs, i.e. the
4744
* SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
4745
* the guest has actually enabled (or not!) in the VMSA.
4746
*
4747
* If DebugSwap is *possible*, save the masks so that they're restored
4748
* if the guest enables DebugSwap. But for the DRs themselves, do NOT
4749
* rely on the CPU to restore the host values; KVM will restore them as
4750
* needed in common code, via hw_breakpoint_restore(). Note, KVM does
4751
* NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
4752
* don't need to be restored per se, KVM just needs to ensure they are
4753
* loaded with the correct values *if* the CPU writes the MSRs.
4754
*/
4755
if (sev_vcpu_has_debug_swap(svm) ||
4756
(sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
4757
hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
4758
hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
4759
hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
4760
hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
4761
}
4762
4763
/*
4764
* TSC_AUX is always virtualized for SEV-ES guests when the feature is
4765
* available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area.
4766
* Set the save area to the current hardware value, i.e. the current
4767
* user return value, so that the correct value is restored on #VMEXIT.
4768
*/
4769
if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) &&
4770
!WARN_ON_ONCE(tsc_aux_uret_slot < 0))
4771
hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot);
4772
}
4773
4774
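/*
 * Release an AP from an AP Reset Hold on SIPI: the first SIPI simply uses
 * the vCPU state set up by the VMM, subsequent SIPIs signal completion via
 * whichever protocol (NAE event or GHCB MSR) the guest used for the hold.
 */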
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4775
{
4776
struct vcpu_svm *svm = to_svm(vcpu);
4777
4778
/* First SIPI: Use the values as initially set by the VMM */
4779
if (!svm->sev_es.received_first_sipi) {
4780
svm->sev_es.received_first_sipi = true;
4781
return;
4782
}
4783
4784
/* Subsequent SIPI */
4785
switch (svm->sev_es.ap_reset_hold_type) {
4786
case AP_RESET_HOLD_NAE_EVENT:
4787
/*
4788
* Return from an AP Reset Hold VMGEXIT, where the guest will
4789
* set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
4790
*/
4791
svm_vmgexit_success(svm, 1);
4792
break;
4793
case AP_RESET_HOLD_MSR_PROTO:
4794
/*
4795
* Return from an AP Reset Hold VMGEXIT, where the guest will
4796
* set the CS and RIP. Set GHCB data field to a non-zero value.
4797
*/
4798
set_ghcb_msr_bits(svm, 1,
4799
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
4800
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
4801
4802
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
4803
GHCB_MSR_INFO_MASK,
4804
GHCB_MSR_INFO_POS);
4805
break;
4806
default:
4807
break;
4808
}
4809
}
4810
4811
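/* Allocate a zeroed page; on SNP hosts, ensure it is not 2MB-aligned. */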
struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
4812
{
4813
unsigned long pfn;
4814
struct page *p;
4815
4816
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
4817
return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
4818
4819
/*
4820
* Allocate an SNP-safe page to work around the SNP erratum where
4821
* the CPU will incorrectly signal an RMP violation #PF if a
4822
* hugepage (2MB or 1GB) collides with the RMP entry of a
4823
* 2MB-aligned VMCB, VMSA, or AVIC backing page.
4824
*
4825
* Allocate one extra page, choose a page which is not
4826
* 2MB-aligned, and free the other.
4827
*/
4828
p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
4829
if (!p)
4830
return NULL;
4831
4832
split_page(p, 1);
4833
4834
pfn = page_to_pfn(p);
4835
if (IS_ALIGNED(pfn, PTRS_PER_PMD))
4836
__free_page(p++);
4837
else
4838
__free_page(p + 1);
4839
4840
return p;
4841
}
4842
4843
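/*
 * Handle an #NPF with the RMP bit set on a private GPA: if the backing PFN
 * is covered by a 2MB RMP entry, PSMASH it into 4K entries and zap the
 * corresponding NPT range so the mappings are rebuilt at 4K granularity.
 */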
void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
4844
{
4845
struct kvm_memory_slot *slot;
4846
struct kvm *kvm = vcpu->kvm;
4847
int order, rmp_level, ret;
4848
struct page *page;
4849
bool assigned;
4850
kvm_pfn_t pfn;
4851
gfn_t gfn;
4852
4853
gfn = gpa >> PAGE_SHIFT;
4854
4855
/*
4856
* The only time RMP faults occur for shared pages is when the guest is
4857
* triggering an RMP fault for an implicit page-state change from
4858
* shared->private. Implicit page-state changes are forwarded to
4859
* userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
4860
* for shared pages should not end up here.
4861
*/
4862
if (!kvm_mem_is_private(kvm, gfn)) {
4863
pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
4864
gpa);
4865
return;
4866
}
4867
4868
slot = gfn_to_memslot(kvm, gfn);
4869
if (!kvm_slot_has_gmem(slot)) {
4870
pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
4871
gpa);
4872
return;
4873
}
4874
4875
ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
4876
if (ret) {
4877
pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
4878
gpa);
4879
return;
4880
}
4881
4882
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
4883
if (ret || !assigned) {
4884
pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
4885
gpa, pfn, ret);
4886
goto out_no_trace;
4887
}
4888
4889
/*
4890
* There are 2 cases where a PSMASH may be needed to resolve an #NPF
4891
* with PFERR_GUEST_RMP_BIT set:
4892
*
4893
* 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
4894
* bit set if the guest issues them with a smaller granularity than
4895
* what is indicated by the page-size bit in the 2MB RMP entry for
4896
* the PFN that backs the GPA.
4897
*
4898
* 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
4899
* smaller than what is indicated by the 2MB RMP entry for the PFN
4900
* that backs the GPA.
4901
*
4902
* In both these cases, the corresponding 2M RMP entry needs to
4903
* be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
4904
* split into 4K RMP entries, then this is likely a spurious case which
4905
* can occur when there are concurrent accesses by the guest to a 2MB
4906
* GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in
4907
* the process of being PSMASH'd into 4K entries. These cases should
4908
* resolve automatically on subsequent accesses, so just ignore them
4909
* here.
4910
*/
4911
if (rmp_level == PG_LEVEL_4K)
4912
goto out;
4913
4914
ret = snp_rmptable_psmash(pfn);
4915
if (ret) {
4916
/*
4917
* Look it up again. If it's 4K now then the PSMASH may have
4918
* raced with another process and the issue has already resolved
4919
* itself.
4920
*/
4921
if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
4922
assigned && rmp_level == PG_LEVEL_4K)
4923
goto out;
4924
4925
pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
4926
gpa, pfn, ret);
4927
}
4928
4929
kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
4930
out:
4931
trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
4932
out_no_trace:
4933
kvm_release_page_unused(page);
4934
}
4935
4936
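/*
 * Return true iff no PFN in [start, end) has an assigned RMP entry, i.e.
 * the entire range is still in the shared/hypervisor-owned state.
 */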
static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
4937
{
4938
kvm_pfn_t pfn = start;
4939
4940
while (pfn < end) {
4941
int ret, rmp_level;
4942
bool assigned;
4943
4944
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
4945
if (ret) {
4946
pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
4947
pfn, start, end, rmp_level, ret);
4948
return false;
4949
}
4950
4951
if (assigned) {
4952
pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
4953
__func__, pfn, start, end, rmp_level);
4954
return false;
4955
}
4956
4957
pfn++;
4958
}
4959
4960
return true;
4961
}
4962
4963
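/* Map a gmem allocation order to the largest page level it can back. */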
static u8 max_level_for_order(int order)
4964
{
4965
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
4966
return PG_LEVEL_2M;
4967
4968
return PG_LEVEL_4K;
4969
}
4970
4971
static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
4972
{
4973
kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
4974
4975
/*
4976
* If this is a large folio, and the entire 2M range containing the
4977
* PFN is currently shared, then the entire 2M-aligned range can be
4978
* set to private via a single 2M RMP entry.
4979
*/
4980
if (max_level_for_order(order) > PG_LEVEL_4K &&
4981
is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
4982
return true;
4983
4984
return false;
4985
}
4986
4987
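/*
 * Prepare a gmem page for use as private guest memory: if the backing PFN
 * isn't already assigned in the RMP table, transition it to the guest-owned
 * state for this VM's ASID, using a 2MB RMP entry when the entire aligned
 * range is still shared and a 4K entry otherwise.
 */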
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
4988
{
4989
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
4990
kvm_pfn_t pfn_aligned;
4991
gfn_t gfn_aligned;
4992
int level, rc;
4993
bool assigned;
4994
4995
if (!sev_snp_guest(kvm))
4996
return 0;
4997
4998
rc = snp_lookup_rmpentry(pfn, &assigned, &level);
4999
if (rc) {
5000
pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
5001
gfn, pfn, rc);
5002
return -ENOENT;
5003
}
5004
5005
if (assigned) {
5006
pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
5007
__func__, gfn, pfn, max_order, level);
5008
return 0;
5009
}
5010
5011
if (is_large_rmp_possible(kvm, pfn, max_order)) {
5012
level = PG_LEVEL_2M;
5013
pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
5014
gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
5015
} else {
5016
level = PG_LEVEL_4K;
5017
pfn_aligned = pfn;
5018
gfn_aligned = gfn;
5019
}
5020
5021
rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
5022
if (rc) {
5023
pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
5024
gfn, pfn, level, rc);
5025
return -EINVAL;
5026
}
5027
5028
pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
5029
__func__, gfn, pfn, pfn_aligned, max_order, level);
5030
5031
return 0;
5032
}
5033
5034
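/*
 * Return a range of gmem PFNs to the shared/hypervisor-owned state before
 * they are freed back to the host, PSMASHing 2MB RMP entries as needed and
 * flushing the cache lines of the affected pages.
 */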
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
5035
{
5036
kvm_pfn_t pfn;
5037
5038
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
5039
return;
5040
5041
pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
5042
5043
for (pfn = start; pfn < end;) {
5044
bool use_2m_update = false;
5045
int rc, rmp_level;
5046
bool assigned;
5047
5048
rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
5049
if (rc || !assigned)
5050
goto next_pfn;
5051
5052
use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
5053
end >= (pfn + PTRS_PER_PMD) &&
5054
rmp_level > PG_LEVEL_4K;
5055
5056
/*
5057
* If an unaligned PFN corresponds to a 2M region assigned as a
5058
* large page in the RMP table, PSMASH the region into individual
5059
* 4K RMP entries before attempting to convert a 4K sub-page.
5060
*/
5061
if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
5062
/*
5063
* This shouldn't fail, but if it does, report it, but
5064
* still try to update RMP entry to shared and pray this
5065
* was a spurious error that can be addressed later.
5066
*/
5067
rc = snp_rmptable_psmash(pfn);
5068
WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
5069
pfn, rc);
5070
}
5071
5072
rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
5073
if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
5074
pfn, rc))
5075
goto next_pfn;
5076
5077
/*
5078
* SEV-ES avoids host/guest cache coherency issues through
5079
* WBNOINVD hooks issued via MMU notifiers during run-time, and
5080
* KVM's VM destroy path at shutdown. Those MMU notifier events
5081
* don't cover gmem since there is no requirement to map pages
5082
* to a HVA in order to use them for a running guest. While the
5083
* shutdown path would still likely cover things for SNP guests,
5084
* userspace may also free gmem pages during run-time via
5085
* hole-punching operations on the guest_memfd, so flush the
5086
* cache entries for these pages before free'ing them back to
5087
* the host.
5088
*/
5089
clflush_cache_range(__va(pfn_to_hpa(pfn)),
5090
use_2m_update ? PMD_SIZE : PAGE_SIZE);
5091
next_pfn:
5092
pfn += use_2m_update ? PTRS_PER_PMD : 1;
5093
cond_resched();
5094
}
5095
}
5096
5097
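/*
 * For SNP guests, cap the private mapping level at the page size of the
 * PFN's RMP entry; unassigned PFNs are limited to 4K.
 */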
int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
5098
{
5099
int level, rc;
5100
bool assigned;
5101
5102
if (!sev_snp_guest(kvm))
5103
return 0;
5104
5105
rc = snp_lookup_rmpentry(pfn, &assigned, &level);
5106
if (rc || !assigned)
5107
return PG_LEVEL_4K;
5108
5109
return level;
5110
}
5111
5112
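/*
 * Return a decrypted copy of the vCPU's VMSA for debug access, provided the
 * guest policy permits debugging: SNP guests use SNP_DBG_DECRYPT against
 * the SNP guest context, SEV-ES guests use DBG_DECRYPT with the SEV handle.
 * If the VMSA hasn't been encrypted yet, the plaintext VMSA is returned.
 */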
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
5113
{
5114
struct vcpu_svm *svm = to_svm(vcpu);
5115
struct vmcb_save_area *vmsa;
5116
struct kvm_sev_info *sev;
5117
int error = 0;
5118
int ret;
5119
5120
if (!sev_es_guest(vcpu->kvm))
5121
return NULL;
5122
5123
/*
5124
* If the VMSA has not yet been encrypted, return a pointer to the
5125
* current un-encrypted VMSA.
5126
*/
5127
if (!vcpu->arch.guest_state_protected)
5128
return (struct vmcb_save_area *)svm->sev_es.vmsa;
5129
5130
sev = to_kvm_sev_info(vcpu->kvm);
5131
5132
/* Check if the SEV policy allows debugging */
5133
if (sev_snp_guest(vcpu->kvm)) {
5134
if (!(sev->policy & SNP_POLICY_MASK_DEBUG))
5135
return NULL;
5136
} else {
5137
if (sev->policy & SEV_POLICY_MASK_NODBG)
5138
return NULL;
5139
}
5140
5141
if (sev_snp_guest(vcpu->kvm)) {
5142
struct sev_data_snp_dbg dbg = {0};
5143
5144
vmsa = snp_alloc_firmware_page(__GFP_ZERO);
5145
if (!vmsa)
5146
return NULL;
5147
5148
dbg.gctx_paddr = __psp_pa(sev->snp_context);
5149
dbg.src_addr = svm->vmcb->control.vmsa_pa;
5150
dbg.dst_addr = __psp_pa(vmsa);
5151
5152
ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
5153
5154
/*
5155
* Return the target page to a hypervisor page no matter what.
5156
* If this fails, the page can't be used, so leak it and don't
5157
* try to free it.
5158
*/
5159
if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
5160
return NULL;
5161
5162
if (ret) {
5163
pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
5164
ret, error, error);
5165
free_page((unsigned long)vmsa);
5166
5167
return NULL;
5168
}
5169
} else {
5170
struct sev_data_dbg dbg = {0};
5171
struct page *vmsa_page;
5172
5173
vmsa_page = alloc_page(GFP_KERNEL);
5174
if (!vmsa_page)
5175
return NULL;
5176
5177
vmsa = page_address(vmsa_page);
5178
5179
dbg.handle = sev->handle;
5180
dbg.src_addr = svm->vmcb->control.vmsa_pa;
5181
dbg.dst_addr = __psp_pa(vmsa);
5182
dbg.len = PAGE_SIZE;
5183
5184
ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
5185
if (ret) {
5186
pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
5187
ret, error, error);
5188
__free_page(vmsa_page);
5189
5190
return NULL;
5191
}
5192
}
5193
5194
return vmsa;
5195
}
5196
5197
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
5198
{
5199
/* If the VMSA has not yet been encrypted, nothing was allocated */
5200
if (!vcpu->arch.guest_state_protected || !vmsa)
5201
return;
5202
5203
free_page((unsigned long)vmsa);
5204
}
5205
5206