GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/svm/sev.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Kernel-based Virtual Machine driver for Linux
4
*
5
* AMD SVM-SEV support
6
*
7
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
8
*/
9
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11
#include <linux/kvm_types.h>
12
#include <linux/kvm_host.h>
13
#include <linux/kernel.h>
14
#include <linux/highmem.h>
15
#include <linux/psp.h>
16
#include <linux/psp-sev.h>
17
#include <linux/pagemap.h>
18
#include <linux/swap.h>
19
#include <linux/misc_cgroup.h>
20
#include <linux/processor.h>
21
#include <linux/trace_events.h>
22
#include <uapi/linux/sev-guest.h>
23
24
#include <asm/pkru.h>
25
#include <asm/trapnr.h>
26
#include <asm/fpu/xcr.h>
27
#include <asm/fpu/xstate.h>
28
#include <asm/debugreg.h>
29
#include <asm/msr.h>
30
#include <asm/sev.h>
31
32
#include "mmu.h"
33
#include "x86.h"
34
#include "svm.h"
35
#include "svm_ops.h"
36
#include "cpuid.h"
37
#include "trace.h"
38
39
#define GHCB_VERSION_MAX 2ULL
40
#define GHCB_VERSION_DEFAULT 2ULL
41
#define GHCB_VERSION_MIN 1ULL
42
43
#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
44
45
/* enable/disable SEV support */
46
static bool sev_enabled = true;
47
module_param_named(sev, sev_enabled, bool, 0444);
48
49
/* enable/disable SEV-ES support */
50
static bool sev_es_enabled = true;
51
module_param_named(sev_es, sev_es_enabled, bool, 0444);
52
53
/* enable/disable SEV-SNP support */
54
static bool sev_snp_enabled = true;
55
module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
56
57
/* enable/disable SEV-ES DebugSwap support */
58
static bool sev_es_debug_swap_enabled = true;
59
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
60
static u64 sev_supported_vmsa_features;
61
62
#define AP_RESET_HOLD_NONE 0
63
#define AP_RESET_HOLD_NAE_EVENT 1
64
#define AP_RESET_HOLD_MSR_PROTO 2
65
66
/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
67
#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
68
#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
69
#define SNP_POLICY_MASK_SMT BIT_ULL(16)
70
#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
71
#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
72
#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
73
74
#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
75
SNP_POLICY_MASK_API_MAJOR | \
76
SNP_POLICY_MASK_SMT | \
77
SNP_POLICY_MASK_RSVD_MBO | \
78
SNP_POLICY_MASK_DEBUG | \
79
SNP_POLICY_MASK_SINGLE_SOCKET)
80
81
#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
82
83
static u8 sev_enc_bit;
84
static DECLARE_RWSEM(sev_deactivate_lock);
85
static DEFINE_MUTEX(sev_bitmap_lock);
86
unsigned int max_sev_asid;
87
static unsigned int min_sev_asid;
88
static unsigned long sev_me_mask;
89
static unsigned int nr_asids;
90
static unsigned long *sev_asid_bitmap;
91
static unsigned long *sev_reclaim_asid_bitmap;
92
93
static int snp_decommission_context(struct kvm *kvm);
94
95
struct enc_region {
96
struct list_head list;
97
unsigned long npages;
98
struct page **pages;
99
unsigned long uaddr;
100
unsigned long size;
101
};
102
103
/* Called with the sev_bitmap_lock held, or on shutdown */
104
static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
105
{
106
int ret, error = 0;
107
unsigned int asid;
108
109
/* Check if there are any ASIDs to reclaim before performing a flush */
110
asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
111
if (asid > max_asid)
112
return -EBUSY;
113
114
/*
115
* DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
116
* so it must be guarded.
117
*/
118
down_write(&sev_deactivate_lock);
119
120
/* SNP firmware requires use of WBINVD for ASID recycling. */
121
wbinvd_on_all_cpus();
122
123
if (sev_snp_enabled)
124
ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
125
else
126
ret = sev_guest_df_flush(&error);
127
128
up_write(&sev_deactivate_lock);
129
130
if (ret)
131
pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
132
sev_snp_enabled ? "-SNP" : "", ret, error);
133
134
return ret;
135
}
136
137
static inline bool is_mirroring_enc_context(struct kvm *kvm)
138
{
139
return !!to_kvm_sev_info(kvm)->enc_context_owner;
140
}
141
142
static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
143
{
144
struct kvm_vcpu *vcpu = &svm->vcpu;
145
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
146
147
return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
148
}
149
150
/* Must be called with the sev_bitmap_lock held */
151
static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
152
{
153
if (sev_flush_asids(min_asid, max_asid))
154
return false;
155
156
/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
157
bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
158
nr_asids);
159
bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
160
161
return true;
162
}
163
164
static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
165
{
166
enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
167
return misc_cg_try_charge(type, sev->misc_cg, 1);
168
}
169
170
static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
171
{
172
enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
173
misc_cg_uncharge(type, sev->misc_cg, 1);
174
}
175
176
static int sev_asid_new(struct kvm_sev_info *sev)
177
{
178
/*
179
* SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
180
* SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
181
* Note: min ASID can end up larger than the max if basic SEV support is
182
* effectively disabled by disallowing use of ASIDs for SEV guests.
183
*/
184
unsigned int min_asid = sev->es_active ? 1 : min_sev_asid;
185
unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
186
unsigned int asid;
187
bool retry = true;
188
int ret;
189
190
if (min_asid > max_asid)
191
return -ENOTTY;
192
193
WARN_ON(sev->misc_cg);
194
sev->misc_cg = get_current_misc_cg();
195
ret = sev_misc_cg_try_charge(sev);
196
if (ret) {
197
put_misc_cg(sev->misc_cg);
198
sev->misc_cg = NULL;
199
return ret;
200
}
201
202
mutex_lock(&sev_bitmap_lock);
203
204
again:
205
asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
206
if (asid > max_asid) {
207
if (retry && __sev_recycle_asids(min_asid, max_asid)) {
208
retry = false;
209
goto again;
210
}
211
mutex_unlock(&sev_bitmap_lock);
212
ret = -EBUSY;
213
goto e_uncharge;
214
}
215
216
__set_bit(asid, sev_asid_bitmap);
217
218
mutex_unlock(&sev_bitmap_lock);
219
220
sev->asid = asid;
221
return 0;
222
e_uncharge:
223
sev_misc_cg_uncharge(sev);
224
put_misc_cg(sev->misc_cg);
225
sev->misc_cg = NULL;
226
return ret;
227
}
228
229
static unsigned int sev_get_asid(struct kvm *kvm)
230
{
231
return to_kvm_sev_info(kvm)->asid;
232
}
233
234
static void sev_asid_free(struct kvm_sev_info *sev)
235
{
236
struct svm_cpu_data *sd;
237
int cpu;
238
239
mutex_lock(&sev_bitmap_lock);
240
241
__set_bit(sev->asid, sev_reclaim_asid_bitmap);
242
243
for_each_possible_cpu(cpu) {
244
sd = per_cpu_ptr(&svm_data, cpu);
245
sd->sev_vmcbs[sev->asid] = NULL;
246
}
247
248
mutex_unlock(&sev_bitmap_lock);
249
250
sev_misc_cg_uncharge(sev);
251
put_misc_cg(sev->misc_cg);
252
sev->misc_cg = NULL;
253
}
254
255
static void sev_decommission(unsigned int handle)
256
{
257
struct sev_data_decommission decommission;
258
259
if (!handle)
260
return;
261
262
decommission.handle = handle;
263
sev_guest_decommission(&decommission, NULL);
264
}
265
266
/*
267
* Transition a page to hypervisor-owned/shared state in the RMP table. This
268
* should not fail under normal conditions, but leak the page should that
269
* happen since it will no longer be usable by the host due to RMP protections.
270
*/
271
static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
272
{
273
if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
274
snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
275
return -EIO;
276
}
277
278
return 0;
279
}
280
281
/*
282
* Certain page-states, such as Pre-Guest and Firmware pages (as documented
283
* in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
284
* directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
285
* unless they are reclaimed first.
286
*
287
* Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
288
* might not be usable by the host due to being set as immutable or still
289
* being associated with a guest ASID.
290
*
291
* Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
292
* converted back to shared, as the page is no longer usable due to RMP
293
* protections, and it's infeasible for the guest to continue on.
294
*/
295
static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
296
{
297
struct sev_data_snp_page_reclaim data = {0};
298
int fw_err, rc;
299
300
data.paddr = __sme_set(pfn << PAGE_SHIFT);
301
rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
302
if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
303
snp_leak_pages(pfn, 1);
304
return -EIO;
305
}
306
307
if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
308
return -EIO;
309
310
return rc;
311
}
312
313
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
314
{
315
struct sev_data_deactivate deactivate;
316
317
if (!handle)
318
return;
319
320
deactivate.handle = handle;
321
322
/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
323
down_read(&sev_deactivate_lock);
324
sev_guest_deactivate(&deactivate, NULL);
325
up_read(&sev_deactivate_lock);
326
327
sev_decommission(handle);
328
}
329
330
/*
331
* This sets up bounce buffers/firmware pages to handle SNP Guest Request
332
* messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
333
* 2.0 specification for more details.
334
*
335
* Technically, when an SNP Guest Request is issued, the guest will provide its
336
* own request/response pages, which could in theory be passed along directly
337
* to firmware rather than using bounce pages. However, these pages would need
338
* special care:
339
*
340
* - Both pages are from shared guest memory, so they need to be protected
341
* from migration/etc. occurring while firmware reads/writes to them. At a
342
* minimum, this requires elevating the ref counts and potentially needing
343
* an explicit pinning of the memory. This places additional restrictions
344
* on what type of memory backends userspace can use for shared guest
345
* memory since there is some reliance on using refcounted pages.
346
*
347
* - The response page needs to be switched to Firmware-owned[1] state
348
* before the firmware can write to it, which can lead to potential
349
* host RMP #PFs if the guest is misbehaved and hands the host a
350
* guest page that KVM might write to for other reasons (e.g. virtio
351
* buffers/etc.).
352
*
353
* Both of these issues can be avoided completely by using separately-allocated
354
* bounce pages for both the request/response pages and passing those to
355
* firmware instead. So that's what is being set up here.
356
*
357
* Guest requests rely on message sequence numbers to ensure requests are
358
* issued to firmware in the order the guest issues them, so concurrent guest
359
* requests generally shouldn't happen. But a misbehaved guest could issue
360
* concurrent guest requests in theory, so a mutex is used to serialize
361
* access to the bounce buffers.
362
*
363
* [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
364
* details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
365
* in the APM for details on the related RMP restrictions.
366
*/
367
static int snp_guest_req_init(struct kvm *kvm)
368
{
369
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
370
struct page *req_page;
371
372
req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
373
if (!req_page)
374
return -ENOMEM;
375
376
sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
377
if (!sev->guest_resp_buf) {
378
__free_page(req_page);
379
return -EIO;
380
}
381
382
sev->guest_req_buf = page_address(req_page);
383
mutex_init(&sev->guest_req_mutex);
384
385
return 0;
386
}
387
388
static void snp_guest_req_cleanup(struct kvm *kvm)
389
{
390
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
391
392
if (sev->guest_resp_buf)
393
snp_free_firmware_page(sev->guest_resp_buf);
394
395
if (sev->guest_req_buf)
396
__free_page(virt_to_page(sev->guest_req_buf));
397
398
sev->guest_req_buf = NULL;
399
sev->guest_resp_buf = NULL;
400
}
401
402
static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
403
struct kvm_sev_init *data,
404
unsigned long vm_type)
405
{
406
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
407
struct sev_platform_init_args init_args = {0};
408
bool es_active = vm_type != KVM_X86_SEV_VM;
409
u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
410
int ret;
411
412
if (kvm->created_vcpus)
413
return -EINVAL;
414
415
if (data->flags)
416
return -EINVAL;
417
418
if (data->vmsa_features & ~valid_vmsa_features)
419
return -EINVAL;
420
421
if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
422
return -EINVAL;
423
424
if (unlikely(sev->active))
425
return -EINVAL;
426
427
sev->active = true;
428
sev->es_active = es_active;
429
sev->vmsa_features = data->vmsa_features;
430
sev->ghcb_version = data->ghcb_version;
431
432
/*
433
* Currently KVM supports the full range of mandatory features defined
434
* by version 2 of the GHCB protocol, so default to that for SEV-ES
435
* guests created via KVM_SEV_INIT2.
436
*/
437
if (sev->es_active && !sev->ghcb_version)
438
sev->ghcb_version = GHCB_VERSION_DEFAULT;
439
440
if (vm_type == KVM_X86_SNP_VM)
441
sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
442
443
ret = sev_asid_new(sev);
444
if (ret)
445
goto e_no_asid;
446
447
init_args.probe = false;
448
ret = sev_platform_init(&init_args);
449
if (ret)
450
goto e_free_asid;
451
452
if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
453
ret = -ENOMEM;
454
goto e_free_asid;
455
}
456
457
/* This needs to happen after SEV/SNP firmware initialization. */
458
if (vm_type == KVM_X86_SNP_VM) {
459
ret = snp_guest_req_init(kvm);
460
if (ret)
461
goto e_free;
462
}
463
464
INIT_LIST_HEAD(&sev->regions_list);
465
INIT_LIST_HEAD(&sev->mirror_vms);
466
sev->need_init = false;
467
468
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
469
470
return 0;
471
472
e_free:
473
free_cpumask_var(sev->have_run_cpus);
474
e_free_asid:
475
argp->error = init_args.error;
476
sev_asid_free(sev);
477
sev->asid = 0;
478
e_no_asid:
479
sev->vmsa_features = 0;
480
sev->es_active = false;
481
sev->active = false;
482
return ret;
483
}
484
485
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
486
{
487
struct kvm_sev_init data = {
488
.vmsa_features = 0,
489
.ghcb_version = 0,
490
};
491
unsigned long vm_type;
492
493
if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
494
return -EINVAL;
495
496
vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
497
498
/*
499
* KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
500
* continue to only ever support the minimal GHCB protocol version.
501
*/
502
if (vm_type == KVM_X86_SEV_ES_VM)
503
data.ghcb_version = GHCB_VERSION_MIN;
504
505
return __sev_guest_init(kvm, argp, &data, vm_type);
506
}
507
508
static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
509
{
510
struct kvm_sev_init data;
511
512
if (!to_kvm_sev_info(kvm)->need_init)
513
return -EINVAL;
514
515
if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
516
kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
517
kvm->arch.vm_type != KVM_X86_SNP_VM)
518
return -EINVAL;
519
520
if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
521
return -EFAULT;
522
523
return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
524
}
525
526
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
527
{
528
unsigned int asid = sev_get_asid(kvm);
529
struct sev_data_activate activate;
530
int ret;
531
532
/* activate ASID on the given handle */
533
activate.handle = handle;
534
activate.asid = asid;
535
ret = sev_guest_activate(&activate, error);
536
537
return ret;
538
}
539
540
static int __sev_issue_cmd(int fd, int id, void *data, int *error)
541
{
542
CLASS(fd, f)(fd);
543
544
if (fd_empty(f))
545
return -EBADF;
546
547
return sev_issue_cmd_external_user(fd_file(f), id, data, error);
548
}
549
550
static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
551
{
552
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
553
554
return __sev_issue_cmd(sev->fd, id, data, error);
555
}
556
557
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
558
{
559
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
560
struct sev_data_launch_start start;
561
struct kvm_sev_launch_start params;
562
void *dh_blob, *session_blob;
563
int *error = &argp->error;
564
int ret;
565
566
if (!sev_guest(kvm))
567
return -ENOTTY;
568
569
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
570
return -EFAULT;
571
572
sev->policy = params.policy;
573
574
memset(&start, 0, sizeof(start));
575
576
dh_blob = NULL;
577
if (params.dh_uaddr) {
578
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
579
if (IS_ERR(dh_blob))
580
return PTR_ERR(dh_blob);
581
582
start.dh_cert_address = __sme_set(__pa(dh_blob));
583
start.dh_cert_len = params.dh_len;
584
}
585
586
session_blob = NULL;
587
if (params.session_uaddr) {
588
session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
589
if (IS_ERR(session_blob)) {
590
ret = PTR_ERR(session_blob);
591
goto e_free_dh;
592
}
593
594
start.session_address = __sme_set(__pa(session_blob));
595
start.session_len = params.session_len;
596
}
597
598
start.handle = params.handle;
599
start.policy = params.policy;
600
601
/* create memory encryption context */
602
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
603
if (ret)
604
goto e_free_session;
605
606
/* Bind ASID to this guest */
607
ret = sev_bind_asid(kvm, start.handle, error);
608
if (ret) {
609
sev_decommission(start.handle);
610
goto e_free_session;
611
}
612
613
/* return handle to userspace */
614
params.handle = start.handle;
615
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
616
sev_unbind_asid(kvm, start.handle);
617
ret = -EFAULT;
618
goto e_free_session;
619
}
620
621
sev->handle = start.handle;
622
sev->fd = argp->sev_fd;
623
624
e_free_session:
625
kfree(session_blob);
626
e_free_dh:
627
kfree(dh_blob);
628
return ret;
629
}
630
631
static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
632
unsigned long ulen, unsigned long *n,
633
unsigned int flags)
634
{
635
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
636
unsigned long npages, size;
637
int npinned;
638
unsigned long locked, lock_limit;
639
struct page **pages;
640
unsigned long first, last;
641
int ret;
642
643
lockdep_assert_held(&kvm->lock);
644
645
if (ulen == 0 || uaddr + ulen < uaddr)
646
return ERR_PTR(-EINVAL);
647
648
/* Calculate number of pages. */
649
first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
650
last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
651
npages = (last - first + 1);
652
653
locked = sev->pages_locked + npages;
654
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
655
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
656
pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
657
return ERR_PTR(-ENOMEM);
658
}
659
660
if (WARN_ON_ONCE(npages > INT_MAX))
661
return ERR_PTR(-EINVAL);
662
663
/* Avoid using vmalloc for smaller buffers. */
664
size = npages * sizeof(struct page *);
665
if (size > PAGE_SIZE)
666
pages = __vmalloc(size, GFP_KERNEL_ACCOUNT);
667
else
668
pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
669
670
if (!pages)
671
return ERR_PTR(-ENOMEM);
672
673
/* Pin the user virtual address. */
674
npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
675
if (npinned != npages) {
676
pr_err("SEV: Failure locking %lu pages.\n", npages);
677
ret = -ENOMEM;
678
goto err;
679
}
680
681
*n = npages;
682
sev->pages_locked = locked;
683
684
return pages;
685
686
err:
687
if (npinned > 0)
688
unpin_user_pages(pages, npinned);
689
690
kvfree(pages);
691
return ERR_PTR(ret);
692
}
693
694
static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
695
unsigned long npages)
696
{
697
unpin_user_pages(pages, npages);
698
kvfree(pages);
699
to_kvm_sev_info(kvm)->pages_locked -= npages;
700
}
701
702
static void sev_clflush_pages(struct page *pages[], unsigned long npages)
703
{
704
uint8_t *page_virtual;
705
unsigned long i;
706
707
if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
708
pages == NULL)
709
return;
710
711
for (i = 0; i < npages; i++) {
712
page_virtual = kmap_local_page(pages[i]);
713
clflush_cache_range(page_virtual, PAGE_SIZE);
714
kunmap_local(page_virtual);
715
cond_resched();
716
}
717
}
718
719
static void sev_writeback_caches(struct kvm *kvm)
720
{
721
/*
722
* Ensure that all dirty guest tagged cache entries are written back
723
* before releasing the pages back to the system for use. CLFLUSH will
724
* not do this without SME_COHERENT, and flushing many cache lines
725
* individually is slower than blasting WBINVD for large VMs, so issue
726
* WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
727
* on CPUs that have done VMRUN, i.e. may have dirtied data using the
728
* VM's ASID.
729
*
730
* For simplicity, never remove CPUs from the bitmap. Ideally, KVM
731
* would clear the mask when flushing caches, but doing so requires
732
* serializing multiple calls and having responding CPUs (to the IPI)
733
* mark themselves as still running if they are running (or about to
734
* run) a vCPU for the VM.
735
*
736
* Note, the caller is responsible for ensuring correctness if the mask
737
* can be modified, e.g. if a CPU could be doing VMRUN.
738
*/
739
wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
740
}
741
742
static unsigned long get_num_contig_pages(unsigned long idx,
743
struct page **inpages, unsigned long npages)
744
{
745
unsigned long paddr, next_paddr;
746
unsigned long i = idx + 1, pages = 1;
747
748
/* find the number of contiguous pages starting from idx */
749
paddr = __sme_page_pa(inpages[idx]);
750
while (i < npages) {
751
next_paddr = __sme_page_pa(inpages[i++]);
752
if ((paddr + PAGE_SIZE) == next_paddr) {
753
pages++;
754
paddr = next_paddr;
755
continue;
756
}
757
break;
758
}
759
760
return pages;
761
}
762
763
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
764
{
765
unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
766
struct kvm_sev_launch_update_data params;
767
struct sev_data_launch_update_data data;
768
struct page **inpages;
769
int ret;
770
771
if (!sev_guest(kvm))
772
return -ENOTTY;
773
774
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
775
return -EFAULT;
776
777
vaddr = params.uaddr;
778
size = params.len;
779
vaddr_end = vaddr + size;
780
781
/* Lock the user memory. */
782
inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
783
if (IS_ERR(inpages))
784
return PTR_ERR(inpages);
785
786
/*
787
* Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
788
* place; the cache may contain the data that was written unencrypted.
789
*/
790
sev_clflush_pages(inpages, npages);
791
792
data.reserved = 0;
793
data.handle = to_kvm_sev_info(kvm)->handle;
794
795
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
796
int offset, len;
797
798
/*
799
* If the user buffer is not page-aligned, calculate the offset
800
* within the page.
801
*/
802
offset = vaddr & (PAGE_SIZE - 1);
803
804
/* Calculate the number of pages that can be encrypted in one go. */
805
pages = get_num_contig_pages(i, inpages, npages);
806
807
len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
808
809
data.len = len;
810
data.address = __sme_page_pa(inpages[i]) + offset;
811
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
812
if (ret)
813
goto e_unpin;
814
815
size -= len;
816
next_vaddr = vaddr + len;
817
}
818
819
e_unpin:
820
/* content of memory is updated, mark pages dirty */
821
for (i = 0; i < npages; i++) {
822
set_page_dirty_lock(inpages[i]);
823
mark_page_accessed(inpages[i]);
824
}
825
/* unlock the user pages */
826
sev_unpin_memory(kvm, inpages, npages);
827
return ret;
828
}
829
830
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
831
{
832
struct kvm_vcpu *vcpu = &svm->vcpu;
833
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
834
struct sev_es_save_area *save = svm->sev_es.vmsa;
835
struct xregs_state *xsave;
836
const u8 *s;
837
u8 *d;
838
int i;
839
840
/* Check some debug related fields before encrypting the VMSA */
841
if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
842
return -EINVAL;
843
844
/*
845
* SEV-ES will use a VMSA that is pointed to by the VMCB, not
846
* the traditional VMSA that is part of the VMCB. Copy the
847
* traditional VMSA as it has been built so far (in prep
848
* for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
849
*/
850
memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
851
852
/* Sync registgers */
853
save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
854
save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
855
save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
856
save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
857
save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
858
save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
859
save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
860
save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
861
#ifdef CONFIG_X86_64
862
save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
863
save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
864
save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
865
save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
866
save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
867
save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
868
save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
869
save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
870
#endif
871
save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
872
873
/* Sync some non-GPR registers before encrypting */
874
save->xcr0 = svm->vcpu.arch.xcr0;
875
save->pkru = svm->vcpu.arch.pkru;
876
save->xss = svm->vcpu.arch.ia32_xss;
877
save->dr6 = svm->vcpu.arch.dr6;
878
879
save->sev_features = sev->vmsa_features;
880
881
/*
882
* Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
883
* breaking older measurements.
884
*/
885
if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
886
xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
887
save->x87_dp = xsave->i387.rdp;
888
save->mxcsr = xsave->i387.mxcsr;
889
save->x87_ftw = xsave->i387.twd;
890
save->x87_fsw = xsave->i387.swd;
891
save->x87_fcw = xsave->i387.cwd;
892
save->x87_fop = xsave->i387.fop;
893
save->x87_ds = 0;
894
save->x87_cs = 0;
895
save->x87_rip = xsave->i387.rip;
896
897
for (i = 0; i < 8; i++) {
898
/*
899
* The format of the x87 save area is undocumented and
900
* definitely not what you would expect. It consists of
901
* an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
902
* area with bytes 8-9 of each register.
903
*/
904
d = save->fpreg_x87 + i * 8;
905
s = ((u8 *)xsave->i387.st_space) + i * 16;
906
memcpy(d, s, 8);
907
save->fpreg_x87[64 + i * 2] = s[8];
908
save->fpreg_x87[64 + i * 2 + 1] = s[9];
909
}
910
memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
911
912
s = get_xsave_addr(xsave, XFEATURE_YMM);
913
if (s)
914
memcpy(save->fpreg_ymm, s, 256);
915
else
916
memset(save->fpreg_ymm, 0, 256);
917
}
918
919
pr_debug("Virtual Machine Save Area (VMSA):\n");
920
print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
921
922
return 0;
923
}
924
925
static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
926
int *error)
927
{
928
struct sev_data_launch_update_vmsa vmsa;
929
struct vcpu_svm *svm = to_svm(vcpu);
930
int ret;
931
932
if (vcpu->guest_debug) {
933
pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
934
return -EINVAL;
935
}
936
937
/* Perform some pre-encryption checks against the VMSA */
938
ret = sev_es_sync_vmsa(svm);
939
if (ret)
940
return ret;
941
942
/*
943
* The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
944
* the VMSA memory content (i.e it will write the same memory region
945
* with the guest's key), so invalidate it first.
946
*/
947
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
948
949
vmsa.reserved = 0;
950
vmsa.handle = to_kvm_sev_info(kvm)->handle;
951
vmsa.address = __sme_pa(svm->sev_es.vmsa);
952
vmsa.len = PAGE_SIZE;
953
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
954
if (ret)
955
return ret;
956
957
/*
958
* SEV-ES guests maintain an encrypted version of their FPU
959
* state which is restored and saved on VMRUN and VMEXIT.
960
* Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
961
* do xsave/xrstor on it.
962
*/
963
fpstate_set_confidential(&vcpu->arch.guest_fpu);
964
vcpu->arch.guest_state_protected = true;
965
966
/*
967
* SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it
968
* only after setting guest_state_protected because KVM_SET_MSRS allows
969
* dynamic toggling of LBRV (for performance reason) on write access to
970
* MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
971
*/
972
svm_enable_lbrv(vcpu);
973
return 0;
974
}
975
976
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
977
{
978
struct kvm_vcpu *vcpu;
979
unsigned long i;
980
int ret;
981
982
if (!sev_es_guest(kvm))
983
return -ENOTTY;
984
985
kvm_for_each_vcpu(i, vcpu, kvm) {
986
ret = mutex_lock_killable(&vcpu->mutex);
987
if (ret)
988
return ret;
989
990
ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
991
992
mutex_unlock(&vcpu->mutex);
993
if (ret)
994
return ret;
995
}
996
997
return 0;
998
}
999
1000
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
1001
{
1002
void __user *measure = u64_to_user_ptr(argp->data);
1003
struct sev_data_launch_measure data;
1004
struct kvm_sev_launch_measure params;
1005
void __user *p = NULL;
1006
void *blob = NULL;
1007
int ret;
1008
1009
if (!sev_guest(kvm))
1010
return -ENOTTY;
1011
1012
if (copy_from_user(&params, measure, sizeof(params)))
1013
return -EFAULT;
1014
1015
memset(&data, 0, sizeof(data));
1016
1017
/* User wants to query the blob length */
1018
if (!params.len)
1019
goto cmd;
1020
1021
p = u64_to_user_ptr(params.uaddr);
1022
if (p) {
1023
if (params.len > SEV_FW_BLOB_MAX_SIZE)
1024
return -EINVAL;
1025
1026
blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
1027
if (!blob)
1028
return -ENOMEM;
1029
1030
data.address = __psp_pa(blob);
1031
data.len = params.len;
1032
}
1033
1034
cmd:
1035
data.handle = to_kvm_sev_info(kvm)->handle;
1036
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
1037
1038
/*
1039
* If we query the session length, FW responded with expected data.
1040
*/
1041
if (!params.len)
1042
goto done;
1043
1044
if (ret)
1045
goto e_free_blob;
1046
1047
if (blob) {
1048
if (copy_to_user(p, blob, params.len))
1049
ret = -EFAULT;
1050
}
1051
1052
done:
1053
params.len = data.len;
1054
if (copy_to_user(measure, &params, sizeof(params)))
1055
ret = -EFAULT;
1056
e_free_blob:
1057
kfree(blob);
1058
return ret;
1059
}
1060
1061
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
1062
{
1063
struct sev_data_launch_finish data;
1064
1065
if (!sev_guest(kvm))
1066
return -ENOTTY;
1067
1068
data.handle = to_kvm_sev_info(kvm)->handle;
1069
return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
1070
}
1071
1072
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
1073
{
1074
struct kvm_sev_guest_status params;
1075
struct sev_data_guest_status data;
1076
int ret;
1077
1078
if (!sev_guest(kvm))
1079
return -ENOTTY;
1080
1081
memset(&data, 0, sizeof(data));
1082
1083
data.handle = to_kvm_sev_info(kvm)->handle;
1084
ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
1085
if (ret)
1086
return ret;
1087
1088
params.policy = data.policy;
1089
params.state = data.state;
1090
params.handle = data.handle;
1091
1092
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
1093
ret = -EFAULT;
1094
1095
return ret;
1096
}
1097
1098
static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
1099
unsigned long dst, int size,
1100
int *error, bool enc)
1101
{
1102
struct sev_data_dbg data;
1103
1104
data.reserved = 0;
1105
data.handle = to_kvm_sev_info(kvm)->handle;
1106
data.dst_addr = dst;
1107
data.src_addr = src;
1108
data.len = size;
1109
1110
return sev_issue_cmd(kvm,
1111
enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
1112
&data, error);
1113
}
1114
1115
static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
1116
unsigned long dst_paddr, int sz, int *err)
1117
{
1118
int offset;
1119
1120
/*
1121
* Its safe to read more than we are asked, caller should ensure that
1122
* destination has enough space.
1123
*/
1124
offset = src_paddr & 15;
1125
src_paddr = round_down(src_paddr, 16);
1126
sz = round_up(sz + offset, 16);
1127
1128
return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
1129
}
1130
1131
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
1132
void __user *dst_uaddr,
1133
unsigned long dst_paddr,
1134
int size, int *err)
1135
{
1136
struct page *tpage = NULL;
1137
int ret, offset;
1138
1139
/* if inputs are not 16-byte then use intermediate buffer */
1140
if (!IS_ALIGNED(dst_paddr, 16) ||
1141
!IS_ALIGNED(paddr, 16) ||
1142
!IS_ALIGNED(size, 16)) {
1143
tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1144
if (!tpage)
1145
return -ENOMEM;
1146
1147
dst_paddr = __sme_page_pa(tpage);
1148
}
1149
1150
ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
1151
if (ret)
1152
goto e_free;
1153
1154
if (tpage) {
1155
offset = paddr & 15;
1156
if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
1157
ret = -EFAULT;
1158
}
1159
1160
e_free:
1161
if (tpage)
1162
__free_page(tpage);
1163
1164
return ret;
1165
}
1166
1167
static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
1168
void __user *vaddr,
1169
unsigned long dst_paddr,
1170
void __user *dst_vaddr,
1171
int size, int *error)
1172
{
1173
struct page *src_tpage = NULL;
1174
struct page *dst_tpage = NULL;
1175
int ret, len = size;
1176
1177
/* If source buffer is not aligned then use an intermediate buffer */
1178
if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
1179
src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
1180
if (!src_tpage)
1181
return -ENOMEM;
1182
1183
if (copy_from_user(page_address(src_tpage), vaddr, size)) {
1184
__free_page(src_tpage);
1185
return -EFAULT;
1186
}
1187
1188
paddr = __sme_page_pa(src_tpage);
1189
}
1190
1191
/*
1192
* If destination buffer or length is not aligned then do read-modify-write:
1193
* - decrypt destination in an intermediate buffer
1194
* - copy the source buffer in an intermediate buffer
1195
* - use the intermediate buffer as source buffer
1196
*/
1197
if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
1198
int dst_offset;
1199
1200
dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
1201
if (!dst_tpage) {
1202
ret = -ENOMEM;
1203
goto e_free;
1204
}
1205
1206
ret = __sev_dbg_decrypt(kvm, dst_paddr,
1207
__sme_page_pa(dst_tpage), size, error);
1208
if (ret)
1209
goto e_free;
1210
1211
/*
1212
* If source is kernel buffer then use memcpy() otherwise
1213
* copy_from_user().
1214
*/
1215
dst_offset = dst_paddr & 15;
1216
1217
if (src_tpage)
1218
memcpy(page_address(dst_tpage) + dst_offset,
1219
page_address(src_tpage), size);
1220
else {
1221
if (copy_from_user(page_address(dst_tpage) + dst_offset,
1222
vaddr, size)) {
1223
ret = -EFAULT;
1224
goto e_free;
1225
}
1226
}
1227
1228
paddr = __sme_page_pa(dst_tpage);
1229
dst_paddr = round_down(dst_paddr, 16);
1230
len = round_up(size, 16);
1231
}
1232
1233
ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
1234
1235
e_free:
1236
if (src_tpage)
1237
__free_page(src_tpage);
1238
if (dst_tpage)
1239
__free_page(dst_tpage);
1240
return ret;
1241
}
1242
1243
static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
1244
{
1245
unsigned long vaddr, vaddr_end, next_vaddr;
1246
unsigned long dst_vaddr;
1247
struct page **src_p, **dst_p;
1248
struct kvm_sev_dbg debug;
1249
unsigned long n;
1250
unsigned int size;
1251
int ret;
1252
1253
if (!sev_guest(kvm))
1254
return -ENOTTY;
1255
1256
if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
1257
return -EFAULT;
1258
1259
if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
1260
return -EINVAL;
1261
if (!debug.dst_uaddr)
1262
return -EINVAL;
1263
1264
vaddr = debug.src_uaddr;
1265
size = debug.len;
1266
vaddr_end = vaddr + size;
1267
dst_vaddr = debug.dst_uaddr;
1268
1269
for (; vaddr < vaddr_end; vaddr = next_vaddr) {
1270
int len, s_off, d_off;
1271
1272
/* lock userspace source and destination page */
1273
src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
1274
if (IS_ERR(src_p))
1275
return PTR_ERR(src_p);
1276
1277
dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
1278
if (IS_ERR(dst_p)) {
1279
sev_unpin_memory(kvm, src_p, n);
1280
return PTR_ERR(dst_p);
1281
}
1282
1283
/*
1284
* Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
1285
* the pages; flush the destination too so that future accesses do not
1286
* see stale data.
1287
*/
1288
sev_clflush_pages(src_p, 1);
1289
sev_clflush_pages(dst_p, 1);
1290
1291
/*
1292
* Since user buffer may not be page aligned, calculate the
1293
* offset within the page.
1294
*/
1295
s_off = vaddr & ~PAGE_MASK;
1296
d_off = dst_vaddr & ~PAGE_MASK;
1297
len = min_t(size_t, (PAGE_SIZE - s_off), size);
1298
1299
if (dec)
1300
ret = __sev_dbg_decrypt_user(kvm,
1301
__sme_page_pa(src_p[0]) + s_off,
1302
(void __user *)dst_vaddr,
1303
__sme_page_pa(dst_p[0]) + d_off,
1304
len, &argp->error);
1305
else
1306
ret = __sev_dbg_encrypt_user(kvm,
1307
__sme_page_pa(src_p[0]) + s_off,
1308
(void __user *)vaddr,
1309
__sme_page_pa(dst_p[0]) + d_off,
1310
(void __user *)dst_vaddr,
1311
len, &argp->error);
1312
1313
sev_unpin_memory(kvm, src_p, n);
1314
sev_unpin_memory(kvm, dst_p, n);
1315
1316
if (ret)
1317
goto err;
1318
1319
next_vaddr = vaddr + len;
1320
dst_vaddr = dst_vaddr + len;
1321
size -= len;
1322
}
1323
err:
1324
return ret;
1325
}
1326
1327
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
1328
{
1329
struct sev_data_launch_secret data;
1330
struct kvm_sev_launch_secret params;
1331
struct page **pages;
1332
void *blob, *hdr;
1333
unsigned long n, i;
1334
int ret, offset;
1335
1336
if (!sev_guest(kvm))
1337
return -ENOTTY;
1338
1339
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
1340
return -EFAULT;
1341
1342
pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
1343
if (IS_ERR(pages))
1344
return PTR_ERR(pages);
1345
1346
/*
1347
* Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
1348
* place; the cache may contain the data that was written unencrypted.
1349
*/
1350
sev_clflush_pages(pages, n);
1351
1352
/*
1353
* The secret must be copied into contiguous memory region, lets verify
1354
* that userspace memory pages are contiguous before we issue command.
1355
*/
1356
if (get_num_contig_pages(0, pages, n) != n) {
1357
ret = -EINVAL;
1358
goto e_unpin_memory;
1359
}
1360
1361
memset(&data, 0, sizeof(data));
1362
1363
offset = params.guest_uaddr & (PAGE_SIZE - 1);
1364
data.guest_address = __sme_page_pa(pages[0]) + offset;
1365
data.guest_len = params.guest_len;
1366
1367
blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
1368
if (IS_ERR(blob)) {
1369
ret = PTR_ERR(blob);
1370
goto e_unpin_memory;
1371
}
1372
1373
data.trans_address = __psp_pa(blob);
1374
data.trans_len = params.trans_len;
1375
1376
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
1377
if (IS_ERR(hdr)) {
1378
ret = PTR_ERR(hdr);
1379
goto e_free_blob;
1380
}
1381
data.hdr_address = __psp_pa(hdr);
1382
data.hdr_len = params.hdr_len;
1383
1384
data.handle = to_kvm_sev_info(kvm)->handle;
1385
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
1386
1387
kfree(hdr);
1388
1389
e_free_blob:
1390
kfree(blob);
1391
e_unpin_memory:
1392
/* content of memory is updated, mark pages dirty */
1393
for (i = 0; i < n; i++) {
1394
set_page_dirty_lock(pages[i]);
1395
mark_page_accessed(pages[i]);
1396
}
1397
sev_unpin_memory(kvm, pages, n);
1398
return ret;
1399
}
1400
1401
static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
1402
{
1403
void __user *report = u64_to_user_ptr(argp->data);
1404
struct sev_data_attestation_report data;
1405
struct kvm_sev_attestation_report params;
1406
void __user *p;
1407
void *blob = NULL;
1408
int ret;
1409
1410
if (!sev_guest(kvm))
1411
return -ENOTTY;
1412
1413
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
1414
return -EFAULT;
1415
1416
memset(&data, 0, sizeof(data));
1417
1418
/* User wants to query the blob length */
1419
if (!params.len)
1420
goto cmd;
1421
1422
p = u64_to_user_ptr(params.uaddr);
1423
if (p) {
1424
if (params.len > SEV_FW_BLOB_MAX_SIZE)
1425
return -EINVAL;
1426
1427
blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
1428
if (!blob)
1429
return -ENOMEM;
1430
1431
data.address = __psp_pa(blob);
1432
data.len = params.len;
1433
memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
1434
}
1435
cmd:
1436
data.handle = to_kvm_sev_info(kvm)->handle;
1437
ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
1438
/*
1439
* If we query the session length, FW responded with expected data.
1440
*/
1441
if (!params.len)
1442
goto done;
1443
1444
if (ret)
1445
goto e_free_blob;
1446
1447
if (blob) {
1448
if (copy_to_user(p, blob, params.len))
1449
ret = -EFAULT;
1450
}
1451
1452
done:
1453
params.len = data.len;
1454
if (copy_to_user(report, &params, sizeof(params)))
1455
ret = -EFAULT;
1456
e_free_blob:
1457
kfree(blob);
1458
return ret;
1459
}
1460
1461
/* Userspace wants to query session length. */
1462
static int
1463
__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
1464
struct kvm_sev_send_start *params)
1465
{
1466
struct sev_data_send_start data;
1467
int ret;
1468
1469
memset(&data, 0, sizeof(data));
1470
data.handle = to_kvm_sev_info(kvm)->handle;
1471
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
1472
1473
params->session_len = data.session_len;
1474
if (copy_to_user(u64_to_user_ptr(argp->data), params,
1475
sizeof(struct kvm_sev_send_start)))
1476
ret = -EFAULT;
1477
1478
return ret;
1479
}
1480
1481
static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
1482
{
1483
struct sev_data_send_start data;
1484
struct kvm_sev_send_start params;
1485
void *amd_certs, *session_data;
1486
void *pdh_cert, *plat_certs;
1487
int ret;
1488
1489
if (!sev_guest(kvm))
1490
return -ENOTTY;
1491
1492
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
1493
sizeof(struct kvm_sev_send_start)))
1494
return -EFAULT;
1495
1496
/* if session_len is zero, userspace wants to query the session length */
1497
if (!params.session_len)
1498
return __sev_send_start_query_session_length(kvm, argp,
1499
&params);
1500
1501
/* some sanity checks */
1502
if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
1503
!params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
1504
return -EINVAL;
1505
1506
/* allocate the memory to hold the session data blob */
1507
session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
1508
if (!session_data)
1509
return -ENOMEM;
1510
1511
/* copy the certificate blobs from userspace */
1512
pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
1513
params.pdh_cert_len);
1514
if (IS_ERR(pdh_cert)) {
1515
ret = PTR_ERR(pdh_cert);
1516
goto e_free_session;
1517
}
1518
1519
plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
1520
params.plat_certs_len);
1521
if (IS_ERR(plat_certs)) {
1522
ret = PTR_ERR(plat_certs);
1523
goto e_free_pdh;
1524
}
1525
1526
amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
1527
params.amd_certs_len);
1528
if (IS_ERR(amd_certs)) {
1529
ret = PTR_ERR(amd_certs);
1530
goto e_free_plat_cert;
1531
}
1532
1533
/* populate the FW SEND_START field with system physical address */
1534
memset(&data, 0, sizeof(data));
1535
data.pdh_cert_address = __psp_pa(pdh_cert);
1536
data.pdh_cert_len = params.pdh_cert_len;
1537
data.plat_certs_address = __psp_pa(plat_certs);
1538
data.plat_certs_len = params.plat_certs_len;
1539
data.amd_certs_address = __psp_pa(amd_certs);
1540
data.amd_certs_len = params.amd_certs_len;
1541
data.session_address = __psp_pa(session_data);
1542
data.session_len = params.session_len;
1543
data.handle = to_kvm_sev_info(kvm)->handle;
1544
1545
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
1546
1547
if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
1548
session_data, params.session_len)) {
1549
ret = -EFAULT;
1550
goto e_free_amd_cert;
1551
}
1552
1553
params.policy = data.policy;
1554
params.session_len = data.session_len;
1555
if (copy_to_user(u64_to_user_ptr(argp->data), &params,
1556
sizeof(struct kvm_sev_send_start)))
1557
ret = -EFAULT;
1558
1559
e_free_amd_cert:
1560
kfree(amd_certs);
1561
e_free_plat_cert:
1562
kfree(plat_certs);
1563
e_free_pdh:
1564
kfree(pdh_cert);
1565
e_free_session:
1566
kfree(session_data);
1567
return ret;
1568
}
1569
1570
/* Userspace wants to query either header or trans length. */
1571
static int
1572
__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
1573
struct kvm_sev_send_update_data *params)
1574
{
1575
struct sev_data_send_update_data data;
1576
int ret;
1577
1578
memset(&data, 0, sizeof(data));
1579
data.handle = to_kvm_sev_info(kvm)->handle;
1580
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
1581
1582
params->hdr_len = data.hdr_len;
1583
params->trans_len = data.trans_len;
1584
1585
if (copy_to_user(u64_to_user_ptr(argp->data), params,
1586
sizeof(struct kvm_sev_send_update_data)))
1587
ret = -EFAULT;
1588
1589
return ret;
1590
}
1591
1592
static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
1593
{
1594
struct sev_data_send_update_data data;
1595
struct kvm_sev_send_update_data params;
1596
void *hdr, *trans_data;
1597
struct page **guest_page;
1598
unsigned long n;
1599
int ret, offset;
1600
1601
if (!sev_guest(kvm))
1602
return -ENOTTY;
1603
1604
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
1605
sizeof(struct kvm_sev_send_update_data)))
1606
return -EFAULT;
1607
1608
/* userspace wants to query either header or trans length */
1609
if (!params.trans_len || !params.hdr_len)
1610
return __sev_send_update_data_query_lengths(kvm, argp, &params);
1611
1612
if (!params.trans_uaddr || !params.guest_uaddr ||
1613
!params.guest_len || !params.hdr_uaddr)
1614
return -EINVAL;
1615
1616
/* Check if we are crossing the page boundary */
1617
offset = params.guest_uaddr & (PAGE_SIZE - 1);
1618
if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
1619
return -EINVAL;
1620
1621
/* Pin guest memory */
1622
guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
1623
PAGE_SIZE, &n, 0);
1624
if (IS_ERR(guest_page))
1625
return PTR_ERR(guest_page);
1626
1627
/* allocate memory for header and transport buffer */
1628
ret = -ENOMEM;
1629
hdr = kzalloc(params.hdr_len, GFP_KERNEL);
1630
if (!hdr)
1631
goto e_unpin;
1632
1633
trans_data = kzalloc(params.trans_len, GFP_KERNEL);
1634
if (!trans_data)
1635
goto e_free_hdr;
1636
1637
memset(&data, 0, sizeof(data));
1638
data.hdr_address = __psp_pa(hdr);
1639
data.hdr_len = params.hdr_len;
1640
data.trans_address = __psp_pa(trans_data);
1641
data.trans_len = params.trans_len;
1642
1643
/* The SEND_UPDATE_DATA command requires C-bit to be always set. */
1644
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
1645
data.guest_address |= sev_me_mask;
1646
data.guest_len = params.guest_len;
1647
data.handle = to_kvm_sev_info(kvm)->handle;
1648
1649
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
1650
1651
if (ret)
1652
goto e_free_trans_data;
1653
1654
/* copy transport buffer to user space */
1655
if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
1656
trans_data, params.trans_len)) {
1657
ret = -EFAULT;
1658
goto e_free_trans_data;
1659
}
1660
1661
/* Copy packet header to userspace. */
1662
if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
1663
params.hdr_len))
1664
ret = -EFAULT;
1665
1666
e_free_trans_data:
1667
kfree(trans_data);
1668
e_free_hdr:
1669
kfree(hdr);
1670
e_unpin:
1671
sev_unpin_memory(kvm, guest_page, n);
1672
1673
return ret;
1674
}
1675
1676
static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
1677
{
1678
struct sev_data_send_finish data;
1679
1680
if (!sev_guest(kvm))
1681
return -ENOTTY;
1682
1683
data.handle = to_kvm_sev_info(kvm)->handle;
1684
return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
1685
}
1686
1687
static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
1688
{
1689
struct sev_data_send_cancel data;
1690
1691
if (!sev_guest(kvm))
1692
return -ENOTTY;
1693
1694
data.handle = to_kvm_sev_info(kvm)->handle;
1695
return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
1696
}
1697
1698
static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
1699
{
1700
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
1701
struct sev_data_receive_start start;
1702
struct kvm_sev_receive_start params;
1703
int *error = &argp->error;
1704
void *session_data;
1705
void *pdh_data;
1706
int ret;
1707
1708
if (!sev_guest(kvm))
1709
return -ENOTTY;
1710
1711
/* Get parameter from the userspace */
1712
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
1713
sizeof(struct kvm_sev_receive_start)))
1714
return -EFAULT;
1715
1716
/* some sanity checks */
1717
if (!params.pdh_uaddr || !params.pdh_len ||
1718
!params.session_uaddr || !params.session_len)
1719
return -EINVAL;
1720
1721
pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
1722
if (IS_ERR(pdh_data))
1723
return PTR_ERR(pdh_data);
1724
1725
session_data = psp_copy_user_blob(params.session_uaddr,
1726
params.session_len);
1727
if (IS_ERR(session_data)) {
1728
ret = PTR_ERR(session_data);
1729
goto e_free_pdh;
1730
}
1731
1732
memset(&start, 0, sizeof(start));
1733
start.handle = params.handle;
1734
start.policy = params.policy;
1735
start.pdh_cert_address = __psp_pa(pdh_data);
1736
start.pdh_cert_len = params.pdh_len;
1737
start.session_address = __psp_pa(session_data);
1738
start.session_len = params.session_len;
1739
1740
/* create memory encryption context */
1741
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
1742
error);
1743
if (ret)
1744
goto e_free_session;
1745
1746
/* Bind ASID to this guest */
1747
ret = sev_bind_asid(kvm, start.handle, error);
1748
if (ret) {
1749
sev_decommission(start.handle);
1750
goto e_free_session;
1751
}
1752
1753
params.handle = start.handle;
1754
if (copy_to_user(u64_to_user_ptr(argp->data),
1755
&params, sizeof(struct kvm_sev_receive_start))) {
1756
ret = -EFAULT;
1757
sev_unbind_asid(kvm, start.handle);
1758
goto e_free_session;
1759
}
1760
1761
sev->handle = start.handle;
1762
sev->fd = argp->sev_fd;
1763
1764
e_free_session:
1765
kfree(session_data);
1766
e_free_pdh:
1767
kfree(pdh_data);
1768
1769
return ret;
1770
}
1771
1772
static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
1773
{
1774
struct kvm_sev_receive_update_data params;
1775
struct sev_data_receive_update_data data;
1776
void *hdr = NULL, *trans = NULL;
1777
struct page **guest_page;
1778
unsigned long n;
1779
int ret, offset;
1780
1781
if (!sev_guest(kvm))
1782
return -EINVAL;
1783
1784
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
1785
sizeof(struct kvm_sev_receive_update_data)))
1786
return -EFAULT;
1787
1788
if (!params.hdr_uaddr || !params.hdr_len ||
1789
!params.guest_uaddr || !params.guest_len ||
1790
!params.trans_uaddr || !params.trans_len)
1791
return -EINVAL;
1792
1793
/* Check if we are crossing the page boundary */
1794
offset = params.guest_uaddr & (PAGE_SIZE - 1);
1795
if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
1796
return -EINVAL;
1797
1798
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
1799
if (IS_ERR(hdr))
1800
return PTR_ERR(hdr);
1801
1802
trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
1803
if (IS_ERR(trans)) {
1804
ret = PTR_ERR(trans);
1805
goto e_free_hdr;
1806
}
1807
1808
memset(&data, 0, sizeof(data));
1809
data.hdr_address = __psp_pa(hdr);
1810
data.hdr_len = params.hdr_len;
1811
data.trans_address = __psp_pa(trans);
1812
data.trans_len = params.trans_len;
1813
1814
/* Pin guest memory */
1815
guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
1816
PAGE_SIZE, &n, FOLL_WRITE);
1817
if (IS_ERR(guest_page)) {
1818
ret = PTR_ERR(guest_page);
1819
goto e_free_trans;
1820
}
1821
1822
/*
1823
* Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
1824
* encrypts the written data with the guest's key, and the cache may
1825
* contain dirty, unencrypted data.
1826
*/
1827
sev_clflush_pages(guest_page, n);
1828
1829
/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
1830
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
1831
data.guest_address |= sev_me_mask;
1832
data.guest_len = params.guest_len;
1833
data.handle = to_kvm_sev_info(kvm)->handle;
1834
1835
ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
1836
&argp->error);
1837
1838
sev_unpin_memory(kvm, guest_page, n);
1839
1840
e_free_trans:
1841
kfree(trans);
1842
e_free_hdr:
1843
kfree(hdr);
1844
1845
return ret;
1846
}
1847
1848
static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
1849
{
1850
struct sev_data_receive_finish data;
1851
1852
if (!sev_guest(kvm))
1853
return -ENOTTY;
1854
1855
data.handle = to_kvm_sev_info(kvm)->handle;
1856
return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
1857
}
1858
1859
static bool is_cmd_allowed_from_mirror(u32 cmd_id)
1860
{
1861
/*
1862
* Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
1863
* active mirror VMs. Also allow the debugging and status commands.
1864
*/
1865
if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
1866
cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
1867
cmd_id == KVM_SEV_DBG_ENCRYPT)
1868
return true;
1869
1870
return false;
1871
}
1872
1873
static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
1874
{
1875
struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
1876
struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
1877
int r = -EBUSY;
1878
1879
if (dst_kvm == src_kvm)
1880
return -EINVAL;
1881
1882
/*
1883
* Bail if these VMs are already involved in a migration to avoid
1884
* deadlock between two VMs trying to migrate to/from each other.
1885
*/
1886
if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
1887
return -EBUSY;
1888
1889
if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
1890
goto release_dst;
1891
1892
r = -EINTR;
1893
if (mutex_lock_killable(&dst_kvm->lock))
1894
goto release_src;
1895
if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
1896
goto unlock_dst;
1897
return 0;
1898
1899
unlock_dst:
1900
mutex_unlock(&dst_kvm->lock);
1901
release_src:
1902
atomic_set_release(&src_sev->migration_in_progress, 0);
1903
release_dst:
1904
atomic_set_release(&dst_sev->migration_in_progress, 0);
1905
return r;
1906
}
1907
1908
static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
1909
{
1910
struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
1911
struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
1912
1913
mutex_unlock(&dst_kvm->lock);
1914
mutex_unlock(&src_kvm->lock);
1915
atomic_set_release(&dst_sev->migration_in_progress, 0);
1916
atomic_set_release(&src_sev->migration_in_progress, 0);
1917
}
1918
1919
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
1920
{
1921
struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
1922
struct kvm_sev_info *src = to_kvm_sev_info(src_kvm);
1923
struct kvm_vcpu *dst_vcpu, *src_vcpu;
1924
struct vcpu_svm *dst_svm, *src_svm;
1925
struct kvm_sev_info *mirror;
1926
unsigned long i;
1927
1928
dst->active = true;
1929
dst->asid = src->asid;
1930
dst->handle = src->handle;
1931
dst->pages_locked = src->pages_locked;
1932
dst->enc_context_owner = src->enc_context_owner;
1933
dst->es_active = src->es_active;
1934
dst->vmsa_features = src->vmsa_features;
1935
1936
src->asid = 0;
1937
src->active = false;
1938
src->handle = 0;
1939
src->pages_locked = 0;
1940
src->enc_context_owner = NULL;
1941
src->es_active = false;
1942
1943
list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
1944
1945
/*
1946
* If this VM has mirrors, "transfer" each mirror's refcount of the
1947
* source to the destination (this KVM). The caller holds a reference
1948
* to the source, so there's no danger of use-after-free.
1949
*/
1950
list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
1951
list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
1952
kvm_get_kvm(dst_kvm);
1953
kvm_put_kvm(src_kvm);
1954
mirror->enc_context_owner = dst_kvm;
1955
}
1956
1957
/*
1958
* If this VM is a mirror, remove the old mirror from the owners list
1959
* and add the new mirror to the list.
1960
*/
1961
if (is_mirroring_enc_context(dst_kvm)) {
1962
struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner);
1963
1964
list_del(&src->mirror_entry);
1965
list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
1966
}
1967
1968
kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) {
1969
dst_svm = to_svm(dst_vcpu);
1970
1971
sev_init_vmcb(dst_svm);
1972
1973
if (!dst->es_active)
1974
continue;
1975
1976
/*
1977
* Note, the source is not required to have the same number of
1978
* vCPUs as the destination when migrating a vanilla SEV VM.
1979
*/
1980
src_vcpu = kvm_get_vcpu(src_kvm, i);
1981
src_svm = to_svm(src_vcpu);
1982
1983
/*
1984
* Transfer VMSA and GHCB state to the destination. Nullify and
1985
* clear source fields as appropriate, as the state now belongs to
1986
* the destination.
1987
*/
1988
memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
1989
dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
1990
dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
1991
dst_vcpu->arch.guest_state_protected = true;
1992
1993
memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
1994
src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
1995
src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
1996
src_vcpu->arch.guest_state_protected = false;
1997
}
1998
}
1999
2000
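/*
* Intra-host migration sanity check: all vCPUs on both VMs must be fully
* created, and for SEV-ES the source must have the same number of vCPUs as
* the destination, with every source vCPU's guest state already protected.
*/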
static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
2001
{
2002
struct kvm_vcpu *src_vcpu;
2003
unsigned long i;
2004
2005
if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
2006
dst->created_vcpus != atomic_read(&dst->online_vcpus))
2007
return -EBUSY;
2008
2009
if (!sev_es_guest(src))
2010
return 0;
2011
2012
if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
2013
return -EINVAL;
2014
2015
kvm_for_each_vcpu(i, src_vcpu, src) {
2016
if (!src_vcpu->arch.guest_state_protected)
2017
return -EINVAL;
2018
}
2019
2020
return 0;
2021
}
2022
2023
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
2024
{
2025
struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm);
2026
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
2027
CLASS(fd, f)(source_fd);
2028
struct kvm *source_kvm;
2029
bool charged = false;
2030
int ret;
2031
2032
if (fd_empty(f))
2033
return -EBADF;
2034
2035
if (!file_is_kvm(fd_file(f)))
2036
return -EBADF;
2037
2038
source_kvm = fd_file(f)->private_data;
2039
ret = sev_lock_two_vms(kvm, source_kvm);
2040
if (ret)
2041
return ret;
2042
2043
if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
2044
sev_guest(kvm) || !sev_guest(source_kvm)) {
2045
ret = -EINVAL;
2046
goto out_unlock;
2047
}
2048
2049
src_sev = to_kvm_sev_info(source_kvm);
2050
2051
dst_sev->misc_cg = get_current_misc_cg();
2052
cg_cleanup_sev = dst_sev;
2053
if (dst_sev->misc_cg != src_sev->misc_cg) {
2054
ret = sev_misc_cg_try_charge(dst_sev);
2055
if (ret)
2056
goto out_dst_cgroup;
2057
charged = true;
2058
}
2059
2060
ret = kvm_lock_all_vcpus(kvm);
2061
if (ret)
2062
goto out_dst_cgroup;
2063
ret = kvm_lock_all_vcpus(source_kvm);
2064
if (ret)
2065
goto out_dst_vcpu;
2066
2067
ret = sev_check_source_vcpus(kvm, source_kvm);
2068
if (ret)
2069
goto out_source_vcpu;
2070
2071
/*
2072
* Allocate a new have_run_cpus for the destination, i.e. don't copy
2073
* the set of CPUs from the source. If a CPU was used to run a vCPU in
2074
* the source VM but is never used for the destination VM, then the CPU
2075
* can only have cached memory that was accessible to the source VM.
2076
*/
2077
if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
2078
ret = -ENOMEM;
2079
goto out_source_vcpu;
2080
}
2081
2082
sev_migrate_from(kvm, source_kvm);
2083
kvm_vm_dead(source_kvm);
2084
cg_cleanup_sev = src_sev;
2085
ret = 0;
2086
2087
out_source_vcpu:
2088
kvm_unlock_all_vcpus(source_kvm);
2089
out_dst_vcpu:
2090
kvm_unlock_all_vcpus(kvm);
2091
out_dst_cgroup:
2092
/* Operates on the source on success, on the destination on failure. */
2093
if (charged)
2094
sev_misc_cg_uncharge(cg_cleanup_sev);
2095
put_misc_cg(cg_cleanup_sev->misc_cg);
2096
cg_cleanup_sev->misc_cg = NULL;
2097
out_unlock:
2098
sev_unlock_two_vms(kvm, source_kvm);
2099
return ret;
2100
}
2101
2102
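/*
* Report SEV device attributes; only the KVM_X86_GRP_SEV group is handled,
* and currently only the set of supported VMSA features is exposed.
*/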
int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
2103
{
2104
if (group != KVM_X86_GRP_SEV)
2105
return -ENXIO;
2106
2107
switch (attr) {
2108
case KVM_X86_SEV_VMSA_FEATURES:
2109
*val = sev_supported_vmsa_features;
2110
return 0;
2111
2112
default:
2113
return -ENXIO;
2114
}
2115
}
2116
2117
/*
2118
* The guest context contains all the information, keys and metadata
2119
* associated with the guest that the firmware tracks to implement SEV
2120
* and SNP features. The firmware stores the guest context in a
* hypervisor-provided page via the SNP_GCTX_CREATE command.
2122
*/
2123
static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
2124
{
2125
struct sev_data_snp_addr data = {};
2126
void *context;
2127
int rc;
2128
2129
/* Allocate memory for context page */
2130
context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
2131
if (!context)
2132
return NULL;
2133
2134
data.address = __psp_pa(context);
2135
rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
2136
if (rc) {
2137
pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
2138
rc, argp->error);
2139
snp_free_firmware_page(context);
2140
return NULL;
2141
}
2142
2143
return context;
2144
}
2145
2146
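/* Bind (activate) the guest's SEV ASID to its SNP guest context. */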
static int snp_bind_asid(struct kvm *kvm, int *error)
2147
{
2148
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2149
struct sev_data_snp_activate data = {0};
2150
2151
data.gctx_paddr = __psp_pa(sev->snp_context);
2152
data.asid = sev_get_asid(kvm);
2153
return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
2154
}
2155
2156
static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
2157
{
2158
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2159
struct sev_data_snp_launch_start start = {0};
2160
struct kvm_sev_snp_launch_start params;
2161
int rc;
2162
2163
if (!sev_snp_guest(kvm))
2164
return -ENOTTY;
2165
2166
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2167
return -EFAULT;
2168
2169
/* Don't allow userspace to allocate memory for more than 1 SNP context. */
2170
if (sev->snp_context)
2171
return -EINVAL;
2172
2173
if (params.flags)
2174
return -EINVAL;
2175
2176
if (params.policy & ~SNP_POLICY_MASK_VALID)
2177
return -EINVAL;
2178
2179
/* Check for policy bits that must be set */
2180
if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
2181
return -EINVAL;
2182
2183
sev->policy = params.policy;
2184
2185
sev->snp_context = snp_context_create(kvm, argp);
2186
if (!sev->snp_context)
2187
return -ENOTTY;
2188
2189
start.gctx_paddr = __psp_pa(sev->snp_context);
2190
start.policy = params.policy;
2191
memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
2192
rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
2193
if (rc) {
2194
pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
2195
__func__, rc);
2196
goto e_free_context;
2197
}
2198
2199
sev->fd = argp->sev_fd;
2200
rc = snp_bind_asid(kvm, &argp->error);
2201
if (rc) {
2202
pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
2203
__func__, rc);
2204
goto e_free_context;
2205
}
2206
2207
return 0;
2208
2209
e_free_context:
2210
snp_decommission_context(kvm);
2211
2212
return rc;
2213
}
2214
2215
struct sev_gmem_populate_args {
2216
__u8 type;
2217
int sev_fd;
2218
int fw_error;
2219
};
2220
2221
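/*
* Callback for kvm_gmem_populate(): copy the initial payload (if any) into
* each guest_memfd page, transition the page to a private (guest-owned) RMP
* entry, and issue SNP_LAUNCH_UPDATE to add it to the initial guest image.
* On failure, pages that were already made private are restored to shared.
*/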
static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
2222
void __user *src, int order, void *opaque)
2223
{
2224
struct sev_gmem_populate_args *sev_populate_args = opaque;
2225
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2226
int n_private = 0, ret, i;
2227
int npages = (1 << order);
2228
gfn_t gfn;
2229
2230
if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
2231
return -EINVAL;
2232
2233
for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
2234
struct sev_data_snp_launch_update fw_args = {0};
2235
bool assigned = false;
2236
int level;
2237
2238
ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
2239
if (ret || assigned) {
2240
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
2241
__func__, gfn, ret, assigned);
2242
ret = ret ? -EINVAL : -EEXIST;
2243
goto err;
2244
}
2245
2246
if (src) {
2247
void *vaddr = kmap_local_pfn(pfn + i);
2248
2249
if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
2250
ret = -EFAULT;
2251
goto err;
2252
}
2253
kunmap_local(vaddr);
2254
}
2255
2256
ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
2257
sev_get_asid(kvm), true);
2258
if (ret)
2259
goto err;
2260
2261
n_private++;
2262
2263
fw_args.gctx_paddr = __psp_pa(sev->snp_context);
2264
fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
2265
fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
2266
fw_args.page_type = sev_populate_args->type;
2267
2268
ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
2269
&fw_args, &sev_populate_args->fw_error);
2270
if (ret)
2271
goto fw_err;
2272
}
2273
2274
return 0;
2275
2276
fw_err:
2277
/*
2278
* If the firmware command failed, handle the reclaim and cleanup of that
* PFN specially vs. prior pages, which can be cleaned up below without
2280
* needing to reclaim in advance.
2281
*
2282
* Additionally, when invalid CPUID function entries are detected,
2283
* firmware writes the expected values into the page and leaves it
2284
* unencrypted so it can be used for debugging and error-reporting.
2285
*
2286
* Copy this page back into the source buffer so userspace can use it to
* determine which CPUID leaves/fields failed CPUID validation.
2289
*/
2290
if (!snp_page_reclaim(kvm, pfn + i) &&
2291
sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
2292
sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
2293
void *vaddr = kmap_local_pfn(pfn + i);
2294
2295
if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
2296
pr_debug("Failed to write CPUID page back to userspace\n");
2297
2298
kunmap_local(vaddr);
2299
}
2300
2301
/* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
2302
n_private--;
2303
2304
err:
2305
pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
2306
__func__, ret, sev_populate_args->fw_error, n_private);
2307
for (i = 0; i < n_private; i++)
2308
kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
2309
2310
return ret;
2311
}
2312
2313
static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
2314
{
2315
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2316
struct sev_gmem_populate_args sev_populate_args = {0};
2317
struct kvm_sev_snp_launch_update params;
2318
struct kvm_memory_slot *memslot;
2319
long npages, count;
2320
void __user *src;
2321
int ret = 0;
2322
2323
if (!sev_snp_guest(kvm) || !sev->snp_context)
2324
return -EINVAL;
2325
2326
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2327
return -EFAULT;
2328
2329
pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
2330
params.gfn_start, params.len, params.type, params.flags);
2331
2332
if (!PAGE_ALIGNED(params.len) || params.flags ||
2333
(params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
2334
params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
2335
params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
2336
params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
2337
params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
2338
return -EINVAL;
2339
2340
npages = params.len / PAGE_SIZE;
2341
2342
/*
2343
* For each GFN that's being prepared as part of the initial guest
2344
* state, the following pre-conditions are verified:
2345
*
2346
* 1) The backing memslot is a valid private memslot.
2347
* 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
2348
* beforehand.
2349
* 3) The PFN of the guest_memfd has not already been set to private
2350
* in the RMP table.
2351
*
2352
* The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
2353
* faults if there's a race between a fault and an attribute update via
2354
* KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
2355
* here. However, kvm->slots_lock guards against both this as well as
2356
* concurrent memslot updates occurring while these checks are being
2357
* performed, so use that here to make it easier to reason about the
2358
* initial expected state and better guard against unexpected
2359
* situations.
2360
*/
2361
mutex_lock(&kvm->slots_lock);
2362
2363
memslot = gfn_to_memslot(kvm, params.gfn_start);
2364
if (!kvm_slot_can_be_private(memslot)) {
2365
ret = -EINVAL;
2366
goto out;
2367
}
2368
2369
sev_populate_args.sev_fd = argp->sev_fd;
2370
sev_populate_args.type = params.type;
2371
src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
2372
2373
count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
2374
sev_gmem_post_populate, &sev_populate_args);
2375
if (count < 0) {
2376
argp->error = sev_populate_args.fw_error;
2377
pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
2378
__func__, count, argp->error);
2379
ret = -EIO;
2380
} else {
2381
params.gfn_start += count;
2382
params.len -= count * PAGE_SIZE;
2383
if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
2384
params.uaddr += count * PAGE_SIZE;
2385
2386
ret = 0;
2387
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
2388
ret = -EFAULT;
2389
}
2390
2391
out:
2392
mutex_unlock(&kvm->slots_lock);
2393
2394
return ret;
2395
}
2396
2397
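/*
* For each vCPU, sync the current register state into its VMSA, transition
* the VMSA page to a firmware page in the RMP table, and issue
* SNP_LAUNCH_UPDATE with the VMSA page type to encrypt and measure it.
*/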
static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
2398
{
2399
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2400
struct sev_data_snp_launch_update data = {};
2401
struct kvm_vcpu *vcpu;
2402
unsigned long i;
2403
int ret;
2404
2405
data.gctx_paddr = __psp_pa(sev->snp_context);
2406
data.page_type = SNP_PAGE_TYPE_VMSA;
2407
2408
kvm_for_each_vcpu(i, vcpu, kvm) {
2409
struct vcpu_svm *svm = to_svm(vcpu);
2410
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
2411
2412
ret = sev_es_sync_vmsa(svm);
2413
if (ret)
2414
return ret;
2415
2416
/* Transition the VMSA page to a firmware state. */
2417
ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
2418
if (ret)
2419
return ret;
2420
2421
/* Issue the SNP command to encrypt the VMSA */
2422
data.address = __sme_pa(svm->sev_es.vmsa);
2423
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
2424
&data, &argp->error);
2425
if (ret) {
2426
snp_page_reclaim(kvm, pfn);
2427
2428
return ret;
2429
}
2430
2431
svm->vcpu.arch.guest_state_protected = true;
2432
/*
2433
* SEV-ES (and thus SNP) guest mandates LBR Virtualization to
2434
* be _always_ ON. Enable it only after setting
2435
* guest_state_protected because KVM_SET_MSRS allows dynamic
2436
* toggling of LBRV (for performance reasons) on write access to
2437
* MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
2438
*/
2439
svm_enable_lbrv(vcpu);
2440
}
2441
2442
return 0;
2443
}
2444
2445
static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
2446
{
2447
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2448
struct kvm_sev_snp_launch_finish params;
2449
struct sev_data_snp_launch_finish *data;
2450
void *id_block = NULL, *id_auth = NULL;
2451
int ret;
2452
2453
if (!sev_snp_guest(kvm))
2454
return -ENOTTY;
2455
2456
if (!sev->snp_context)
2457
return -EINVAL;
2458
2459
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
2460
return -EFAULT;
2461
2462
if (params.flags)
2463
return -EINVAL;
2464
2465
/* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
2466
ret = snp_launch_update_vmsa(kvm, argp);
2467
if (ret)
2468
return ret;
2469
2470
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
2471
if (!data)
2472
return -ENOMEM;
2473
2474
if (params.id_block_en) {
2475
id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
2476
if (IS_ERR(id_block)) {
2477
ret = PTR_ERR(id_block);
2478
goto e_free;
2479
}
2480
2481
data->id_block_en = 1;
2482
data->id_block_paddr = __sme_pa(id_block);
2483
2484
id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
2485
if (IS_ERR(id_auth)) {
2486
ret = PTR_ERR(id_auth);
2487
goto e_free_id_block;
2488
}
2489
2490
data->id_auth_paddr = __sme_pa(id_auth);
2491
2492
if (params.auth_key_en)
2493
data->auth_key_en = 1;
2494
}
2495
2496
data->vcek_disabled = params.vcek_disabled;
2497
2498
memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
2499
data->gctx_paddr = __psp_pa(sev->snp_context);
2500
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
2501
2502
/*
2503
* Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
2504
* can be given to the guest simply by marking the RMP entry as private.
2505
* This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
2506
*/
2507
if (!ret)
2508
kvm->arch.pre_fault_allowed = true;
2509
2510
kfree(id_auth);
2511
2512
e_free_id_block:
2513
kfree(id_block);
2514
2515
e_free:
2516
kfree(data);
2517
2518
return ret;
2519
}
2520
2521
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
2522
{
2523
struct kvm_sev_cmd sev_cmd;
2524
int r;
2525
2526
if (!sev_enabled)
2527
return -ENOTTY;
2528
2529
if (!argp)
2530
return 0;
2531
2532
if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
2533
return -EFAULT;
2534
2535
mutex_lock(&kvm->lock);
2536
2537
/* Only the enc_context_owner handles some memory enc operations. */
2538
if (is_mirroring_enc_context(kvm) &&
2539
!is_cmd_allowed_from_mirror(sev_cmd.id)) {
2540
r = -EINVAL;
2541
goto out;
2542
}
2543
2544
/*
2545
* Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
2546
* allow the use of SNP-specific commands.
2547
*/
2548
if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
2549
r = -EPERM;
2550
goto out;
2551
}
2552
2553
switch (sev_cmd.id) {
2554
case KVM_SEV_ES_INIT:
2555
if (!sev_es_enabled) {
2556
r = -ENOTTY;
2557
goto out;
2558
}
2559
fallthrough;
2560
case KVM_SEV_INIT:
2561
r = sev_guest_init(kvm, &sev_cmd);
2562
break;
2563
case KVM_SEV_INIT2:
2564
r = sev_guest_init2(kvm, &sev_cmd);
2565
break;
2566
case KVM_SEV_LAUNCH_START:
2567
r = sev_launch_start(kvm, &sev_cmd);
2568
break;
2569
case KVM_SEV_LAUNCH_UPDATE_DATA:
2570
r = sev_launch_update_data(kvm, &sev_cmd);
2571
break;
2572
case KVM_SEV_LAUNCH_UPDATE_VMSA:
2573
r = sev_launch_update_vmsa(kvm, &sev_cmd);
2574
break;
2575
case KVM_SEV_LAUNCH_MEASURE:
2576
r = sev_launch_measure(kvm, &sev_cmd);
2577
break;
2578
case KVM_SEV_LAUNCH_FINISH:
2579
r = sev_launch_finish(kvm, &sev_cmd);
2580
break;
2581
case KVM_SEV_GUEST_STATUS:
2582
r = sev_guest_status(kvm, &sev_cmd);
2583
break;
2584
case KVM_SEV_DBG_DECRYPT:
2585
r = sev_dbg_crypt(kvm, &sev_cmd, true);
2586
break;
2587
case KVM_SEV_DBG_ENCRYPT:
2588
r = sev_dbg_crypt(kvm, &sev_cmd, false);
2589
break;
2590
case KVM_SEV_LAUNCH_SECRET:
2591
r = sev_launch_secret(kvm, &sev_cmd);
2592
break;
2593
case KVM_SEV_GET_ATTESTATION_REPORT:
2594
r = sev_get_attestation_report(kvm, &sev_cmd);
2595
break;
2596
case KVM_SEV_SEND_START:
2597
r = sev_send_start(kvm, &sev_cmd);
2598
break;
2599
case KVM_SEV_SEND_UPDATE_DATA:
2600
r = sev_send_update_data(kvm, &sev_cmd);
2601
break;
2602
case KVM_SEV_SEND_FINISH:
2603
r = sev_send_finish(kvm, &sev_cmd);
2604
break;
2605
case KVM_SEV_SEND_CANCEL:
2606
r = sev_send_cancel(kvm, &sev_cmd);
2607
break;
2608
case KVM_SEV_RECEIVE_START:
2609
r = sev_receive_start(kvm, &sev_cmd);
2610
break;
2611
case KVM_SEV_RECEIVE_UPDATE_DATA:
2612
r = sev_receive_update_data(kvm, &sev_cmd);
2613
break;
2614
case KVM_SEV_RECEIVE_FINISH:
2615
r = sev_receive_finish(kvm, &sev_cmd);
2616
break;
2617
case KVM_SEV_SNP_LAUNCH_START:
2618
r = snp_launch_start(kvm, &sev_cmd);
2619
break;
2620
case KVM_SEV_SNP_LAUNCH_UPDATE:
2621
r = snp_launch_update(kvm, &sev_cmd);
2622
break;
2623
case KVM_SEV_SNP_LAUNCH_FINISH:
2624
r = snp_launch_finish(kvm, &sev_cmd);
2625
break;
2626
default:
2627
r = -EINVAL;
2628
goto out;
2629
}
2630
2631
if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
2632
r = -EFAULT;
2633
2634
out:
2635
mutex_unlock(&kvm->lock);
2636
return r;
2637
}
2638
2639
int sev_mem_enc_register_region(struct kvm *kvm,
2640
struct kvm_enc_region *range)
2641
{
2642
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2643
struct enc_region *region;
2644
int ret = 0;
2645
2646
if (!sev_guest(kvm))
2647
return -ENOTTY;
2648
2649
/* If kvm is mirroring encryption context it isn't responsible for it */
2650
if (is_mirroring_enc_context(kvm))
2651
return -EINVAL;
2652
2653
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
2654
return -EINVAL;
2655
2656
region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
2657
if (!region)
2658
return -ENOMEM;
2659
2660
mutex_lock(&kvm->lock);
2661
region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
2662
FOLL_WRITE | FOLL_LONGTERM);
2663
if (IS_ERR(region->pages)) {
2664
ret = PTR_ERR(region->pages);
2665
mutex_unlock(&kvm->lock);
2666
goto e_free;
2667
}
2668
2669
/*
2670
* The guest may change the memory encryption attribute from C=0 -> C=1
2671
* or vice versa for this memory range. Let's make sure caches are
* flushed to ensure that guest data gets written into memory with the
* correct C-bit. Note, this must be done before dropping kvm->lock,
2674
* as region and its array of pages can be freed by a different task
2675
* once kvm->lock is released.
2676
*/
2677
sev_clflush_pages(region->pages, region->npages);
2678
2679
region->uaddr = range->addr;
2680
region->size = range->size;
2681
2682
list_add_tail(&region->list, &sev->regions_list);
2683
mutex_unlock(&kvm->lock);
2684
2685
return ret;
2686
2687
e_free:
2688
kfree(region);
2689
return ret;
2690
}
2691
2692
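/* Find the registered region that exactly matches the given address range. */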
static struct enc_region *
2693
find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
2694
{
2695
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2696
struct list_head *head = &sev->regions_list;
2697
struct enc_region *i;
2698
2699
list_for_each_entry(i, head, list) {
2700
if (i->uaddr == range->addr &&
2701
i->size == range->size)
2702
return i;
2703
}
2704
2705
return NULL;
2706
}
2707
2708
static void __unregister_enc_region_locked(struct kvm *kvm,
2709
struct enc_region *region)
2710
{
2711
sev_unpin_memory(kvm, region->pages, region->npages);
2712
list_del(&region->list);
2713
kfree(region);
2714
}
2715
2716
int sev_mem_enc_unregister_region(struct kvm *kvm,
2717
struct kvm_enc_region *range)
2718
{
2719
struct enc_region *region;
2720
int ret;
2721
2722
/* If kvm is mirroring encryption context it isn't responsible for it */
2723
if (is_mirroring_enc_context(kvm))
2724
return -EINVAL;
2725
2726
mutex_lock(&kvm->lock);
2727
2728
if (!sev_guest(kvm)) {
2729
ret = -ENOTTY;
2730
goto failed;
2731
}
2732
2733
region = find_enc_region(kvm, range);
2734
if (!region) {
2735
ret = -EINVAL;
2736
goto failed;
2737
}
2738
2739
sev_writeback_caches(kvm);
2740
2741
__unregister_enc_region_locked(kvm, region);
2742
2743
mutex_unlock(&kvm->lock);
2744
return 0;
2745
2746
failed:
2747
mutex_unlock(&kvm->lock);
2748
return ret;
2749
}
2750
2751
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
2752
{
2753
CLASS(fd, f)(source_fd);
2754
struct kvm *source_kvm;
2755
struct kvm_sev_info *source_sev, *mirror_sev;
2756
int ret;
2757
2758
if (fd_empty(f))
2759
return -EBADF;
2760
2761
if (!file_is_kvm(fd_file(f)))
2762
return -EBADF;
2763
2764
source_kvm = fd_file(f)->private_data;
2765
ret = sev_lock_two_vms(kvm, source_kvm);
2766
if (ret)
2767
return ret;
2768
2769
/*
2770
* Mirrors of mirrors should work, but let's not get silly. Also
2771
* disallow out-of-band SEV/SEV-ES init if the target is already an
2772
* SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
2773
* created after SEV/SEV-ES initialization, e.g. to init intercepts.
2774
*/
2775
if (sev_guest(kvm) || !sev_guest(source_kvm) ||
2776
is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
2777
ret = -EINVAL;
2778
goto e_unlock;
2779
}
2780
2781
mirror_sev = to_kvm_sev_info(kvm);
2782
if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
2783
ret = -ENOMEM;
2784
goto e_unlock;
2785
}
2786
2787
/*
2788
* The mirror kvm holds an enc_context_owner ref so its asid can't
2789
* disappear until we're done with it.
2790
*/
2791
source_sev = to_kvm_sev_info(source_kvm);
2792
kvm_get_kvm(source_kvm);
2793
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
2794
2795
/* Set enc_context_owner and copy its encryption context over */
2796
mirror_sev->enc_context_owner = source_kvm;
2797
mirror_sev->active = true;
2798
mirror_sev->asid = source_sev->asid;
2799
mirror_sev->fd = source_sev->fd;
2800
mirror_sev->es_active = source_sev->es_active;
2801
mirror_sev->need_init = false;
2802
mirror_sev->handle = source_sev->handle;
2803
INIT_LIST_HEAD(&mirror_sev->regions_list);
2804
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
2805
ret = 0;
2806
2807
/*
2808
* Do not copy ap_jump_table: the mirror does not share the same KVM
* contexts as the original, and the two may have different memory views.
2811
*/
2812
2813
e_unlock:
2814
sev_unlock_two_vms(kvm, source_kvm);
2815
return ret;
2816
}
2817
2818
static int snp_decommission_context(struct kvm *kvm)
2819
{
2820
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2821
struct sev_data_snp_addr data = {};
2822
int ret;
2823
2824
/* If context is not created then do nothing */
2825
if (!sev->snp_context)
2826
return 0;
2827
2828
/* Do the decommission, which will unbind the ASID from the SNP context */
2829
data.address = __sme_pa(sev->snp_context);
2830
down_write(&sev_deactivate_lock);
2831
ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
2832
up_write(&sev_deactivate_lock);
2833
2834
if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
2835
return ret;
2836
2837
snp_free_firmware_page(sev->snp_context);
2838
sev->snp_context = NULL;
2839
2840
return 0;
2841
}
2842
2843
void sev_vm_destroy(struct kvm *kvm)
2844
{
2845
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
2846
struct list_head *head = &sev->regions_list;
2847
struct list_head *pos, *q;
2848
2849
if (!sev_guest(kvm))
2850
return;
2851
2852
WARN_ON(!list_empty(&sev->mirror_vms));
2853
2854
free_cpumask_var(sev->have_run_cpus);
2855
2856
/*
2857
* If this is a mirror VM, remove it from the owner's list of mirrors
2858
* and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
2859
* Note, mirror VMs don't support registering encrypted regions.
2860
*/
2861
if (is_mirroring_enc_context(kvm)) {
2862
struct kvm *owner_kvm = sev->enc_context_owner;
2863
2864
mutex_lock(&owner_kvm->lock);
2865
list_del(&sev->mirror_entry);
2866
mutex_unlock(&owner_kvm->lock);
2867
kvm_put_kvm(owner_kvm);
2868
return;
2869
}
2870
2871
2872
/*
2873
* If userspace was terminated before unregistering the memory regions,
* unpin all of the registered memory.
2875
*/
2876
if (!list_empty(head)) {
2877
list_for_each_safe(pos, q, head) {
2878
__unregister_enc_region_locked(kvm,
2879
list_entry(pos, struct enc_region, list));
2880
cond_resched();
2881
}
2882
}
2883
2884
if (sev_snp_guest(kvm)) {
2885
snp_guest_req_cleanup(kvm);
2886
2887
/*
2888
* Decommission handles unbinding of the ASID. If it fails for
2889
* some unexpected reason, just leak the ASID.
2890
*/
2891
if (snp_decommission_context(kvm))
2892
return;
2893
} else {
2894
sev_unbind_asid(kvm, sev->handle);
2895
}
2896
2897
sev_asid_free(sev);
2898
}
2899
2900
void __init sev_set_cpu_caps(void)
2901
{
2902
if (sev_enabled) {
2903
kvm_cpu_cap_set(X86_FEATURE_SEV);
2904
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
2905
}
2906
if (sev_es_enabled) {
2907
kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
2908
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
2909
}
2910
if (sev_snp_enabled) {
2911
kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
2912
kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
2913
}
2914
}
2915
2916
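/*
* Query SNP_PLATFORM_STATUS to check whether SNP has actually been
* initialized in the firmware; used to gate KVM's SNP support.
*/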
static bool is_sev_snp_initialized(void)
2917
{
2918
struct sev_user_data_snp_status *status;
2919
struct sev_data_snp_addr buf;
2920
bool initialized = false;
2921
int ret, error = 0;
2922
2923
status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
2924
if (!status)
2925
return false;
2926
2927
buf.address = __psp_pa(status);
2928
ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
2929
if (ret) {
2930
pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
2931
ret, error, error);
2932
goto out;
2933
}
2934
2935
initialized = !!status->state;
2936
2937
out:
2938
snp_free_firmware_page(status);
2939
2940
return initialized;
2941
}
2942
2943
void __init sev_hardware_setup(void)
2944
{
2945
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
2946
struct sev_platform_init_args init_args = {0};
2947
bool sev_snp_supported = false;
2948
bool sev_es_supported = false;
2949
bool sev_supported = false;
2950
2951
if (!sev_enabled || !npt_enabled || !nrips)
2952
goto out;
2953
2954
/*
2955
* SEV must obviously be supported in hardware. Sanity check that the
2956
* CPU supports decode assists, which is mandatory for SEV guests to
2957
* support instruction emulation. Ditto for flushing by ASID, as SEV
2958
* guests are bound to a single ASID, i.e. KVM can't rotate to a new
2959
* ASID to effect a TLB flush.
2960
*/
2961
if (!boot_cpu_has(X86_FEATURE_SEV) ||
2962
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
2963
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
2964
goto out;
2965
2966
/*
2967
* The kernel's initcall infrastructure lacks the ability to express
2968
* dependencies between initcalls, whereas the modules infrastructure
2969
* automatically handles dependencies via symbol loading. Ensure the
2970
* PSP SEV driver is initialized before proceeding if KVM is built-in,
2971
* as the dependency isn't handled by the initcall infrastructure.
2972
*/
2973
if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init())
2974
goto out;
2975
2976
/* Retrieve SEV CPUID information */
2977
cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
2978
2979
/* Set encryption bit location for SEV-ES guests */
2980
sev_enc_bit = ebx & 0x3f;
2981
2982
/* Maximum number of encrypted guests supported simultaneously */
2983
max_sev_asid = ecx;
2984
if (!max_sev_asid)
2985
goto out;
2986
2987
/* Minimum ASID value that should be used for SEV guest */
2988
min_sev_asid = edx;
2989
sev_me_mask = 1UL << (ebx & 0x3f);
2990
2991
/*
2992
* Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
2993
* even though it's never used, so that the bitmap is indexed by the
2994
* actual ASID.
2995
*/
2996
nr_asids = max_sev_asid + 1;
2997
sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
2998
if (!sev_asid_bitmap)
2999
goto out;
3000
3001
sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
3002
if (!sev_reclaim_asid_bitmap) {
3003
bitmap_free(sev_asid_bitmap);
3004
sev_asid_bitmap = NULL;
3005
goto out;
3006
}
3007
3008
if (min_sev_asid <= max_sev_asid) {
3009
sev_asid_count = max_sev_asid - min_sev_asid + 1;
3010
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
3011
}
3012
sev_supported = true;
3013
3014
/* SEV-ES support requested? */
3015
if (!sev_es_enabled)
3016
goto out;
3017
3018
/*
3019
* SEV-ES requires MMIO caching as KVM doesn't have access to the guest
3020
* instruction stream, i.e. can't emulate in response to a #NPF and
3021
* instead relies on #NPF(RSVD) being reflected into the guest as #VC
3022
* (the guest can then do a #VMGEXIT to request MMIO emulation).
3023
*/
3024
if (!enable_mmio_caching)
3025
goto out;
3026
3027
/* Does the CPU support SEV-ES? */
3028
if (!boot_cpu_has(X86_FEATURE_SEV_ES))
3029
goto out;
3030
3031
if (!lbrv) {
3032
WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
3033
"LBRV must be present for SEV-ES support");
3034
goto out;
3035
}
3036
3037
/* Has the system been allocated ASIDs for SEV-ES? */
3038
if (min_sev_asid == 1)
3039
goto out;
3040
3041
sev_es_asid_count = min_sev_asid - 1;
3042
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
3043
sev_es_supported = true;
3044
sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
3045
3046
out:
3047
if (sev_enabled) {
3048
init_args.probe = true;
3049
if (sev_platform_init(&init_args))
3050
sev_supported = sev_es_supported = sev_snp_supported = false;
3051
else if (sev_snp_supported)
3052
sev_snp_supported = is_sev_snp_initialized();
3053
}
3054
3055
if (boot_cpu_has(X86_FEATURE_SEV))
3056
pr_info("SEV %s (ASIDs %u - %u)\n",
3057
sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
3058
"unusable" :
3059
"disabled",
3060
min_sev_asid, max_sev_asid);
3061
if (boot_cpu_has(X86_FEATURE_SEV_ES))
3062
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
3063
str_enabled_disabled(sev_es_supported),
3064
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
3065
if (boot_cpu_has(X86_FEATURE_SEV_SNP))
3066
pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
3067
str_enabled_disabled(sev_snp_supported),
3068
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
3069
3070
sev_enabled = sev_supported;
3071
sev_es_enabled = sev_es_supported;
3072
sev_snp_enabled = sev_snp_supported;
3073
3074
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
3075
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
3076
sev_es_debug_swap_enabled = false;
3077
3078
sev_supported_vmsa_features = 0;
3079
if (sev_es_debug_swap_enabled)
3080
sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
3081
}
3082
3083
void sev_hardware_unsetup(void)
3084
{
3085
if (!sev_enabled)
3086
return;
3087
3088
/* No need to take sev_bitmap_lock, all VMs have been destroyed. */
3089
sev_flush_asids(1, max_sev_asid);
3090
3091
bitmap_free(sev_asid_bitmap);
3092
bitmap_free(sev_reclaim_asid_bitmap);
3093
3094
misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
3095
misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
3096
3097
sev_platform_shutdown();
3098
}
3099
3100
int sev_cpu_init(struct svm_cpu_data *sd)
3101
{
3102
if (!sev_enabled)
3103
return 0;
3104
3105
sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
3106
if (!sd->sev_vmcbs)
3107
return -ENOMEM;
3108
3109
return 0;
3110
}
3111
3112
/*
3113
* Pages used by hardware to hold guest encrypted state must be flushed before
3114
* returning them to the system.
3115
*/
3116
static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
3117
{
3118
unsigned int asid = sev_get_asid(vcpu->kvm);
3119
3120
/*
3121
* Note! The address must be a kernel address, as regular page walk
3122
* checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
3123
* address is non-deterministic and unsafe. This function deliberately
3124
* takes a pointer to deter passing in a user address.
3125
*/
3126
unsigned long addr = (unsigned long)va;
3127
3128
/*
3129
* If the CPU enforces cache coherency for encrypted mappings of the
* same physical page, use CLFLUSHOPT instead. NOTE: a cache flush is
* still needed in order to work properly with DMA devices.
3132
*/
3133
if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
3134
clflush_cache_range(va, PAGE_SIZE);
3135
return;
3136
}
3137
3138
/*
3139
* VM Page Flush takes a host virtual address and a guest ASID. Fall
3140
* back to full writeback of caches if this faults so as not to make
3141
* any problems worse by leaving stale encrypted data in the cache.
3142
*/
3143
if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
3144
goto do_sev_writeback_caches;
3145
3146
return;
3147
3148
do_sev_writeback_caches:
3149
sev_writeback_caches(vcpu->kvm);
3150
}
3151
3152
void sev_guest_memory_reclaimed(struct kvm *kvm)
3153
{
3154
/*
3155
* With SNP+gmem, private/encrypted memory is unreachable via the
3156
* hva-based mmu notifiers, i.e. these events are explicitly scoped to
3157
* shared pages, where there's no need to flush caches.
3158
*/
3159
if (!sev_guest(kvm) || sev_snp_guest(kvm))
3160
return;
3161
3162
sev_writeback_caches(kvm);
3163
}
3164
3165
void sev_free_vcpu(struct kvm_vcpu *vcpu)
3166
{
3167
struct vcpu_svm *svm;
3168
3169
if (!sev_es_guest(vcpu->kvm))
3170
return;
3171
3172
svm = to_svm(vcpu);
3173
3174
/*
3175
* If it's an SNP guest, then the VMSA was marked in the RMP table as
3176
* a guest-owned page. Transition the page to hypervisor state before
3177
* releasing it back to the system.
3178
*/
3179
if (sev_snp_guest(vcpu->kvm)) {
3180
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
3181
3182
if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
3183
goto skip_vmsa_free;
3184
}
3185
3186
if (vcpu->arch.guest_state_protected)
3187
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
3188
3189
__free_page(virt_to_page(svm->sev_es.vmsa));
3190
3191
skip_vmsa_free:
3192
if (svm->sev_es.ghcb_sa_free)
3193
kvfree(svm->sev_es.ghcb_sa);
3194
}
3195
3196
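/*
* Reassemble the 64-bit GHCB sw_exit_code from the split hi/lo exit code
* fields stashed in the VMCB control area.
*/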
static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
3197
{
3198
return (((u64)control->exit_code_hi) << 32) | control->exit_code;
3199
}
3200
3201
static void dump_ghcb(struct vcpu_svm *svm)
3202
{
3203
struct vmcb_control_area *control = &svm->vmcb->control;
3204
unsigned int nbits;
3205
3206
/* Re-use the dump_invalid_vmcb module parameter */
3207
if (!dump_invalid_vmcb) {
3208
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3209
return;
3210
}
3211
3212
nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
3213
3214
/*
3215
* Print KVM's snapshot of the GHCB values that were (unsuccessfully)
3216
* used to handle the exit. If the guest has since modified the GHCB
3217
* itself, dumping the raw GHCB won't help debug why KVM was unable to
3218
* handle the VMGEXIT that KVM observed.
3219
*/
3220
pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
3221
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
3222
kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
3223
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
3224
control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
3225
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
3226
control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
3227
pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
3228
svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
3229
pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
3230
}
3231
3232
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
3233
{
3234
struct kvm_vcpu *vcpu = &svm->vcpu;
3235
struct ghcb *ghcb = svm->sev_es.ghcb;
3236
3237
/*
3238
* The GHCB protocol so far allows for the following data
3239
* to be returned:
3240
* GPRs RAX, RBX, RCX, RDX
3241
*
3242
* Copy their values, even if they may not have been written during the
3243
* VM-Exit. It's the guest's responsibility to not consume random data.
3244
*/
3245
ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
3246
ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
3247
ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
3248
ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
3249
}
3250
3251
static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
3252
{
3253
struct vmcb_control_area *control = &svm->vmcb->control;
3254
struct kvm_vcpu *vcpu = &svm->vcpu;
3255
struct ghcb *ghcb = svm->sev_es.ghcb;
3256
u64 exit_code;
3257
3258
/*
3259
* The GHCB protocol so far allows for the following data
3260
* to be supplied:
3261
* GPRs RAX, RBX, RCX, RDX
3262
* XCR0
3263
* CPL
3264
*
3265
* VMMCALL allows the guest to provide extra registers. KVM also
3266
* expects RSI for hypercalls, so include that, too.
3267
*
3268
* Copy their values to the appropriate location if supplied.
3269
*/
3270
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
3271
3272
BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
3273
memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
3274
3275
vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
3276
vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
3277
vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
3278
vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
3279
vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);
3280
3281
svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);
3282
3283
if (kvm_ghcb_xcr0_is_valid(svm)) {
3284
vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
3285
vcpu->arch.cpuid_dynamic_bits_dirty = true;
3286
}
3287
3288
/* Copy the GHCB exit information into the VMCB fields */
3289
exit_code = ghcb_get_sw_exit_code(ghcb);
3290
control->exit_code = lower_32_bits(exit_code);
3291
control->exit_code_hi = upper_32_bits(exit_code);
3292
control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
3293
control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
3294
svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);
3295
3296
/* Clear the valid entries fields */
3297
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
3298
}
3299
3300
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
3301
{
3302
struct vmcb_control_area *control = &svm->vmcb->control;
3303
struct kvm_vcpu *vcpu = &svm->vcpu;
3304
u64 exit_code;
3305
u64 reason;
3306
3307
/*
3308
* Retrieve the exit code now even though it may not be marked valid
3309
* as it could help with debugging.
3310
*/
3311
exit_code = kvm_ghcb_get_sw_exit_code(control);
3312
3313
/* Only GHCB Usage code 0 is supported */
3314
if (svm->sev_es.ghcb->ghcb_usage) {
3315
reason = GHCB_ERR_INVALID_USAGE;
3316
goto vmgexit_err;
3317
}
3318
3319
reason = GHCB_ERR_MISSING_INPUT;
3320
3321
if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
3322
!kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
3323
!kvm_ghcb_sw_exit_info_2_is_valid(svm))
3324
goto vmgexit_err;
3325
3326
switch (exit_code) {
3327
case SVM_EXIT_READ_DR7:
3328
break;
3329
case SVM_EXIT_WRITE_DR7:
3330
if (!kvm_ghcb_rax_is_valid(svm))
3331
goto vmgexit_err;
3332
break;
3333
case SVM_EXIT_RDTSC:
3334
break;
3335
case SVM_EXIT_RDPMC:
3336
if (!kvm_ghcb_rcx_is_valid(svm))
3337
goto vmgexit_err;
3338
break;
3339
case SVM_EXIT_CPUID:
3340
if (!kvm_ghcb_rax_is_valid(svm) ||
3341
!kvm_ghcb_rcx_is_valid(svm))
3342
goto vmgexit_err;
3343
if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
3344
if (!kvm_ghcb_xcr0_is_valid(svm))
3345
goto vmgexit_err;
3346
break;
3347
case SVM_EXIT_INVD:
3348
break;
3349
case SVM_EXIT_IOIO:
3350
if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
3351
if (!kvm_ghcb_sw_scratch_is_valid(svm))
3352
goto vmgexit_err;
3353
} else {
3354
if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
3355
if (!kvm_ghcb_rax_is_valid(svm))
3356
goto vmgexit_err;
3357
}
3358
break;
3359
case SVM_EXIT_MSR:
3360
if (!kvm_ghcb_rcx_is_valid(svm))
3361
goto vmgexit_err;
3362
if (control->exit_info_1) {
3363
if (!kvm_ghcb_rax_is_valid(svm) ||
3364
!kvm_ghcb_rdx_is_valid(svm))
3365
goto vmgexit_err;
3366
}
3367
break;
3368
case SVM_EXIT_VMMCALL:
3369
if (!kvm_ghcb_rax_is_valid(svm) ||
3370
!kvm_ghcb_cpl_is_valid(svm))
3371
goto vmgexit_err;
3372
break;
3373
case SVM_EXIT_RDTSCP:
3374
break;
3375
case SVM_EXIT_WBINVD:
3376
break;
3377
case SVM_EXIT_MONITOR:
3378
if (!kvm_ghcb_rax_is_valid(svm) ||
3379
!kvm_ghcb_rcx_is_valid(svm) ||
3380
!kvm_ghcb_rdx_is_valid(svm))
3381
goto vmgexit_err;
3382
break;
3383
case SVM_EXIT_MWAIT:
3384
if (!kvm_ghcb_rax_is_valid(svm) ||
3385
!kvm_ghcb_rcx_is_valid(svm))
3386
goto vmgexit_err;
3387
break;
3388
case SVM_VMGEXIT_MMIO_READ:
3389
case SVM_VMGEXIT_MMIO_WRITE:
3390
if (!kvm_ghcb_sw_scratch_is_valid(svm))
3391
goto vmgexit_err;
3392
break;
3393
case SVM_VMGEXIT_AP_CREATION:
3394
if (!sev_snp_guest(vcpu->kvm))
3395
goto vmgexit_err;
3396
if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
3397
if (!kvm_ghcb_rax_is_valid(svm))
3398
goto vmgexit_err;
3399
break;
3400
case SVM_VMGEXIT_NMI_COMPLETE:
3401
case SVM_VMGEXIT_AP_HLT_LOOP:
3402
case SVM_VMGEXIT_AP_JUMP_TABLE:
3403
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
3404
case SVM_VMGEXIT_HV_FEATURES:
3405
case SVM_VMGEXIT_TERM_REQUEST:
3406
break;
3407
case SVM_VMGEXIT_PSC:
3408
if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
3409
goto vmgexit_err;
3410
break;
3411
case SVM_VMGEXIT_GUEST_REQUEST:
3412
case SVM_VMGEXIT_EXT_GUEST_REQUEST:
3413
if (!sev_snp_guest(vcpu->kvm) ||
3414
!PAGE_ALIGNED(control->exit_info_1) ||
3415
!PAGE_ALIGNED(control->exit_info_2) ||
3416
control->exit_info_1 == control->exit_info_2)
3417
goto vmgexit_err;
3418
break;
3419
default:
3420
reason = GHCB_ERR_INVALID_EVENT;
3421
goto vmgexit_err;
3422
}
3423
3424
return 0;
3425
3426
vmgexit_err:
3427
if (reason == GHCB_ERR_INVALID_USAGE) {
3428
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
3429
svm->sev_es.ghcb->ghcb_usage);
3430
} else if (reason == GHCB_ERR_INVALID_EVENT) {
3431
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
3432
exit_code);
3433
} else {
3434
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
3435
exit_code);
3436
dump_ghcb(svm);
3437
}
3438
3439
svm_vmgexit_bad_input(svm, reason);
3440
3441
/* Resume the guest to "return" the error code. */
3442
return 1;
3443
}
3444
3445
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
3446
{
3447
/* Clear any indication that the vCPU is in a type of AP Reset Hold */
3448
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
3449
3450
if (!svm->sev_es.ghcb)
3451
return;
3452
3453
if (svm->sev_es.ghcb_sa_free) {
3454
/*
3455
* The scratch area lives outside the GHCB, so there is a
3456
* buffer that, depending on the operation performed, may
3457
* need to be synced, then freed.
3458
*/
3459
if (svm->sev_es.ghcb_sa_sync) {
3460
kvm_write_guest(svm->vcpu.kvm,
3461
svm->sev_es.sw_scratch,
3462
svm->sev_es.ghcb_sa,
3463
svm->sev_es.ghcb_sa_len);
3464
svm->sev_es.ghcb_sa_sync = false;
3465
}
3466
3467
kvfree(svm->sev_es.ghcb_sa);
3468
svm->sev_es.ghcb_sa = NULL;
3469
svm->sev_es.ghcb_sa_free = false;
3470
}
3471
3472
trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
3473
3474
sev_es_sync_to_ghcb(svm);
3475
3476
kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
3477
svm->sev_es.ghcb = NULL;
3478
}
3479
3480
int pre_sev_run(struct vcpu_svm *svm, int cpu)
3481
{
3482
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
3483
struct kvm *kvm = svm->vcpu.kvm;
3484
unsigned int asid = sev_get_asid(kvm);
3485
3486
/*
3487
* Reject KVM_RUN if userspace attempts to run the vCPU with an invalid
3488
* VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP
3489
* AP Destroy event.
3490
*/
3491
if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
3492
return -EINVAL;
3493
3494
/*
3495
* To optimize cache flushes when memory is reclaimed from an SEV VM,
3496
* track physical CPUs that enter the guest for SEV VMs and thus can
3497
* have encrypted, dirty data in the cache, and flush caches only for
3498
* CPUs that have entered the guest.
3499
*/
3500
if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
3501
cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
3502
3503
/* Assign the asid allocated with this SEV guest */
3504
svm->asid = asid;
3505
3506
/*
3507
* Flush guest TLB:
3508
*
3509
* 1) when a different VMCB for the same ASID is to be run on the same host CPU, or
* 2) when this VMCB was executed on a different host CPU in previous VMRUNs.
3511
*/
3512
if (sd->sev_vmcbs[asid] == svm->vmcb &&
3513
svm->vcpu.arch.last_vmentry_cpu == cpu)
3514
return 0;
3515
3516
sd->sev_vmcbs[asid] = svm->vmcb;
3517
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3518
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3519
return 0;
3520
}
3521
3522
#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
3523
static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
3524
{
3525
struct vmcb_control_area *control = &svm->vmcb->control;
3526
u64 ghcb_scratch_beg, ghcb_scratch_end;
3527
u64 scratch_gpa_beg, scratch_gpa_end;
3528
void *scratch_va;
3529
3530
scratch_gpa_beg = svm->sev_es.sw_scratch;
3531
if (!scratch_gpa_beg) {
3532
pr_err("vmgexit: scratch gpa not provided\n");
3533
goto e_scratch;
3534
}
3535
3536
scratch_gpa_end = scratch_gpa_beg + len;
3537
if (scratch_gpa_end < scratch_gpa_beg) {
3538
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
3539
len, scratch_gpa_beg);
3540
goto e_scratch;
3541
}
3542
3543
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
3544
/* Scratch area begins within GHCB */
3545
ghcb_scratch_beg = control->ghcb_gpa +
3546
offsetof(struct ghcb, shared_buffer);
3547
ghcb_scratch_end = control->ghcb_gpa +
3548
offsetof(struct ghcb, reserved_0xff0);
3549
3550
/*
3551
* If the scratch area begins within the GHCB, it must be
3552
* completely contained in the GHCB shared buffer area.
3553
*/
3554
if (scratch_gpa_beg < ghcb_scratch_beg ||
3555
scratch_gpa_end > ghcb_scratch_end) {
3556
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
3557
scratch_gpa_beg, scratch_gpa_end);
3558
goto e_scratch;
3559
}
3560
3561
scratch_va = (void *)svm->sev_es.ghcb;
3562
scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
3563
} else {
3564
/*
3565
* The guest memory must be read into a kernel buffer, so
3566
* limit the size.
3567
*/
3568
if (len > GHCB_SCRATCH_AREA_LIMIT) {
3569
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
3570
len, GHCB_SCRATCH_AREA_LIMIT);
3571
goto e_scratch;
3572
}
3573
scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
3574
if (!scratch_va)
3575
return -ENOMEM;
3576
3577
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
3578
/* Unable to copy scratch area from guest */
3579
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
3580
3581
kvfree(scratch_va);
3582
return -EFAULT;
3583
}
3584
3585
/*
3586
* The scratch area is outside the GHCB. The operation will
3587
* dictate whether the buffer needs to be synced before running
3588
* the vCPU next time (i.e. a read was requested so the data
3589
* must be written back to the guest memory).
3590
*/
3591
svm->sev_es.ghcb_sa_sync = sync;
3592
svm->sev_es.ghcb_sa_free = true;
3593
}
3594
3595
svm->sev_es.ghcb_sa = scratch_va;
3596
svm->sev_es.ghcb_sa_len = len;
3597
3598
return 0;
3599
3600
e_scratch:
3601
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA);
3602
3603
return 1;
3604
}
3605
3606
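/*
* Helpers for the GHCB MSR protocol: the GHCB GPA field in the VMCB control
* area doubles as the GHCB MSR value, so requests/responses are read and
* written as bitfields within it.
*/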
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
3607
unsigned int pos)
3608
{
3609
svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
3610
svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
3611
}
3612
3613
static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
3614
{
3615
return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
3616
}
3617
3618
static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
3619
{
3620
svm->vmcb->control.ghcb_gpa = value;
3621
}
3622
3623
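/*
* Split the 2M RMP entry covering @pfn into 4K entries, retrying while
* another processor is concurrently modifying the entry (PSMASH_FAIL_INUSE).
*/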
static int snp_rmptable_psmash(kvm_pfn_t pfn)
3624
{
3625
int ret;
3626
3627
pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
3628
3629
/*
3630
* PSMASH_FAIL_INUSE indicates another processor is modifying the
3631
* entry, so retry until that's no longer the case.
3632
*/
3633
do {
3634
ret = psmash(pfn);
3635
} while (ret == PSMASH_FAIL_INUSE);
3636
3637
return ret;
3638
}
3639
3640
static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
3641
{
3642
struct vcpu_svm *svm = to_svm(vcpu);
3643
3644
if (vcpu->run->hypercall.ret)
3645
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3646
else
3647
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
3648
3649
return 1; /* resume guest */
3650
}
3651
3652
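/*
* Handle a Page State Change request issued via the GHCB MSR protocol:
* validate the requested operation, then forward it to userspace as a
* KVM_HC_MAP_GPA_RANGE hypercall exit covering a single 4K page.
*/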
static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
3653
{
3654
u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
3655
u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
3656
struct kvm_vcpu *vcpu = &svm->vcpu;
3657
3658
if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
3659
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3660
return 1; /* resume guest */
3661
}
3662
3663
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
3664
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
3665
return 1; /* resume guest */
3666
}
3667
3668
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
3669
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
3670
/*
3671
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
3672
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
3673
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
3674
* vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
3675
*/
3676
vcpu->run->hypercall.ret = 0;
3677
vcpu->run->hypercall.args[0] = gpa;
3678
vcpu->run->hypercall.args[1] = 1;
3679
vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
3680
? KVM_MAP_GPA_RANGE_ENCRYPTED
3681
: KVM_MAP_GPA_RANGE_DECRYPTED;
3682
vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
3683
3684
vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
3685
3686
return 0; /* forward request to userspace */
3687
}
3688
3689
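/* Guest-provided Page State Change buffer: a header followed by PSC entries. */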
struct psc_buffer {
3690
struct psc_hdr hdr;
3691
struct psc_entry entries[];
3692
} __packed;
3693
3694
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
3695
3696
static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
3697
{
3698
svm->sev_es.psc_inflight = 0;
3699
svm->sev_es.psc_idx = 0;
3700
svm->sev_es.psc_2m = false;
3701
3702
/*
3703
* PSC requests always get a "no action" response in SW_EXITINFO1, with
3704
* a PSC-specific return code in SW_EXITINFO2 that provides the "real"
3705
* return code. E.g. if the PSC request was interrupted, the need to
3706
* retry is communicated via SW_EXITINFO2, not SW_EXITINFO1.
3707
*/
3708
svm_vmgexit_no_action(svm, psc_ret);
3709
}
3710
3711
static void __snp_complete_one_psc(struct vcpu_svm *svm)
3712
{
3713
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
3714
struct psc_entry *entries = psc->entries;
3715
struct psc_hdr *hdr = &psc->hdr;
3716
__u16 idx;
3717
3718
/*
3719
* Everything in-flight has been processed successfully. Update the
3720
* corresponding entries in the guest's PSC buffer and zero out the
3721
* count of in-flight PSC entries.
3722
*/
3723
for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
3724
svm->sev_es.psc_inflight--, idx++) {
3725
struct psc_entry *entry = &entries[idx];
3726
3727
entry->cur_page = entry->pagesize ? 512 : 1;
3728
}
3729
3730
hdr->cur_entry = idx;
3731
}
3732
3733
static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
3734
{
3735
struct vcpu_svm *svm = to_svm(vcpu);
3736
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
3737
3738
if (vcpu->run->hypercall.ret) {
3739
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3740
return 1; /* resume guest */
3741
}
3742
3743
__snp_complete_one_psc(svm);
3744
3745
/* Handle the next range (if any). */
3746
return snp_begin_psc(svm, psc);
3747
}
3748
3749
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
3750
{
3751
struct psc_entry *entries = psc->entries;
3752
struct kvm_vcpu *vcpu = &svm->vcpu;
3753
struct psc_hdr *hdr = &psc->hdr;
3754
struct psc_entry entry_start;
3755
u16 idx, idx_start, idx_end;
3756
int npages;
3757
bool huge;
3758
u64 gfn;
3759
3760
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
3761
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3762
return 1;
3763
}
3764
3765
next_range:
3766
/* There should be no other PSCs in-flight at this point. */
3767
if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
3768
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
3769
return 1;
3770
}
3771
3772
/*
3773
* The PSC descriptor buffer can be modified by a misbehaving guest after
3774
* validation, so take care to only use validated copies of values used
3775
* for things like array indexing.
3776
*/
3777
idx_start = hdr->cur_entry;
3778
idx_end = hdr->end_entry;
3779
3780
if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
3781
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
3782
return 1;
3783
}
3784
3785
/* Find the start of the next range which needs processing. */
3786
for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
3787
entry_start = entries[idx];
3788
3789
gfn = entry_start.gfn;
3790
huge = entry_start.pagesize;
3791
npages = huge ? 512 : 1;
3792
3793
if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
3794
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
3795
return 1;
3796
}
3797
3798
if (entry_start.cur_page) {
3799
/*
3800
* If this is a partially-completed 2M range, force 4K handling
3801
* for the remaining pages since they're effectively split at
3802
* this point. Subsequent code should ensure this doesn't get
3803
* combined with adjacent PSC entries where 2M handling is still
3804
* possible.
3805
*/
3806
npages -= entry_start.cur_page;
3807
gfn += entry_start.cur_page;
3808
huge = false;
3809
}
3810
3811
if (npages)
3812
break;
3813
}
3814
3815
if (idx > idx_end) {
3816
/* Nothing more to process. */
3817
snp_complete_psc(svm, 0);
3818
return 1;
3819
}
3820
3821
svm->sev_es.psc_2m = huge;
3822
svm->sev_es.psc_idx = idx;
3823
svm->sev_es.psc_inflight = 1;
3824
3825
/*
3826
* Find all subsequent PSC entries that contain adjacent GPA
3827
* ranges/operations and can be combined into a single
3828
* KVM_HC_MAP_GPA_RANGE exit.
3829
*/
3830
while (++idx <= idx_end) {
3831
struct psc_entry entry = entries[idx];
3832
3833
if (entry.operation != entry_start.operation ||
3834
entry.gfn != entry_start.gfn + npages ||
3835
entry.cur_page || !!entry.pagesize != huge)
3836
break;
3837
3838
svm->sev_es.psc_inflight++;
3839
npages += huge ? 512 : 1;
3840
}
3841
3842
switch (entry_start.operation) {
3843
case VMGEXIT_PSC_OP_PRIVATE:
3844
case VMGEXIT_PSC_OP_SHARED:
3845
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
3846
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
3847
/*
3848
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
3849
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
3850
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM now overwrites
3851
* vcpu->run->hypercall.ret, explicitly zero it here so as not to break QEMU.
3852
*/
3853
vcpu->run->hypercall.ret = 0;
3854
vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
3855
vcpu->run->hypercall.args[1] = npages;
3856
vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
3857
? KVM_MAP_GPA_RANGE_ENCRYPTED
3858
: KVM_MAP_GPA_RANGE_DECRYPTED;
3859
vcpu->run->hypercall.args[2] |= entry_start.pagesize
3860
? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
3861
: KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
3862
vcpu->arch.complete_userspace_io = snp_complete_one_psc;
3863
return 0; /* forward request to userspace */
3864
default:
3865
/*
3866
* Only shared/private PSC operations are currently supported, so if the
3867
* entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
3868
* then consider the entire range completed and avoid exiting to
3869
* userspace. In theory snp_complete_psc() can always be called directly
3870
* at this point to complete the current range and start the next one,
3871
* but that could lead to unexpected levels of recursion.
3872
*/
3873
__snp_complete_one_psc(svm);
3874
goto next_range;
3875
}
3876
3877
BUG();
3878
}
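
/*
 * Editor's sketch of the userspace side, not part of KVM: the
 * KVM_HC_MAP_GPA_RANGE exits generated above report the starting GPA, the
 * number of 4K pages and the attribute flags in args[0..2]. A hypothetical
 * VMM handler (guarded out since it is userspace code) would flip the
 * private/shared attribute and leave run->hypercall.ret at 0 on success:
 */
#if 0	/* illustrative userspace sketch only */
static void example_handle_map_gpa_range(int vm_fd, struct kvm_run *run)
{
	struct kvm_memory_attributes attrs = {
		.address    = run->hypercall.args[0],
		.size       = run->hypercall.args[1] * 4096,
		.attributes = (run->hypercall.args[2] & KVM_MAP_GPA_RANGE_ENCRYPTED) ?
			      KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
	};

	run->hypercall.ret = ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs) ? -errno : 0;
}
#endif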
3879
3880
/*
3881
* Invoked as part of svm_vcpu_reset() processing of an init event.
3882
*/
3883
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
3884
{
3885
struct vcpu_svm *svm = to_svm(vcpu);
3886
struct kvm_memory_slot *slot;
3887
struct page *page;
3888
kvm_pfn_t pfn;
3889
gfn_t gfn;
3890
3891
if (!sev_snp_guest(vcpu->kvm))
3892
return;
3893
3894
guard(mutex)(&svm->sev_es.snp_vmsa_mutex);
3895
3896
if (!svm->sev_es.snp_ap_waiting_for_reset)
3897
return;
3898
3899
svm->sev_es.snp_ap_waiting_for_reset = false;
3900
3901
/* Mark the vCPU as offline and not runnable */
3902
vcpu->arch.pv.pv_unhalted = false;
3903
kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
3904
3905
/* Clear use of the VMSA */
3906
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
3907
3908
/*
3909
* When replacing the VMSA during SEV-SNP AP creation,
3910
* mark the VMCB dirty so that full state is always reloaded.
3911
*/
3912
vmcb_mark_all_dirty(svm->vmcb);
3913
3914
if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
3915
return;
3916
3917
gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
3918
svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
3919
3920
slot = gfn_to_memslot(vcpu->kvm, gfn);
3921
if (!slot)
3922
return;
3923
3924
/*
3925
* The new VMSA will be backed by private guest memory, so retrieve the
3926
* PFN from the gmem backend.
3927
*/
3928
if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
3929
return;
3930
3931
/*
3932
* From this point forward, the VMSA will always be a guest-mapped page
3933
* rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
3934
* theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
3935
* that involves cleanups like flushing caches, which would ideally be
3936
* handled during teardown rather than guest boot. Deferring that also
3937
* allows the existing logic for SEV-ES VMSAs to be re-used with
3938
* minimal SNP-specific changes.
3939
*/
3940
svm->sev_es.snp_has_guest_vmsa = true;
3941
3942
/* Use the new VMSA */
3943
svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
3944
3945
/* Mark the vCPU as runnable */
3946
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
3947
3948
/*
3949
* gmem pages aren't currently migratable, but if this ever changes
3950
* then care should be taken to ensure svm->sev_es.vmsa is pinned
3951
* through some other means.
3952
*/
3953
kvm_release_page_clean(page);
3954
}
3955
3956
static int sev_snp_ap_creation(struct vcpu_svm *svm)
3957
{
3958
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
3959
struct kvm_vcpu *vcpu = &svm->vcpu;
3960
struct kvm_vcpu *target_vcpu;
3961
struct vcpu_svm *target_svm;
3962
unsigned int request;
3963
unsigned int apic_id;
3964
3965
request = lower_32_bits(svm->vmcb->control.exit_info_1);
3966
apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
3967
3968
/* Validate the APIC ID */
3969
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
3970
if (!target_vcpu) {
3971
vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
3972
apic_id);
3973
return -EINVAL;
3974
}
3975
3976
target_svm = to_svm(target_vcpu);
3977
3978
guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex);
3979
3980
switch (request) {
3981
case SVM_VMGEXIT_AP_CREATE_ON_INIT:
3982
case SVM_VMGEXIT_AP_CREATE:
3983
if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
3984
vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
3985
vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features);
3986
return -EINVAL;
3987
}
3988
3989
if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
3990
vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
3991
svm->vmcb->control.exit_info_2);
3992
return -EINVAL;
3993
}
3994
3995
/*
3996
* A malicious guest can RMPADJUST a large page into a VMSA, which
3997
* will hit the SNP erratum where the CPU incorrectly signals an RMP
3998
* violation #PF if a hugepage collides with the RMP entry of the VMSA
3999
* page. Reject the AP CREATE request if the VMSA address from the
4000
* guest is 2M aligned.
4001
*/
4002
if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
4003
vcpu_unimpl(vcpu,
4004
"vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
4005
svm->vmcb->control.exit_info_2);
4006
return -EINVAL;
4007
}
4008
4009
target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
4010
break;
4011
case SVM_VMGEXIT_AP_DESTROY:
4012
target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
4013
break;
4014
default:
4015
vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
4016
request);
4017
return -EINVAL;
4018
}
4019
4020
target_svm->sev_es.snp_ap_waiting_for_reset = true;
4021
4022
/*
4023
* Unless Creation is deferred until INIT, signal the vCPU to update
4024
* its state.
4025
*/
4026
if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
4027
kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
4028
4029
return 0;
4030
}
4031
4032
static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
4033
{
4034
struct sev_data_snp_guest_request data = {0};
4035
struct kvm *kvm = svm->vcpu.kvm;
4036
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
4037
sev_ret_code fw_err = 0;
4038
int ret;
4039
4040
if (!sev_snp_guest(kvm))
4041
return -EINVAL;
4042
4043
mutex_lock(&sev->guest_req_mutex);
4044
4045
if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
4046
ret = -EIO;
4047
goto out_unlock;
4048
}
4049
4050
data.gctx_paddr = __psp_pa(sev->snp_context);
4051
data.req_paddr = __psp_pa(sev->guest_req_buf);
4052
data.res_paddr = __psp_pa(sev->guest_resp_buf);
4053
4054
/*
4055
* Firmware failures are propagated on to the guest, but any other failure
4056
* condition along the way should be reported to userspace. E.g. if
4057
* the PSP is dead and commands are timing out.
4058
*/
4059
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
4060
if (ret && !fw_err)
4061
goto out_unlock;
4062
4063
if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
4064
ret = -EIO;
4065
goto out_unlock;
4066
}
4067
4068
/* No action is requested *from KVM* if there was a firmware error. */
4069
svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));
4070
4071
ret = 1; /* resume guest */
4072
4073
out_unlock:
4074
mutex_unlock(&sev->guest_req_mutex);
4075
return ret;
4076
}
4077
4078
static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
4079
{
4080
struct kvm *kvm = svm->vcpu.kvm;
4081
u8 msg_type;
4082
4083
if (!sev_snp_guest(kvm))
4084
return -EINVAL;
4085
4086
if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
4087
&msg_type, 1))
4088
return -EIO;
4089
4090
/*
4091
* As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
4092
* additional certificate data to be provided alongside the attestation
4093
* report via the guest-provided data pages indicated by RAX/RBX. The
4094
* certificate data is optional and requires additional KVM enablement
4095
* to provide an interface for userspace to provide it, but KVM still
4096
* needs to be able to handle extended guest requests either way. So
4097
* provide a stub implementation that will always return an empty
4098
* certificate table in the guest-provided data pages.
4099
*/
4100
if (msg_type == SNP_MSG_REPORT_REQ) {
4101
struct kvm_vcpu *vcpu = &svm->vcpu;
4102
u64 data_npages;
4103
gpa_t data_gpa;
4104
4105
if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
4106
goto request_invalid;
4107
4108
data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
4109
data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
4110
4111
if (!PAGE_ALIGNED(data_gpa))
4112
goto request_invalid;
4113
4114
/*
4115
* As per GHCB spec (see "SNP Extended Guest Request"), the
4116
* certificate table is terminated by 24-bytes of zeroes.
4117
*/
4118
if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
4119
return -EIO;
4120
}
4121
4122
return snp_handle_guest_req(svm, req_gpa, resp_gpa);
4123
4124
request_invalid:
4125
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4126
return 1; /* resume guest */
4127
}
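
/*
 * Editor's sketch, not part of the driver: the 24 bytes cleared above
 * correspond to one all-zero certificate table entry, which terminates the
 * table per the GHCB spec ("SNP Extended Guest Request"). Roughly, and only
 * for illustration:
 */
struct example_snp_cert_table_entry {
	u8  guid[16];	/* certificate type GUID, all-zero in the terminator */
	u32 offset;	/* offset of the certificate blob from table start */
	u32 length;	/* length of the blob, zero in the terminator */
} __packed;		/* 16 + 4 + 4 = 24 bytes */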
4128
4129
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
4130
{
4131
struct vmcb_control_area *control = &svm->vmcb->control;
4132
struct kvm_vcpu *vcpu = &svm->vcpu;
4133
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
4134
u64 ghcb_info;
4135
int ret = 1;
4136
4137
ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
4138
4139
trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
4140
control->ghcb_gpa);
4141
4142
switch (ghcb_info) {
4143
case GHCB_MSR_SEV_INFO_REQ:
4144
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
4145
GHCB_VERSION_MIN,
4146
sev_enc_bit));
4147
break;
4148
case GHCB_MSR_CPUID_REQ: {
4149
u64 cpuid_fn, cpuid_reg, cpuid_value;
4150
4151
cpuid_fn = get_ghcb_msr_bits(svm,
4152
GHCB_MSR_CPUID_FUNC_MASK,
4153
GHCB_MSR_CPUID_FUNC_POS);
4154
4155
/* Initialize the registers needed by the CPUID intercept */
4156
vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
4157
vcpu->arch.regs[VCPU_REGS_RCX] = 0;
4158
4159
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
4160
if (!ret) {
4161
/* Error, keep GHCB MSR value as-is */
4162
break;
4163
}
4164
4165
cpuid_reg = get_ghcb_msr_bits(svm,
4166
GHCB_MSR_CPUID_REG_MASK,
4167
GHCB_MSR_CPUID_REG_POS);
4168
if (cpuid_reg == 0)
4169
cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
4170
else if (cpuid_reg == 1)
4171
cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
4172
else if (cpuid_reg == 2)
4173
cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
4174
else
4175
cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
4176
4177
set_ghcb_msr_bits(svm, cpuid_value,
4178
GHCB_MSR_CPUID_VALUE_MASK,
4179
GHCB_MSR_CPUID_VALUE_POS);
4180
4181
set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
4182
GHCB_MSR_INFO_MASK,
4183
GHCB_MSR_INFO_POS);
4184
break;
4185
}
4186
case GHCB_MSR_AP_RESET_HOLD_REQ:
4187
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
4188
ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
4189
4190
/*
4191
* Preset the result to a non-SIPI return and then only set
4192
* the result to non-zero when delivering a SIPI.
4193
*/
4194
set_ghcb_msr_bits(svm, 0,
4195
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
4196
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
4197
4198
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
4199
GHCB_MSR_INFO_MASK,
4200
GHCB_MSR_INFO_POS);
4201
break;
4202
case GHCB_MSR_HV_FT_REQ:
4203
set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
4204
GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
4205
set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
4206
GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
4207
break;
4208
case GHCB_MSR_PREF_GPA_REQ:
4209
if (!sev_snp_guest(vcpu->kvm))
4210
goto out_terminate;
4211
4212
set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
4213
GHCB_MSR_GPA_VALUE_POS);
4214
set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
4215
GHCB_MSR_INFO_POS);
4216
break;
4217
case GHCB_MSR_REG_GPA_REQ: {
4218
u64 gfn;
4219
4220
if (!sev_snp_guest(vcpu->kvm))
4221
goto out_terminate;
4222
4223
gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
4224
GHCB_MSR_GPA_VALUE_POS);
4225
4226
svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
4227
4228
set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
4229
GHCB_MSR_GPA_VALUE_POS);
4230
set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
4231
GHCB_MSR_INFO_POS);
4232
break;
4233
}
4234
case GHCB_MSR_PSC_REQ:
4235
if (!sev_snp_guest(vcpu->kvm))
4236
goto out_terminate;
4237
4238
ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
4239
break;
4240
case GHCB_MSR_TERM_REQ: {
4241
u64 reason_set, reason_code;
4242
4243
reason_set = get_ghcb_msr_bits(svm,
4244
GHCB_MSR_TERM_REASON_SET_MASK,
4245
GHCB_MSR_TERM_REASON_SET_POS);
4246
reason_code = get_ghcb_msr_bits(svm,
4247
GHCB_MSR_TERM_REASON_MASK,
4248
GHCB_MSR_TERM_REASON_POS);
4249
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
4250
reason_set, reason_code);
4251
4252
goto out_terminate;
4253
}
4254
default:
4255
/* Error, keep GHCB MSR value as-is */
4256
break;
4257
}
4258
4259
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
4260
control->ghcb_gpa, ret);
4261
4262
return ret;
4263
4264
out_terminate:
4265
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
4266
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
4267
vcpu->run->system_event.ndata = 1;
4268
vcpu->run->system_event.data[0] = control->ghcb_gpa;
4269
4270
return 0;
4271
}
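
/*
 * Editor's sketch, not part of the driver: the MSR protocol handled above
 * keeps the request/response code in the low 12 bits of the GHCB MSR
 * (GHCB_MSR_INFO_MASK) and event-specific data in the upper bits. The driver
 * updates individual fields via set_ghcb_msr_bits(); assembling a response
 * by hand would look roughly like this hypothetical helper:
 */
static inline u64 example_ghcb_msr_resp(u64 resp_code, u64 data, unsigned int data_pos)
{
	return (resp_code & GENMASK_ULL(11, 0)) | (data << data_pos);
}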
4272
4273
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
4274
{
4275
struct vcpu_svm *svm = to_svm(vcpu);
4276
struct vmcb_control_area *control = &svm->vmcb->control;
4277
u64 ghcb_gpa, exit_code;
4278
int ret;
4279
4280
/* Validate the GHCB */
4281
ghcb_gpa = control->ghcb_gpa;
4282
if (ghcb_gpa & GHCB_MSR_INFO_MASK)
4283
return sev_handle_vmgexit_msr_protocol(svm);
4284
4285
if (!ghcb_gpa) {
4286
vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
4287
4288
/* Without a GHCB, just return right back to the guest */
4289
return 1;
4290
}
4291
4292
if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
4293
/* Unable to map GHCB from guest */
4294
vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
4295
ghcb_gpa);
4296
4297
/* Without a GHCB, just return right back to the guest */
4298
return 1;
4299
}
4300
4301
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
4302
4303
trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
4304
4305
sev_es_sync_from_ghcb(svm);
4306
4307
/* SEV-SNP guest requires that the GHCB GPA must be registered */
4308
if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
4309
vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
4310
return -EINVAL;
4311
}
4312
4313
ret = sev_es_validate_vmgexit(svm);
4314
if (ret)
4315
return ret;
4316
4317
svm_vmgexit_success(svm, 0);
4318
4319
exit_code = kvm_ghcb_get_sw_exit_code(control);
4320
switch (exit_code) {
4321
case SVM_VMGEXIT_MMIO_READ:
4322
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
4323
if (ret)
4324
break;
4325
4326
ret = kvm_sev_es_mmio_read(vcpu,
4327
control->exit_info_1,
4328
control->exit_info_2,
4329
svm->sev_es.ghcb_sa);
4330
break;
4331
case SVM_VMGEXIT_MMIO_WRITE:
4332
ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
4333
if (ret)
4334
break;
4335
4336
ret = kvm_sev_es_mmio_write(vcpu,
4337
control->exit_info_1,
4338
control->exit_info_2,
4339
svm->sev_es.ghcb_sa);
4340
break;
4341
case SVM_VMGEXIT_NMI_COMPLETE:
4342
++vcpu->stat.nmi_window_exits;
4343
svm->nmi_masked = false;
4344
kvm_make_request(KVM_REQ_EVENT, vcpu);
4345
ret = 1;
4346
break;
4347
case SVM_VMGEXIT_AP_HLT_LOOP:
4348
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
4349
ret = kvm_emulate_ap_reset_hold(vcpu);
4350
break;
4351
case SVM_VMGEXIT_AP_JUMP_TABLE: {
4352
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
4353
4354
switch (control->exit_info_1) {
4355
case 0:
4356
/* Set AP jump table address */
4357
sev->ap_jump_table = control->exit_info_2;
4358
break;
4359
case 1:
4360
/* Get AP jump table address */
4361
svm_vmgexit_success(svm, sev->ap_jump_table);
4362
break;
4363
default:
4364
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
4365
control->exit_info_1);
4366
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4367
}
4368
4369
ret = 1;
4370
break;
4371
}
4372
case SVM_VMGEXIT_HV_FEATURES:
4373
svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
4374
ret = 1;
4375
break;
4376
case SVM_VMGEXIT_TERM_REQUEST:
4377
pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
4378
control->exit_info_1, control->exit_info_2);
4379
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
4380
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
4381
vcpu->run->system_event.ndata = 1;
4382
vcpu->run->system_event.data[0] = control->ghcb_gpa;
4383
break;
4384
case SVM_VMGEXIT_PSC:
4385
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
4386
if (ret)
4387
break;
4388
4389
ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
4390
break;
4391
case SVM_VMGEXIT_AP_CREATION:
4392
ret = sev_snp_ap_creation(svm);
4393
if (ret)
4394
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
4395
4396
4397
ret = 1;
4398
break;
4399
case SVM_VMGEXIT_GUEST_REQUEST:
4400
ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
4401
break;
4402
case SVM_VMGEXIT_EXT_GUEST_REQUEST:
4403
ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
4404
break;
4405
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
4406
vcpu_unimpl(vcpu,
4407
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
4408
control->exit_info_1, control->exit_info_2);
4409
ret = -EINVAL;
4410
break;
4411
default:
4412
ret = svm_invoke_exit_handler(vcpu, exit_code);
4413
}
4414
4415
return ret;
4416
}
4417
4418
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
4419
{
4420
int count;
4421
int bytes;
4422
int r;
4423
4424
if (svm->vmcb->control.exit_info_2 > INT_MAX)
4425
return -EINVAL;
4426
4427
count = svm->vmcb->control.exit_info_2;
4428
if (unlikely(check_mul_overflow(count, size, &bytes)))
4429
return -EINVAL;
4430
4431
r = setup_vmgexit_scratch(svm, in, bytes);
4432
if (r)
4433
return r;
4434
4435
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
4436
count, in);
4437
}
4438
4439
void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
4440
{
4441
/* Clear intercepts on MSRs that are context switched by hardware. */
4442
svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
4443
svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
4444
svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
4445
4446
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
4447
svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
4448
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
4449
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
4450
4451
/*
4452
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
4453
* the host/guest supports its use.
4454
*
4455
* KVM treats the guest as being capable of using XSAVES even if XSAVES
4456
* isn't enabled in guest CPUID as there is no intercept for XSAVES,
4457
* i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is
4458
* exposed to the guest and XSAVES is supported in hardware. Condition
4459
* full XSS passthrough on the guest being able to use XSAVES *and*
4460
* XSAVES being exposed to the guest so that KVM can at least honor
4461
* guest CPUID for RDMSR and WRMSR.
4462
*/
4463
svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
4464
!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
4465
!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
4466
}
4467
4468
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
4469
{
4470
struct kvm_vcpu *vcpu = &svm->vcpu;
4471
struct kvm_cpuid_entry2 *best;
4472
4473
/* For sev guests, the memory encryption bit is not reserved in CR3. */
4474
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4475
if (best)
4476
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4477
}
4478
4479
static void sev_es_init_vmcb(struct vcpu_svm *svm)
4480
{
4481
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
4482
struct vmcb *vmcb = svm->vmcb01.ptr;
4483
4484
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
4485
4486
/*
4487
* An SEV-ES guest requires a VMSA area that is separate from the
4488
* VMCB page. Do not include the encryption mask on the VMSA physical
4489
* address since hardware will access it using the guest key. Note,
4490
* the VMSA will be NULL if this vCPU is the destination for intrahost
4491
* migration, and will be copied later.
4492
*/
4493
if (!svm->sev_es.snp_has_guest_vmsa) {
4494
if (svm->sev_es.vmsa)
4495
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
4496
else
4497
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
4498
}
4499
4500
if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
4501
svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
4502
VMCB_ALLOWED_SEV_FEATURES_VALID;
4503
4504
/* Can't intercept CR register access, HV can't modify CR registers */
4505
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
4506
svm_clr_intercept(svm, INTERCEPT_CR4_READ);
4507
svm_clr_intercept(svm, INTERCEPT_CR8_READ);
4508
svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
4509
svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
4510
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
4511
4512
svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
4513
4514
/* Track EFER/CR register changes */
4515
svm_set_intercept(svm, TRAP_EFER_WRITE);
4516
svm_set_intercept(svm, TRAP_CR0_WRITE);
4517
svm_set_intercept(svm, TRAP_CR4_WRITE);
4518
svm_set_intercept(svm, TRAP_CR8_WRITE);
4519
4520
vmcb->control.intercepts[INTERCEPT_DR] = 0;
4521
if (!sev_vcpu_has_debug_swap(svm)) {
4522
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
4523
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
4524
recalc_intercepts(svm);
4525
} else {
4526
/*
4527
* Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
4528
* allow debugging SEV-ES guests, and enables DebugSwap iff
4529
* NO_NESTED_DATA_BP is supported, so there's no reason to
4530
* intercept #DB when DebugSwap is enabled. For simplicity
4531
* with respect to guest debug, intercept #DB for other VMs
4532
* even if NO_NESTED_DATA_BP is supported, i.e. even if the
4533
* guest can't DoS the CPU with infinite #DB vectoring.
4534
*/
4535
clr_exception_intercept(svm, DB_VECTOR);
4536
}
4537
4538
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
4539
svm_clr_intercept(svm, INTERCEPT_XSETBV);
4540
}
4541
4542
void sev_init_vmcb(struct vcpu_svm *svm)
4543
{
4544
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
4545
clr_exception_intercept(svm, UD_VECTOR);
4546
4547
/*
4548
* Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
4549
* KVM can't decrypt guest memory to decode the faulting instruction.
4550
*/
4551
clr_exception_intercept(svm, GP_VECTOR);
4552
4553
if (sev_es_guest(svm->vcpu.kvm))
4554
sev_es_init_vmcb(svm);
4555
}
4556
4557
void sev_es_vcpu_reset(struct vcpu_svm *svm)
4558
{
4559
struct kvm_vcpu *vcpu = &svm->vcpu;
4560
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
4561
4562
/*
4563
* Set the GHCB MSR value as per the GHCB specification when emulating
4564
* vCPU RESET for an SEV-ES guest.
4565
*/
4566
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
4567
GHCB_VERSION_MIN,
4568
sev_enc_bit));
4569
4570
mutex_init(&svm->sev_es.snp_vmsa_mutex);
4571
}
4572
4573
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
4574
{
4575
struct kvm *kvm = svm->vcpu.kvm;
4576
4577
/*
4578
* All host state for SEV-ES guests is categorized into three swap types
4579
* based on how it is handled by hardware during a world switch:
4580
*
4581
* A: VMRUN: Host state saved in host save area
4582
* VMEXIT: Host state loaded from host save area
4583
*
4584
* B: VMRUN: Host state _NOT_ saved in host save area
4585
* VMEXIT: Host state loaded from host save area
4586
*
4587
* C: VMRUN: Host state _NOT_ saved in host save area
4588
* VMEXIT: Host state initialized to default(reset) values
4589
*
4590
* Manually save type-B state, i.e. state that is loaded by VMEXIT but
4591
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
4592
* by common SVM code).
4593
*/
4594
hostsa->xcr0 = kvm_host.xcr0;
4595
hostsa->pkru = read_pkru();
4596
hostsa->xss = kvm_host.xss;
4597
4598
/*
4599
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
4600
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
4601
* not save or load debug registers. Sadly, KVM can't prevent SNP
4602
* guests from lying about DebugSwap on secondary vCPUs, i.e. the
4603
* SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
4604
* the guest has actually enabled (or not!) in the VMSA.
4605
*
4606
* If DebugSwap is *possible*, save the masks so that they're restored
4607
* if the guest enables DebugSwap. But for the DRs themselves, do NOT
4608
* rely on the CPU to restore the host values; KVM will restore them as
4609
* needed in common code, via hw_breakpoint_restore(). Note, KVM does
4610
* NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
4611
* don't need to be restored per se, KVM just needs to ensure they are
4612
* loaded with the correct values *if* the CPU writes the MSRs.
4613
*/
4614
if (sev_vcpu_has_debug_swap(svm) ||
4615
(sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
4616
hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
4617
hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
4618
hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
4619
hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
4620
}
4621
}
4622
4623
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4624
{
4625
struct vcpu_svm *svm = to_svm(vcpu);
4626
4627
/* First SIPI: Use the values as initially set by the VMM */
4628
if (!svm->sev_es.received_first_sipi) {
4629
svm->sev_es.received_first_sipi = true;
4630
return;
4631
}
4632
4633
/* Subsequent SIPI */
4634
switch (svm->sev_es.ap_reset_hold_type) {
4635
case AP_RESET_HOLD_NAE_EVENT:
4636
/*
4637
* Return from an AP Reset Hold VMGEXIT, where the guest will
4638
* set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
4639
*/
4640
svm_vmgexit_success(svm, 1);
4641
break;
4642
case AP_RESET_HOLD_MSR_PROTO:
4643
/*
4644
* Return from an AP Reset Hold VMGEXIT, where the guest will
4645
* set the CS and RIP. Set GHCB data field to a non-zero value.
4646
*/
4647
set_ghcb_msr_bits(svm, 1,
4648
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
4649
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
4650
4651
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
4652
GHCB_MSR_INFO_MASK,
4653
GHCB_MSR_INFO_POS);
4654
break;
4655
default:
4656
break;
4657
}
4658
}
4659
4660
struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
4661
{
4662
unsigned long pfn;
4663
struct page *p;
4664
4665
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
4666
return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
4667
4668
/*
4669
* Allocate an SNP-safe page to work around the SNP erratum where
4670
* the CPU will incorrectly signal an RMP violation #PF if a
4671
* hugepage (2MB or 1GB) collides with the RMP entry of a
4672
* 2MB-aligned VMCB, VMSA, or AVIC backing page.
4673
*
4674
* Allocate one extra page, choose a page which is not
4675
* 2MB-aligned, and free the other.
4676
*/
4677
p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
4678
if (!p)
4679
return NULL;
4680
4681
split_page(p, 1);
4682
4683
pfn = page_to_pfn(p);
4684
if (IS_ALIGNED(pfn, PTRS_PER_PMD))
4685
__free_page(p++);
4686
else
4687
__free_page(p + 1);
4688
4689
return p;
4690
}
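
/*
 * Editor's sketch (hypothetical caller, not part of the driver): a VMCB,
 * VMSA, or AVIC backing page is typically obtained on the local node like
 * so, relying on snp_safe_alloc_page_node() to dodge the 2MB-alignment
 * erratum described above.
 */
static inline struct page *example_alloc_backing_page(void)
{
	return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
}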
4691
4692
void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
4693
{
4694
struct kvm_memory_slot *slot;
4695
struct kvm *kvm = vcpu->kvm;
4696
int order, rmp_level, ret;
4697
struct page *page;
4698
bool assigned;
4699
kvm_pfn_t pfn;
4700
gfn_t gfn;
4701
4702
gfn = gpa >> PAGE_SHIFT;
4703
4704
/*
4705
* The only time RMP faults occur for shared pages is when the guest is
4706
* triggering an RMP fault for an implicit page-state change from
4707
* shared->private. Implicit page-state changes are forwarded to
4708
* userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
4709
* for shared pages should not end up here.
4710
*/
4711
if (!kvm_mem_is_private(kvm, gfn)) {
4712
pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
4713
gpa);
4714
return;
4715
}
4716
4717
slot = gfn_to_memslot(kvm, gfn);
4718
if (!kvm_slot_can_be_private(slot)) {
4719
pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
4720
gpa);
4721
return;
4722
}
4723
4724
ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
4725
if (ret) {
4726
pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
4727
gpa);
4728
return;
4729
}
4730
4731
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
4732
if (ret || !assigned) {
4733
pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
4734
gpa, pfn, ret);
4735
goto out_no_trace;
4736
}
4737
4738
/*
4739
* There are 2 cases where a PSMASH may be needed to resolve an #NPF
4740
* with PFERR_GUEST_RMP_BIT set:
4741
*
4742
* 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
4743
* bit set if the guest issues them with a smaller granularity than
4744
* what is indicated by the page-size bit in the 2MB RMP entry for
4745
* the PFN that backs the GPA.
4746
*
4747
* 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
4748
* smaller than what is indicated by the 2MB RMP entry for the PFN
4749
* that backs the GPA.
4750
*
4751
* In both these cases, the corresponding 2M RMP entry needs to
4752
* be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
4753
* split into 4K RMP entries, then this is likely a spurious case which
4754
* can occur when there are concurrent accesses by the guest to a 2MB
4755
* GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in
4756
* the process of being PSMASH'd into 4K entries. These cases should
4757
* resolve automatically on subsequent accesses, so just ignore them
4758
* here.
4759
*/
4760
if (rmp_level == PG_LEVEL_4K)
4761
goto out;
4762
4763
ret = snp_rmptable_psmash(pfn);
4764
if (ret) {
4765
/*
4766
* Look it up again. If it's 4K now then the PSMASH may have
4767
* raced with another process and the issue has already resolved
4768
* itself.
4769
*/
4770
if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
4771
assigned && rmp_level == PG_LEVEL_4K)
4772
goto out;
4773
4774
pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
4775
gpa, pfn, ret);
4776
}
4777
4778
kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
4779
out:
4780
trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
4781
out_no_trace:
4782
kvm_release_page_unused(page);
4783
}
4784
4785
static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
4786
{
4787
kvm_pfn_t pfn = start;
4788
4789
while (pfn < end) {
4790
int ret, rmp_level;
4791
bool assigned;
4792
4793
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
4794
if (ret) {
4795
pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
4796
pfn, start, end, rmp_level, ret);
4797
return false;
4798
}
4799
4800
if (assigned) {
4801
pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
4802
__func__, pfn, start, end, rmp_level);
4803
return false;
4804
}
4805
4806
pfn++;
4807
}
4808
4809
return true;
4810
}
4811
4812
static u8 max_level_for_order(int order)
4813
{
4814
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
4815
return PG_LEVEL_2M;
4816
4817
return PG_LEVEL_4K;
4818
}
4819
4820
static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
4821
{
4822
kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
4823
4824
/*
4825
* If this is a large folio, and the entire 2M range containing the
4826
* PFN is currently shared, then the entire 2M-aligned range can be
4827
* set to private via a single 2M RMP entry.
4828
*/
4829
if (max_level_for_order(order) > PG_LEVEL_4K &&
4830
is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
4831
return true;
4832
4833
return false;
4834
}
4835
4836
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
4837
{
4838
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
4839
kvm_pfn_t pfn_aligned;
4840
gfn_t gfn_aligned;
4841
int level, rc;
4842
bool assigned;
4843
4844
if (!sev_snp_guest(kvm))
4845
return 0;
4846
4847
rc = snp_lookup_rmpentry(pfn, &assigned, &level);
4848
if (rc) {
4849
pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
4850
gfn, pfn, rc);
4851
return -ENOENT;
4852
}
4853
4854
if (assigned) {
4855
pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
4856
__func__, gfn, pfn, max_order, level);
4857
return 0;
4858
}
4859
4860
if (is_large_rmp_possible(kvm, pfn, max_order)) {
4861
level = PG_LEVEL_2M;
4862
pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
4863
gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
4864
} else {
4865
level = PG_LEVEL_4K;
4866
pfn_aligned = pfn;
4867
gfn_aligned = gfn;
4868
}
4869
4870
rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
4871
if (rc) {
4872
pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
4873
gfn, pfn, level, rc);
4874
return -EINVAL;
4875
}
4876
4877
pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
4878
__func__, gfn, pfn, pfn_aligned, max_order, level);
4879
4880
return 0;
4881
}
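
/*
 * Editor's worked example, not part of the driver: with PTRS_PER_PMD == 512,
 * a 2M-capable request for gfn 0x5634f backed by pfn 0x1234f is installed
 * via the aligned pair gfn_aligned 0x56200 / pfn_aligned 0x12200, i.e.
 * ALIGN_DOWN(x, 512) clears the low 9 bits before rmp_make_private().
 */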
4882
4883
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
4884
{
4885
kvm_pfn_t pfn;
4886
4887
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
4888
return;
4889
4890
pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
4891
4892
for (pfn = start; pfn < end;) {
4893
bool use_2m_update = false;
4894
int rc, rmp_level;
4895
bool assigned;
4896
4897
rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
4898
if (rc || !assigned)
4899
goto next_pfn;
4900
4901
use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
4902
end >= (pfn + PTRS_PER_PMD) &&
4903
rmp_level > PG_LEVEL_4K;
4904
4905
/*
4906
* If an unaligned PFN corresponds to a 2M region assigned as a
4907
* large page in the RMP table, PSMASH the region into individual
4908
* 4K RMP entries before attempting to convert a 4K sub-page.
4909
*/
4910
if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
4911
/*
4912
* This shouldn't fail, but if it does, report it, but
4913
* still try to update RMP entry to shared and pray this
4914
* was a spurious error that can be addressed later.
4915
*/
4916
rc = snp_rmptable_psmash(pfn);
4917
WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
4918
pfn, rc);
4919
}
4920
4921
rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
4922
if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
4923
pfn, rc))
4924
goto next_pfn;
4925
4926
/*
4927
* SEV-ES avoids host/guest cache coherency issues through
4928
* WBNOINVD hooks issued via MMU notifiers during run-time, and
4929
* KVM's VM destroy path at shutdown. Those MMU notifier events
4930
* don't cover gmem since there is no requirement to map pages
4931
* to a HVA in order to use them for a running guest. While the
4932
* shutdown path would still likely cover things for SNP guests,
4933
* userspace may also free gmem pages during run-time via
4934
* hole-punching operations on the guest_memfd, so flush the
4935
* cache entries for these pages before free'ing them back to
4936
* the host.
4937
*/
4938
clflush_cache_range(__va(pfn_to_hpa(pfn)),
4939
use_2m_update ? PMD_SIZE : PAGE_SIZE);
4940
next_pfn:
4941
pfn += use_2m_update ? PTRS_PER_PMD : 1;
4942
cond_resched();
4943
}
4944
}
4945
4946
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
4947
{
4948
int level, rc;
4949
bool assigned;
4950
4951
if (!sev_snp_guest(kvm))
4952
return 0;
4953
4954
rc = snp_lookup_rmpentry(pfn, &assigned, &level);
4955
if (rc || !assigned)
4956
return PG_LEVEL_4K;
4957
4958
return level;
4959
}
4960
4961
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
4962
{
4963
struct vcpu_svm *svm = to_svm(vcpu);
4964
struct vmcb_save_area *vmsa;
4965
struct kvm_sev_info *sev;
4966
int error = 0;
4967
int ret;
4968
4969
if (!sev_es_guest(vcpu->kvm))
4970
return NULL;
4971
4972
/*
4973
* If the VMSA has not yet been encrypted, return a pointer to the
4974
* current un-encrypted VMSA.
4975
*/
4976
if (!vcpu->arch.guest_state_protected)
4977
return (struct vmcb_save_area *)svm->sev_es.vmsa;
4978
4979
sev = to_kvm_sev_info(vcpu->kvm);
4980
4981
/* Check if the SEV policy allows debugging */
4982
if (sev_snp_guest(vcpu->kvm)) {
4983
if (!(sev->policy & SNP_POLICY_DEBUG))
4984
return NULL;
4985
} else {
4986
if (sev->policy & SEV_POLICY_NODBG)
4987
return NULL;
4988
}
4989
4990
if (sev_snp_guest(vcpu->kvm)) {
4991
struct sev_data_snp_dbg dbg = {0};
4992
4993
vmsa = snp_alloc_firmware_page(__GFP_ZERO);
4994
if (!vmsa)
4995
return NULL;
4996
4997
dbg.gctx_paddr = __psp_pa(sev->snp_context);
4998
dbg.src_addr = svm->vmcb->control.vmsa_pa;
4999
dbg.dst_addr = __psp_pa(vmsa);
5000
5001
ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
5002
5003
/*
5004
* Return the target page to hypervisor ownership no matter what.
5005
* If this fails, the page can't be used, so leak it and don't
5006
* try to use it.
5007
*/
5008
if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
5009
return NULL;
5010
5011
if (ret) {
5012
pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
5013
ret, error, error);
5014
free_page((unsigned long)vmsa);
5015
5016
return NULL;
5017
}
5018
} else {
5019
struct sev_data_dbg dbg = {0};
5020
struct page *vmsa_page;
5021
5022
vmsa_page = alloc_page(GFP_KERNEL);
5023
if (!vmsa_page)
5024
return NULL;
5025
5026
vmsa = page_address(vmsa_page);
5027
5028
dbg.handle = sev->handle;
5029
dbg.src_addr = svm->vmcb->control.vmsa_pa;
5030
dbg.dst_addr = __psp_pa(vmsa);
5031
dbg.len = PAGE_SIZE;
5032
5033
ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
5034
if (ret) {
5035
pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
5036
ret, error, error);
5037
__free_page(vmsa_page);
5038
5039
return NULL;
5040
}
5041
}
5042
5043
return vmsa;
5044
}
5045
5046
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
5047
{
5048
/* If the VMSA has not yet been encrypted, nothing was allocated */
5049
if (!vcpu->arch.guest_state_protected || !vmsa)
5050
return;
5051
5052
free_page((unsigned long)vmsa);
5053
}
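
/*
 * Editor's sketch (hypothetical debug-only caller, not part of the driver):
 * sev_decrypt_vmsa() and sev_free_decrypted_vmsa() are intended to be used
 * as a pair, e.g. when dumping register state of a guest whose policy
 * permits debugging.
 */
static inline void example_dump_vmsa_rip(struct kvm_vcpu *vcpu)
{
	struct vmcb_save_area *vmsa = sev_decrypt_vmsa(vcpu);

	if (vmsa)
		pr_info("SEV: vCPU%d RIP=%#llx\n", vcpu->vcpu_id, vmsa->rip);

	sev_free_decrypted_vmsa(vcpu, vmsa);
}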
5054
5055