GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/kvm/nested.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Copyright (C) 2017 - Columbia University and Linaro Ltd.
4
* Author: Jintack Lim <[email protected]>
5
*/
6
7
#include <linux/bitfield.h>
8
#include <linux/kvm.h>
9
#include <linux/kvm_host.h>
10
11
#include <asm/fixmap.h>
12
#include <asm/kvm_arm.h>
13
#include <asm/kvm_emulate.h>
14
#include <asm/kvm_mmu.h>
15
#include <asm/kvm_nested.h>
16
#include <asm/sysreg.h>
17
18
#include "sys_regs.h"
19
20
struct vncr_tlb {
21
/* The guest's VNCR_EL2 */
22
u64 gva;
23
struct s1_walk_info wi;
24
struct s1_walk_result wr;
25
26
u64 hpa;
27
28
/* -1 when not mapped on a CPU */
29
int cpu;
30
31
/*
32
* true if the TLB is valid. Can only be changed with the
33
* mmu_lock held.
34
*/
35
bool valid;
36
};
37
38
/*
39
* Ratio of live shadow S2 MMUs per vcpu. This is a trade-off between
40
* memory usage and potential number of different sets of S2 PTs in
41
* the guests. Running out of S2 MMUs only affects performance (we
42
* will invalidate them more often).
43
*/
44
#define S2_MMU_PER_VCPU 2
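/*
* Rough sizing sketch (numbers purely illustrative): with S2_MMU_PER_VCPU
* at 2, kvm_vcpu_init_nested() below sizes the array as
* num_mmus = online_vcpus * S2_MMU_PER_VCPU, so a 4-vcpu VM ends up with
* 8 shadow S2 MMUs to rotate through.
*/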
45
46
void kvm_init_nested(struct kvm *kvm)
47
{
48
kvm->arch.nested_mmus = NULL;
49
kvm->arch.nested_mmus_size = 0;
50
atomic_set(&kvm->arch.vncr_map_count, 0);
51
}
52
53
static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
54
{
55
/*
56
* We only initialise the IPA range on the canonical MMU, which
57
* defines the contract between KVM and userspace on where the
58
* "hardware" is in the IPA space. This affects the validity of MMIO
59
* exits forwarded to userspace, for example.
60
*
61
* For nested S2s, we use the PARange as exposed to the guest, as it
62
* is allowed to use it at will to expose whatever memory map it
63
* wants to its own guests as it would be on real HW.
64
*/
65
return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm));
66
}
67
68
int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
69
{
70
struct kvm *kvm = vcpu->kvm;
71
struct kvm_s2_mmu *tmp;
72
int num_mmus, ret = 0;
73
74
if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) &&
75
!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
76
return -EINVAL;
77
78
if (!vcpu->arch.ctxt.vncr_array)
79
vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT |
80
__GFP_ZERO);
81
82
if (!vcpu->arch.ctxt.vncr_array)
83
return -ENOMEM;
84
85
/*
86
* Let's treat memory allocation failures as benign: If we fail to
87
* allocate anything, return an error and keep the allocated array
88
* alive. Userspace may try to recover by initializing the vcpu
89
* again, and there is no reason to affect the whole VM for this.
90
*/
91
num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU;
92
tmp = kvrealloc(kvm->arch.nested_mmus,
93
size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus),
94
GFP_KERNEL_ACCOUNT | __GFP_ZERO);
95
if (!tmp)
96
return -ENOMEM;
97
98
swap(kvm->arch.nested_mmus, tmp);
99
100
/*
101
* If we went through a reallocation, adjust the MMU back-pointers in
102
* the previously initialised kvm_pgtable structures.
103
*/
104
if (kvm->arch.nested_mmus != tmp)
105
for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
106
kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];
107
108
for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
109
ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);
110
111
if (ret) {
112
for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
113
kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);
114
115
free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
116
vcpu->arch.ctxt.vncr_array = NULL;
117
118
return ret;
119
}
120
121
kvm->arch.nested_mmus_size = num_mmus;
122
123
return 0;
124
}
125
126
struct s2_walk_info {
127
u64 baddr;
128
unsigned int max_oa_bits;
129
unsigned int pgshift;
130
unsigned int sl;
131
unsigned int t0sz;
132
bool be;
133
bool ha;
134
};
135
136
static u32 compute_fsc(int level, u32 fsc)
137
{
138
return fsc | (level & 0x3);
139
}
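/*
* For example, compute_fsc(2, ESR_ELx_FSC_FAULT) yields the level-2
* translation fault code; esr_s2_fault() below then merges such a code
* into the vcpu's ESR in place of the original FSC field.
*/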
140
141
static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
142
{
143
u32 esr;
144
145
esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
146
esr |= compute_fsc(level, fsc);
147
return esr;
148
}
149
150
static int get_ia_size(struct s2_walk_info *wi)
151
{
152
return 64 - wi->t0sz;
153
}
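/*
* Illustration: a guest VTCR_EL2.T0SZ of 24 gives a 64 - 24 = 40 bit
* input address (IPA) space for the nested stage-2 walk.
*/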
154
155
static int check_base_s2_limits(struct s2_walk_info *wi,
156
int level, int input_size, int stride)
157
{
158
int start_size, ia_size;
159
160
ia_size = get_ia_size(wi);
161
162
/* Check translation limits */
163
switch (BIT(wi->pgshift)) {
164
case SZ_64K:
165
if (level == 0 || (level == 1 && ia_size <= 42))
166
return -EFAULT;
167
break;
168
case SZ_16K:
169
if (level == 0 || (level == 1 && ia_size <= 40))
170
return -EFAULT;
171
break;
172
case SZ_4K:
173
if (level < 0 || (level == 0 && ia_size <= 42))
174
return -EFAULT;
175
break;
176
}
177
178
/* Check input size limits */
179
if (input_size > ia_size)
180
return -EFAULT;
181
182
/* Check number of entries in starting level table */
183
start_size = input_size - ((3 - level) * stride + wi->pgshift);
184
if (start_size < 1 || start_size > stride + 4)
185
return -EFAULT;
186
187
return 0;
188
}
189
190
/* Check if output is within boundaries */
191
static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
192
{
193
unsigned int output_size = wi->max_oa_bits;
194
195
if (output_size != 48 && (output & GENMASK_ULL(47, output_size)))
196
return -1;
197
198
return 0;
199
}
200
201
static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc,
202
struct s2_walk_info *wi)
203
{
204
u64 val;
205
int r;
206
207
r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
208
if (r)
209
return r;
210
211
/*
212
* Handle reversed descriptors if endianness differs between the
213
* host and the guest hypervisor.
214
*/
215
if (wi->be)
216
*desc = be64_to_cpu((__force __be64)val);
217
else
218
*desc = le64_to_cpu((__force __le64)val);
219
220
return 0;
221
}
222
223
static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new,
224
struct s2_walk_info *wi)
225
{
226
if (wi->be) {
227
old = (__force u64)cpu_to_be64(old);
228
new = (__force u64)cpu_to_be64(new);
229
} else {
230
old = (__force u64)cpu_to_le64(old);
231
new = (__force u64)cpu_to_le64(new);
232
}
233
234
return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
235
}
236
237
/*
238
* This is essentially a C version of the pseudocode from the ARM ARM
239
* AArch64.TranslationTableWalk function. I strongly recommend looking at
240
* that pseudocode when trying to understand this.
241
*
242
* Must be called with the kvm->srcu read lock held
243
*/
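/*
* A minimal worked example, with values chosen purely for illustration:
* for a 4K granule (pgshift = 12), stride = 9; a guest T0SZ of 24 gives a
* 40-bit input size, and SL0 selecting sl = 1 makes the walk start at
* level 1 (first_block_level = 1). The first iteration then resolves IPA
* bits [39:30], i.e. addr_top = 39 and addr_bottom = (3 - 1) * 9 + 12 = 30.
*/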
244
static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
245
struct s2_walk_info *wi, struct kvm_s2_trans *out)
246
{
247
int first_block_level, level, stride, input_size, base_lower_bound;
248
phys_addr_t base_addr;
249
unsigned int addr_top, addr_bottom;
250
u64 desc, new_desc; /* page table entry */
251
int ret;
252
phys_addr_t paddr;
253
254
switch (BIT(wi->pgshift)) {
255
default:
256
case SZ_64K:
257
case SZ_16K:
258
level = 3 - wi->sl;
259
first_block_level = 2;
260
break;
261
case SZ_4K:
262
level = 2 - wi->sl;
263
first_block_level = 1;
264
break;
265
}
266
267
stride = wi->pgshift - 3;
268
input_size = get_ia_size(wi);
269
if (input_size > 48 || input_size < 25)
270
return -EFAULT;
271
272
ret = check_base_s2_limits(wi, level, input_size, stride);
273
if (WARN_ON(ret))
274
return ret;
275
276
base_lower_bound = 3 + input_size - ((3 - level) * stride +
277
wi->pgshift);
278
base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);
279
280
if (check_output_size(wi, base_addr)) {
281
out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
282
return 1;
283
}
284
285
addr_top = input_size - 1;
286
287
while (1) {
288
phys_addr_t index;
289
290
addr_bottom = (3 - level) * stride + wi->pgshift;
291
index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
292
>> (addr_bottom - 3);
293
294
paddr = base_addr | index;
295
ret = read_guest_s2_desc(vcpu, paddr, &desc, wi);
296
if (ret < 0)
297
return ret;
298
299
new_desc = desc;
300
301
/* Check for valid descriptor at this point */
302
if (!(desc & KVM_PTE_VALID)) {
303
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
304
out->desc = desc;
305
return 1;
306
}
307
308
if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) {
309
if (level < 3)
310
break;
311
312
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
313
out->desc = desc;
314
return 1;
315
}
316
317
/* We're at the final level */
318
if (level == 3)
319
break;
320
321
if (check_output_size(wi, desc)) {
322
out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
323
out->desc = desc;
324
return 1;
325
}
326
327
base_addr = desc & GENMASK_ULL(47, wi->pgshift);
328
329
level += 1;
330
addr_top = addr_bottom - 1;
331
}
332
333
if (level < first_block_level) {
334
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
335
out->desc = desc;
336
return 1;
337
}
338
339
if (check_output_size(wi, desc)) {
340
out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
341
out->desc = desc;
342
return 1;
343
}
344
345
if (wi->ha)
346
new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
347
348
if (new_desc != desc) {
349
ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi);
350
if (ret)
351
return ret;
352
353
desc = new_desc;
354
}
355
356
if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
357
out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
358
out->desc = desc;
359
return 1;
360
}
361
362
addr_bottom += contiguous_bit_shift(desc, wi, level);
363
364
/* Calculate and return the result */
365
paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
366
(ipa & GENMASK_ULL(addr_bottom - 1, 0));
367
out->output = paddr;
368
out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
369
out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
370
out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
371
out->level = level;
372
out->desc = desc;
373
return 0;
374
}
375
376
static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
377
{
378
wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
379
380
switch (vtcr & VTCR_EL2_TG0_MASK) {
381
case VTCR_EL2_TG0_4K:
382
wi->pgshift = 12; break;
383
case VTCR_EL2_TG0_16K:
384
wi->pgshift = 14; break;
385
case VTCR_EL2_TG0_64K:
386
default: /* IMPDEF: treat any other value as 64k */
387
wi->pgshift = 16; break;
388
}
389
390
wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
391
/* Global limit for now, should eventually be per-VM */
392
wi->max_oa_bits = min(get_kvm_ipa_limit(),
393
ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false));
394
395
wi->ha = vtcr & VTCR_EL2_HA;
396
}
397
398
int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
399
struct kvm_s2_trans *result)
400
{
401
u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
402
struct s2_walk_info wi;
403
int ret;
404
405
result->esr = 0;
406
407
if (!vcpu_has_nv(vcpu))
408
return 0;
409
410
wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
411
412
vtcr_to_walk_info(vtcr, &wi);
413
414
wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
415
416
ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result);
417
if (ret)
418
result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
419
420
return ret;
421
}
422
423
static unsigned int ttl_to_size(u8 ttl)
424
{
425
int level = ttl & 3;
426
int gran = (ttl >> 2) & 3;
427
unsigned int max_size = 0;
428
429
switch (gran) {
430
case TLBI_TTL_TG_4K:
431
switch (level) {
432
case 0:
433
break;
434
case 1:
435
max_size = SZ_1G;
436
break;
437
case 2:
438
max_size = SZ_2M;
439
break;
440
case 3:
441
max_size = SZ_4K;
442
break;
443
}
444
break;
445
case TLBI_TTL_TG_16K:
446
switch (level) {
447
case 0:
448
case 1:
449
break;
450
case 2:
451
max_size = SZ_32M;
452
break;
453
case 3:
454
max_size = SZ_16K;
455
break;
456
}
457
break;
458
case TLBI_TTL_TG_64K:
459
switch (level) {
460
case 0:
461
case 1:
462
/* No 52bit IPA support */
463
break;
464
case 2:
465
max_size = SZ_512M;
466
break;
467
case 3:
468
max_size = SZ_64K;
469
break;
470
}
471
break;
472
default: /* No size information */
473
break;
474
}
475
476
return max_size;
477
}
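/*
* Two illustrative data points: ttl_to_size((TLBI_TTL_TG_4K << 2) | 2)
* returns SZ_2M (a level-2 block with a 4K granule), while a TTL of 0
* carries no size information and returns 0, forcing the caller to fall
* back to a worst-case invalidation range.
*/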
478
479
static u8 pgshift_level_to_ttl(u16 shift, u8 level)
480
{
481
u8 ttl;
482
483
switch(shift) {
484
case 12:
485
ttl = TLBI_TTL_TG_4K;
486
break;
487
case 14:
488
ttl = TLBI_TTL_TG_16K;
489
break;
490
case 16:
491
ttl = TLBI_TTL_TG_64K;
492
break;
493
default:
494
BUG();
495
}
496
497
ttl <<= 2;
498
ttl |= level & 3;
499
500
return ttl;
501
}
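/*
* Illustration: pgshift_level_to_ttl(12, 3) encodes a 4K granule, level-3
* mapping, i.e. (TLBI_TTL_TG_4K << 2) | 3, which ttl_to_size() above
* turns back into SZ_4K.
*/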
502
503
/*
504
* Compute the equivalent of the TTL field by parsing the shadow PT. The
505
* granule size is extracted from the cached VTCR_EL2.TG0 while the level is
506
* retrieved from the first entry carrying the level as a tag.
507
*/
508
static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
509
{
510
u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
511
kvm_pte_t pte;
512
u8 ttl, level;
513
514
lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock);
515
516
switch (vtcr & VTCR_EL2_TG0_MASK) {
517
case VTCR_EL2_TG0_4K:
518
ttl = (TLBI_TTL_TG_4K << 2);
519
break;
520
case VTCR_EL2_TG0_16K:
521
ttl = (TLBI_TTL_TG_16K << 2);
522
break;
523
case VTCR_EL2_TG0_64K:
524
default: /* IMPDEF: treat any other value as 64k */
525
ttl = (TLBI_TTL_TG_64K << 2);
526
break;
527
}
528
529
tmp = addr;
530
531
again:
532
/* Iteratively compute the block sizes for a particular granule size */
533
switch (vtcr & VTCR_EL2_TG0_MASK) {
534
case VTCR_EL2_TG0_4K:
535
if (sz < SZ_4K) sz = SZ_4K;
536
else if (sz < SZ_2M) sz = SZ_2M;
537
else if (sz < SZ_1G) sz = SZ_1G;
538
else sz = 0;
539
break;
540
case VTCR_EL2_TG0_16K:
541
if (sz < SZ_16K) sz = SZ_16K;
542
else if (sz < SZ_32M) sz = SZ_32M;
543
else sz = 0;
544
break;
545
case VTCR_EL2_TG0_64K:
546
default: /* IMPDEF: treat any other value as 64k */
547
if (sz < SZ_64K) sz = SZ_64K;
548
else if (sz < SZ_512M) sz = SZ_512M;
549
else sz = 0;
550
break;
551
}
552
553
if (sz == 0)
554
return 0;
555
556
tmp &= ~(sz - 1);
557
if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
558
goto again;
559
if (!(pte & PTE_VALID))
560
goto again;
561
level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
562
if (!level)
563
goto again;
564
565
ttl |= level;
566
567
/*
568
* We have now found some level information in the shadow S2. Check
569
* that the resulting range actually includes the original IPA.
570
*/
571
sz = ttl_to_size(ttl);
572
if (addr < (tmp + sz))
573
return ttl;
574
575
return 0;
576
}
577
578
unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val)
579
{
580
struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
581
unsigned long max_size;
582
u8 ttl;
583
584
ttl = FIELD_GET(TLBI_TTL_MASK, val);
585
586
if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) {
587
/* No TTL, check the shadow S2 for a hint */
588
u64 addr = (val & GENMASK_ULL(35, 0)) << 12;
589
ttl = get_guest_mapping_ttl(mmu, addr);
590
}
591
592
max_size = ttl_to_size(ttl);
593
594
if (!max_size) {
595
/* Compute the maximum extent of the invalidation */
596
switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) {
597
case VTCR_EL2_TG0_4K:
598
max_size = SZ_1G;
599
break;
600
case VTCR_EL2_TG0_16K:
601
max_size = SZ_32M;
602
break;
603
case VTCR_EL2_TG0_64K:
604
default: /* IMPDEF: treat any other value as 64k */
605
/*
606
* No, we do not support 52bit IPA in nested yet. Once
607
* we do, this should be 4TB.
608
*/
609
max_size = SZ_512M;
610
break;
611
}
612
}
613
614
WARN_ON(!max_size);
615
return max_size;
616
}
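/*
* Sketch of the fallback path: a TLBI carrying no TTL hint, on a shadow
* S2 that yields no usable level information either, ends up with the
* granule's worst case - e.g. SZ_1G for a 4K-granule VTCR_EL2.
*/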
617
618
/*
619
* We can have multiple *different* MMU contexts with the same VMID:
620
*
621
* - S2 being enabled or not, hence differing by the HCR_EL2.VM bit
622
*
623
* - Multiple vcpus using private S2s (huh huh...), hence differing by the
624
* VTTBR_EL2.BADDR address
625
*
626
* - A combination of the above...
627
*
628
* We can always identify which MMU context to pick at run-time. However,
629
* TLB invalidation involving a VMID must take action on all the TLBs using
630
* this particular VMID. This translates into applying the same invalidation
631
* operation to all the contexts that are using this VMID. Moar phun!
632
*/
633
void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
634
const union tlbi_info *info,
635
void (*tlbi_callback)(struct kvm_s2_mmu *,
636
const union tlbi_info *))
637
{
638
write_lock(&kvm->mmu_lock);
639
640
for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
641
struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
642
643
if (!kvm_s2_mmu_valid(mmu))
644
continue;
645
646
if (vmid == get_vmid(mmu->tlb_vttbr))
647
tlbi_callback(mmu, info);
648
}
649
650
write_unlock(&kvm->mmu_lock);
651
}
652
653
struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
654
{
655
struct kvm *kvm = vcpu->kvm;
656
bool nested_stage2_enabled;
657
u64 vttbr, vtcr, hcr;
658
659
lockdep_assert_held_write(&kvm->mmu_lock);
660
661
vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
662
vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
663
hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
664
665
nested_stage2_enabled = hcr & HCR_VM;
666
667
/* Don't consider the CnP bit for the vttbr match */
668
vttbr &= ~VTTBR_CNP_BIT;
669
670
/*
671
* Two possibilities when looking up a S2 MMU context:
672
*
673
* - either S2 is enabled in the guest, and we need a context that is
674
* S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
675
* which makes it safe from a TLB conflict perspective (a broken
676
* guest won't be able to generate them),
677
*
678
* - or S2 is disabled, and we need a context that is S2-disabled
679
* and matches the VMID only, as all TLBs are tagged by VMID even
680
* if S2 translation is disabled.
681
*/
682
for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
683
struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
684
685
if (!kvm_s2_mmu_valid(mmu))
686
continue;
687
688
if (nested_stage2_enabled &&
689
mmu->nested_stage2_enabled &&
690
vttbr == mmu->tlb_vttbr &&
691
vtcr == mmu->tlb_vtcr)
692
return mmu;
693
694
if (!nested_stage2_enabled &&
695
!mmu->nested_stage2_enabled &&
696
get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr))
697
return mmu;
698
}
699
return NULL;
700
}
701
702
static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
703
{
704
struct kvm *kvm = vcpu->kvm;
705
struct kvm_s2_mmu *s2_mmu;
706
int i;
707
708
lockdep_assert_held_write(&vcpu->kvm->mmu_lock);
709
710
s2_mmu = lookup_s2_mmu(vcpu);
711
if (s2_mmu)
712
goto out;
713
714
/*
715
* Make sure we don't always search from the same point, or we
716
* will always reuse a potentially active context, leaving
717
* free contexts unused.
718
*/
719
for (i = kvm->arch.nested_mmus_next;
720
i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
721
i++) {
722
s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];
723
724
if (atomic_read(&s2_mmu->refcnt) == 0)
725
break;
726
}
727
BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */
728
729
/* Set the scene for the next search */
730
kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;
731
732
/* Make sure we don't forget to do the laundry */
733
if (kvm_s2_mmu_valid(s2_mmu))
734
s2_mmu->pending_unmap = true;
735
736
/*
737
* The virtual VMID (modulo CnP) will be used as a key when matching
738
* an existing kvm_s2_mmu.
739
*
740
* We cache VTCR at allocation time, once and for all. It'd be great
741
* if the guest didn't screw that one up, as this is not very
742
* forgiving...
743
*/
744
s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
745
s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
746
s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
747
748
out:
749
atomic_inc(&s2_mmu->refcnt);
750
751
/*
752
* Set the vCPU request to perform an unmap, even if the pending unmap
753
* originates from another vCPU. This guarantees that the MMU has been
754
* completely unmapped before any vCPU actually uses it, and allows
755
* multiple vCPUs to lend a hand with completing the unmap.
756
*/
757
if (s2_mmu->pending_unmap)
758
kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu);
759
760
return s2_mmu;
761
}
762
763
void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
764
{
765
/* CnP being set denotes an invalid entry */
766
mmu->tlb_vttbr = VTTBR_CNP_BIT;
767
mmu->nested_stage2_enabled = false;
768
atomic_set(&mmu->refcnt, 0);
769
}
770
771
void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
772
{
773
/*
774
* If the vCPU kept its reference on the MMU after the last put,
775
* keep rolling with it.
776
*/
777
if (is_hyp_ctxt(vcpu)) {
778
if (!vcpu->arch.hw_mmu)
779
vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
780
} else {
781
if (!vcpu->arch.hw_mmu) {
782
scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
783
vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
784
}
785
786
if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV)
787
kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
788
}
789
}
790
791
void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
792
{
793
/* Unconditionally drop the VNCR mapping if we have one */
794
if (host_data_test_flag(L1_VNCR_MAPPED)) {
795
BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id());
796
BUG_ON(is_hyp_ctxt(vcpu));
797
798
clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu));
799
vcpu->arch.vncr_tlb->cpu = -1;
800
host_data_clear_flag(L1_VNCR_MAPPED);
801
atomic_dec(&vcpu->kvm->arch.vncr_map_count);
802
}
803
804
/*
805
* Keep a reference on the associated stage-2 MMU if the vCPU is
806
* scheduling out and not in WFI emulation, suggesting it is likely to
807
* reuse the MMU sometime soon.
808
*/
809
if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI))
810
return;
811
812
if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu))
813
atomic_dec(&vcpu->arch.hw_mmu->refcnt);
814
815
vcpu->arch.hw_mmu = NULL;
816
}
817
818
/*
819
* Returns non-zero if the permission fault is handled by injecting it to the next
820
* level hypervisor.
821
*/
822
int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
823
{
824
bool forward_fault = false;
825
826
trans->esr = 0;
827
828
if (!kvm_vcpu_trap_is_permission_fault(vcpu))
829
return 0;
830
831
if (kvm_vcpu_trap_is_iabt(vcpu)) {
832
if (vcpu_mode_priv(vcpu))
833
forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans);
834
else
835
forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans);
836
} else {
837
bool write_fault = kvm_is_write_fault(vcpu);
838
839
forward_fault = ((write_fault && !trans->writable) ||
840
(!write_fault && !trans->readable));
841
}
842
843
if (forward_fault)
844
trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
845
846
return forward_fault;
847
}
848
849
int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
850
{
851
vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
852
vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
853
854
return kvm_inject_nested_sync(vcpu, esr_el2);
855
}
856
857
static void invalidate_vncr(struct vncr_tlb *vt)
858
{
859
vt->valid = false;
860
if (vt->cpu != -1)
861
clear_fixmap(vncr_fixmap(vt->cpu));
862
}
863
864
static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
865
{
866
struct kvm_vcpu *vcpu;
867
unsigned long i;
868
869
lockdep_assert_held_write(&kvm->mmu_lock);
870
871
if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
872
return;
873
874
kvm_for_each_vcpu(i, vcpu, kvm) {
875
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
876
u64 ipa_start, ipa_end, ipa_size;
877
878
/*
879
* Careful here: We end up here from an MMU notifier,
880
* and this can race against a vcpu not being onlined
881
* yet, without the pseudo-TLB being allocated.
882
*
883
* Skip those, as they obviously don't participate in
884
* the invalidation at this stage.
885
*/
886
if (!vt)
887
continue;
888
889
if (!vt->valid)
890
continue;
891
892
ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
893
vt->wr.level));
894
ipa_start = vt->wr.pa & ~(ipa_size - 1);
895
ipa_end = ipa_start + ipa_size;
896
897
if (ipa_end <= start || ipa_start >= end)
898
continue;
899
900
invalidate_vncr(vt);
901
}
902
}
903
904
struct s1e2_tlbi_scope {
905
enum {
906
TLBI_ALL,
907
TLBI_VA,
908
TLBI_VAA,
909
TLBI_ASID,
910
} type;
911
912
u16 asid;
913
u64 va;
914
u64 size;
915
};
916
917
static void invalidate_vncr_va(struct kvm *kvm,
918
struct s1e2_tlbi_scope *scope)
919
{
920
struct kvm_vcpu *vcpu;
921
unsigned long i;
922
923
lockdep_assert_held_write(&kvm->mmu_lock);
924
925
kvm_for_each_vcpu(i, vcpu, kvm) {
926
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
927
u64 va_start, va_end, va_size;
928
929
if (!vt->valid)
930
continue;
931
932
va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
933
vt->wr.level));
934
va_start = vt->gva & ~(va_size - 1);
935
va_end = va_start + va_size;
936
937
switch (scope->type) {
938
case TLBI_ALL:
939
break;
940
941
case TLBI_VA:
942
if (va_end <= scope->va ||
943
va_start >= (scope->va + scope->size))
944
continue;
945
if (vt->wr.nG && vt->wr.asid != scope->asid)
946
continue;
947
break;
948
949
case TLBI_VAA:
950
if (va_end <= scope->va ||
951
va_start >= (scope->va + scope->size))
952
continue;
953
break;
954
955
case TLBI_ASID:
956
if (!vt->wr.nG || vt->wr.asid != scope->asid)
957
continue;
958
break;
959
}
960
961
invalidate_vncr(vt);
962
}
963
}
964
965
#define tlbi_va_s1_to_va(v) (u64)sign_extend64((v) << 12, 48)
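/*
* Reading of the macro above (illustrative): the TLBI operand holds the
* VA shifted right by 12 bits, so shifting it back up and sign-extending
* from bit 48 reconstructs a canonical address in either VA range.
*/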
966
967
static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
968
struct s1e2_tlbi_scope *scope)
969
{
970
switch (inst) {
971
case OP_TLBI_ALLE2:
972
case OP_TLBI_ALLE2IS:
973
case OP_TLBI_ALLE2OS:
974
case OP_TLBI_VMALLE1:
975
case OP_TLBI_VMALLE1IS:
976
case OP_TLBI_VMALLE1OS:
977
case OP_TLBI_ALLE2NXS:
978
case OP_TLBI_ALLE2ISNXS:
979
case OP_TLBI_ALLE2OSNXS:
980
case OP_TLBI_VMALLE1NXS:
981
case OP_TLBI_VMALLE1ISNXS:
982
case OP_TLBI_VMALLE1OSNXS:
983
scope->type = TLBI_ALL;
984
break;
985
case OP_TLBI_VAE2:
986
case OP_TLBI_VAE2IS:
987
case OP_TLBI_VAE2OS:
988
case OP_TLBI_VAE1:
989
case OP_TLBI_VAE1IS:
990
case OP_TLBI_VAE1OS:
991
case OP_TLBI_VAE2NXS:
992
case OP_TLBI_VAE2ISNXS:
993
case OP_TLBI_VAE2OSNXS:
994
case OP_TLBI_VAE1NXS:
995
case OP_TLBI_VAE1ISNXS:
996
case OP_TLBI_VAE1OSNXS:
997
case OP_TLBI_VALE2:
998
case OP_TLBI_VALE2IS:
999
case OP_TLBI_VALE2OS:
1000
case OP_TLBI_VALE1:
1001
case OP_TLBI_VALE1IS:
1002
case OP_TLBI_VALE1OS:
1003
case OP_TLBI_VALE2NXS:
1004
case OP_TLBI_VALE2ISNXS:
1005
case OP_TLBI_VALE2OSNXS:
1006
case OP_TLBI_VALE1NXS:
1007
case OP_TLBI_VALE1ISNXS:
1008
case OP_TLBI_VALE1OSNXS:
1009
scope->type = TLBI_VA;
1010
scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
1011
if (!scope->size)
1012
scope->size = SZ_1G;
1013
scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
1014
scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
1015
break;
1016
case OP_TLBI_ASIDE1:
1017
case OP_TLBI_ASIDE1IS:
1018
case OP_TLBI_ASIDE1OS:
1019
case OP_TLBI_ASIDE1NXS:
1020
case OP_TLBI_ASIDE1ISNXS:
1021
case OP_TLBI_ASIDE1OSNXS:
1022
scope->type = TLBI_ASID;
1023
scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
1024
break;
1025
case OP_TLBI_VAAE1:
1026
case OP_TLBI_VAAE1IS:
1027
case OP_TLBI_VAAE1OS:
1028
case OP_TLBI_VAAE1NXS:
1029
case OP_TLBI_VAAE1ISNXS:
1030
case OP_TLBI_VAAE1OSNXS:
1031
case OP_TLBI_VAALE1:
1032
case OP_TLBI_VAALE1IS:
1033
case OP_TLBI_VAALE1OS:
1034
case OP_TLBI_VAALE1NXS:
1035
case OP_TLBI_VAALE1ISNXS:
1036
case OP_TLBI_VAALE1OSNXS:
1037
scope->type = TLBI_VAA;
1038
scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
1039
if (!scope->size)
1040
scope->size = SZ_1G;
1041
scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
1042
break;
1043
case OP_TLBI_RVAE2:
1044
case OP_TLBI_RVAE2IS:
1045
case OP_TLBI_RVAE2OS:
1046
case OP_TLBI_RVAE1:
1047
case OP_TLBI_RVAE1IS:
1048
case OP_TLBI_RVAE1OS:
1049
case OP_TLBI_RVAE2NXS:
1050
case OP_TLBI_RVAE2ISNXS:
1051
case OP_TLBI_RVAE2OSNXS:
1052
case OP_TLBI_RVAE1NXS:
1053
case OP_TLBI_RVAE1ISNXS:
1054
case OP_TLBI_RVAE1OSNXS:
1055
case OP_TLBI_RVALE2:
1056
case OP_TLBI_RVALE2IS:
1057
case OP_TLBI_RVALE2OS:
1058
case OP_TLBI_RVALE1:
1059
case OP_TLBI_RVALE1IS:
1060
case OP_TLBI_RVALE1OS:
1061
case OP_TLBI_RVALE2NXS:
1062
case OP_TLBI_RVALE2ISNXS:
1063
case OP_TLBI_RVALE2OSNXS:
1064
case OP_TLBI_RVALE1NXS:
1065
case OP_TLBI_RVALE1ISNXS:
1066
case OP_TLBI_RVALE1OSNXS:
1067
scope->type = TLBI_VA;
1068
scope->va = decode_range_tlbi(val, &scope->size, &scope->asid);
1069
break;
1070
case OP_TLBI_RVAAE1:
1071
case OP_TLBI_RVAAE1IS:
1072
case OP_TLBI_RVAAE1OS:
1073
case OP_TLBI_RVAAE1NXS:
1074
case OP_TLBI_RVAAE1ISNXS:
1075
case OP_TLBI_RVAAE1OSNXS:
1076
case OP_TLBI_RVAALE1:
1077
case OP_TLBI_RVAALE1IS:
1078
case OP_TLBI_RVAALE1OS:
1079
case OP_TLBI_RVAALE1NXS:
1080
case OP_TLBI_RVAALE1ISNXS:
1081
case OP_TLBI_RVAALE1OSNXS:
1082
scope->type = TLBI_VAA;
1083
scope->va = decode_range_tlbi(val, &scope->size, NULL);
1084
break;
1085
}
1086
}
1087
1088
void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val)
1089
{
1090
struct s1e2_tlbi_scope scope = {};
1091
1092
compute_s1_tlbi_range(vcpu, inst, val, &scope);
1093
1094
guard(write_lock)(&vcpu->kvm->mmu_lock);
1095
invalidate_vncr_va(vcpu->kvm, &scope);
1096
}
1097
1098
void kvm_nested_s2_wp(struct kvm *kvm)
1099
{
1100
int i;
1101
1102
lockdep_assert_held_write(&kvm->mmu_lock);
1103
1104
for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
1105
struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
1106
1107
if (kvm_s2_mmu_valid(mmu))
1108
kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu));
1109
}
1110
1111
kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
1112
}
1113
1114
void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
1115
{
1116
int i;
1117
1118
lockdep_assert_held_write(&kvm->mmu_lock);
1119
1120
for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
1121
struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
1122
1123
if (kvm_s2_mmu_valid(mmu))
1124
kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
1125
}
1126
1127
kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
1128
}
1129
1130
void kvm_nested_s2_flush(struct kvm *kvm)
1131
{
1132
int i;
1133
1134
lockdep_assert_held_write(&kvm->mmu_lock);
1135
1136
for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
1137
struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
1138
1139
if (kvm_s2_mmu_valid(mmu))
1140
kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
1141
}
1142
}
1143
1144
void kvm_arch_flush_shadow_all(struct kvm *kvm)
1145
{
1146
int i;
1147
1148
for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
1149
struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
1150
1151
if (!WARN_ON(atomic_read(&mmu->refcnt)))
1152
kvm_free_stage2_pgd(mmu);
1153
}
1154
kvfree(kvm->arch.nested_mmus);
1155
kvm->arch.nested_mmus = NULL;
1156
kvm->arch.nested_mmus_size = 0;
1157
kvm_uninit_stage2_mmu(kvm);
1158
}
1159
1160
/*
1161
* Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter:
1162
*
1163
* - We introduce an internal representation of a vcpu-private TLB,
1164
* representing the mapping between the guest VA contained in VNCR_EL2,
1165
* the IPA the guest's EL2 PTs point to, and the actual PA this lives at.
1166
*
1167
* - On translation fault from a nested VNCR access, we create such a TLB.
1168
* If there is no mapping to describe, the guest inherits the fault.
1169
* Crucially, no actual mapping is done at this stage.
1170
*
1171
* - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above
1172
* TLB exists, we map it in the fixmap for this CPU, and run with it. We
1173
* have to respect the permissions dictated by the guest, but not the
1174
* memory type (FWB is a must).
1175
*
1176
* - Note that we usually don't do a vcpu_load() on the back of a fault
1177
* (unless we are preempted), so the resolution of a translation fault
1178
* must go via a request that will map the VNCR page in the fixmap.
1179
* vcpu_load() might as well use the same mechanism.
1180
*
1181
* - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was
1182
* mapped, we unmap it. Yes it is that simple. The TLB still exists
1183
* though, and may be reused at a later load.
1184
*
1185
* - On permission fault, we simply forward the fault to the guest's EL2.
1186
* Get out of my way.
1187
*
1188
* - On any TLBI for the EL2&0 translation regime, we must find any TLB that
1189
* intersects with the TLBI request, invalidate it, and unmap the page
1190
* from the fixmap. Because we need to look at all the vcpu-private TLBs,
1191
* this requires some wide-ranging locking to ensure that nothing races
1192
* against it. This may require some refcounting to avoid the search when
1193
* no such TLB is present.
1194
*
1195
* - On MMU notifiers, we must invalidate our TLB in a similar way, but
1196
* looking at the IPA instead. The funny part is that there may not be a
1197
* stage-2 mapping for this page if L1 hasn't accessed it using LD/ST
1198
* instructions.
1199
*/
1200
1201
int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu)
1202
{
1203
if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
1204
return 0;
1205
1206
vcpu->arch.vncr_tlb = kzalloc(sizeof(*vcpu->arch.vncr_tlb),
1207
GFP_KERNEL_ACCOUNT);
1208
if (!vcpu->arch.vncr_tlb)
1209
return -ENOMEM;
1210
1211
return 0;
1212
}
1213
1214
static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
1215
{
1216
return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
1217
}
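/*
* Illustration: sign_extend64(v, 48) treats bit 48 as the sign bit, so a
* guest VNCR_EL2 pointing into the upper VA range comes back with bits
* [63:49] set, matching the canonical form used by tlbi_va_s1_to_va()
* when TLBIs are matched against this address.
*/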
1218
1219
static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
1220
{
1221
struct kvm_memory_slot *memslot;
1222
bool write_fault, writable;
1223
unsigned long mmu_seq;
1224
struct vncr_tlb *vt;
1225
struct page *page;
1226
u64 va, pfn, gfn;
1227
int ret;
1228
1229
vt = vcpu->arch.vncr_tlb;
1230
1231
/*
1232
* If we're about to walk the EL2 S1 PTs, we must invalidate the
1233
* current TLB, as it could be sampled from another vcpu doing a
1234
* TLBI *IS. A real CPU wouldn't do that, but we only keep a single
1235
* translation, so not much of a choice.
1236
*
1237
* We also prepare the next walk whilst we're at it.
1238
*/
1239
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
1240
invalidate_vncr(vt);
1241
1242
vt->wi = (struct s1_walk_info) {
1243
.regime = TR_EL20,
1244
.as_el0 = false,
1245
.pan = false,
1246
};
1247
vt->wr = (struct s1_walk_result){};
1248
}
1249
1250
guard(srcu)(&vcpu->kvm->srcu);
1251
1252
va = read_vncr_el2(vcpu);
1253
1254
ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va);
1255
if (ret)
1256
return ret;
1257
1258
write_fault = kvm_is_write_fault(vcpu);
1259
1260
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
1261
smp_rmb();
1262
1263
gfn = vt->wr.pa >> PAGE_SHIFT;
1264
memslot = gfn_to_memslot(vcpu->kvm, gfn);
1265
if (!memslot)
1266
return -EFAULT;
1267
1268
*is_gmem = kvm_slot_has_gmem(memslot);
1269
if (!*is_gmem) {
1270
pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
1271
&writable, &page);
1272
if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
1273
return -EFAULT;
1274
} else {
1275
ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
1276
if (ret) {
1277
kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
1278
write_fault, false, false);
1279
return ret;
1280
}
1281
}
1282
1283
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
1284
if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
1285
return -EAGAIN;
1286
1287
vt->gva = va;
1288
vt->hpa = pfn << PAGE_SHIFT;
1289
vt->valid = true;
1290
vt->cpu = -1;
1291
1292
kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
1293
kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
1294
}
1295
1296
if (vt->wr.pw)
1297
mark_page_dirty(vcpu->kvm, gfn);
1298
1299
return 0;
1300
}
1301
1302
static void inject_vncr_perm(struct kvm_vcpu *vcpu)
1303
{
1304
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
1305
u64 esr = kvm_vcpu_get_esr(vcpu);
1306
1307
/* Adjust the fault level to reflect that of the guest's */
1308
esr &= ~ESR_ELx_FSC;
1309
esr |= FIELD_PREP(ESR_ELx_FSC,
1310
ESR_ELx_FSC_PERM_L(vt->wr.level));
1311
1312
kvm_inject_nested_sync(vcpu, esr);
1313
}
1314
1315
static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
1316
{
1317
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
1318
1319
lockdep_assert_held_read(&vcpu->kvm->mmu_lock);
1320
1321
if (!vt->valid)
1322
return false;
1323
1324
if (read_vncr_el2(vcpu) != vt->gva)
1325
return false;
1326
1327
if (vt->wr.nG) {
1328
u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
1329
u64 ttbr = ((tcr & TCR_A1) ?
1330
vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
1331
vcpu_read_sys_reg(vcpu, TTBR0_EL2));
1332
u16 asid;
1333
1334
asid = FIELD_GET(TTBR_ASID_MASK, ttbr);
1335
if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
1336
!(tcr & TCR_ASID16))
1337
asid &= GENMASK(7, 0);
1338
1339
return asid == vt->wr.asid;
1340
}
1341
1342
return true;
1343
}
1344
1345
int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
1346
{
1347
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
1348
u64 esr = kvm_vcpu_get_esr(vcpu);
1349
1350
WARN_ON_ONCE(!(esr & ESR_ELx_VNCR));
1351
1352
if (kvm_vcpu_abt_issea(vcpu))
1353
return kvm_handle_guest_sea(vcpu);
1354
1355
if (esr_fsc_is_permission_fault(esr)) {
1356
inject_vncr_perm(vcpu);
1357
} else if (esr_fsc_is_translation_fault(esr)) {
1358
bool valid, is_gmem = false;
1359
int ret;
1360
1361
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
1362
valid = kvm_vncr_tlb_lookup(vcpu);
1363
1364
if (!valid)
1365
ret = kvm_translate_vncr(vcpu, &is_gmem);
1366
else
1367
ret = -EPERM;
1368
1369
switch (ret) {
1370
case -EAGAIN:
1371
/* Let's try again... */
1372
break;
1373
case -ENOMEM:
1374
/*
1375
* For guest_memfd, this indicates that it failed to
1376
* create a folio to back the memory. Inform userspace.
1377
*/
1378
if (is_gmem)
1379
return 0;
1380
/* Otherwise, let's try again... */
1381
break;
1382
case -EFAULT:
1383
case -EIO:
1384
case -EHWPOISON:
1385
if (is_gmem)
1386
return 0;
1387
fallthrough;
1388
case -EINVAL:
1389
case -ENOENT:
1390
case -EACCES:
1391
/*
1392
* Translation failed, inject the corresponding
1393
* exception back to EL2.
1394
*/
1395
BUG_ON(!vt->wr.failed);
1396
1397
esr &= ~ESR_ELx_FSC;
1398
esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst);
1399
1400
kvm_inject_nested_sync(vcpu, esr);
1401
break;
1402
case -EPERM:
1403
/* Hack to deal with POE until we get kernel support */
1404
inject_vncr_perm(vcpu);
1405
break;
1406
case 0:
1407
break;
1408
}
1409
} else {
1410
WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr);
1411
}
1412
1413
return 1;
1414
}
1415
1416
static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
1417
{
1418
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
1419
pgprot_t prot;
1420
1421
guard(preempt)();
1422
guard(read_lock)(&vcpu->kvm->mmu_lock);
1423
1424
/*
1425
* The request to map VNCR may have raced against some other
1426
* event, such as an interrupt, and may not be valid anymore.
1427
*/
1428
if (is_hyp_ctxt(vcpu))
1429
return;
1430
1431
/*
1432
* Check that the pseudo-TLB is valid and that VNCR_EL2 still
1433
* contains the expected value. If it doesn't, we simply bail out
1434
* without a mapping -- a transformed MSR/MRS will generate the
1435
* fault and allow us to populate the pseudo-TLB.
1436
*/
1437
if (!vt->valid)
1438
return;
1439
1440
if (read_vncr_el2(vcpu) != vt->gva)
1441
return;
1442
1443
if (vt->wr.nG) {
1444
u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
1445
u64 ttbr = ((tcr & TCR_A1) ?
1446
vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
1447
vcpu_read_sys_reg(vcpu, TTBR0_EL2));
1448
u16 asid;
1449
1450
asid = FIELD_GET(TTBR_ASID_MASK, ttbr);
1451
if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
1452
!(tcr & TCR_ASID16))
1453
asid &= GENMASK(7, 0);
1454
1455
if (asid != vt->wr.asid)
1456
return;
1457
}
1458
1459
vt->cpu = smp_processor_id();
1460
1461
if (vt->wr.pw && vt->wr.pr)
1462
prot = PAGE_KERNEL;
1463
else if (vt->wr.pr)
1464
prot = PAGE_KERNEL_RO;
1465
else
1466
prot = PAGE_NONE;
1467
1468
/*
1469
* We can't map write-only (or no permission at all) in the kernel,
1470
* but the guest can do it if using POE, so we'll have to turn a
1471
* translation fault into a permission fault at runtime.
1472
* FIXME: WO doesn't work at all, need POE support in the kernel.
1473
*/
1474
if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) {
1475
__set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot);
1476
host_data_set_flag(L1_VNCR_MAPPED);
1477
atomic_inc(&vcpu->kvm->arch.vncr_map_count);
1478
}
1479
}
1480
1481
#define has_tgran_2(__r, __sz) \
1482
({ \
1483
u64 _s1, _s2, _mmfr0 = __r; \
1484
\
1485
_s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \
1486
TGRAN##__sz##_2, _mmfr0); \
1487
\
1488
_s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \
1489
TGRAN##__sz, _mmfr0); \
1490
\
1491
((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \
1492
_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \
1493
(_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \
1494
_s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \
1495
})
1496
/*
1497
* Our emulated CPU doesn't support all the possible features. For the
1498
* sake of simplicity (and probably mental sanity), wipe out a number
1499
* of feature bits we don't intend to support for the time being.
1500
* This list should get updated as new features get added to the NV
1501
* support, and new extensions to the architecture.
1502
*/
1503
u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
1504
{
1505
u64 orig_val = val;
1506
1507
switch (reg) {
1508
case SYS_ID_AA64ISAR0_EL1:
1509
/* Support everything but TME */
1510
val &= ~ID_AA64ISAR0_EL1_TME;
1511
break;
1512
1513
case SYS_ID_AA64ISAR1_EL1:
1514
/* Support everything but LS64 and Spec Invalidation */
1515
val &= ~(ID_AA64ISAR1_EL1_LS64 |
1516
ID_AA64ISAR1_EL1_SPECRES);
1517
break;
1518
1519
case SYS_ID_AA64PFR0_EL1:
1520
/* No RME, AMU, MPAM, or S-EL2 */
1521
val &= ~(ID_AA64PFR0_EL1_RME |
1522
ID_AA64PFR0_EL1_AMU |
1523
ID_AA64PFR0_EL1_MPAM |
1524
ID_AA64PFR0_EL1_SEL2 |
1525
ID_AA64PFR0_EL1_EL3 |
1526
ID_AA64PFR0_EL1_EL2 |
1527
ID_AA64PFR0_EL1_EL1 |
1528
ID_AA64PFR0_EL1_EL0);
1529
/* 64bit only at any EL */
1530
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP);
1531
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP);
1532
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP);
1533
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP);
1534
break;
1535
1536
case SYS_ID_AA64PFR1_EL1:
1537
/* Only support BTI, SSBS, CSV2_frac */
1538
val &= ~(ID_AA64PFR1_EL1_PFAR |
1539
ID_AA64PFR1_EL1_MTEX |
1540
ID_AA64PFR1_EL1_THE |
1541
ID_AA64PFR1_EL1_GCS |
1542
ID_AA64PFR1_EL1_MTE_frac |
1543
ID_AA64PFR1_EL1_NMI |
1544
ID_AA64PFR1_EL1_SME |
1545
ID_AA64PFR1_EL1_RES0 |
1546
ID_AA64PFR1_EL1_MPAM_frac |
1547
ID_AA64PFR1_EL1_MTE);
1548
break;
1549
1550
case SYS_ID_AA64MMFR0_EL1:
1551
/* Hide ExS, Secure Memory */
1552
val &= ~(ID_AA64MMFR0_EL1_EXS |
1553
ID_AA64MMFR0_EL1_TGRAN4_2 |
1554
ID_AA64MMFR0_EL1_TGRAN16_2 |
1555
ID_AA64MMFR0_EL1_TGRAN64_2 |
1556
ID_AA64MMFR0_EL1_SNSMEM);
1557
1558
/* Hide CNTPOFF if present */
1559
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP);
1560
1561
/* Disallow unsupported S2 page sizes */
1562
switch (PAGE_SIZE) {
1563
case SZ_64K:
1564
val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI);
1565
fallthrough;
1566
case SZ_16K:
1567
val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI);
1568
fallthrough;
1569
case SZ_4K:
1570
/* Support everything */
1571
break;
1572
}
1573
1574
/*
1575
* Since we can't support a guest S2 page size smaller
1576
* than the host's own page size (due to KVM only
1577
* populating its own S2 using the kernel's page
1578
* size), advertise the limitation using FEAT_GTG.
1579
*/
1580
switch (PAGE_SIZE) {
1581
case SZ_4K:
1582
if (has_tgran_2(orig_val, 4))
1583
val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
1584
fallthrough;
1585
case SZ_16K:
1586
if (has_tgran_2(orig_val, 16))
1587
val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
1588
fallthrough;
1589
case SZ_64K:
1590
if (has_tgran_2(orig_val, 64))
1591
val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
1592
break;
1593
}
1594
1595
/* Cap PARange to 48bits */
1596
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48);
1597
break;
1598
1599
case SYS_ID_AA64MMFR1_EL1:
1600
val &= ~(ID_AA64MMFR1_EL1_CMOW |
1601
ID_AA64MMFR1_EL1_nTLBPA |
1602
ID_AA64MMFR1_EL1_ETS);
1603
1604
/* FEAT_E2H0 implies no VHE */
1605
if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
1606
val &= ~ID_AA64MMFR1_EL1_VH;
1607
1608
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF);
1609
break;
1610
1611
case SYS_ID_AA64MMFR2_EL1:
1612
val &= ~(ID_AA64MMFR2_EL1_BBM |
1613
ID_AA64MMFR2_EL1_TTL |
1614
GENMASK_ULL(47, 44) |
1615
ID_AA64MMFR2_EL1_ST |
1616
ID_AA64MMFR2_EL1_CCIDX |
1617
ID_AA64MMFR2_EL1_VARange);
1618
1619
/* Force TTL support */
1620
val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP);
1621
break;
1622
1623
case SYS_ID_AA64MMFR4_EL1:
1624
/*
1625
* You get EITHER
1626
*
1627
* - FEAT_VHE without FEAT_E2H0
1628
* - FEAT_NV limited to FEAT_NV2
1629
* - HCR_EL2.NV1 being RES0
1630
*
1631
* OR
1632
*
1633
* - FEAT_E2H0 without FEAT_VHE nor FEAT_NV
1634
*
1635
* Life is too short for anything else.
1636
*/
1637
if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) {
1638
val = 0;
1639
} else {
1640
val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY);
1641
val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1);
1642
}
1643
break;
1644
1645
case SYS_ID_AA64DFR0_EL1:
1646
/* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
1647
val &= ~(ID_AA64DFR0_EL1_ExtTrcBuff |
1648
ID_AA64DFR0_EL1_BRBE |
1649
ID_AA64DFR0_EL1_MTPMU |
1650
ID_AA64DFR0_EL1_TraceBuffer |
1651
ID_AA64DFR0_EL1_TraceFilt |
1652
ID_AA64DFR0_EL1_PMSVer |
1653
ID_AA64DFR0_EL1_CTX_CMPs |
1654
ID_AA64DFR0_EL1_SEBEP |
1655
ID_AA64DFR0_EL1_PMSS |
1656
ID_AA64DFR0_EL1_TraceVer);
1657
1658
/*
1659
* FEAT_Debugv8p9 requires support for extended breakpoints /
1660
* watchpoints.
1661
*/
1662
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
1663
break;
1664
}
1665
1666
return val;
1667
}
1668
1669
u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu,
1670
enum vcpu_sysreg sr, u64 v)
1671
{
1672
struct kvm_sysreg_masks *masks;
1673
1674
masks = vcpu->kvm->arch.sysreg_masks;
1675
1676
if (masks) {
1677
sr -= __SANITISED_REG_START__;
1678
1679
v &= ~masks->mask[sr].res0;
1680
v |= masks->mask[sr].res1;
1681
}
1682
1683
return v;
1684
}
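/*
* In other words (illustrative): for a sanitised register, every res0 bit
* reads back as 0 and every res1 bit reads back as 1, i.e.
* v = (v & ~res0) | res1, with the per-register masks computed in
* kvm_init_nv_sysregs() below.
*/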
1685
1686
static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1)
1687
{
1688
int i = sr - __SANITISED_REG_START__;
1689
1690
BUILD_BUG_ON(!__builtin_constant_p(sr));
1691
BUILD_BUG_ON(sr < __SANITISED_REG_START__);
1692
BUILD_BUG_ON(sr >= NR_SYS_REGS);
1693
1694
kvm->arch.sysreg_masks->mask[i].res0 = res0;
1695
kvm->arch.sysreg_masks->mask[i].res1 = res1;
1696
}
1697
1698
int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
1699
{
1700
struct kvm *kvm = vcpu->kvm;
1701
u64 res0, res1;
1702
1703
lockdep_assert_held(&kvm->arch.config_lock);
1704
1705
if (kvm->arch.sysreg_masks)
1706
goto out;
1707
1708
kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)),
1709
GFP_KERNEL_ACCOUNT);
1710
if (!kvm->arch.sysreg_masks)
1711
return -ENOMEM;
1712
1713
/* VTTBR_EL2 */
1714
res0 = res1 = 0;
1715
if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16))
1716
res0 |= GENMASK(63, 56);
1717
if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP))
1718
res0 |= VTTBR_CNP_BIT;
1719
set_sysreg_masks(kvm, VTTBR_EL2, res0, res1);
1720
1721
/* VTCR_EL2 */
1722
res0 = GENMASK(63, 32) | GENMASK(30, 20);
1723
res1 = BIT(31);
1724
set_sysreg_masks(kvm, VTCR_EL2, res0, res1);
1725
1726
/* VMPIDR_EL2 */
1727
res0 = GENMASK(63, 40) | GENMASK(30, 24);
1728
res1 = BIT(31);
1729
set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1);
1730
1731
/* HCR_EL2 */
1732
get_reg_fixed_bits(kvm, HCR_EL2, &res0, &res1);
1733
set_sysreg_masks(kvm, HCR_EL2, res0, res1);
1734
1735
/* HCRX_EL2 */
1736
get_reg_fixed_bits(kvm, HCRX_EL2, &res0, &res1);
1737
set_sysreg_masks(kvm, HCRX_EL2, res0, res1);
1738
1739
/* HFG[RW]TR_EL2 */
1740
get_reg_fixed_bits(kvm, HFGRTR_EL2, &res0, &res1);
1741
set_sysreg_masks(kvm, HFGRTR_EL2, res0, res1);
1742
get_reg_fixed_bits(kvm, HFGWTR_EL2, &res0, &res1);
1743
set_sysreg_masks(kvm, HFGWTR_EL2, res0, res1);
1744
1745
/* HDFG[RW]TR_EL2 */
1746
get_reg_fixed_bits(kvm, HDFGRTR_EL2, &res0, &res1);
1747
set_sysreg_masks(kvm, HDFGRTR_EL2, res0, res1);
1748
get_reg_fixed_bits(kvm, HDFGWTR_EL2, &res0, &res1);
1749
set_sysreg_masks(kvm, HDFGWTR_EL2, res0, res1);
1750
1751
/* HFGITR_EL2 */
1752
get_reg_fixed_bits(kvm, HFGITR_EL2, &res0, &res1);
1753
set_sysreg_masks(kvm, HFGITR_EL2, res0, res1);
1754
1755
/* HAFGRTR_EL2 - not a lot to see here */
1756
get_reg_fixed_bits(kvm, HAFGRTR_EL2, &res0, &res1);
1757
set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1);
1758
1759
/* HFG[RW]TR2_EL2 */
1760
get_reg_fixed_bits(kvm, HFGRTR2_EL2, &res0, &res1);
1761
set_sysreg_masks(kvm, HFGRTR2_EL2, res0, res1);
1762
get_reg_fixed_bits(kvm, HFGWTR2_EL2, &res0, &res1);
1763
set_sysreg_masks(kvm, HFGWTR2_EL2, res0, res1);
1764
1765
/* HDFG[RW]TR2_EL2 */
1766
get_reg_fixed_bits(kvm, HDFGRTR2_EL2, &res0, &res1);
1767
set_sysreg_masks(kvm, HDFGRTR2_EL2, res0, res1);
1768
get_reg_fixed_bits(kvm, HDFGWTR2_EL2, &res0, &res1);
1769
set_sysreg_masks(kvm, HDFGWTR2_EL2, res0, res1);
1770
1771
/* HFGITR2_EL2 */
1772
get_reg_fixed_bits(kvm, HFGITR2_EL2, &res0, &res1);
1773
set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1);
1774
1775
/* TCR2_EL2 */
1776
get_reg_fixed_bits(kvm, TCR2_EL2, &res0, &res1);
1777
set_sysreg_masks(kvm, TCR2_EL2, res0, res1);
1778
1779
/* SCTLR_EL1 */
1780
get_reg_fixed_bits(kvm, SCTLR_EL1, &res0, &res1);
1781
set_sysreg_masks(kvm, SCTLR_EL1, res0, res1);
1782
1783
/* SCTLR2_ELx */
1784
get_reg_fixed_bits(kvm, SCTLR2_EL1, &res0, &res1);
1785
set_sysreg_masks(kvm, SCTLR2_EL1, res0, res1);
1786
get_reg_fixed_bits(kvm, SCTLR2_EL2, &res0, &res1);
1787
set_sysreg_masks(kvm, SCTLR2_EL2, res0, res1);
1788
1789
/* MDCR_EL2 */
1790
get_reg_fixed_bits(kvm, MDCR_EL2, &res0, &res1);
1791
set_sysreg_masks(kvm, MDCR_EL2, res0, res1);
1792
1793
/* CNTHCTL_EL2 */
1794
res0 = GENMASK(63, 20);
1795
res1 = 0;
1796
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP))
1797
res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK;
1798
if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) {
1799
res0 |= CNTHCTL_ECV;
1800
if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP))
1801
res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT |
1802
CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT);
1803
}
1804
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
1805
res0 |= GENMASK(11, 8);
1806
set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1);
1807
1808
/* ICH_HCR_EL2 */
1809
res0 = ICH_HCR_EL2_RES0;
1810
res1 = ICH_HCR_EL2_RES1;
1811
if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS))
1812
res0 |= ICH_HCR_EL2_TDIR;
1813
/* No GICv4 is presented to the guest */
1814
res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount;
1815
set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1);
1816
1817
/* VNCR_EL2 */
1818
set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1);
1819
1820
out:
1821
for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
1822
__vcpu_rmw_sys_reg(vcpu, sr, |=, 0);
1823
1824
return 0;
1825
}
1826
1827
void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
1828
{
1829
if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
1830
struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
1831
1832
write_lock(&vcpu->kvm->mmu_lock);
1833
if (mmu->pending_unmap) {
1834
kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
1835
mmu->pending_unmap = false;
1836
}
1837
write_unlock(&vcpu->kvm->mmu_lock);
1838
}
1839
1840
if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu))
1841
kvm_map_l1_vncr(vcpu);
1842
1843
/* Must be last, as may switch context! */
1844
if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
1845
kvm_inject_nested_irq(vcpu);
1846
}
1847
1848
/*
1849
* One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor
1850
* can write to HCR_EL2 behind our back, potentially changing the exception
1851
* routing / masking for even the host context.
1852
*
1853
* What follows is some slop to (1) react to exception routing / masking and (2)
1854
* preserve the pending SError state across translation regimes.
1855
*/
1856
void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu)
1857
{
1858
if (!vcpu_has_nv(vcpu))
1859
return;
1860
1861
if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
1862
kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
1863
}
1864
1865
void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu)
1866
{
1867
unsigned long *hcr = vcpu_hcr(vcpu);
1868
1869
if (!vcpu_has_nv(vcpu))
1870
return;
1871
1872
/*
1873
* We previously decided that an SError was deliverable to the guest.
1874
* Reap the pending state from HCR_EL2 and...
1875
*/
1876
if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr)))
1877
vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
1878
1879
/*
1880
* Re-attempt SError injection in case the deliverability has changed,
1881
* which is necessary to faithfully emulate WFI in the case of a pending
1882
* SError being a wakeup condition.
1883
*/
1884
if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
1885
kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
1886
}
1887
1888
/*
1889
* KVM unconditionally sets most of these traps anyway but use an allowlist
1890
* to document the guest hypervisor traps that may take precedence and guard
1891
* against future changes to the non-nested trap configuration.
1892
*/
1893
#define NV_MDCR_GUEST_INCLUDE (MDCR_EL2_TDE | \
1894
MDCR_EL2_TDA | \
1895
MDCR_EL2_TDRA | \
1896
MDCR_EL2_TTRF | \
1897
MDCR_EL2_TPMS | \
1898
MDCR_EL2_TPM | \
1899
MDCR_EL2_TPMCR | \
1900
MDCR_EL2_TDCC | \
1901
MDCR_EL2_TDOSA)
1902
1903
void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu)
1904
{
1905
u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
1906
1907
if (is_nested_ctxt(vcpu))
1908
vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE);
1909
/*
1910
* In yet another example where FEAT_NV2 is fscking broken, accesses
1911
* to MDSCR_EL1 are redirected to the VNCR despite having an effect
1912
* at EL2. Use a big hammer to apply sanity.
1913
*
1914
* Unless of course we have FEAT_FGT, in which case we can precisely
1915
* trap MDSCR_EL1.
1916
*/
1917
else if (!cpus_have_final_cap(ARM64_HAS_FGT))
1918
vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
1919
}
1920
1921