GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/kvm/vgic/vgic-v3-nested.c
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/uaccess.h>

#include <kvm/arm_vgic.h>

#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>

#include "vgic.h"

#define ICH_LRN(n)	(ICH_LR0_EL2 + (n))
#define ICH_AP0RN(n)	(ICH_AP0R0_EL2 + (n))
#define ICH_AP1RN(n)	(ICH_AP1R0_EL2 + (n))

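/*
 * Snapshot of the in-memory LR state used to emulate the trap-only
 * maintenance interrupt status registers (EISR, ELRSR, MISR).
 */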
struct mi_state {
	u16	eisr;
	u16	elrsr;
	bool	pend;
};

/*
 * The shadow registers loaded to the hardware when running a L2 guest
 * with the virtual IMO/FMO bits set.
 */
struct shadow_if {
	struct vgic_v3_cpu_if	cpuif;
	unsigned long		lr_map;
};

static DEFINE_PER_CPU(struct shadow_if, shadow_if);

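/*
 * Map an L1 LR index to its slot in the compacted shadow LR array by
 * counting the occupied LRs below it in lr_map.
 */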
static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
{
	return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
}

/*
 * Nesting GICv3 support
 *
 * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
 * completely controls the interrupts injected via the list registers.
 * Consequently, most of the state that is modified by the guest (by ACK-ing
 * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
 * keep a semi-consistent view of the interrupts.
 *
 * This still applies for a NV guest, but only while "InHost" (either
 * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}).
 *
 * When running a L2 guest ("not InHost"), things are radically different,
 * as the L1 guest is in charge of provisioning the interrupts via its own
 * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
 * page. This means that the flow described above does work (there is no
 * state to rebuild in the L0 hypervisor), and that most things happen on L2
 * load/put:
 *
 * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
 *   per-CPU data structure that is used to populate the actual LRs. This is
 *   an extra copy that we could avoid, but life is short. In the process,
 *   we remap any interrupt that has the HW bit set to the mapped interrupt
 *   on the host, should the host consider it a HW one. This allows the HW
 *   deactivation to take its course, such as for the timer.
 *
 * - on L2 put: perform the inverse transformation, so that the result of L2
 *   running becomes visible to L1 in the VNCR-accessible registers.
 *
 * - there is nothing to do on L2 entry apart from enabling the vgic, as
 *   everything will have happened on load. However, this is the point where
 *   we detect an interrupt targeting L1 and prepare the grand switcheroo.
 *
 * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate
 *   the corresponding L1 interrupt. The L0 active state will be cleared by
 *   the HW if the L1 interrupt was itself backed by a HW interrupt.
 *
 * Maintenance Interrupt (MI) management:
 *
 * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
 * used as a handover point between L2 and L1.
 *
 * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending,
 *   and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
 *   run and process the MI.
 *
 * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
 *   state must be computed at each entry/exit of the guest, much like we do
 *   it for the PMU interrupt.
 *
 * - because most of the ICH_*_EL2 registers live in the VNCR page, the
 *   quality of emulation is poor: L1 can set up the vgic so that an MI would
 *   immediately fire, and not observe anything until the next exit.
 *   Similarly, a pending MI is not immediately disabled by clearing
 *   ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for
 *   example.
 *
 * System register emulation:
 *
 * We get two classes of registers:
 *
 * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
 *   them, and L0 doesn't see a thing.
 *
 * - those that always trap (ELRSR, EISR, MISR): these are status registers
 *   that are built on the fly based on the in-memory state.
 *
 * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
 * and a NV L2 would either access the VNCR page provided by L1 (memory
 * based registers), or see the access redirected to L1 (registers that
 * trap) thanks to NV being set by L1.
 */

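/*
 * The nested state is in use when the vcpu runs a L2 guest and L1 has
 * routed IRQs/FIQs to itself by setting the virtual IMO/FMO bits.
 */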
bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
	u64 xmo;

	if (is_nested_ctxt(vcpu)) {
		xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
		WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
			  "Separate virtual IRQ/FIQ settings not supported\n");

		return !!xmo;
	}

	return false;
}

static struct shadow_if *get_shadow_if(void)
{
	return this_cpu_ptr(&shadow_if);
}

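/* An invalid, non-HW LR with EOI set will raise a maintenance interrupt */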
static bool lr_triggers_eoi(u64 lr)
{
	return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
}

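/* Compute EISR, ELRSR and the pending summary from the in-memory LRs */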
static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
{
	u16 eisr = 0, elrsr = 0;
	bool pend = false;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (lr_triggers_eoi(lr))
			eisr |= BIT(i);
		if (!(lr & ICH_LR_STATE))
			elrsr |= BIT(i);
		pend |= (lr & ICH_LR_PENDING_BIT);
	}

	mi_state->eisr = eisr;
	mi_state->elrsr = elrsr;
	mi_state->pend = pend;
}

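/* Emulated, trap-only status registers, built from the in-memory LRs */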
u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.eisr;
}

u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.elrsr;
}

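/* Emulate ICH_MISR_EL2 from the in-memory ICH_HCR_EL2/ICH_VMCR_EL2/LR state */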
u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;
	u64 reg = 0, hcr, vmcr;

	hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);

	vgic_compute_mi_state(vcpu, &mi_state);

	if (mi_state.eisr)
		reg |= ICH_MISR_EL2_EOI;

	if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) {
		int used_lrs = kvm_vgic_global_state.nr_lr;

		used_lrs -= hweight16(mi_state.elrsr);
		reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
	}

	if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
		reg |= ICH_MISR_EL2_LRENP;

	if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
		reg |= ICH_MISR_EL2_NP;

	if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_EL2_VENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0E;

	if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_EL2_VENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0D;

	if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_EL2_VENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1E;

	if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_EL2_VENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1D;

	return reg;
}

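/*
 * Replace the virtual pINTID of a HW-bit LR with the host interrupt it is
 * mapped to, or clear the HW bit if no valid mapping exists.
 */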
static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
{
	struct vgic_irq *irq;

	if (!(lr & ICH_LR_HW))
		return lr;

	/* We have the HW bit set, check for validity of pINTID */
	irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	/* If there was no real mapping, nuke the HW bit */
	if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
		lr &= ~ICH_LR_HW;

	/* Translate the virtual mapping to the real one, even if invalid */
	if (irq) {
		lr &= ~ICH_LR_PHYS_ID_MASK;
		lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
		vgic_put_irq(vcpu->kvm, irq);
	}

	return lr;
}

/*
 * For LRs which have the HW bit set, such as timer interrupts, we modify
 * them to have the host hardware interrupt number instead of the virtual
 * one programmed by the guest hypervisor.
 */
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
				     struct vgic_v3_cpu_if *s_cpu_if)
{
	struct shadow_if *shadow_if;

	shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
	shadow_if->lr_map = 0;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (!(lr & ICH_LR_STATE))
			continue;

		lr = translate_lr_pintid(vcpu, lr);

		s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
		shadow_if->lr_map |= BIT(i);
	}

	s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}

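/* On L2 entry, install L1's view of ICH_HCR_EL2 plus the L0 trap bits */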
void vgic_v3_flush_nested(struct kvm_vcpu *vcpu)
{
	u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);

	write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2);
}

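/*
 * On L2 exit, fold the hardware LR state back into L1's view, emulate the
 * HW-bit deactivation, and recompute the maintenance interrupt.
 */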
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	int i;

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		u64 val, host_lr, lr;

		host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));

		/* Propagate the new LR state */
		lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
		val = lr & ~ICH_LR_STATE;
		val |= host_lr & ICH_LR_STATE;
		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);

		/*
		 * Deactivation of a HW interrupt: the LR must have the HW
		 * bit set, have been in a non-invalid state before the run,
		 * and now be in an invalid state. If any of that doesn't
		 * hold, we're done with this LR.
		 */
		if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) &&
		      !(host_lr & ICH_LR_STATE)))
			continue;

		/*
		 * If we had a HW lr programmed by the guest hypervisor, we
		 * need to emulate the HW effect between the guest hypervisor
		 * and the nested guest.
		 */
		vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	}

	/* We need these to be synchronised to generate the MI */
	__vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2));
	__vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount);
	__vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount);

	write_sysreg_s(0, SYS_ICH_HCR_EL2);
	isb();

	vgic_v3_nested_update_mi(vcpu);
}

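/*
 * Build the shadow CPU interface from L1's in-memory ICH_*_EL2 registers,
 * reusing the host's SRE setting.
 */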
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
					struct vgic_v3_cpu_if *s_cpu_if)
{
	struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
	int i;

	s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
	s_cpu_if->vgic_sre = host_if->vgic_sre;

	for (i = 0; i < 4; i++) {
		s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
		s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
	}

	vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
}

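/* On L2 load, push the shadow state (VMCR, APRs, traps, LRs) to the HW */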
void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;

	BUG_ON(!vgic_state_is_nested(vcpu));

	vgic_v3_create_shadow_state(vcpu, cpu_if);

	__vgic_v3_restore_vmcr_aprs(cpu_if);
	__vgic_v3_activate_traps(cpu_if);

	for (int i = 0; i < cpu_if->used_lrs; i++)
		__gic_v3_set_lr(cpu_if->vgic_lr[i], i);

	/*
	 * Propagate the number of used LRs for the benefit of the HYP
	 * GICv3 emulation code. Yes, this is a pretty sorry hack.
	 */
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
}

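/*
 * On L2 put, save the APRs back into L1's view, clear the shadow LRs in
 * the HW, and restore the host's trap configuration.
 */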
void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
	int i;

	__vgic_v3_save_aprs(s_cpu_if);

	for (i = 0; i < 4; i++) {
		__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
		__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
	}

	for (i = 0; i < s_cpu_if->used_lrs; i++)
		__gic_v3_set_lr(0, i);

	__vgic_v3_deactivate_traps(s_cpu_if);

	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}

/*
 * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
 * then we need to forward this to L1 so that it can re-sync the appropriate
 * LRs and sample level triggered interrupts again.
 */
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
{
	bool state = read_sysreg_s(SYS_ICH_MISR_EL2);

	/* This will force a switch back to L1 if the level is high */
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, state, vcpu);

	sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
}

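/* Reflect the emulated MISR state as the L1 maintenance interrupt level */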
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
	bool level;

	level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu);
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}