CoCalc -- vgic-v3-nested.c

GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/kvm/vgic/vgic-v3-nested.c
²⁶⁵³⁰ views
1
// SPDX-License-Identifier: GPL-2.0-only
2

3
#include <linux/cpu.h>
4
#include <linux/kvm.h>
5
#include <linux/kvm_host.h>
6
#include <linux/interrupt.h>
7
#include <linux/io.h>
8
#include <linux/uaccess.h>
9

10
#include <kvm/arm_vgic.h>
11

12
#include <asm/kvm_arm.h>
13
#include <asm/kvm_emulate.h>
14
#include <asm/kvm_nested.h>
15

16
#include "vgic.h"
17

18
/*
 * Select the idx-th list register / active priority register by
 * offsetting from the identifier of the first register of each group.
 */
#define ICH_LRN(idx)	(ICH_LR0_EL2 + (idx))
#define ICH_AP0RN(idx)	(ICH_AP0R0_EL2 + (idx))
#define ICH_AP1RN(idx)	(ICH_AP1R0_EL2 + (idx))
21

22
struct mi_state {
23
	u16	eisr;
24
	u16	elrsr;
25
	bool	pend;
26
};
27

28
/*
29
 * The shadow registers loaded to the hardware when running a L2 guest
30
 * with the virtual IMO/FMO bits set.
31
 */
32
struct shadow_if {
33
	struct vgic_v3_cpu_if	cpuif;
34
	unsigned long		lr_map;
35
};
36

37
/* Per-CPU shadow interface instance, looked up through get_shadow_if(). */
static DEFINE_PER_CPU(struct shadow_if, shadow_if);
38

39
static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
40
{
41
	return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
42
}
43

44
/*
45
 * Nesting GICv3 support
46
 *
47
 * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
48
 * completely controls the interrupts injected via the list registers.
49
 * Consequently, most of the state that is modified by the guest (by ACK-ing
50
 * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
51
 * keep a semi-consistent view of the interrupts.
52
 *
53
 * This still applies for a NV guest, but only while "InHost" (either
54
 * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}).
55
 *
56
 * When running a L2 guest ("not InHost"), things are radically different,
57
 * as the L1 guest is in charge of provisioning the interrupts via its own
58
 * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
59
 * page.  This means that the flow described above does not apply (there is no
60
 * state to rebuild in the L0 hypervisor), and that most things happen on L2
61
 * load/put:
62
 *
63
 * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
64
 *   per-CPU data structure that is used to populate the actual LRs. This is
65
 *   an extra copy that we could avoid, but life is short. In the process,
66
 *   we remap any interrupt that has the HW bit set to the mapped interrupt
67
 *   on the host, should the host consider it a HW one. This allows the HW
68
 *   deactivation to take its course, such as for the timer.
69
 *
70
 * - on L2 put: perform the inverse transformation, so that the result of L2
71
 *   running becomes visible to L1 in the VNCR-accessible registers.
72
 *
73
 * - there is nothing to do on L2 entry, as everything will have happened
74
 *   on load. However, this is the point where we detect an interrupt
75
 *   targeting L1 and prepare the grand switcheroo.
76
 *
77
 * - on L2 exit: emulate the HW bit, and deactivate the corresponding L1
78
 *   interrupt. The L0 active state will be cleared by the HW if the L1
79
 *   interrupt was itself backed by a HW interrupt.
80
 *
81
 * Maintenance Interrupt (MI) management:
82
 *
83
 * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
84
 * used as a handover point between L2 and L1.
85
 *
86
 * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending,
87
 *   and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
88
 *   run and process the MI.
89
 *
90
 * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
91
 *   state must be computed at each entry/exit of the guest, much like we do
92
 *   it for the PMU interrupt.
93
 *
94
 * - because most of the ICH_*_EL2 registers live in the VNCR page, the
95
 *   quality of emulation is poor: L1 can setup the vgic so that an MI would
96
 *   immediately fire, and not observe anything until the next exit. Trying
97
 *   to read ICH_MISR_EL2 would do the trick, for example.
98
 *
99
 * System register emulation:
100
 *
101
 * We get two classes of registers:
102
 *
103
 * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
104
 *   them, and L0 doesn't see a thing.
105
 *
106
 * - those that always trap (ELRSR, EISR, MISR): these are status registers
107
 *   that are built on the fly based on the in-memory state.
108
 *
109
 * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
110
 * and a NV L2 would either access the VNCR page provided by L1 (memory
111
 * based registers), or see the access redirected to L1 (registers that
112
 * trap) thanks to NV being set by L1.
113
 */
114

115
bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
116
{
117
	u64 xmo;
118

119
	if (is_nested_ctxt(vcpu)) {
120
		xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
121
		WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
122
			  "Separate virtual IRQ/FIQ settings not supported\n");
123

124
		return !!xmo;
125
	}
126

127
	return false;
128
}
129

130
static struct shadow_if *get_shadow_if(void)
131
{
132
	return this_cpu_ptr(&shadow_if);
133
}
134

135
/*
 * An LR contributes to EISR when it has the EOI bit set while having
 * neither a state bit nor the HW bit set.
 */
static bool lr_triggers_eoi(u64 lr)
{
	/* Any state bit or the HW bit rules the LR out */
	if (lr & (ICH_LR_STATE | ICH_LR_HW))
		return false;

	return !!(lr & ICH_LR_EOI);
}
139

140
static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
141
{
142
	u16 eisr = 0, elrsr = 0;
143
	bool pend = false;
144

145
	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
146
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
147

148
		if (lr_triggers_eoi(lr))
149
			eisr |= BIT(i);
150
		if (!(lr & ICH_LR_STATE))
151
			elrsr |= BIT(i);
152
		pend |= (lr & ICH_LR_PENDING_BIT);
153
	}
154

155
	mi_state->eisr	= eisr;
156
	mi_state->elrsr	= elrsr;
157
	mi_state->pend	= pend;
158
}
159

160
u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
161
{
162
	struct mi_state mi_state;
163

164
	vgic_compute_mi_state(vcpu, &mi_state);
165
	return mi_state.eisr;
166
}
167

168
u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
169
{
170
	struct mi_state mi_state;
171

172
	vgic_compute_mi_state(vcpu, &mi_state);
173
	return mi_state.elrsr;
174
}
175

176
u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
177
{
178
	struct mi_state mi_state;
179
	u64 reg = 0, hcr, vmcr;
180

181
	hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
182
	vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
183

184
	vgic_compute_mi_state(vcpu, &mi_state);
185

186
	if (mi_state.eisr)
187
		reg |= ICH_MISR_EL2_EOI;
188

189
	if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) {
190
		int used_lrs = kvm_vgic_global_state.nr_lr;
191

192
		used_lrs -= hweight16(mi_state.elrsr);
193
		reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
194
	}
195

196
	if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
197
		reg |= ICH_MISR_EL2_LRENP;
198

199
	if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
200
		reg |= ICH_MISR_EL2_NP;
201

202
	if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
203
		reg |= ICH_MISR_EL2_VGrp0E;
204

205
	if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
206
		reg |= ICH_MISR_EL2_VGrp0D;
207

208
	if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
209
		reg |= ICH_MISR_EL2_VGrp1E;
210

211
	if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
212
		reg |= ICH_MISR_EL2_VGrp1D;
213

214
	return reg;
215
}
216

217
/*
 * Rewrite the pINTID field of a L1-provided LR value so that it carries
 * the host interrupt number.
 *
 * LRs without the HW bit are returned untouched. Otherwise the pINTID is
 * looked up as a vcpu interrupt:
 * - no such interrupt: only the HW bit is cleared;
 * - interrupt found: pINTID is replaced by irq->hwintid, and the HW bit
 *   is additionally cleared if the interrupt is not HW-backed or its
 *   intid is above VGIC_MAX_SPI.
 *
 * Returns the translated LR value.
 */
static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
{
	struct vgic_irq *irq;

	if (!(lr & ICH_LR_HW))
		return lr;

	/* The HW bit is set: check the validity of the pINTID */
	irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	if (!irq) {
		/* Nothing to translate to, simply drop the HW bit */
		return lr & ~ICH_LR_HW;
	}

	/* No real mapping behind this interrupt: nuke the HW bit */
	if (!irq->hw || irq->intid > VGIC_MAX_SPI)
		lr &= ~ICH_LR_HW;

	/* Swap the virtual mapping for the real one, even if invalid */
	lr &= ~ICH_LR_PHYS_ID_MASK;
	lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);

	/* Drop the reference taken by vgic_get_vcpu_irq() */
	vgic_put_irq(vcpu->kvm, irq);

	return lr;
}
239

240
/*
241
 * For LRs which have HW bit set such as timer interrupts, we modify them to
242
 * have the host hardware interrupt number instead of the virtual one programmed
243
 * by the guest hypervisor.
244
 */
245
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
246
				     struct vgic_v3_cpu_if *s_cpu_if)
247
{
248
	struct shadow_if *shadow_if;
249

250
	shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
251
	shadow_if->lr_map = 0;
252

253
	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
254
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
255

256
		if (!(lr & ICH_LR_STATE))
257
			continue;
258

259
		lr = translate_lr_pintid(vcpu, lr);
260

261
		s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
262
		shadow_if->lr_map |= BIT(i);
263
	}
264

265
	s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
266
}
267

268
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
269
{
270
	struct shadow_if *shadow_if = get_shadow_if();
271
	int i;
272

273
	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
274
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
275
		struct vgic_irq *irq;
276

277
		if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
278
			continue;
279

280
		/*
281
		 * If we had a HW lr programmed by the guest hypervisor, we
282
		 * need to emulate the HW effect between the guest hypervisor
283
		 * and the nested guest.
284
		 */
285
		irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
286
		if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
287
			continue;
288

289
		lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
290
		if (!(lr & ICH_LR_STATE))
291
			irq->active = false;
292

293
		vgic_put_irq(vcpu->kvm, irq);
294
	}
295
}
296

297
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
298
					struct vgic_v3_cpu_if *s_cpu_if)
299
{
300
	struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
301
	u64 val = 0;
302
	int i;
303

304
	/*
305
	 * If we're on a system with a broken vgic that requires
306
	 * trapping, propagate the trapping requirements.
307
	 *
308
	 * Ah, the smell of rotten fruits...
309
	 */
310
	if (static_branch_unlikely(&vgic_v3_cpuif_trap))
311
		val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
312
					   ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);
313
	s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val;
314
	s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
315
	s_cpu_if->vgic_sre = host_if->vgic_sre;
316

317
	for (i = 0; i < 4; i++) {
318
		s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
319
		s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
320
	}
321

322
	vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
323
}
324

325
void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
326
{
327
	struct shadow_if *shadow_if = get_shadow_if();
328
	struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;
329

330
	BUG_ON(!vgic_state_is_nested(vcpu));
331

332
	vgic_v3_create_shadow_state(vcpu, cpu_if);
333

334
	__vgic_v3_restore_vmcr_aprs(cpu_if);
335
	__vgic_v3_activate_traps(cpu_if);
336

337
	__vgic_v3_restore_state(cpu_if);
338

339
	/*
340
	 * Propagate the number of used LRs for the benefit of the HYP
341
	 * GICv3 emulation code. Yes, this is a pretty sorry hack.
342
	 */
343
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
344
}
345

346
void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
347
{
348
	struct shadow_if *shadow_if = get_shadow_if();
349
	struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
350
	u64 val;
351
	int i;
352

353
	__vgic_v3_save_vmcr_aprs(s_cpu_if);
354
	__vgic_v3_deactivate_traps(s_cpu_if);
355
	__vgic_v3_save_state(s_cpu_if);
356

357
	/*
358
	 * Translate the shadow state HW fields back to the virtual ones
359
	 * before copying the shadow struct back to the nested one.
360
	 */
361
	val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
362
	val &= ~ICH_HCR_EL2_EOIcount_MASK;
363
	val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
364
	__vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val);
365
	__vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr);
366

367
	for (i = 0; i < 4; i++) {
368
		__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
369
		__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
370
	}
371

372
	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
373
		val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
374

375
		val &= ~ICH_LR_STATE;
376
		val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;
377

378
		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
379
	}
380

381
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
382
}
383

384
/*
385
 * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
386
 * then we need to forward this to L1 so that it can re-sync the appropriate
387
 * LRs and sample level triggered interrupts again.
388
 */
389
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
390
{
391
	bool state = read_sysreg_s(SYS_ICH_MISR_EL2);
392

393
	/* This will force a switch back to L1 if the level is high */
394
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
395
			    vcpu->kvm->arch.vgic.mi_intid, state, vcpu);
396

397
	sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
398
}
399

400
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
401
{
402
	bool level;
403

404
	level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu);
405
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
406
			    vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
407
}
408

409
Product

Resources

Company