GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/lapic.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
3
/*
4
* Local APIC virtualization
5
*
6
* Copyright (C) 2006 Qumranet, Inc.
7
* Copyright (C) 2007 Novell
8
* Copyright (C) 2007 Intel
9
* Copyright 2009 Red Hat, Inc. and/or its affiliates.
10
*
11
* Authors:
12
* Dor Laor <[email protected]>
13
* Gregory Haskins <[email protected]>
14
* Yaozu (Eddie) Dong <[email protected]>
15
*
16
* Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
17
*/
18
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19
20
#include <linux/kvm_host.h>
21
#include <linux/kvm.h>
22
#include <linux/mm.h>
23
#include <linux/highmem.h>
24
#include <linux/smp.h>
25
#include <linux/hrtimer.h>
26
#include <linux/io.h>
27
#include <linux/export.h>
28
#include <linux/math64.h>
29
#include <linux/slab.h>
30
#include <asm/apic.h>
31
#include <asm/processor.h>
32
#include <asm/mce.h>
33
#include <asm/msr.h>
34
#include <asm/page.h>
35
#include <asm/current.h>
36
#include <asm/apicdef.h>
37
#include <asm/delay.h>
38
#include <linux/atomic.h>
39
#include <linux/jump_label.h>
40
#include "kvm_cache_regs.h"
41
#include "irq.h"
42
#include "ioapic.h"
43
#include "trace.h"
44
#include "x86.h"
45
#include "xen.h"
46
#include "cpuid.h"
47
#include "hyperv.h"
48
#include "smm.h"
49
50
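/* 32-bit kernels lack a native 64-bit modulo, so open-code it via div64_u64(). */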
#ifndef CONFIG_X86_64
51
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
52
#else
53
#define mod_64(x, y) ((x) % (y))
54
#endif
55
56
/* 14 is the version for Xeon and Pentium 8.4.8 */
57
#define APIC_VERSION 0x14UL
58
#define LAPIC_MMIO_LENGTH (1 << 12)
59
60
/*
61
* Enable local APIC timer advancement (tscdeadline mode only) with adaptive
62
* tuning. When enabled, KVM programs the host timer event to fire early, i.e.
63
* before the deadline expires, to account for the delay between taking the
64
* VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume
65
* the guest, i.e. so that the interrupt arrives in the guest with minimal
66
* latency relative to the deadline programmed by the guest.
67
*/
68
static bool lapic_timer_advance __read_mostly = true;
69
module_param(lapic_timer_advance, bool, 0444);
70
71
#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
72
#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
73
#define LAPIC_TIMER_ADVANCE_NS_INIT 1000
74
#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
75
/* step-by-step approximation to mitigate fluctuation */
76
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
77
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
78
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
79
80
static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
81
{
82
apic_set_reg(apic->regs, reg_off, val);
83
}
84
85
static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
86
{
87
return apic_get_reg64(apic->regs, reg);
88
}
89
90
static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
91
int reg, u64 val)
92
{
93
apic_set_reg64(apic->regs, reg, val);
94
}
95
96
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
97
{
98
struct kvm_lapic *apic = vcpu->arch.apic;
99
100
return apic_test_vector(vector, apic->regs + APIC_ISR) ||
101
apic_test_vector(vector, apic->regs + APIC_IRR);
102
}
103
104
__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
105
EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
106
107
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
108
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
109
110
static inline int apic_enabled(struct kvm_lapic *apic)
111
{
112
return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
113
}
114
115
#define LVT_MASK \
116
(APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
117
118
#define LINT_MASK \
119
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
120
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
121
122
static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
123
{
124
return apic->vcpu->vcpu_id;
125
}
126
127
static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
128
{
129
return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
130
(kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
131
}
132
133
bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
134
{
135
return kvm_x86_ops.set_hv_timer
136
&& !(kvm_mwait_in_guest(vcpu->kvm) ||
137
kvm_can_post_timer_interrupt(vcpu));
138
}
139
140
static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
141
{
142
return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
143
}
144
145
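/*
* The x2APIC logical ID is derived from the x2APIC ID: bits 31:16 hold the
* cluster (ID / 16) and bits 15:0 hold a one-hot position within the
* cluster (1 << (ID % 16)).
*/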
static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
146
{
147
return ((id >> 4) << 16) | (1 << (id & 0xf));
148
}
149
150
static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
151
u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
152
switch (map->logical_mode) {
153
case KVM_APIC_MODE_SW_DISABLED:
154
/* Arbitrarily use the flat map so that @cluster isn't NULL. */
155
*cluster = map->xapic_flat_map;
156
*mask = 0;
157
return true;
158
case KVM_APIC_MODE_X2APIC: {
159
u32 offset = (dest_id >> 16) * 16;
160
u32 max_apic_id = map->max_apic_id;
161
162
if (offset <= max_apic_id) {
163
u8 cluster_size = min(max_apic_id - offset + 1, 16U);
164
165
offset = array_index_nospec(offset, map->max_apic_id + 1);
166
*cluster = &map->phys_map[offset];
167
*mask = dest_id & (0xffff >> (16 - cluster_size));
168
} else {
169
*mask = 0;
170
}
171
172
return true;
173
}
174
case KVM_APIC_MODE_XAPIC_FLAT:
175
*cluster = map->xapic_flat_map;
176
*mask = dest_id & 0xff;
177
return true;
178
case KVM_APIC_MODE_XAPIC_CLUSTER:
179
*cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
180
*mask = dest_id & 0xf;
181
return true;
182
case KVM_APIC_MODE_MAP_DISABLED:
183
return false;
184
default:
185
WARN_ON_ONCE(1);
186
return false;
187
}
188
}
189
190
static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
191
struct kvm_vcpu *vcpu,
192
bool *xapic_id_mismatch)
193
{
194
struct kvm_lapic *apic = vcpu->arch.apic;
195
u32 x2apic_id = kvm_x2apic_id(apic);
196
u32 xapic_id = kvm_xapic_id(apic);
197
u32 physical_id;
198
199
/*
200
* For simplicity, KVM always allocates enough space for all possible
201
* xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on
202
* without the optimized map.
203
*/
204
if (WARN_ON_ONCE(xapic_id > new->max_apic_id))
205
return -EINVAL;
206
207
/*
208
* Bail if a vCPU was added and/or enabled its APIC between allocating
209
* the map and doing the actual calculations for the map. Note, KVM
210
* hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if
211
* the compiler decides to reload x2apic_id after this check.
212
*/
213
if (x2apic_id > new->max_apic_id)
214
return -E2BIG;
215
216
/*
217
* Deliberately truncate the vCPU ID when detecting a mismatched APIC
218
* ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
219
* 32-bit value. Any unwanted aliasing due to truncation results will
220
* be detected below.
221
*/
222
if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
223
*xapic_id_mismatch = true;
224
225
/*
226
* Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs.
227
* Allow sending events to vCPUs by their x2APIC ID even if the target
228
* vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
229
* (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
230
* and collide).
231
*
232
* Honor the architectural (and KVM's non-optimized) behavior if
233
* userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
234
* to process messages independently. If multiple vCPUs have the same
235
* effective APIC ID, e.g. due to the x2APIC wrap or because the guest
236
* manually modified its xAPIC IDs, events targeting that ID are
237
* supposed to be recognized by all vCPUs with said ID.
238
*/
239
if (vcpu->kvm->arch.x2apic_format) {
240
/* See also kvm_apic_match_physical_addr(). */
241
if (apic_x2apic_mode(apic) || x2apic_id > 0xff)
242
new->phys_map[x2apic_id] = apic;
243
244
if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
245
new->phys_map[xapic_id] = apic;
246
} else {
247
/*
248
* Disable the optimized map if the physical APIC ID is already
249
* mapped, i.e. is aliased to multiple vCPUs. The optimized
250
* map requires a strict 1:1 mapping between IDs and vCPUs.
251
*/
252
if (apic_x2apic_mode(apic))
253
physical_id = x2apic_id;
254
else
255
physical_id = xapic_id;
256
257
if (new->phys_map[physical_id])
258
return -EINVAL;
259
260
new->phys_map[physical_id] = apic;
261
}
262
263
return 0;
264
}
265
266
static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
267
struct kvm_vcpu *vcpu)
268
{
269
struct kvm_lapic *apic = vcpu->arch.apic;
270
enum kvm_apic_logical_mode logical_mode;
271
struct kvm_lapic **cluster;
272
u16 mask;
273
u32 ldr;
274
275
if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
276
return;
277
278
if (!kvm_apic_sw_enabled(apic))
279
return;
280
281
ldr = kvm_lapic_get_reg(apic, APIC_LDR);
282
if (!ldr)
283
return;
284
285
if (apic_x2apic_mode(apic)) {
286
logical_mode = KVM_APIC_MODE_X2APIC;
287
} else {
288
ldr = GET_APIC_LOGICAL_ID(ldr);
289
if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
290
logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
291
else
292
logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
293
}
294
295
/*
296
* To optimize logical mode delivery, all software-enabled APICs must
297
* be configured for the same mode.
298
*/
299
if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
300
new->logical_mode = logical_mode;
301
} else if (new->logical_mode != logical_mode) {
302
new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
303
return;
304
}
305
306
/*
307
* In x2APIC mode, the LDR is read-only and derived directly from the
308
* x2APIC ID, thus is guaranteed to be addressable. KVM reuses
309
* kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
310
* reversing the LDR calculation to get cluster of APICs, i.e. no
311
* additional work is required.
312
*/
313
if (apic_x2apic_mode(apic))
314
return;
315
316
if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
317
&cluster, &mask))) {
318
new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
319
return;
320
}
321
322
if (!mask)
323
return;
324
325
ldr = ffs(mask) - 1;
326
if (!is_power_of_2(mask) || cluster[ldr])
327
new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
328
else
329
cluster[ldr] = apic;
330
}
331
332
/*
333
* CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
334
*
335
* DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN happen with
336
* apic_map_lock_held.
337
*/
338
enum {
339
CLEAN,
340
UPDATE_IN_PROGRESS,
341
DIRTY
342
};
343
344
static void kvm_recalculate_apic_map(struct kvm *kvm)
345
{
346
struct kvm_apic_map *new, *old = NULL;
347
struct kvm_vcpu *vcpu;
348
unsigned long i;
349
u32 max_id = 255; /* enough space for any xAPIC ID */
350
bool xapic_id_mismatch;
351
int r;
352
353
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
354
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
355
return;
356
357
WARN_ONCE(!irqchip_in_kernel(kvm),
358
"Dirty APIC map without an in-kernel local APIC");
359
360
mutex_lock(&kvm->arch.apic_map_lock);
361
362
retry:
363
/*
364
* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
365
* or the APIC registers (if dirty). Note, on retry the map may have
366
* not yet been marked dirty by whatever task changed a vCPU's x2APIC
367
* ID, i.e. the map may still show up as in-progress. In that case
368
* this task still needs to retry and complete its calculation.
369
*/
370
if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
371
DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
372
/* Someone else has updated the map. */
373
mutex_unlock(&kvm->arch.apic_map_lock);
374
return;
375
}
376
377
/*
378
* Reset the mismatch flag between attempts so that KVM does the right
379
* thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
380
* keep max_id strictly increasing. Disallowing max_id from shrinking
381
* ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU
382
* with the highest x2APIC ID is toggling its APIC on and off.
383
*/
384
xapic_id_mismatch = false;
385
386
kvm_for_each_vcpu(i, vcpu, kvm)
387
if (kvm_apic_present(vcpu))
388
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
389
390
new = kvzalloc(sizeof(struct kvm_apic_map) +
391
sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
392
GFP_KERNEL_ACCOUNT);
393
394
if (!new)
395
goto out;
396
397
new->max_apic_id = max_id;
398
new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
399
400
kvm_for_each_vcpu(i, vcpu, kvm) {
401
if (!kvm_apic_present(vcpu))
402
continue;
403
404
r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
405
if (r) {
406
kvfree(new);
407
new = NULL;
408
if (r == -E2BIG) {
409
cond_resched();
410
goto retry;
411
}
412
413
goto out;
414
}
415
416
kvm_recalculate_logical_map(new, vcpu);
417
}
418
out:
419
/*
420
* The optimized map is effectively KVM's internal version of APICv,
421
* and all unwanted aliasing that results in disabling the optimized
422
* map also applies to APICv.
423
*/
424
if (!new)
425
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
426
else
427
kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
428
429
if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
430
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
431
else
432
kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
433
434
if (xapic_id_mismatch)
435
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
436
else
437
kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
438
439
old = rcu_dereference_protected(kvm->arch.apic_map,
440
lockdep_is_held(&kvm->arch.apic_map_lock));
441
rcu_assign_pointer(kvm->arch.apic_map, new);
442
/*
443
* Write kvm->arch.apic_map before clearing apic->apic_map_dirty.
444
* If another update has come in, leave it DIRTY.
445
*/
446
atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
447
UPDATE_IN_PROGRESS, CLEAN);
448
mutex_unlock(&kvm->arch.apic_map_lock);
449
450
if (old)
451
kvfree_rcu(old, rcu);
452
453
kvm_make_scan_ioapic_request(kvm);
454
}
455
456
static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
457
{
458
bool enabled = val & APIC_SPIV_APIC_ENABLED;
459
460
kvm_lapic_set_reg(apic, APIC_SPIV, val);
461
462
if (enabled != apic->sw_enabled) {
463
apic->sw_enabled = enabled;
464
if (enabled)
465
static_branch_slow_dec_deferred(&apic_sw_disabled);
466
else
467
static_branch_inc(&apic_sw_disabled.key);
468
469
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
470
}
471
472
/* Check if there are APF page ready requests pending */
473
if (enabled) {
474
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
475
kvm_xen_sw_enable_lapic(apic->vcpu);
476
}
477
}
478
479
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
480
{
481
kvm_lapic_set_reg(apic, APIC_ID, id << 24);
482
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
483
}
484
485
static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
486
{
487
kvm_lapic_set_reg(apic, APIC_LDR, id);
488
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
489
}
490
491
static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
492
{
493
kvm_lapic_set_reg(apic, APIC_DFR, val);
494
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
495
}
496
497
static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
498
{
499
u32 ldr = kvm_apic_calc_x2apic_ldr(id);
500
501
WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
502
503
kvm_lapic_set_reg(apic, APIC_ID, id);
504
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
505
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
506
}
507
508
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
509
{
510
return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
511
}
512
513
static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
514
{
515
return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
516
}
517
518
static inline int apic_lvtt_period(struct kvm_lapic *apic)
519
{
520
return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
521
}
522
523
static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
524
{
525
return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
526
}
527
528
static inline int apic_lvt_nmi_mode(u32 lvt_val)
529
{
530
return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
531
}
532
533
static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index)
534
{
535
return apic->nr_lvt_entries > lvt_index;
536
}
537
538
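/*
* The CMCI LVT is the last of KVM's LVT entries; drop it if the vCPU does
* not advertise CMCI support in IA32_MCG_CAP.
*/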
static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu)
539
{
540
return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P);
541
}
542
543
void kvm_apic_set_version(struct kvm_vcpu *vcpu)
544
{
545
struct kvm_lapic *apic = vcpu->arch.apic;
546
u32 v = 0;
547
548
if (!lapic_in_kernel(vcpu))
549
return;
550
551
v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
552
553
/*
554
* KVM emulates the 82093AA datasheet (with the in-kernel IOAPIC
555
* implementation), which doesn't have an EOI register. Some buggy OSes
556
* (e.g. Windows with the Hyper-V role) disable EOI broadcast in the LAPIC
557
* without checking the IOAPIC version first, so level-triggered interrupts
558
* never get EOIed in the IOAPIC.
559
*/
560
if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) &&
561
!ioapic_in_kernel(vcpu->kvm))
562
v |= APIC_LVR_DIRECTED_EOI;
563
kvm_lapic_set_reg(apic, APIC_LVR, v);
564
}
565
566
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
567
{
568
int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
569
struct kvm_lapic *apic = vcpu->arch.apic;
570
int i;
571
572
if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries)
573
return;
574
575
/* Initialize/mask any "new" LVT entries. */
576
for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++)
577
kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
578
579
apic->nr_lvt_entries = nr_lvt_entries;
580
581
/* The number of LVT entries is reflected in the version register. */
582
kvm_apic_set_version(vcpu);
583
}
584
585
static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
586
[LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */
587
[LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
588
[LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
589
[LVT_LINT0] = LINT_MASK,
590
[LVT_LINT1] = LINT_MASK,
591
[LVT_ERROR] = LVT_MASK,
592
[LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
593
};
594
595
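/*
* Count the vectors set in a 256-bit IRR/ISR-style bitmap by summing the
* popcount of each of its eight 32-bit registers.
*/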
static u8 count_vectors(void *bitmap)
596
{
597
int vec;
598
u32 *reg;
599
u8 count = 0;
600
601
for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
602
reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
603
count += hweight32(*reg);
604
}
605
606
return count;
607
}
608
609
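/*
* Move pending vectors harvested from the posted-interrupt request (PIR)
* descriptor into the vIRR. Returns true iff the highest vector newly set
* from the PIR is also the highest vector pending in the IRR.
*/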
bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
610
{
611
unsigned long pir_vals[NR_PIR_WORDS];
612
u32 *__pir = (void *)pir_vals;
613
u32 i, vec;
614
u32 irr_val, prev_irr_val;
615
int max_updated_irr;
616
617
max_updated_irr = -1;
618
*max_irr = -1;
619
620
if (!pi_harvest_pir(pir, pir_vals))
621
return false;
622
623
for (i = vec = 0; i <= 7; i++, vec += 32) {
624
u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
625
626
irr_val = READ_ONCE(*p_irr);
627
628
if (__pir[i]) {
629
prev_irr_val = irr_val;
630
do {
631
irr_val = prev_irr_val | __pir[i];
632
} while (prev_irr_val != irr_val &&
633
!try_cmpxchg(p_irr, &prev_irr_val, irr_val));
634
635
if (prev_irr_val != irr_val)
636
max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
637
}
638
if (irr_val)
639
*max_irr = __fls(irr_val) + vec;
640
}
641
642
return ((max_updated_irr != -1) &&
643
(max_updated_irr == *max_irr));
644
}
645
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
646
647
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
648
{
649
struct kvm_lapic *apic = vcpu->arch.apic;
650
bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
651
652
if (unlikely(!apic->apicv_active && irr_updated))
653
apic->irr_pending = true;
654
return irr_updated;
655
}
656
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
657
658
static inline int apic_search_irr(struct kvm_lapic *apic)
659
{
660
return apic_find_highest_vector(apic->regs + APIC_IRR);
661
}
662
663
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
664
{
665
int result;
666
667
/*
668
* Note that irr_pending is just a hint. It will always be
669
* true with virtual interrupt delivery enabled.
670
*/
671
if (!apic->irr_pending)
672
return -1;
673
674
result = apic_search_irr(apic);
675
ASSERT(result == -1 || result >= 16);
676
677
return result;
678
}
679
680
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
681
{
682
if (unlikely(apic->apicv_active)) {
683
apic_clear_vector(vec, apic->regs + APIC_IRR);
684
} else {
685
apic->irr_pending = false;
686
apic_clear_vector(vec, apic->regs + APIC_IRR);
687
if (apic_search_irr(apic) != -1)
688
apic->irr_pending = true;
689
}
690
}
691
692
void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
693
{
694
apic_clear_irr(vec, vcpu->arch.apic);
695
}
696
EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
697
698
static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic)
699
{
700
return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec);
701
}
702
703
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
704
{
705
if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
706
apic_vector_to_isr(vec, apic)))
707
return;
708
709
/*
710
* With APIC virtualization enabled, all caching is disabled
711
* because the processor can modify ISR under the hood. Instead
712
* just set SVI.
713
*/
714
if (unlikely(apic->apicv_active))
715
kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec);
716
else {
717
++apic->isr_count;
718
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
719
/*
720
* ISR (in service register) bit is set when injecting an interrupt.
721
* The highest vector is injected. Thus the latest bit set matches
722
* the highest bit in ISR.
723
*/
724
apic->highest_isr_cache = vec;
725
}
726
}
727
728
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
729
{
730
int result;
731
732
/*
733
* Note that isr_count is always 1, and highest_isr_cache
734
* is always -1, with APIC virtualization enabled.
735
*/
736
if (!apic->isr_count)
737
return -1;
738
if (likely(apic->highest_isr_cache != -1))
739
return apic->highest_isr_cache;
740
741
result = apic_find_highest_vector(apic->regs + APIC_ISR);
742
ASSERT(result == -1 || result >= 16);
743
744
return result;
745
}
746
747
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
748
{
749
if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
750
apic_vector_to_isr(vec, apic)))
751
return;
752
753
/*
754
* We do get here for APIC virtualization enabled if the guest
755
* uses the Hyper-V APIC enlightenment. In this case we may need
756
* to trigger a new interrupt delivery by writing the SVI field;
757
* on the other hand isr_count and highest_isr_cache are unused
758
* and must be left alone.
759
*/
760
if (unlikely(apic->apicv_active))
761
kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
762
else {
763
--apic->isr_count;
764
BUG_ON(apic->isr_count < 0);
765
apic->highest_isr_cache = -1;
766
}
767
}
768
769
void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
770
{
771
struct kvm_lapic *apic = vcpu->arch.apic;
772
773
if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
774
return;
775
776
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
777
}
778
EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr);
779
780
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
781
{
782
/* This may race with setting of irr in __apic_accept_irq() and
783
* the value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
784
* will cause vmexit immediately and the value will be recalculated
785
* on the next vmentry.
786
*/
787
return apic_find_highest_irr(vcpu->arch.apic);
788
}
789
EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
790
791
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
792
int vector, int level, int trig_mode,
793
struct dest_map *dest_map);
794
795
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
796
struct dest_map *dest_map)
797
{
798
struct kvm_lapic *apic = vcpu->arch.apic;
799
800
return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
801
irq->level, irq->trig_mode, dest_map);
802
}
803
804
static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
805
struct kvm_lapic_irq *irq, u32 min)
806
{
807
int i, count = 0;
808
struct kvm_vcpu *vcpu;
809
810
if (min > map->max_apic_id)
811
return 0;
812
813
min = array_index_nospec(min, map->max_apic_id + 1);
814
815
for_each_set_bit(i, ipi_bitmap,
816
min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
817
if (map->phys_map[min + i]) {
818
vcpu = map->phys_map[min + i]->vcpu;
819
count += kvm_apic_set_irq(vcpu, irq, NULL);
820
}
821
}
822
823
return count;
824
}
825
826
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
827
unsigned long ipi_bitmap_high, u32 min,
828
unsigned long icr, int op_64_bit)
829
{
830
struct kvm_apic_map *map;
831
struct kvm_lapic_irq irq = {0};
832
int cluster_size = op_64_bit ? 64 : 32;
833
int count;
834
835
if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
836
return -KVM_EINVAL;
837
838
irq.vector = icr & APIC_VECTOR_MASK;
839
irq.delivery_mode = icr & APIC_MODE_MASK;
840
irq.level = (icr & APIC_INT_ASSERT) != 0;
841
irq.trig_mode = icr & APIC_INT_LEVELTRIG;
842
843
rcu_read_lock();
844
map = rcu_dereference(kvm->arch.apic_map);
845
846
count = -EOPNOTSUPP;
847
if (likely(map)) {
848
count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
849
min += cluster_size;
850
count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
851
}
852
853
rcu_read_unlock();
854
return count;
855
}
856
857
static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
858
{
859
860
return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
861
sizeof(val));
862
}
863
864
static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
865
{
866
867
return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
868
sizeof(*val));
869
}
870
871
static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
872
{
873
return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
874
}
875
876
static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
877
{
878
if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0)
879
return;
880
881
__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
882
}
883
884
static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu)
885
{
886
u8 val;
887
888
if (pv_eoi_get_user(vcpu, &val) < 0)
889
return false;
890
891
val &= KVM_PV_EOI_ENABLED;
892
893
if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0)
894
return false;
895
896
/*
897
* Clear pending bit in any case: it will be set again on vmentry.
898
* While this might not be ideal from a performance point of view,
899
* this makes sure pv eoi is only enabled when we know it's safe.
900
*/
901
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
902
903
return val;
904
}
905
906
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
907
{
908
int highest_irr;
909
if (kvm_x86_ops.sync_pir_to_irr)
910
highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu);
911
else
912
highest_irr = apic_find_highest_irr(apic);
913
if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
914
return -1;
915
return highest_irr;
916
}
917
918
static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
919
{
920
u32 tpr, isrv, ppr, old_ppr;
921
int isr;
922
923
old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI);
924
tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI);
925
isr = apic_find_highest_isr(apic);
926
isrv = (isr != -1) ? isr : 0;
927
928
if ((tpr & 0xf0) >= (isrv & 0xf0))
929
ppr = tpr & 0xff;
930
else
931
ppr = isrv & 0xf0;
932
933
*new_ppr = ppr;
934
if (old_ppr != ppr)
935
kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
936
937
return ppr < old_ppr;
938
}
939
940
static void apic_update_ppr(struct kvm_lapic *apic)
941
{
942
u32 ppr;
943
944
if (__apic_update_ppr(apic, &ppr) &&
945
apic_has_interrupt_for_ppr(apic, ppr) != -1)
946
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
947
}
948
949
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
950
{
951
apic_update_ppr(vcpu->arch.apic);
952
}
953
EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
954
955
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
956
{
957
kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
958
apic_update_ppr(apic);
959
}
960
961
static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
962
{
963
return mda == (apic_x2apic_mode(apic) ?
964
X2APIC_BROADCAST : APIC_BROADCAST);
965
}
966
967
static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
968
{
969
if (kvm_apic_broadcast(apic, mda))
970
return true;
971
972
/*
973
* Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they
974
* were in x2APIC mode if the target APIC ID can't be encoded as an
975
* xAPIC ID. This allows unique addressing of hotplugged vCPUs (which
976
* start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC
977
* mode. Match the x2APIC ID if and only if the target APIC ID can't
978
* be encoded in xAPIC to avoid spurious matches against a vCPU that
979
* changed its (addressable) xAPIC ID (which is writable).
980
*/
981
if (apic_x2apic_mode(apic) || mda > 0xff)
982
return mda == kvm_x2apic_id(apic);
983
984
return mda == kvm_xapic_id(apic);
985
}
986
987
static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
988
{
989
u32 logical_id;
990
991
if (kvm_apic_broadcast(apic, mda))
992
return true;
993
994
logical_id = kvm_lapic_get_reg(apic, APIC_LDR);
995
996
if (apic_x2apic_mode(apic))
997
return ((logical_id >> 16) == (mda >> 16))
998
&& (logical_id & mda & 0xffff) != 0;
999
1000
logical_id = GET_APIC_LOGICAL_ID(logical_id);
1001
1002
switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
1003
case APIC_DFR_FLAT:
1004
return (logical_id & mda) != 0;
1005
case APIC_DFR_CLUSTER:
1006
return ((logical_id >> 4) == (mda >> 4))
1007
&& (logical_id & mda & 0xf) != 0;
1008
default:
1009
return false;
1010
}
1011
}
1012
1013
/* The KVM local APIC implementation has two quirks:
1014
*
1015
* - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
1016
* in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
1017
* KVM doesn't do that aliasing.
1018
*
1019
* - in-kernel IOAPIC messages have to be delivered directly to
1020
* x2APIC, because the kernel does not support interrupt remapping.
1021
* In order to support broadcast without interrupt remapping, x2APIC
1022
* rewrites the destination of non-IPI messages from APIC_BROADCAST
1023
* to X2APIC_BROADCAST.
1024
*
1025
* The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is
1026
* important when userspace wants to use x2APIC-format MSIs, because
1027
* APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
1028
*/
1029
static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
1030
struct kvm_lapic *source, struct kvm_lapic *target)
1031
{
1032
bool ipi = source != NULL;
1033
1034
if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
1035
!ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
1036
return X2APIC_BROADCAST;
1037
1038
return dest_id;
1039
}
1040
1041
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
1042
int shorthand, unsigned int dest, int dest_mode)
1043
{
1044
struct kvm_lapic *target = vcpu->arch.apic;
1045
u32 mda = kvm_apic_mda(vcpu, dest, source, target);
1046
1047
ASSERT(target);
1048
switch (shorthand) {
1049
case APIC_DEST_NOSHORT:
1050
if (dest_mode == APIC_DEST_PHYSICAL)
1051
return kvm_apic_match_physical_addr(target, mda);
1052
else
1053
return kvm_apic_match_logical_addr(target, mda);
1054
case APIC_DEST_SELF:
1055
return target == source;
1056
case APIC_DEST_ALLINC:
1057
return true;
1058
case APIC_DEST_ALLBUT:
1059
return target != source;
1060
default:
1061
return false;
1062
}
1063
}
1064
EXPORT_SYMBOL_GPL(kvm_apic_match_dest);
1065
1066
int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
1067
const unsigned long *bitmap, u32 bitmap_size)
1068
{
1069
u32 mod;
1070
int i, idx = -1;
1071
1072
mod = vector % dest_vcpus;
1073
1074
for (i = 0; i <= mod; i++) {
1075
idx = find_next_bit(bitmap, bitmap_size, idx + 1);
1076
BUG_ON(idx == bitmap_size);
1077
}
1078
1079
return idx;
1080
}
1081
1082
static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
1083
{
1084
if (!kvm->arch.disabled_lapic_found) {
1085
kvm->arch.disabled_lapic_found = true;
1086
pr_info("Disabled LAPIC found during irq injection\n");
1087
}
1088
}
1089
1090
static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
1091
struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
1092
{
1093
if (kvm->arch.x2apic_broadcast_quirk_disabled) {
1094
if ((irq->dest_id == APIC_BROADCAST &&
1095
map->logical_mode != KVM_APIC_MODE_X2APIC))
1096
return true;
1097
if (irq->dest_id == X2APIC_BROADCAST)
1098
return true;
1099
} else {
1100
bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
1101
if (irq->dest_id == (x2apic_ipi ?
1102
X2APIC_BROADCAST : APIC_BROADCAST))
1103
return true;
1104
}
1105
1106
return false;
1107
}
1108
1109
/* Return true if the interrupt can be handled by using *bitmap as index mask
1110
* for valid destinations in *dst array.
1111
* Return false if kvm_apic_map_get_dest_lapic did nothing useful.
1112
* Note: we may have zero kvm_lapic destinations when we return true, which
1113
* means that the interrupt should be dropped. In this case, *bitmap would be
1114
* zero and *dst undefined.
1115
*/
1116
static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
1117
struct kvm_lapic **src, struct kvm_lapic_irq *irq,
1118
struct kvm_apic_map *map, struct kvm_lapic ***dst,
1119
unsigned long *bitmap)
1120
{
1121
int i, lowest;
1122
1123
if (irq->shorthand == APIC_DEST_SELF && src) {
1124
*dst = src;
1125
*bitmap = 1;
1126
return true;
1127
} else if (irq->shorthand)
1128
return false;
1129
1130
if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
1131
return false;
1132
1133
if (irq->dest_mode == APIC_DEST_PHYSICAL) {
1134
if (irq->dest_id > map->max_apic_id) {
1135
*bitmap = 0;
1136
} else {
1137
u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1);
1138
*dst = &map->phys_map[dest_id];
1139
*bitmap = 1;
1140
}
1141
return true;
1142
}
1143
1144
*bitmap = 0;
1145
if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
1146
(u16 *)bitmap))
1147
return false;
1148
1149
if (!kvm_lowest_prio_delivery(irq))
1150
return true;
1151
1152
if (!kvm_vector_hashing_enabled()) {
1153
lowest = -1;
1154
for_each_set_bit(i, bitmap, 16) {
1155
if (!(*dst)[i])
1156
continue;
1157
if (lowest < 0)
1158
lowest = i;
1159
else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
1160
(*dst)[lowest]->vcpu) < 0)
1161
lowest = i;
1162
}
1163
} else {
1164
if (!*bitmap)
1165
return true;
1166
1167
lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
1168
bitmap, 16);
1169
1170
if (!(*dst)[lowest]) {
1171
kvm_apic_disabled_lapic_found(kvm);
1172
*bitmap = 0;
1173
return true;
1174
}
1175
}
1176
1177
*bitmap = (lowest >= 0) ? 1 << lowest : 0;
1178
1179
return true;
1180
}
1181
1182
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
1183
struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
1184
{
1185
struct kvm_apic_map *map;
1186
unsigned long bitmap;
1187
struct kvm_lapic **dst = NULL;
1188
int i;
1189
bool ret;
1190
1191
*r = -1;
1192
1193
if (irq->shorthand == APIC_DEST_SELF) {
1194
if (KVM_BUG_ON(!src, kvm)) {
1195
*r = 0;
1196
return true;
1197
}
1198
*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
1199
return true;
1200
}
1201
1202
rcu_read_lock();
1203
map = rcu_dereference(kvm->arch.apic_map);
1204
1205
ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
1206
if (ret) {
1207
*r = 0;
1208
for_each_set_bit(i, &bitmap, 16) {
1209
if (!dst[i])
1210
continue;
1211
*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
1212
}
1213
}
1214
1215
rcu_read_unlock();
1216
return ret;
1217
}
1218
1219
/*
1220
* This routine tries to handle interrupts in posted mode, here is how
1221
* it deals with different cases:
1222
* - For single-destination interrupts, handle it in posted mode
1223
* - Else if vector hashing is enabled and it is a lowest-priority
1224
* interrupt, handle it in posted mode and use the following mechanism
1225
* to find the destination vCPU.
1226
* 1. For lowest-priority interrupts, store all the possible
1227
* destination vCPUs in an array.
1228
* 2. Use "guest vector % max number of destination vCPUs" to find
1229
* the right destination vCPU in the array for the lowest-priority
1230
* interrupt.
1231
* - Otherwise, use remapped mode to inject the interrupt.
1232
*/
1233
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
1234
struct kvm_vcpu **dest_vcpu)
1235
{
1236
struct kvm_apic_map *map;
1237
unsigned long bitmap;
1238
struct kvm_lapic **dst = NULL;
1239
bool ret = false;
1240
1241
if (irq->shorthand)
1242
return false;
1243
1244
rcu_read_lock();
1245
map = rcu_dereference(kvm->arch.apic_map);
1246
1247
if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
1248
hweight16(bitmap) == 1) {
1249
unsigned long i = find_first_bit(&bitmap, 16);
1250
1251
if (dst[i]) {
1252
*dest_vcpu = dst[i]->vcpu;
1253
ret = true;
1254
}
1255
}
1256
1257
rcu_read_unlock();
1258
return ret;
1259
}
1260
1261
/*
1262
* Add a pending IRQ into lapic.
1263
* Return 1 if successfully added and 0 if discarded.
1264
*/
1265
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
1266
int vector, int level, int trig_mode,
1267
struct dest_map *dest_map)
1268
{
1269
int result = 0;
1270
struct kvm_vcpu *vcpu = apic->vcpu;
1271
1272
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
1273
trig_mode, vector);
1274
switch (delivery_mode) {
1275
case APIC_DM_LOWEST:
1276
vcpu->arch.apic_arb_prio++;
1277
fallthrough;
1278
case APIC_DM_FIXED:
1279
if (unlikely(trig_mode && !level))
1280
break;
1281
1282
/* FIXME add logic for vcpu on reset */
1283
if (unlikely(!apic_enabled(apic)))
1284
break;
1285
1286
result = 1;
1287
1288
if (dest_map) {
1289
__set_bit(vcpu->vcpu_id, dest_map->map);
1290
dest_map->vectors[vcpu->vcpu_id] = vector;
1291
}
1292
1293
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
1294
if (trig_mode)
1295
apic_set_vector(vector, apic->regs + APIC_TMR);
1296
else
1297
apic_clear_vector(vector, apic->regs + APIC_TMR);
1298
}
1299
1300
kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
1301
trig_mode, vector);
1302
break;
1303
1304
case APIC_DM_REMRD:
1305
result = 1;
1306
vcpu->arch.pv.pv_unhalted = 1;
1307
kvm_make_request(KVM_REQ_EVENT, vcpu);
1308
kvm_vcpu_kick(vcpu);
1309
break;
1310
1311
case APIC_DM_SMI:
1312
if (!kvm_inject_smi(vcpu)) {
1313
kvm_vcpu_kick(vcpu);
1314
result = 1;
1315
}
1316
break;
1317
1318
case APIC_DM_NMI:
1319
result = 1;
1320
kvm_inject_nmi(vcpu);
1321
kvm_vcpu_kick(vcpu);
1322
break;
1323
1324
case APIC_DM_INIT:
1325
if (!trig_mode || level) {
1326
result = 1;
1327
/* assumes that there are only KVM_APIC_INIT/SIPI */
1328
apic->pending_events = (1UL << KVM_APIC_INIT);
1329
kvm_make_request(KVM_REQ_EVENT, vcpu);
1330
kvm_vcpu_kick(vcpu);
1331
}
1332
break;
1333
1334
case APIC_DM_STARTUP:
1335
result = 1;
1336
apic->sipi_vector = vector;
1337
/* make sure sipi_vector is visible for the receiver */
1338
smp_wmb();
1339
set_bit(KVM_APIC_SIPI, &apic->pending_events);
1340
kvm_make_request(KVM_REQ_EVENT, vcpu);
1341
kvm_vcpu_kick(vcpu);
1342
break;
1343
1344
case APIC_DM_EXTINT:
1345
/*
1346
* Should only be called by kvm_apic_local_deliver() with LVT0,
1347
* before NMI watchdog was enabled. Already handled by
1348
* kvm_apic_accept_pic_intr().
1349
*/
1350
break;
1351
1352
default:
1353
printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
1354
delivery_mode);
1355
break;
1356
}
1357
return result;
1358
}
1359
1360
/*
1361
* This routine identifies the mask of destination vCPUs meant to receive an
1362
* IOAPIC interrupt. It either uses kvm_apic_map_get_dest_lapic() to find
1363
* the destination vCPU array and set the bitmap, or it traverses all
1364
* available vCPUs and matches each one against the destination.
1365
*/
1366
void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
1367
unsigned long *vcpu_bitmap)
1368
{
1369
struct kvm_lapic **dest_vcpu = NULL;
1370
struct kvm_lapic *src = NULL;
1371
struct kvm_apic_map *map;
1372
struct kvm_vcpu *vcpu;
1373
unsigned long bitmap, i;
1374
int vcpu_idx;
1375
bool ret;
1376
1377
rcu_read_lock();
1378
map = rcu_dereference(kvm->arch.apic_map);
1379
1380
ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
1381
&bitmap);
1382
if (ret) {
1383
for_each_set_bit(i, &bitmap, 16) {
1384
if (!dest_vcpu[i])
1385
continue;
1386
vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx;
1387
__set_bit(vcpu_idx, vcpu_bitmap);
1388
}
1389
} else {
1390
kvm_for_each_vcpu(i, vcpu, kvm) {
1391
if (!kvm_apic_present(vcpu))
1392
continue;
1393
if (!kvm_apic_match_dest(vcpu, NULL,
1394
irq->shorthand,
1395
irq->dest_id,
1396
irq->dest_mode))
1397
continue;
1398
__set_bit(i, vcpu_bitmap);
1399
}
1400
}
1401
rcu_read_unlock();
1402
}
1403
1404
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
1405
{
1406
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
1407
}
1408
1409
static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
1410
{
1411
return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
1412
}
1413
1414
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
1415
{
1416
int __maybe_unused trigger_mode;
1417
1418
/* Forward the EOI to the ioapic only if the ioapic handles the vector. */
1419
if (!kvm_ioapic_handles_vector(apic, vector))
1420
return;
1421
1422
/*
1423
* If the intercepted EOI is for an IRQ that was pending from previous
1424
* routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely
1425
* no longer need to be intercepted.
1426
*/
1427
if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector)
1428
kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu);
1429
1430
/* Request a KVM exit to inform the userspace IOAPIC. */
1431
if (irqchip_split(apic->vcpu->kvm)) {
1432
apic->vcpu->arch.pending_ioapic_eoi = vector;
1433
kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
1434
return;
1435
}
1436
1437
#ifdef CONFIG_KVM_IOAPIC
1438
if (apic_test_vector(vector, apic->regs + APIC_TMR))
1439
trigger_mode = IOAPIC_LEVEL_TRIG;
1440
else
1441
trigger_mode = IOAPIC_EDGE_TRIG;
1442
1443
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
1444
#endif
1445
}
1446
1447
static int apic_set_eoi(struct kvm_lapic *apic)
1448
{
1449
int vector = apic_find_highest_isr(apic);
1450
1451
trace_kvm_eoi(apic, vector);
1452
1453
/*
1454
* Not every write EOI will has corresponding ISR,
1455
* one example is when Kernel check timer on setup_IO_APIC
1456
*/
1457
if (vector == -1)
1458
return vector;
1459
1460
apic_clear_isr(vector, apic);
1461
apic_update_ppr(apic);
1462
1463
if (kvm_hv_synic_has_vector(apic->vcpu, vector))
1464
kvm_hv_synic_send_eoi(apic->vcpu, vector);
1465
1466
kvm_ioapic_send_eoi(apic, vector);
1467
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
1468
return vector;
1469
}
1470
1471
/*
1472
* this interface assumes a trap-like exit, which has already finished
1473
* desired side effect including vISR and vPPR update.
1474
*/
1475
void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
1476
{
1477
struct kvm_lapic *apic = vcpu->arch.apic;
1478
1479
trace_kvm_eoi(apic, vector);
1480
1481
kvm_ioapic_send_eoi(apic, vector);
1482
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
1483
}
1484
EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
1485
1486
void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
1487
{
1488
struct kvm_lapic_irq irq;
1489
1490
/* KVM has no delay and should always clear the BUSY/PENDING flag. */
1491
WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
1492
1493
irq.vector = icr_low & APIC_VECTOR_MASK;
1494
irq.delivery_mode = icr_low & APIC_MODE_MASK;
1495
irq.dest_mode = icr_low & APIC_DEST_MASK;
1496
irq.level = (icr_low & APIC_INT_ASSERT) != 0;
1497
irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
1498
irq.shorthand = icr_low & APIC_SHORT_MASK;
1499
irq.msi_redir_hint = false;
1500
if (apic_x2apic_mode(apic))
1501
irq.dest_id = icr_high;
1502
else
1503
irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
1504
1505
trace_kvm_apic_ipi(icr_low, irq.dest_id);
1506
1507
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
1508
}
1509
EXPORT_SYMBOL_GPL(kvm_apic_send_ipi);
1510
1511
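/*
* Derive the Timer Current Count (TMCCT) from the time remaining until the
* programmed expiration, taken modulo the period and converted back to
* timer ticks using the APIC bus cycle length and the divide count.
*/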
static u32 apic_get_tmcct(struct kvm_lapic *apic)
1512
{
1513
ktime_t remaining, now;
1514
s64 ns;
1515
1516
ASSERT(apic != NULL);
1517
1518
/* if initial count is 0, current count should also be 0 */
1519
if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 ||
1520
apic->lapic_timer.period == 0)
1521
return 0;
1522
1523
now = ktime_get();
1524
remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
1525
if (ktime_to_ns(remaining) < 0)
1526
remaining = 0;
1527
1528
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
1529
return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns *
1530
apic->divide_count));
1531
}
1532
1533
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
1534
{
1535
struct kvm_vcpu *vcpu = apic->vcpu;
1536
struct kvm_run *run = vcpu->run;
1537
1538
kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
1539
run->tpr_access.rip = kvm_rip_read(vcpu);
1540
run->tpr_access.is_write = write;
1541
}
1542
1543
static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
1544
{
1545
if (apic->vcpu->arch.tpr_access_reporting)
1546
__report_tpr_access(apic, write);
1547
}
1548
1549
static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
1550
{
1551
u32 val = 0;
1552
1553
if (offset >= LAPIC_MMIO_LENGTH)
1554
return 0;
1555
1556
switch (offset) {
1557
case APIC_ARBPRI:
1558
break;
1559
1560
case APIC_TMCCT: /* Timer CCR */
1561
if (apic_lvtt_tscdeadline(apic))
1562
return 0;
1563
1564
val = apic_get_tmcct(apic);
1565
break;
1566
case APIC_PROCPRI:
1567
apic_update_ppr(apic);
1568
val = kvm_lapic_get_reg(apic, offset);
1569
break;
1570
case APIC_TASKPRI:
1571
report_tpr_access(apic, false);
1572
fallthrough;
1573
default:
1574
val = kvm_lapic_get_reg(apic, offset);
1575
break;
1576
}
1577
1578
return val;
1579
}
1580
1581
static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
1582
{
1583
return container_of(dev, struct kvm_lapic, dev);
1584
}
1585
1586
#define APIC_REG_MASK(reg) (1ull << ((reg) >> 4))
1587
#define APIC_REGS_MASK(first, count) \
1588
(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
1589
1590
u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
1591
{
1592
/* Leave bits '0' for reserved and write-only registers. */
1593
u64 valid_reg_mask =
1594
APIC_REG_MASK(APIC_ID) |
1595
APIC_REG_MASK(APIC_LVR) |
1596
APIC_REG_MASK(APIC_TASKPRI) |
1597
APIC_REG_MASK(APIC_PROCPRI) |
1598
APIC_REG_MASK(APIC_LDR) |
1599
APIC_REG_MASK(APIC_SPIV) |
1600
APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
1601
APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
1602
APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
1603
APIC_REG_MASK(APIC_ESR) |
1604
APIC_REG_MASK(APIC_ICR) |
1605
APIC_REG_MASK(APIC_LVTT) |
1606
APIC_REG_MASK(APIC_LVTTHMR) |
1607
APIC_REG_MASK(APIC_LVTPC) |
1608
APIC_REG_MASK(APIC_LVT0) |
1609
APIC_REG_MASK(APIC_LVT1) |
1610
APIC_REG_MASK(APIC_LVTERR) |
1611
APIC_REG_MASK(APIC_TMICT) |
1612
APIC_REG_MASK(APIC_TMCCT) |
1613
APIC_REG_MASK(APIC_TDCR);
1614
1615
if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
1616
valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
1617
1618
/* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */
1619
if (!apic_x2apic_mode(apic))
1620
valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
1621
APIC_REG_MASK(APIC_DFR) |
1622
APIC_REG_MASK(APIC_ICR2);
1623
1624
return valid_reg_mask;
1625
}
1626
EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask);
1627
1628
static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
1629
void *data)
1630
{
1631
unsigned char alignment = offset & 0xf;
1632
u32 result;
1633
1634
/*
1635
* WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in
1636
* x2APIC and needs to be manually handled by the caller.
1637
*/
1638
WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
1639
1640
if (alignment + len > 4)
1641
return 1;
1642
1643
if (offset > 0x3f0 ||
1644
!(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset)))
1645
return 1;
1646
1647
result = __apic_read(apic, offset & ~0xf);
1648
1649
trace_kvm_apic_read(offset, result);
1650
1651
switch (len) {
1652
case 1:
1653
case 2:
1654
case 4:
1655
memcpy(data, (char *)&result + alignment, len);
1656
break;
1657
default:
1658
printk(KERN_ERR "Local APIC read with len = %x, "
1659
"should be 1,2, or 4 instead\n", len);
1660
break;
1661
}
1662
return 0;
1663
}
1664
1665
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
1666
{
1667
return addr >= apic->base_address &&
1668
addr < apic->base_address + LAPIC_MMIO_LENGTH;
1669
}
1670
1671
static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
1672
gpa_t address, int len, void *data)
1673
{
1674
struct kvm_lapic *apic = to_lapic(this);
1675
u32 offset = address - apic->base_address;
1676
1677
if (!apic_mmio_in_range(apic, address))
1678
return -EOPNOTSUPP;
1679
1680
if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
1681
if (!kvm_check_has_quirk(vcpu->kvm,
1682
KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
1683
return -EOPNOTSUPP;
1684
1685
memset(data, 0xff, len);
1686
return 0;
1687
}
1688
1689
kvm_lapic_reg_read(apic, offset, len, data);
1690
1691
return 0;
1692
}
1693
1694
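/*
* Decode the timer Divide Configuration Register: bits 0, 1 and 3 select a
* power-of-two divisor, with the all-set encoding (0b1011) meaning divide
* by one.
*/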
static void update_divide_count(struct kvm_lapic *apic)
1695
{
1696
u32 tmp1, tmp2, tdcr;
1697
1698
tdcr = kvm_lapic_get_reg(apic, APIC_TDCR);
1699
tmp1 = tdcr & 0xf;
1700
tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
1701
apic->divide_count = 0x1 << (tmp2 & 0x7);
1702
}
1703
1704
static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
1705
{
1706
/*
1707
* Do not allow the guest to program periodic timers with a small
1708
* interval, since the hrtimers are not throttled by the host
1709
* scheduler.
1710
*/
1711
if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
1712
s64 min_period = min_timer_period_us * 1000LL;
1713
1714
if (apic->lapic_timer.period < min_period) {
1715
pr_info_once(
1716
"vcpu %i: requested %lld ns "
1717
"lapic timer period limited to %lld ns\n",
1718
apic->vcpu->vcpu_id,
1719
apic->lapic_timer.period, min_period);
1720
apic->lapic_timer.period = min_period;
1721
}
1722
}
1723
}
1724
1725
static void cancel_hv_timer(struct kvm_lapic *apic);
1726
1727
static void cancel_apic_timer(struct kvm_lapic *apic)
1728
{
1729
hrtimer_cancel(&apic->lapic_timer.timer);
1730
preempt_disable();
1731
if (apic->lapic_timer.hv_timer_in_use)
1732
cancel_hv_timer(apic);
1733
preempt_enable();
1734
atomic_set(&apic->lapic_timer.pending, 0);
1735
}
1736
1737
static void apic_update_lvtt(struct kvm_lapic *apic)
1738
{
1739
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
1740
apic->lapic_timer.timer_mode_mask;
1741
1742
if (apic->lapic_timer.timer_mode != timer_mode) {
1743
if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
1744
APIC_LVT_TIMER_TSCDEADLINE)) {
1745
cancel_apic_timer(apic);
1746
kvm_lapic_set_reg(apic, APIC_TMICT, 0);
1747
apic->lapic_timer.period = 0;
1748
apic->lapic_timer.tscdeadline = 0;
1749
}
1750
apic->lapic_timer.timer_mode = timer_mode;
1751
limit_periodic_timer_frequency(apic);
1752
}
1753
}
1754
1755
/*
1756
* On APICv, this test will cause a busy wait
1757
* during a higher-priority task.
1758
*/
1759
1760
static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
1761
{
1762
struct kvm_lapic *apic = vcpu->arch.apic;
1763
u32 reg;
1764
1765
/*
1766
* Assume a timer IRQ was "injected" if the APIC is protected. KVM's
1767
* copy of the vIRR is bogus; it's the responsibility of the caller to
1768
* precisely check whether or not a timer IRQ is pending.
1769
*/
1770
if (apic->guest_apic_protected)
1771
return true;
1772
1773
reg = kvm_lapic_get_reg(apic, APIC_LVTT);
1774
if (kvm_apic_hw_enabled(apic)) {
1775
int vec = reg & APIC_VECTOR_MASK;
1776
void *bitmap = apic->regs + APIC_ISR;
1777
1778
if (apic->apicv_active)
1779
bitmap = apic->regs + APIC_IRR;
1780
1781
if (apic_test_vector(vec, bitmap))
1782
return true;
1783
}
1784
return false;
1785
}
1786
1787
static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
1788
{
1789
u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
1790
1791
/*
1792
* If the guest TSC is running at a different ratio than the host, then
1793
* convert the delay to nanoseconds to achieve an accurate delay. Note
1794
* that __delay() uses delay_tsc whenever the hardware has TSC, thus
1795
* always for VMX enabled hardware.
1796
*/
1797
if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) {
1798
__delay(min(guest_cycles,
1799
nsec_to_cycles(vcpu, timer_advance_ns)));
1800
} else {
1801
u64 delay_ns = guest_cycles * 1000000ULL;
1802
do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
1803
ndelay(min_t(u32, delay_ns, timer_advance_ns));
1804
}
1805
}
1806
1807
static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
1808
s64 advance_expire_delta)
1809
{
1810
struct kvm_lapic *apic = vcpu->arch.apic;
1811
u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
1812
u64 ns;
1813
1814
/* Do not adjust for tiny fluctuations or large random spikes. */
1815
if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
1816
abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
1817
return;
1818
1819
/* too early */
1820
if (advance_expire_delta < 0) {
1821
ns = -advance_expire_delta * 1000000ULL;
1822
do_div(ns, vcpu->arch.virtual_tsc_khz);
1823
timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
1824
} else {
1825
/* too late */
1826
ns = advance_expire_delta * 1000000ULL;
1827
do_div(ns, vcpu->arch.virtual_tsc_khz);
1828
timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
1829
}
1830
1831
if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX))
1832
timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
1833
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
1834
}
1835
1836
static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
1837
{
1838
struct kvm_lapic *apic = vcpu->arch.apic;
1839
u64 guest_tsc, tsc_deadline;
1840
1841
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
1842
apic->lapic_timer.expired_tscdeadline = 0;
1843
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1844
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
1845
1846
adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
1847
1848
/*
1849
* If the timer fired early, reread the TSC to account for the overhead
1850
* of the above adjustment to avoid waiting longer than is necessary.
1851
*/
1852
if (guest_tsc < tsc_deadline)
1853
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1854
1855
if (guest_tsc < tsc_deadline)
1856
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
1857
}
1858
1859
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
1860
{
1861
if (lapic_in_kernel(vcpu) &&
1862
vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
1863
vcpu->arch.apic->lapic_timer.timer_advance_ns &&
1864
lapic_timer_int_injected(vcpu))
1865
__kvm_wait_lapic_expire(vcpu);
1866
}
1867
EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
1868
1869
static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
1870
{
1871
struct kvm_timer *ktimer = &apic->lapic_timer;
1872
1873
kvm_apic_local_deliver(apic, APIC_LVTT);
1874
if (apic_lvtt_tscdeadline(apic)) {
1875
ktimer->tscdeadline = 0;
1876
} else if (apic_lvtt_oneshot(apic)) {
1877
ktimer->tscdeadline = 0;
1878
ktimer->target_expiration = 0;
1879
}
1880
}
1881
1882
static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
1883
{
1884
struct kvm_vcpu *vcpu = apic->vcpu;
1885
struct kvm_timer *ktimer = &apic->lapic_timer;
1886
1887
if (atomic_read(&apic->lapic_timer.pending))
1888
return;
1889
1890
if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
1891
ktimer->expired_tscdeadline = ktimer->tscdeadline;
1892
1893
if (!from_timer_fn && apic->apicv_active) {
1894
WARN_ON(kvm_get_running_vcpu() != vcpu);
1895
kvm_apic_inject_pending_timer_irqs(apic);
1896
return;
1897
}
1898
1899
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
1900
/*
1901
* Ensure the guest's timer has truly expired before posting an
1902
* interrupt. Open code the relevant checks to avoid querying
1903
* lapic_timer_int_injected(), which will be false since the
1904
* interrupt isn't yet injected. Waiting until after injecting
1905
* is not an option since that won't help a posted interrupt.
1906
*/
1907
if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
1908
vcpu->arch.apic->lapic_timer.timer_advance_ns)
1909
__kvm_wait_lapic_expire(vcpu);
1910
kvm_apic_inject_pending_timer_irqs(apic);
1911
return;
1912
}
1913
1914
atomic_inc(&apic->lapic_timer.pending);
1915
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1916
if (from_timer_fn)
1917
kvm_vcpu_kick(vcpu);
1918
}
1919
1920
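/*
 * Arm the hrtimer for TSC-deadline mode.  The remaining guest TSC cycles
 * are converted to nanoseconds using the vCPU's virtual TSC frequency, and
 * the timer is programmed to fire timer_advance_ns early.  If the deadline
 * has already passed, or falls within the advancement window, expire the
 * timer immediately.
 */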
static void start_sw_tscdeadline(struct kvm_lapic *apic)
1921
{
1922
struct kvm_timer *ktimer = &apic->lapic_timer;
1923
u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
1924
u64 ns = 0;
1925
ktime_t expire;
1926
struct kvm_vcpu *vcpu = apic->vcpu;
1927
u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
1928
unsigned long flags;
1929
ktime_t now;
1930
1931
if (unlikely(!tscdeadline || !this_tsc_khz))
1932
return;
1933
1934
local_irq_save(flags);
1935
1936
now = ktime_get();
1937
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
1938
1939
ns = (tscdeadline - guest_tsc) * 1000000ULL;
1940
do_div(ns, this_tsc_khz);
1941
1942
if (likely(tscdeadline > guest_tsc) &&
1943
likely(ns > apic->lapic_timer.timer_advance_ns)) {
1944
expire = ktime_add_ns(now, ns);
1945
expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
1946
hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
1947
} else
1948
apic_timer_expired(apic, false);
1949
1950
local_irq_restore(flags);
1951
}
1952
1953
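/*
 * Convert a count register value to nanoseconds: one count takes
 * apic_bus_cycle_ns * divide_count nanoseconds to elapse.
 */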
static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
1954
{
1955
return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns *
1956
(u64)apic->divide_count;
1957
}
1958
1959
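/*
 * The divide configuration changed while the timer is armed: rescale the
 * time remaining until expiration by the new divisor and adjust both the
 * hrtimer target and the TSC deadline to match.
 */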
static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
1960
{
1961
ktime_t now, remaining;
1962
u64 ns_remaining_old, ns_remaining_new;
1963
1964
apic->lapic_timer.period =
1965
tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
1966
limit_periodic_timer_frequency(apic);
1967
1968
now = ktime_get();
1969
remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
1970
if (ktime_to_ns(remaining) < 0)
1971
remaining = 0;
1972
1973
ns_remaining_old = ktime_to_ns(remaining);
1974
ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
1975
apic->divide_count, old_divisor);
1976
1977
apic->lapic_timer.tscdeadline +=
1978
nsec_to_cycles(apic->vcpu, ns_remaining_new) -
1979
nsec_to_cycles(apic->vcpu, ns_remaining_old);
1980
apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
1981
}
1982
1983
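/*
 * Compute the period and the absolute expiration, both as a ktime and as a
 * guest TSC deadline, for a one-shot or periodic timer.  When restoring
 * from a current-count register (count_reg != APIC_TMICT), start from the
 * remaining count, clamped to the initial count.  Returns false if the
 * initial count is zero, i.e. the timer isn't running.
 */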
static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
1984
{
1985
ktime_t now;
1986
u64 tscl = rdtsc();
1987
s64 deadline;
1988
1989
now = ktime_get();
1990
apic->lapic_timer.period =
1991
tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
1992
1993
if (!apic->lapic_timer.period) {
1994
apic->lapic_timer.tscdeadline = 0;
1995
return false;
1996
}
1997
1998
limit_periodic_timer_frequency(apic);
1999
deadline = apic->lapic_timer.period;
2000
2001
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
2002
if (unlikely(count_reg != APIC_TMICT)) {
2003
deadline = tmict_to_ns(apic,
2004
kvm_lapic_get_reg(apic, count_reg));
2005
if (unlikely(deadline <= 0)) {
2006
if (apic_lvtt_period(apic))
2007
deadline = apic->lapic_timer.period;
2008
else
2009
deadline = 0;
2010
}
2011
else if (unlikely(deadline > apic->lapic_timer.period)) {
2012
pr_info_ratelimited(
2013
"vcpu %i: requested lapic timer restore with "
2014
"starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
2015
"Using initial count to start timer.\n",
2016
apic->vcpu->vcpu_id,
2017
count_reg,
2018
kvm_lapic_get_reg(apic, count_reg),
2019
deadline, apic->lapic_timer.period);
2020
kvm_lapic_set_reg(apic, count_reg, 0);
2021
deadline = apic->lapic_timer.period;
2022
}
2023
}
2024
}
2025
2026
apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
2027
nsec_to_cycles(apic->vcpu, deadline);
2028
apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline);
2029
2030
return true;
2031
}
2032
2033
static void advance_periodic_target_expiration(struct kvm_lapic *apic)
2034
{
2035
ktime_t now = ktime_get();
2036
u64 tscl = rdtsc();
2037
ktime_t delta;
2038
2039
/*
2040
* Synchronize both deadlines to the same time source or
2041
* differences in the periods (caused by differences in the
2042
* underlying clocks or numerical approximation errors) will
2043
* cause the two to drift apart over time as the errors
2044
* accumulate.
2045
*/
2046
apic->lapic_timer.target_expiration =
2047
ktime_add_ns(apic->lapic_timer.target_expiration,
2048
apic->lapic_timer.period);
2049
delta = ktime_sub(apic->lapic_timer.target_expiration, now);
2050
apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
2051
nsec_to_cycles(apic->vcpu, delta);
2052
}
2053
2054
static void start_sw_period(struct kvm_lapic *apic)
2055
{
2056
if (!apic->lapic_timer.period)
2057
return;
2058
2059
if (ktime_after(ktime_get(),
2060
apic->lapic_timer.target_expiration)) {
2061
apic_timer_expired(apic, false);
2062
2063
if (apic_lvtt_oneshot(apic))
2064
return;
2065
2066
advance_periodic_target_expiration(apic);
2067
}
2068
2069
hrtimer_start(&apic->lapic_timer.timer,
2070
apic->lapic_timer.target_expiration,
2071
HRTIMER_MODE_ABS_HARD);
2072
}
2073
2074
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
2075
{
2076
if (!lapic_in_kernel(vcpu))
2077
return false;
2078
2079
return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
2080
}
2081
2082
static void cancel_hv_timer(struct kvm_lapic *apic)
2083
{
2084
WARN_ON(preemptible());
2085
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
2086
kvm_x86_call(cancel_hv_timer)(apic->vcpu);
2087
apic->lapic_timer.hv_timer_in_use = false;
2088
}
2089
2090
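/*
 * Try to emulate the APIC timer with a hardware-assisted timer, e.g. the
 * VMX preemption timer.  On success the hrtimer is cancelled; returning
 * false tells the caller to fall back to the software timer.
 */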
static bool start_hv_timer(struct kvm_lapic *apic)
2091
{
2092
struct kvm_timer *ktimer = &apic->lapic_timer;
2093
struct kvm_vcpu *vcpu = apic->vcpu;
2094
bool expired;
2095
2096
WARN_ON(preemptible());
2097
if (!kvm_can_use_hv_timer(vcpu))
2098
return false;
2099
2100
if (!ktimer->tscdeadline)
2101
return false;
2102
2103
if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
2104
return false;
2105
2106
ktimer->hv_timer_in_use = true;
2107
hrtimer_cancel(&ktimer->timer);
2108
2109
/*
2110
* To simplify handling the periodic timer, leave the hv timer running
2111
* even if the deadline timer has expired, i.e. rely on the resulting
2112
* VM-Exit to recompute the periodic timer's target expiration.
2113
*/
2114
if (!apic_lvtt_period(apic)) {
2115
/*
2116
* Cancel the hv timer if the sw timer fired while the hv timer
2117
* was being programmed, or if the hv timer itself expired.
2118
*/
2119
if (atomic_read(&ktimer->pending)) {
2120
cancel_hv_timer(apic);
2121
} else if (expired) {
2122
apic_timer_expired(apic, false);
2123
cancel_hv_timer(apic);
2124
}
2125
}
2126
2127
trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use);
2128
2129
return true;
2130
}
2131
2132
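/*
 * Emulate the APIC timer in software with an hrtimer, cancelling the
 * hardware-assisted timer if it's currently in use.
 */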
static void start_sw_timer(struct kvm_lapic *apic)
2133
{
2134
struct kvm_timer *ktimer = &apic->lapic_timer;
2135
2136
WARN_ON(preemptible());
2137
if (apic->lapic_timer.hv_timer_in_use)
2138
cancel_hv_timer(apic);
2139
if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
2140
return;
2141
2142
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
2143
start_sw_period(apic);
2144
else if (apic_lvtt_tscdeadline(apic))
2145
start_sw_tscdeadline(apic);
2146
trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
2147
}
2148
2149
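/*
 * (Re)arm the APIC timer, preferring the hardware-assisted timer and
 * falling back to the software timer, unless a non-periodic timer has
 * already expired and is pending injection.
 */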
static void restart_apic_timer(struct kvm_lapic *apic)
2150
{
2151
preempt_disable();
2152
2153
if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending))
2154
goto out;
2155
2156
if (!start_hv_timer(apic))
2157
start_sw_timer(apic);
2158
out:
2159
preempt_enable();
2160
}
2161
2162
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
2163
{
2164
struct kvm_lapic *apic = vcpu->arch.apic;
2165
2166
preempt_disable();
2167
/* If the preempt notifier has already run, it also called apic_timer_expired */
2168
if (!apic->lapic_timer.hv_timer_in_use)
2169
goto out;
2170
WARN_ON(kvm_vcpu_is_blocking(vcpu));
2171
apic_timer_expired(apic, false);
2172
cancel_hv_timer(apic);
2173
2174
if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
2175
advance_periodic_target_expiration(apic);
2176
restart_apic_timer(apic);
2177
}
2178
out:
2179
preempt_enable();
2180
}
2181
EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
2182
2183
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
2184
{
2185
restart_apic_timer(vcpu->arch.apic);
2186
}
2187
2188
void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
2189
{
2190
struct kvm_lapic *apic = vcpu->arch.apic;
2191
2192
preempt_disable();
2193
/* Possibly the TSC deadline timer is not enabled yet */
2194
if (apic->lapic_timer.hv_timer_in_use)
2195
start_sw_timer(apic);
2196
preempt_enable();
2197
}
2198
2199
void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
2200
{
2201
struct kvm_lapic *apic = vcpu->arch.apic;
2202
2203
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
2204
restart_apic_timer(apic);
2205
}
2206
2207
static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg)
2208
{
2209
atomic_set(&apic->lapic_timer.pending, 0);
2210
2211
if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
2212
&& !set_target_expiration(apic, count_reg))
2213
return;
2214
2215
restart_apic_timer(apic);
2216
}
2217
2218
static void start_apic_timer(struct kvm_lapic *apic)
2219
{
2220
__start_apic_timer(apic, APIC_TMICT);
2221
}
2222
2223
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
2224
{
2225
bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
2226
2227
if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
2228
apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
2229
if (lvt0_in_nmi_mode) {
2230
atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
2231
} else
2232
atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
2233
}
2234
}
2235
2236
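/*
 * Map an LVT register offset to its index in apic_lvt_mask[], or return -1
 * for registers that aren't LVT entries.
 */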
static int get_lvt_index(u32 reg)
2237
{
2238
if (reg == APIC_LVTCMCI)
2239
return LVT_CMCI;
2240
if (reg < APIC_LVTT || reg > APIC_LVTERR)
2241
return -1;
2242
return array_index_nospec(
2243
(reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES);
2244
}
2245
2246
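/*
 * Emulate a 32-bit write to a local APIC register, shared by the xAPIC
 * MMIO path and the x2APIC/Hyper-V MSR paths.  Returns non-zero if the
 * register is reserved or read-only in the current mode and the write
 * should be rejected.
 */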
static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
2247
{
2248
int ret = 0;
2249
2250
trace_kvm_apic_write(reg, val);
2251
2252
switch (reg) {
2253
case APIC_ID: /* Local APIC ID */
2254
if (!apic_x2apic_mode(apic)) {
2255
kvm_apic_set_xapic_id(apic, val >> 24);
2256
} else {
2257
ret = 1;
2258
}
2259
break;
2260
2261
case APIC_TASKPRI:
2262
report_tpr_access(apic, true);
2263
apic_set_tpr(apic, val & 0xff);
2264
break;
2265
2266
case APIC_EOI:
2267
apic_set_eoi(apic);
2268
break;
2269
2270
case APIC_LDR:
2271
if (!apic_x2apic_mode(apic))
2272
kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
2273
else
2274
ret = 1;
2275
break;
2276
2277
case APIC_DFR:
2278
if (!apic_x2apic_mode(apic))
2279
kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
2280
else
2281
ret = 1;
2282
break;
2283
2284
case APIC_SPIV: {
2285
u32 mask = 0x3ff;
2286
if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
2287
mask |= APIC_SPIV_DIRECTED_EOI;
2288
apic_set_spiv(apic, val & mask);
2289
if (!(val & APIC_SPIV_APIC_ENABLED)) {
2290
int i;
2291
2292
for (i = 0; i < apic->nr_lvt_entries; i++) {
2293
kvm_lapic_set_reg(apic, APIC_LVTx(i),
2294
kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
2295
}
2296
apic_update_lvtt(apic);
2297
atomic_set(&apic->lapic_timer.pending, 0);
2298
2299
}
2300
break;
2301
}
2302
case APIC_ICR:
2303
WARN_ON_ONCE(apic_x2apic_mode(apic));
2304
2305
/* No delay here, so we always clear the pending bit */
2306
val &= ~APIC_ICR_BUSY;
2307
kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
2308
kvm_lapic_set_reg(apic, APIC_ICR, val);
2309
break;
2310
case APIC_ICR2:
2311
if (apic_x2apic_mode(apic))
2312
ret = 1;
2313
else
2314
kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000);
2315
break;
2316
2317
case APIC_LVT0:
2318
apic_manage_nmi_watchdog(apic, val);
2319
fallthrough;
2320
case APIC_LVTTHMR:
2321
case APIC_LVTPC:
2322
case APIC_LVT1:
2323
case APIC_LVTERR:
2324
case APIC_LVTCMCI: {
2325
u32 index = get_lvt_index(reg);
2326
if (!kvm_lapic_lvt_supported(apic, index)) {
2327
ret = 1;
2328
break;
2329
}
2330
if (!kvm_apic_sw_enabled(apic))
2331
val |= APIC_LVT_MASKED;
2332
val &= apic_lvt_mask[index];
2333
kvm_lapic_set_reg(apic, reg, val);
2334
break;
2335
}
2336
2337
case APIC_LVTT:
2338
if (!kvm_apic_sw_enabled(apic))
2339
val |= APIC_LVT_MASKED;
2340
val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask);
2341
kvm_lapic_set_reg(apic, APIC_LVTT, val);
2342
apic_update_lvtt(apic);
2343
break;
2344
2345
case APIC_TMICT:
2346
if (apic_lvtt_tscdeadline(apic))
2347
break;
2348
2349
cancel_apic_timer(apic);
2350
kvm_lapic_set_reg(apic, APIC_TMICT, val);
2351
start_apic_timer(apic);
2352
break;
2353
2354
case APIC_TDCR: {
2355
uint32_t old_divisor = apic->divide_count;
2356
2357
kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
2358
update_divide_count(apic);
2359
if (apic->divide_count != old_divisor &&
2360
apic->lapic_timer.period) {
2361
hrtimer_cancel(&apic->lapic_timer.timer);
2362
update_target_expiration(apic, old_divisor);
2363
restart_apic_timer(apic);
2364
}
2365
break;
2366
}
2367
case APIC_ESR:
2368
if (apic_x2apic_mode(apic) && val != 0)
2369
ret = 1;
2370
break;
2371
2372
case APIC_SELF_IPI:
2373
/*
2374
* Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold
2375
* the vector, everything else is reserved.
2376
*/
2377
if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
2378
ret = 1;
2379
else
2380
kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0);
2381
break;
2382
default:
2383
ret = 1;
2384
break;
2385
}
2386
2387
/*
2388
* Recalculate APIC maps if necessary, e.g. if the software enable bit
2389
* was toggled, the APIC ID changed, etc... The maps are marked dirty
2390
* on relevant changes, i.e. this is a nop for most writes.
2391
*/
2392
kvm_recalculate_apic_map(apic->vcpu->kvm);
2393
2394
return ret;
2395
}
2396
2397
static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
2398
gpa_t address, int len, const void *data)
2399
{
2400
struct kvm_lapic *apic = to_lapic(this);
2401
unsigned int offset = address - apic->base_address;
2402
u32 val;
2403
2404
if (!apic_mmio_in_range(apic, address))
2405
return -EOPNOTSUPP;
2406
2407
if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
2408
if (!kvm_check_has_quirk(vcpu->kvm,
2409
KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
2410
return -EOPNOTSUPP;
2411
2412
return 0;
2413
}
2414
2415
/*
2416
* APIC registers must be aligned on a 128-bit boundary.
2417
* 32/64/128-bit registers must be accessed through 32-bit reads/writes.
2418
* Refer to SDM 8.4.1.
2419
*/
2420
if (len != 4 || (offset & 0xf))
2421
return 0;
2422
2423
val = *(u32*)data;
2424
2425
kvm_lapic_reg_write(apic, offset & 0xff0, val);
2426
2427
return 0;
2428
}
2429
2430
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
2431
{
2432
kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
2433
}
2434
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
2435
2436
#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
2437
2438
int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
2439
{
2440
if (data & X2APIC_ICR_RESERVED_BITS)
2441
return 1;
2442
2443
/*
2444
* The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
2445
* only AMD requires it to be zero, Intel essentially just ignores the
2446
* bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
2447
* the CPU performs the reserved bits checks, i.e. the underlying CPU
2448
* behavior will "win". Arbitrarily clear the BUSY bit, as there is no
2449
* sane way to provide consistent behavior with respect to hardware.
2450
*/
2451
data &= ~APIC_ICR_BUSY;
2452
2453
kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
2454
if (kvm_x86_ops.x2apic_icr_is_split) {
2455
kvm_lapic_set_reg(apic, APIC_ICR, data);
2456
kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
2457
} else {
2458
kvm_lapic_set_reg64(apic, APIC_ICR, data);
2459
}
2460
trace_kvm_apic_write(APIC_ICR, data);
2461
return 0;
2462
}
2463
2464
static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
2465
{
2466
if (kvm_x86_ops.x2apic_icr_is_split)
2467
return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
2468
(u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
2469
2470
return kvm_lapic_get_reg64(apic, APIC_ICR);
2471
}
2472
2473
/* emulate APIC access in a trap manner */
2474
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
2475
{
2476
struct kvm_lapic *apic = vcpu->arch.apic;
2477
2478
/*
2479
* ICR is a single 64-bit register when x2APIC is enabled, all other
2480
* registers hold 32-bit values. For legacy xAPIC, ICR writes need to
2481
* go down the common path to get the upper half from ICR2.
2482
*
2483
* Note, using the write helpers may incur an unnecessary write to the
2484
* virtual APIC state, but KVM needs to conditionally modify the value
2485
* in certain cases, e.g. to clear the ICR busy bit. The cost of extra
2486
* conditional branches is likely a wash relative to the cost of the
2487
* maybe-unnecessary write, and both are in the noise anyway.
2488
*/
2489
if (apic_x2apic_mode(apic) && offset == APIC_ICR)
2490
WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
2491
else
2492
kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
2493
}
2494
EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
2495
2496
void kvm_free_lapic(struct kvm_vcpu *vcpu)
2497
{
2498
struct kvm_lapic *apic = vcpu->arch.apic;
2499
2500
if (!vcpu->arch.apic) {
2501
static_branch_dec(&kvm_has_noapic_vcpu);
2502
return;
2503
}
2504
2505
hrtimer_cancel(&apic->lapic_timer.timer);
2506
2507
if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
2508
static_branch_slow_dec_deferred(&apic_hw_disabled);
2509
2510
if (!apic->sw_enabled)
2511
static_branch_slow_dec_deferred(&apic_sw_disabled);
2512
2513
if (apic->regs)
2514
free_page((unsigned long)apic->regs);
2515
2516
kfree(apic);
2517
}
2518
2519
/*
2520
*----------------------------------------------------------------------
2521
* LAPIC interface
2522
*----------------------------------------------------------------------
2523
*/
2524
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
2525
{
2526
struct kvm_lapic *apic = vcpu->arch.apic;
2527
2528
if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
2529
return 0;
2530
2531
return apic->lapic_timer.tscdeadline;
2532
}
2533
2534
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
2535
{
2536
struct kvm_lapic *apic = vcpu->arch.apic;
2537
2538
if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
2539
return;
2540
2541
hrtimer_cancel(&apic->lapic_timer.timer);
2542
apic->lapic_timer.tscdeadline = data;
2543
start_apic_timer(apic);
2544
}
2545
2546
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
2547
{
2548
apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
2549
}
2550
2551
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
2552
{
2553
u64 tpr;
2554
2555
tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
2556
2557
return (tpr & 0xf0) >> 4;
2558
}
2559
2560
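/*
 * Update the APIC base MSR and everything derived from it: the hardware
 * enable/disable jump labels, the xAPIC/x2APIC ID, the virtual APIC mode,
 * the MMIO base address, and the APICv inhibit applied when the base is
 * moved away from the default address.
 */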
static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value)
2561
{
2562
u64 old_value = vcpu->arch.apic_base;
2563
struct kvm_lapic *apic = vcpu->arch.apic;
2564
2565
vcpu->arch.apic_base = value;
2566
2567
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
2568
vcpu->arch.cpuid_dynamic_bits_dirty = true;
2569
2570
if (!apic)
2571
return;
2572
2573
/* update jump label if enable bit changes */
2574
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
2575
if (value & MSR_IA32_APICBASE_ENABLE) {
2576
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
2577
static_branch_slow_dec_deferred(&apic_hw_disabled);
2578
/* Check if there are APF page ready requests pending */
2579
kvm_make_request(KVM_REQ_APF_READY, vcpu);
2580
} else {
2581
static_branch_inc(&apic_hw_disabled.key);
2582
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
2583
}
2584
}
2585
2586
if ((old_value ^ value) & X2APIC_ENABLE) {
2587
if (value & X2APIC_ENABLE)
2588
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
2589
else if (value & MSR_IA32_APICBASE_ENABLE)
2590
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
2591
}
2592
2593
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
2594
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
2595
kvm_x86_call(set_virtual_apic_mode)(vcpu);
2596
}
2597
2598
apic->base_address = apic->vcpu->arch.apic_base &
2599
MSR_IA32_APICBASE_BASE;
2600
2601
if ((value & MSR_IA32_APICBASE_ENABLE) &&
2602
apic->base_address != APIC_DEFAULT_PHYS_BASE) {
2603
kvm_set_apicv_inhibit(apic->vcpu->kvm,
2604
APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
2605
}
2606
}
2607
2608
int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated)
2609
{
2610
enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
2611
enum lapic_mode new_mode = kvm_apic_mode(value);
2612
2613
if (vcpu->arch.apic_base == value)
2614
return 0;
2615
2616
u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
2617
(guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
2618
2619
if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
2620
return 1;
2621
if (!host_initiated) {
2622
if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
2623
return 1;
2624
if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
2625
return 1;
2626
}
2627
2628
__kvm_apic_set_base(vcpu, value);
2629
kvm_recalculate_apic_map(vcpu->kvm);
2630
return 0;
2631
}
2632
EXPORT_SYMBOL_GPL(kvm_apic_set_base);
2633
2634
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
2635
{
2636
struct kvm_lapic *apic = vcpu->arch.apic;
2637
2638
/*
2639
* When APICv is enabled, KVM must always search the IRR for a pending
2640
* IRQ, as other vCPUs and devices can set IRR bits even if the vCPU
2641
* isn't running. If APICv is disabled, KVM _should_ search the IRR
2642
* for a pending IRQ. But KVM currently doesn't ensure *all* hardware,
2643
* e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching
2644
* the IRR at this time could race with IRQ delivery from hardware that
2645
* still sees APICv as being enabled.
2646
*
2647
* FIXME: Ensure other vCPUs and devices observe the change in APICv
2648
* state prior to updating KVM's metadata caches, so that KVM
2649
* can safely search the IRR and set irr_pending accordingly.
2650
*/
2651
apic->irr_pending = true;
2652
2653
if (apic->apicv_active)
2654
apic->isr_count = 1;
2655
else
2656
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
2657
2658
apic->highest_isr_cache = -1;
2659
}
2660
2661
int kvm_alloc_apic_access_page(struct kvm *kvm)
2662
{
2663
void __user *hva;
2664
int ret = 0;
2665
2666
mutex_lock(&kvm->slots_lock);
2667
if (kvm->arch.apic_access_memslot_enabled ||
2668
kvm->arch.apic_access_memslot_inhibited)
2669
goto out;
2670
2671
hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
2672
APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
2673
if (IS_ERR(hva)) {
2674
ret = PTR_ERR(hva);
2675
goto out;
2676
}
2677
2678
kvm->arch.apic_access_memslot_enabled = true;
2679
out:
2680
mutex_unlock(&kvm->slots_lock);
2681
return ret;
2682
}
2683
EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page);
2684
2685
void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
2686
{
2687
struct kvm *kvm = vcpu->kvm;
2688
2689
if (!kvm->arch.apic_access_memslot_enabled)
2690
return;
2691
2692
kvm_vcpu_srcu_read_unlock(vcpu);
2693
2694
mutex_lock(&kvm->slots_lock);
2695
2696
if (kvm->arch.apic_access_memslot_enabled) {
2697
__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
2698
/*
2699
* Clear "enabled" after the memslot is deleted so that a
2700
* different vCPU doesn't get a false negative when checking
2701
* the flag outside of slots_lock. No additional memory barrier is
2702
* needed as modifying memslots requires waiting for other vCPUs to
2703
* drop SRCU (see above), and false positives are ok as the
2704
* flag is rechecked after acquiring slots_lock.
2705
*/
2706
kvm->arch.apic_access_memslot_enabled = false;
2707
2708
/*
2709
* Mark the memslot as inhibited to prevent reallocating the
2710
* memslot during vCPU creation, e.g. if a vCPU is hotplugged.
2711
*/
2712
kvm->arch.apic_access_memslot_inhibited = true;
2713
}
2714
2715
mutex_unlock(&kvm->slots_lock);
2716
2717
kvm_vcpu_srcu_read_lock(vcpu);
2718
}
2719
2720
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
2721
{
2722
struct kvm_lapic *apic = vcpu->arch.apic;
2723
u64 msr_val;
2724
int i;
2725
2726
kvm_x86_call(apicv_pre_state_restore)(vcpu);
2727
2728
if (!init_event) {
2729
msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
2730
if (kvm_vcpu_is_reset_bsp(vcpu))
2731
msr_val |= MSR_IA32_APICBASE_BSP;
2732
2733
/*
2734
* Use the inner helper to avoid an extra recalculation of the
2735
* optimized APIC map if some other task has dirtied the map.
2736
* The recalculation needed for this vCPU will be done after
2737
* all APIC state has been initialized (see below).
2738
*/
2739
__kvm_apic_set_base(vcpu, msr_val);
2740
}
2741
2742
if (!apic)
2743
return;
2744
2745
/* Stop the timer in case it's a reset to an active apic */
2746
hrtimer_cancel(&apic->lapic_timer.timer);
2747
2748
/* The xAPIC ID is set at RESET even if the APIC was already enabled. */
2749
if (!init_event)
2750
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
2751
kvm_apic_set_version(apic->vcpu);
2752
2753
for (i = 0; i < apic->nr_lvt_entries; i++)
2754
kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
2755
apic_update_lvtt(apic);
2756
if (kvm_vcpu_is_reset_bsp(vcpu) &&
2757
kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
2758
kvm_lapic_set_reg(apic, APIC_LVT0,
2759
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
2760
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
2761
2762
kvm_apic_set_dfr(apic, 0xffffffffU);
2763
apic_set_spiv(apic, 0xff);
2764
kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
2765
if (!apic_x2apic_mode(apic))
2766
kvm_apic_set_ldr(apic, 0);
2767
kvm_lapic_set_reg(apic, APIC_ESR, 0);
2768
if (!apic_x2apic_mode(apic)) {
2769
kvm_lapic_set_reg(apic, APIC_ICR, 0);
2770
kvm_lapic_set_reg(apic, APIC_ICR2, 0);
2771
} else {
2772
kvm_lapic_set_reg64(apic, APIC_ICR, 0);
2773
}
2774
kvm_lapic_set_reg(apic, APIC_TDCR, 0);
2775
kvm_lapic_set_reg(apic, APIC_TMICT, 0);
2776
for (i = 0; i < 8; i++) {
2777
kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
2778
kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
2779
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
2780
}
2781
kvm_apic_update_apicv(vcpu);
2782
update_divide_count(apic);
2783
atomic_set(&apic->lapic_timer.pending, 0);
2784
2785
vcpu->arch.pv_eoi.msr_val = 0;
2786
apic_update_ppr(apic);
2787
if (apic->apicv_active) {
2788
kvm_x86_call(apicv_post_state_restore)(vcpu);
2789
kvm_x86_call(hwapic_isr_update)(vcpu, -1);
2790
}
2791
2792
vcpu->arch.apic_arb_prio = 0;
2793
vcpu->arch.apic_attention = 0;
2794
2795
kvm_recalculate_apic_map(vcpu->kvm);
2796
}
2797
2798
/*
2799
*----------------------------------------------------------------------
2800
* timer interface
2801
*----------------------------------------------------------------------
2802
*/
2803
2804
static bool lapic_is_periodic(struct kvm_lapic *apic)
2805
{
2806
return apic_lvtt_period(apic);
2807
}
2808
2809
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
2810
{
2811
struct kvm_lapic *apic = vcpu->arch.apic;
2812
2813
if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
2814
return atomic_read(&apic->lapic_timer.pending);
2815
2816
return 0;
2817
}
2818
2819
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
2820
{
2821
u32 reg = kvm_lapic_get_reg(apic, lvt_type);
2822
int vector, mode, trig_mode;
2823
int r;
2824
2825
if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
2826
vector = reg & APIC_VECTOR_MASK;
2827
mode = reg & APIC_MODE_MASK;
2828
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
2829
2830
r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
2831
if (r && lvt_type == APIC_LVTPC &&
2832
guest_cpuid_is_intel_compatible(apic->vcpu))
2833
kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
2834
return r;
2835
}
2836
return 0;
2837
}
2838
2839
void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
2840
{
2841
struct kvm_lapic *apic = vcpu->arch.apic;
2842
2843
if (apic)
2844
kvm_apic_local_deliver(apic, APIC_LVT0);
2845
}
2846
2847
static const struct kvm_io_device_ops apic_mmio_ops = {
2848
.read = apic_mmio_read,
2849
.write = apic_mmio_write,
2850
};
2851
2852
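/*
 * hrtimer callback for the software APIC timer.  A periodic timer re-arms
 * itself for the next period; one-shot and tscdeadline timers do not
 * restart.
 */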
static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
2853
{
2854
struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
2855
struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
2856
2857
apic_timer_expired(apic, true);
2858
2859
if (lapic_is_periodic(apic)) {
2860
advance_periodic_target_expiration(apic);
2861
hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
2862
return HRTIMER_RESTART;
2863
} else
2864
return HRTIMER_NORESTART;
2865
}
2866
2867
int kvm_create_lapic(struct kvm_vcpu *vcpu)
2868
{
2869
struct kvm_lapic *apic;
2870
2871
ASSERT(vcpu != NULL);
2872
2873
if (!irqchip_in_kernel(vcpu->kvm)) {
2874
static_branch_inc(&kvm_has_noapic_vcpu);
2875
return 0;
2876
}
2877
2878
apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
2879
if (!apic)
2880
goto nomem;
2881
2882
vcpu->arch.apic = apic;
2883
2884
if (kvm_x86_ops.alloc_apic_backing_page)
2885
apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu);
2886
else
2887
apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
2888
if (!apic->regs) {
2889
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
2890
vcpu->vcpu_id);
2891
goto nomem_free_apic;
2892
}
2893
apic->vcpu = vcpu;
2894
2895
apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
2896
2897
hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC,
2898
HRTIMER_MODE_ABS_HARD);
2899
if (lapic_timer_advance)
2900
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
2901
2902
/*
2903
* Stuff the APIC ENABLE bit in lieu of temporarily incrementing
2904
* apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
2905
*/
2906
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
2907
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
2908
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
2909
2910
/*
2911
* Defer evaluating inhibits until the vCPU is first run, as this vCPU
2912
* will not get notified of any changes until this vCPU is visible to
2913
* other vCPUs (marked online and added to the set of vCPUs).
2914
*
2915
* Opportunistically mark APICv active, as VMX in particular is highly
2916
* unlikely to have inhibits. Ignore the current per-VM APICv state so
2917
* that vCPU creation is guaranteed to run with a deterministic value;
2918
* the request will ensure the vCPU gets the correct state before VM-Entry.
2919
*/
2920
if (enable_apicv) {
2921
apic->apicv_active = true;
2922
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
2923
}
2924
2925
return 0;
2926
nomem_free_apic:
2927
kfree(apic);
2928
vcpu->arch.apic = NULL;
2929
nomem:
2930
return -ENOMEM;
2931
}
2932
2933
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
2934
{
2935
struct kvm_lapic *apic = vcpu->arch.apic;
2936
u32 ppr;
2937
2938
if (!kvm_apic_present(vcpu))
2939
return -1;
2940
2941
if (apic->guest_apic_protected)
2942
return -1;
2943
2944
__apic_update_ppr(apic, &ppr);
2945
return apic_has_interrupt_for_ppr(apic, ppr);
2946
}
2947
EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt);
2948
2949
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
2950
{
2951
u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
2952
2953
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
2954
return 1;
2955
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
2956
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
2957
return 1;
2958
return 0;
2959
}
2960
2961
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
2962
{
2963
struct kvm_lapic *apic = vcpu->arch.apic;
2964
2965
if (atomic_read(&apic->lapic_timer.pending) > 0) {
2966
kvm_apic_inject_pending_timer_irqs(apic);
2967
atomic_set(&apic->lapic_timer.pending, 0);
2968
}
2969
}
2970
2971
void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
2972
{
2973
struct kvm_lapic *apic = vcpu->arch.apic;
2974
u32 ppr;
2975
2976
if (WARN_ON_ONCE(vector < 0 || !apic))
2977
return;
2978
2979
/*
2980
* We get here even with APIC virtualization enabled, if doing
2981
* nested virtualization and L1 runs with the "acknowledge interrupt
2982
* on exit" mode. Then we cannot inject the interrupt via RVI,
2983
* because the process would deliver it through the IDT.
2984
*/
2985
2986
apic_clear_irr(vector, apic);
2987
if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) {
2988
/*
2989
* For auto-EOI interrupts, there might be another pending
2990
* interrupt above PPR, so check whether to raise another
2991
* KVM_REQ_EVENT.
2992
*/
2993
apic_update_ppr(apic);
2994
} else {
2995
/*
2996
* For normal interrupts, PPR has been raised and there cannot
2997
* be a higher-priority pending interrupt---except if there was
2998
* a concurrent interrupt injection, but that would have
2999
* triggered KVM_REQ_EVENT already.
3000
*/
3001
apic_set_isr(vector, apic);
3002
__apic_update_ppr(apic, &ppr);
3003
}
3004
3005
}
3006
EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt);
3007
3008
static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
3009
struct kvm_lapic_state *s, bool set)
3010
{
3011
if (apic_x2apic_mode(vcpu->arch.apic)) {
3012
u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic);
3013
u32 *id = (u32 *)(s->regs + APIC_ID);
3014
u32 *ldr = (u32 *)(s->regs + APIC_LDR);
3015
u64 icr;
3016
3017
if (vcpu->kvm->arch.x2apic_format) {
3018
if (*id != x2apic_id)
3019
return -EINVAL;
3020
} else {
3021
/*
3022
* Ignore the userspace value when setting APIC state.
3023
* KVM's model is that the x2APIC ID is readonly, e.g.
3024
* KVM only supports delivering interrupts to KVM's
3025
* version of the x2APIC ID. However, for backwards
3026
* compatibility, don't reject attempts to set a
3027
* mismatched ID for userspace that hasn't opted into
3028
* x2apic_format.
3029
*/
3030
if (set)
3031
*id = x2apic_id;
3032
else
3033
*id = x2apic_id << 24;
3034
}
3035
3036
/*
3037
* In x2APIC mode, the LDR is fixed and based on the id. And
3038
* if the ICR is _not_ split, ICR is internally a single 64-bit
3039
* register, but needs to be split to ICR+ICR2 in userspace for
3040
* backwards compatibility.
3041
*/
3042
if (set)
3043
*ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
3044
3045
if (!kvm_x86_ops.x2apic_icr_is_split) {
3046
if (set) {
3047
icr = apic_get_reg(s->regs, APIC_ICR) |
3048
(u64)apic_get_reg(s->regs, APIC_ICR2) << 32;
3049
apic_set_reg64(s->regs, APIC_ICR, icr);
3050
} else {
3051
icr = apic_get_reg64(s->regs, APIC_ICR);
3052
apic_set_reg(s->regs, APIC_ICR2, icr >> 32);
3053
}
3054
}
3055
}
3056
3057
return 0;
3058
}
3059
3060
int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
3061
{
3062
memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
3063
3064
/*
3065
* Get calculated timer current count for remaining timer period (if
3066
* any) and store it in the returned register set.
3067
*/
3068
apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
3069
3070
return kvm_apic_state_fixup(vcpu, s, false);
3071
}
3072
3073
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
3074
{
3075
struct kvm_lapic *apic = vcpu->arch.apic;
3076
int r;
3077
3078
kvm_x86_call(apicv_pre_state_restore)(vcpu);
3079
3080
/* set SPIV separately to get count of SW disabled APICs right */
3081
apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
3082
3083
r = kvm_apic_state_fixup(vcpu, s, true);
3084
if (r) {
3085
kvm_recalculate_apic_map(vcpu->kvm);
3086
return r;
3087
}
3088
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
3089
3090
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
3091
kvm_recalculate_apic_map(vcpu->kvm);
3092
kvm_apic_set_version(vcpu);
3093
3094
apic_update_ppr(apic);
3095
cancel_apic_timer(apic);
3096
apic->lapic_timer.expired_tscdeadline = 0;
3097
apic_update_lvtt(apic);
3098
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
3099
update_divide_count(apic);
3100
__start_apic_timer(apic, APIC_TMCCT);
3101
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
3102
kvm_apic_update_apicv(vcpu);
3103
if (apic->apicv_active) {
3104
kvm_x86_call(apicv_post_state_restore)(vcpu);
3105
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
3106
}
3107
kvm_make_request(KVM_REQ_EVENT, vcpu);
3108
3109
#ifdef CONFIG_KVM_IOAPIC
3110
if (ioapic_in_kernel(vcpu->kvm))
3111
kvm_rtc_eoi_tracking_restore_one(vcpu);
3112
#endif
3113
3114
vcpu->arch.apic_arb_prio = 0;
3115
3116
return 0;
3117
}
3118
3119
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
3120
{
3121
struct hrtimer *timer;
3122
3123
if (!lapic_in_kernel(vcpu) ||
3124
kvm_can_post_timer_interrupt(vcpu))
3125
return;
3126
3127
timer = &vcpu->arch.apic->lapic_timer.timer;
3128
if (hrtimer_cancel(timer))
3129
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
3130
}
3131
3132
/*
3133
* apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
3134
*
3135
* Detect whether guest triggered PV EOI since the
3136
* last entry. If yes, set EOI on the guest's behalf.
3137
* Clear PV EOI in guest memory in any case.
3138
*/
3139
static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
3140
struct kvm_lapic *apic)
3141
{
3142
int vector;
3143
/*
3144
* PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
3145
* and KVM_PV_EOI_ENABLED in guest memory as follows:
3146
*
3147
* KVM_APIC_PV_EOI_PENDING is unset:
3148
* -> host disabled PV EOI.
3149
* KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
3150
* -> host enabled PV EOI, guest did not execute EOI yet.
3151
* KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
3152
* -> host enabled PV EOI, guest executed EOI.
3153
*/
3154
BUG_ON(!pv_eoi_enabled(vcpu));
3155
3156
if (pv_eoi_test_and_clr_pending(vcpu))
3157
return;
3158
vector = apic_set_eoi(apic);
3159
trace_kvm_pv_eoi(apic, vector);
3160
}
3161
3162
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
3163
{
3164
u32 data;
3165
3166
if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
3167
apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
3168
3169
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
3170
return;
3171
3172
if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
3173
sizeof(u32)))
3174
return;
3175
3176
apic_set_tpr(vcpu->arch.apic, data & 0xff);
3177
}
3178
3179
/*
3180
* apic_sync_pv_eoi_to_guest - called before vmentry
3181
*
3182
* Detect whether it's safe to enable PV EOI and
3183
* if yes do so.
3184
*/
3185
static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
3186
struct kvm_lapic *apic)
3187
{
3188
if (!pv_eoi_enabled(vcpu) ||
3189
/* IRR set or many bits in ISR: could be nested. */
3190
apic->irr_pending ||
3191
/* Cache not set: could be safe but we don't bother. */
3192
apic->highest_isr_cache == -1 ||
3193
/* Need EOI to update ioapic. */
3194
kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
3195
/*
3196
* PV EOI was disabled by apic_sync_pv_eoi_from_guest
3197
* so we need not do anything here.
3198
*/
3199
return;
3200
}
3201
3202
pv_eoi_set_pending(apic->vcpu);
3203
}
3204
3205
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
3206
{
3207
u32 data, tpr;
3208
int max_irr, max_isr;
3209
struct kvm_lapic *apic = vcpu->arch.apic;
3210
3211
apic_sync_pv_eoi_to_guest(vcpu, apic);
3212
3213
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
3214
return;
3215
3216
tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff;
3217
max_irr = apic_find_highest_irr(apic);
3218
if (max_irr < 0)
3219
max_irr = 0;
3220
max_isr = apic_find_highest_isr(apic);
3221
if (max_isr < 0)
3222
max_isr = 0;
3223
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
3224
3225
kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
3226
sizeof(u32));
3227
}
3228
3229
int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
3230
{
3231
if (vapic_addr) {
3232
if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
3233
&vcpu->arch.apic->vapic_cache,
3234
vapic_addr, sizeof(u32)))
3235
return -EINVAL;
3236
__set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
3237
} else {
3238
__clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
3239
}
3240
3241
vcpu->arch.apic->vapic_addr = vapic_addr;
3242
return 0;
3243
}
3244
3245
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
3246
{
3247
u32 low;
3248
3249
if (reg == APIC_ICR) {
3250
*data = kvm_x2apic_icr_read(apic);
3251
return 0;
3252
}
3253
3254
if (kvm_lapic_reg_read(apic, reg, 4, &low))
3255
return 1;
3256
3257
*data = low;
3258
3259
return 0;
3260
}
3261
3262
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
3263
{
3264
/*
3265
* ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and
3266
* can be written as such, all other registers remain accessible only
3267
* through 32-bit reads/writes.
3268
*/
3269
if (reg == APIC_ICR)
3270
return kvm_x2apic_icr_write(apic, data);
3271
3272
/* Bits 63:32 are reserved in all other registers. */
3273
if (data >> 32)
3274
return 1;
3275
3276
return kvm_lapic_reg_write(apic, reg, (u32)data);
3277
}
3278
3279
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
3280
{
3281
struct kvm_lapic *apic = vcpu->arch.apic;
3282
u32 reg = (msr - APIC_BASE_MSR) << 4;
3283
3284
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
3285
return 1;
3286
3287
return kvm_lapic_msr_write(apic, reg, data);
3288
}
3289
3290
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
3291
{
3292
struct kvm_lapic *apic = vcpu->arch.apic;
3293
u32 reg = (msr - APIC_BASE_MSR) << 4;
3294
3295
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
3296
return 1;
3297
3298
return kvm_lapic_msr_read(apic, reg, data);
3299
}
3300
3301
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
3302
{
3303
if (!lapic_in_kernel(vcpu))
3304
return 1;
3305
3306
return kvm_lapic_msr_write(vcpu->arch.apic, reg, data);
3307
}
3308
3309
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
3310
{
3311
if (!lapic_in_kernel(vcpu))
3312
return 1;
3313
3314
return kvm_lapic_msr_read(vcpu->arch.apic, reg, data);
3315
}
3316
3317
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
3318
{
3319
u64 addr = data & ~KVM_MSR_ENABLED;
3320
struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
3321
unsigned long new_len;
3322
int ret;
3323
3324
if (!IS_ALIGNED(addr, 4))
3325
return 1;
3326
3327
if (data & KVM_MSR_ENABLED) {
3328
if (addr == ghc->gpa && len <= ghc->len)
3329
new_len = ghc->len;
3330
else
3331
new_len = len;
3332
3333
ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
3334
if (ret)
3335
return ret;
3336
}
3337
3338
vcpu->arch.pv_eoi.msr_val = data;
3339
3340
return 0;
3341
}
3342
3343
int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
3344
{
3345
struct kvm_lapic *apic = vcpu->arch.apic;
3346
u8 sipi_vector;
3347
int r;
3348
3349
if (!kvm_apic_has_pending_init_or_sipi(vcpu))
3350
return 0;
3351
3352
if (is_guest_mode(vcpu)) {
3353
r = kvm_check_nested_events(vcpu);
3354
if (r < 0)
3355
return r == -EBUSY ? 0 : r;
3356
/*
3357
* Continue processing INIT/SIPI even if a nested VM-Exit
3358
* occurred, e.g. pending SIPIs should be dropped if INIT+SIPI
3359
* are blocked as a result of transitioning to VMX root mode.
3360
*/
3361
}
3362
3363
/*
3364
* INITs are blocked while the CPU is in specific states (SMM, VMX root
3365
* mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in
3366
* wait-for-SIPI (WFS).
3367
*/
3368
if (!kvm_apic_init_sipi_allowed(vcpu)) {
3369
WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
3370
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
3371
return 0;
3372
}
3373
3374
if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
3375
kvm_vcpu_reset(vcpu, true);
3376
if (kvm_vcpu_is_bsp(apic->vcpu))
3377
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
3378
else
3379
kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
3380
}
3381
if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
3382
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
3383
/* evaluate pending_events before reading the vector */
3384
smp_rmb();
3385
sipi_vector = apic->sipi_vector;
3386
kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu,
3387
sipi_vector);
3388
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
3389
}
3390
}
3391
return 0;
3392
}
3393
3394
void kvm_lapic_exit(void)
3395
{
3396
static_key_deferred_flush(&apic_hw_disabled);
3397
WARN_ON(static_branch_unlikely(&apic_hw_disabled.key));
3398
static_key_deferred_flush(&apic_sw_disabled);
3399
WARN_ON(static_branch_unlikely(&apic_sw_disabled.key));
3400
}
3401
3402