GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/xen.c
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
4
* Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5
*
6
* KVM Xen emulation
7
*/
8
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
10
#include "x86.h"
11
#include "xen.h"
12
#include "hyperv.h"
13
#include "irq.h"
14
15
#include <linux/eventfd.h>
16
#include <linux/kvm_host.h>
17
#include <linux/sched/stat.h>
18
19
#include <trace/events/kvm.h>
20
#include <xen/interface/xen.h>
21
#include <xen/interface/vcpu.h>
22
#include <xen/interface/version.h>
23
#include <xen/interface/event_channel.h>
24
#include <xen/interface/sched.h>
25
26
#include <asm/xen/cpuid.h>
27
#include <asm/pvclock.h>
28
29
#include "cpuid.h"
30
#include "trace.h"
31
32
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
33
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
34
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
35
36
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
37
38
static int kvm_xen_shared_info_init(struct kvm *kvm)
39
{
40
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
41
struct pvclock_wall_clock *wc;
42
u32 *wc_sec_hi;
43
u32 wc_version;
44
u64 wall_nsec;
45
int ret = 0;
46
int idx = srcu_read_lock(&kvm->srcu);
47
48
read_lock_irq(&gpc->lock);
49
while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
50
read_unlock_irq(&gpc->lock);
51
52
ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
53
if (ret)
54
goto out;
55
56
read_lock_irq(&gpc->lock);
57
}
58
59
/*
60
* This code mirrors kvm_write_wall_clock() except that it writes
61
* directly through the pfn cache and doesn't mark the page dirty.
62
*/
63
wall_nsec = kvm_get_wall_clock_epoch(kvm);
64
65
/* Paranoia checks on the 32-bit struct layout */
66
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
67
BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
68
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
69
70
#ifdef CONFIG_X86_64
71
/* Paranoia checks on the 64-bit struct layout */
72
BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
73
BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
74
75
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
76
struct shared_info *shinfo = gpc->khva;
77
78
wc_sec_hi = &shinfo->wc_sec_hi;
79
wc = &shinfo->wc;
80
} else
81
#endif
82
{
83
struct compat_shared_info *shinfo = gpc->khva;
84
85
wc_sec_hi = &shinfo->arch.wc_sec_hi;
86
wc = &shinfo->wc;
87
}
88
89
/* Increment and ensure an odd value */
90
wc_version = wc->version = (wc->version + 1) | 1;
91
smp_wmb();
92
93
wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
94
wc->sec = (u32)wall_nsec;
95
*wc_sec_hi = wall_nsec >> 32;
96
smp_wmb();
97
98
wc->version = wc_version + 1;
99
read_unlock_irq(&gpc->lock);
100
101
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
102
103
out:
104
srcu_read_unlock(&kvm->srcu, idx);
105
return ret;
106
}
107
108
void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
109
{
110
if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
111
struct kvm_xen_evtchn e;
112
113
e.vcpu_id = vcpu->vcpu_id;
114
e.vcpu_idx = vcpu->vcpu_idx;
115
e.port = vcpu->arch.xen.timer_virq;
116
e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
117
118
kvm_xen_set_evtchn(&e, vcpu->kvm);
119
120
vcpu->arch.xen.timer_expires = 0;
121
atomic_set(&vcpu->arch.xen.timer_pending, 0);
122
}
123
}
124
125
static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
126
{
127
struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
128
arch.xen.timer);
129
struct kvm_xen_evtchn e;
130
int rc;
131
132
if (atomic_read(&vcpu->arch.xen.timer_pending))
133
return HRTIMER_NORESTART;
134
135
e.vcpu_id = vcpu->vcpu_id;
136
e.vcpu_idx = vcpu->vcpu_idx;
137
e.port = vcpu->arch.xen.timer_virq;
138
e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
139
140
rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
141
if (rc != -EWOULDBLOCK) {
142
vcpu->arch.xen.timer_expires = 0;
143
return HRTIMER_NORESTART;
144
}
145
146
atomic_inc(&vcpu->arch.xen.timer_pending);
147
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
148
kvm_vcpu_kick(vcpu);
149
150
return HRTIMER_NORESTART;
151
}
152
153
static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
154
struct pvclock_vcpu_time_info *hv_clock,
155
struct gfn_to_pfn_cache *gpc,
156
unsigned int offset)
157
{
158
unsigned long flags;
159
int r;
160
161
read_lock_irqsave(&gpc->lock, flags);
162
while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
163
read_unlock_irqrestore(&gpc->lock, flags);
164
165
r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
166
if (r)
167
return r;
168
169
read_lock_irqsave(&gpc->lock, flags);
170
}
171
172
memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
173
read_unlock_irqrestore(&gpc->lock, flags);
174
175
/*
176
* Sanity check TSC shift+multiplier to verify the guest's view of time
177
* is more or less consistent.
178
*/
179
if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift ||
180
hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul)
181
return -EINVAL;
182
183
return 0;
184
}
185
186
static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
187
bool linux_wa)
188
{
189
struct kvm_vcpu_xen *xen = &vcpu->arch.xen;
190
int64_t kernel_now, delta;
191
uint64_t guest_now;
192
int r = -EOPNOTSUPP;
193
194
/*
195
* The guest provides the requested timeout in absolute nanoseconds
196
* of the KVM clock — as *it* sees it, based on the scaled TSC and
197
* the pvclock information provided by KVM.
198
*
199
* The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
200
* so use CLOCK_MONOTONIC. In the timescales covered by timers, the
201
* difference won't matter much as there is no cumulative effect.
202
*
203
* Calculate the time for some arbitrary point in time around "now"
204
* in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
205
* delta between the kvmclock "now" value and the guest's requested
206
* timeout, apply the "Linux workaround" described below, and add
207
* the resulting delta to the CLOCK_MONOTONIC "now" value, to get
208
* the absolute CLOCK_MONOTONIC time at which the timer should
209
* fire.
210
*/
211
do {
212
struct pvclock_vcpu_time_info hv_clock;
213
uint64_t host_tsc, guest_tsc;
214
215
if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
216
!vcpu->kvm->arch.use_master_clock)
217
break;
218
219
/*
220
* If both Xen PV clocks are active, arbitrarily try to use the
221
* compat clock first, but also try to use the non-compat clock
222
* if the compat clock is unusable. The two PV clocks hold the
223
* same information, but it's possible one (or both) is stale
224
* and/or currently unreachable.
225
*/
226
if (xen->vcpu_info_cache.active)
227
r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache,
228
offsetof(struct compat_vcpu_info, time));
229
if (r && xen->vcpu_time_info_cache.active)
230
r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0);
231
if (r)
232
break;
233
234
if (!IS_ENABLED(CONFIG_64BIT) ||
235
!kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
236
/*
237
* Don't fall back to get_kvmclock_ns() because it's
238
* broken; it has a systemic error in its results
239
* because it scales directly from host TSC to
240
* nanoseconds, and doesn't scale first to guest TSC
241
* and *then* to nanoseconds as the guest does.
242
*
243
* There is a small error introduced here because time
244
* continues to elapse between the ktime_get() and the
245
* subsequent rdtsc(). But not the systemic drift due
246
* to get_kvmclock_ns().
247
*/
248
kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
249
host_tsc = rdtsc();
250
}
251
252
/* Calculate the guest kvmclock as the guest would do it. */
253
guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
254
guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc);
255
} while (0);
256
257
if (r) {
258
/*
259
* Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
260
*
261
* Also if the guest PV clock hasn't been set up yet, as is
262
* likely to be the case during migration when the vCPU has
263
* not been run yet. It would be possible to calculate the
264
* scaling factors properly in that case but there's not much
265
* point in doing so. The get_kvmclock_ns() drift accumulates
266
* over time, so it's OK to use it at startup. Besides, on
267
* migration there's going to be a little bit of skew in the
268
* precise moment at which timers fire anyway. Often they'll
269
* be in the "past" by the time the VM is running again after
270
* migration.
271
*/
272
guest_now = get_kvmclock_ns(vcpu->kvm);
273
kernel_now = ktime_get();
274
}
275
276
delta = guest_abs - guest_now;
277
278
/*
279
* Xen has a 'Linux workaround' in do_set_timer_op() which checks for
280
* negative absolute timeout values (caused by integer overflow), and
281
* for values about 13 days in the future (2^50ns) which would be
282
* caused by jiffies overflow. For those cases, Xen sets the timeout
283
* 100ms in the future (not *too* soon, since if a guest really did
284
* set a long timeout on purpose we don't want to keep churning CPU
285
* time by waking it up). Emulate Xen's workaround when starting the
286
* timer in response to __HYPERVISOR_set_timer_op.
287
*/
288
if (linux_wa &&
289
unlikely((int64_t)guest_abs < 0 ||
290
(delta > 0 && (uint32_t) (delta >> 50) != 0))) {
291
delta = 100 * NSEC_PER_MSEC;
292
guest_abs = guest_now + delta;
293
}
294
295
/*
296
* Avoid races with the old timer firing. Checking timer_expires
297
* to avoid calling hrtimer_cancel() will only have false positives
298
* so is fine.
299
*/
300
if (vcpu->arch.xen.timer_expires)
301
hrtimer_cancel(&vcpu->arch.xen.timer);
302
303
atomic_set(&vcpu->arch.xen.timer_pending, 0);
304
vcpu->arch.xen.timer_expires = guest_abs;
305
306
if (delta <= 0)
307
xen_timer_callback(&vcpu->arch.xen.timer);
308
else
309
hrtimer_start(&vcpu->arch.xen.timer,
310
ktime_add_ns(kernel_now, delta),
311
HRTIMER_MODE_ABS_HARD);
312
}
313
314
static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
315
{
316
hrtimer_cancel(&vcpu->arch.xen.timer);
317
vcpu->arch.xen.timer_expires = 0;
318
atomic_set(&vcpu->arch.xen.timer_pending, 0);
319
}
320
321
static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
322
{
323
struct kvm_vcpu_xen *vx = &v->arch.xen;
324
struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
325
struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
326
size_t user_len, user_len1, user_len2;
327
struct vcpu_runstate_info rs;
328
unsigned long flags;
329
size_t times_ofs;
330
uint8_t *update_bit = NULL;
331
uint64_t entry_time;
332
uint64_t *rs_times;
333
int *rs_state;
334
335
/*
336
* The only difference between 32-bit and 64-bit versions of the
337
* runstate struct is the alignment of uint64_t in 32-bit, which
338
* means that the 64-bit version has an additional 4 bytes of
339
* padding after the first field 'state'. Let's be really really
340
* paranoid about that, and matching it with our internal data
341
* structures that we memcpy into it...
342
*/
343
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
344
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
345
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
346
#ifdef CONFIG_X86_64
347
/*
348
* The 64-bit structure has 4 bytes of padding before 'state_entry_time'
349
* so each subsequent field is shifted by 4, and it's 4 bytes longer.
350
*/
351
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
352
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
353
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
354
offsetof(struct compat_vcpu_runstate_info, time) + 4);
355
BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
356
#endif
357
/*
358
* The state field is in the same place at the start of both structs,
359
* and is the same size (int) as vx->current_runstate.
360
*/
361
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
362
offsetof(struct compat_vcpu_runstate_info, state));
363
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
364
sizeof(vx->current_runstate));
365
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
366
sizeof(vx->current_runstate));
367
368
/*
369
* The state_entry_time field is 64 bits in both versions, and the
370
* XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
371
* is little-endian means that it's in the last *byte* of the word.
372
* That detail is important later.
373
*/
374
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
375
sizeof(uint64_t));
376
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
377
sizeof(uint64_t));
378
BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
379
380
/*
381
* The time array is four 64-bit quantities in both versions, matching
382
* the vx->runstate_times and immediately following state_entry_time.
383
*/
384
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
385
offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
386
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
387
offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
388
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
389
sizeof_field(struct compat_vcpu_runstate_info, time));
390
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
391
sizeof(vx->runstate_times));
392
393
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
394
user_len = sizeof(struct vcpu_runstate_info);
395
times_ofs = offsetof(struct vcpu_runstate_info,
396
state_entry_time);
397
} else {
398
user_len = sizeof(struct compat_vcpu_runstate_info);
399
times_ofs = offsetof(struct compat_vcpu_runstate_info,
400
state_entry_time);
401
}
402
403
/*
404
* There are basically no alignment constraints. The guest can set it
405
* up so it crosses from one page to the next, and at arbitrary byte
406
* alignment (and the 32-bit ABI doesn't align the 64-bit integers
407
* anyway, even if the overall struct had been 64-bit aligned).
408
*/
409
if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
410
user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
411
user_len2 = user_len - user_len1;
412
} else {
413
user_len1 = user_len;
414
user_len2 = 0;
415
}
416
BUG_ON(user_len1 + user_len2 != user_len);
417
418
retry:
419
/*
420
* Attempt to obtain the GPC lock on *both* (if there are two)
421
* gfn_to_pfn caches that cover the region.
422
*/
423
if (atomic) {
424
local_irq_save(flags);
425
if (!read_trylock(&gpc1->lock)) {
426
local_irq_restore(flags);
427
return;
428
}
429
} else {
430
read_lock_irqsave(&gpc1->lock, flags);
431
}
432
while (!kvm_gpc_check(gpc1, user_len1)) {
433
read_unlock_irqrestore(&gpc1->lock, flags);
434
435
/* When invoked from kvm_sched_out() we cannot sleep */
436
if (atomic)
437
return;
438
439
if (kvm_gpc_refresh(gpc1, user_len1))
440
return;
441
442
read_lock_irqsave(&gpc1->lock, flags);
443
}
444
445
if (likely(!user_len2)) {
446
/*
447
* Set up three pointers directly to the runstate_info
448
* struct in the guest (via the GPC).
449
*
450
* • @rs_state → state field
451
* • @rs_times → state_entry_time field.
452
* • @update_bit → last byte of state_entry_time, which
453
* contains the XEN_RUNSTATE_UPDATE bit.
454
*/
455
rs_state = gpc1->khva;
456
rs_times = gpc1->khva + times_ofs;
457
if (v->kvm->arch.xen.runstate_update_flag)
458
update_bit = ((void *)(&rs_times[1])) - 1;
459
} else {
460
/*
461
* The guest's runstate_info is split across two pages and we
462
* need to hold and validate both GPCs simultaneously. We can
463
* declare a lock ordering GPC1 > GPC2 because nothing else
464
* takes them more than one at a time. Set a subclass on the
465
* gpc1 lock to make lockdep shut up about it.
466
*/
467
lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
468
if (atomic) {
469
if (!read_trylock(&gpc2->lock)) {
470
read_unlock_irqrestore(&gpc1->lock, flags);
471
return;
472
}
473
} else {
474
read_lock(&gpc2->lock);
475
}
476
477
if (!kvm_gpc_check(gpc2, user_len2)) {
478
read_unlock(&gpc2->lock);
479
read_unlock_irqrestore(&gpc1->lock, flags);
480
481
/* When invoked from kvm_sched_out() we cannot sleep */
482
if (atomic)
483
return;
484
485
/*
486
* Use kvm_gpc_activate() here because if the runstate
487
* area was configured in 32-bit mode and only extends
488
* to the second page now because the guest changed to
489
* 64-bit mode, the second GPC won't have been set up.
490
*/
491
if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
492
user_len2))
493
return;
494
495
/*
496
* We dropped the lock on GPC1 so we have to go all the
497
* way back and revalidate that too.
498
*/
499
goto retry;
500
}
501
502
/*
503
* In this case, the runstate_info struct will be assembled on
504
* the kernel stack (compat or not as appropriate) and will
505
* be copied to GPC1/GPC2 with a dual memcpy. Set up the three
506
* rs pointers accordingly.
507
*/
508
rs_times = &rs.state_entry_time;
509
510
/*
511
* The rs_state pointer points to the start of what we'll
512
* copy to the guest, which in the case of a compat guest
513
* is the 32-bit field that the compiler thinks is padding.
514
*/
515
rs_state = ((void *)rs_times) - times_ofs;
516
517
/*
518
* The update_bit is still directly in the guest memory,
519
* via one GPC or the other.
520
*/
521
if (v->kvm->arch.xen.runstate_update_flag) {
522
if (user_len1 >= times_ofs + sizeof(uint64_t))
523
update_bit = gpc1->khva + times_ofs +
524
sizeof(uint64_t) - 1;
525
else
526
update_bit = gpc2->khva + times_ofs +
527
sizeof(uint64_t) - 1 - user_len1;
528
}
529
530
#ifdef CONFIG_X86_64
531
/*
532
* Don't leak kernel memory through the padding in the 64-bit
533
* version of the struct.
534
*/
535
memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
536
#endif
537
}
538
539
/*
540
* First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
541
* state_entry_time field, directly in the guest. We need to set
542
* that (and write-barrier) before writing to the rest of the
543
* structure, and clear it last. Just as Xen does, we address the
544
* single *byte* in which it resides because it might be in a
545
* different cache line to the rest of the 64-bit word, due to
546
* the (lack of) alignment constraints.
547
*/
548
entry_time = vx->runstate_entry_time;
549
if (update_bit) {
550
entry_time |= XEN_RUNSTATE_UPDATE;
551
*update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
552
smp_wmb();
553
}
554
555
/*
556
* Now assemble the actual structure, either on our kernel stack
557
* or directly in the guest according to how the rs_state and
558
* rs_times pointers were set up above.
559
*/
560
*rs_state = vx->current_runstate;
561
rs_times[0] = entry_time;
562
memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
563
564
/* For the split case, we have to then copy it to the guest. */
565
if (user_len2) {
566
memcpy(gpc1->khva, rs_state, user_len1);
567
memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
568
}
569
smp_wmb();
570
571
/* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
572
if (update_bit) {
573
entry_time &= ~XEN_RUNSTATE_UPDATE;
574
*update_bit = entry_time >> 56;
575
smp_wmb();
576
}
577
578
if (user_len2) {
579
kvm_gpc_mark_dirty_in_slot(gpc2);
580
read_unlock(&gpc2->lock);
581
}
582
583
kvm_gpc_mark_dirty_in_slot(gpc1);
584
read_unlock_irqrestore(&gpc1->lock, flags);
585
}
586
587
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
588
{
589
struct kvm_vcpu_xen *vx = &v->arch.xen;
590
u64 now = get_kvmclock_ns(v->kvm);
591
u64 delta_ns = now - vx->runstate_entry_time;
592
u64 run_delay = current->sched_info.run_delay;
593
594
if (unlikely(!vx->runstate_entry_time))
595
vx->current_runstate = RUNSTATE_offline;
596
597
/*
598
* Time waiting for the scheduler isn't "stolen" if the
599
* vCPU wasn't running anyway.
600
*/
601
if (vx->current_runstate == RUNSTATE_running) {
602
u64 steal_ns = run_delay - vx->last_steal;
603
604
delta_ns -= steal_ns;
605
606
vx->runstate_times[RUNSTATE_runnable] += steal_ns;
607
}
608
vx->last_steal = run_delay;
609
610
vx->runstate_times[vx->current_runstate] += delta_ns;
611
vx->current_runstate = state;
612
vx->runstate_entry_time = now;
613
614
if (vx->runstate_cache.active)
615
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
616
}
617
618
void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
619
{
620
struct kvm_lapic_irq irq = { };
621
622
irq.dest_id = v->vcpu_id;
623
irq.vector = v->arch.xen.upcall_vector;
624
irq.dest_mode = APIC_DEST_PHYSICAL;
625
irq.shorthand = APIC_DEST_NOSHORT;
626
irq.delivery_mode = APIC_DM_FIXED;
627
irq.level = 1;
628
629
kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
630
}
631
632
/*
633
* On event channel delivery, the vcpu_info may not have been accessible.
634
* In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
635
* need to be marked into the vcpu_info (and evtchn_upcall_pending set).
636
* Do so now that we can sleep in the context of the vCPU to bring the
637
* page in, and refresh the pfn cache for it.
638
*/
639
void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
640
{
641
unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
642
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
643
unsigned long flags;
644
645
if (!evtchn_pending_sel)
646
return;
647
648
/*
649
* Yes, this is an open-coded loop. But that's just what put_user()
650
* does anyway. Page it in and retry the instruction. We're just a
651
* little more honest about it.
652
*/
653
read_lock_irqsave(&gpc->lock, flags);
654
while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
655
read_unlock_irqrestore(&gpc->lock, flags);
656
657
if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
658
return;
659
660
read_lock_irqsave(&gpc->lock, flags);
661
}
662
663
/* Now gpc->khva is a valid kernel address for the vcpu_info */
664
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
665
struct vcpu_info *vi = gpc->khva;
666
667
asm volatile(LOCK_PREFIX "orq %0, %1\n"
668
"notq %0\n"
669
LOCK_PREFIX "andq %0, %2\n"
670
: "=r" (evtchn_pending_sel),
671
"+m" (vi->evtchn_pending_sel),
672
"+m" (v->arch.xen.evtchn_pending_sel)
673
: "0" (evtchn_pending_sel));
674
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
675
} else {
676
u32 evtchn_pending_sel32 = evtchn_pending_sel;
677
struct compat_vcpu_info *vi = gpc->khva;
678
679
asm volatile(LOCK_PREFIX "orl %0, %1\n"
680
"notl %0\n"
681
LOCK_PREFIX "andl %0, %2\n"
682
: "=r" (evtchn_pending_sel32),
683
"+m" (vi->evtchn_pending_sel),
684
"+m" (v->arch.xen.evtchn_pending_sel)
685
: "0" (evtchn_pending_sel32));
686
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
687
}
688
689
kvm_gpc_mark_dirty_in_slot(gpc);
690
read_unlock_irqrestore(&gpc->lock, flags);
691
692
/* For the per-vCPU lapic vector, deliver it as MSI. */
693
if (v->arch.xen.upcall_vector)
694
kvm_xen_inject_vcpu_vector(v);
695
}
696
697
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
698
{
699
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
700
unsigned long flags;
701
u8 rc = 0;
702
703
/*
704
* If the global upcall vector (HVMIRQ_callback_vector) is set and
705
* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
706
*/
707
708
/* No need for compat handling here */
709
BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
710
offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
711
BUILD_BUG_ON(sizeof(rc) !=
712
sizeof_field(struct vcpu_info, evtchn_upcall_pending));
713
BUILD_BUG_ON(sizeof(rc) !=
714
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
715
716
read_lock_irqsave(&gpc->lock, flags);
717
while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
718
read_unlock_irqrestore(&gpc->lock, flags);
719
720
/*
721
* This function gets called from kvm_vcpu_block() after setting the
722
* task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
723
* from a HLT. So we really mustn't sleep. If the page ended up absent
724
* at that point, just return 1 in order to trigger an immediate wake,
725
* and we'll end up getting called again from a context where we *can*
726
* fault in the page and wait for it.
727
*/
728
if (in_atomic() || !task_is_running(current))
729
return 1;
730
731
if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
732
/*
733
* If this failed, userspace has screwed up the
734
* vcpu_info mapping. No interrupts for you.
735
*/
736
return 0;
737
}
738
read_lock_irqsave(&gpc->lock, flags);
739
}
740
741
rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
742
read_unlock_irqrestore(&gpc->lock, flags);
743
return rc;
744
}
745
746
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
747
{
748
int r = -ENOENT;
749
750
751
switch (data->type) {
752
case KVM_XEN_ATTR_TYPE_LONG_MODE:
753
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
754
r = -EINVAL;
755
} else {
756
mutex_lock(&kvm->arch.xen.xen_lock);
757
kvm->arch.xen.long_mode = !!data->u.long_mode;
758
759
/*
760
* Re-initialize shared_info to put the wallclock in the
761
* correct place. Whilst it's not necessary to do this
762
* unless the mode is actually changed, it does no harm
763
* to make the call anyway.
764
*/
765
r = kvm->arch.xen.shinfo_cache.active ?
766
kvm_xen_shared_info_init(kvm) : 0;
767
mutex_unlock(&kvm->arch.xen.xen_lock);
768
}
769
break;
770
771
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
772
case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
773
int idx;
774
775
mutex_lock(&kvm->arch.xen.xen_lock);
776
777
idx = srcu_read_lock(&kvm->srcu);
778
779
if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
780
gfn_t gfn = data->u.shared_info.gfn;
781
782
if (gfn == KVM_XEN_INVALID_GFN) {
783
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
784
r = 0;
785
} else {
786
r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
787
gfn_to_gpa(gfn), PAGE_SIZE);
788
}
789
} else {
790
void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
791
792
if (!PAGE_ALIGNED(hva)) {
793
r = -EINVAL;
794
} else if (!hva) {
795
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
796
r = 0;
797
} else {
798
r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
799
(unsigned long)hva, PAGE_SIZE);
800
}
801
}
802
803
srcu_read_unlock(&kvm->srcu, idx);
804
805
if (!r && kvm->arch.xen.shinfo_cache.active)
806
r = kvm_xen_shared_info_init(kvm);
807
808
mutex_unlock(&kvm->arch.xen.xen_lock);
809
break;
810
}
811
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
812
if (data->u.vector && data->u.vector < 0x10)
813
r = -EINVAL;
814
else {
815
mutex_lock(&kvm->arch.xen.xen_lock);
816
kvm->arch.xen.upcall_vector = data->u.vector;
817
mutex_unlock(&kvm->arch.xen.xen_lock);
818
r = 0;
819
}
820
break;
821
822
case KVM_XEN_ATTR_TYPE_EVTCHN:
823
r = kvm_xen_setattr_evtchn(kvm, data);
824
break;
825
826
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
827
mutex_lock(&kvm->arch.xen.xen_lock);
828
kvm->arch.xen.xen_version = data->u.xen_version;
829
mutex_unlock(&kvm->arch.xen.xen_lock);
830
r = 0;
831
break;
832
833
case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
834
if (!sched_info_on()) {
835
r = -EOPNOTSUPP;
836
break;
837
}
838
mutex_lock(&kvm->arch.xen.xen_lock);
839
kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
840
mutex_unlock(&kvm->arch.xen.xen_lock);
841
r = 0;
842
break;
843
844
default:
845
break;
846
}
847
848
return r;
849
}
850
851
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
852
{
853
int r = -ENOENT;
854
855
mutex_lock(&kvm->arch.xen.xen_lock);
856
857
switch (data->type) {
858
case KVM_XEN_ATTR_TYPE_LONG_MODE:
859
data->u.long_mode = kvm->arch.xen.long_mode;
860
r = 0;
861
break;
862
863
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
864
if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
865
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
866
else
867
data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
868
r = 0;
869
break;
870
871
case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
872
if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
873
data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
874
else
875
data->u.shared_info.hva = 0;
876
r = 0;
877
break;
878
879
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
880
data->u.vector = kvm->arch.xen.upcall_vector;
881
r = 0;
882
break;
883
884
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
885
data->u.xen_version = kvm->arch.xen.xen_version;
886
r = 0;
887
break;
888
889
case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
890
if (!sched_info_on()) {
891
r = -EOPNOTSUPP;
892
break;
893
}
894
data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
895
r = 0;
896
break;
897
898
default:
899
break;
900
}
901
902
mutex_unlock(&kvm->arch.xen.xen_lock);
903
return r;
904
}
905
906
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
907
{
908
int idx, r = -ENOENT;
909
910
mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
911
idx = srcu_read_lock(&vcpu->kvm->srcu);
912
913
switch (data->type) {
914
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
915
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
916
/* No compat necessary here. */
917
BUILD_BUG_ON(sizeof(struct vcpu_info) !=
918
sizeof(struct compat_vcpu_info));
919
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
920
offsetof(struct compat_vcpu_info, time));
921
922
if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
923
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
924
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
925
r = 0;
926
break;
927
}
928
929
r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
930
data->u.gpa, sizeof(struct vcpu_info));
931
} else {
932
if (data->u.hva == 0) {
933
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
934
r = 0;
935
break;
936
}
937
938
r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
939
data->u.hva, sizeof(struct vcpu_info));
940
}
941
942
if (!r)
943
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
944
945
break;
946
947
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
948
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
949
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
950
r = 0;
951
break;
952
}
953
954
r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
955
data->u.gpa,
956
sizeof(struct pvclock_vcpu_time_info));
957
if (!r)
958
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
959
break;
960
961
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
962
size_t sz, sz1, sz2;
963
964
if (!sched_info_on()) {
965
r = -EOPNOTSUPP;
966
break;
967
}
968
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
969
r = 0;
970
deactivate_out:
971
kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
972
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
973
break;
974
}
975
976
/*
977
* If the guest switches to 64-bit mode after setting the runstate
978
* address, that's actually OK. kvm_xen_update_runstate_guest()
979
* will cope.
980
*/
981
if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
982
sz = sizeof(struct vcpu_runstate_info);
983
else
984
sz = sizeof(struct compat_vcpu_runstate_info);
985
986
/* How much fits in the (first) page? */
987
sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
988
r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
989
data->u.gpa, sz1);
990
if (r)
991
goto deactivate_out;
992
993
/* Either map the second page, or deactivate the second GPC */
994
if (sz1 >= sz) {
995
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
996
} else {
997
sz2 = sz - sz1;
998
BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
999
r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
1000
data->u.gpa + sz1, sz2);
1001
if (r)
1002
goto deactivate_out;
1003
}
1004
1005
kvm_xen_update_runstate_guest(vcpu, false);
1006
break;
1007
}
1008
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
1009
if (!sched_info_on()) {
1010
r = -EOPNOTSUPP;
1011
break;
1012
}
1013
if (data->u.runstate.state > RUNSTATE_offline) {
1014
r = -EINVAL;
1015
break;
1016
}
1017
1018
kvm_xen_update_runstate(vcpu, data->u.runstate.state);
1019
r = 0;
1020
break;
1021
1022
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
1023
if (!sched_info_on()) {
1024
r = -EOPNOTSUPP;
1025
break;
1026
}
1027
if (data->u.runstate.state > RUNSTATE_offline) {
1028
r = -EINVAL;
1029
break;
1030
}
1031
if (data->u.runstate.state_entry_time !=
1032
(data->u.runstate.time_running +
1033
data->u.runstate.time_runnable +
1034
data->u.runstate.time_blocked +
1035
data->u.runstate.time_offline)) {
1036
r = -EINVAL;
1037
break;
1038
}
1039
if (get_kvmclock_ns(vcpu->kvm) <
1040
data->u.runstate.state_entry_time) {
1041
r = -EINVAL;
1042
break;
1043
}
1044
1045
vcpu->arch.xen.current_runstate = data->u.runstate.state;
1046
vcpu->arch.xen.runstate_entry_time =
1047
data->u.runstate.state_entry_time;
1048
vcpu->arch.xen.runstate_times[RUNSTATE_running] =
1049
data->u.runstate.time_running;
1050
vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
1051
data->u.runstate.time_runnable;
1052
vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
1053
data->u.runstate.time_blocked;
1054
vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
1055
data->u.runstate.time_offline;
1056
vcpu->arch.xen.last_steal = current->sched_info.run_delay;
1057
r = 0;
1058
break;
1059
1060
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
1061
if (!sched_info_on()) {
1062
r = -EOPNOTSUPP;
1063
break;
1064
}
1065
if (data->u.runstate.state > RUNSTATE_offline &&
1066
data->u.runstate.state != (u64)-1) {
1067
r = -EINVAL;
1068
break;
1069
}
1070
/* The adjustment must add up */
1071
if (data->u.runstate.state_entry_time !=
1072
(data->u.runstate.time_running +
1073
data->u.runstate.time_runnable +
1074
data->u.runstate.time_blocked +
1075
data->u.runstate.time_offline)) {
1076
r = -EINVAL;
1077
break;
1078
}
1079
1080
if (get_kvmclock_ns(vcpu->kvm) <
1081
(vcpu->arch.xen.runstate_entry_time +
1082
data->u.runstate.state_entry_time)) {
1083
r = -EINVAL;
1084
break;
1085
}
1086
1087
vcpu->arch.xen.runstate_entry_time +=
1088
data->u.runstate.state_entry_time;
1089
vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
1090
data->u.runstate.time_running;
1091
vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
1092
data->u.runstate.time_runnable;
1093
vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
1094
data->u.runstate.time_blocked;
1095
vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
1096
data->u.runstate.time_offline;
1097
1098
if (data->u.runstate.state <= RUNSTATE_offline)
1099
kvm_xen_update_runstate(vcpu, data->u.runstate.state);
1100
else if (vcpu->arch.xen.runstate_cache.active)
1101
kvm_xen_update_runstate_guest(vcpu, false);
1102
r = 0;
1103
break;
1104
1105
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
1106
if (data->u.vcpu_id >= KVM_MAX_VCPUS)
1107
r = -EINVAL;
1108
else {
1109
vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
1110
r = 0;
1111
}
1112
break;
1113
1114
case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
1115
if (data->u.timer.port &&
1116
data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
1117
r = -EINVAL;
1118
break;
1119
}
1120
1121
/* Stop the timer (if it's running) before changing the vector */
1122
kvm_xen_stop_timer(vcpu);
1123
vcpu->arch.xen.timer_virq = data->u.timer.port;
1124
1125
/* Start the timer if the new value has a valid vector+expiry. */
1126
if (data->u.timer.port && data->u.timer.expires_ns)
1127
kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
1128
1129
r = 0;
1130
break;
1131
1132
case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
1133
if (data->u.vector && data->u.vector < 0x10)
1134
r = -EINVAL;
1135
else {
1136
vcpu->arch.xen.upcall_vector = data->u.vector;
1137
r = 0;
1138
}
1139
break;
1140
1141
default:
1142
break;
1143
}
1144
1145
srcu_read_unlock(&vcpu->kvm->srcu, idx);
1146
mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
1147
return r;
1148
}
1149
1150
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
1151
{
1152
int r = -ENOENT;
1153
1154
mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
1155
1156
switch (data->type) {
1157
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
1158
if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
1159
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
1160
else
1161
data->u.gpa = KVM_XEN_INVALID_GPA;
1162
r = 0;
1163
break;
1164
1165
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
1166
if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
1167
data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
1168
else
1169
data->u.hva = 0;
1170
r = 0;
1171
break;
1172
1173
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
1174
if (vcpu->arch.xen.vcpu_time_info_cache.active)
1175
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
1176
else
1177
data->u.gpa = KVM_XEN_INVALID_GPA;
1178
r = 0;
1179
break;
1180
1181
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
1182
if (!sched_info_on()) {
1183
r = -EOPNOTSUPP;
1184
break;
1185
}
1186
if (vcpu->arch.xen.runstate_cache.active) {
1187
data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
1188
r = 0;
1189
}
1190
break;
1191
1192
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
1193
if (!sched_info_on()) {
1194
r = -EOPNOTSUPP;
1195
break;
1196
}
1197
data->u.runstate.state = vcpu->arch.xen.current_runstate;
1198
r = 0;
1199
break;
1200
1201
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
1202
if (!sched_info_on()) {
1203
r = -EOPNOTSUPP;
1204
break;
1205
}
1206
data->u.runstate.state = vcpu->arch.xen.current_runstate;
1207
data->u.runstate.state_entry_time =
1208
vcpu->arch.xen.runstate_entry_time;
1209
data->u.runstate.time_running =
1210
vcpu->arch.xen.runstate_times[RUNSTATE_running];
1211
data->u.runstate.time_runnable =
1212
vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
1213
data->u.runstate.time_blocked =
1214
vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
1215
data->u.runstate.time_offline =
1216
vcpu->arch.xen.runstate_times[RUNSTATE_offline];
1217
r = 0;
1218
break;
1219
1220
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
1221
r = -EINVAL;
1222
break;
1223
1224
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
1225
data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
1226
r = 0;
1227
break;
1228
1229
case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
1230
/*
1231
* Ensure a consistent snapshot of state is captured, with a
1232
* timer either being pending, or the event channel delivered
1233
* to the corresponding bit in the shared_info. Not still
1234
* lurking in the timer_pending flag for deferred delivery.
1235
* Purely as an optimisation, if the timer_expires field is
1236
* zero, that means the timer isn't active (or even in the
1237
* timer_pending flag) and there is no need to cancel it.
1238
*/
1239
if (vcpu->arch.xen.timer_expires) {
1240
hrtimer_cancel(&vcpu->arch.xen.timer);
1241
kvm_xen_inject_timer_irqs(vcpu);
1242
}
1243
1244
data->u.timer.port = vcpu->arch.xen.timer_virq;
1245
data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
1246
data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
1247
1248
/*
1249
* The hrtimer may trigger and raise the IRQ immediately,
1250
* while the returned state causes it to be set up and
1251
* raised again on the destination system after migration.
1252
* That's fine, as the guest won't even have had a chance
1253
* to run and handle the interrupt. Asserting an already
1254
* pending event channel is idempotent.
1255
*/
1256
if (vcpu->arch.xen.timer_expires)
1257
hrtimer_start_expires(&vcpu->arch.xen.timer,
1258
HRTIMER_MODE_ABS_HARD);
1259
1260
r = 0;
1261
break;
1262
1263
case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
1264
data->u.vector = vcpu->arch.xen.upcall_vector;
1265
r = 0;
1266
break;
1267
1268
default:
1269
break;
1270
}
1271
1272
mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
1273
return r;
1274
}
1275
1276
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
1277
{
1278
struct kvm *kvm = vcpu->kvm;
1279
u32 page_num = data & ~PAGE_MASK;
1280
u64 page_addr = data & PAGE_MASK;
1281
bool lm = is_long_mode(vcpu);
1282
int r = 0;
1283
1284
mutex_lock(&kvm->arch.xen.xen_lock);
1285
if (kvm->arch.xen.long_mode != lm) {
1286
kvm->arch.xen.long_mode = lm;
1287
1288
/*
1289
* Re-initialize shared_info to put the wallclock in the
1290
* correct place.
1291
*/
1292
if (kvm->arch.xen.shinfo_cache.active &&
1293
kvm_xen_shared_info_init(kvm))
1294
r = 1;
1295
}
1296
mutex_unlock(&kvm->arch.xen.xen_lock);
1297
1298
if (r)
1299
return r;
1300
1301
/*
1302
* If Xen hypercall intercept is enabled, fill the hypercall
1303
* page with VMCALL/VMMCALL instructions since that's what
1304
* we catch. Else the VMM has provided the hypercall pages
1305
* with instructions of its own choosing, so use those.
1306
*/
1307
if (kvm_xen_hypercall_enabled(kvm)) {
1308
u8 instructions[32];
1309
int i;
1310
1311
if (page_num)
1312
return 1;
1313
1314
/* mov imm32, %eax */
1315
instructions[0] = 0xb8;
1316
1317
/* vmcall / vmmcall */
1318
kvm_x86_call(patch_hypercall)(vcpu, instructions + 5);
1319
1320
/* ret */
1321
instructions[8] = 0xc3;
1322
1323
/* int3 to pad */
1324
memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
1325
1326
for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
1327
*(u32 *)&instructions[1] = i;
1328
if (kvm_vcpu_write_guest(vcpu,
1329
page_addr + (i * sizeof(instructions)),
1330
instructions, sizeof(instructions)))
1331
return 1;
1332
}
1333
} else {
1334
/*
1335
* Note, truncation is a non-issue as 'lm' is guaranteed to be
1336
* false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
1337
*/
1338
hva_t blob_addr = lm ? kvm->arch.xen.hvm_config.blob_addr_64
1339
: kvm->arch.xen.hvm_config.blob_addr_32;
1340
u8 blob_size = lm ? kvm->arch.xen.hvm_config.blob_size_64
1341
: kvm->arch.xen.hvm_config.blob_size_32;
1342
u8 *page;
1343
int ret;
1344
1345
if (page_num >= blob_size)
1346
return 1;
1347
1348
blob_addr += page_num * PAGE_SIZE;
1349
1350
page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
1351
if (IS_ERR(page))
1352
return PTR_ERR(page);
1353
1354
ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
1355
kfree(page);
1356
if (ret)
1357
return 1;
1358
}
1359
return 0;
1360
}
1361
1362
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
1363
{
1364
/* Only some feature flags need to be *enabled* by userspace */
1365
u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
1366
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
1367
KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
1368
u32 old_flags;
1369
1370
if (xhc->flags & ~permitted_flags)
1371
return -EINVAL;
1372
1373
/*
1374
* With hypercall interception the kernel generates its own
1375
* hypercall page so it must not be provided.
1376
*/
1377
if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
1378
(xhc->blob_addr_32 || xhc->blob_addr_64 ||
1379
xhc->blob_size_32 || xhc->blob_size_64))
1380
return -EINVAL;
1381
1382
/*
1383
* Restrict the MSR to the range that is unofficially reserved for
1384
* synthetic, virtualization-defined MSRs, e.g. to prevent confusing
1385
* KVM by colliding with a real MSR that requires special handling.
1386
*/
1387
if (xhc->msr &&
1388
(xhc->msr < KVM_XEN_MSR_MIN_INDEX || xhc->msr > KVM_XEN_MSR_MAX_INDEX))
1389
return -EINVAL;
1390
1391
mutex_lock(&kvm->arch.xen.xen_lock);
1392
1393
if (xhc->msr && !kvm->arch.xen.hvm_config.msr)
1394
static_branch_inc(&kvm_xen_enabled.key);
1395
else if (!xhc->msr && kvm->arch.xen.hvm_config.msr)
1396
static_branch_slow_dec_deferred(&kvm_xen_enabled);
1397
1398
old_flags = kvm->arch.xen.hvm_config.flags;
1399
memcpy(&kvm->arch.xen.hvm_config, xhc, sizeof(*xhc));
1400
1401
mutex_unlock(&kvm->arch.xen.xen_lock);
1402
1403
if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
1404
kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
1405
1406
return 0;
1407
}
1408
1409
static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
1410
{
1411
kvm_rax_write(vcpu, result);
1412
return kvm_skip_emulated_instruction(vcpu);
1413
}
1414
1415
static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
1416
{
1417
struct kvm_run *run = vcpu->run;
1418
1419
if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
1420
return 1;
1421
1422
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
1423
}
1424
1425
static inline int max_evtchn_port(struct kvm *kvm)
1426
{
1427
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
1428
return EVTCHN_2L_NR_CHANNELS;
1429
else
1430
return COMPAT_EVTCHN_2L_NR_CHANNELS;
1431
}
1432
1433
static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
1434
evtchn_port_t *ports)
1435
{
1436
struct kvm *kvm = vcpu->kvm;
1437
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1438
unsigned long *pending_bits;
1439
unsigned long flags;
1440
bool ret = true;
1441
int idx, i;
1442
1443
idx = srcu_read_lock(&kvm->srcu);
1444
read_lock_irqsave(&gpc->lock, flags);
1445
if (!kvm_gpc_check(gpc, PAGE_SIZE))
1446
goto out_rcu;
1447
1448
ret = false;
1449
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1450
struct shared_info *shinfo = gpc->khva;
1451
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1452
} else {
1453
struct compat_shared_info *shinfo = gpc->khva;
1454
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1455
}
1456
1457
for (i = 0; i < nr_ports; i++) {
1458
if (test_bit(ports[i], pending_bits)) {
1459
ret = true;
1460
break;
1461
}
1462
}
1463
1464
out_rcu:
1465
read_unlock_irqrestore(&gpc->lock, flags);
1466
srcu_read_unlock(&kvm->srcu, idx);
1467
1468
return ret;
1469
}
1470
1471
static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
1472
u64 param, u64 *r)
1473
{
1474
struct sched_poll sched_poll;
1475
evtchn_port_t port, *ports;
1476
struct x86_exception e;
1477
int i;
1478
1479
if (!lapic_in_kernel(vcpu) ||
1480
!(vcpu->kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
1481
return false;
1482
1483
if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
1484
struct compat_sched_poll sp32;
1485
1486
/* Sanity check that the compat struct definition is correct */
1487
BUILD_BUG_ON(sizeof(sp32) != 16);
1488
1489
if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
1490
*r = -EFAULT;
1491
return true;
1492
}
1493
1494
/*
1495
* This is a 32-bit pointer to an array of evtchn_port_t which
1496
* are uint32_t, so once it's converted no further compat
1497
* handling is needed.
1498
*/
1499
sched_poll.ports = (void *)(unsigned long)(sp32.ports);
1500
sched_poll.nr_ports = sp32.nr_ports;
1501
sched_poll.timeout = sp32.timeout;
1502
} else {
1503
if (kvm_read_guest_virt(vcpu, param, &sched_poll,
1504
sizeof(sched_poll), &e)) {
1505
*r = -EFAULT;
1506
return true;
1507
}
1508
}
1509
1510
if (unlikely(sched_poll.nr_ports > 1)) {
1511
/* Xen (unofficially) limits number of pollers to 128 */
1512
if (sched_poll.nr_ports > 128) {
1513
*r = -EINVAL;
1514
return true;
1515
}
1516
1517
ports = kmalloc_array(sched_poll.nr_ports,
1518
sizeof(*ports), GFP_KERNEL);
1519
if (!ports) {
1520
*r = -ENOMEM;
1521
return true;
1522
}
1523
} else
1524
ports = &port;
1525
1526
if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
1527
sched_poll.nr_ports * sizeof(*ports), &e)) {
1528
*r = -EFAULT;
1529
goto out;
1530
}
1531
1532
for (i = 0; i < sched_poll.nr_ports; i++) {
1533
if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
1534
*r = -EINVAL;
1535
goto out;
1536
}
1537
}
1538
1539
if (sched_poll.nr_ports == 1)
1540
vcpu->arch.xen.poll_evtchn = port;
1541
else
1542
vcpu->arch.xen.poll_evtchn = -1;
1543
1544
set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
1545
1546
if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
1547
kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
1548
1549
if (sched_poll.timeout)
1550
mod_timer(&vcpu->arch.xen.poll_timer,
1551
jiffies + nsecs_to_jiffies(sched_poll.timeout));
1552
1553
kvm_vcpu_halt(vcpu);
1554
1555
if (sched_poll.timeout)
1556
timer_delete(&vcpu->arch.xen.poll_timer);
1557
1558
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
1559
}
1560
1561
vcpu->arch.xen.poll_evtchn = 0;
1562
*r = 0;
1563
out:
1564
/* Really, this is only needed in case of timeout */
1565
clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
1566
1567
if (unlikely(sched_poll.nr_ports > 1))
1568
kfree(ports);
1569
return true;
1570
}
1571
1572
static void cancel_evtchn_poll(struct timer_list *t)
1573
{
1574
struct kvm_vcpu *vcpu = timer_container_of(vcpu, t,
1575
arch.xen.poll_timer);
1576
1577
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1578
kvm_vcpu_kick(vcpu);
1579
}
1580
1581
static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
1582
int cmd, u64 param, u64 *r)
1583
{
1584
switch (cmd) {
1585
case SCHEDOP_poll:
1586
if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
1587
return true;
1588
fallthrough;
1589
case SCHEDOP_yield:
1590
kvm_vcpu_on_spin(vcpu, true);
1591
*r = 0;
1592
return true;
1593
default:
1594
break;
1595
}
1596
1597
return false;
1598
}
1599
1600
struct compat_vcpu_set_singleshot_timer {
1601
uint64_t timeout_abs_ns;
1602
uint32_t flags;
1603
} __attribute__((packed));
1604
1605
static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
1606
int vcpu_id, u64 param, u64 *r)
1607
{
1608
struct vcpu_set_singleshot_timer oneshot;
1609
struct x86_exception e;
1610
1611
if (!kvm_xen_timer_enabled(vcpu))
1612
return false;
1613
1614
switch (cmd) {
1615
case VCPUOP_set_singleshot_timer:
1616
if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1617
*r = -EINVAL;
1618
return true;
1619
}
1620
1621
/*
1622
* The only difference for 32-bit compat is the 4 bytes of
1623
* padding after the interesting part of the structure. So
1624
* for a faithful emulation of Xen we have to *try* to copy
1625
* the padding and return -EFAULT if we can't. Otherwise we
1626
* might as well just have copied the 12-byte 32-bit struct.
1627
*/
1628
BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1629
offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1630
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1631
sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1632
BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
1633
offsetof(struct vcpu_set_singleshot_timer, flags));
1634
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
1635
sizeof_field(struct vcpu_set_singleshot_timer, flags));
1636
1637
if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) :
1638
sizeof(struct compat_vcpu_set_singleshot_timer), &e)) {
1639
*r = -EFAULT;
1640
return true;
1641
}
1642
1643
kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
1644
*r = 0;
1645
return true;
1646
1647
case VCPUOP_stop_singleshot_timer:
1648
if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1649
*r = -EINVAL;
1650
return true;
1651
}
1652
kvm_xen_stop_timer(vcpu);
1653
*r = 0;
1654
return true;
1655
}
1656
1657
return false;
1658
}
1659
1660
static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
1661
u64 *r)
1662
{
1663
if (!kvm_xen_timer_enabled(vcpu))
1664
return false;
1665
1666
if (timeout)
1667
kvm_xen_start_timer(vcpu, timeout, true);
1668
else
1669
kvm_xen_stop_timer(vcpu);
1670
1671
*r = 0;
1672
return true;
1673
}
1674
1675
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
1676
{
1677
bool longmode;
1678
u64 input, params[6], r = -ENOSYS;
1679
bool handled = false;
1680
u8 cpl;
1681
1682
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
1683
1684
/* Hyper-V hypercalls get bit 31 set in EAX */
1685
if ((input & 0x80000000) &&
1686
kvm_hv_hypercall_enabled(vcpu))
1687
return kvm_hv_hypercall(vcpu);
1688
1689
longmode = is_64_bit_hypercall(vcpu);
1690
if (!longmode) {
1691
params[0] = (u32)kvm_rbx_read(vcpu);
1692
params[1] = (u32)kvm_rcx_read(vcpu);
1693
params[2] = (u32)kvm_rdx_read(vcpu);
1694
params[3] = (u32)kvm_rsi_read(vcpu);
1695
params[4] = (u32)kvm_rdi_read(vcpu);
1696
params[5] = (u32)kvm_rbp_read(vcpu);
1697
}
1698
#ifdef CONFIG_X86_64
1699
else {
1700
params[0] = (u64)kvm_rdi_read(vcpu);
1701
params[1] = (u64)kvm_rsi_read(vcpu);
1702
params[2] = (u64)kvm_rdx_read(vcpu);
1703
params[3] = (u64)kvm_r10_read(vcpu);
1704
params[4] = (u64)kvm_r8_read(vcpu);
1705
params[5] = (u64)kvm_r9_read(vcpu);
1706
}
1707
#endif
1708
cpl = kvm_x86_call(get_cpl)(vcpu);
1709
trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
1710
params[3], params[4], params[5]);
1711
1712
/*
1713
* Only allow hypercall acceleration for CPL0. The rare hypercalls that
1714
* are permitted in guest userspace can be handled by the VMM.
1715
*/
1716
if (unlikely(cpl > 0))
1717
goto handle_in_userspace;
1718
1719
switch (input) {
1720
case __HYPERVISOR_xen_version:
1721
if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
1722
r = vcpu->kvm->arch.xen.xen_version;
1723
handled = true;
1724
}
1725
break;
1726
case __HYPERVISOR_event_channel_op:
1727
if (params[0] == EVTCHNOP_send)
1728
handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
1729
break;
1730
case __HYPERVISOR_sched_op:
1731
handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
1732
params[1], &r);
1733
break;
1734
case __HYPERVISOR_vcpu_op:
1735
handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
1736
params[2], &r);
1737
break;
1738
case __HYPERVISOR_set_timer_op: {
1739
u64 timeout = params[0];
1740
/* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
1741
if (!longmode)
1742
timeout |= params[1] << 32;
1743
handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
1744
break;
1745
}
1746
default:
1747
break;
1748
}
1749
1750
if (handled)
1751
return kvm_xen_hypercall_set_result(vcpu, r);
1752
1753
handle_in_userspace:
1754
vcpu->run->exit_reason = KVM_EXIT_XEN;
1755
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
1756
vcpu->run->xen.u.hcall.longmode = longmode;
1757
vcpu->run->xen.u.hcall.cpl = cpl;
1758
vcpu->run->xen.u.hcall.input = input;
1759
vcpu->run->xen.u.hcall.params[0] = params[0];
1760
vcpu->run->xen.u.hcall.params[1] = params[1];
1761
vcpu->run->xen.u.hcall.params[2] = params[2];
1762
vcpu->run->xen.u.hcall.params[3] = params[3];
1763
vcpu->run->xen.u.hcall.params[4] = params[4];
1764
vcpu->run->xen.u.hcall.params[5] = params[5];
1765
vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
1766
vcpu->arch.complete_userspace_io =
1767
kvm_xen_hypercall_complete_userspace;
1768
1769
return 0;
1770
}
1771
1772
static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
1773
{
1774
int poll_evtchn = vcpu->arch.xen.poll_evtchn;
1775
1776
if ((poll_evtchn == port || poll_evtchn == -1) &&
1777
test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
1778
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1779
kvm_vcpu_kick(vcpu);
1780
}
1781
}
1782
1783
/*
1784
* The return value from this function is propagated to kvm_set_irq() API,
1785
* so it returns:
1786
* < 0 Interrupt was ignored (masked or not delivered for other reasons)
1787
* = 0 Interrupt was coalesced (previous irq is still pending)
1788
* > 0 Number of CPUs interrupt was delivered to
1789
*
1790
* It is also called directly from kvm_arch_set_irq_inatomic(), where the
1791
* only check on its return value is a comparison with -EWOULDBLOCK.
1792
*/
1793
int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
1794
{
1795
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1796
struct kvm_vcpu *vcpu;
1797
unsigned long *pending_bits, *mask_bits;
1798
unsigned long flags;
1799
int port_word_bit;
1800
bool kick_vcpu = false;
1801
int vcpu_idx, idx, rc;
1802
1803
vcpu_idx = READ_ONCE(xe->vcpu_idx);
1804
if (vcpu_idx >= 0)
1805
vcpu = kvm_get_vcpu(kvm, vcpu_idx);
1806
else {
1807
vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
1808
if (!vcpu)
1809
return -EINVAL;
1810
WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
1811
}
1812
1813
if (xe->port >= max_evtchn_port(kvm))
1814
return -EINVAL;
1815
1816
rc = -EWOULDBLOCK;
1817
1818
idx = srcu_read_lock(&kvm->srcu);
1819
1820
read_lock_irqsave(&gpc->lock, flags);
1821
if (!kvm_gpc_check(gpc, PAGE_SIZE))
1822
goto out_rcu;
1823
1824
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1825
struct shared_info *shinfo = gpc->khva;
1826
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1827
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1828
port_word_bit = xe->port / 64;
1829
} else {
1830
struct compat_shared_info *shinfo = gpc->khva;
1831
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1832
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1833
port_word_bit = xe->port / 32;
1834
}
1835
1836
/*
1837
* If this port wasn't already set, and if it isn't masked, then
1838
* we try to set the corresponding bit in the in-kernel shadow of
1839
* evtchn_pending_sel for the target vCPU. And if *that* wasn't
1840
* already set, then we kick the vCPU in question to write to the
1841
* *real* evtchn_pending_sel in its own guest vcpu_info struct.
1842
*/
1843
if (test_and_set_bit(xe->port, pending_bits)) {
1844
rc = 0; /* It was already raised */
1845
} else if (test_bit(xe->port, mask_bits)) {
1846
rc = -ENOTCONN; /* Masked */
1847
kvm_xen_check_poller(vcpu, xe->port);
1848
} else {
1849
rc = 1; /* Delivered to the bitmap in shared_info. */
1850
/* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
1851
read_unlock_irqrestore(&gpc->lock, flags);
1852
gpc = &vcpu->arch.xen.vcpu_info_cache;
1853
1854
read_lock_irqsave(&gpc->lock, flags);
1855
if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
1856
/*
1857
* Could not access the vcpu_info. Set the bit in-kernel
1858
* and prod the vCPU to deliver it for itself.
1859
*/
1860
if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
1861
kick_vcpu = true;
1862
goto out_rcu;
1863
}
1864
1865
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1866
struct vcpu_info *vcpu_info = gpc->khva;
1867
if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
1868
WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
1869
kick_vcpu = true;
1870
}
1871
} else {
1872
struct compat_vcpu_info *vcpu_info = gpc->khva;
1873
if (!test_and_set_bit(port_word_bit,
1874
(unsigned long *)&vcpu_info->evtchn_pending_sel)) {
1875
WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
1876
kick_vcpu = true;
1877
}
1878
}
1879
1880
/* For the per-vCPU lapic vector, deliver it as MSI. */
1881
if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
1882
kvm_xen_inject_vcpu_vector(vcpu);
1883
kick_vcpu = false;
1884
}
1885
}
1886
1887
out_rcu:
1888
read_unlock_irqrestore(&gpc->lock, flags);
1889
srcu_read_unlock(&kvm->srcu, idx);
1890
1891
if (kick_vcpu) {
1892
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1893
kvm_vcpu_kick(vcpu);
1894
}
1895
1896
return rc;
1897
}
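/*
 * Illustrative sketch (not part of this file): how a caller following the
 * kvm_set_irq() convention documented above might interpret the return
 * codes.  The report_*() helpers are hypothetical placeholders for whatever
 * accounting or tracing the caller wants to do.
 */
#if 0
static void example_deliver(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
	int rc = kvm_xen_set_evtchn_fast(xe, kvm);

	/* The fast path bails out if the pfn cache needs a (sleeping) refresh. */
	if (rc == -EWOULDBLOCK)
		rc = kvm_xen_set_evtchn(xe, kvm);

	if (rc > 0)
		report_delivered();	/* pending bit newly set, vCPU notified */
	else if (rc == 0)
		report_coalesced();	/* the event was already pending */
	else
		report_ignored(rc);	/* masked (-ENOTCONN) or invalid (-EINVAL) */
}
#endif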
1898
1899
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
1900
{
1901
bool mm_borrowed = false;
1902
int rc;
1903
1904
rc = kvm_xen_set_evtchn_fast(xe, kvm);
1905
if (rc != -EWOULDBLOCK)
1906
return rc;
1907
1908
if (current->mm != kvm->mm) {
1909
/*
1910
* If not on a thread which already belongs to this KVM,
1911
* we'd better be in the irqfd workqueue.
1912
*/
1913
if (WARN_ON_ONCE(current->mm))
1914
return -EINVAL;
1915
1916
kthread_use_mm(kvm->mm);
1917
mm_borrowed = true;
1918
}
1919
1920
/*
1921
* It is theoretically possible for the page to be unmapped
1922
* and the MMU notifier to invalidate the shared_info before
1923
* we even get to use it. In that case, this looks like an
1924
* infinite loop. It was tempting to do it via the userspace
1925
* HVA instead... but that just *hides* the fact that it's
1926
* an infinite loop, because if a fault occurs and it waits
1927
* for the page to come back, it can *still* immediately
1928
* fault and have to wait again, repeatedly.
1929
*
1930
* Conversely, the page could also have been reinstated by
1931
* another thread before we even obtain the mutex above, so
1932
* check again *first* before remapping it.
1933
*/
1934
do {
1935
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1936
int idx;
1937
1938
rc = kvm_xen_set_evtchn_fast(xe, kvm);
1939
if (rc != -EWOULDBLOCK)
1940
break;
1941
1942
idx = srcu_read_lock(&kvm->srcu);
1943
rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
1944
srcu_read_unlock(&kvm->srcu, idx);
1945
} while (!rc);
1946
1947
if (mm_borrowed)
1948
kthread_unuse_mm(kvm->mm);
1949
1950
return rc;
1951
}
1952
1953
/* This is the version called from kvm_set_irq() as the .set function */
1954
static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
1955
int irq_source_id, int level, bool line_status)
1956
{
1957
if (!level)
1958
return -EINVAL;
1959
1960
return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
1961
}
1962
1963
/*
1964
* Set up an event channel interrupt from the KVM IRQ routing table.
1965
* Used for e.g. PIRQ from passed-through physical devices.
1966
*/
1967
int kvm_xen_setup_evtchn(struct kvm *kvm,
1968
struct kvm_kernel_irq_routing_entry *e,
1969
const struct kvm_irq_routing_entry *ue)
1970
1971
{
1972
struct kvm_vcpu *vcpu;
1973
1974
/*
1975
* Don't check for the port being within range of max_evtchn_port().
1976
* Userspace can configure whatever targets it likes; events just won't
1977
* be delivered if/while the target is invalid, just like userspace can
1978
* configure MSIs which target non-existent APICs.
1979
*
1980
* This means that on Live Migration and Live Update, the IRQ routing table
1981
* can be restored *independently* of other things like creating vCPUs,
1982
* without imposing an ordering dependency on userspace. In this
1983
* particular case, the problematic ordering would be with setting the
1984
* Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096
1985
* instead of 1024 event channels.
1986
*/
1987
1988
/* We only support 2 level event channels for now */
1989
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1990
return -EINVAL;
1991
1992
/*
1993
* Xen gives us interesting mappings from vCPU index to APIC ID,
1994
* which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
1995
* to find it. Do that once at setup time, instead of every time.
1996
* But beware that on live update / live migration, the routing
1997
* table might be reinstated before the vCPU threads have finished
1998
* recreating their vCPUs.
1999
*/
2000
vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
2001
if (vcpu)
2002
e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
2003
else
2004
e->xen_evtchn.vcpu_idx = -1;
2005
2006
e->xen_evtchn.port = ue->u.xen_evtchn.port;
2007
e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
2008
e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
2009
e->set = evtchn_set_fn;
2010
2011
return 0;
2012
}
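/*
 * Illustrative userspace-side sketch (not part of this file): installing a
 * routing entry of the type consumed by kvm_xen_setup_evtchn() above.  The
 * vm_fd/gsi/port/xen_vcpu_id parameters are assumptions supplied by the VMM,
 * and for brevity the single-entry table replaces any existing routing.
 */
#if 0
#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int example_route_gsi_to_evtchn(int vm_fd, unsigned int gsi,
					unsigned int port, unsigned int xen_vcpu_id)
{
	struct kvm_irq_routing *table;
	int ret;

	table = calloc(1, sizeof(*table) + sizeof(struct kvm_irq_routing_entry));
	if (!table)
		return -ENOMEM;

	table->nr = 1;
	table->entries[0] = (struct kvm_irq_routing_entry) {
		.gsi = gsi,
		.type = KVM_IRQ_ROUTING_XEN_EVTCHN,
		.u.xen_evtchn = {
			.port = port,
			.vcpu = xen_vcpu_id,	/* Xen vCPU id, not a KVM vcpu_idx */
			.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
		},
	};

	/* A real VMM would merge this entry into its full routing table. */
	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
	free(table);
	return ret;
}
#endif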
2013
2014
/*
2015
* Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
2016
*/
2017
int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
2018
{
2019
struct kvm_xen_evtchn e;
2020
int ret;
2021
2022
if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
2023
return -EINVAL;
2024
2025
/* We only support 2 level event channels for now */
2026
if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
2027
return -EINVAL;
2028
2029
e.port = uxe->port;
2030
e.vcpu_id = uxe->vcpu;
2031
e.vcpu_idx = -1;
2032
e.priority = uxe->priority;
2033
2034
ret = kvm_xen_set_evtchn(&e, kvm);
2035
2036
/*
2037
* None of that 'return 1 if it actually got delivered' nonsense.
2038
* We don't care if it was masked (-ENOTCONN) either.
2039
*/
2040
if (ret > 0 || ret == -ENOTCONN)
2041
ret = 0;
2042
2043
return ret;
2044
}
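/*
 * Illustrative userspace-side sketch (not part of this file): raising an
 * event from the VMM with the ioctl handled above.  vm_fd and the port/vcpu
 * values are assumptions for the sake of the example.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_evtchn_send(int vm_fd)
{
	struct kvm_irq_routing_xen_evtchn uxe = {
		.port = 3,		/* hypothetical guest-visible port */
		.vcpu = 0,		/* Xen vCPU id, not a KVM vcpu_idx */
		.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
	};

	if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &uxe) < 0)
		perror("KVM_XEN_HVM_EVTCHN_SEND");
}
#endif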
2045
2046
/*
2047
* Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
2048
*/
2049
struct evtchnfd {
2050
u32 send_port;
2051
u32 type;
2052
union {
2053
struct kvm_xen_evtchn port;
2054
struct {
2055
u32 port; /* zero */
2056
struct eventfd_ctx *ctx;
2057
} eventfd;
2058
} deliver;
2059
};
2060
2061
/*
2062
* Update target vCPU or priority for a registered sending channel.
2063
*/
2064
static int kvm_xen_eventfd_update(struct kvm *kvm,
2065
struct kvm_xen_hvm_attr *data)
2066
{
2067
u32 port = data->u.evtchn.send_port;
2068
struct evtchnfd *evtchnfd;
2069
int ret;
2070
2071
/* Protect writes to evtchnfd as well as the idr lookup. */
2072
mutex_lock(&kvm->arch.xen.xen_lock);
2073
evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
2074
2075
ret = -ENOENT;
2076
if (!evtchnfd)
2077
goto out_unlock;
2078
2079
/* For an UPDATE, nothing may change except the priority/vcpu */
2080
ret = -EINVAL;
2081
if (evtchnfd->type != data->u.evtchn.type)
2082
goto out_unlock;
2083
2084
/*
2085
* Port cannot change, and if it's zero that was an eventfd
2086
* which can't be changed either.
2087
*/
2088
if (!evtchnfd->deliver.port.port ||
2089
evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
2090
goto out_unlock;
2091
2092
/* We only support 2 level event channels for now */
2093
if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
2094
goto out_unlock;
2095
2096
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
2097
if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
2098
evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
2099
evtchnfd->deliver.port.vcpu_idx = -1;
2100
}
2101
ret = 0;
2102
out_unlock:
2103
mutex_unlock(&kvm->arch.xen.xen_lock);
2104
return ret;
2105
}
2106
2107
/*
2108
* Configure the target (eventfd or local port delivery) for sending on
2109
* a given event channel.
2110
*/
2111
static int kvm_xen_eventfd_assign(struct kvm *kvm,
2112
struct kvm_xen_hvm_attr *data)
2113
{
2114
u32 port = data->u.evtchn.send_port;
2115
struct eventfd_ctx *eventfd = NULL;
2116
struct evtchnfd *evtchnfd;
2117
int ret = -EINVAL;
2118
2119
evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
2120
if (!evtchnfd)
2121
return -ENOMEM;
2122
2123
switch (data->u.evtchn.type) {
2124
case EVTCHNSTAT_ipi:
2125
/* IPI must map back to the same port# */
2126
if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
2127
goto out_noeventfd; /* -EINVAL */
2128
break;
2129
2130
case EVTCHNSTAT_interdomain:
2131
if (data->u.evtchn.deliver.port.port) {
2132
if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
2133
goto out_noeventfd; /* -EINVAL */
2134
} else {
2135
eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
2136
if (IS_ERR(eventfd)) {
2137
ret = PTR_ERR(eventfd);
2138
goto out_noeventfd;
2139
}
2140
}
2141
break;
2142
2143
case EVTCHNSTAT_virq:
2144
case EVTCHNSTAT_closed:
2145
case EVTCHNSTAT_unbound:
2146
case EVTCHNSTAT_pirq:
2147
default: /* Unknown event channel type */
2148
goto out; /* -EINVAL */
2149
}
2150
2151
evtchnfd->send_port = data->u.evtchn.send_port;
2152
evtchnfd->type = data->u.evtchn.type;
2153
if (eventfd) {
2154
evtchnfd->deliver.eventfd.ctx = eventfd;
2155
} else {
2156
/* We only support 2 level event channels for now */
2157
if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
2158
goto out; /* -EINVAL */
2159
2160
evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
2161
evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
2162
evtchnfd->deliver.port.vcpu_idx = -1;
2163
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
2164
}
2165
2166
mutex_lock(&kvm->arch.xen.xen_lock);
2167
ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
2168
GFP_KERNEL);
2169
mutex_unlock(&kvm->arch.xen.xen_lock);
2170
if (ret >= 0)
2171
return 0;
2172
2173
if (ret == -ENOSPC)
2174
ret = -EEXIST;
2175
out:
2176
if (eventfd)
2177
eventfd_ctx_put(eventfd);
2178
out_noeventfd:
2179
kfree(evtchnfd);
2180
return ret;
2181
}
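/*
 * Illustrative userspace-side sketch (not part of this file): registering an
 * outbound interdomain channel so that the guest's EVTCHNOP_send on the
 * (hypothetical) port 5 signals an eventfd.  vm_fd is assumed to be the open
 * KVM VM file descriptor; EVTCHNSTAT_interdomain comes from the Xen public
 * headers.
 */
#if 0
#include <stdio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define EVTCHNSTAT_interdomain	2	/* from xen/interface/event_channel.h */

static int example_evtchn_assign(int vm_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
		.u.evtchn = {
			.send_port = 5,
			.type = EVTCHNSTAT_interdomain,
			.flags = 0,			/* plain assignment */
			.deliver.eventfd = {
				.port = 0,		/* zero selects eventfd delivery */
				.fd = efd,
			},
		},
	};

	if (efd < 0 || ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr) < 0) {
		perror("KVM_XEN_ATTR_TYPE_EVTCHN assign");
		return -1;
	}

	return efd;	/* poll this fd to observe the guest's sends */
}
#endif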
2182
2183
static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
2184
{
2185
struct evtchnfd *evtchnfd;
2186
2187
mutex_lock(&kvm->arch.xen.xen_lock);
2188
evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
2189
mutex_unlock(&kvm->arch.xen.xen_lock);
2190
2191
if (!evtchnfd)
2192
return -ENOENT;
2193
2194
synchronize_srcu(&kvm->srcu);
2195
if (!evtchnfd->deliver.port.port)
2196
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
2197
kfree(evtchnfd);
2198
return 0;
2199
}
2200
2201
static int kvm_xen_eventfd_reset(struct kvm *kvm)
2202
{
2203
struct evtchnfd *evtchnfd, **all_evtchnfds;
2204
int i;
2205
int n = 0;
2206
2207
mutex_lock(&kvm->arch.xen.xen_lock);
2208
2209
/*
2210
* Because synchronize_srcu() cannot be called inside the
2211
* critical section, first collect all the evtchnfd objects
2212
* in an array as they are removed from evtchn_ports.
2213
*/
2214
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
2215
n++;
2216
2217
all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
2218
if (!all_evtchnfds) {
2219
mutex_unlock(&kvm->arch.xen.xen_lock);
2220
return -ENOMEM;
2221
}
2222
2223
n = 0;
2224
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
2225
all_evtchnfds[n++] = evtchnfd;
2226
idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
2227
}
2228
mutex_unlock(&kvm->arch.xen.xen_lock);
2229
2230
synchronize_srcu(&kvm->srcu);
2231
2232
while (n--) {
2233
evtchnfd = all_evtchnfds[n];
2234
if (!evtchnfd->deliver.port.port)
2235
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
2236
kfree(evtchnfd);
2237
}
2238
kfree(all_evtchnfds);
2239
2240
return 0;
2241
}
2242
2243
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
2244
{
2245
u32 port = data->u.evtchn.send_port;
2246
2247
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
2248
return kvm_xen_eventfd_reset(kvm);
2249
2250
if (!port || port >= max_evtchn_port(kvm))
2251
return -EINVAL;
2252
2253
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
2254
return kvm_xen_eventfd_deassign(kvm, port);
2255
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
2256
return kvm_xen_eventfd_update(kvm, data);
2257
if (data->u.evtchn.flags)
2258
return -EINVAL;
2259
2260
return kvm_xen_eventfd_assign(kvm, data);
2261
}
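/*
 * Illustrative userspace-side sketch (not part of this file): the DEASSIGN
 * and RESET flags dispatched above, driven from the VMM.  vm_fd and 'port'
 * are assumptions supplied by the caller.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_evtchn_teardown(int vm_fd, unsigned int port)
{
	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
		.u.evtchn = {
			.send_port = port,
			.flags = KVM_XEN_EVTCHN_DEASSIGN,
		},
	};

	/* Drop a single registered send port... */
	ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);

	/* ...or wipe every registered port in one call (send_port is ignored). */
	attr.u.evtchn.flags = KVM_XEN_EVTCHN_RESET;
	ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
}
#endif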
2262
2263
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
2264
{
2265
struct evtchnfd *evtchnfd;
2266
struct evtchn_send send;
2267
struct x86_exception e;
2268
2269
/* Sanity check: this structure is the same for 32-bit and 64-bit */
2270
BUILD_BUG_ON(sizeof(send) != 4);
2271
if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
2272
*r = -EFAULT;
2273
return true;
2274
}
2275
2276
/*
2277
* evtchnfd is protected by kvm->srcu; the idr lookup instead
2278
* is protected by RCU.
2279
*/
2280
rcu_read_lock();
2281
evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
2282
rcu_read_unlock();
2283
if (!evtchnfd)
2284
return false;
2285
2286
if (evtchnfd->deliver.port.port) {
2287
int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
2288
if (ret < 0 && ret != -ENOTCONN)
2289
return false;
2290
} else {
2291
eventfd_signal(evtchnfd->deliver.eventfd.ctx);
2292
}
2293
2294
*r = 0;
2295
return true;
2296
}
2297
2298
void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
2299
{
2300
vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
2301
vcpu->arch.xen.poll_evtchn = 0;
2302
2303
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
2304
hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC,
2305
HRTIMER_MODE_ABS_HARD);
2306
2307
kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
2308
kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
2309
kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
2310
kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
2311
}
2312
2313
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
2314
{
2315
if (kvm_xen_timer_enabled(vcpu))
2316
kvm_xen_stop_timer(vcpu);
2317
2318
kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
2319
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
2320
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
2321
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
2322
2323
timer_delete_sync(&vcpu->arch.xen.poll_timer);
2324
}
2325
2326
void kvm_xen_init_vm(struct kvm *kvm)
2327
{
2328
mutex_init(&kvm->arch.xen.xen_lock);
2329
idr_init(&kvm->arch.xen.evtchn_ports);
2330
kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
2331
}
2332
2333
void kvm_xen_destroy_vm(struct kvm *kvm)
2334
{
2335
struct evtchnfd *evtchnfd;
2336
int i;
2337
2338
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
2339
2340
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
2341
if (!evtchnfd->deliver.port.port)
2342
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
2343
kfree(evtchnfd);
2344
}
2345
idr_destroy(&kvm->arch.xen.evtchn_ports);
2346
2347
if (kvm->arch.xen.hvm_config.msr)
2348
static_branch_slow_dec_deferred(&kvm_xen_enabled);
2349
}
2350
2351