GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/xen.c
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
4
* Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5
*
6
* KVM Xen emulation
7
*/
8
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
10
#include "x86.h"
11
#include "xen.h"
12
#include "hyperv.h"
13
#include "irq.h"
14
15
#include <linux/eventfd.h>
16
#include <linux/kvm_host.h>
17
#include <linux/sched/stat.h>
18
19
#include <trace/events/kvm.h>
20
#include <xen/interface/xen.h>
21
#include <xen/interface/vcpu.h>
22
#include <xen/interface/version.h>
23
#include <xen/interface/event_channel.h>
24
#include <xen/interface/sched.h>
25
26
#include <asm/xen/cpuid.h>
27
#include <asm/pvclock.h>
28
29
#include "cpuid.h"
30
#include "trace.h"
31
32
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
33
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
34
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
35
36
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
37
38
static int kvm_xen_shared_info_init(struct kvm *kvm)
39
{
40
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
41
struct pvclock_wall_clock *wc;
42
u32 *wc_sec_hi;
43
u32 wc_version;
44
u64 wall_nsec;
45
int ret = 0;
46
int idx = srcu_read_lock(&kvm->srcu);
47
48
read_lock_irq(&gpc->lock);
49
while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
50
read_unlock_irq(&gpc->lock);
51
52
ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
53
if (ret)
54
goto out;
55
56
read_lock_irq(&gpc->lock);
57
}
58
59
/*
60
* This code mirrors kvm_write_wall_clock() except that it writes
61
* directly through the pfn cache and doesn't mark the page dirty.
62
*/
63
wall_nsec = kvm_get_wall_clock_epoch(kvm);
64
65
/* Paranoia checks on the 32-bit struct layout */
66
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
67
BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
68
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
69
70
#ifdef CONFIG_X86_64
71
/* Paranoia checks on the 64-bit struct layout */
72
BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
73
BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
74
75
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
76
struct shared_info *shinfo = gpc->khva;
77
78
wc_sec_hi = &shinfo->wc_sec_hi;
79
wc = &shinfo->wc;
80
} else
81
#endif
82
{
83
struct compat_shared_info *shinfo = gpc->khva;
84
85
wc_sec_hi = &shinfo->arch.wc_sec_hi;
86
wc = &shinfo->wc;
87
}
88
89
/* Increment and ensure an odd value */
90
wc_version = wc->version = (wc->version + 1) | 1;
91
smp_wmb();
92
93
wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
94
wc->sec = (u32)wall_nsec;
95
*wc_sec_hi = wall_nsec >> 32;
96
smp_wmb();
97
98
wc->version = wc_version + 1;
99
read_unlock_irq(&gpc->lock);
100
101
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
102
103
out:
104
srcu_read_unlock(&kvm->srcu, idx);
105
return ret;
106
}
107
108
void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
109
{
110
if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
111
struct kvm_xen_evtchn e;
112
113
e.vcpu_id = vcpu->vcpu_id;
114
e.vcpu_idx = vcpu->vcpu_idx;
115
e.port = vcpu->arch.xen.timer_virq;
116
e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
117
118
kvm_xen_set_evtchn(&e, vcpu->kvm);
119
120
vcpu->arch.xen.timer_expires = 0;
121
atomic_set(&vcpu->arch.xen.timer_pending, 0);
122
}
123
}
124
125
static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
126
{
127
struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
128
arch.xen.timer);
129
struct kvm_xen_evtchn e;
130
int rc;
131
132
if (atomic_read(&vcpu->arch.xen.timer_pending))
133
return HRTIMER_NORESTART;
134
135
e.vcpu_id = vcpu->vcpu_id;
136
e.vcpu_idx = vcpu->vcpu_idx;
137
e.port = vcpu->arch.xen.timer_virq;
138
e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
139
140
rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
141
if (rc != -EWOULDBLOCK) {
142
vcpu->arch.xen.timer_expires = 0;
143
return HRTIMER_NORESTART;
144
}
145
146
atomic_inc(&vcpu->arch.xen.timer_pending);
147
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
148
kvm_vcpu_kick(vcpu);
149
150
return HRTIMER_NORESTART;
151
}
152
153
static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
154
struct pvclock_vcpu_time_info *hv_clock,
155
struct gfn_to_pfn_cache *gpc,
156
unsigned int offset)
157
{
158
unsigned long flags;
159
int r;
160
161
read_lock_irqsave(&gpc->lock, flags);
162
while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
163
read_unlock_irqrestore(&gpc->lock, flags);
164
165
r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
166
if (r)
167
return r;
168
169
read_lock_irqsave(&gpc->lock, flags);
170
}
171
172
memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
173
read_unlock_irqrestore(&gpc->lock, flags);
174
175
/*
176
* Sanity check TSC shift+multiplier to verify the guest's view of time
177
* is more or less consistent.
178
*/
179
if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift ||
180
hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul)
181
return -EINVAL;
182
183
return 0;
184
}
185
186
static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
187
bool linux_wa)
188
{
189
struct kvm_vcpu_xen *xen = &vcpu->arch.xen;
190
int64_t kernel_now, delta;
191
uint64_t guest_now;
192
int r = -EOPNOTSUPP;
193
194
/*
195
* The guest provides the requested timeout in absolute nanoseconds
196
* of the KVM clock — as *it* sees it, based on the scaled TSC and
197
* the pvclock information provided by KVM.
198
*
199
* The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
200
* so use CLOCK_MONOTONIC. In the timescales covered by timers, the
201
* difference won't matter much as there is no cumulative effect.
202
*
203
* Calculate the time for some arbitrary point in time around "now"
204
* in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
205
* delta between the kvmclock "now" value and the guest's requested
206
* timeout, apply the "Linux workaround" described below, and add
207
* the resulting delta to the CLOCK_MONOTONIC "now" value, to get
208
* the absolute CLOCK_MONOTONIC time at which the timer should
209
* fire.
210
*/
211
do {
212
struct pvclock_vcpu_time_info hv_clock;
213
uint64_t host_tsc, guest_tsc;
214
215
if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
216
!vcpu->kvm->arch.use_master_clock)
217
break;
218
219
/*
220
* If both Xen PV clocks are active, arbitrarily try to use the
221
* compat clock first, but also try to use the non-compat clock
222
* if the compat clock is unusable. The two PV clocks hold the
223
* same information, but it's possible one (or both) is stale
224
* and/or currently unreachable.
225
*/
226
if (xen->vcpu_info_cache.active)
227
r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache,
228
offsetof(struct compat_vcpu_info, time));
229
if (r && xen->vcpu_time_info_cache.active)
230
r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0);
231
if (r)
232
break;
233
234
if (!IS_ENABLED(CONFIG_64BIT) ||
235
!kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
236
/*
237
* Don't fall back to get_kvmclock_ns() because it's
238
* broken; it has a systemic error in its results
239
* because it scales directly from host TSC to
240
* nanoseconds, and doesn't scale first to guest TSC
241
* and *then* to nanoseconds as the guest does.
242
*
243
* There is a small error introduced here because time
244
* continues to elapse between the ktime_get() and the
245
* subsequent rdtsc(). But not the systemic drift due
246
* to get_kvmclock_ns().
247
*/
248
kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
249
host_tsc = rdtsc();
250
}
251
252
/* Calculate the guest kvmclock as the guest would do it. */
253
guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
254
guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc);
255
} while (0);
256
257
if (r) {
258
/*
259
* Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
260
*
261
* Also if the guest PV clock hasn't been set up yet, as is
262
* likely to be the case during migration when the vCPU has
263
* not been run yet. It would be possible to calculate the
264
* scaling factors properly in that case but there's not much
265
* point in doing so. The get_kvmclock_ns() drift accumulates
266
* over time, so it's OK to use it at startup. Besides, on
267
* migration there's going to be a little bit of skew in the
268
* precise moment at which timers fire anyway. Often they'll
269
* be in the "past" by the time the VM is running again after
270
* migration.
271
*/
272
guest_now = get_kvmclock_ns(vcpu->kvm);
273
kernel_now = ktime_get();
274
}
275
276
delta = guest_abs - guest_now;
277
278
/*
279
* Xen has a 'Linux workaround' in do_set_timer_op() which checks for
280
* negative absolute timeout values (caused by integer overflow), and
281
* for values about 13 days in the future (2^50ns) which would be
282
* caused by jiffies overflow. For those cases, Xen sets the timeout
283
* 100ms in the future (not *too* soon, since if a guest really did
284
* set a long timeout on purpose we don't want to keep churning CPU
285
* time by waking it up). Emulate Xen's workaround when starting the
286
* timer in response to __HYPERVISOR_set_timer_op.
287
*/
288
if (linux_wa &&
289
unlikely((int64_t)guest_abs < 0 ||
290
(delta > 0 && (uint32_t) (delta >> 50) != 0))) {
291
delta = 100 * NSEC_PER_MSEC;
292
guest_abs = guest_now + delta;
293
}
294
295
/*
296
* Avoid races with the old timer firing. Checking timer_expires
297
* to avoid calling hrtimer_cancel() will only have false positives
298
* so is fine.
299
*/
300
if (vcpu->arch.xen.timer_expires)
301
hrtimer_cancel(&vcpu->arch.xen.timer);
302
303
atomic_set(&vcpu->arch.xen.timer_pending, 0);
304
vcpu->arch.xen.timer_expires = guest_abs;
305
306
if (delta <= 0)
307
xen_timer_callback(&vcpu->arch.xen.timer);
308
else
309
hrtimer_start(&vcpu->arch.xen.timer,
310
ktime_add_ns(kernel_now, delta),
311
HRTIMER_MODE_ABS_HARD);
312
}
313
314
static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
315
{
316
hrtimer_cancel(&vcpu->arch.xen.timer);
317
vcpu->arch.xen.timer_expires = 0;
318
atomic_set(&vcpu->arch.xen.timer_pending, 0);
319
}
320
321
static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
322
{
323
struct kvm_vcpu_xen *vx = &v->arch.xen;
324
struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
325
struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
326
size_t user_len, user_len1, user_len2;
327
struct vcpu_runstate_info rs;
328
unsigned long flags;
329
size_t times_ofs;
330
uint8_t *update_bit = NULL;
331
uint64_t entry_time;
332
uint64_t *rs_times;
333
int *rs_state;
334
335
/*
336
* The only difference between 32-bit and 64-bit versions of the
337
* runstate struct is the alignment of uint64_t in 32-bit, which
338
* means that the 64-bit version has an additional 4 bytes of
339
* padding after the first field 'state'. Let's be really really
340
* paranoid about that, and matching it with our internal data
341
* structures that we memcpy into it...
342
*/
343
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
344
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
345
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
346
#ifdef CONFIG_X86_64
347
/*
348
* The 64-bit structure has 4 bytes of padding before 'state_entry_time'
349
* so each subsequent field is shifted by 4, and it's 4 bytes longer.
350
*/
351
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
352
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
353
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
354
offsetof(struct compat_vcpu_runstate_info, time) + 4);
355
BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
356
#endif
357
/*
358
* The state field is in the same place at the start of both structs,
359
* and is the same size (int) as vx->current_runstate.
360
*/
361
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
362
offsetof(struct compat_vcpu_runstate_info, state));
363
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
364
sizeof(vx->current_runstate));
365
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
366
sizeof(vx->current_runstate));
367
368
/*
369
* The state_entry_time field is 64 bits in both versions, and the
370
* XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
371
* is little-endian means that it's in the last *byte* of the word.
372
* That detail is important later.
373
*/
374
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
375
sizeof(uint64_t));
376
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
377
sizeof(uint64_t));
378
BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
379
380
/*
381
* The time array is four 64-bit quantities in both versions, matching
382
* the vx->runstate_times and immediately following state_entry_time.
383
*/
384
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
385
offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
386
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
387
offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
388
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
389
sizeof_field(struct compat_vcpu_runstate_info, time));
390
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
391
sizeof(vx->runstate_times));
392
393
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
394
user_len = sizeof(struct vcpu_runstate_info);
395
times_ofs = offsetof(struct vcpu_runstate_info,
396
state_entry_time);
397
} else {
398
user_len = sizeof(struct compat_vcpu_runstate_info);
399
times_ofs = offsetof(struct compat_vcpu_runstate_info,
400
state_entry_time);
401
}
402
403
/*
404
* There are basically no alignment constraints. The guest can set it
405
* up so it crosses from one page to the next, and at arbitrary byte
406
* alignment (and the 32-bit ABI doesn't align the 64-bit integers
407
* anyway, even if the overall struct had been 64-bit aligned).
408
*/
409
if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
410
user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
411
user_len2 = user_len - user_len1;
412
} else {
413
user_len1 = user_len;
414
user_len2 = 0;
415
}
416
BUG_ON(user_len1 + user_len2 != user_len);
417
418
retry:
419
/*
420
* Attempt to obtain the GPC lock on *both* (if there are two)
421
* gfn_to_pfn caches that cover the region.
422
*/
423
if (atomic) {
424
local_irq_save(flags);
425
if (!read_trylock(&gpc1->lock)) {
426
local_irq_restore(flags);
427
return;
428
}
429
} else {
430
read_lock_irqsave(&gpc1->lock, flags);
431
}
432
while (!kvm_gpc_check(gpc1, user_len1)) {
433
read_unlock_irqrestore(&gpc1->lock, flags);
434
435
/* When invoked from kvm_sched_out() we cannot sleep */
436
if (atomic)
437
return;
438
439
if (kvm_gpc_refresh(gpc1, user_len1))
440
return;
441
442
read_lock_irqsave(&gpc1->lock, flags);
443
}
444
445
if (likely(!user_len2)) {
446
/*
447
* Set up three pointers directly to the runstate_info
448
* struct in the guest (via the GPC).
449
*
450
* • @rs_state → state field
451
* • @rs_times → state_entry_time field.
452
* • @update_bit → last byte of state_entry_time, which
453
* contains the XEN_RUNSTATE_UPDATE bit.
454
*/
455
rs_state = gpc1->khva;
456
rs_times = gpc1->khva + times_ofs;
457
if (v->kvm->arch.xen.runstate_update_flag)
458
update_bit = ((void *)(&rs_times[1])) - 1;
459
} else {
460
/*
461
* The guest's runstate_info is split across two pages and we
462
* need to hold and validate both GPCs simultaneously. We can
463
* declare a lock ordering GPC1 > GPC2 because nothing else
464
* takes them more than one at a time. Set a subclass on the
465
* gpc1 lock to make lockdep shut up about it.
466
*/
467
lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
468
if (atomic) {
469
if (!read_trylock(&gpc2->lock)) {
470
read_unlock_irqrestore(&gpc1->lock, flags);
471
return;
472
}
473
} else {
474
read_lock(&gpc2->lock);
475
}
476
477
if (!kvm_gpc_check(gpc2, user_len2)) {
478
read_unlock(&gpc2->lock);
479
read_unlock_irqrestore(&gpc1->lock, flags);
480
481
/* When invoked from kvm_sched_out() we cannot sleep */
482
if (atomic)
483
return;
484
485
/*
486
* Use kvm_gpc_activate() here because if the runstate
487
* area was configured in 32-bit mode and only extends
488
* to the second page now because the guest changed to
489
* 64-bit mode, the second GPC won't have been set up.
490
*/
491
if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
492
user_len2))
493
return;
494
495
/*
496
* We dropped the lock on GPC1 so we have to go all the
497
* way back and revalidate that too.
498
*/
499
goto retry;
500
}
501
502
/*
503
* In this case, the runstate_info struct will be assembled on
504
* the kernel stack (compat or not as appropriate) and will
505
* be copied to GPC1/GPC2 with a dual memcpy. Set up the three
506
* rs pointers accordingly.
507
*/
508
rs_times = &rs.state_entry_time;
509
510
/*
511
* The rs_state pointer points to the start of what we'll
512
* copy to the guest, which in the case of a compat guest
513
* is the 32-bit field that the compiler thinks is padding.
514
*/
515
rs_state = ((void *)rs_times) - times_ofs;
516
517
/*
518
* The update_bit is still directly in the guest memory,
519
* via one GPC or the other.
520
*/
521
if (v->kvm->arch.xen.runstate_update_flag) {
522
if (user_len1 >= times_ofs + sizeof(uint64_t))
523
update_bit = gpc1->khva + times_ofs +
524
sizeof(uint64_t) - 1;
525
else
526
update_bit = gpc2->khva + times_ofs +
527
sizeof(uint64_t) - 1 - user_len1;
528
}
529
530
#ifdef CONFIG_X86_64
531
/*
532
* Don't leak kernel memory through the padding in the 64-bit
533
* version of the struct.
534
*/
535
memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
536
#endif
537
}
538
539
/*
540
* First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
541
* state_entry_time field, directly in the guest. We need to set
542
* that (and write-barrier) before writing to the rest of the
543
* structure, and clear it last. Just as Xen does, we address the
544
* single *byte* in which it resides because it might be in a
545
* different cache line to the rest of the 64-bit word, due to
546
* the (lack of) alignment constraints.
547
*/
548
entry_time = vx->runstate_entry_time;
549
if (update_bit) {
550
entry_time |= XEN_RUNSTATE_UPDATE;
551
*update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
552
smp_wmb();
553
}
554
555
/*
556
* Now assemble the actual structure, either on our kernel stack
557
* or directly in the guest according to how the rs_state and
558
* rs_times pointers were set up above.
559
*/
560
*rs_state = vx->current_runstate;
561
rs_times[0] = entry_time;
562
memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
563
564
/* For the split case, we have to then copy it to the guest. */
565
if (user_len2) {
566
memcpy(gpc1->khva, rs_state, user_len1);
567
memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
568
}
569
smp_wmb();
570
571
/* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
572
if (update_bit) {
573
entry_time &= ~XEN_RUNSTATE_UPDATE;
574
*update_bit = entry_time >> 56;
575
smp_wmb();
576
}
577
578
if (user_len2) {
579
kvm_gpc_mark_dirty_in_slot(gpc2);
580
read_unlock(&gpc2->lock);
581
}
582
583
kvm_gpc_mark_dirty_in_slot(gpc1);
584
read_unlock_irqrestore(&gpc1->lock, flags);
585
}
586
587
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
588
{
589
struct kvm_vcpu_xen *vx = &v->arch.xen;
590
u64 now = get_kvmclock_ns(v->kvm);
591
u64 delta_ns = now - vx->runstate_entry_time;
592
u64 run_delay = current->sched_info.run_delay;
593
594
if (unlikely(!vx->runstate_entry_time))
595
vx->current_runstate = RUNSTATE_offline;
596
597
/*
598
* Time waiting for the scheduler isn't "stolen" if the
599
* vCPU wasn't running anyway.
600
*/
601
if (vx->current_runstate == RUNSTATE_running) {
602
u64 steal_ns = run_delay - vx->last_steal;
603
604
delta_ns -= steal_ns;
605
606
vx->runstate_times[RUNSTATE_runnable] += steal_ns;
607
}
608
vx->last_steal = run_delay;
609
610
vx->runstate_times[vx->current_runstate] += delta_ns;
611
vx->current_runstate = state;
612
vx->runstate_entry_time = now;
613
614
if (vx->runstate_cache.active)
615
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
616
}
617
618
void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
619
{
620
struct kvm_lapic_irq irq = { };
621
622
irq.dest_id = v->vcpu_id;
623
irq.vector = v->arch.xen.upcall_vector;
624
irq.dest_mode = APIC_DEST_PHYSICAL;
625
irq.shorthand = APIC_DEST_NOSHORT;
626
irq.delivery_mode = APIC_DM_FIXED;
627
irq.level = 1;
628
629
kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
630
}
631
632
/*
633
* On event channel delivery, the vcpu_info may not have been accessible.
634
* In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
635
* need to be marked into the vcpu_info (and evtchn_upcall_pending set).
636
* Do so now that we can sleep in the context of the vCPU to bring the
637
* page in, and refresh the pfn cache for it.
638
*/
639
void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
640
{
641
unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
642
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
643
unsigned long flags;
644
645
if (!evtchn_pending_sel)
646
return;
647
648
/*
649
* Yes, this is an open-coded loop. But that's just what put_user()
650
* does anyway. Page it in and retry the instruction. We're just a
651
* little more honest about it.
652
*/
653
read_lock_irqsave(&gpc->lock, flags);
654
while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
655
read_unlock_irqrestore(&gpc->lock, flags);
656
657
if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
658
return;
659
660
read_lock_irqsave(&gpc->lock, flags);
661
}
662
663
/* Now gpc->khva is a valid kernel address for the vcpu_info */
664
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
665
struct vcpu_info *vi = gpc->khva;
666
667
asm volatile(LOCK_PREFIX "orq %0, %1\n"
668
"notq %0\n"
669
LOCK_PREFIX "andq %0, %2\n"
670
: "=r" (evtchn_pending_sel),
671
"+m" (vi->evtchn_pending_sel),
672
"+m" (v->arch.xen.evtchn_pending_sel)
673
: "0" (evtchn_pending_sel));
674
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
675
} else {
676
u32 evtchn_pending_sel32 = evtchn_pending_sel;
677
struct compat_vcpu_info *vi = gpc->khva;
678
679
asm volatile(LOCK_PREFIX "orl %0, %1\n"
680
"notl %0\n"
681
LOCK_PREFIX "andl %0, %2\n"
682
: "=r" (evtchn_pending_sel32),
683
"+m" (vi->evtchn_pending_sel),
684
"+m" (v->arch.xen.evtchn_pending_sel)
685
: "0" (evtchn_pending_sel32));
686
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
687
}
688
689
kvm_gpc_mark_dirty_in_slot(gpc);
690
read_unlock_irqrestore(&gpc->lock, flags);
691
692
/* For the per-vCPU lapic vector, deliver it as MSI. */
693
if (v->arch.xen.upcall_vector)
694
kvm_xen_inject_vcpu_vector(v);
695
}
696
697
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
698
{
699
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
700
unsigned long flags;
701
u8 rc = 0;
702
703
/*
704
* If the global upcall vector (HVMIRQ_callback_vector) is set and
705
* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
706
*/
707
708
/* No need for compat handling here */
709
BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
710
offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
711
BUILD_BUG_ON(sizeof(rc) !=
712
sizeof_field(struct vcpu_info, evtchn_upcall_pending));
713
BUILD_BUG_ON(sizeof(rc) !=
714
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
715
716
read_lock_irqsave(&gpc->lock, flags);
717
while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
718
read_unlock_irqrestore(&gpc->lock, flags);
719
720
/*
721
* This function gets called from kvm_vcpu_block() after setting the
722
* task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
723
* from a HLT. So we really mustn't sleep. If the page ended up absent
724
* at that point, just return 1 in order to trigger an immediate wake,
725
* and we'll end up getting called again from a context where we *can*
726
* fault in the page and wait for it.
727
*/
728
if (in_atomic() || !task_is_running(current))
729
return 1;
730
731
if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
732
/*
733
* If this failed, userspace has screwed up the
734
* vcpu_info mapping. No interrupts for you.
735
*/
736
return 0;
737
}
738
read_lock_irqsave(&gpc->lock, flags);
739
}
740
741
rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
742
read_unlock_irqrestore(&gpc->lock, flags);
743
return rc;
744
}
745
746
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
747
{
748
int r = -ENOENT;
749
750
751
switch (data->type) {
752
case KVM_XEN_ATTR_TYPE_LONG_MODE:
753
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
754
r = -EINVAL;
755
} else {
756
mutex_lock(&kvm->arch.xen.xen_lock);
757
kvm->arch.xen.long_mode = !!data->u.long_mode;
758
759
/*
760
* Re-initialize shared_info to put the wallclock in the
761
* correct place. Whilst it's not necessary to do this
762
* unless the mode is actually changed, it does no harm
763
* to make the call anyway.
764
*/
765
r = kvm->arch.xen.shinfo_cache.active ?
766
kvm_xen_shared_info_init(kvm) : 0;
767
mutex_unlock(&kvm->arch.xen.xen_lock);
768
}
769
break;
770
771
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
772
case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
773
int idx;
774
775
mutex_lock(&kvm->arch.xen.xen_lock);
776
777
idx = srcu_read_lock(&kvm->srcu);
778
779
if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
780
gfn_t gfn = data->u.shared_info.gfn;
781
782
if (gfn == KVM_XEN_INVALID_GFN) {
783
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
784
r = 0;
785
} else {
786
r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
787
gfn_to_gpa(gfn), PAGE_SIZE);
788
}
789
} else {
790
void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
791
792
if (!PAGE_ALIGNED(hva)) {
793
r = -EINVAL;
794
} else if (!hva) {
795
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
796
r = 0;
797
} else {
798
r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
799
(unsigned long)hva, PAGE_SIZE);
800
}
801
}
802
803
srcu_read_unlock(&kvm->srcu, idx);
804
805
if (!r && kvm->arch.xen.shinfo_cache.active)
806
r = kvm_xen_shared_info_init(kvm);
807
808
mutex_unlock(&kvm->arch.xen.xen_lock);
809
break;
810
}
811
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
812
if (data->u.vector && data->u.vector < 0x10)
813
r = -EINVAL;
814
else {
815
mutex_lock(&kvm->arch.xen.xen_lock);
816
kvm->arch.xen.upcall_vector = data->u.vector;
817
mutex_unlock(&kvm->arch.xen.xen_lock);
818
r = 0;
819
}
820
break;
821
822
case KVM_XEN_ATTR_TYPE_EVTCHN:
823
r = kvm_xen_setattr_evtchn(kvm, data);
824
break;
825
826
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
827
mutex_lock(&kvm->arch.xen.xen_lock);
828
kvm->arch.xen.xen_version = data->u.xen_version;
829
mutex_unlock(&kvm->arch.xen.xen_lock);
830
r = 0;
831
break;
832
833
case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
834
if (!sched_info_on()) {
835
r = -EOPNOTSUPP;
836
break;
837
}
838
mutex_lock(&kvm->arch.xen.xen_lock);
839
kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
840
mutex_unlock(&kvm->arch.xen.xen_lock);
841
r = 0;
842
break;
843
844
default:
845
break;
846
}
847
848
return r;
849
}
850
851
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
852
{
853
int r = -ENOENT;
854
855
mutex_lock(&kvm->arch.xen.xen_lock);
856
857
switch (data->type) {
858
case KVM_XEN_ATTR_TYPE_LONG_MODE:
859
data->u.long_mode = kvm->arch.xen.long_mode;
860
r = 0;
861
break;
862
863
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
864
if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
865
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
866
else
867
data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
868
r = 0;
869
break;
870
871
case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
872
if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
873
data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
874
else
875
data->u.shared_info.hva = 0;
876
r = 0;
877
break;
878
879
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
880
data->u.vector = kvm->arch.xen.upcall_vector;
881
r = 0;
882
break;
883
884
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
885
data->u.xen_version = kvm->arch.xen.xen_version;
886
r = 0;
887
break;
888
889
case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
890
if (!sched_info_on()) {
891
r = -EOPNOTSUPP;
892
break;
893
}
894
data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
895
r = 0;
896
break;
897
898
default:
899
break;
900
}
901
902
mutex_unlock(&kvm->arch.xen.xen_lock);
903
return r;
904
}
905
906
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
907
{
908
int idx, r = -ENOENT;
909
910
mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
911
idx = srcu_read_lock(&vcpu->kvm->srcu);
912
913
switch (data->type) {
914
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
915
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
916
/* No compat necessary here. */
917
BUILD_BUG_ON(sizeof(struct vcpu_info) !=
918
sizeof(struct compat_vcpu_info));
919
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
920
offsetof(struct compat_vcpu_info, time));
921
922
if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
923
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
924
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
925
r = 0;
926
break;
927
}
928
929
r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
930
data->u.gpa, sizeof(struct vcpu_info));
931
} else {
932
if (data->u.hva == 0) {
933
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
934
r = 0;
935
break;
936
}
937
938
r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
939
data->u.hva, sizeof(struct vcpu_info));
940
}
941
942
if (!r)
943
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
944
945
break;
946
947
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
948
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
949
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
950
r = 0;
951
break;
952
}
953
954
r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
955
data->u.gpa,
956
sizeof(struct pvclock_vcpu_time_info));
957
if (!r)
958
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
959
break;
960
961
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
962
size_t sz, sz1, sz2;
963
964
if (!sched_info_on()) {
965
r = -EOPNOTSUPP;
966
break;
967
}
968
if (data->u.gpa == KVM_XEN_INVALID_GPA) {
969
r = 0;
970
deactivate_out:
971
kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
972
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
973
break;
974
}
975
976
/*
977
* If the guest switches to 64-bit mode after setting the runstate
978
* address, that's actually OK. kvm_xen_update_runstate_guest()
979
* will cope.
980
*/
981
if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
982
sz = sizeof(struct vcpu_runstate_info);
983
else
984
sz = sizeof(struct compat_vcpu_runstate_info);
985
986
/* How much fits in the (first) page? */
987
sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
988
r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
989
data->u.gpa, sz1);
990
if (r)
991
goto deactivate_out;
992
993
/* Either map the second page, or deactivate the second GPC */
994
if (sz1 >= sz) {
995
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
996
} else {
997
sz2 = sz - sz1;
998
BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
999
r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
1000
data->u.gpa + sz1, sz2);
1001
if (r)
1002
goto deactivate_out;
1003
}
1004
1005
kvm_xen_update_runstate_guest(vcpu, false);
1006
break;
1007
}
1008
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
1009
if (!sched_info_on()) {
1010
r = -EOPNOTSUPP;
1011
break;
1012
}
1013
if (data->u.runstate.state > RUNSTATE_offline) {
1014
r = -EINVAL;
1015
break;
1016
}
1017
1018
kvm_xen_update_runstate(vcpu, data->u.runstate.state);
1019
r = 0;
1020
break;
1021
1022
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
1023
if (!sched_info_on()) {
1024
r = -EOPNOTSUPP;
1025
break;
1026
}
1027
if (data->u.runstate.state > RUNSTATE_offline) {
1028
r = -EINVAL;
1029
break;
1030
}
1031
if (data->u.runstate.state_entry_time !=
1032
(data->u.runstate.time_running +
1033
data->u.runstate.time_runnable +
1034
data->u.runstate.time_blocked +
1035
data->u.runstate.time_offline)) {
1036
r = -EINVAL;
1037
break;
1038
}
1039
if (get_kvmclock_ns(vcpu->kvm) <
1040
data->u.runstate.state_entry_time) {
1041
r = -EINVAL;
1042
break;
1043
}
1044
1045
vcpu->arch.xen.current_runstate = data->u.runstate.state;
1046
vcpu->arch.xen.runstate_entry_time =
1047
data->u.runstate.state_entry_time;
1048
vcpu->arch.xen.runstate_times[RUNSTATE_running] =
1049
data->u.runstate.time_running;
1050
vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
1051
data->u.runstate.time_runnable;
1052
vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
1053
data->u.runstate.time_blocked;
1054
vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
1055
data->u.runstate.time_offline;
1056
vcpu->arch.xen.last_steal = current->sched_info.run_delay;
1057
r = 0;
1058
break;
1059
1060
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
1061
if (!sched_info_on()) {
1062
r = -EOPNOTSUPP;
1063
break;
1064
}
1065
if (data->u.runstate.state > RUNSTATE_offline &&
1066
data->u.runstate.state != (u64)-1) {
1067
r = -EINVAL;
1068
break;
1069
}
1070
/* The adjustment must add up */
1071
if (data->u.runstate.state_entry_time !=
1072
(data->u.runstate.time_running +
1073
data->u.runstate.time_runnable +
1074
data->u.runstate.time_blocked +
1075
data->u.runstate.time_offline)) {
1076
r = -EINVAL;
1077
break;
1078
}
1079
1080
if (get_kvmclock_ns(vcpu->kvm) <
1081
(vcpu->arch.xen.runstate_entry_time +
1082
data->u.runstate.state_entry_time)) {
1083
r = -EINVAL;
1084
break;
1085
}
1086
1087
vcpu->arch.xen.runstate_entry_time +=
1088
data->u.runstate.state_entry_time;
1089
vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
1090
data->u.runstate.time_running;
1091
vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
1092
data->u.runstate.time_runnable;
1093
vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
1094
data->u.runstate.time_blocked;
1095
vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
1096
data->u.runstate.time_offline;
1097
1098
if (data->u.runstate.state <= RUNSTATE_offline)
1099
kvm_xen_update_runstate(vcpu, data->u.runstate.state);
1100
else if (vcpu->arch.xen.runstate_cache.active)
1101
kvm_xen_update_runstate_guest(vcpu, false);
1102
r = 0;
1103
break;
1104
1105
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
1106
if (data->u.vcpu_id >= KVM_MAX_VCPUS)
1107
r = -EINVAL;
1108
else {
1109
vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
1110
r = 0;
1111
}
1112
break;
1113
1114
case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
1115
if (data->u.timer.port &&
1116
data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
1117
r = -EINVAL;
1118
break;
1119
}
1120
1121
/* Stop the timer (if it's running) before changing the vector */
1122
kvm_xen_stop_timer(vcpu);
1123
vcpu->arch.xen.timer_virq = data->u.timer.port;
1124
1125
/* Start the timer if the new value has a valid vector+expiry. */
1126
if (data->u.timer.port && data->u.timer.expires_ns)
1127
kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
1128
1129
r = 0;
1130
break;
1131
1132
case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
1133
if (data->u.vector && data->u.vector < 0x10)
1134
r = -EINVAL;
1135
else {
1136
vcpu->arch.xen.upcall_vector = data->u.vector;
1137
r = 0;
1138
}
1139
break;
1140
1141
default:
1142
break;
1143
}
1144
1145
srcu_read_unlock(&vcpu->kvm->srcu, idx);
1146
mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
1147
return r;
1148
}
1149
1150
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
1151
{
1152
int r = -ENOENT;
1153
1154
mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
1155
1156
switch (data->type) {
1157
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
1158
if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
1159
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
1160
else
1161
data->u.gpa = KVM_XEN_INVALID_GPA;
1162
r = 0;
1163
break;
1164
1165
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
1166
if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
1167
data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
1168
else
1169
data->u.hva = 0;
1170
r = 0;
1171
break;
1172
1173
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
1174
if (vcpu->arch.xen.vcpu_time_info_cache.active)
1175
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
1176
else
1177
data->u.gpa = KVM_XEN_INVALID_GPA;
1178
r = 0;
1179
break;
1180
1181
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
1182
if (!sched_info_on()) {
1183
r = -EOPNOTSUPP;
1184
break;
1185
}
1186
if (vcpu->arch.xen.runstate_cache.active) {
1187
data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
1188
r = 0;
1189
}
1190
break;
1191
1192
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
1193
if (!sched_info_on()) {
1194
r = -EOPNOTSUPP;
1195
break;
1196
}
1197
data->u.runstate.state = vcpu->arch.xen.current_runstate;
1198
r = 0;
1199
break;
1200
1201
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
1202
if (!sched_info_on()) {
1203
r = -EOPNOTSUPP;
1204
break;
1205
}
1206
data->u.runstate.state = vcpu->arch.xen.current_runstate;
1207
data->u.runstate.state_entry_time =
1208
vcpu->arch.xen.runstate_entry_time;
1209
data->u.runstate.time_running =
1210
vcpu->arch.xen.runstate_times[RUNSTATE_running];
1211
data->u.runstate.time_runnable =
1212
vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
1213
data->u.runstate.time_blocked =
1214
vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
1215
data->u.runstate.time_offline =
1216
vcpu->arch.xen.runstate_times[RUNSTATE_offline];
1217
r = 0;
1218
break;
1219
1220
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
1221
r = -EINVAL;
1222
break;
1223
1224
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
1225
data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
1226
r = 0;
1227
break;
1228
1229
case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
1230
/*
1231
* Ensure a consistent snapshot of state is captured, with a
1232
* timer either being pending, or the event channel delivered
1233
* to the corresponding bit in the shared_info. Not still
1234
* lurking in the timer_pending flag for deferred delivery.
1235
* Purely as an optimisation, if the timer_expires field is
1236
* zero, that means the timer isn't active (or even in the
1237
* timer_pending flag) and there is no need to cancel it.
1238
*/
1239
if (vcpu->arch.xen.timer_expires) {
1240
hrtimer_cancel(&vcpu->arch.xen.timer);
1241
kvm_xen_inject_timer_irqs(vcpu);
1242
}
1243
1244
data->u.timer.port = vcpu->arch.xen.timer_virq;
1245
data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
1246
data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
1247
1248
/*
1249
* The hrtimer may trigger and raise the IRQ immediately,
1250
* while the returned state causes it to be set up and
1251
* raised again on the destination system after migration.
1252
* That's fine, as the guest won't even have had a chance
1253
* to run and handle the interrupt. Asserting an already
1254
* pending event channel is idempotent.
1255
*/
1256
if (vcpu->arch.xen.timer_expires)
1257
hrtimer_start_expires(&vcpu->arch.xen.timer,
1258
HRTIMER_MODE_ABS_HARD);
1259
1260
r = 0;
1261
break;
1262
1263
case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
1264
data->u.vector = vcpu->arch.xen.upcall_vector;
1265
r = 0;
1266
break;
1267
1268
default:
1269
break;
1270
}
1271
1272
mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
1273
return r;
1274
}
1275
1276
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
1277
{
1278
struct kvm *kvm = vcpu->kvm;
1279
u32 page_num = data & ~PAGE_MASK;
1280
u64 page_addr = data & PAGE_MASK;
1281
bool lm = is_long_mode(vcpu);
1282
int r = 0;
1283
1284
mutex_lock(&kvm->arch.xen.xen_lock);
1285
if (kvm->arch.xen.long_mode != lm) {
1286
kvm->arch.xen.long_mode = lm;
1287
1288
/*
1289
* Re-initialize shared_info to put the wallclock in the
1290
* correct place.
1291
*/
1292
if (kvm->arch.xen.shinfo_cache.active &&
1293
kvm_xen_shared_info_init(kvm))
1294
r = 1;
1295
}
1296
mutex_unlock(&kvm->arch.xen.xen_lock);
1297
1298
if (r)
1299
return r;
1300
1301
/*
1302
* If Xen hypercall intercept is enabled, fill the hypercall
1303
* page with VMCALL/VMMCALL instructions since that's what
1304
* we catch. Else the VMM has provided the hypercall pages
1305
* with instructions of its own choosing, so use those.
1306
*/
1307
if (kvm_xen_hypercall_enabled(kvm)) {
1308
u8 instructions[32];
1309
int i;
1310
1311
if (page_num)
1312
return 1;
1313
1314
/* mov imm32, %eax */
1315
instructions[0] = 0xb8;
1316
1317
/* vmcall / vmmcall */
1318
kvm_x86_call(patch_hypercall)(vcpu, instructions + 5);
1319
1320
/* ret */
1321
instructions[8] = 0xc3;
1322
1323
/* int3 to pad */
1324
memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
1325
1326
for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
1327
*(u32 *)&instructions[1] = i;
1328
if (kvm_vcpu_write_guest(vcpu,
1329
page_addr + (i * sizeof(instructions)),
1330
instructions, sizeof(instructions)))
1331
return 1;
1332
}
1333
} else {
1334
/*
1335
* Note, truncation is a non-issue as 'lm' is guaranteed to be
1336
* false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
1337
*/
1338
hva_t blob_addr = lm ? kvm->arch.xen.hvm_config.blob_addr_64
1339
: kvm->arch.xen.hvm_config.blob_addr_32;
1340
u8 blob_size = lm ? kvm->arch.xen.hvm_config.blob_size_64
1341
: kvm->arch.xen.hvm_config.blob_size_32;
1342
u8 *page;
1343
int ret;
1344
1345
if (page_num >= blob_size)
1346
return 1;
1347
1348
blob_addr += page_num * PAGE_SIZE;
1349
1350
page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
1351
if (IS_ERR(page))
1352
return PTR_ERR(page);
1353
1354
ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
1355
kfree(page);
1356
if (ret)
1357
return 1;
1358
}
1359
return 0;
1360
}
1361
1362
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
1363
{
1364
/* Only some feature flags need to be *enabled* by userspace */
1365
u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
1366
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
1367
KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
1368
u32 old_flags;
1369
1370
if (xhc->flags & ~permitted_flags)
1371
return -EINVAL;
1372
1373
/*
1374
* With hypercall interception the kernel generates its own
1375
* hypercall page so it must not be provided.
1376
*/
1377
if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
1378
(xhc->blob_addr_32 || xhc->blob_addr_64 ||
1379
xhc->blob_size_32 || xhc->blob_size_64))
1380
return -EINVAL;
1381
1382
/*
1383
* Restrict the MSR to the range that is unofficially reserved for
1384
* synthetic, virtualization-defined MSRs, e.g. to prevent confusing
1385
* KVM by colliding with a real MSR that requires special handling.
1386
*/
1387
if (xhc->msr &&
1388
(xhc->msr < KVM_XEN_MSR_MIN_INDEX || xhc->msr > KVM_XEN_MSR_MAX_INDEX))
1389
return -EINVAL;
1390
1391
mutex_lock(&kvm->arch.xen.xen_lock);
1392
1393
if (xhc->msr && !kvm->arch.xen.hvm_config.msr)
1394
static_branch_inc(&kvm_xen_enabled.key);
1395
else if (!xhc->msr && kvm->arch.xen.hvm_config.msr)
1396
static_branch_slow_dec_deferred(&kvm_xen_enabled);
1397
1398
old_flags = kvm->arch.xen.hvm_config.flags;
1399
memcpy(&kvm->arch.xen.hvm_config, xhc, sizeof(*xhc));
1400
1401
mutex_unlock(&kvm->arch.xen.xen_lock);
1402
1403
if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
1404
kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
1405
1406
return 0;
1407
}
1408
1409
static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
1410
{
1411
kvm_rax_write(vcpu, result);
1412
return kvm_skip_emulated_instruction(vcpu);
1413
}
1414
1415
static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
1416
{
1417
struct kvm_run *run = vcpu->run;
1418
1419
if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
1420
return 1;
1421
1422
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
1423
}
1424
1425
static inline int max_evtchn_port(struct kvm *kvm)
1426
{
1427
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
1428
return EVTCHN_2L_NR_CHANNELS;
1429
else
1430
return COMPAT_EVTCHN_2L_NR_CHANNELS;
1431
}
1432
1433
static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
1434
evtchn_port_t *ports)
1435
{
1436
struct kvm *kvm = vcpu->kvm;
1437
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1438
unsigned long *pending_bits;
1439
unsigned long flags;
1440
bool ret = true;
1441
int idx, i;
1442
1443
idx = srcu_read_lock(&kvm->srcu);
1444
read_lock_irqsave(&gpc->lock, flags);
1445
if (!kvm_gpc_check(gpc, PAGE_SIZE))
1446
goto out_rcu;
1447
1448
ret = false;
1449
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1450
struct shared_info *shinfo = gpc->khva;
1451
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1452
} else {
1453
struct compat_shared_info *shinfo = gpc->khva;
1454
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1455
}
1456
1457
for (i = 0; i < nr_ports; i++) {
1458
if (test_bit(ports[i], pending_bits)) {
1459
ret = true;
1460
break;
1461
}
1462
}
1463
1464
out_rcu:
1465
read_unlock_irqrestore(&gpc->lock, flags);
1466
srcu_read_unlock(&kvm->srcu, idx);
1467
1468
return ret;
1469
}
1470
1471
static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
1472
u64 param, u64 *r)
1473
{
1474
struct sched_poll sched_poll;
1475
evtchn_port_t port, *ports;
1476
struct x86_exception e;
1477
int i;
1478
1479
if (!lapic_in_kernel(vcpu) ||
1480
!(vcpu->kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
1481
return false;
1482
1483
if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
1484
struct compat_sched_poll sp32;
1485
1486
/* Sanity check that the compat struct definition is correct */
1487
BUILD_BUG_ON(sizeof(sp32) != 16);
1488
1489
if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
1490
*r = -EFAULT;
1491
return true;
1492
}
1493
1494
/*
1495
* This is a 32-bit pointer to an array of evtchn_port_t which
1496
* are uint32_t, so once it's converted no further compat
1497
* handling is needed.
1498
*/
1499
sched_poll.ports = (void *)(unsigned long)(sp32.ports);
1500
sched_poll.nr_ports = sp32.nr_ports;
1501
sched_poll.timeout = sp32.timeout;
1502
} else {
1503
if (kvm_read_guest_virt(vcpu, param, &sched_poll,
1504
sizeof(sched_poll), &e)) {
1505
*r = -EFAULT;
1506
return true;
1507
}
1508
}
1509
1510
if (unlikely(sched_poll.nr_ports > 1)) {
1511
/* Xen (unofficially) limits number of pollers to 128 */
1512
if (sched_poll.nr_ports > 128) {
1513
*r = -EINVAL;
1514
return true;
1515
}
1516
1517
ports = kmalloc_array(sched_poll.nr_ports,
1518
sizeof(*ports), GFP_KERNEL);
1519
if (!ports) {
1520
*r = -ENOMEM;
1521
return true;
1522
}
1523
} else
1524
ports = &port;
1525
1526
if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
1527
sched_poll.nr_ports * sizeof(*ports), &e)) {
1528
*r = -EFAULT;
1529
goto out;
1530
}
1531
1532
for (i = 0; i < sched_poll.nr_ports; i++) {
1533
if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
1534
*r = -EINVAL;
1535
goto out;
1536
}
1537
}
1538
1539
if (sched_poll.nr_ports == 1)
1540
vcpu->arch.xen.poll_evtchn = port;
1541
else
1542
vcpu->arch.xen.poll_evtchn = -1;
1543
1544
set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
1545
1546
if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
1547
kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
1548
1549
if (sched_poll.timeout)
1550
mod_timer(&vcpu->arch.xen.poll_timer,
1551
jiffies + nsecs_to_jiffies(sched_poll.timeout));
1552
1553
kvm_vcpu_halt(vcpu);
1554
1555
if (sched_poll.timeout)
1556
timer_delete(&vcpu->arch.xen.poll_timer);
1557
1558
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
1559
}
1560
1561
vcpu->arch.xen.poll_evtchn = 0;
1562
*r = 0;
1563
out:
1564
/* Really, this is only needed in case of timeout */
1565
clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
1566
1567
if (unlikely(sched_poll.nr_ports > 1))
1568
kfree(ports);
1569
return true;
1570
}
1571
1572
static void cancel_evtchn_poll(struct timer_list *t)
1573
{
1574
struct kvm_vcpu *vcpu = timer_container_of(vcpu, t,
1575
arch.xen.poll_timer);
1576
1577
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1578
kvm_vcpu_kick(vcpu);
1579
}
1580
1581
static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
1582
int cmd, u64 param, u64 *r)
1583
{
1584
switch (cmd) {
1585
case SCHEDOP_poll:
1586
if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
1587
return true;
1588
fallthrough;
1589
case SCHEDOP_yield:
1590
kvm_vcpu_on_spin(vcpu, true);
1591
*r = 0;
1592
return true;
1593
default:
1594
break;
1595
}
1596
1597
return false;
1598
}
1599
1600
struct compat_vcpu_set_singleshot_timer {
1601
uint64_t timeout_abs_ns;
1602
uint32_t flags;
1603
} __attribute__((packed));
1604
1605
static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
1606
int vcpu_id, u64 param, u64 *r)
1607
{
1608
struct vcpu_set_singleshot_timer oneshot;
1609
struct x86_exception e;
1610
1611
if (!kvm_xen_timer_enabled(vcpu))
1612
return false;
1613
1614
switch (cmd) {
1615
case VCPUOP_set_singleshot_timer:
1616
if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1617
*r = -EINVAL;
1618
return true;
1619
}
1620
1621
/*
1622
* The only difference for 32-bit compat is the 4 bytes of
1623
* padding after the interesting part of the structure. So
1624
* for a faithful emulation of Xen we have to *try* to copy
1625
* the padding and return -EFAULT if we can't. Otherwise we
1626
* might as well just have copied the 12-byte 32-bit struct.
1627
*/
1628
BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1629
offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1630
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1631
sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1632
BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
1633
offsetof(struct vcpu_set_singleshot_timer, flags));
1634
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
1635
sizeof_field(struct vcpu_set_singleshot_timer, flags));
1636
1637
if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) :
1638
sizeof(struct compat_vcpu_set_singleshot_timer), &e)) {
1639
*r = -EFAULT;
1640
return true;
1641
}
1642
1643
kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
1644
*r = 0;
1645
return true;
1646
1647
case VCPUOP_stop_singleshot_timer:
1648
if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1649
*r = -EINVAL;
1650
return true;
1651
}
1652
kvm_xen_stop_timer(vcpu);
1653
*r = 0;
1654
return true;
1655
}
1656
1657
return false;
1658
}
1659
1660
static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
1661
u64 *r)
1662
{
1663
if (!kvm_xen_timer_enabled(vcpu))
1664
return false;
1665
1666
if (timeout)
1667
kvm_xen_start_timer(vcpu, timeout, true);
1668
else
1669
kvm_xen_stop_timer(vcpu);
1670
1671
*r = 0;
1672
return true;
1673
}
1674
1675
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
1676
{
1677
bool longmode;
1678
u64 input, params[6], r = -ENOSYS;
1679
bool handled = false;
1680
u8 cpl;
1681
1682
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
1683
1684
/* Hyper-V hypercalls get bit 31 set in EAX */
1685
if ((input & 0x80000000) &&
1686
kvm_hv_hypercall_enabled(vcpu))
1687
return kvm_hv_hypercall(vcpu);
1688
1689
longmode = is_64_bit_hypercall(vcpu);
1690
if (!longmode) {
1691
params[0] = (u32)kvm_rbx_read(vcpu);
1692
params[1] = (u32)kvm_rcx_read(vcpu);
1693
params[2] = (u32)kvm_rdx_read(vcpu);
1694
params[3] = (u32)kvm_rsi_read(vcpu);
1695
params[4] = (u32)kvm_rdi_read(vcpu);
1696
params[5] = (u32)kvm_rbp_read(vcpu);
1697
}
1698
#ifdef CONFIG_X86_64
1699
else {
1700
params[0] = (u64)kvm_rdi_read(vcpu);
1701
params[1] = (u64)kvm_rsi_read(vcpu);
1702
params[2] = (u64)kvm_rdx_read(vcpu);
1703
params[3] = (u64)kvm_r10_read(vcpu);
1704
params[4] = (u64)kvm_r8_read(vcpu);
1705
params[5] = (u64)kvm_r9_read(vcpu);
1706
}
1707
#endif
1708
cpl = kvm_x86_call(get_cpl)(vcpu);
1709
trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
1710
params[3], params[4], params[5]);
1711
1712
/*
1713
* Only allow hypercall acceleration for CPL0. The rare hypercalls that
1714
* are permitted in guest userspace can be handled by the VMM.
1715
*/
1716
if (unlikely(cpl > 0))
1717
goto handle_in_userspace;
1718
1719
switch (input) {
1720
case __HYPERVISOR_xen_version:
1721
if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
1722
r = vcpu->kvm->arch.xen.xen_version;
1723
handled = true;
1724
}
1725
break;
1726
case __HYPERVISOR_event_channel_op:
1727
if (params[0] == EVTCHNOP_send)
1728
handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
1729
break;
1730
case __HYPERVISOR_sched_op:
1731
handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
1732
params[1], &r);
1733
break;
1734
case __HYPERVISOR_vcpu_op:
1735
handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
1736
params[2], &r);
1737
break;
1738
case __HYPERVISOR_set_timer_op: {
1739
u64 timeout = params[0];
1740
/* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
1741
if (!longmode)
1742
timeout |= params[1] << 32;
1743
handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
1744
break;
1745
}
1746
default:
1747
break;
1748
}
1749
1750
if (handled)
1751
return kvm_xen_hypercall_set_result(vcpu, r);
1752
1753
handle_in_userspace:
1754
vcpu->run->exit_reason = KVM_EXIT_XEN;
1755
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
1756
vcpu->run->xen.u.hcall.longmode = longmode;
1757
vcpu->run->xen.u.hcall.cpl = cpl;
1758
vcpu->run->xen.u.hcall.input = input;
1759
vcpu->run->xen.u.hcall.params[0] = params[0];
1760
vcpu->run->xen.u.hcall.params[1] = params[1];
1761
vcpu->run->xen.u.hcall.params[2] = params[2];
1762
vcpu->run->xen.u.hcall.params[3] = params[3];
1763
vcpu->run->xen.u.hcall.params[4] = params[4];
1764
vcpu->run->xen.u.hcall.params[5] = params[5];
1765
vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
1766
vcpu->arch.complete_userspace_io =
1767
kvm_xen_hypercall_complete_userspace;
1768
1769
return 0;
1770
}
1771
1772
static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
1773
{
1774
int poll_evtchn = vcpu->arch.xen.poll_evtchn;
1775
1776
if ((poll_evtchn == port || poll_evtchn == -1) &&
1777
test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
1778
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1779
kvm_vcpu_kick(vcpu);
1780
}
1781
}
1782
1783
/*
1784
* The return value from this function is propagated to kvm_set_irq() API,
1785
* so it returns:
1786
* < 0 Interrupt was ignored (masked or not delivered for other reasons)
1787
* = 0 Interrupt was coalesced (previous irq is still pending)
1788
* > 0 Number of CPUs interrupt was delivered to
1789
*
1790
* It is also called directly from kvm_arch_set_irq_inatomic(), where the
1791
* only check on its return value is a comparison with -EWOULDBLOCK.
1792
*/
1793
int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
1794
{
1795
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1796
struct kvm_vcpu *vcpu;
1797
unsigned long *pending_bits, *mask_bits;
1798
unsigned long flags;
1799
int port_word_bit;
1800
bool kick_vcpu = false;
1801
int vcpu_idx, idx, rc;
1802
1803
vcpu_idx = READ_ONCE(xe->vcpu_idx);
1804
if (vcpu_idx >= 0)
1805
vcpu = kvm_get_vcpu(kvm, vcpu_idx);
1806
else {
1807
vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
1808
if (!vcpu)
1809
return -EINVAL;
1810
WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
1811
}
1812
1813
if (xe->port >= max_evtchn_port(kvm))
1814
return -EINVAL;
1815
1816
rc = -EWOULDBLOCK;
1817
1818
idx = srcu_read_lock(&kvm->srcu);
1819
1820
read_lock_irqsave(&gpc->lock, flags);
1821
if (!kvm_gpc_check(gpc, PAGE_SIZE))
1822
goto out_rcu;
1823
1824
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1825
struct shared_info *shinfo = gpc->khva;
1826
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1827
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1828
port_word_bit = xe->port / 64;
1829
} else {
1830
struct compat_shared_info *shinfo = gpc->khva;
1831
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1832
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1833
port_word_bit = xe->port / 32;
1834
}
1835
1836
/*
1837
* If this port wasn't already set, and if it isn't masked, then
1838
* we try to set the corresponding bit in the in-kernel shadow of
1839
* evtchn_pending_sel for the target vCPU. And if *that* wasn't
1840
* already set, then we kick the vCPU in question to write to the
1841
* *real* evtchn_pending_sel in its own guest vcpu_info struct.
1842
*/
1843
if (test_and_set_bit(xe->port, pending_bits)) {
1844
rc = 0; /* It was already raised */
1845
} else if (test_bit(xe->port, mask_bits)) {
1846
rc = -ENOTCONN; /* Masked */
1847
kvm_xen_check_poller(vcpu, xe->port);
1848
} else {
1849
rc = 1; /* Delivered to the bitmap in shared_info. */
1850
/* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
1851
read_unlock_irqrestore(&gpc->lock, flags);
1852
gpc = &vcpu->arch.xen.vcpu_info_cache;
1853
1854
read_lock_irqsave(&gpc->lock, flags);
1855
if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
1856
/*
1857
* Could not access the vcpu_info. Set the bit in-kernel
1858
* and prod the vCPU to deliver it for itself.
1859
*/
1860
if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
1861
kick_vcpu = true;
1862
goto out_rcu;
1863
}
1864
1865
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1866
struct vcpu_info *vcpu_info = gpc->khva;
1867
if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
1868
WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
1869
kick_vcpu = true;
1870
}
1871
} else {
1872
struct compat_vcpu_info *vcpu_info = gpc->khva;
1873
if (!test_and_set_bit(port_word_bit,
1874
(unsigned long *)&vcpu_info->evtchn_pending_sel)) {
1875
WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
1876
kick_vcpu = true;
1877
}
1878
}
1879
1880
/* For the per-vCPU lapic vector, deliver it as MSI. */
1881
if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
1882
kvm_xen_inject_vcpu_vector(vcpu);
1883
kick_vcpu = false;
1884
}
1885
}
1886
1887
out_rcu:
1888
read_unlock_irqrestore(&gpc->lock, flags);
1889
srcu_read_unlock(&kvm->srcu, idx);
1890
1891
if (kick_vcpu) {
1892
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1893
kvm_vcpu_kick(vcpu);
1894
}
1895
1896
return rc;
1897
}
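/*
 * Illustrative sketch (not part of this file): how a caller following the
 * kvm_set_irq() convention documented above might interpret the return
 * codes.  The report_*() helpers are hypothetical placeholders for whatever
 * accounting or tracing the caller wants to do.
 */
#if 0
static void example_deliver(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
	int rc = kvm_xen_set_evtchn_fast(xe, kvm);

	/* The fast path bails out if the pfn cache needs a (sleeping) refresh. */
	if (rc == -EWOULDBLOCK)
		rc = kvm_xen_set_evtchn(xe, kvm);

	if (rc > 0)
		report_delivered();	/* pending bit newly set, vCPU notified */
	else if (rc == 0)
		report_coalesced();	/* the event was already pending */
	else
		report_ignored(rc);	/* masked (-ENOTCONN) or invalid (-EINVAL) */
}
#endif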
1898
1899
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
1900
{
1901
bool mm_borrowed = false;
1902
int rc;
1903
1904
rc = kvm_xen_set_evtchn_fast(xe, kvm);
1905
if (rc != -EWOULDBLOCK)
1906
return rc;
1907
1908
if (current->mm != kvm->mm) {
1909
/*
1910
* If not on a thread which already belongs to this KVM,
1911
* we'd better be in the irqfd workqueue.
1912
*/
1913
if (WARN_ON_ONCE(current->mm))
1914
return -EINVAL;
1915
1916
kthread_use_mm(kvm->mm);
1917
mm_borrowed = true;
1918
}
1919
1920
/*
1921
* It is theoretically possible for the page to be unmapped
1922
* and the MMU notifier to invalidate the shared_info before
1923
* we even get to use it. In that case, this looks like an
1924
* infinite loop. It was tempting to do it via the userspace
1925
* HVA instead... but that just *hides* the fact that it's
1926
* an infinite loop, because if a fault occurs and it waits
1927
* for the page to come back, it can *still* immediately
1928
* fault and have to wait again, repeatedly.
1929
*
1930
* Conversely, the page could also have been reinstated by
1931
* another thread before we even obtain the mutex above, so
1932
* check again *first* before remapping it.
1933
*/
1934
do {
1935
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1936
int idx;
1937
1938
rc = kvm_xen_set_evtchn_fast(xe, kvm);
1939
if (rc != -EWOULDBLOCK)
1940
break;
1941
1942
idx = srcu_read_lock(&kvm->srcu);
1943
rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
1944
srcu_read_unlock(&kvm->srcu, idx);
1945
} while (!rc);
1946
1947
if (mm_borrowed)
1948
kthread_unuse_mm(kvm->mm);
1949
1950
return rc;
1951
}
1952
1953
/* This is the version called from kvm_set_irq() as the .set function */
1954
static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
1955
int irq_source_id, int level, bool line_status)
1956
{
1957
if (!level)
1958
return -EINVAL;
1959
1960
return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
1961
}
1962
1963
/*
1964
* Set up an event channel interrupt from the KVM IRQ routing table.
1965
* Used for e.g. PIRQ from passed-through physical devices.
1966
*/
1967
int kvm_xen_setup_evtchn(struct kvm *kvm,
1968
struct kvm_kernel_irq_routing_entry *e,
1969
const struct kvm_irq_routing_entry *ue)
1970
1971
{
1972
struct kvm_vcpu *vcpu;
1973
1974
/*
1975
* Don't check for the port being within range of max_evtchn_port().
1976
* Userspace can configure whatever targets it likes; events just won't
1977
* be delivered if/while the target is invalid, just like userspace can
1978
* configure MSIs which target non-existent APICs.
1979
*
1980
* This means that on Live Migration and Live Update, the IRQ routing table
1981
* can be restored *independently* of other things like creating vCPUs,
1982
* without imposing an ordering dependency on userspace. In this
1983
* particular case, the problematic ordering would be with setting the
1984
* Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096
1985
* instead of 1024 event channels.
1986
*/
1987
1988
/* We only support 2 level event channels for now */
1989
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1990
return -EINVAL;
1991
1992
/*
1993
* Xen gives us interesting mappings from vCPU index to APIC ID,
1994
* which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
1995
* to find it. Do that once at setup time, instead of every time.
1996
* But beware that on live update / live migration, the routing
1997
* table might be reinstated before the vCPU threads have finished
1998
* recreating their vCPUs.
1999
*/
2000
vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
2001
if (vcpu)
2002
e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
2003
else
2004
e->xen_evtchn.vcpu_idx = -1;
2005
2006
e->xen_evtchn.port = ue->u.xen_evtchn.port;
2007
e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
2008
e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
2009
e->set = evtchn_set_fn;
2010
2011
return 0;
2012
}
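/*
 * Illustrative userspace-side sketch (not part of this file): installing a
 * routing entry of the type consumed by kvm_xen_setup_evtchn() above.  The
 * vm_fd/gsi/port/xen_vcpu_id parameters are assumptions supplied by the VMM,
 * and for brevity the single-entry table replaces any existing routing.
 */
#if 0
#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int example_route_gsi_to_evtchn(int vm_fd, unsigned int gsi,
					unsigned int port, unsigned int xen_vcpu_id)
{
	struct kvm_irq_routing *table;
	int ret;

	table = calloc(1, sizeof(*table) + sizeof(struct kvm_irq_routing_entry));
	if (!table)
		return -ENOMEM;

	table->nr = 1;
	table->entries[0] = (struct kvm_irq_routing_entry) {
		.gsi = gsi,
		.type = KVM_IRQ_ROUTING_XEN_EVTCHN,
		.u.xen_evtchn = {
			.port = port,
			.vcpu = xen_vcpu_id,	/* Xen vCPU id, not a KVM vcpu_idx */
			.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
		},
	};

	/* A real VMM would merge this entry into its full routing table. */
	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
	free(table);
	return ret;
}
#endif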
2013
2014
/*
2015
* Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
2016
*/
2017
int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
2018
{
2019
struct kvm_xen_evtchn e;
2020
int ret;
2021
2022
if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
2023
return -EINVAL;
2024
2025
/* We only support 2 level event channels for now */
2026
if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
2027
return -EINVAL;
2028
2029
e.port = uxe->port;
2030
e.vcpu_id = uxe->vcpu;
2031
e.vcpu_idx = -1;
2032
e.priority = uxe->priority;
2033
2034
ret = kvm_xen_set_evtchn(&e, kvm);
2035
2036
/*
2037
* None of that 'return 1 if it actually got delivered' nonsense.
2038
* We don't care if it was masked (-ENOTCONN) either.
2039
*/
2040
if (ret > 0 || ret == -ENOTCONN)
2041
ret = 0;
2042
2043
return ret;
2044
}
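/*
 * Illustrative userspace-side sketch (not part of this file): raising an
 * event from the VMM with the ioctl handled above.  vm_fd and the port/vcpu
 * values are assumptions for the sake of the example.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_evtchn_send(int vm_fd)
{
	struct kvm_irq_routing_xen_evtchn uxe = {
		.port = 3,		/* hypothetical guest-visible port */
		.vcpu = 0,		/* Xen vCPU id, not a KVM vcpu_idx */
		.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
	};

	if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &uxe) < 0)
		perror("KVM_XEN_HVM_EVTCHN_SEND");
}
#endif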
2045
2046
/*
2047
* Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
2048
*/
2049
struct evtchnfd {
2050
u32 send_port;
2051
u32 type;
2052
union {
2053
struct kvm_xen_evtchn port;
2054
struct {
2055
u32 port; /* zero */
2056
struct eventfd_ctx *ctx;
2057
} eventfd;
2058
} deliver;
2059
};
2060
2061
/*
2062
* Update target vCPU or priority for a registered sending channel.
2063
*/
2064
static int kvm_xen_eventfd_update(struct kvm *kvm,
2065
struct kvm_xen_hvm_attr *data)
2066
{
2067
u32 port = data->u.evtchn.send_port;
2068
struct evtchnfd *evtchnfd;
2069
int ret;
2070
2071
/* Protect writes to evtchnfd as well as the idr lookup. */
2072
mutex_lock(&kvm->arch.xen.xen_lock);
2073
evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
2074
2075
ret = -ENOENT;
2076
if (!evtchnfd)
2077
goto out_unlock;
2078
2079
/* For an UPDATE, nothing may change except the priority/vcpu */
2080
ret = -EINVAL;
2081
if (evtchnfd->type != data->u.evtchn.type)
2082
goto out_unlock;
2083
2084
/*
2085
* Port cannot change, and if it's zero that was an eventfd
2086
* which can't be changed either.
2087
*/
2088
if (!evtchnfd->deliver.port.port ||
2089
evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
2090
goto out_unlock;
2091
2092
/* We only support 2 level event channels for now */
2093
if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
2094
goto out_unlock;
2095
2096
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
2097
if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
2098
evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
2099
evtchnfd->deliver.port.vcpu_idx = -1;
2100
}
2101
ret = 0;
2102
out_unlock:
2103
mutex_unlock(&kvm->arch.xen.xen_lock);
2104
return ret;
2105
}
2106
2107
/*
2108
* Configure the target (eventfd or local port delivery) for sending on
2109
* a given event channel.
2110
*/
2111
static int kvm_xen_eventfd_assign(struct kvm *kvm,
2112
struct kvm_xen_hvm_attr *data)
2113
{
2114
u32 port = data->u.evtchn.send_port;
2115
struct eventfd_ctx *eventfd = NULL;
2116
struct evtchnfd *evtchnfd;
2117
int ret = -EINVAL;
2118
2119
evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
2120
if (!evtchnfd)
2121
return -ENOMEM;
2122
2123
switch (data->u.evtchn.type) {
2124
case EVTCHNSTAT_ipi:
2125
/* IPI must map back to the same port# */
2126
if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
2127
goto out_noeventfd; /* -EINVAL */
2128
break;
2129
2130
case EVTCHNSTAT_interdomain:
2131
if (data->u.evtchn.deliver.port.port) {
2132
if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
2133
goto out_noeventfd; /* -EINVAL */
2134
} else {
2135
eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
2136
if (IS_ERR(eventfd)) {
2137
ret = PTR_ERR(eventfd);
2138
goto out_noeventfd;
2139
}
2140
}
2141
break;
2142
2143
case EVTCHNSTAT_virq:
2144
case EVTCHNSTAT_closed:
2145
case EVTCHNSTAT_unbound:
2146
case EVTCHNSTAT_pirq:
2147
default: /* Unknown event channel type */
2148
goto out; /* -EINVAL */
2149
}
2150
2151
evtchnfd->send_port = data->u.evtchn.send_port;
2152
evtchnfd->type = data->u.evtchn.type;
2153
if (eventfd) {
2154
evtchnfd->deliver.eventfd.ctx = eventfd;
2155
} else {
2156
/* We only support 2 level event channels for now */
2157
if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
2158
goto out; /* -EINVAL */
2159
2160
evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
2161
evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
2162
evtchnfd->deliver.port.vcpu_idx = -1;
2163
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
2164
}
2165
2166
mutex_lock(&kvm->arch.xen.xen_lock);
2167
ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
2168
GFP_KERNEL);
2169
mutex_unlock(&kvm->arch.xen.xen_lock);
2170
if (ret >= 0)
2171
return 0;
2172
2173
if (ret == -ENOSPC)
2174
ret = -EEXIST;
2175
out:
2176
if (eventfd)
2177
eventfd_ctx_put(eventfd);
2178
out_noeventfd:
2179
kfree(evtchnfd);
2180
return ret;
2181
}
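/*
 * Illustrative userspace-side sketch (not part of this file): registering an
 * outbound interdomain channel so that the guest's EVTCHNOP_send on the
 * (hypothetical) port 5 signals an eventfd.  vm_fd is assumed to be the open
 * KVM VM file descriptor; EVTCHNSTAT_interdomain comes from the Xen public
 * headers.
 */
#if 0
#include <stdio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define EVTCHNSTAT_interdomain	2	/* from xen/interface/event_channel.h */

static int example_evtchn_assign(int vm_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
		.u.evtchn = {
			.send_port = 5,
			.type = EVTCHNSTAT_interdomain,
			.flags = 0,			/* plain assignment */
			.deliver.eventfd = {
				.port = 0,		/* zero selects eventfd delivery */
				.fd = efd,
			},
		},
	};

	if (efd < 0 || ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr) < 0) {
		perror("KVM_XEN_ATTR_TYPE_EVTCHN assign");
		return -1;
	}

	return efd;	/* poll this fd to observe the guest's sends */
}
#endif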
2182
2183
static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
2184
{
2185
struct evtchnfd *evtchnfd;
2186
2187
mutex_lock(&kvm->arch.xen.xen_lock);
2188
evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
2189
mutex_unlock(&kvm->arch.xen.xen_lock);
2190
2191
if (!evtchnfd)
2192
return -ENOENT;
2193
2194
synchronize_srcu(&kvm->srcu);
2195
if (!evtchnfd->deliver.port.port)
2196
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
2197
kfree(evtchnfd);
2198
return 0;
2199
}
2200
2201
static int kvm_xen_eventfd_reset(struct kvm *kvm)
2202
{
2203
struct evtchnfd *evtchnfd, **all_evtchnfds;
2204
int i;
2205
int n = 0;
2206
2207
mutex_lock(&kvm->arch.xen.xen_lock);
2208
2209
/*
2210
* Because synchronize_srcu() cannot be called inside the
2211
* critical section, first collect all the evtchnfd objects
2212
* in an array as they are removed from evtchn_ports.
2213
*/
2214
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
2215
n++;
2216
2217
all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
2218
if (!all_evtchnfds) {
2219
mutex_unlock(&kvm->arch.xen.xen_lock);
2220
return -ENOMEM;
2221
}
2222
2223
n = 0;
2224
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
2225
all_evtchnfds[n++] = evtchnfd;
2226
idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
2227
}
2228
mutex_unlock(&kvm->arch.xen.xen_lock);
2229
2230
synchronize_srcu(&kvm->srcu);
2231
2232
while (n--) {
2233
evtchnfd = all_evtchnfds[n];
2234
if (!evtchnfd->deliver.port.port)
2235
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
2236
kfree(evtchnfd);
2237
}
2238
kfree(all_evtchnfds);
2239
2240
return 0;
2241
}
2242
2243
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
2244
{
2245
u32 port = data->u.evtchn.send_port;
2246
2247
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
2248
return kvm_xen_eventfd_reset(kvm);
2249
2250
if (!port || port >= max_evtchn_port(kvm))
2251
return -EINVAL;
2252
2253
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
2254
return kvm_xen_eventfd_deassign(kvm, port);
2255
if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
2256
return kvm_xen_eventfd_update(kvm, data);
2257
if (data->u.evtchn.flags)
2258
return -EINVAL;
2259
2260
return kvm_xen_eventfd_assign(kvm, data);
2261
}
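/*
 * Illustrative userspace-side sketch (not part of this file): the DEASSIGN
 * and RESET flags dispatched above, driven from the VMM.  vm_fd and 'port'
 * are assumptions supplied by the caller.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_evtchn_teardown(int vm_fd, unsigned int port)
{
	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
		.u.evtchn = {
			.send_port = port,
			.flags = KVM_XEN_EVTCHN_DEASSIGN,
		},
	};

	/* Drop a single registered send port... */
	ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);

	/* ...or wipe every registered port in one call (send_port is ignored). */
	attr.u.evtchn.flags = KVM_XEN_EVTCHN_RESET;
	ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
}
#endif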
2262
2263
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
2264
{
2265
struct evtchnfd *evtchnfd;
2266
struct evtchn_send send;
2267
struct x86_exception e;
2268
2269
/* Sanity check: this structure is the same for 32-bit and 64-bit */
2270
BUILD_BUG_ON(sizeof(send) != 4);
2271
if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
2272
*r = -EFAULT;
2273
return true;
2274
}
2275
2276
/*
2277
* evtchnfd is protected by kvm->srcu; the idr lookup instead
2278
* is protected by RCU.
2279
*/
2280
rcu_read_lock();
2281
evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
2282
rcu_read_unlock();
2283
if (!evtchnfd)
2284
return false;
2285
2286
if (evtchnfd->deliver.port.port) {
2287
int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
2288
if (ret < 0 && ret != -ENOTCONN)
2289
return false;
2290
} else {
2291
eventfd_signal(evtchnfd->deliver.eventfd.ctx);
2292
}
2293
2294
*r = 0;
2295
return true;
2296
}
2297
2298
void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
2299
{
2300
vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
2301
vcpu->arch.xen.poll_evtchn = 0;
2302
2303
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
2304
hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC,
2305
HRTIMER_MODE_ABS_HARD);
2306
2307
kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
2308
kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
2309
kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
2310
kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
2311
}
2312
2313
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
2314
{
2315
if (kvm_xen_timer_enabled(vcpu))
2316
kvm_xen_stop_timer(vcpu);
2317
2318
kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
2319
kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
2320
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
2321
kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
2322
2323
timer_delete_sync(&vcpu->arch.xen.poll_timer);
2324
}
2325
2326
void kvm_xen_init_vm(struct kvm *kvm)
2327
{
2328
mutex_init(&kvm->arch.xen.xen_lock);
2329
idr_init(&kvm->arch.xen.evtchn_ports);
2330
kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
2331
}
2332
2333
void kvm_xen_destroy_vm(struct kvm *kvm)
2334
{
2335
struct evtchnfd *evtchnfd;
2336
int i;
2337
2338
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
2339
2340
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
2341
if (!evtchnfd->deliver.port.port)
2342
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
2343
kfree(evtchnfd);
2344
}
2345
idr_destroy(&kvm->arch.xen.evtchn_ports);
2346
2347
if (kvm->arch.xen.hvm_config.msr)
2348
static_branch_slow_dec_deferred(&kvm_xen_enabled);
2349
}
2350
2351