// SPDX-License-Identifier: GPL-2.0
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/pvclock_gtod.h>
#include <linux/timekeeper_internal.h>
#include <linux/sched/cputime.h>

#include <asm/pvclock.h>
#include <asm/timer.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/cpuid.h>

#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

/* Minimum amount of time until next clock event fires */
#define TIMER_SLOP	1

static u64 xen_sched_clock_offset __read_mostly;

/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(info);
}
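
/*
 * Xen publishes a pvclock_vcpu_time_info record per vcpu; reading it is
 * a lock-free, version-checked snapshot.  Preemption is disabled so we
 * cannot migrate to another CPU between fetching this vcpu's record and
 * reading it.
 */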
static u64 xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;

	preempt_disable_notrace();
	src = &__this_cpu_read(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();
	return ret;
}

static u64 xen_clocksource_get_cycles(struct clocksource *cs)
{
	return xen_clocksource_read();
}
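
/*
 * sched_clock() counts nanoseconds from the point where Xen time was set
 * up: xen_sched_clock_offset is a snapshot of the hypervisor clock taken
 * in xen_init_time_common() (and re-derived on resume), so the scheduler
 * clock starts near zero rather than at the hypervisor's boot time.
 */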
static noinstr u64 xen_sched_clock(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;

	src = &__this_cpu_read(xen_vcpu)->time;
	ret = pvclock_clocksource_read_nowd(src);
	ret -= xen_sched_clock_offset;

	return ret;
}
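
/*
 * shared_info->wc holds the wall-clock time at hypervisor system time
 * zero; pvclock_read_wallclock() adds the current system time to that
 * base to yield the current wall-clock time.
 */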
static void xen_read_wallclock(struct timespec64 *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

static void xen_get_wallclock(struct timespec64 *now)
{
	xen_read_wallclock(now);
}

static int xen_set_wallclock(const struct timespec64 *now)
{
	return -ENODEV;
}
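
/*
 * Registered on the pvclock_gtod notifier chain (for Dom0 only, see
 * xen_time_init()): it pushes the kernel's time back into the Xen
 * wallclock whenever the clock is stepped, and otherwise at most every
 * 11 minutes, mirroring the kernel's periodic RTC synchronization.
 */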
static int xen_pvclock_gtod_notify(struct notifier_block *nb,
				   unsigned long was_set, void *priv)
{
	/* Protected by the calling core code serialization */
	static struct timespec64 next_sync;

	struct xen_platform_op op;
	struct timespec64 now;
	struct timekeeper *tk = priv;
	static bool settime64_supported = true;
	int ret;

	now.tv_sec = tk->xtime_sec;
	now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);

	/*
	 * We only take the expensive HV call when the clock was set
	 * or when 11 minutes have elapsed since the last RTC
	 * synchronization.
	 */
	if (!was_set && timespec64_compare(&now, &next_sync) < 0)
		return NOTIFY_OK;

again:
	if (settime64_supported) {
		op.cmd = XENPF_settime64;
		op.u.settime64.mbz = 0;
		op.u.settime64.secs = now.tv_sec;
		op.u.settime64.nsecs = now.tv_nsec;
		op.u.settime64.system_time = xen_clocksource_read();
	} else {
		op.cmd = XENPF_settime32;
		op.u.settime32.secs = now.tv_sec;
		op.u.settime32.nsecs = now.tv_nsec;
		op.u.settime32.system_time = xen_clocksource_read();
	}

	ret = HYPERVISOR_platform_op(&op);

	if (ret == -ENOSYS && settime64_supported) {
		settime64_supported = false;
		goto again;
	}
	if (ret < 0)
		return NOTIFY_BAD;

	/*
	 * Move the next drift compensation time 11 minutes
	 * ahead. That's emulating the sync_cmos_clock() update for
	 * the hardware RTC.
	 */
	next_sync = now;
	next_sync.tv_sec += 11 * 60;

	return NOTIFY_OK;
}

static struct notifier_block xen_pvclock_gtod_notifier = {
	.notifier_call = xen_pvclock_gtod_notify,
};

static int xen_cs_enable(struct clocksource *cs)
{
	vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
	return 0;
}
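
/*
 * Rating 400 makes the Xen clocksource preferred over the raw TSC
 * clocksource (rating 300); xen_time_init() lowers the rating when the
 * TSC is known to be safe to use directly.
 */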
static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_get_cycles,
	.mask = CLOCKSOURCE_MASK(64),
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
	.enable = xen_cs_enable,
};

/*
	Xen clockevent implementation

	Xen has two clockevent implementations:

	The old timer_op one works with all released versions of Xen prior
	to version 3.0.4.  This version of the hypervisor provides a
	single-shot timer with nanosecond resolution.  However, it shares
	its event channel with a 100Hz tick which is delivered while the
	vcpu is running.  We don't care about or use this tick, but it will
	cause the core time code to think the timer fired too soon, and
	will end up resetting it each time.  It could be filtered, but
	doing so has complications when the ktime clocksource is not yet
	the xen clocksource (ie, at boot time).

	The new vcpu_op-based timer interface allows the tick timer period
	to be changed or turned off.  The tick timer is not useful as a
	periodic timer because events are only delivered to running vcpus.
	The one-shot timer can report when a timeout is in the past, so
	set_next_event is capable of returning -ETIME when appropriate.
	This interface is used when available.
*/


/*
	Get a hypervisor absolute time.  In theory we could maintain an
	offset between the kernel's time and the hypervisor's time, and
	apply that to a kernel's absolute timeout.  Unfortunately the
	hypervisor and kernel times can drift even if the kernel is using
	the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static int xen_timerop_shutdown(struct clock_event_device *evt)
{
	/* cancel timeout */
	HYPERVISOR_set_timer_op(0);

	return 0;
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(!clockevent_state_oneshot(evt));

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}
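
/*
 * With mult == 1 and shift == 0, clockevent deltas are programmed in
 * nanoseconds directly: the core converts a requested time to device
 * ticks as (ns * mult) >> shift, so ticks and nanoseconds coincide here.
 * min_delta is clamped to TIMER_SLOP (one nanosecond by default; see the
 * xen_timer_slop kernel parameter at the bottom of this file).
 */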
static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.max_delta_ticks = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,
	.min_delta_ticks = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_state_shutdown = xen_timerop_shutdown,
	.set_next_event = xen_timerop_set_next_event,
};

static int xen_vcpuop_shutdown(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
			       NULL) ||
	    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL))
		BUG();

	return 0;
}

static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL))
		BUG();

	return 0;
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(!clockevent_state_oneshot(evt));

	single.timeout_abs_ns = get_abs_timeout(delta);
	/* Get an event anyway, even if the timeout is already expired */
	single.flags = 0;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
				 &single);
	BUG_ON(ret != 0);

	return ret;
}

static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.max_delta_ticks = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,
	.min_delta_ticks = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_state_shutdown = xen_vcpuop_shutdown,
	.set_state_oneshot = xen_vcpuop_set_oneshot,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
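
/*
 * Per-cpu clockevent device.  The name buffer is embedded in the same
 * per-cpu object because bind_virq_to_irqhandler() keeps a reference to
 * it for as long as the interrupt stays bound.
 */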
struct xen_clock_event_device {
	struct clock_event_device evt;
	char name[16];
};
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	return ret;
}

void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;
	evt = &per_cpu(xen_clock_events, cpu).evt;

	if (evt->irq >= 0) {
		unbind_from_irqhandler(evt->irq, NULL);
		evt->irq = -1;
	}
}

void xen_setup_timer(int cpu)
{
	struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
	struct clock_event_device *evt = &xevt->evt;
	int irq;

	WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
	if (evt->irq >= 0)
		xen_teardown_timer(cpu);

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
				      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
				      xevt->name, NULL);
	(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);

	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;
}


void xen_setup_cpu_clockevents(void)
{
	clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}

void xen_timer_resume(void)
{
	int cpu;

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
				       xen_vcpu_nr(cpu), NULL))
			BUG();
	}
}

static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
static u64 xen_clock_value_saved;
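
/*
 * Suspend/resume support: the hypervisor's system time may jump across a
 * save/restore cycle, so the current reading is snapshotted on suspend
 * and xen_sched_clock_offset is recomputed on resume to keep
 * sched_clock() monotonic.  The secondary (vsyscall) time area is
 * unregistered before suspend and re-registered afterwards.
 */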
void xen_save_time_memory_area(void)
{
	struct vcpu_register_time_memory_area t;
	int ret;

	xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;

	if (!xen_clock)
		return;

	t.addr.v = NULL;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
	if (ret != 0)
		pr_notice("Cannot save secondary vcpu_time_info (err %d)",
			  ret);
	else
		clear_page(xen_clock);
}

void xen_restore_time_memory_area(void)
{
	struct vcpu_register_time_memory_area t;
	int ret;

	if (!xen_clock)
		goto out;

	t.addr.v = &xen_clock->pvti;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);

	/*
	 * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if it fails to
	 * register the secondary time info with Xen or if we migrated to a
	 * host without the necessary flags.  In both of these cases
	 * processes see either a zeroed-out pvti or no PVCLOCK_TSC_STABLE_BIT
	 * set.  Userspace checks the latter and, if it is 0, discards the
	 * data in pvti and falls back to a system call for a reliable
	 * timestamp.
	 */
	if (ret != 0)
		pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
			  ret);

out:
	/* Need pvclock_resume() before using xen_clocksource_read(). */
	pvclock_resume();
	xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}
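
/*
 * Register a second copy of the vcpu time info with the hypervisor and
 * expose it to the vDSO, so userspace can read pvclock-based timestamps
 * without entering the kernel.
 */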
static void xen_setup_vsyscall_time_info(void)
{
	struct vcpu_register_time_memory_area t;
	struct pvclock_vsyscall_time_info *ti;
	int ret;

	ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
	if (!ti)
		return;

	t.addr.v = &ti->pvti;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
	if (ret) {
		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
		free_page((unsigned long)ti);
		return;
	}

	/*
	 * If the primary time info had this bit set, the secondary should
	 * too, since it's the same data in both, just in different memory
	 * regions.  But we still check it in case the hypervisor is buggy.
	 */
	if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
		t.addr.v = NULL;
		ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
					 0, &t);
		if (!ret)
			free_page((unsigned long)ti);

		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
		return;
	}

	xen_clock = ti;
	pvclock_set_pvti_cpu0_va(xen_clock);

	xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}

/*
 * Check if it is possible to safely use the tsc as a clocksource.  This is
 * only true if the hypervisor notifies the guest that its tsc is invariant,
 * the tsc is stable, and the tsc instruction will never be emulated.
 */
static int __init xen_tsc_safe_clocksource(void)
{
	u32 eax, ebx, ecx, edx;

	if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)))
		return 0;

	if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC)))
		return 0;

	if (check_tsc_unstable())
		return 0;

	/* Xen leaf 4, sub-leaf 0 (xen_cpuid_base() + 3, typically leaf 0x40000003) */
	cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);

	return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE;
}

static void __init xen_time_init(void)
{
	struct pvclock_vcpu_time_info *pvti;
	int cpu = smp_processor_id();
	struct timespec64 tp;

	/*
	 * Dom0 is never migrated, so there is no penalty for using the
	 * TSC there.
	 *
	 * If it is possible for the guest to determine that the tsc is a safe
	 * clocksource, then set the xen_clocksource rating below that of the
	 * tsc so that the system prefers tsc instead.
	 */
	if (xen_initial_domain())
		xen_clocksource.rating = 275;
	else if (xen_tsc_safe_clocksource())
		xen_clocksource.rating = 299;

	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&tp);
	do_settimeofday64(&tp);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	/*
	 * Check the primary time info ahead of time for this bit, so the
	 * faster vDSO-based path for the Xen clocksource can be set up.
	 */
	pvti = &__this_cpu_read(xen_vcpu)->time;
	if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
		xen_setup_vsyscall_time_info();
	}

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();

	xen_time_setup_guest();

	if (xen_initial_domain())
		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}

static void __init xen_init_time_common(void)
{
	xen_sched_clock_offset = xen_clocksource_read();
	static_call_update(pv_steal_clock, xen_steal_clock);
	paravirt_set_sched_clock(xen_sched_clock);

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
}

void __init xen_init_time_ops(void)
{
	xen_init_time_common();

	x86_init.timers.timer_init = xen_time_init;
	x86_init.timers.setup_percpu_clockev = x86_init_noop;
	x86_cpuinit.setup_percpu_clockev = x86_init_noop;

	/* Dom0 uses the native method to set the hardware RTC. */
	if (!xen_initial_domain())
		x86_platform.set_wallclock = xen_set_wallclock;
}

#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
	int cpu = smp_processor_id();
	xen_setup_runstate_info(cpu);
	/*
	 * xen_setup_timer(cpu) is not called here: snprintf is bad in
	 * atomic context, so it is done in xen_hvm_cpu_notify instead
	 * (which gets called by smp_init during early bootup and also
	 * during CPU hotplug events).
	 */
	xen_setup_cpu_clockevents();
}

void __init xen_hvm_init_time_ops(void)
{
	static bool hvm_time_initialized;

	if (hvm_time_initialized)
		return;

	/*
	 * A vector callback is needed, otherwise we cannot receive
	 * interrupts on cpu > 0, and at this point we don't know how many
	 * cpus are available.
	 */
	if (!xen_have_vector_callback)
		return;

	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
		pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
		return;
	}

	/*
	 * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
	 * __this_cpu_read(xen_vcpu) is still NULL when a Xen HVM guest
	 * boots on a vcpu >= MAX_VIRT_CPUS (e.g., kexec); accessing
	 * __this_cpu_read(xen_vcpu) via xen_clocksource_read() would panic.
	 *
	 * xen_hvm_init_time_ops() should be called again later, once
	 * __this_cpu_read(xen_vcpu) is available.
	 */
	if (!__this_cpu_read(xen_vcpu)) {
		pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
			xen_vcpu_nr(0));
		return;
	}

	xen_init_time_common();

	x86_init.timers.setup_percpu_clockev = xen_time_init;
	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

	x86_platform.set_wallclock = xen_set_wallclock;

	hvm_time_initialized = true;
}
#endif

/* Kernel parameter to specify Xen timer slop */
static int __init parse_xen_timer_slop(char *ptr)
{
	unsigned long slop = memparse(ptr, NULL);

	xen_timerop_clockevent.min_delta_ns = slop;
	xen_timerop_clockevent.min_delta_ticks = slop;
	xen_vcpuop_clockevent.min_delta_ns = slop;
	xen_vcpuop_clockevent.min_delta_ticks = slop;

	return 0;
}
early_param("xen_timer_slop", parse_xen_timer_slop);
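
/*
 * Usage note: the slop value is parsed with memparse(), so plain
 * nanosecond values and K/M/G suffixes are accepted.  For example,
 * booting with "xen_timer_slop=100000" raises the minimum programmable
 * timer delta to 100 microseconds.
 */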