// SPDX-License-Identifier: GPL-2.0
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/pvclock_gtod.h>
#include <linux/timekeeper_internal.h>
#include <linux/sched/cputime.h>

#include <asm/pvclock.h>
#include <asm/timer.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/cpuid.h>

#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

/* Minimum amount of time until next clock event fires */
#define TIMER_SLOP	1

static u64 xen_sched_clock_offset __read_mostly;

/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(info);
}
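
/*
 * Xen publishes a pvclock_vcpu_time_info record per vcpu; reading it is
 * a lock-free, version-checked snapshot.  Preemption is disabled so we
 * cannot migrate to another CPU between fetching this vcpu's record and
 * reading it.
 */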
static u64 xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;

	preempt_disable_notrace();
	src = &__this_cpu_read(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();
	return ret;
}

static u64 xen_clocksource_get_cycles(struct clocksource *cs)
{
	return xen_clocksource_read();
}
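
/*
 * sched_clock() counts nanoseconds from the point where Xen time was set
 * up: xen_sched_clock_offset is a snapshot of the hypervisor clock taken
 * in xen_init_time_common() (and re-derived on resume), so the scheduler
 * clock starts near zero rather than at the hypervisor's boot time.
 */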
static noinstr u64 xen_sched_clock(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;

	src = &__this_cpu_read(xen_vcpu)->time;
	ret = pvclock_clocksource_read_nowd(src);
	ret -= xen_sched_clock_offset;

	return ret;
}
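
/*
 * shared_info->wc holds the wall-clock time at hypervisor system time
 * zero; pvclock_read_wallclock() adds the current system time to that
 * base to yield the current wall-clock time.
 */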
static void xen_read_wallclock(struct timespec64 *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

static void xen_get_wallclock(struct timespec64 *now)
{
	xen_read_wallclock(now);
}

static int xen_set_wallclock(const struct timespec64 *now)
{
	return -ENODEV;
}
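
/*
 * Registered on the pvclock_gtod notifier chain (for Dom0 only, see
 * xen_time_init()): it pushes the kernel's time back into the Xen
 * wallclock whenever the clock is stepped, and otherwise at most every
 * 11 minutes, mirroring the kernel's periodic RTC synchronization.
 */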
static int xen_pvclock_gtod_notify(struct notifier_block *nb,
				   unsigned long was_set, void *priv)
{
	/* Protected by the calling core code serialization */
	static struct timespec64 next_sync;

	struct xen_platform_op op;
	struct timespec64 now;
	struct timekeeper *tk = priv;
	static bool settime64_supported = true;
	int ret;

	now.tv_sec = tk->xtime_sec;
	now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);

	/*
	 * We only take the expensive HV call when the clock was set
	 * or when 11 minutes have elapsed since the last RTC
	 * synchronization.
	 */
	if (!was_set && timespec64_compare(&now, &next_sync) < 0)
		return NOTIFY_OK;

again:
	if (settime64_supported) {
		op.cmd = XENPF_settime64;
		op.u.settime64.mbz = 0;
		op.u.settime64.secs = now.tv_sec;
		op.u.settime64.nsecs = now.tv_nsec;
		op.u.settime64.system_time = xen_clocksource_read();
	} else {
		op.cmd = XENPF_settime32;
		op.u.settime32.secs = now.tv_sec;
		op.u.settime32.nsecs = now.tv_nsec;
		op.u.settime32.system_time = xen_clocksource_read();
	}

	ret = HYPERVISOR_platform_op(&op);

	if (ret == -ENOSYS && settime64_supported) {
		settime64_supported = false;
		goto again;
	}
	if (ret < 0)
		return NOTIFY_BAD;

	/*
	 * Move the next drift compensation time 11 minutes
	 * ahead. That's emulating the sync_cmos_clock() update for
	 * the hardware RTC.
	 */
	next_sync = now;
	next_sync.tv_sec += 11 * 60;

	return NOTIFY_OK;
}

static struct notifier_block xen_pvclock_gtod_notifier = {
	.notifier_call = xen_pvclock_gtod_notify,
};

static int xen_cs_enable(struct clocksource *cs)
{
	vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
	return 0;
}
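
/*
 * Rating 400 makes the Xen clocksource preferred over the raw TSC
 * clocksource (rating 300); xen_time_init() lowers the rating when the
 * TSC is known to be safe to use directly.
 */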
static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_get_cycles,
	.mask = CLOCKSOURCE_MASK(64),
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
	.enable = xen_cs_enable,
};

/*
	Xen clockevent implementation

	Xen has two clockevent implementations:

	The old timer_op one works with all released versions of Xen prior
	to version 3.0.4.  This version of the hypervisor provides a
	single-shot timer with nanosecond resolution.  However, it shares
	its event channel with a 100Hz tick which is delivered while the
	vcpu is running.  We don't care about or use this tick, but it will
	cause the core time code to think the timer fired too soon, and
	will end up resetting it each time.  It could be filtered, but
	doing so has complications when the ktime clocksource is not yet
	the xen clocksource (ie, at boot time).

	The new vcpu_op-based timer interface allows the tick timer period
	to be changed or turned off.  The tick timer is not useful as a
	periodic timer because events are only delivered to running vcpus.
	The one-shot timer can report when a timeout is in the past, so
	set_next_event is capable of returning -ETIME when appropriate.
	This interface is used when available.
*/


/*
	Get a hypervisor absolute time.  In theory we could maintain an
	offset between the kernel's time and the hypervisor's time, and
	apply that to a kernel's absolute timeout.  Unfortunately the
	hypervisor and kernel times can drift even if the kernel is using
	the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static int xen_timerop_shutdown(struct clock_event_device *evt)
{
	/* cancel timeout */
	HYPERVISOR_set_timer_op(0);

	return 0;
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(!clockevent_state_oneshot(evt));

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}
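
/*
 * With mult == 1 and shift == 0, clockevent deltas are programmed in
 * nanoseconds directly: the core converts a requested time to device
 * ticks as (ns * mult) >> shift, so ticks and nanoseconds coincide here.
 * min_delta is clamped to TIMER_SLOP (one nanosecond by default; see the
 * xen_timer_slop kernel parameter at the bottom of this file).
 */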
static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.max_delta_ticks = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,
	.min_delta_ticks = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_state_shutdown = xen_timerop_shutdown,
	.set_next_event = xen_timerop_set_next_event,
};

static int xen_vcpuop_shutdown(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
			       NULL) ||
	    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL))
		BUG();

	return 0;
}

static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL))
		BUG();

	return 0;
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(!clockevent_state_oneshot(evt));

	single.timeout_abs_ns = get_abs_timeout(delta);
	/* Get an event anyway, even if the timeout is already expired */
	single.flags = 0;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
				 &single);
	BUG_ON(ret != 0);

	return ret;
}

static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.max_delta_ticks = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,
	.min_delta_ticks = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_state_shutdown = xen_vcpuop_shutdown,
	.set_state_oneshot = xen_vcpuop_set_oneshot,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
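
/*
 * Per-cpu clockevent device.  The name buffer is embedded in the same
 * per-cpu object because bind_virq_to_irqhandler() keeps a reference to
 * it for as long as the interrupt stays bound.
 */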
struct xen_clock_event_device {
	struct clock_event_device evt;
	char name[16];
};
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	return ret;
}

void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;
	evt = &per_cpu(xen_clock_events, cpu).evt;

	if (evt->irq >= 0) {
		unbind_from_irqhandler(evt->irq, NULL);
		evt->irq = -1;
	}
}

void xen_setup_timer(int cpu)
{
	struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
	struct clock_event_device *evt = &xevt->evt;
	int irq;

	WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
	if (evt->irq >= 0)
		xen_teardown_timer(cpu);

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
				      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
				      xevt->name, NULL);
	(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);

	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;
}


void xen_setup_cpu_clockevents(void)
{
	clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}

void xen_timer_resume(void)
{
	int cpu;

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
				       xen_vcpu_nr(cpu), NULL))
			BUG();
	}
}

static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
static u64 xen_clock_value_saved;
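
/*
 * Suspend/resume support: the hypervisor's system time may jump across a
 * save/restore cycle, so the current reading is snapshotted on suspend
 * and xen_sched_clock_offset is recomputed on resume to keep
 * sched_clock() monotonic.  The secondary (vsyscall) time area is
 * unregistered before suspend and re-registered afterwards.
 */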
void xen_save_time_memory_area(void)
{
	struct vcpu_register_time_memory_area t;
	int ret;

	xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;

	if (!xen_clock)
		return;

	t.addr.v = NULL;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
	if (ret != 0)
		pr_notice("Cannot save secondary vcpu_time_info (err %d)",
			  ret);
	else
		clear_page(xen_clock);
}

void xen_restore_time_memory_area(void)
{
	struct vcpu_register_time_memory_area t;
	int ret;

	if (!xen_clock)
		goto out;

	t.addr.v = &xen_clock->pvti;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);

	/*
	 * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if it fails to
	 * register the secondary time info with Xen or if we migrated to a
	 * host without the necessary flags.  In both of these cases
	 * processes see either a zeroed-out pvti or no PVCLOCK_TSC_STABLE_BIT
	 * set.  Userspace checks the latter and, if it is 0, discards the
	 * data in pvti and falls back to a system call for a reliable
	 * timestamp.
	 */
	if (ret != 0)
		pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
			  ret);

out:
	/* Need pvclock_resume() before using xen_clocksource_read(). */
	pvclock_resume();
	xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}
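
/*
 * Register a second copy of the vcpu time info with the hypervisor and
 * expose it to the vDSO, so userspace can read pvclock-based timestamps
 * without entering the kernel.
 */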
static void xen_setup_vsyscall_time_info(void)
{
	struct vcpu_register_time_memory_area t;
	struct pvclock_vsyscall_time_info *ti;
	int ret;

	ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
	if (!ti)
		return;

	t.addr.v = &ti->pvti;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
	if (ret) {
		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
		free_page((unsigned long)ti);
		return;
	}

	/*
	 * If the primary time info had this bit set, the secondary should
	 * too, since it's the same data in both, just in different memory
	 * regions.  But we still check it in case the hypervisor is buggy.
	 */
	if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
		t.addr.v = NULL;
		ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
					 0, &t);
		if (!ret)
			free_page((unsigned long)ti);

		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
		return;
	}

	xen_clock = ti;
	pvclock_set_pvti_cpu0_va(xen_clock);

	xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}

/*
 * Check if it is possible to safely use the tsc as a clocksource.  This is
 * only true if the hypervisor notifies the guest that its tsc is invariant,
 * the tsc is stable, and the tsc instruction will never be emulated.
 */
static int __init xen_tsc_safe_clocksource(void)
{
	u32 eax, ebx, ecx, edx;

	if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)))
		return 0;

	if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC)))
		return 0;

	if (check_tsc_unstable())
		return 0;

	/* Xen leaf 4, sub-leaf 0 (xen_cpuid_base() + 3, typically leaf 0x40000003) */
	cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);

	return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE;
}

static void __init xen_time_init(void)
{
	struct pvclock_vcpu_time_info *pvti;
	int cpu = smp_processor_id();
	struct timespec64 tp;

	/*
	 * Dom0 is never migrated, so there is no penalty for using the
	 * TSC there.
	 *
	 * If it is possible for the guest to determine that the tsc is a safe
	 * clocksource, then set the xen_clocksource rating below that of the
	 * tsc so that the system prefers tsc instead.
	 */
	if (xen_initial_domain())
		xen_clocksource.rating = 275;
	else if (xen_tsc_safe_clocksource())
		xen_clocksource.rating = 299;

	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&tp);
	do_settimeofday64(&tp);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	/*
	 * Check the primary time info ahead of time for this bit, so the
	 * faster vDSO-based path for the Xen clocksource can be set up.
	 */
	pvti = &__this_cpu_read(xen_vcpu)->time;
	if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
		xen_setup_vsyscall_time_info();
	}

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();

	xen_time_setup_guest();

	if (xen_initial_domain())
		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}

static void __init xen_init_time_common(void)
{
	xen_sched_clock_offset = xen_clocksource_read();
	static_call_update(pv_steal_clock, xen_steal_clock);
	paravirt_set_sched_clock(xen_sched_clock);

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
}

void __init xen_init_time_ops(void)
{
	xen_init_time_common();

	x86_init.timers.timer_init = xen_time_init;
	x86_init.timers.setup_percpu_clockev = x86_init_noop;
	x86_cpuinit.setup_percpu_clockev = x86_init_noop;

	/* Dom0 uses the native method to set the hardware RTC. */
	if (!xen_initial_domain())
		x86_platform.set_wallclock = xen_set_wallclock;
}

#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
	int cpu = smp_processor_id();
	xen_setup_runstate_info(cpu);
	/*
	 * xen_setup_timer(cpu) is not called here: snprintf is bad in
	 * atomic context, so it is done in xen_hvm_cpu_notify instead
	 * (which gets called by smp_init during early bootup and also
	 * during CPU hotplug events).
	 */
	xen_setup_cpu_clockevents();
}

void __init xen_hvm_init_time_ops(void)
{
	static bool hvm_time_initialized;

	if (hvm_time_initialized)
		return;

	/*
	 * A vector callback is needed, otherwise we cannot receive
	 * interrupts on cpu > 0, and at this point we don't know how many
	 * cpus are available.
	 */
	if (!xen_have_vector_callback)
		return;

	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
		pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
		return;
	}

	/*
	 * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
	 * __this_cpu_read(xen_vcpu) is still NULL when a Xen HVM guest
	 * boots on a vcpu >= MAX_VIRT_CPUS (e.g., kexec); accessing
	 * __this_cpu_read(xen_vcpu) via xen_clocksource_read() would panic.
	 *
	 * xen_hvm_init_time_ops() should be called again later, once
	 * __this_cpu_read(xen_vcpu) is available.
	 */
	if (!__this_cpu_read(xen_vcpu)) {
		pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
			xen_vcpu_nr(0));
		return;
	}

	xen_init_time_common();

	x86_init.timers.setup_percpu_clockev = xen_time_init;
	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

	x86_platform.set_wallclock = xen_set_wallclock;

	hvm_time_initialized = true;
}
#endif

/* Kernel parameter to specify Xen timer slop */
static int __init parse_xen_timer_slop(char *ptr)
{
	unsigned long slop = memparse(ptr, NULL);

	xen_timerop_clockevent.min_delta_ns = slop;
	xen_timerop_clockevent.min_delta_ticks = slop;
	xen_vcpuop_clockevent.min_delta_ns = slop;
	xen_vcpuop_clockevent.min_delta_ticks = slop;

	return 0;
}
early_param("xen_timer_slop", parse_xen_timer_slop);
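
/*
 * Usage note: the slop value is parsed with memparse(), so plain
 * nanosecond values and K/M/G suffixes are accepted.  For example,
 * booting with "xen_timer_slop=100000" raises the minimum programmable
 * timer delta to 100 microseconds.
 */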