Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
google
GitHub Repository: google/crosvm
Path: blob/main/devices/src/virtio/pvclock.rs
5394 views
1
// Copyright 2022 The ChromiumOS Authors
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file.
4
5
//! Virtio version of a linux pvclock clocksource.
6
//!
7
//! Driver source is here:
8
//! <https://android.googlesource.com/kernel/common/+/ebaa2c516811825b141de844cee7a38653058ef5/drivers/virtio/virtio_pvclock.c>
9
//!
10
//! # Background
11
//!
12
//! Userland applications often rely on CLOCK_MONOTONIC to be relatively continuous.
13
//! Large jumps can signal problems (e.g., triggering Android watchdogs).
14
//! This assumption breaks down in virtualized environments, where a VM's suspension isn't
15
//! inherently linked to the guest kernel's concept of "suspend".
16
//! Since fixing all userland code is impractical, virtio-pvclock allows the VMM and guest kernel
17
//! to collaborate on emulating the expected clock behavior around suspend/resume.
18
//!
19
//! # How it works
20
//!
21
//! ## Core functions of virtio-pvclock device:
22
//!
23
//! 1. Adjusts hardware clocksource offsets to make the guest clocks appear suspended when the VM is
24
//! suspended.
25
//! - This is achieved through the pvclock mechanism implemented in x86 KVM used by kvm-clock.
26
//! 2. Provides the guest kernel with the duration of VM suspension, allowing the guest to adjust
27
//! its clocks accordingly.
28
//! - Since the offset between the CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained by the guest
29
//! kernel, applying the adjustment is the guest driver's responsibility.
30
//!
31
//! ## Expected guest clock behaviors when virtio-pvclock is enabled
32
//!
33
//! - Monotonicity of CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained.
34
//! - CLOCK_MONOTONIC will not include the time that passes while crosvm is suspended, from its run mode
35
//! perspective.
36
//! - CLOCK_BOOTTIME will be adjusted to include the time that passes while crosvm is suspended.
37
//!
38
//! # Why it is needed
39
//!
40
//! Because the existing solution does not cover some expectations we need.
41
//!
42
//! kvm-clock lets the host manage the offsets of CLOCK_MONOTONIC.
43
//! However, it doesn't address the difference between CLOCK_BOOTTIME and CLOCK_MONOTONIC related
44
//! to host's suspend/resume, as it is designed to maintain the CLOCK_REALTIME in sync mainly.
45
46
#[cfg(target_arch = "aarch64")]
47
use std::arch::asm;
48
use std::collections::BTreeMap;
49
use std::mem::replace;
50
use std::mem::size_of;
51
use std::sync::atomic::AtomicU64;
52
use std::sync::atomic::Ordering;
53
use std::sync::Arc;
54
use std::time::Duration;
55
56
use anyhow::anyhow;
57
use anyhow::bail;
58
use anyhow::Context;
59
use anyhow::Result;
60
use base::error;
61
use base::info;
62
use base::warn;
63
use base::AsRawDescriptor;
64
#[cfg(windows)]
65
use base::CloseNotifier;
66
use base::Error;
67
use base::Event;
68
use base::EventToken;
69
use base::RawDescriptor;
70
use base::ReadNotifier;
71
use base::Tube;
72
use base::WaitContext;
73
use base::WorkerThread;
74
use chrono::DateTime;
75
use chrono::Utc;
76
use data_model::Le32;
77
use data_model::Le64;
78
use serde::Deserialize;
79
use serde::Serialize;
80
use snapshot::AnySnapshot;
81
use vm_control::PvClockCommand;
82
use vm_control::PvClockCommandResponse;
83
use vm_memory::GuestAddress;
84
use vm_memory::GuestMemory;
85
use vm_memory::GuestMemoryError;
86
use zerocopy::FromBytes;
87
use zerocopy::Immutable;
88
use zerocopy::IntoBytes;
89
use zerocopy::KnownLayout;
90
91
use super::copy_config;
92
use super::DeviceType;
93
use super::Interrupt;
94
use super::Queue;
95
use super::VirtioDevice;
96
97
// Pvclock has one virtio queue: set_pvclock_page
const QUEUE_SIZE: u16 = 1;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

// pvclock flag bits (OR'ed into the flags byte of the shared pvclock page)
const PVCLOCK_TSC_STABLE_BIT: u8 = 1;
const PVCLOCK_GUEST_STOPPED: u8 = 2;

// The feature bitmap for virtio pvclock (bit positions, shifted into masks where used)
const VIRTIO_PVCLOCK_F_TSC_STABLE: u64 = 0; // TSC is stable
const VIRTIO_PVCLOCK_F_INJECT_SLEEP: u64 = 1; // Inject sleep for suspend
const VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING: u64 = 2; // Use device clocksource rating

// Status values for a virtio_pvclock request.
const VIRTIO_PVCLOCK_S_OK: u8 = 0;
const VIRTIO_PVCLOCK_S_IOERR: u8 = 1;

// Clocksource rating suggested to the guest via the device config space.
const VIRTIO_PVCLOCK_CLOCKSOURCE_RATING: u32 = 450;
115
116
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn read_clock_counter() -> u64 {
    // `_rdtsc` lives in a per-arch module: `std::arch::x86` on 32-bit and
    // `std::arch::x86_64` on 64-bit. The cfg above allows both targets, so
    // import the matching one; unconditionally using `x86_64` (as before)
    // fails to compile for `target_arch = "x86"`.
    #[cfg(target_arch = "x86")]
    use std::arch::x86::_rdtsc;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::_rdtsc;
    // SAFETY: rdtsc is unprivileged and has no side effects.
    unsafe { _rdtsc() }
}
121
122
#[cfg(target_arch = "aarch64")]
123
fn read_clock_counter() -> u64 {
124
let mut x: u64;
125
// SAFETY: This instruction have no side effect apart from storing the current timestamp counter
126
// into the specified register.
127
unsafe {
128
asm!("mrs {x}, cntvct_el0",
129
x = out(reg) x,
130
);
131
}
132
x
133
}
134
135
/// Calculate a (multiplier, shift) pair for scaled math of clocks.
/// The values are passed on to `pvclock_scale_delta` in the guest kernel and satisfy the following
/// (approximate) equality:
/// `n * scaled_hz / base_hz ~= ((n << shift) * multiplier) >> 32`
/// The logic here is roughly based on `kvm_get_time_scale` (but simplified as we can use u128).
/// # Arguments
/// * `scaled_hz` - Frequency to convert to. When dealing with clocksources, this is NSEC_PER_SEC.
/// * `base_hz` - Frequency to convert from. When dealing with clocksources, this is the counter
///   frequency.
fn freq_scale_shift(scaled_hz: u64, base_hz: u64) -> (u32, i8) {
    assert!(scaled_hz > 0 && base_hz > 0);
    // `multiplier` is treated as a 0.32 fixed-point number (the >> 32 is folded into its
    // definition), i.e. `multiplier = (scaled_hz / base_hz) >> shift` for a matching `shift`.
    //
    // Constraints on `shift`:
    // 1. `multiplier` must stay below 1.0, the representable ceiling of 0.32 fixed-point
    //    (maximum (2^32-1)/2^32).
    // 2. `shift` should be as small as possible, because `pvclock_scale_delta` applies it to the
    //    64-bit TSC value before widening to 128 bits, and large positive shifts shorten the TSC
    //    rollover time.
    //
    // Minimizing `shift` is the same as maximizing `multiplier`, which together with (1) means
    // landing the ratio in [0.5, 1.0). Halve or double the ratio until it falls in that window,
    // compensating via `shift` each step.
    let mut shift: i8 = 0;
    // Work in u128 so the shifts below cannot overflow.
    let mut ratio_num = scaled_hz as u128;
    let mut ratio_den = base_hz as u128;
    while ratio_num >= ratio_den {
        // ratio >= 1.0: halve it. ratio_num is at most 64 bits, so ratio_den
        // grows to at most 65 bits before this loop exits.
        ratio_den <<= 1;
        shift += 1;
    }
    while ratio_den > 2 * ratio_num {
        // ratio < 0.5: double it. This loop only runs if the first one did not,
        // so ratio_den is at most 64 bits and ratio_num ends at most 64 bits.
        ratio_num <<= 1;
        shift -= 1;
    }
    // Both values are bounded by 65 bits, leaving ample headroom for << 32 below.
    assert!(ratio_den < (1u128 << 65) && ratio_num < (1u128 << 65));
    let multiplier: u32 = ((ratio_num << 32) / ratio_den)
        .try_into()
        .expect("should not overflow");
    (multiplier, shift)
}
188
189
// The config structure being exposed to the guest to tell them how much suspend time should be
// injected to the guest's CLOCK_BOOTTIME.
#[derive(Debug, Clone, Copy, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_config {
    // Total duration the VM has been paused while the guest kernel is not in the suspended state
    // (from the power management and timekeeping perspective).
    suspend_time_ns: Le64,
    // Device-suggested rating of the pvclock clocksource.
    clocksource_rating: Le32,
    // Explicit trailing padding; always written as 0.
    padding: u32,
}
202
203
// Request payload carried on the set_pvclock_page queue. The device fills in
// `status` and writes the struct back to the same guest buffer.
#[derive(Debug, Clone, Copy, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_set_pvclock_page_req {
    // Physical address of pvclock page.
    pvclock_page_pa: Le64,
    // Current system time.
    system_time: Le64,
    // Current tsc value.
    tsc_timestamp: Le64,
    // Status of this request, one of VIRTIO_PVCLOCK_S_*.
    status: u8,
    // Pad the struct to an 8-byte multiple.
    padding: [u8; 7],
}
217
218
// Data structure for interacting with pvclock shared memory.
struct PvclockSharedData {
    // Guest memory used to read/write the shared pvclock page.
    mem: GuestMemory,
    // Guest addresses of the individual fields we touch; all are computed as
    // fixed offsets from the page base in `new` (see pvclock_vcpu_time_info).
    seqlock_addr: GuestAddress,
    tsc_suspended_delta_addr: GuestAddress,
    tsc_frequency_multiplier_addr: GuestAddress,
    tsc_frequency_shift_addr: GuestAddress,
    flags_addr: GuestAddress,
}
227
228
impl PvclockSharedData {
229
pub fn new(mem: GuestMemory, addr: GuestAddress) -> Self {
230
PvclockSharedData {
231
mem,
232
// The addresses of the various fields that we need to modify are relative to the
233
// base of the pvclock page. For reference, see the pvclock_vcpu_time_info struct.
234
seqlock_addr: addr,
235
tsc_suspended_delta_addr: addr.unchecked_add(8),
236
tsc_frequency_multiplier_addr: addr.unchecked_add(24),
237
tsc_frequency_shift_addr: addr.unchecked_add(28),
238
flags_addr: addr.unchecked_add(29),
239
}
240
}
241
242
/// Only the seqlock_addr is needed to re-create this struct at restore
243
/// time, so that is all our snapshot contains.
244
fn snapshot(&self) -> GuestAddress {
245
self.seqlock_addr
246
}
247
248
/// Set all fields to zero.
249
pub fn zero_fill(&mut self) -> Result<()> {
250
// The pvclock data structure is 32 bytes long, so we write 32 bytes of 0s
251
self.mem
252
.write_all_at_addr(&[0u8; 32], self.seqlock_addr)
253
.context("failed to zero fill the pvclock shared data")
254
}
255
256
pub fn increment_seqlock(&mut self) -> Result<()> {
257
// TODO (b/264931437): reads and writes using read/write_obj_from/at_addr are not
258
// guaranteed to be atomic. Although this should not be a problem for the seqlock
259
// or the other fields in the pvclock shared data (whch are protected via the seqlock)
260
// we might want to update these calls to be as atomic as possible if/when we have
261
// the ability to do so, just as a general cleanup and to be consistent.
262
let value = self
263
.mem
264
.read_obj_from_addr::<u32>(self.seqlock_addr)
265
.context("failed to read seqlock value")?;
266
self.mem
267
.write_obj_at_addr(value.wrapping_add(1), self.seqlock_addr)
268
.context("failed to write seqlock value")
269
}
270
271
pub fn set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()> {
272
self.mem
273
.write_obj_at_addr(delta, self.tsc_suspended_delta_addr)
274
.context("failed to write tsc suspended delta")
275
}
276
277
pub fn set_tsc_frequency(&mut self, frequency: u64) -> Result<()> {
278
let (multiplier, shift): (u32, i8) = freq_scale_shift(1_000_000_000, frequency);
279
280
self.mem
281
.write_obj_at_addr(multiplier, self.tsc_frequency_multiplier_addr)
282
.context("failed to write tsc frequency mlutiplier")?;
283
self.mem
284
.write_obj_at_addr(shift, self.tsc_frequency_shift_addr)
285
.context("failed to write tsc frequency shift")
286
}
287
288
pub fn enable_pvclock_flags(&mut self, flags: u8) -> Result<()> {
289
let value = self
290
.mem
291
.read_obj_from_addr::<u8>(self.flags_addr)
292
.context("failed to read flags")?;
293
self.mem
294
.write_obj_at_addr(value | flags, self.flags_addr)
295
.context("failed to write flags")
296
}
297
}
298
299
/// Serializable part of the [PvClock] struct which will be used by the virtio_snapshot / restore.
#[derive(Serialize, Deserialize)]
struct PvClockState {
    // Guest TSC frequency (Hz) used to derive the pvclock scaling factors.
    tsc_frequency: u64,
    /// If the device is sleeping, a [PvClockWorkerSnapshot] that can re-create the worker
    /// will be stored here. (We can't just store the worker itself as it contains an object
    /// tree with references to [GuestMemory].)
    paused_main_worker: Option<PvClockWorkerSnapshot>,
    /// The total time the vm has been suspended, this is in an `Arc<AtomicU64>` because it's set
    /// by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_suspend_ns: Arc<AtomicU64>,
    // Feature bits offered to the guest.
    features: u64,
    // Feature bits acknowledged by the guest driver.
    acked_features: u64,
}
313
314
/// An enum to keep dynamic state of pvclock workers in a type safe manner.
/// Transitions are driven by the start_*/stop_*/switch_to_* methods on [PvClock].
enum PvClockWorkerState {
    /// Idle means no worker is running.
    /// This tube is for communicating with this device from the crosvm threads.
    Idle(Tube),
    /// A stub worker to respond pvclock commands when the device is not activated yet.
    Stub(WorkerThread<StubWorkerReturn>),
    /// A main worker to respond pvclock commands while the device is active.
    Main(WorkerThread<MainWorkerReturn>),
    /// None is used only for handling transitional state between the states above.
    None,
}
326
327
/// A struct that represents virtio-pvclock device.
pub struct PvClock {
    // Serializable device state (survives snapshot/restore).
    state: PvClockState,
    // Which worker thread (if any) currently owns the command tube.
    worker_state: PvClockWorkerState,
}
332
333
impl PvClock {
334
pub fn new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self {
335
let state = PvClockState {
336
tsc_frequency,
337
paused_main_worker: None,
338
total_suspend_ns: Arc::new(AtomicU64::new(0)),
339
features: base_features
340
| 1 << VIRTIO_PVCLOCK_F_TSC_STABLE
341
| 1 << VIRTIO_PVCLOCK_F_INJECT_SLEEP
342
| 1 << VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING,
343
acked_features: 0,
344
};
345
PvClock {
346
state,
347
worker_state: PvClockWorkerState::Idle(suspend_tube),
348
}
349
}
350
351
fn get_config(&self) -> virtio_pvclock_config {
352
virtio_pvclock_config {
353
suspend_time_ns: self.state.total_suspend_ns.load(Ordering::SeqCst).into(),
354
clocksource_rating: VIRTIO_PVCLOCK_CLOCKSOURCE_RATING.into(),
355
padding: 0,
356
}
357
}
358
359
/// Use switch_to_*_worker unless needed to keep the state transition consistent
360
fn start_main_worker(
361
&mut self,
362
interrupt: Interrupt,
363
pvclock_worker: PvClockWorker,
364
mut queues: BTreeMap<usize, Queue>,
365
) -> anyhow::Result<()> {
366
let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
367
if let PvClockWorkerState::Idle(suspend_tube) = last_state {
368
if queues.len() != QUEUE_SIZES.len() {
369
self.worker_state = PvClockWorkerState::Idle(suspend_tube);
370
return Err(anyhow!(
371
"expected {} queues, got {}",
372
QUEUE_SIZES.len(),
373
queues.len()
374
));
375
}
376
let set_pvclock_page_queue = queues.remove(&0).unwrap();
377
self.worker_state = PvClockWorkerState::Main(WorkerThread::start(
378
"virtio_pvclock".to_string(),
379
move |kill_evt| {
380
run_main_worker(
381
pvclock_worker,
382
set_pvclock_page_queue,
383
suspend_tube,
384
interrupt,
385
kill_evt,
386
)
387
},
388
));
389
} else {
390
panic!("Invalid state transition");
391
}
392
Ok(())
393
}
394
395
/// Use switch_to_*_worker unless needed to keep the state transition consistent
396
fn start_stub_worker(&mut self) {
397
let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
398
self.worker_state = if let PvClockWorkerState::Idle(suspend_tube) = last_state {
399
PvClockWorkerState::Stub(WorkerThread::start(
400
"virtio_pvclock_stub".to_string(),
401
move |kill_evt| run_stub_worker(suspend_tube, kill_evt),
402
))
403
} else {
404
panic!("Invalid state transition");
405
};
406
}
407
408
/// Use switch_to_*_worker unless needed to keep the state transition consistent
409
fn stop_stub_worker(&mut self) {
410
let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
411
self.worker_state = if let PvClockWorkerState::Stub(stub_worker_thread) = last_state {
412
let stub_worker_ret = stub_worker_thread.stop();
413
PvClockWorkerState::Idle(stub_worker_ret.suspend_tube)
414
} else {
415
panic!("Invalid state transition");
416
}
417
}
418
419
/// Use switch_to_*_worker unless needed to keep the state transition consistent
420
fn stop_main_worker(&mut self) {
421
let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
422
if let PvClockWorkerState::Main(main_worker_thread) = last_state {
423
let main_worker_ret = main_worker_thread.stop();
424
self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
425
let mut queues = BTreeMap::new();
426
queues.insert(0, main_worker_ret.set_pvclock_page_queue);
427
self.state.paused_main_worker = Some(main_worker_ret.worker.into());
428
} else {
429
panic!("Invalid state transition");
430
}
431
}
432
433
fn switch_to_stub_worker(&mut self) {
434
self.stop_main_worker();
435
self.start_stub_worker();
436
}
437
438
fn switch_to_main_worker(
439
&mut self,
440
interrupt: Interrupt,
441
pvclock_worker: PvClockWorker,
442
queues: BTreeMap<usize, Queue>,
443
) -> anyhow::Result<()> {
444
self.stop_stub_worker();
445
self.start_main_worker(interrupt, pvclock_worker, queues)
446
}
447
}
448
449
/// Represents a moment in time including the TSC counter value at that time.
#[derive(Serialize, Deserialize, Clone)]
struct PvclockInstant {
    // Wall-clock (UTC) time at the instant of capture.
    time: DateTime<Utc>,
    // Hardware counter value (rdtsc / cntvct_el0) captured at the same instant.
    tsc_value: u64,
}
455
456
/// The unique data retained by [PvClockWorker] which can be used to re-create
/// an identical worker.
#[derive(Serialize, Deserialize, Clone)]
struct PvClockWorkerSnapshot {
    // Set iff the worker was suspended and has not yet resumed.
    suspend_time: Option<PvclockInstant>,
    // Accumulated counter delta over all completed suspensions (may wrap).
    total_suspend_tsc_delta: u64,
    // Base guest address of the pvclock page, if the guest has configured one.
    pvclock_shared_data_base_address: Option<GuestAddress>,
}
464
465
impl From<PvClockWorker> for PvClockWorkerSnapshot {
466
fn from(worker: PvClockWorker) -> Self {
467
PvClockWorkerSnapshot {
468
suspend_time: worker.suspend_time,
469
total_suspend_tsc_delta: worker.total_suspend_tsc_delta,
470
pvclock_shared_data_base_address: worker
471
.pvclock_shared_data
472
.map(|pvclock| pvclock.snapshot()),
473
}
474
}
475
}
476
477
/// Worker struct for the virtio-pvclock device.
///
/// Handles virtio requests, storing information about suspend/resume, adjusting the
/// pvclock data in shared memory, and injecting suspend durations via config
/// changes.
struct PvClockWorker {
    // Guest TSC frequency (Hz) used to derive the pvclock scaling factors.
    tsc_frequency: u64,
    // The moment the last suspend occurred.
    suspend_time: Option<PvclockInstant>,
    // The total time the vm has been suspended, this is in an `Arc<AtomicU64>` because it's set
    // by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_injected_ns: Arc<AtomicU64>,
    // The total change in the TSC value over suspensions.
    total_suspend_tsc_delta: u64,
    // Pvclock shared data; None until the guest issues set_pvclock_page.
    pvclock_shared_data: Option<PvclockSharedData>,
    // Guest memory, used for the pvclock page and queue request buffers.
    mem: GuestMemory,
}
495
496
impl PvClockWorker {
497
pub fn new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self {
498
PvClockWorker {
499
tsc_frequency,
500
suspend_time: None,
501
total_injected_ns,
502
total_suspend_tsc_delta: 0,
503
pvclock_shared_data: None,
504
mem,
505
}
506
}
507
508
fn from_snapshot(
509
tsc_frequency: u64,
510
total_injected_ns: Arc<AtomicU64>,
511
snap: PvClockWorkerSnapshot,
512
mem: GuestMemory,
513
) -> Self {
514
PvClockWorker {
515
tsc_frequency,
516
suspend_time: snap.suspend_time,
517
total_injected_ns,
518
total_suspend_tsc_delta: snap.total_suspend_tsc_delta,
519
pvclock_shared_data: snap
520
.pvclock_shared_data_base_address
521
.map(|addr| PvclockSharedData::new(mem.clone(), addr)),
522
mem,
523
}
524
}
525
526
/// Initialize the pvclock for initial boot. We assume that the systemtime of 0 corresponds
527
/// to the tsc time of 0, so we do not set these. We set the tsc frequency based on the vcpu
528
/// tsc frequency and we set PVCLOCK_TSC_STABLE_BIT in flags to tell the guest that it's
529
/// safe to use vcpu0's pvclock page for use by the vdso. The order of writing the different
530
/// fields doesn't matter at this point, but does matter when updating.
531
fn set_pvclock_page(&mut self, addr: u64) -> Result<()> {
532
if self.pvclock_shared_data.is_some() {
533
return Err(Error::new(libc::EALREADY)).context("pvclock page already set");
534
}
535
536
let mut shared_data = PvclockSharedData::new(self.mem.clone(), GuestAddress(addr));
537
538
// set all fields to 0 first
539
shared_data.zero_fill()?;
540
541
shared_data.set_tsc_frequency(self.tsc_frequency)?;
542
shared_data.enable_pvclock_flags(PVCLOCK_TSC_STABLE_BIT)?;
543
544
self.pvclock_shared_data = Some(shared_data);
545
Ok(())
546
}
547
548
pub fn suspend(&mut self) {
549
if self.suspend_time.is_some() {
550
warn!("Suspend time already set, ignoring new suspend time");
551
return;
552
}
553
self.suspend_time = Some(PvclockInstant {
554
time: Utc::now(),
555
tsc_value: read_clock_counter(),
556
});
557
}
558
559
pub fn resume(&mut self) -> Result<u64> {
560
// First, increment the sequence lock by 1 before writing to the pvclock page.
561
self.increment_pvclock_seqlock()?;
562
563
// The guest makes sure there are memory barriers in between reads of the seqlock and other
564
// fields, we should make sure there are memory barriers in between writes of seqlock and
565
// writes to other fields.
566
std::sync::atomic::fence(Ordering::SeqCst);
567
568
// Set the guest_stopped_bit and tsc suspended delta in pvclock struct. We only need to set
569
// the bit, the guest will unset it once the guest has handled the stoppage.
570
// We get the result here because we want to call increment_pvclock_seqlock regardless of
571
// the result of these calls.
572
let result = self
573
.set_guest_stopped_bit()
574
.and_then(|_| self.set_suspended_time());
575
576
// The guest makes sure there are memory barriers in between reads of the seqlock and other
577
// fields, we should make sure there are memory barriers in between writes of seqlock and
578
// writes to other fields.
579
std::sync::atomic::fence(Ordering::SeqCst);
580
581
// Do a final increment once changes are done.
582
self.increment_pvclock_seqlock()?;
583
584
result
585
}
586
587
fn get_suspended_duration(suspend_time: &PvclockInstant) -> Duration {
588
match Utc::now().signed_duration_since(suspend_time.time).to_std() {
589
Ok(duration) => duration,
590
Err(e) => {
591
error!(
592
"pvclock found suspend time in the future (was the host \
593
clock adjusted?). Guest boot/realtime clock may now be \
594
incorrect. Details: {}",
595
e
596
);
597
Duration::ZERO
598
}
599
}
600
}
601
602
fn set_suspended_time(&mut self) -> Result<u64> {
603
let (this_suspend_duration, this_suspend_tsc_delta) =
604
if let Some(suspend_time) = self.suspend_time.take() {
605
(
606
Self::get_suspended_duration(&suspend_time),
607
// NB: This calculation may wrap around, as TSC can be reset to zero when
608
// the device has resumed from the "deep" suspend state (it may not happen for
609
// s2idle cases). It also happens when the tsc value itself wraps.
610
read_clock_counter().wrapping_sub(suspend_time.tsc_value),
611
)
612
} else {
613
return Err(Error::new(libc::ENOTSUP))
614
.context("Cannot set suspend time because suspend was never called");
615
};
616
617
// update the total tsc delta during all suspends
618
// NB: This calculation may wrap around, as the suspend time can be bigger than u64 range.
619
self.total_suspend_tsc_delta = self
620
.total_suspend_tsc_delta
621
.wrapping_add(this_suspend_tsc_delta);
622
623
// save tsc_suspended_delta to shared memory
624
self.pvclock_shared_data
625
.as_mut()
626
.ok_or(
627
anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
628
)?
629
.set_tsc_suspended_delta(self.total_suspend_tsc_delta)?;
630
631
info!(
632
"set total suspend tsc delta to {}",
633
self.total_suspend_tsc_delta
634
);
635
636
// update total suspend ns
637
self.total_injected_ns
638
.fetch_add(this_suspend_duration.as_nanos() as u64, Ordering::SeqCst);
639
640
Ok(self.total_suspend_tsc_delta)
641
}
642
643
fn increment_pvclock_seqlock(&mut self) -> Result<()> {
644
self.pvclock_shared_data
645
.as_mut()
646
.ok_or(
647
anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
648
)?
649
.increment_seqlock()
650
}
651
652
fn set_guest_stopped_bit(&mut self) -> Result<()> {
653
self.pvclock_shared_data
654
.as_mut()
655
.ok_or(
656
anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
657
)?
658
.enable_pvclock_flags(PVCLOCK_GUEST_STOPPED)
659
}
660
}
661
662
/// Maps an [anyhow::Error] chain to the most specific [base::Error] it wraps,
/// for reporting back over the control tube as an errno-style code.
/// Falls back to EINVAL for non-errno GuestMemoryErrors and EFAULT when no
/// recognizable cause is found.
fn pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error {
    // Walk the cause chain outermost-first and return the first match.
    for cause in error.chain() {
        if let Some(e) = cause.downcast_ref::<base::Error>() {
            return *e;
        }

        if let Some(e) = cause.downcast_ref::<GuestMemoryError>() {
            return match e {
                // Two kinds of GuestMemoryError contain base::Error
                GuestMemoryError::MemoryAddSealsFailed(e) => *e,
                GuestMemoryError::MemoryCreationFailed(e) => *e,
                // Otherwise return EINVAL
                _ => Error::new(libc::EINVAL),
            };
        }
    }
    // Unknown base error
    Error::new(libc::EFAULT)
}
681
682
// Values handed back when a stub worker thread is stopped.
struct StubWorkerReturn {
    // Returned so the device can pass the tube on to the next worker.
    suspend_tube: Tube,
}
685
686
/// A stub worker to respond any requests when the device is inactive.
687
fn run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn {
688
#[derive(EventToken, Debug)]
689
enum Token {
690
SomePvClockRequest,
691
Kill,
692
}
693
let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
694
(suspend_tube.get_read_notifier(), Token::SomePvClockRequest),
695
// TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
696
// implemented for Tube.
697
#[cfg(windows)]
698
(suspend_tube.get_close_notifier(), Token::Kill),
699
(&kill_evt, Token::Kill),
700
]) {
701
Ok(wait_ctx) => wait_ctx,
702
Err(e) => {
703
error!("failed creating WaitContext: {}", e);
704
return StubWorkerReturn { suspend_tube };
705
}
706
};
707
'wait: loop {
708
let events = match wait_ctx.wait() {
709
Ok(v) => v,
710
Err(e) => {
711
error!("failed polling for events: {}", e);
712
break;
713
}
714
};
715
for event in events.iter().filter(|e| e.is_readable) {
716
match event.token {
717
Token::SomePvClockRequest => {
718
match suspend_tube.recv::<PvClockCommand>() {
719
Ok(req) => req,
720
Err(e) => {
721
error!("failed to receive request: {}", e);
722
continue;
723
}
724
};
725
if let Err(e) = suspend_tube.send(&PvClockCommandResponse::DeviceInactive) {
726
error!("error sending PvClockCommandResponse: {}", e);
727
}
728
}
729
Token::Kill => {
730
break 'wait;
731
}
732
}
733
}
734
}
735
StubWorkerReturn { suspend_tube }
736
}
737
738
// Values handed back when the main worker thread is stopped, letting the
// device pause (snapshot the worker, keep the queue) or reset cleanly.
struct MainWorkerReturn {
    // The worker itself; converted into a snapshot when pausing.
    worker: PvClockWorker,
    // The virtio queue, returned to the transport on virtio_sleep.
    set_pvclock_page_queue: Queue,
    // Returned so the device can pass the tube on to the next worker.
    suspend_tube: Tube,
}
743
744
// TODO(b/237300012): asyncify this device.
/// A worker to process PvClockCommand requests
fn run_main_worker(
    mut worker: PvClockWorker,
    mut set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
    interrupt: Interrupt,
    kill_evt: Event,
) -> MainWorkerReturn {
    #[derive(EventToken)]
    enum Token {
        SetPvClockPageQueue,
        SuspendResume,
        Kill,
    }

    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (set_pvclock_page_queue.event(), Token::SetPvClockPageQueue),
        (suspend_tube.get_read_notifier(), Token::SuspendResume),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(pc) => pc,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            // Can't wait for events; return everything so the device can
            // still be stopped cleanly.
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    };

    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SetPvClockPageQueue => {
                    // Consume the queue event notification before popping.
                    let _ = set_pvclock_page_queue.event().wait();
                    let desc_chain = match set_pvclock_page_queue.pop() {
                        Some(desc_chain) => desc_chain,
                        None => {
                            // Spurious doorbells from the driver are permitted
                            // by the virtio spec (v1.3; section 2.9).
                            continue;
                        }
                    };

                    // This device does not follow the virtio spec requirements for device-readable
                    // vs. device-writable descriptors, so we can't use `Reader`/`Writer`. Pick the
                    // first descriptor from the chain and assume the whole req structure is
                    // contained within it.
                    let desc = desc_chain
                        .reader
                        .get_remaining_regions()
                        .chain(desc_chain.writer.get_remaining_regions())
                        .next()
                        .unwrap();

                    let len = if desc.len < size_of::<virtio_pvclock_set_pvclock_page_req>() {
                        error!("pvclock descriptor too short");
                        // Report zero bytes written so the driver can see the failure.
                        0
                    } else {
                        let addr = GuestAddress(desc.offset);
                        let mut req: virtio_pvclock_set_pvclock_page_req = match worker
                            .mem
                            .read_obj_from_addr(addr)
                        {
                            Ok(req) => req,
                            Err(e) => {
                                error!("failed to read request from set_pvclock_page queue: {}", e);
                                continue;
                            }
                        };

                        req.status = match worker.set_pvclock_page(req.pvclock_page_pa.into()) {
                            Err(e) => {
                                error!("failed to set pvclock page: {:#}", e);
                                VIRTIO_PVCLOCK_S_IOERR
                            }
                            Ok(_) => VIRTIO_PVCLOCK_S_OK,
                        };

                        // The status is reported by writing the whole request
                        // structure back into the same guest buffer.
                        if let Err(e) = worker.mem.write_obj_at_addr(req, addr) {
                            error!("failed to write set_pvclock_page status: {}", e);
                            continue;
                        }

                        desc.len as u32
                    };

                    set_pvclock_page_queue.add_used_with_bytes_written(desc_chain, len);
                    set_pvclock_page_queue.trigger_interrupt();
                }
                Token::SuspendResume => {
                    let req = match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };

                    let resp = match req {
                        PvClockCommand::Suspend => {
                            worker.suspend();
                            PvClockCommandResponse::Ok
                        }
                        PvClockCommand::Resume => {
                            match worker.resume() {
                                Ok(total_suspended_ticks) => {
                                    // signal to the driver that the total_suspend_ns has changed
                                    interrupt.signal_config_changed();
                                    PvClockCommandResponse::Resumed {
                                        total_suspended_ticks,
                                    }
                                }
                                Err(e) => {
                                    error!("Failed to resume pvclock: {:#}", e);
                                    PvClockCommandResponse::Err(pvclock_response_error_from_anyhow(
                                        e,
                                    ))
                                }
                            }
                        }
                    };

                    if let Err(e) = suspend_tube.send(&resp) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }

    MainWorkerReturn {
        suspend_tube,
        set_pvclock_page_queue,
        worker,
    }
}
898
899
impl VirtioDevice for PvClock {
900
fn keep_rds(&self) -> Vec<RawDescriptor> {
901
if let PvClockWorkerState::Idle(suspend_tube) = &self.worker_state {
902
vec![suspend_tube.as_raw_descriptor()]
903
} else {
904
Vec::new()
905
}
906
}
907
908
// Identifies this device to the virtio transport.
fn device_type(&self) -> DeviceType {
    DeviceType::Pvclock
}
911
912
// A single queue (set_pvclock_page) of size 1.
fn queue_max_sizes(&self) -> &[u16] {
    QUEUE_SIZES
}
915
916
// Feature bits offered to the guest (base features plus the
// VIRTIO_PVCLOCK_F_* bits set in `PvClock::new`).
fn features(&self) -> u64 {
    self.state.features
}
919
920
fn ack_features(&mut self, mut value: u64) {
    // Mask off any bits we never offered, warning so the driver bug is visible.
    let unoffered = value & !self.features();
    if unoffered != 0 {
        warn!("virtio-pvclock got unknown feature ack {:x}", value);
        value &= self.features();
    }
    self.state.acked_features |= value;
}
927
928
// Copies the requested window of the device config structure into `data`.
fn read_config(&self, offset: u64, data: &mut [u8]) {
    copy_config(data, 0, self.get_config().as_bytes(), offset);
}
931
932
// The config space is read-only from the guest's point of view; log and
// ignore any write attempt.
fn write_config(&mut self, offset: u64, data: &[u8]) {
    // Pvclock device doesn't expect a guest write to config
    warn!(
        "Unexpected write to virtio-pvclock config at offset {}: {:?}",
        offset, data
    );
}
939
940
fn activate(
941
&mut self,
942
mem: GuestMemory,
943
interrupt: Interrupt,
944
queues: BTreeMap<usize, Queue>,
945
) -> anyhow::Result<()> {
946
let tsc_frequency = self.state.tsc_frequency;
947
let total_suspend_ns = self.state.total_suspend_ns.clone();
948
let worker = PvClockWorker::new(tsc_frequency, total_suspend_ns, mem);
949
self.switch_to_main_worker(interrupt, worker, queues)
950
}
951
952
fn reset(&mut self) -> Result<()> {
953
self.switch_to_stub_worker();
954
Ok(())
955
}
956
957
fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
958
let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
959
match last_state {
960
PvClockWorkerState::Main(main_worker_thread) => {
961
let main_worker_ret = main_worker_thread.stop();
962
let mut queues = BTreeMap::new();
963
queues.insert(0, main_worker_ret.set_pvclock_page_queue);
964
self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
965
self.state.paused_main_worker = Some(main_worker_ret.worker.into());
966
Ok(Some(queues))
967
}
968
PvClockWorkerState::Stub(stub_worker_thread) => {
969
let stub_ret = stub_worker_thread.stop();
970
self.worker_state = PvClockWorkerState::Idle(stub_ret.suspend_tube);
971
Ok(None)
972
}
973
PvClockWorkerState::Idle(suspend_tube) => {
974
self.worker_state = PvClockWorkerState::Idle(suspend_tube);
975
Ok(None)
976
}
977
PvClockWorkerState::None => panic!("invalid state transition"),
978
}
979
}
980
981
fn virtio_wake(
982
&mut self,
983
queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
984
) -> anyhow::Result<()> {
985
if let Some((mem, interrupt, queues)) = queues_state {
986
let worker_snap = self
987
.state
988
.paused_main_worker
989
.take()
990
.ok_or(anyhow!("a sleeping pvclock must have a paused worker"))?;
991
let worker = PvClockWorker::from_snapshot(
992
self.state.tsc_frequency,
993
self.state.total_suspend_ns.clone(),
994
worker_snap,
995
mem,
996
);
997
// Use unchecked as no worker is running at this point
998
self.start_main_worker(interrupt, worker, queues)?;
999
} else {
1000
// If the device wasn't activated, we should bring up the stub worker since that's
1001
// what is supposed to be running for an un-activated device.
1002
self.start_stub_worker();
1003
}
1004
Ok(())
1005
}
1006
1007
fn virtio_snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
1008
AnySnapshot::to_any(&self.state).context("failed to serialize PvClockState")
1009
}
1010
1011
fn virtio_restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
1012
let state: PvClockState = AnySnapshot::from_any(data).context("error deserializing")?;
1013
if state.features != self.features() {
1014
bail!(
1015
"expected virtio_features to match, but they did not. Live: {:?}, snapshot {:?}",
1016
self.features(),
1017
state.features,
1018
);
1019
}
1020
// TODO(b/291346907): we assume that the TSC frequency has NOT changed
1021
// since the snapshot was made. Assuming we have not moved machines,
1022
// this is a reasonable assumption. We don't verify the frequency
1023
// because TSC calibration noisy.
1024
self.state = state;
1025
Ok(())
1026
}
1027
1028
fn on_device_sandboxed(&mut self) {
1029
self.start_stub_worker();
1030
}
1031
}
1032
1033
#[cfg(test)]
mod tests {
    use super::*;
    use crate::virtio::QueueConfig;

    const TEST_QUEUE_SIZE: u16 = 2048;

    fn make_interrupt() -> Interrupt {
        Interrupt::new_for_test()
    }

    /// Builds a pvclock device with its stub worker already running, plus the
    /// host end of the suspend tube.
    fn create_pvclock_device() -> (Tube, PvClock) {
        let (tube_for_host, tube_for_device) = Tube::pair().unwrap();
        let mut device = PvClock::new(0, 1e9 as u64, tube_for_device);

        // Mimic device initialization so the stub thread comes up. In
        // production, on_device_sandboxed fires after sandboxing (or at the
        // equivalent point when the sandbox is disabled), because minijail
        // forbids spawning threads before the sandbox is in place.
        device.on_device_sandboxed();

        (tube_for_host, device)
    }

    /// Activates a device and immediately puts it to sleep, returning the
    /// sleeping device, its guest memory, and the host suspend tube.
    fn create_sleeping_device() -> (PvClock, GuestMemory, Tube) {
        let (host_tube, mut device) = create_pvclock_device();

        // Nothing in these tests actually drives the queue, so a minimally
        // configured one is sufficient.
        let mut queue_cfg = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        queue_cfg.set_ready(true);
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap();
        let interrupt = make_interrupt();
        let queue = queue_cfg
            .activate(&mem, Event::new().unwrap(), interrupt.clone())
            .unwrap();
        device
            .activate(mem.clone(), interrupt, BTreeMap::from([(0, queue)]))
            .expect("activate should succeed");

        let queues = device
            .virtio_sleep()
            .expect("sleep should succeed")
            .expect("sleep should yield queues");
        assert_eq!(queues.len(), 1);
        let returned_queue = queues.get(&0).expect("queue must be present");
        assert_eq!(returned_queue.size(), TEST_QUEUE_SIZE);
        assert!(device.state.paused_main_worker.is_some());
        (device, mem, host_tube)
    }

    /// Wakes a sleeping device and checks the paused worker was consumed.
    fn assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory) {
        // A freshly-made queue is fine here: the device never touches it in
        // these tests.
        let interrupt = make_interrupt();
        let mut queue_cfg = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        queue_cfg.set_ready(true);
        let queue = queue_cfg
            .activate(mem, Event::new().unwrap(), interrupt.clone())
            .unwrap();
        let queues_state = (mem.clone(), interrupt, BTreeMap::from([(0, queue)]));
        pvclock_device
            .virtio_wake(Some(queues_state))
            .expect("wake should succeed");
        assert!(pvclock_device.state.paused_main_worker.is_none());
    }

    #[test]
    fn test_command_response_when_inactive() {
        let (host_tube, _pvclock_device) = create_pvclock_device();
        let send_result = host_tube.send(&PvClockCommand::Suspend);
        assert!(send_result.is_ok());
        let response = host_tube.recv::<PvClockCommandResponse>();
        assert!(matches!(response, Ok(PvClockCommandResponse::DeviceInactive)));
    }

    #[test]
    fn test_sleep_wake_smoke() {
        let (mut device, mem, _tube) = create_sleeping_device();
        assert_wake_successful(&mut device, &mem);
    }

    #[test]
    fn test_save_restore() {
        let (mut device, mem, _tube) = create_sleeping_device();
        const TEST_SUSPEND_NS: u64 = 9999;

        // Plant a recognizable value so we can tell later that restore really
        // brought the state back.
        device
            .state
            .total_suspend_ns
            .store(TEST_SUSPEND_NS, Ordering::SeqCst);

        let snap = device.virtio_snapshot().unwrap();
        device.state.total_suspend_ns.store(0, Ordering::SeqCst);
        device.virtio_restore(snap).unwrap();
        assert_eq!(
            device.state.total_suspend_ns.load(Ordering::SeqCst),
            TEST_SUSPEND_NS
        );

        assert_wake_successful(&mut device, &mem);
    }

    /// A simplified clone of `pvclock_scale_delta` from the Linux kernel,
    /// emulating the guest-side TSC-to-ktime conversion.
    fn pvclock_scale_tsc(mult: u32, shift: i8, tsc: u64) -> u64 {
        let shifted = match shift {
            s if s < 0 => tsc >> -s,
            s => tsc << s,
        };
        let product = shifted as u128 * mult as u128;
        (product >> 32).try_into().expect("should not overflow")
    }

    /// Checks that `freq_scale_shift`'s (mult, shift) pair scales `input`
    /// ticks at `f` Hz to 1GHz within a small relative error.
    fn check_freq_scale(f: u64, input: u64) {
        // Only `scaled_hz` = 1GHz is tested because that is the only value the
        // code base uses.
        let (mult, shift) = freq_scale_shift(1_000_000_000, f);

        let scaled = pvclock_scale_tsc(mult, shift, input);

        // Target a relative error <= 1e-8. TSC values can be huge, so this is
        // only a sanity check of the math, deliberately loose about rounding.
        let scale = |hz: u128| (input as u128 * hz / f as u128) as u64;
        let expected = scale(1_000_000_000);
        let lower = scale(999_999_990);
        let upper = scale(1_000_000_010);
        assert!(
            (lower..=upper).contains(&scaled),
            "{scaled} should be close to {expected} (base_hz={f}, mult={mult}, shift={shift})"
        );
    }

    #[test]
    fn test_freq_scale_shift_accuracy() {
        // Formula sanity check: scaling `f` ticks of an `f` Hz clock to 1GHz
        // should land on `base_hz` itself.
        for n in 1..=50u64 {
            let f = n * 100_000_000;
            check_freq_scale(f, f);
        }
    }

    #[test]
    fn test_freq_scale_shift_overflow_high_freq() {
        // For scale factors < 1.0, the maximum possible TSC value must convert
        // without overflow: the guest clock breaks if the computed ktime ever
        // goes backwards (b/342168920).
        for n in 11..=50u64 {
            check_freq_scale(n * 100_000_000, u64::MAX);
        }
    }

    #[test]
    fn test_freq_scale_shift_overflow_low_freq() {
        fn prev_power_of_two(n: u64) -> u64 {
            assert_ne!(n, 0);
            1 << (63 - n.leading_zeros())
        }
        // Same idea as the high-frequency test but for scale factors >= 1.0.
        // Those factors get rounded up first and then multiplied down, so the
        // largest safely-convertible input is correspondingly smaller; the
        // tested maximum reflects that limit.
        for n in 1..=10u64 {
            let f = n * 100_000_000;
            // Truncation is fine since prev_power_of_two rounds down anyway.
            let factor = 1_000_000_000 / f;
            // Equivalent to exp2(floor(log2(factor)) + 1).
            let target = u64::MAX / (prev_power_of_two(factor) << 1);
            check_freq_scale(f, target);
        }
    }
}
1221
1222