GitHub Repository: google/crosvm
Path: blob/main/devices/src/vmwdt.rs
// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! vmwdt is a virtual watchdog memory mapped device which detects stalls
//! on the vCPUs and resets the guest when no 'pet' events are received.
//! <https://docs.google.com/document/d/1DYmk2roxlwHZsOfcJi8xDMdWOHAmomvs2SDh7KPud3Y/edit?usp=sharing&resourcekey=0-oSNabc-t040a1q0K4cyI8Q>
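//!
//! Guest programming model, as implemented by the `write` handler below: each
//! vCPU owns a `VMWDT_REG_LEN`-byte register window, so a register's bus
//! offset is `cpu_index * VMWDT_REG_LEN + reg`. A driver first programs the
//! clock frequency (`VMWDT_REG_CLOCK_FREQ_HZ`), then the timeout in clock
//! ticks (`VMWDT_REG_LOAD_CNT`), and finally writes a non-zero value to
//! `VMWDT_REG_STATUS` to start the watchdog; re-writing `VMWDT_REG_LOAD_CNT`
//! serves as the 'pet' event.
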
use std::collections::BTreeMap;
use std::convert::TryFrom;
use std::fs;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use base::custom_serde::serialize_arc_mutex;
use base::debug;
use base::error;
use base::warn;
use base::AsRawDescriptor;
use base::Descriptor;
use base::Error as SysError;
use base::Event;
use base::EventToken;
use base::SendTube;
use base::Timer;
use base::TimerTrait;
use base::Tube;
use base::VmEventType;
use base::WaitContext;
use base::WorkerThread;
use serde::Deserialize;
use serde::Serialize;
use snapshot::AnySnapshot;
use sync::Mutex;
use vm_control::DeviceId;
use vm_control::PlatformDeviceId;
use vm_control::VmResponse;

use crate::BusAccessInfo;
use crate::BusDevice;
use crate::IrqEdgeEvent;
use crate::Suspendable;

// Register offsets
const VMWDT_REG_STATUS: u32 = 0x00;
const VMWDT_REG_LOAD_CNT: u32 = 0x04;
const VMWDT_REG_CURRENT_CNT: u32 = 0x08;
const VMWDT_REG_CLOCK_FREQ_HZ: u32 = 0x0C;

// Length of the per-vCPU register window
const VMWDT_REG_LEN: u64 = 0x10;

pub const VMWDT_DEFAULT_TIMEOUT_SEC: u32 = 10;
pub const VMWDT_DEFAULT_CLOCK_HZ: u32 = 2;

// Zero-based index of the guest_time field in /proc/<pid>/task/<tid>/stat
const PROCSTAT_GUEST_TIME_INDX: usize = 42;

#[derive(Serialize)]
pub struct VmwdtPerCpu {
    // Flag which indicates whether the watchdog is started
    is_enabled: bool,
    // Timer used to generate periodic events at `timer_freq_hz` frequency
    #[serde(skip_serializing)]
    timer: Timer,
    // The frequency of the `timer`
    timer_freq_hz: u64,
    // Timestamp, measured in milliseconds, of the last guest activity
    last_guest_time_ms: i64,
    // The thread_id of the thread this vCPU belongs to
    thread_id: u32,
    // The process id of the task this vCPU belongs to
    process_id: u32,
    // The pre-programmed one-shot expiration interval. If the guest runs in this
    // interval but we don't receive a periodic event, the guest is stalled.
    next_expiration_interval_ms: i64,
    // Keeps track of whether the watchdog PPI was raised.
    stall_evt_ppi_triggered: bool,
    // Keeps track of whether the timer was armed in one-shot mode or with a repeating interval
    repeating_interval: Option<Duration>,
}

#[derive(Deserialize)]
struct VmwdtPerCpuRestore {
    is_enabled: bool,
    timer_freq_hz: u64,
    last_guest_time_ms: i64,
    next_expiration_interval_ms: i64,
    repeating_interval: Option<Duration>,
}

pub struct Vmwdt {
    vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
    // The worker thread that waits on the timer fd
    worker_thread: Option<WorkerThread<Tube>>,
    // TODO: @sebastianene add separate reset event for the watchdog
    // Reset source used when the guest is not responding
    reset_evt_wrtube: SendTube,
    activated: bool,
    // Event used to interrupt the guest on detected stalls
    stall_evt: IrqEdgeEvent,
    vm_ctrl_tube: Option<Tube>,
}

#[derive(Serialize)]
struct VmwdtSnapshot {
    #[serde(serialize_with = "serialize_arc_mutex")]
    vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
    activated: bool,
}

#[derive(Deserialize)]
struct VmwdtRestore {
    vm_wdts: Vec<VmwdtPerCpuRestore>,
    activated: bool,
}

impl Vmwdt {
    pub fn new(
        cpu_count: usize,
        reset_evt_wrtube: SendTube,
        evt: IrqEdgeEvent,
        vm_ctrl_tube: Tube,
    ) -> anyhow::Result<Vmwdt> {
        let mut vec = Vec::new();
        for _ in 0..cpu_count {
            vec.push(VmwdtPerCpu {
                last_guest_time_ms: 0,
                thread_id: 0,
                process_id: 0,
                is_enabled: false,
                stall_evt_ppi_triggered: false,
                timer: Timer::new().context("failed to create Timer")?,
                timer_freq_hz: 0,
                next_expiration_interval_ms: 0,
                repeating_interval: None,
            });
        }
        let vm_wdts = Arc::new(Mutex::new(vec));

        Ok(Vmwdt {
            vm_wdts,
            worker_thread: None,
            reset_evt_wrtube,
            activated: false,
            stall_evt: evt,
            vm_ctrl_tube: Some(vm_ctrl_tube),
        })
    }

    pub fn vmwdt_worker_thread(
        vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
        kill_evt: Event,
        reset_evt_wrtube: SendTube,
        stall_evt: IrqEdgeEvent,
        vm_ctrl_tube: Tube,
        worker_started_send: Option<SendTube>,
    ) -> anyhow::Result<Tube> {
        let msg = vm_control::VmRequest::VcpuPidTid;
        vm_ctrl_tube
            .send(&msg)
            .context("failed to send request to fetch Vcpus PID and TID")?;
        let vcpus_pid_tid: BTreeMap<usize, (u32, u32)> = match vm_ctrl_tube
            .recv()
            .context("failed to receive vmwdt pids and tids")?
        {
            VmResponse::VcpuPidTidResponse { pid_tid_map } => pid_tid_map,
            _ => {
                return Err(anyhow::anyhow!(
                    "received incorrect message type when trying to get vcpu pid tid map"
                ));
            }
        };
        {
            let mut vm_wdts = vm_wdts.lock();
            for (i, vmwdt) in (*vm_wdts).iter_mut().enumerate() {
                let pid_tid = vcpus_pid_tid
                    .get(&i)
                    .context("vmwdts empty, which could indicate no vcpus are initialized")?;
                vmwdt.process_id = pid_tid.0;
                vmwdt.thread_id = pid_tid.1;
            }
        }
        if let Some(worker_started_send) = worker_started_send {
            worker_started_send
                .send(&())
                .context("failed to send vmwdt worker started")?;
        }
        #[derive(EventToken)]
        enum Token {
            Kill,
            Timer(usize),
        }

        let wait_ctx: WaitContext<Token> =
            WaitContext::new().context("Failed to create wait_ctx")?;
        wait_ctx
            .add(&kill_evt, Token::Kill)
            .context("Failed to add Tokens to wait_ctx")?;

        let len = vm_wdts.lock().len();
        for clock_id in 0..len {
            let timer_fd = vm_wdts.lock()[clock_id].timer.as_raw_descriptor();
            wait_ctx
                .add(&Descriptor(timer_fd), Token::Timer(clock_id))
                .context("Failed to link FDs to Tokens")?;
        }

        loop {
            let events = wait_ctx.wait().context("Failed to wait for events")?;
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => {
                        return Ok(vm_ctrl_tube);
                    }
                    Token::Timer(cpu_id) => {
                        let mut wdts_locked = vm_wdts.lock();
                        let watchdog = &mut wdts_locked[cpu_id];
                        match watchdog.timer.mark_waited() {
                            Ok(true) => continue, // timer not actually ready
                            Ok(false) => {}
                            Err(e) => {
                                error!("error waiting for timer event on vcpu {cpu_id}: {e:#}");
                                continue;
                            }
                        }

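                        // The timer fired, but that alone does not prove a stall: compare
                        // how much guest time the vCPU actually accumulated since the last
                        // 'pet' with the programmed expiration interval. A vCPU that was
                        // simply descheduled on the host accumulates less guest time than
                        // the interval and gets its deadline re-armed for the remainder.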
                        let current_guest_time_ms =
                            Vmwdt::get_guest_time_ms(watchdog.process_id, watchdog.thread_id)
                                .context("get_guest_time_ms failed")?;
                        let remaining_time_ms = watchdog.next_expiration_interval_ms
                            - (current_guest_time_ms - watchdog.last_guest_time_ms);

                        if remaining_time_ms > 0 {
                            watchdog.next_expiration_interval_ms = remaining_time_ms;
                            if let Err(e) = watchdog
                                .timer
                                .reset_oneshot(Duration::from_millis(remaining_time_ms as u64))
                            {
                                error!(
                                    "failed to reset internal timer on vcpu {}: {:#}",
                                    cpu_id, e
                                );
                            }
                            watchdog.repeating_interval = None;
                        } else {
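                            // The vCPU consumed the full interval without a 'pet'. If the
                            // stall PPI raised on a previous expiry is still outstanding,
                            // escalate to a full guest reset; otherwise (re-)raise the PPI
                            // and start counting guest time from now.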
                            if watchdog.stall_evt_ppi_triggered {
                                if let Err(e) = reset_evt_wrtube
                                    .send::<VmEventType>(&VmEventType::WatchdogReset)
                                {
                                    error!("failed to send reset event from vcpu {}: {}", cpu_id, e)
                                }
                            }

                            stall_evt
                                .trigger()
                                .context("Failed to trigger stall event")?;
                            watchdog.stall_evt_ppi_triggered = true;
                            watchdog.last_guest_time_ms = current_guest_time_ms;
                        }
                    }
                }
            }
        }
    }

    fn start(&mut self, worker_started_send: Option<SendTube>) -> anyhow::Result<()> {
        let vm_wdts = self.vm_wdts.clone();
        let reset_evt_wrtube = self
            .reset_evt_wrtube
            .try_clone()
            .context("failed to clone reset event tube")?;
        let stall_event = self
            .stall_evt
            .try_clone()
            .context("failed to clone stall event")?;
        let vm_ctrl_tube = self
            .vm_ctrl_tube
            .take()
            .context("missing vm control tube")?;

        self.activated = true;
        self.worker_thread = Some(WorkerThread::start("vmwdt worker", |kill_evt| {
            Vmwdt::vmwdt_worker_thread(
                vm_wdts,
                kill_evt,
                reset_evt_wrtube,
                stall_event,
                vm_ctrl_tube,
                worker_started_send,
            )
            .expect("failed to start vmwdt worker thread")
        }));
        Ok(())
    }

    fn ensure_started(&mut self) {
        if self.worker_thread.is_some() {
            return;
        }

        let (worker_started_send, worker_started_recv) =
            Tube::directional_pair().expect("failed to create vmwdt worker started tubes");
        self.start(Some(worker_started_send))
            .expect("failed to start Vmwdt");
        worker_started_recv
            .recv::<()>()
            .expect("failed to receive vmwdt worker started");
    }

    #[cfg(any(target_os = "linux", target_os = "android"))]
    pub fn get_guest_time_ms(process_id: u32, thread_id: u32) -> Result<i64, SysError> {
        // TODO: @sebastianene check if we can avoid open-read-close on each call
        let stat_path = format!("/proc/{process_id}/task/{thread_id}/stat");
        let contents = fs::read_to_string(stat_path)?;

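        // guest_time is field 43 (1-indexed) of the stat line, reported in clock
        // ticks. Splitting on whitespace assumes the comm field (field 2, in
        // parentheses) contains no spaces, which holds for crosvm's vcpu threads.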
        let gtime_ticks = contents
            .split_whitespace()
            .nth(PROCSTAT_GUEST_TIME_INDX)
            .and_then(|guest_time| guest_time.parse::<u64>().ok())
            .unwrap_or(0);

        // SAFETY:
        // Safe because this just returns an integer
        let ticks_per_sec = unsafe { libc::sysconf(libc::_SC_CLK_TCK) } as u64;
        Ok((gtime_ticks * 1000 / ticks_per_sec) as i64)
    }

    #[cfg(not(any(target_os = "linux", target_os = "android")))]
    pub fn get_guest_time_ms(_process_id: u32, _thread_id: u32) -> Result<i64, SysError> {
        Ok(0)
    }
}

impl BusDevice for Vmwdt {
    fn debug_label(&self) -> String {
        "Vmwdt".to_owned()
    }

    fn device_id(&self) -> DeviceId {
        PlatformDeviceId::VmWatchdog.into()
    }

    fn read(&mut self, _offset: BusAccessInfo, _data: &mut [u8]) {}

    fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
        let data_array = match <&[u8; 4]>::try_from(data) {
            Ok(array) => array,
            _ => {
                error!("Bad write size: {} for vmwdt", data.len());
                return;
            }
        };

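        // Each vCPU owns a VMWDT_REG_LEN-byte register window, so the bus offset
        // decodes into a target vCPU index and a register within that window.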
        let reg_val = u32::from_ne_bytes(*data_array);
        let cpu_index: usize = (info.offset / VMWDT_REG_LEN) as usize;
        let reg_offset = (info.offset % VMWDT_REG_LEN) as u32;

        if cpu_index >= self.vm_wdts.lock().len() {
            error!("Bad write cpu_index {}", cpu_index);
            return;
        }

        match reg_offset {
            VMWDT_REG_STATUS => {
                self.ensure_started();
                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];

                cpu_watchdog.is_enabled = reg_val != 0;

                if reg_val != 0 {
                    let interval = Duration::from_millis(1000 / cpu_watchdog.timer_freq_hz);
                    cpu_watchdog.repeating_interval = Some(interval);
                    cpu_watchdog
                        .timer
                        .reset_repeating(interval)
                        .expect("Failed to reset timer repeating interval");
                } else {
                    cpu_watchdog.repeating_interval = None;
                    cpu_watchdog
                        .timer
                        .clear()
                        .expect("Failed to clear cpu watchdog timer");
                }
            }
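            // Writing the load count doubles as the 'pet' event: it records the
            // current guest time, clears any pending stall indication, and re-arms
            // the one-shot expiration deadline.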
            VMWDT_REG_LOAD_CNT => {
                self.ensure_started();
                let (process_id, thread_id) = {
                    let mut wdts_locked = self.vm_wdts.lock();
                    let cpu_watchdog = &mut wdts_locked[cpu_index];
                    (cpu_watchdog.process_id, cpu_watchdog.thread_id)
                };
                let guest_time_ms = Vmwdt::get_guest_time_ms(process_id, thread_id)
                    .expect("get_guest_time_ms failed");

                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];
                let next_expiration_interval_ms =
                    reg_val as u64 * 1000 / cpu_watchdog.timer_freq_hz;

                cpu_watchdog.last_guest_time_ms = guest_time_ms;
                cpu_watchdog.stall_evt_ppi_triggered = false;
                cpu_watchdog.next_expiration_interval_ms = next_expiration_interval_ms as i64;

                if cpu_watchdog.is_enabled {
                    if let Err(e) = cpu_watchdog
                        .timer
                        .reset_oneshot(Duration::from_millis(next_expiration_interval_ms))
                    {
                        error!("failed to reset one-shot timer on vcpu {}: {:#}", cpu_index, e);
                    }
                    cpu_watchdog.repeating_interval = None;
                }
            }
            VMWDT_REG_CURRENT_CNT => {
                warn!("invalid write to read-only VMWDT_REG_CURRENT_CNT register");
            }
            VMWDT_REG_CLOCK_FREQ_HZ => {
                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];

                debug!(
                    "CPU:{:x} wrote VMWDT_REG_CLOCK_FREQ_HZ {:x}",
                    cpu_index, reg_val
                );
                cpu_watchdog.timer_freq_hz = reg_val as u64;
            }
            _ => error!("unsupported vmwdt register offset {:#x}", reg_offset),
        }
    }
}

impl Suspendable for Vmwdt {
    fn sleep(&mut self) -> anyhow::Result<()> {
        if let Some(worker) = self.worker_thread.take() {
            self.vm_ctrl_tube = Some(worker.stop());
        }
        Ok(())
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            // We do not pass a tube to notify that the worker thread has started on wake.
            // At this stage, vm_control is blocked on resuming devices and cannot provide
            // the vcpu PIDs/TIDs yet. At the same time, the vCPUs are still frozen, which
            // means no MMIO will be processed and no register write will be triggered.
            // The request to get PIDs/TIDs should get processed before any MMIO request
            // occurs.
            self.start(None)?;
            let mut vm_wdts = self.vm_wdts.lock();
            for vmwdt in vm_wdts.iter_mut() {
                if let Some(interval) = &vmwdt.repeating_interval {
                    vmwdt
                        .timer
                        .reset_repeating(*interval)
                        .context("failed to write repeating interval")?;
                } else if vmwdt.is_enabled {
                    vmwdt
                        .timer
                        .reset_oneshot(Duration::from_millis(
                            vmwdt.next_expiration_interval_ms as u64,
                        ))
                        .context("failed to write oneshot interval")?;
                }
            }
        }
        Ok(())
    }

    fn snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
        AnySnapshot::to_any(&VmwdtSnapshot {
            vm_wdts: self.vm_wdts.clone(),
            activated: self.activated,
        })
        .context("failed to snapshot Vmwdt")
    }

    fn restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
        let deser: VmwdtRestore =
            AnySnapshot::from_any(data).context("failed to deserialize Vmwdt")?;
        let mut vm_wdts = self.vm_wdts.lock();
        for (vmwdt_restore, vmwdt) in deser.vm_wdts.iter().zip(vm_wdts.iter_mut()) {
            vmwdt.is_enabled = vmwdt_restore.is_enabled;
            vmwdt.timer_freq_hz = vmwdt_restore.timer_freq_hz;
            vmwdt.last_guest_time_ms = vmwdt_restore.last_guest_time_ms;
            vmwdt.next_expiration_interval_ms = vmwdt_restore.next_expiration_interval_ms;
            vmwdt.repeating_interval = vmwdt_restore.repeating_interval;
        }
        self.activated = deser.activated;
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use std::process;
    use std::thread::sleep;

    #[cfg(any(target_os = "linux", target_os = "android"))]
    use base::gettid;
    use base::poll_assert;
    use base::Tube;

    use super::*;

    const AARCH64_VMWDT_ADDR: u64 = 0x3000;
    const TEST_VMWDT_CPU_NO: usize = 0x1;

    fn vmwdt_bus_address(offset: u64) -> BusAccessInfo {
        BusAccessInfo {
            offset,
            address: AARCH64_VMWDT_ADDR,
            id: 0,
        }
    }

    #[test]
    fn test_watchdog_internal_timer() {
        let (vm_evt_wrtube, _vm_evt_rdtube) = Tube::directional_pair().unwrap();
        let (vm_ctrl_wrtube, vm_ctrl_rdtube) = Tube::pair().unwrap();
        let irq = IrqEdgeEvent::new().unwrap();
        #[cfg(any(target_os = "linux", target_os = "android"))]
        {
            vm_ctrl_wrtube
                .send(&VmResponse::VcpuPidTidResponse {
                    pid_tid_map: BTreeMap::from([(0, (process::id(), gettid() as u32))]),
                })
                .unwrap();
        }
        let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube, irq, vm_ctrl_rdtube).unwrap();

        // Configure the watchdog device with a 10 Hz internal clock
        device.write(
            vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64),
            &[10, 0, 0, 0],
        );
        device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]);
        device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]);
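        // With a 10 Hz clock and a load count of 1, the programmed expiration
        // interval is 1 * 1000 / 10 = 100 ms of guest time.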
        let next_expiration_ms = {
            let mut vmwdt_locked = device.vm_wdts.lock();
            // In the test scenario guest_time in /proc/<pid>/task/<tid>/stat does not
            // advance, so get_guest_time_ms() returns 0
            vmwdt_locked[0].last_guest_time_ms = 10;
            vmwdt_locked[0].next_expiration_interval_ms
        };

        // Poll multiple times as we don't get a signal when the watchdog thread has run.
        poll_assert!(10, || {
            sleep(Duration::from_millis(50));
            let vmwdt_locked = device.vm_wdts.lock();
            // Verify that our timer expired and the next_expiration_interval_ms changed
            vmwdt_locked[0].next_expiration_interval_ms != next_expiration_ms
        });
    }

    #[test]
    fn test_watchdog_expiration() {
        let (vm_evt_wrtube, vm_evt_rdtube) = Tube::directional_pair().unwrap();
        let (vm_ctrl_wrtube, vm_ctrl_rdtube) = Tube::pair().unwrap();
        let irq = IrqEdgeEvent::new().unwrap();
        #[cfg(any(target_os = "linux", target_os = "android"))]
        {
            vm_ctrl_wrtube
                .send(&VmResponse::VcpuPidTidResponse {
                    pid_tid_map: BTreeMap::from([(0, (process::id(), gettid() as u32))]),
                })
                .unwrap();
        }
        let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube, irq, vm_ctrl_rdtube).unwrap();

        // Configure the watchdog device with a 10 Hz internal clock
        device.write(
            vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64),
            &[10, 0, 0, 0],
        );
        device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]);
        device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]);
        // In the test scenario guest_time in /proc/<pid>/task/<tid>/stat does not
        // advance, so get_guest_time_ms() returns 0
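        // Setting last_guest_time_ms to -100 makes the apparent elapsed guest time
        // 0 - (-100) = 100 ms, exactly consuming the 100 ms interval and driving
        // the watchdog into its stall path.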
        device.vm_wdts.lock()[0].last_guest_time_ms = -100;

        // Check that the interrupt has been raised
        poll_assert!(10, || {
            sleep(Duration::from_millis(50));
            let vmwdt_locked = device.vm_wdts.lock();
            vmwdt_locked[0].stall_evt_ppi_triggered
        });

        // Simulate that more time has passed since the last expiration
        device.vm_wdts.lock()[0].last_guest_time_ms = -100;

        // Poll multiple times as we don't get a signal when the watchdog thread has run.
        poll_assert!(10, || {
            sleep(Duration::from_millis(50));
            match vm_evt_rdtube.recv::<VmEventType>() {
                Ok(vm_event) => vm_event == VmEventType::WatchdogReset,
                Err(_e) => false,
            }
        });
    }
}