Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
google
GitHub Repository: google/crosvm
Path: blob/main/devices/src/irqchip/userspace.rs
5394 views
1
// Copyright 2020 The ChromiumOS Authors
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file.
4
5
use std::convert::TryFrom;
6
use std::convert::TryInto;
7
use std::fmt;
8
use std::fmt::Display;
9
use std::iter;
10
use std::sync::Arc;
11
12
cfg_if::cfg_if! {
13
if #[cfg(test)] {
14
use base::{FakeClock as Clock, FakeTimer as Timer};
15
} else {
16
use base::{Clock, Timer};
17
}
18
}
19
use base::error;
20
use base::info;
21
use base::warn;
22
use base::AsRawDescriptor;
23
use base::Descriptor;
24
use base::Error;
25
use base::Event;
26
use base::EventToken;
27
use base::Result;
28
use base::Tube;
29
use base::WaitContext;
30
use base::WorkerThread;
31
use hypervisor::DeliveryMode;
32
use hypervisor::IoapicState;
33
use hypervisor::IrqRoute;
34
use hypervisor::IrqSource;
35
use hypervisor::IrqSourceChip;
36
use hypervisor::LapicState;
37
use hypervisor::MPState;
38
use hypervisor::MsiAddressMessage;
39
use hypervisor::MsiDataMessage;
40
use hypervisor::PicSelect;
41
use hypervisor::PicState;
42
use hypervisor::PitState;
43
use hypervisor::Vcpu;
44
use hypervisor::VcpuX86_64;
45
use resources::SystemAllocator;
46
use snapshot::AnySnapshot;
47
use sync::Condvar;
48
use sync::Mutex;
49
use vm_control::DeviceId;
50
use vm_control::PlatformDeviceId;
51
52
use crate::bus::BusDeviceSync;
53
use crate::irqchip::Apic;
54
use crate::irqchip::ApicBusMsg;
55
use crate::irqchip::DelayedIoApicIrqEvents;
56
use crate::irqchip::Interrupt;
57
use crate::irqchip::InterruptData;
58
use crate::irqchip::InterruptDestination;
59
use crate::irqchip::Ioapic;
60
use crate::irqchip::IrqEvent;
61
use crate::irqchip::IrqEventIndex;
62
use crate::irqchip::Pic;
63
use crate::irqchip::Routes;
64
use crate::irqchip::VcpuRunState;
65
use crate::irqchip::APIC_BASE_ADDRESS;
66
use crate::irqchip::APIC_MEM_LENGTH_BYTES;
67
use crate::irqchip::IOAPIC_BASE_ADDRESS;
68
use crate::irqchip::IOAPIC_MEM_LENGTH_BYTES;
69
use crate::Bus;
70
use crate::BusAccessInfo;
71
use crate::BusDevice;
72
use crate::IrqChip;
73
use crate::IrqChipCap;
74
use crate::IrqChipX86_64;
75
use crate::IrqEdgeEvent;
76
use crate::IrqEventSource;
77
use crate::IrqLevelEvent;
78
use crate::Pit;
79
use crate::PitError;
80
use crate::Suspendable;
81
82
/// PIT channel 0 timer is connected to IRQ 0
const PIT_CHANNEL0_IRQ: u32 = 0;
/// CR0 extension type bit
const X86_CR0_ET: u64 = 0x00000010;
/// CR0 not write through bit
const X86_CR0_NW: u64 = 0x20000000;
/// CR0 cache disable bit
const X86_CR0_CD: u64 = 0x40000000;
/// Default power on state of CR0 register, according to the Intel manual.
/// Applied to a vcpu's CR0 when delivering a startup IPI (see `deliver_startup`).
const X86_CR0_INIT: u64 = X86_CR0_ET | X86_CR0_NW | X86_CR0_CD;
92
93
/// An `IrqChip` with all interrupt devices emulated in userspace. `UserspaceIrqChip` works with
/// any hypervisor, but only supports x86.
pub struct UserspaceIrqChip<V: VcpuX86_64> {
    // Clones of each vcpu, added via `add_vcpu`, used to request interrupt windows and inject
    // interrupts. `None` until the corresponding vcpu is added.
    pub vcpus: Arc<Mutex<Vec<Option<V>>>>,
    // GSI -> IrqSource routing table shared by all clones of this chip.
    routes: Arc<Mutex<Routes>>,
    pit: Arc<Mutex<Pit>>,
    pic: Arc<Mutex<Pic>>,
    ioapic: Arc<Mutex<Ioapic>>,
    // Number of pins on the emulated ioapic; also the lower bound enforced on irq numbers
    // allocated in `finalize_devices`.
    ioapic_pins: usize,
    // One local APIC per vcpu, indexed by vcpu id.
    pub apics: Vec<Arc<Mutex<Apic>>>,
    // Condition variables used by wait_until_runnable.
    waiters: Vec<Arc<Waiter>>,
    // Raw descriptors of the apic Timers.
    timer_descriptors: Vec<Descriptor>,
    /// Delayed ioapic irq object, that contains the delayed events because the ioapic was locked
    /// when service_irq was called on the irqchip. This prevents deadlocks when a Vcpu thread has
    /// locked the ioapic and the ioapic sends a AddMsiRoute signal to the main thread (which
    /// itself may be busy trying to call service_irq).
    ///
    /// ## Note:
    /// This lock may be locked by itself to access the `DelayedIoApicIrqEvents`. If accessed in
    /// conjunction with the `irq_events` field, that lock should be taken first to prevent
    /// deadlocks stemming from lock-ordering issues.
    delayed_ioapic_irq_events: Arc<Mutex<DelayedIoApicIrqEvents>>,
    // Array of Events that devices will use to assert ioapic pins.
    irq_events: Arc<Mutex<Vec<Option<IrqEvent>>>>,
    // Shared drop/sleep helper that owns the timer worker threads.
    dropper: Arc<Mutex<Dropper>>,
    // True once `finalize_devices` has run; `wake` only spawns timer workers after activation.
    activated: bool,
}
122
123
/// Helper that implements `Drop` on behalf of `UserspaceIrqChip`. The many cloned copies of an irq
/// chip share a single arc'ed `Dropper`, which only runs its drop when the last irq chip copy is
/// dropped.
struct Dropper {
    /// Worker threads that deliver timer events to the APICs. Emptied (and the threads stopped)
    /// by `Dropper::sleep`.
    workers: Vec<WorkerThread<()>>,
}
130
131
impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
132
/// Constructs a new `UserspaceIrqChip`.
133
pub fn new(num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>) -> Result<Self> {
134
let clock = Arc::new(Mutex::new(Clock::new()));
135
Self::new_with_clock(num_vcpus, irq_tube, ioapic_pins, clock)
136
}
137
138
/// Constructs a new `UserspaceIrqChip`, with a clock. Used for testing.
139
pub fn new_with_clock(
140
num_vcpus: usize,
141
irq_tube: Tube,
142
ioapic_pins: Option<usize>,
143
clock: Arc<Mutex<Clock>>,
144
) -> Result<Self> {
145
let pit_evt = IrqEdgeEvent::new()?;
146
// For test only, this clock instance is FakeClock. It needs to be cloned for every Timer
147
// instance, so make a clone for it now.
148
#[cfg(test)]
149
let test_clock = clock.clone();
150
let pit = Pit::new(pit_evt.try_clone()?, clock).map_err(|e| match e {
151
PitError::CloneEvent(err) => err,
152
PitError::CreateEvent(err) => err,
153
PitError::CreateWaitContext(err) => err,
154
PitError::TimerCreateError(err) => err,
155
PitError::WaitError(err) => err,
156
PitError::SpawnThread(_) => Error::new(libc::EIO),
157
})?;
158
let pit_event_source = IrqEventSource::from_device(&pit);
159
160
let ioapic_pins = ioapic_pins.unwrap_or(hypervisor::NUM_IOAPIC_PINS);
161
let ioapic = Ioapic::new(irq_tube, ioapic_pins)?;
162
163
let mut timer_descriptors: Vec<Descriptor> = Vec::with_capacity(num_vcpus);
164
let mut apics: Vec<Arc<Mutex<Apic>>> = Vec::with_capacity(num_vcpus);
165
for id in 0..num_vcpus {
166
cfg_if::cfg_if! {
167
if #[cfg(test)] {
168
let timer = Timer::new(test_clock.clone());
169
} else {
170
let timer = Timer::new()?;
171
}
172
}
173
// Timers are owned by the apics, which outlive the raw descriptors stored here and in
174
// the worker threads.
175
timer_descriptors.push(Descriptor(timer.as_raw_descriptor()));
176
177
let id: u8 = id.try_into().or(Err(Error::new(libc::EINVAL)))?;
178
let apic = Apic::new(id, Box::new(timer));
179
apics.push(Arc::new(Mutex::new(apic)));
180
}
181
let dropper = Dropper {
182
workers: Vec::new(),
183
};
184
185
let mut chip = UserspaceIrqChip {
186
vcpus: Arc::new(Mutex::new(
187
iter::repeat_with(|| None).take(num_vcpus).collect(),
188
)),
189
waiters: iter::repeat_with(Default::default)
190
.take(num_vcpus)
191
.collect(),
192
routes: Arc::new(Mutex::new(Routes::new())),
193
pit: Arc::new(Mutex::new(pit)),
194
pic: Arc::new(Mutex::new(Pic::new())),
195
ioapic: Arc::new(Mutex::new(ioapic)),
196
ioapic_pins,
197
apics,
198
timer_descriptors,
199
delayed_ioapic_irq_events: Arc::new(Mutex::new(DelayedIoApicIrqEvents::new()?)),
200
irq_events: Arc::new(Mutex::new(Vec::new())),
201
dropper: Arc::new(Mutex::new(dropper)),
202
activated: false,
203
};
204
205
// Setup standard x86 irq routes
206
chip.set_irq_routes(&Routes::default_pic_ioapic_routes(ioapic_pins))?;
207
208
chip.register_edge_irq_event(PIT_CHANNEL0_IRQ, &pit_evt, pit_event_source)?;
209
Ok(chip)
210
}
211
212
/// Handles a message from an APIC.
213
fn handle_msg(&self, msg: ApicBusMsg) {
214
match msg {
215
ApicBusMsg::Eoi(vector) => {
216
let _ = self.broadcast_eoi(vector);
217
}
218
ApicBusMsg::Ipi(interrupt) => self.send_irq_to_apics(&interrupt),
219
}
220
}
221
222
/// Sends a Message Signaled Interrupt to one or more APICs. MSIs are a 64-bit address and
223
/// 32-bit data, but in the Intel spec we're implementing, only the low 32 bits of the address
224
/// are used.
225
fn send_msi(&self, addr: u32, data: u32) {
226
let mut msi_addr = MsiAddressMessage::new();
227
msi_addr.set(0, 32, addr as u64);
228
let dest = match InterruptDestination::try_from(&msi_addr) {
229
Ok(dest) => dest,
230
Err(e) => {
231
warn!("Invalid MSI message: {}", e);
232
return;
233
}
234
};
235
236
let mut msi_data = MsiDataMessage::new();
237
msi_data.set(0, 32, data as u64);
238
let data = InterruptData::from(&msi_data);
239
240
self.send_irq_to_apics(&Interrupt { dest, data });
241
}
242
243
pub fn send_irq_to_apic(&self, id: usize, irq: &InterruptData) {
244
// id can come from the guest, so check bounds.
245
if let Some(apic) = self.apics.get(id) {
246
apic.lock().accept_irq(irq);
247
} else {
248
error!("Interrupt for non-existent apic {}: {:?}", id, irq);
249
}
250
if let Some(Some(vcpu)) = self.vcpus.lock().get(id) {
251
vcpu.set_interrupt_window_requested(true);
252
} else {
253
error!("Interrupt for non-existent vcpu {}: {:?}", id, irq);
254
}
255
self.waiters[id].notify();
256
}
257
258
/// Sends an interrupt to one or more APICs. Used for sending MSIs and IPIs.
259
pub fn send_irq_to_apics(&self, irq: &Interrupt) {
260
match irq.data.delivery {
261
DeliveryMode::Fixed | DeliveryMode::Lowest | DeliveryMode::RemoteRead => {}
262
_ => info!("UserspaceIrqChip received special irq: {:?}", irq),
263
}
264
265
// First try the fast path, where the destination is a single APIC we can send to directly.
266
if let Some(apic_id) = Apic::single_dest_fast(&irq.dest) {
267
self.send_irq_to_apic(apic_id as usize, &irq.data);
268
return;
269
}
270
271
let lowest_mode = irq.data.delivery == DeliveryMode::Lowest;
272
let mut lowest_priority = u8::MAX;
273
let mut lowest_apic: Option<usize> = None;
274
275
for (i, apic) in self.apics.iter().enumerate() {
276
let send = {
277
let apic = apic.lock();
278
if !apic.match_dest(&irq.dest) {
279
false
280
} else if lowest_mode {
281
let priority = apic.get_processor_priority();
282
if priority <= lowest_priority {
283
lowest_priority = priority;
284
lowest_apic = Some(i);
285
}
286
false
287
} else {
288
true
289
}
290
};
291
if send {
292
self.send_irq_to_apic(i, &irq.data);
293
}
294
}
295
296
if lowest_mode {
297
if let Some(index) = lowest_apic {
298
self.send_irq_to_apic(index, &irq.data);
299
} else {
300
// According to sections 10.6.2.1 and 10.6.2.2 of the SDM, the OS should not let
301
// this happen. If the OS is misconfigured then drop the interrupt and log a
302
// warning.
303
warn!(
304
"Lowest priority interrupt sent, but no apics configured as valid target: {:?}",
305
irq
306
);
307
}
308
}
309
}
310
311
/// Delivers a startup IPI to `vcpu`.
312
fn deliver_startup(&self, vcpu: &V, vector: u8) -> Result<()> {
313
// This comes from Intel SDM volume 3, chapter 8.4. The vector specifies a page aligned
314
// address where execution should start. cs.base is the offset for the code segment with an
315
// RIP of 0. The cs.selector is just the base shifted right by 4 bits.
316
let mut sregs = vcpu.get_sregs()?;
317
sregs.cs.base = (vector as u64) << 12;
318
sregs.cs.selector = (vector as u16) << 8;
319
320
// Set CR0 to its INIT value per the manual. Application processors won't boot with the CR0
321
// protected mode and paging bits set by setup_sregs(). Kernel APIC doesn't have this
322
// issue, probably because it uses MSRs instead of MMIO, so it's less affected when the AP's
323
// state (CR3 etc.) doesn't reflect changes that Linux made while booting vcpu 0.
324
sregs.cr0 = X86_CR0_INIT;
325
vcpu.set_sregs(&sregs)?;
326
327
let mut regs = vcpu.get_regs()?;
328
regs.rip = 0;
329
vcpu.set_regs(&regs)?;
330
331
Ok(())
332
}
333
334
/// Checks if the specified VCPU is in a runnable state.
335
fn is_runnable(&self, vcpu_id: usize) -> bool {
336
self.apics[vcpu_id].lock().get_mp_state() == MPState::Runnable
337
}
338
}
339
340
impl Dropper {
341
fn sleep(&mut self) -> anyhow::Result<()> {
342
for thread in self.workers.split_off(0).into_iter() {
343
thread.stop();
344
}
345
Ok(())
346
}
347
}
348
349
impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
350
fn register_irq_event(
351
&mut self,
352
irq: u32,
353
irq_event: &Event,
354
resample_event: Option<&Event>,
355
source: IrqEventSource,
356
) -> Result<Option<IrqEventIndex>> {
357
let mut evt = IrqEvent {
358
gsi: irq,
359
event: irq_event.try_clone()?,
360
resample_event: None,
361
source,
362
};
363
if let Some(resample_event) = resample_event {
364
evt.resample_event = Some(resample_event.try_clone()?);
365
}
366
367
let mut irq_events = self.irq_events.lock();
368
let index = irq_events.len();
369
irq_events.push(Some(evt));
370
Ok(Some(index))
371
}
372
373
fn unregister_irq_event(&mut self, irq: u32, irq_event: &Event) -> Result<()> {
374
let mut irq_events = self.irq_events.lock();
375
for (index, evt) in irq_events.iter().enumerate() {
376
if let Some(evt) = evt {
377
if evt.gsi == irq && irq_event.eq(&evt.event) {
378
irq_events[index] = None;
379
break;
380
}
381
}
382
}
383
Ok(())
384
}
385
}
386
387
impl<V: VcpuX86_64 + 'static> IrqChip for UserspaceIrqChip<V> {
388
fn add_vcpu(&mut self, vcpu_id: usize, vcpu: &dyn Vcpu) -> Result<()> {
389
let vcpu: &V = vcpu
390
.downcast_ref()
391
.expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type");
392
self.vcpus.lock()[vcpu_id] = Some(vcpu.try_clone()?);
393
Ok(())
394
}
395
396
fn register_edge_irq_event(
397
&mut self,
398
irq: u32,
399
irq_event: &IrqEdgeEvent,
400
source: IrqEventSource,
401
) -> Result<Option<IrqEventIndex>> {
402
self.register_irq_event(irq, irq_event.get_trigger(), None, source)
403
}
404
405
fn unregister_edge_irq_event(&mut self, irq: u32, irq_event: &IrqEdgeEvent) -> Result<()> {
406
self.unregister_irq_event(irq, irq_event.get_trigger())
407
}
408
409
fn register_level_irq_event(
410
&mut self,
411
irq: u32,
412
irq_event: &IrqLevelEvent,
413
source: IrqEventSource,
414
) -> Result<Option<IrqEventIndex>> {
415
self.register_irq_event(
416
irq,
417
irq_event.get_trigger(),
418
Some(irq_event.get_resample()),
419
source,
420
)
421
}
422
423
fn unregister_level_irq_event(&mut self, irq: u32, irq_event: &IrqLevelEvent) -> Result<()> {
424
self.unregister_irq_event(irq, irq_event.get_trigger())
425
}
426
427
fn route_irq(&mut self, route: IrqRoute) -> Result<()> {
428
self.routes.lock().add(route)
429
}
430
431
fn set_irq_routes(&mut self, routes: &[IrqRoute]) -> Result<()> {
432
self.routes.lock().replace_all(routes)
433
}
434
435
fn irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>> {
436
let mut tokens: Vec<(IrqEventIndex, IrqEventSource, Event)> = Vec::new();
437
for (index, evt) in self.irq_events.lock().iter().enumerate() {
438
if let Some(evt) = evt {
439
tokens.push((index, evt.source.clone(), evt.event.try_clone()?));
440
}
441
}
442
Ok(tokens)
443
}
444
445
fn service_irq(&mut self, irq: u32, level: bool) -> Result<()> {
446
for route in self.routes.lock()[irq as usize].iter() {
447
match *route {
448
IrqSource::Irqchip {
449
chip: IrqSourceChip::PicPrimary,
450
pin,
451
}
452
| IrqSource::Irqchip {
453
chip: IrqSourceChip::PicSecondary,
454
pin,
455
} => {
456
self.pic.lock().service_irq(pin as u8, level);
457
}
458
IrqSource::Irqchip {
459
chip: IrqSourceChip::Ioapic,
460
pin,
461
} => {
462
self.ioapic.lock().service_irq(pin as usize, level);
463
}
464
// service_irq's level parameter is ignored for MSIs. MSI data specifies the level.
465
IrqSource::Msi { address, data } => {
466
self.send_msi(address as u32, data);
467
}
468
_ => {
469
error!("Unexpected route source {:?}", route);
470
return Err(Error::new(libc::EINVAL));
471
}
472
}
473
}
474
Ok(())
475
}
476
477
/// Services an IRQ event by asserting then deasserting an IRQ line. The associated Event
478
/// that triggered the irq event will be read from. If the irq is associated with a resample
479
/// Event, then the deassert will only happen after an EOI is broadcast for a vector
480
/// associated with the irq line.
481
/// For UserspaceIrqChip, this function identifies the destination(s) of the irq: PIC, IOAPIC,
482
/// or APIC (MSI). If it's a PIC or IOAPIC route, we attempt to call service_irq on those
483
/// chips. If the IOAPIC is unable to be immediately locked, we add the irq to the
484
/// delayed_ioapic_irq_events (though we still read from the Event that triggered the irq
485
/// event). If it's an MSI route, we call send_msi to decode the MSI and send it to the
486
/// destination APIC(s).
487
fn service_irq_event(&mut self, event_index: IrqEventIndex) -> Result<()> {
488
let irq_events = self.irq_events.lock();
489
let evt = if let Some(evt) = &irq_events[event_index] {
490
evt
491
} else {
492
return Ok(());
493
};
494
evt.event.wait()?;
495
496
for route in self.routes.lock()[evt.gsi as usize].iter() {
497
match *route {
498
IrqSource::Irqchip {
499
chip: IrqSourceChip::PicPrimary,
500
pin,
501
}
502
| IrqSource::Irqchip {
503
chip: IrqSourceChip::PicSecondary,
504
pin,
505
} => {
506
let mut pic = self.pic.lock();
507
if evt.resample_event.is_some() {
508
pic.service_irq(pin as u8, true);
509
} else {
510
pic.service_irq(pin as u8, true);
511
pic.service_irq(pin as u8, false);
512
}
513
}
514
IrqSource::Irqchip {
515
chip: IrqSourceChip::Ioapic,
516
pin,
517
} => {
518
if let Ok(mut ioapic) = self.ioapic.try_lock() {
519
if evt.resample_event.is_some() {
520
ioapic.service_irq(pin as usize, true);
521
} else {
522
ioapic.service_irq(pin as usize, true);
523
ioapic.service_irq(pin as usize, false);
524
}
525
} else {
526
let mut delayed_events = self.delayed_ioapic_irq_events.lock();
527
delayed_events.events.push(event_index);
528
delayed_events.trigger.signal().unwrap();
529
}
530
}
531
IrqSource::Msi { address, data } => self.send_msi(address as u32, data),
532
_ => {
533
error!("Unexpected route source {:?}", route);
534
return Err(Error::new(libc::EINVAL));
535
}
536
}
537
}
538
539
Ok(())
540
}
541
542
/// Broadcasts an end of interrupt. For UserspaceIrqChip this sends the EOI to the ioapic.
543
fn broadcast_eoi(&self, vector: u8) -> Result<()> {
544
self.ioapic.lock().end_of_interrupt(vector);
545
Ok(())
546
}
547
548
/// Injects any pending interrupts for `vcpu`.
549
///
550
/// For UserspaceIrqChip this:
551
/// * Injects a PIC interrupt, if vcpu_id is 0 and vcpu is ready for interrupt
552
/// * Injects an APIC fixed interrupt, if vcpu is ready for interrupt and PIC didn't inject
553
/// * Injects APIC NMIs
554
/// * Handles APIC INIT IPIs
555
/// * Handles APIC SIPIs
556
/// * Requests an interrupt window, if PIC or APIC still has pending interrupts for this vcpu
557
fn inject_interrupts(&self, vcpu: &dyn Vcpu) -> Result<()> {
558
let vcpu: &V = vcpu
559
.downcast_ref()
560
.expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type");
561
let vcpu_id = vcpu.id();
562
let mut vcpu_ready = vcpu.ready_for_interrupt();
563
564
let mut pic_needs_window = false;
565
if vcpu_id == 0 {
566
let mut pic = self.pic.lock();
567
if vcpu_ready {
568
if let Some(vector) = pic.get_external_interrupt() {
569
vcpu.interrupt(vector)?;
570
self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
571
// Already injected a PIC interrupt, so APIC fixed interrupt can't be injected.
572
vcpu_ready = false;
573
}
574
}
575
pic_needs_window = pic.interrupt_requested();
576
}
577
578
let irqs = self.apics[vcpu_id].lock().get_pending_irqs(vcpu_ready);
579
if let Some(vector) = irqs.fixed {
580
let do_interrupt = {
581
let mut apic = self.apics[vcpu_id].lock();
582
match apic.get_mp_state() {
583
MPState::Runnable | MPState::Halted => {
584
// APIC interrupts should only be injectable when the MPState is
585
// Halted or Runnable.
586
apic.set_mp_state(&MPState::Runnable);
587
true
588
}
589
s => {
590
// This shouldn't happen, but log a helpful error if it does.
591
error!("Interrupt cannot be injected while in state: {:?}", s);
592
false
593
}
594
}
595
};
596
597
if do_interrupt {
598
vcpu.interrupt(vector)?;
599
}
600
}
601
for _ in 0..irqs.nmis {
602
let prev_state = self.apics[vcpu_id].lock().get_mp_state();
603
vcpu.inject_nmi()?;
604
self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
605
info!(
606
"Delivered NMI to cpu {}, mp_state was {:?}, now is {:?}",
607
vcpu_id,
608
prev_state,
609
MPState::Runnable
610
);
611
}
612
if irqs.init {
613
{
614
let mut apic = self.apics[vcpu_id].lock();
615
apic.load_reset_state();
616
apic.set_mp_state(&MPState::InitReceived);
617
}
618
info!("Delivered INIT IPI to cpu {}", vcpu_id);
619
}
620
if let Some(vector) = irqs.startup {
621
// If our state is not MPState::InitReceived then this is probably
622
// the second SIPI in the INIT-SIPI-SIPI sequence; ignore.
623
if self.apics[vcpu_id].lock().get_mp_state() == MPState::InitReceived {
624
self.deliver_startup(vcpu, vector)?;
625
self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
626
info!("Delivered SIPI to cpu {}", vcpu_id);
627
}
628
}
629
630
let needs_window = pic_needs_window || irqs.needs_window;
631
vcpu.set_interrupt_window_requested(needs_window);
632
633
Ok(())
634
}
635
636
/// Notifies the irq chip that the specified VCPU has executed a halt instruction.
637
/// For `UserspaceIrqChip`, it sets the APIC's mp_state to `MPState::Halted`.
638
fn halted(&self, vcpu_id: usize) {
639
self.apics[vcpu_id].lock().set_mp_state(&MPState::Halted)
640
}
641
642
/// Blocks until `vcpu` is in a runnable state or until interrupted by
643
/// `IrqChip::kick_halted_vcpus`. Returns `VcpuRunState::Runnable if vcpu is runnable, or
644
/// `VcpuRunState::Interrupted` if the wait was interrupted.
645
/// For `UserspaceIrqChip`, if the APIC isn't `MPState::Runnable`, sleep until there are new
646
/// interrupts pending on the APIC, inject the interrupts, and go back to sleep if still not
647
/// runnable.
648
fn wait_until_runnable(&self, vcpu: &dyn Vcpu) -> Result<VcpuRunState> {
649
let vcpu_id = vcpu.id();
650
let waiter = &self.waiters[vcpu_id];
651
let mut interrupted_lock = waiter.mtx.lock();
652
loop {
653
if *interrupted_lock {
654
*interrupted_lock = false;
655
info!("wait_until_runnable interrupted on cpu {}", vcpu_id);
656
return Ok(VcpuRunState::Interrupted);
657
}
658
if self.is_runnable(vcpu_id) {
659
return Ok(VcpuRunState::Runnable);
660
}
661
662
self.inject_interrupts(vcpu)?;
663
if self.is_runnable(vcpu_id) {
664
return Ok(VcpuRunState::Runnable);
665
}
666
interrupted_lock = waiter.cvar.wait(interrupted_lock);
667
}
668
}
669
670
/// Makes unrunnable VCPUs return immediately from `wait_until_runnable`.
671
/// For UserspaceIrqChip, every vcpu gets kicked so its current or next call to
672
/// `wait_until_runnable` will immediately return false. After that one kick, subsequent
673
/// `wait_until_runnable` calls go back to waiting for runnability normally.
674
fn kick_halted_vcpus(&self) {
675
for waiter in self.waiters.iter() {
676
waiter.set_and_notify(/* interrupted= */ true);
677
}
678
}
679
680
fn get_mp_state(&self, vcpu_id: usize) -> Result<MPState> {
681
Ok(self.apics[vcpu_id].lock().get_mp_state())
682
}
683
684
fn set_mp_state(&mut self, vcpu_id: usize, state: &MPState) -> Result<()> {
685
self.apics[vcpu_id].lock().set_mp_state(state);
686
Ok(())
687
}
688
689
fn try_clone(&self) -> Result<Self> {
690
// kill_evts and timer_descriptors don't change, so they could be a plain Vec with each
691
// element cloned. But the Arc<Mutex> avoids a quadratic number of open descriptors from
692
// cloning, and those fields aren't performance critical.
693
Ok(UserspaceIrqChip {
694
vcpus: self.vcpus.clone(),
695
waiters: self.waiters.clone(),
696
routes: self.routes.clone(),
697
pit: self.pit.clone(),
698
pic: self.pic.clone(),
699
ioapic: self.ioapic.clone(),
700
ioapic_pins: self.ioapic_pins,
701
apics: self.apics.clone(),
702
timer_descriptors: self.timer_descriptors.clone(),
703
delayed_ioapic_irq_events: self.delayed_ioapic_irq_events.clone(),
704
irq_events: self.irq_events.clone(),
705
dropper: self.dropper.clone(),
706
activated: self.activated,
707
})
708
}
709
710
// TODO(srichman): factor out UserspaceIrqChip and KvmSplitIrqChip::finalize_devices
711
fn finalize_devices(
712
&mut self,
713
resources: &mut SystemAllocator,
714
io_bus: &Bus,
715
mmio_bus: &Bus,
716
) -> Result<()> {
717
// Insert pit into io_bus
718
io_bus.insert(self.pit.clone(), 0x040, 0x8).unwrap();
719
io_bus.insert(self.pit.clone(), 0x061, 0x1).unwrap();
720
721
// Insert pic into io_bus
722
io_bus.insert(self.pic.clone(), 0x20, 0x2).unwrap();
723
io_bus.insert(self.pic.clone(), 0xa0, 0x2).unwrap();
724
io_bus.insert(self.pic.clone(), 0x4d0, 0x2).unwrap();
725
726
// Insert ioapic into mmio_bus
727
mmio_bus
728
.insert(
729
self.ioapic.clone(),
730
IOAPIC_BASE_ADDRESS,
731
IOAPIC_MEM_LENGTH_BYTES,
732
)
733
.unwrap();
734
735
// Insert self into mmio_bus for handling APIC mmio
736
mmio_bus
737
.insert_sync(
738
Arc::new(self.try_clone()?),
739
APIC_BASE_ADDRESS,
740
APIC_MEM_LENGTH_BYTES,
741
)
742
.unwrap();
743
744
// At this point, all of our devices have been created and they have registered their
745
// irq events, so we can clone our resample events
746
let mut ioapic_resample_events: Vec<Vec<Event>> =
747
(0..self.ioapic_pins).map(|_| Vec::new()).collect();
748
let mut pic_resample_events: Vec<Vec<Event>> =
749
(0..self.ioapic_pins).map(|_| Vec::new()).collect();
750
751
for evt in self.irq_events.lock().iter().flatten() {
752
if (evt.gsi as usize) >= self.ioapic_pins {
753
continue;
754
}
755
if let Some(resample_evt) = &evt.resample_event {
756
ioapic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
757
pic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
758
}
759
}
760
761
// Register resample events with the ioapic
762
self.ioapic
763
.lock()
764
.register_resample_events(ioapic_resample_events);
765
// Register resample events with the pic
766
self.pic
767
.lock()
768
.register_resample_events(pic_resample_events);
769
770
// Make sure all future irq numbers are >= self.ioapic_pins
771
let mut irq_num = resources.allocate_irq().unwrap();
772
while irq_num < self.ioapic_pins as u32 {
773
irq_num = resources.allocate_irq().unwrap();
774
}
775
776
// Spawn timer threads here instead of in new(), in case crosvm is in sandbox mode.
777
self.activated = true;
778
let _ = self.wake();
779
780
Ok(())
781
}
782
783
/// The UserspaceIrqChip's ioapic may be locked because a vcpu thread is currently writing to
784
/// the ioapic, and the ioapic may be blocking on adding MSI routes, which requires blocking
785
/// tube communication back to the main thread. Thus, we do not want the main thread to
786
/// block on a locked ioapic, so any irqs that could not be serviced because the ioapic could
787
/// not be immediately locked are added to the delayed_ioapic_irq_events Vec. This function
788
/// processes each delayed event in the vec each time it's called. If the ioapic is still
789
/// locked, we keep the queued irqs for the next time this function is called.
790
fn process_delayed_irq_events(&mut self) -> Result<()> {
791
let irq_events = self.irq_events.lock();
792
let mut delayed_events = self.delayed_ioapic_irq_events.lock();
793
delayed_events.events.retain(|&event_index| {
794
if let Some(evt) = &irq_events[event_index] {
795
if let Ok(mut ioapic) = self.ioapic.try_lock() {
796
if evt.resample_event.is_some() {
797
ioapic.service_irq(evt.gsi as usize, true);
798
} else {
799
ioapic.service_irq(evt.gsi as usize, true);
800
ioapic.service_irq(evt.gsi as usize, false);
801
}
802
803
false
804
} else {
805
true
806
}
807
} else {
808
true
809
}
810
});
811
812
if delayed_events.events.is_empty() {
813
delayed_events.trigger.wait()?;
814
}
815
Ok(())
816
}
817
818
fn irq_delayed_event_token(&self) -> Result<Option<Event>> {
819
Ok(Some(
820
self.delayed_ioapic_irq_events.lock().trigger.try_clone()?,
821
))
822
}
823
824
fn check_capability(&self, c: IrqChipCap) -> bool {
825
match c {
826
IrqChipCap::TscDeadlineTimer => false,
827
IrqChipCap::X2Apic => false,
828
IrqChipCap::MpStateGetSet => true,
829
}
830
}
831
}
832
833
impl<V: VcpuX86_64 + 'static> BusDevice for UserspaceIrqChip<V> {
834
fn debug_label(&self) -> String {
835
"UserspaceIrqChip APIC".to_string()
836
}
837
fn device_id(&self) -> DeviceId {
838
PlatformDeviceId::UserspaceIrqChip.into()
839
}
840
}
841
842
impl<V: VcpuX86_64 + 'static> Suspendable for UserspaceIrqChip<V> {
    /// Stops the shared timer worker threads via the `Dropper`. Because the dropper is shared by
    /// all clones of this chip, this affects every clone.
    fn sleep(&mut self) -> anyhow::Result<()> {
        let mut dropper = self.dropper.lock();
        dropper.sleep()
    }

    /// Spawns one timer worker thread per vcpu, but only if `finalize_devices` has already run
    /// (`self.activated`); before activation this is a no-op.
    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            // create workers and run them.
            let mut dropper = self.dropper.lock();
            for (i, descriptor) in self.timer_descriptors.iter().enumerate() {
                // Each worker watches one APIC timer descriptor and forwards expirations to the
                // matching apic/vcpu/waiter (all indexed by vcpu id `i`).
                let mut worker = TimerWorker {
                    id: i,
                    apic: self.apics[i].clone(),
                    descriptor: *descriptor,
                    vcpus: self.vcpus.clone(),
                    waiter: self.waiters[i].clone(),
                };
                let worker_thread =
                    WorkerThread::start(format!("UserspaceIrqChip timer worker {i}"), move |evt| {
                        if let Err(e) = worker.run(evt) {
                            error!("UserspaceIrqChip worker failed: {e:#}");
                        }
                    });
                // Stored in the dropper so sleep()/drop stops them.
                dropper.workers.push(worker_thread);
            }
        }
        Ok(())
    }
}
872
873
impl<V: VcpuX86_64 + 'static> BusDeviceSync for UserspaceIrqChip<V> {
874
fn read(&self, info: BusAccessInfo, data: &mut [u8]) {
875
self.apics[info.id].lock().read(info.offset, data)
876
}
877
fn write(&self, info: BusAccessInfo, data: &[u8]) {
878
let msg = self.apics[info.id].lock().write(info.offset, data);
879
if let Some(m) = msg {
880
self.handle_msg(m);
881
}
882
}
883
}
884
885
impl<V: VcpuX86_64 + 'static> IrqChipX86_64 for UserspaceIrqChip<V> {
    fn try_box_clone(&self) -> Result<Box<dyn IrqChipX86_64>> {
        Ok(Box::new(self.try_clone()?))
    }

    fn as_irq_chip(&self) -> &dyn IrqChip {
        self
    }

    fn as_irq_chip_mut(&mut self) -> &mut dyn IrqChip {
        self
    }

    /// Gets the state of the emulated PIC (primary or secondary per `select`).
    fn get_pic_state(&self, select: PicSelect) -> Result<PicState> {
        Ok(self.pic.lock().get_pic_state(select))
    }

    /// Sets the state of the emulated PIC (primary or secondary per `select`).
    fn set_pic_state(&mut self, select: PicSelect, state: &PicState) -> Result<()> {
        self.pic.lock().set_pic_state(select, state);
        Ok(())
    }

    /// Gets the state of the emulated IOAPIC.
    fn get_ioapic_state(&self) -> Result<IoapicState> {
        Ok(self.ioapic.lock().get_ioapic_state())
    }

    /// Sets the state of the emulated IOAPIC.
    fn set_ioapic_state(&mut self, state: &IoapicState) -> Result<()> {
        self.ioapic.lock().set_ioapic_state(state);
        Ok(())
    }

    /// Gets the state of the local APIC belonging to `vcpu_id`.
    fn get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState> {
        Ok(self.apics[vcpu_id].lock().get_state())
    }

    /// Sets the state of the local APIC belonging to `vcpu_id`.
    fn set_lapic_state(&mut self, vcpu_id: usize, state: &LapicState) -> Result<()> {
        self.apics[vcpu_id].lock().set_state(state);
        Ok(())
    }

    /// Get the lapic frequency in Hz
    fn lapic_frequency(&self) -> u32 {
        Apic::frequency()
    }

    /// Gets the state of the emulated PIT.
    fn get_pit(&self) -> Result<PitState> {
        Ok(self.pit.lock().get_pit_state())
    }

    /// Sets the state of the emulated PIT.
    fn set_pit(&mut self, state: &PitState) -> Result<()> {
        self.pit.lock().set_pit_state(state);
        Ok(())
    }

    /// Returns true if the PIT uses port 0x61 for the PC speaker, false if 0x61 is unused.
    /// devices::Pit uses 0x61.
    fn pit_uses_speaker_port(&self) -> bool {
        true
    }

    // Snapshot/restore of the userspace irqchip is not implemented; both return an error.
    fn snapshot_chip_specific(&self) -> anyhow::Result<AnySnapshot> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }

    fn restore_chip_specific(&mut self, _data: AnySnapshot) -> anyhow::Result<()> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }
}
952
953
/// Condition variable used by `UserspaceIrqChip::wait_until_runnable`.
#[derive(Default)]
struct Waiter {
    // mtx stores an "interrupted" bool that's true if `kick_halted_vcpus` has been called.
    mtx: Mutex<bool>,
    // Signaled by `notify`/`set_and_notify` to wake a vcpu thread blocked in
    // `wait_until_runnable`.
    cvar: Condvar,
}
960
961
impl Waiter {
962
/// Wakes up `wait_until_runnable` to recheck the interrupted flag and vcpu runnable state.
963
pub fn notify(&self) {
964
let _lock = self.mtx.lock();
965
self.cvar.notify_all();
966
}
967
968
/// Sets the interrupted flag, and wakes up `wait_until_runnable` to recheck the interrupted
969
/// flag and vcpu runnable state. If `interrupted` is true, then `wait_until_runnable` should
970
/// stop waiting for a runnable vcpu and return immediately.
971
pub fn set_and_notify(&self, interrupted: bool) {
972
let mut interrupted_lock = self.mtx.lock();
973
*interrupted_lock = interrupted;
974
self.cvar.notify_all();
975
}
976
}
977
978
/// Worker thread for polling timer events and sending them to an APIC.
struct TimerWorker<V: VcpuX86_64> {
    // Vcpu index this worker serves; used to index `vcpus`.
    id: usize,
    // The APIC whose timer expirations this worker handles.
    apic: Arc<Mutex<Apic>>,
    // All vcpus; only the entry at `id` is used, to request an interrupt window.
    vcpus: Arc<Mutex<Vec<Option<V>>>>,
    // Raw descriptor polled for timer expirations (the APIC timer's descriptor — the APIC owns
    // the timer itself and outlives this copy).
    descriptor: Descriptor,
    // Waiter notified after each expiration so a halted vcpu rechecks runnability.
    waiter: Arc<Waiter>,
}
986
987
impl<V: VcpuX86_64> TimerWorker<V> {
    /// Event loop: waits on the timer descriptor and `kill_evt`. On each timer expiration,
    /// forwards it to the APIC, requests an interrupt window on the matching vcpu, and notifies
    /// the vcpu's waiter. Returns when `kill_evt` is signaled.
    fn run(&mut self, kill_evt: Event) -> TimerWorkerResult<()> {
        #[derive(EventToken)]
        enum Token {
            // The timer expired.
            TimerExpire,
            // The parent thread requested an exit.
            Kill,
        }

        let wait_ctx: WaitContext<Token> = WaitContext::build_with(&[
            (&self.descriptor, Token::TimerExpire),
            (&kill_evt, Token::Kill),
        ])
        .map_err(TimerWorkerError::CreateWaitContext)?;

        loop {
            let events = wait_ctx.wait().map_err(TimerWorkerError::WaitError)?;
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::TimerExpire => {
                        self.apic.lock().handle_timer_expiration();
                        // The vcpu may not have been added yet, hence the nested Option.
                        if let Some(Some(vcpu)) = self.vcpus.lock().get(self.id) {
                            vcpu.set_interrupt_window_requested(true);
                        }
                        // Wake the vcpu if it is blocked in wait_until_runnable.
                        self.waiter.notify();
                    }
                    Token::Kill => return Ok(()),
                }
            }
        }
    }
}
1020
1021
/// Errors that can terminate a `TimerWorker`'s event loop.
#[derive(Debug)]
enum TimerWorkerError {
    /// Creating WaitContext failed.
    CreateWaitContext(Error),
    /// Error while waiting for events.
    WaitError(Error),
}
1028
1029
impl Display for TimerWorkerError {
1030
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1031
use self::TimerWorkerError::*;
1032
1033
match self {
1034
CreateWaitContext(e) => write!(f, "failed to create event context: {e}"),
1035
WaitError(e) => write!(f, "failed to wait for events: {e}"),
1036
}
1037
}
1038
}
1039
1040
// Marker impl so TimerWorkerError can be used as a standard error (Display/Debug above provide
// the messages).
impl std::error::Error for TimerWorkerError {}

/// Result alias used by `TimerWorker::run`.
type TimerWorkerResult<T> = std::result::Result<T, TimerWorkerError>;
1043
1044