GitHub Repository: google/crosvm
Path: blob/main/devices/src/virtio/vhost_user_backend/gpu.rs
// Copyright 2021 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

pub mod sys;

use std::cell::RefCell;
use std::rc::Rc;
use std::sync::Arc;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use base::error;
use base::warn;
use base::Tube;
use cros_async::EventAsync;
use cros_async::Executor;
use cros_async::TaskHandle;
use futures::FutureExt;
use futures::StreamExt;
use snapshot::AnySnapshot;
use sync::Mutex;
pub use sys::run_gpu_device;
pub use sys::Options;
use vm_memory::GuestMemory;
use vmm_vhost::message::VhostUserProtocolFeatures;
use vmm_vhost::VHOST_USER_F_PROTOCOL_FEATURES;

use crate::virtio::device_constants::gpu::NUM_QUEUES;
use crate::virtio::gpu;
use crate::virtio::gpu::QueueReader;
use crate::virtio::vhost_user_backend::handler::Error as DeviceError;
use crate::virtio::vhost_user_backend::handler::VhostBackendReqConnection;
use crate::virtio::vhost_user_backend::handler::VhostUserDevice;
use crate::virtio::vhost_user_backend::handler::WorkerState;
use crate::virtio::DescriptorChain;
use crate::virtio::Gpu;
use crate::virtio::Queue;
use crate::virtio::SharedMemoryMapper;
use crate::virtio::SharedMemoryRegion;
use crate::virtio::VirtioDevice;

const MAX_QUEUE_NUM: usize = NUM_QUEUES;

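/// Adapter that lets multiple owners read from and complete descriptors on the
/// same virtio queue. Both the control-queue worker (`run_ctrl_queue`) and the
/// fence handler need to push used descriptors, so the queue is wrapped in an
/// `Arc<Mutex<_>>` and each consumer holds a cheap `Clone` of this reader.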
#[derive(Clone)]
struct SharedReader {
    queue: Arc<Mutex<Queue>>,
}

impl gpu::QueueReader for SharedReader {
    fn pop(&self) -> Option<DescriptorChain> {
        self.queue.lock().pop()
    }

    fn add_used(&self, desc_chain: DescriptorChain, len: u32) {
        self.queue
            .lock()
            .add_used_with_bytes_written(desc_chain, len)
    }

    fn signal_used(&self) {
        self.queue.lock().trigger_interrupt();
    }
}

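/// Services the GPU control queue: waits for the guest's kick event, drains
/// the queue through `gpu::Frontend::process_queue`, and raises an interrupt
/// if any descriptor was completed. The loop exits if reading the kick event
/// fails.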
async fn run_ctrl_queue(
    reader: SharedReader,
    mem: GuestMemory,
    kick_evt: EventAsync,
    state: Rc<RefCell<gpu::Frontend>>,
) {
    loop {
        if let Err(e) = kick_evt.next_val().await {
            error!("Failed to read kick event for ctrl queue: {}", e);
            break;
        }

        let mut state = state.borrow_mut();
        let needs_interrupt = state.process_queue(&mem, &reader);

        if needs_interrupt {
            reader.signal_used();
        }
    }
}

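/// vhost-user GPU backend state. `state` (the `gpu::Frontend`) is created
/// lazily when the control queue is first started, `queue_workers` tracks the
/// per-queue worker tasks, and the `platform_worker_tx`/`platform_worker_rx`
/// channel collects platform-specific worker handles so they can be cancelled
/// on stop/reset.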
struct GpuBackend {
    ex: Executor,
    gpu: Rc<RefCell<Gpu>>,
    resource_bridges: Arc<Mutex<Vec<Tube>>>,
    state: Option<Rc<RefCell<gpu::Frontend>>>,
    fence_state: Arc<Mutex<gpu::FenceState>>,
    queue_workers: [Option<WorkerState<Arc<Mutex<Queue>>, ()>>; MAX_QUEUE_NUM],
    // In the downstream, we may add platform workers after start_platform_workers returns.
    platform_worker_tx: futures::channel::mpsc::UnboundedSender<TaskHandle<()>>,
    platform_worker_rx: futures::channel::mpsc::UnboundedReceiver<TaskHandle<()>>,
    shmem_mapper: Arc<Mutex<Option<Box<dyn SharedMemoryMapper>>>>,
}

impl GpuBackend {
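    // Drains the platform-worker channel without blocking: `now_or_never()`
    // polls `next()` exactly once, so the loop only consumes handles that are
    // already buffered and returns as soon as the channel is empty rather
    // than awaiting new ones. A minimal sketch of the pattern (illustrative
    // only; uses the `futures` items already imported above):
    //
    //     let (tx, mut rx) = futures::channel::mpsc::unbounded::<u32>();
    //     tx.unbounded_send(1).unwrap();
    //     while let Some(Some(v)) = rx.next().now_or_never() {
    //         // Each `v` was already queued; an empty channel yields `None`
    //         // from `now_or_never` immediately instead of blocking.
    //         let _ = v;
    //     }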
    fn stop_non_queue_workers(&mut self) -> anyhow::Result<()> {
        self.ex
            .run_until(async {
                while let Some(Some(handle)) = self.platform_worker_rx.next().now_or_never() {
                    handle.cancel().await;
                }
            })
            .context("stopping the non-queue workers for GPU")?;
        Ok(())
    }
}

impl VhostUserDevice for GpuBackend {
    fn max_queue_num(&self) -> usize {
        MAX_QUEUE_NUM
    }

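    // `features()` reports the device's virtio feature bits plus
    // VHOST_USER_F_PROTOCOL_FEATURES (bit 30 in the vhost-user spec), which
    // tells the frontend that the protocol feature set below can be
    // negotiated.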
    fn features(&self) -> u64 {
        self.gpu.borrow().features() | 1 << VHOST_USER_F_PROTOCOL_FEATURES
    }

    fn ack_features(&mut self, value: u64) -> anyhow::Result<()> {
        self.gpu.borrow_mut().ack_features(value);
        Ok(())
    }

    fn protocol_features(&self) -> VhostUserProtocolFeatures {
        VhostUserProtocolFeatures::CONFIG
            | VhostUserProtocolFeatures::BACKEND_REQ
            | VhostUserProtocolFeatures::MQ
            | VhostUserProtocolFeatures::SHMEM_MAP
            | VhostUserProtocolFeatures::DEVICE_STATE
    }

    fn read_config(&self, offset: u64, dst: &mut [u8]) {
        self.gpu.borrow().read_config(offset, dst)
    }

    fn write_config(&self, offset: u64, data: &[u8]) {
        self.gpu.borrow_mut().write_config(offset, data)
    }

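    // Queue indices follow the virtio-gpu layout: index 0 is the control
    // queue and index 1 is the cursor queue; any other index is rejected
    // below.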
    fn start_queue(&mut self, idx: usize, queue: Queue, mem: GuestMemory) -> anyhow::Result<()> {
        if self.queue_workers[idx].is_some() {
            warn!("Starting new queue handler without stopping old handler");
            self.stop_queue(idx)?;
        }

        let doorbell = queue.interrupt().clone();

        // Create a refcounted queue. The GPU control queue uses a SharedReader which allows us to
        // handle fences in the RutabagaFenceHandler, and also handle queue messages in
        // `run_ctrl_queue`.
        // For the cursor queue, we still create the refcounted queue to support retrieving the
        // queue for snapshotting (but don't handle any messages).
        let queue = Arc::new(Mutex::new(queue));

        // Spawn a worker for the queue.
        let queue_task = match idx {
            0 => {
                // Set up worker for the control queue.
                let kick_evt = queue
                    .lock()
                    .event()
                    .try_clone()
                    .context("failed to clone queue event")?;
                let kick_evt = EventAsync::new(kick_evt, &self.ex)
                    .context("failed to create EventAsync for kick_evt")?;
                let reader = SharedReader {
                    queue: queue.clone(),
                };

                let state = if let Some(s) = self.state.as_ref() {
                    s.clone()
                } else {
                    let fence_handler_resources =
                        Arc::new(Mutex::new(Some(gpu::FenceHandlerActivationResources {
                            mem: mem.clone(),
                            ctrl_queue: reader.clone(),
                        })));
                    let fence_handler = gpu::create_fence_handler(
                        fence_handler_resources,
                        self.fence_state.clone(),
                    );

                    let state = Rc::new(RefCell::new(
                        self.gpu
                            .borrow_mut()
                            .initialize_frontend(
                                self.fence_state.clone(),
                                fence_handler,
                                Arc::clone(&self.shmem_mapper),
                            )
                            .ok_or_else(|| anyhow!("failed to initialize gpu frontend"))?,
                    ));
                    self.state = Some(state.clone());
                    state
                };

                // Start handling platform-specific workers.
                self.start_platform_workers(doorbell)?;

                // Start handling the control queue.
                self.ex
                    .spawn_local(run_ctrl_queue(reader, mem, kick_evt, state))
            }
            1 => {
                // For the cursor queue, spawn an empty worker, as we don't process it at all.
                // We don't handle the cursor queue because no current users of vhost-user GPU pass
                // any messages on it.
                self.ex.spawn_local(async {})
            }
            _ => bail!("attempted to start unknown queue: {}", idx),
        };

        self.queue_workers[idx] = Some(WorkerState { queue_task, queue });
        Ok(())
    }

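    // `stop_queue` must hand the `Queue` back by value. The worker shares it
    // behind `Arc<Mutex<_>>`, so after the task is cancelled (and, for the
    // control queue, `self.state` is cleared to drop the fence handler's
    // clone) exactly one strong reference remains and `Arc::try_unwrap` can
    // reclaim it. A std-only sketch of the recovery pattern (illustrative,
    // not part of this file):
    //
    //     use std::sync::{Arc, Mutex};
    //     let shared = Arc::new(Mutex::new(String::from("queue")));
    //     // ...all clones handed to workers must be dropped first...
    //     let inner = Arc::try_unwrap(shared)
    //         .expect("other references still alive")
    //         .into_inner()
    //         .unwrap(); // std Mutex::into_inner returns a Result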
    fn stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue> {
        if let Some(worker) = self.queue_workers.get_mut(idx).and_then(Option::take) {
            // Wait for queue_task to be aborted.
            let _ = self.ex.run_until(worker.queue_task.cancel());

            if idx == 0 {
                // Stop the non-queue workers if this is the control queue (where we start them).
                self.stop_non_queue_workers()?;

                // After we stop all workers, we have only one reference left to self.state.
                // Clearing it allows the GPU state to be destroyed, which gets rid of the
                // remaining control queue reference from RutabagaFenceHandler.
                // This allows our worker.queue to be recovered as it has no further references.
                self.state = None;
            }

            let queue = match Arc::try_unwrap(worker.queue) {
                Ok(queue_mutex) => queue_mutex.into_inner(),
                Err(_) => panic!("failed to recover queue from worker"),
            };

            Ok(queue)
        } else {
            Err(anyhow::Error::new(DeviceError::WorkerNotFound))
        }
    }

    fn enter_suspended_state(&mut self) -> anyhow::Result<()> {
        self.stop_non_queue_workers()?;
        Ok(())
    }

    fn reset(&mut self) {
        self.stop_non_queue_workers()
            .expect("Failed to stop platform workers.");

        for queue_num in 0..self.max_queue_num() {
            // The cursor queue is never used, so we should check if the queue is set before
            // stopping.
            if self.queue_workers[queue_num].is_some() {
                if let Err(e) = self.stop_queue(queue_num) {
                    error!("Failed to stop_queue during reset: {}", e);
                }
            }
        }
    }

    fn get_shared_memory_region(&self) -> Option<SharedMemoryRegion> {
        self.gpu.borrow().get_shared_memory_region()
    }

    fn set_backend_req_connection(&mut self, conn: VhostBackendReqConnection) {
        if self
            .shmem_mapper
            .lock()
            .replace(conn.shmem_mapper().unwrap())
            .is_some()
        {
            warn!("Connection already established. Overwriting shmem_mapper");
        }
    }

    fn snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
        // TODO(b/289431114): Snapshot more fields if needed. Right now we just need a bare bones
        // snapshot of the GPU to create a POC.
        AnySnapshot::to_any(())
    }

    fn restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
        let () = AnySnapshot::from_any(data)?;
        Ok(())
    }
}

impl Drop for GpuBackend {
    fn drop(&mut self) {
        // Workers are detached and will leak unless they are aborted. Aborting marks the
        // Abortable task, then wakes it up. This means the executor should be asked to continue
        // running for one more step after the backend is destroyed.
        self.reset();
    }
}