// Source file: devices/src/virtio/vhost_user_backend/gpu.rs
// Copyright 2021 The ChromiumOS Authors1// Use of this source code is governed by a BSD-style license that can be2// found in the LICENSE file.34pub mod sys;56use std::cell::RefCell;7use std::rc::Rc;8use std::sync::Arc;910use anyhow::anyhow;11use anyhow::bail;12use anyhow::Context;13use base::error;14use base::warn;15use base::Tube;16use cros_async::EventAsync;17use cros_async::Executor;18use cros_async::TaskHandle;19use futures::FutureExt;20use futures::StreamExt;21use snapshot::AnySnapshot;22use sync::Mutex;23pub use sys::run_gpu_device;24pub use sys::Options;25use vm_memory::GuestMemory;26use vmm_vhost::message::VhostUserProtocolFeatures;27use vmm_vhost::VHOST_USER_F_PROTOCOL_FEATURES;2829use crate::virtio::device_constants::gpu::NUM_QUEUES;30use crate::virtio::gpu;31use crate::virtio::gpu::QueueReader;32use crate::virtio::vhost_user_backend::handler::Error as DeviceError;33use crate::virtio::vhost_user_backend::handler::VhostBackendReqConnection;34use crate::virtio::vhost_user_backend::handler::VhostUserDevice;35use crate::virtio::vhost_user_backend::handler::WorkerState;36use crate::virtio::DescriptorChain;37use crate::virtio::Gpu;38use crate::virtio::Queue;39use crate::virtio::SharedMemoryMapper;40use crate::virtio::SharedMemoryRegion;41use crate::virtio::VirtioDevice;4243const MAX_QUEUE_NUM: usize = NUM_QUEUES;4445#[derive(Clone)]46struct SharedReader {47queue: Arc<Mutex<Queue>>,48}4950impl gpu::QueueReader for SharedReader {51fn pop(&self) -> Option<DescriptorChain> {52self.queue.lock().pop()53}5455fn add_used(&self, desc_chain: DescriptorChain, len: u32) {56self.queue57.lock()58.add_used_with_bytes_written(desc_chain, len)59}6061fn signal_used(&self) {62self.queue.lock().trigger_interrupt();63}64}6566async fn run_ctrl_queue(67reader: SharedReader,68mem: GuestMemory,69kick_evt: EventAsync,70state: Rc<RefCell<gpu::Frontend>>,71) {72loop {73if let Err(e) = kick_evt.next_val().await {74error!("Failed to read kick event for ctrl queue: {}", 
e);75break;76}7778let mut state = state.borrow_mut();79let needs_interrupt = state.process_queue(&mem, &reader);8081if needs_interrupt {82reader.signal_used();83}84}85}8687struct GpuBackend {88ex: Executor,89gpu: Rc<RefCell<Gpu>>,90resource_bridges: Arc<Mutex<Vec<Tube>>>,91state: Option<Rc<RefCell<gpu::Frontend>>>,92fence_state: Arc<Mutex<gpu::FenceState>>,93queue_workers: [Option<WorkerState<Arc<Mutex<Queue>>, ()>>; MAX_QUEUE_NUM],94// In the downstream, we may add platform workers after start_platform_workers returns.95platform_worker_tx: futures::channel::mpsc::UnboundedSender<TaskHandle<()>>,96platform_worker_rx: futures::channel::mpsc::UnboundedReceiver<TaskHandle<()>>,97shmem_mapper: Arc<Mutex<Option<Box<dyn SharedMemoryMapper>>>>,98}99100impl GpuBackend {101fn stop_non_queue_workers(&mut self) -> anyhow::Result<()> {102self.ex103.run_until(async {104while let Some(Some(handle)) = self.platform_worker_rx.next().now_or_never() {105handle.cancel().await;106}107})108.context("stopping the non-queue workers for GPU")?;109Ok(())110}111}112113impl VhostUserDevice for GpuBackend {114fn max_queue_num(&self) -> usize {115MAX_QUEUE_NUM116}117118fn features(&self) -> u64 {119self.gpu.borrow().features() | 1 << VHOST_USER_F_PROTOCOL_FEATURES120}121122fn ack_features(&mut self, value: u64) -> anyhow::Result<()> {123self.gpu.borrow_mut().ack_features(value);124Ok(())125}126127fn protocol_features(&self) -> VhostUserProtocolFeatures {128VhostUserProtocolFeatures::CONFIG129| VhostUserProtocolFeatures::BACKEND_REQ130| VhostUserProtocolFeatures::MQ131| VhostUserProtocolFeatures::SHMEM_MAP132| VhostUserProtocolFeatures::DEVICE_STATE133}134135fn read_config(&self, offset: u64, dst: &mut [u8]) {136self.gpu.borrow().read_config(offset, dst)137}138139fn write_config(&self, offset: u64, data: &[u8]) {140self.gpu.borrow_mut().write_config(offset, data)141}142143fn start_queue(&mut self, idx: usize, queue: Queue, mem: GuestMemory) -> anyhow::Result<()> {144if 
self.queue_workers[idx].is_some() {145warn!("Starting new queue handler without stopping old handler");146self.stop_queue(idx)?;147}148149let doorbell = queue.interrupt().clone();150151// Create a refcounted queue. The GPU control queue uses a SharedReader which allows us to152// handle fences in the RutabagaFenceHandler, and also handle queue messages in153// `run_ctrl_queue`.154// For the cursor queue, we still create the refcounted queue to support retrieving the155// queue for snapshotting (but don't handle any messages).156let queue = Arc::new(Mutex::new(queue));157158// Spawn a worker for the queue.159let queue_task = match idx {1600 => {161// Set up worker for the control queue.162let kick_evt = queue163.lock()164.event()165.try_clone()166.context("failed to clone queue event")?;167let kick_evt = EventAsync::new(kick_evt, &self.ex)168.context("failed to create EventAsync for kick_evt")?;169let reader = SharedReader {170queue: queue.clone(),171};172173let state = if let Some(s) = self.state.as_ref() {174s.clone()175} else {176let fence_handler_resources =177Arc::new(Mutex::new(Some(gpu::FenceHandlerActivationResources {178mem: mem.clone(),179ctrl_queue: reader.clone(),180})));181let fence_handler = gpu::create_fence_handler(182fence_handler_resources,183self.fence_state.clone(),184);185186let state = Rc::new(RefCell::new(187self.gpu188.borrow_mut()189.initialize_frontend(190self.fence_state.clone(),191fence_handler,192Arc::clone(&self.shmem_mapper),193)194.ok_or_else(|| anyhow!("failed to initialize gpu frontend"))?,195));196self.state = Some(state.clone());197state198};199200// Start handling platform-specific workers.201self.start_platform_workers(doorbell)?;202203// Start handling the control queue.204self.ex205.spawn_local(run_ctrl_queue(reader, mem, kick_evt, state))206}2071 => {208// For the cursor queue, spawn an empty worker, as we don't process it at all.209// We don't handle the cursor queue because no current users of vhost-user GPU pass210// any 
messages on it.211self.ex.spawn_local(async {})212}213_ => bail!("attempted to start unknown queue: {}", idx),214};215216self.queue_workers[idx] = Some(WorkerState { queue_task, queue });217Ok(())218}219220fn stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue> {221if let Some(worker) = self.queue_workers.get_mut(idx).and_then(Option::take) {222// Wait for queue_task to be aborted.223let _ = self.ex.run_until(worker.queue_task.cancel());224225if idx == 0 {226// Stop the non-queue workers if this is the control queue (where we start them).227self.stop_non_queue_workers()?;228229// After we stop all workers, we have only one reference left to self.state.230// Clearing it allows the GPU state to be destroyed, which gets rid of the231// remaining control queue reference from RutabagaFenceHandler.232// This allows our worker.queue to be recovered as it has no further references.233self.state = None;234}235236let queue = match Arc::try_unwrap(worker.queue) {237Ok(queue_mutex) => queue_mutex.into_inner(),238Err(_) => panic!("failed to recover queue from worker"),239};240241Ok(queue)242} else {243Err(anyhow::Error::new(DeviceError::WorkerNotFound))244}245}246247fn enter_suspended_state(&mut self) -> anyhow::Result<()> {248self.stop_non_queue_workers()?;249Ok(())250}251252fn reset(&mut self) {253self.stop_non_queue_workers()254.expect("Failed to stop platform workers.");255256for queue_num in 0..self.max_queue_num() {257// The cursor queue is never used, so we should check if the queue is set before258// stopping.259if self.queue_workers[queue_num].is_some() {260if let Err(e) = self.stop_queue(queue_num) {261error!("Failed to stop_queue during reset: {}", e);262}263}264}265}266267fn get_shared_memory_region(&self) -> Option<SharedMemoryRegion> {268self.gpu.borrow().get_shared_memory_region()269}270271fn set_backend_req_connection(&mut self, conn: VhostBackendReqConnection) {272if 
self273.shmem_mapper274.lock()275.replace(conn.shmem_mapper().unwrap())276.is_some()277{278warn!("Connection already established. Overwriting shmem_mapper");279}280}281282fn snapshot(&mut self) -> anyhow::Result<AnySnapshot> {283// TODO(b/289431114): Snapshot more fields if needed. Right now we just need a bare bones284// snapshot of the GPU to create a POC.285AnySnapshot::to_any(())286}287288fn restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {289let () = AnySnapshot::from_any(data)?;290Ok(())291}292}293294impl Drop for GpuBackend {295fn drop(&mut self) {296// Workers are detached and will leak unless they are aborted. Aborting marks the297// Abortable task, then wakes it up. This means the executor should be asked to continue298// running for one more step after the backend is destroyed.299self.reset();300}301}302303304