CoCalc -- sparse_buffer

GitHub Repository: bevyengine/bevy
Path: blob/main/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs
30636 views
1
//! GPU buffers that support sparse updates if only a small number of elements
2
//! have changed.
3

4
use alloc::sync::{Arc, Weak};
5
use core::{
6
    iter, slice,
7
    sync::atomic::{AtomicU64, Ordering},
8
};
9

10
use bevy_app::{App, Plugin};
11
use bevy_asset::{embedded_asset, load_embedded_asset, Handle};
12
use bevy_derive::{Deref, DerefMut};
13
use bevy_ecs::{
14
    resource::Resource,
15
    schedule::IntoScheduleConfigs as _,
16
    system::{Res, ResMut},
17
    world::{FromWorld, World},
18
};
19
use bevy_log::{error, info};
20
use bevy_material::{
21
    bind_group_layout_entries::{
22
        binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer},
23
        BindGroupLayoutEntries,
24
    },
25
    descriptor::{BindGroupLayoutDescriptor, CachedComputePipelineId, ComputePipelineDescriptor},
26
};
27
use bevy_shader::Shader;
28
use bytemuck::{Pod, Zeroable};
29
use encase::ShaderType;
30
use weak_table::WeakKeyHashMap;
31
use wgpu::{BufferDescriptor, BufferUsages, ComputePassDescriptor, ShaderStages};
32

33
use crate::{
34
    diagnostic::{DiagnosticsRecorder, RecordDiagnostics as _},
35
    render_resource::{
36
        AtomicPod, BindGroup, BindGroupEntries, Buffer, PipelineCache, RawBufferVec,
37
        SpecializedComputePipeline, SpecializedComputePipelines, UniformBuffer,
38
    },
39
    renderer::{RenderDevice, RenderGraph, RenderGraphSystems, RenderQueue},
40
    ExtractSchedule, RenderApp,
41
};
42

43
/// A plugin that allows sparse updates of GPU buffers if only a small number of
44
/// elements have changed.
45
pub struct SparseBufferPlugin;
46

47
impl Plugin for SparseBufferPlugin {
48
    fn build(&self, app: &mut App) {
49
        embedded_asset!(app, "sparse_buffer_update.wgsl");
50
    }
51

52
    fn finish(&self, app: &mut App) {
53
        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
54
            return;
55
        };
56

57
        render_app
58
            .init_resource::<SparseBufferUpdateJobs>()
59
            .init_resource::<SparseBufferUpdatePipelines>()
60
            .init_resource::<SpecializedComputePipelines<SparseBufferUpdatePipelines>>()
61
            .init_resource::<SparseBufferUpdateBindGroups>()
62
            .add_systems(ExtractSchedule, clear_sparse_buffer_jobs)
63
            .add_systems(
64
                RenderGraph,
65
                // We perform sparse buffer updates very early so that sparse
66
                // buffers can be used in any render pass.
67
                update_sparse_buffers.in_set(RenderGraphSystems::Begin),
68
            );
69
    }
70
}
71

72
/// A globally-unique ID that identifies this sparse buffer.
///
/// IDs are allocated sequentially from [`NEXT_SPARSE_BUFFER_ID`] when an
/// [`AtomicSparseBufferVec`] is created.
73
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Deref, DerefMut)]
74
pub struct SparseBufferId(pub u64);
75

76
/// An object that allows the sparse buffer ID to be queried and keeps the bind
77
/// group for that sparse buffer alive.
78
///
79
/// Each sparse buffer holds a strong reference to this handle, and the
80
/// [`SparseBufferUpdateBindGroups`] resource contains a weak map from this
81
/// handle to the bind group. This setup ensures that, when the sparse buffer is
82
/// freed, the bind groups for that sparse buffer are freed as well.
83
pub type SparseBufferHandle = Arc<SparseBufferId>;
84

85
/// The next sparse buffer ID to be assigned.
///
/// Incremented (with relaxed ordering) every time an
/// [`AtomicSparseBufferVec`] is constructed.
86
static NEXT_SPARSE_BUFFER_ID: AtomicU64 = AtomicU64::new(0);
87

88
/// The size of a single workgroup in the sparse buffer shader.
///
/// NOTE(review): presumably this has to match the `@workgroup_size` declared
/// in `sparse_buffer_update.wgsl` — confirm against the shader.
89
const SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE: u32 = 256;
90

91
/// The fraction of the buffer that may be changed before we fall back to full
92
/// reupload.
93
///
94
/// This is set to 15% by default. This was obtained experimentally by testing
95
/// very large scenes and roughly matches the values used by other engines that
96
/// perform sparse buffer updates.
97
const SPARSE_UPLOAD_THRESHOLD: f64 = 0.15;
98

99
/// The WebGPU limit on the number of workgroups that can be dispatched.
///
/// Sparse updates only dispatch along the X dimension, so this bounds the
/// number of words a single sparse update may touch to
/// `MAX_WORKGROUPS * SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE`.
100
const MAX_WORKGROUPS: u32 = 65535;
101

102
/// We round all allocations up to the nearest power of this.
103
const REALLOCATION_FACTOR: f64 = 1.5;
104
/// We round all allocations up to the nearest multiple of this.
105
const REALLOCATION_SIZE_MULTIPLE: usize = 256;
106

107
/// The number of dirty-page bits packed into each [`AtomicU64`] word.
108
const PAGES_PER_DIRTY_WORD: u32 = 64;
109

110
/// Pipelines for the sparse buffer update shader.
111
///
112
/// This shader is shared among all sparse buffer vectors.
///
/// Both fields are `None` when the render device exposes fewer than three
/// storage buffers per shader stage, in which case sparse updates are
/// disabled.
113
#[derive(Resource)]
114
pub struct SparseBufferUpdatePipelines {
115
    /// The bind group layout.
116
    ///
117
    /// We only have one bind group layout shared among all sparse buffer
118
    /// vectors.
119
    bind_group_layout: Option<BindGroupLayoutDescriptor>,
120
    /// The shader that performs the scatter operation.
121
    shader: Option<Handle<Shader>>,
122
}
123

124
/// A resource, part of the render world, that stores the bind groups for each
125
/// sparse buffer.
126
#[derive(Resource)]
127
pub struct SparseBufferUpdateBindGroups {
128
    /// The bind groups for each sparse buffer.
129
    ///
130
    /// These are stored in a weak map so that when the sparse buffer goes away,
131
    /// the bind group for that buffer goes away as well.
132
    bind_groups: WeakKeyHashMap<Weak<SparseBufferId>, SparseBufferUpdateBindGroup>,
133
    /// The ID of the update shader pipeline shared among all sparse buffers.
    ///
    /// This is obtained once, when the resource is created, by specializing
    /// [`SparseBufferUpdatePipelines`] with the unit key.
134
    pipeline_id: CachedComputePipelineId,
135
}
136

137
/// A single bind group for the sparse buffer update shader.
///
/// The update pass binds this at group index 0.
138
pub struct SparseBufferUpdateBindGroup {
139
    /// The actual bind group.
140
    bind_group: BindGroup,
141
}
142

143
/// A resource, part of the render world, that stores all pending sparse updates
144
/// to buffers.
///
/// The list is emptied by `clear_sparse_buffer_jobs` during the extract
/// schedule and consumed by `update_sparse_buffers`, which records one
/// compute pass per job.
145
#[derive(Resource, Default, Deref, DerefMut)]
146
pub struct SparseBufferUpdateJobs(pub Vec<SparseBufferUpdateJob>);
147

148
/// Describes a sparse update operation for a buffer.
149
pub struct SparseBufferUpdateJob {
150
    /// A handle to the buffer to be updated.
151
    sparse_buffer_handle: SparseBufferHandle,
152
    /// The number of pages to update.
153
    updated_page_count: u32,
154
    /// The base-2 logarithm of the size of a page for the buffer.
155
    ///
156
    /// The actual page size can be computed as `1 << page_size_log2`.
157
    page_size_log2: u32,
158
    /// The size of each element in 32-bit words.
159
    element_word_size: u32,
160
    /// A debugging label for the buffer.
161
    label: Arc<str>,
162
}
163

164
impl SparseBufferUpdateJob {
165
    /// The number of elements per page.
166
    fn page_size(&self) -> u32 {
167
        1 << self.page_size_log2
168
    }
169

170
    /// Calculates the number of words that need to be updated.
171
    fn words_to_update(&self) -> u32 {
172
        self.updated_page_count * self.page_size() * self.element_word_size
173
    }
174

175
    /// Calculates the number of workgroups that need to be dispatched.
176
    fn workgroup_count(&self) -> u32 {
177
        self.words_to_update()
178
            .div_ceil(SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE)
179
    }
180
}
181

182
/// A GPU type that describes a sparse update that is to be performed.
///
/// This is bound as the uniform at binding 3 of the sparse buffer update bind
/// group layout.
183
#[derive(Clone, Copy, Default, ShaderType, Pod, Zeroable)]
184
#[repr(C)]
185
struct GpuSparseBufferUpdateMetadata {
186
    /// The size of a single element in 32-bit words.
187
    element_size: u32,
188
    /// The number of pages that need to be updated.
    ///
    /// This is refreshed from the staging buffers right before each sparse
    /// upload is written.
189
    updated_page_count: u32,
190
    /// The base-2 logarithm of the page size.
191
    ///
192
    /// That is, the page size is `1 << page_size_log2`.
193
    page_size_log2: u32,
194
}
195

196
/// A system, part of the render graph, that performs sparse buffer updates to
197
/// buffers for which only a small number of elements have changed.
198
///
199
/// This runs as early in the pipeline as possible so that sparse buffers can be
200
/// used for any subsequent pass.
201
fn update_sparse_buffers(
202
    sparse_buffer_update_jobs: Res<SparseBufferUpdateJobs>,
203
    sparse_buffer_update_bind_groups: Res<SparseBufferUpdateBindGroups>,
204
    pipeline_cache: Res<PipelineCache>,
205
    mut diagnostics: Option<ResMut<DiagnosticsRecorder>>,
206
    render_device: Res<RenderDevice>,
207
    render_queue: Res<RenderQueue>,
208
) {
209
    // Bail if we have nothing to do.
210
    if sparse_buffer_update_jobs.is_empty() {
211
        return;
212
    }
213

214
    // We need to create a command encoder since this pass isn't associated with
215
    // a view.
216
    let mut command_encoder =
217
        render_device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
218
            label: Some("sparse buffer update"),
219
        });
220

221
    let time_span = diagnostics
222
        .as_mut()
223
        .map(|diagnostics| diagnostics.time_span(&mut command_encoder, "sparse buffer update"));
224

225
    command_encoder.push_debug_group("sparse buffer update");
226

227
    let Some(compute_pipeline) =
228
        pipeline_cache.get_compute_pipeline(sparse_buffer_update_bind_groups.pipeline_id)
229
    else {
230
        return;
231
    };
232

233
    // Process each sparse buffer update job.
234
    for sparse_buffer_update_job in sparse_buffer_update_jobs.iter() {
235
        let Some(sparse_buffer_update_bind_group) = sparse_buffer_update_bind_groups
236
            .bind_groups
237
            .get(&sparse_buffer_update_job.sparse_buffer_handle)
238
        else {
239
            continue;
240
        };
241

242
        let mut sparse_buffer_update_pass =
243
            command_encoder.begin_compute_pass(&ComputePassDescriptor {
244
                label: Some(&*format!(
245
                    "sparse buffer update ({})",
246
                    &sparse_buffer_update_job.label
247
                )),
248
                timestamp_writes: None,
249
            });
250
        sparse_buffer_update_pass.set_pipeline(compute_pipeline);
251
        sparse_buffer_update_pass.set_bind_group(
252
            0,
253
            &sparse_buffer_update_bind_group.bind_group,
254
            &[],
255
        );
256
        sparse_buffer_update_pass.dispatch_workgroups(
257
            sparse_buffer_update_job.workgroup_count(),
258
            1,
259
            1,
260
        );
261
    }
262

263
    command_encoder.pop_debug_group();
264
    if let Some(time_span) = time_span {
265
        time_span.end(&mut command_encoder);
266
    }
267

268
    render_queue.submit([command_encoder.finish()]);
269
}
270

271
/// A system that clears out the sparse buffer update jobs in preparation for a
272
/// new frame.
273
fn clear_sparse_buffer_jobs(mut sparse_buffer_update_jobs: ResMut<SparseBufferUpdateJobs>) {
274
    sparse_buffer_update_jobs.clear();
275
}
276

277
impl FromWorld for SparseBufferUpdatePipelines {
278
    fn from_world(world: &mut World) -> Self {
279
        let render_device = world.resource::<RenderDevice>();
280
        let limit = render_device.limits().max_storage_buffers_per_shader_stage;
281

282
        if limit < 3 {
283
            info!(
284
                "Sparse buffer updates disabled. RenderDevice lacks support: max_storage_buffers_per_shader_stage ({}) < 3.",
285
                limit
286
            );
287

288
            return SparseBufferUpdatePipelines {
289
                bind_group_layout: None,
290
                shader: None,
291
            };
292
        }
293

294
        let bind_group_layout = BindGroupLayoutDescriptor::new(
295
            "sparse buffer update bind group layout",
296
            &BindGroupLayoutEntries::sequential(
297
                ShaderStages::COMPUTE,
298
                (
299
                    // @group(0) @binding(0) var<storage, read_write> dest_buffer: array<u32>;
300
                    storage_buffer::<u32>(false),
301
                    // @group(0) @binding(1) var<storage> src_buffer: array<u32>;
302
                    storage_buffer_read_only::<u32>(false),
303
                    // @group(0) @binding(2) var<storage> indices: array<u32>;
304
                    storage_buffer_read_only::<u32>(false),
305
                    // @group(0) @binding(3) var<uniform> metadata:
306
                    // SparseBufferUpdateMetadata;
307
                    uniform_buffer::<GpuSparseBufferUpdateMetadata>(false),
308
                ),
309
            ),
310
        );
311

312
        SparseBufferUpdatePipelines {
313
            bind_group_layout: Some(bind_group_layout),
314
            shader: Some(load_embedded_asset!(world, "sparse_buffer_update.wgsl")),
315
        }
316
    }
317
}
318

319
impl SpecializedComputePipeline for SparseBufferUpdatePipelines {
320
    type Key = ();
321

322
    fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor {
323
        ComputePipelineDescriptor {
324
            label: Some("sparse buffer update pipeline".into()),
325
            layout: self.bind_group_layout.clone().into_iter().collect(),
326
            shader: self.shader.clone().unwrap_or_default(),
327
            shader_defs: vec![],
328
            ..ComputePipelineDescriptor::default()
329
        }
330
    }
331
}
332

333
/// The buffers that we use to sparsely scatter new data to the GPU.
334
///
335
/// There's one such set of buffers per sparse buffer vector.
336
struct SparseBufferStagingBuffers {
337
    /// All pages that have changed and need to be updated.
338
    source_data: RawBufferVec<u32>,
339

340
    /// The index at which we write each page in [`Self::source_data`].
341
    ///
342
    /// The length of this buffer is equal to [`Self::source_data`] divided by
343
    /// 2^[`Self::page_size_log2`].
344
    indices: RawBufferVec<u32>,
345

346
    /// The size of each element in 32-bit words.
347
    element_word_size: u32,
348

349
    /// The base-2 logarithm of the page size in elements.
350
    ///
351
    /// That is, the page size in elements is `1 << page_size_log2`.
352
    page_size_log2: u32,
353
}
354

355
impl SparseBufferStagingBuffers {
356
    /// The number of elements per page.
357
    fn page_size(&self) -> usize {
358
        1 << self.page_size_log2
359
    }
360

361
    /// Creates a new set of staging buffers for a sparse buffer vector.
362
    fn new(label: &str, element_word_size: u32, page_size_log2: u32) -> SparseBufferStagingBuffers {
363
        let mut source_data_buffer =
364
            RawBufferVec::new(BufferUsages::COPY_DST | BufferUsages::STORAGE);
365
        source_data_buffer.set_label(Some(&*format!("{} staging buffer", label)));
366

367
        let mut indices_buffer = RawBufferVec::new(BufferUsages::COPY_DST | BufferUsages::STORAGE);
368
        indices_buffer.set_label(Some(&*format!("{} index buffer", label)));
369

370
        SparseBufferStagingBuffers {
371
            source_data: source_data_buffer,
372
            indices: indices_buffer,
373
            element_word_size,
374
            page_size_log2,
375
        }
376
    }
377

378
    /// Returns the number of updated pages.
379
    fn updated_page_count(&self) -> u32 {
380
        // Note that we don't have to round up here because data is always
381
        // uploaded in increments of a whole page.
382
        let element_count = self.source_data.len() / self.element_word_size as usize;
383
        (element_count / self.page_size()) as u32
384
    }
385

386
    /// Writes the buffers that contain all the data necessary to perform a
387
    /// sparse upload to the GPU.
388
    ///
389
    /// This includes the buffer associated with the supplied
390
    /// `metadata_uniform`.
391
    fn write_buffers(
392
        &mut self,
393
        metadata_uniform: &mut UniformBuffer<GpuSparseBufferUpdateMetadata>,
394
        render_device: &RenderDevice,
395
        render_queue: &RenderQueue,
396
    ) {
397
        metadata_uniform.get_mut().updated_page_count = self.updated_page_count();
398
        metadata_uniform.write_buffer(render_device, render_queue);
399

400
        self.source_data.write_buffer(render_device, render_queue);
401
        self.indices.write_buffer(render_device, render_queue);
402
    }
403

404
    /// Returns true if a sparse buffer update should *not* be performed because
405
    /// too many words changed.
406
    fn should_perform_full_reupload(&self, changed_page_count: u32, buffer_length: usize) -> bool {
407
        // Calculate the number of changed words. If it's greater than the
408
        // maximum number of workgroups as defined by `wgpu`, we must perform a
409
        // full reupload.
410
        let total_changed_word_count =
411
            changed_page_count * self.page_size() as u32 * self.element_word_size;
412
        if total_changed_word_count > MAX_WORKGROUPS * SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE {
413
            return true;
414
        }
415

416
        // Don't perform a sparse upload if too many words changed, as it'll end
417
        // up being slower than just uploading the whole buffer afresh.
418
        let sparse_upload_fraction =
419
            changed_page_count as f64 / buffer_length.div_ceil(self.page_size()) as f64;
420
        sparse_upload_fraction > SPARSE_UPLOAD_THRESHOLD
421
    }
422
}
423

424
/// A GPU buffer that can grow, can be updated atomically from multiple threads
425
/// on the CPU, and is sparsely updated on the GPU if only a small number of
426
/// elements have changed.
427
///
428
/// This type is similar to
429
/// [`crate::render_resource::buffer_vec::AtomicRawBufferVec`], but instead of
430
/// reuploading the entire buffer to the GPU when it's changed, it tracks
431
/// changes on a per-page level and uploads only the pages that changed if the
432
/// number of such pages is small. It uses a compute shader to scatter the
433
/// changed pages.
434
///
435
/// As the stored data is [`AtomicPod`], multiple threads may update the buffer
436
/// simultaneously. Note that, like
437
/// [`crate::render_resource::buffer_vec::AtomicRawBufferVec`], only existing
438
/// elements may be updated from multiple threads; new data still requires
439
/// exclusive access.
440
///
441
/// `T` must have a size that's a multiple of 4.
442
pub struct AtomicSparseBufferVec<T>
443
where
444
    T: AtomicPod,
445
{
446
    /// An ID that uniquely identifies this [`AtomicSparseBufferVec`].
447
    handle: SparseBufferHandle,
448
    /// The underlying values.
449
    ///
450
    /// These are stored as their blob representation to allow for thread-safe
451
    /// update.
452
    values: Vec<T::Blob>,
453
    /// The GPU buffer, if allocated.
454
    data_buffer: Option<Buffer>,
455
    /// The GPU buffers that data is copied into in preparation to be scattered
456
    /// to the [`Self::data_buffer`].
457
    staging_buffers: SparseBufferStagingBuffers,
458
    /// A GPU buffer that stores information such as the element size and stride
459
    /// that's needed to perform sparse updates.
460
    metadata_uniform: UniformBuffer<GpuSparseBufferUpdateMetadata>,
461
    /// The capacity of the GPU buffer in elements.
462
    capacity: usize,
463
    /// The allowed `wgpu` buffer usages for the GPU buffer.
464
    buffer_usages: BufferUsages,
465
    /// An optional debug label to identify this buffer.
466
    label: Arc<str>,
467
    /// A bit set of dirty pages.
468
    ///
469
    /// The size of this vector in bits is the number of elements divided by the
470
    /// page size, rounded up. A 1 in a bit indicates that the page has changed
471
    /// since the last upload, while a 0 indicates that the page hasn't changed.
472
    dirty_pages: Vec<AtomicU64>,
473
    /// True if the entire buffer needs to be reuploaded because it resized.
474
    needs_full_reupload: bool,
475
    /// True if a sparse update is to be performed.
476
    sparse_update_scheduled: bool,
477
}
478

479
impl<T> AtomicSparseBufferVec<T>
480
where
481
    T: AtomicPod,
482
{
483
    /// The number of elements per page.
484
    fn page_size(&self) -> u32 {
485
        1 << self.staging_buffers.page_size_log2
486
    }
487

488
    /// Creates a new [`AtomicSparseBufferVec`] with the given set of buffer
489
    /// usages, page size, and label.
490
    ///
491
    /// `buffer_usages` specifies the set of allowed `wgpu` buffer usages for
492
    /// the buffer that [`AtomicSparseBufferVec`] manages.
493
    /// `BufferUsages::COPY_DST` is automatically added to this set.
494
    ///
495
    /// The `page_size_log2` parameter is the base-2 logarithm of the page size.
496
    /// That is, the page size is `1 << page_size_log2`.
497
    pub fn new(buffer_usages: BufferUsages, page_size_log2: u32, label: Arc<str>) -> Self {
498
        // Make sure the value is word-aligned.
499
        debug_assert_eq!(size_of::<T>() % 4, 0);
500
        let element_word_size = size_of::<T>() / 4;
501

502
        // Create a unique ID.
503
        let id = Arc::new(SparseBufferId(
504
            NEXT_SPARSE_BUFFER_ID.fetch_add(1, Ordering::Relaxed),
505
        ));
506

507
        Self {
508
            handle: id,
509
            values: vec![],
510
            data_buffer: None,
511
            staging_buffers: SparseBufferStagingBuffers::new(
512
                &label,
513
                element_word_size as u32,
514
                page_size_log2,
515
            ),
516
            metadata_uniform: UniformBuffer::from(GpuSparseBufferUpdateMetadata::new::<T>(
517
                page_size_log2,
518
            )),
519
            capacity: 0,
520
            buffer_usages: buffer_usages | BufferUsages::COPY_DST,
521
            label,
522
            dirty_pages: vec![],
523
            needs_full_reupload: false,
524
            sparse_update_scheduled: false,
525
        }
526
    }
527

528
    /// Returns the number of elements in the CPU side copy of the buffer.
529
    pub fn len(&self) -> u32 {
530
        self.values.len() as u32
531
    }
532

533
    /// Returns true if there are no elements in the CPU side copy of the buffer.
534
    pub fn is_empty(&self) -> bool {
535
        self.values.is_empty()
536
    }
537

538
    /// Returns a handle to the buffer, if the data has been uploaded.
539
    pub fn buffer(&self) -> Option<&Buffer> {
540
        self.data_buffer.as_ref()
541
    }
542

543
    /// Removes all elements from the buffer.
544
    pub fn clear(&mut self) {
545
        self.truncate(0);
546
    }
547

548
    /// Copies a value out of the buffer.
549
    pub fn get(&self, index: u32) -> T {
550
        T::read_from_blob(&self.values[index as usize])
551
    }
552

553
    /// Sets the value at the given index.
554
    ///
555
    /// If the index isn't in range of the buffer, this method panics.
556
    ///
557
    /// Internally, the value is converted to its blob representation.
558
    ///
559
    /// Note that this method is thread-safe and doesn't require `&mut self`.
560
    /// It's your responsibility, however, to ensure synchronization; though
561
    /// this method is memory-safe, it's possible for other threads to observe
562
    /// partially-overwritten values if [`Self::get`] or similar methods are
563
    /// called while the write operation is occurring.
564
    pub fn set(&self, index: u32, value: T) {
565
        value.write_to_blob(&self.values[index as usize]);
566
        self.note_changed_index(index);
567
    }
568

569
    /// Adds a new value and returns its index.
570
    pub fn push(&mut self, value: T) -> u32 {
571
        let index = self.values.len() as u32;
572
        self.values.push(T::Blob::default());
573
        value.write_to_blob(&self.values[index as usize]);
574

575
        let page_word = (self.index_to_page(index) / PAGES_PER_DIRTY_WORD) as usize;
576
        while self.dirty_pages.len() < page_word + 1 {
577
            self.dirty_pages.push(AtomicU64::default());
578
        }
579
        self.note_changed_index(index);
580

581
        index
582
    }
583

584
    /// Marks the page corresponding to the given element index as dirty so that
585
    /// we know that we need to upload it.
586
    fn note_changed_index(&self, index: u32) {
587
        let page = self.index_to_page(index);
588
        let (page_word, page_in_word) = (page / PAGES_PER_DIRTY_WORD, page % PAGES_PER_DIRTY_WORD);
589
        self.dirty_pages[page_word as usize].fetch_or(1 << page_in_word, Ordering::Relaxed);
590
    }
591

592
    /// Returns the page corresponding to the given element index.
593
    fn index_to_page(&self, index: u32) -> u32 {
594
        index / self.page_size()
595
    }
596

597
    /// Ensures that the backing buffer for this buffer vector is present and
598
    /// appropriately sized on the GPU.
599
    pub fn reserve(&mut self, new_capacity: usize, render_device: &RenderDevice) {
600
        reserve(
601
            new_capacity,
602
            &mut self.capacity,
603
            &self.label,
604
            &mut self.data_buffer,
605
            self.buffer_usages,
606
            &mut self.needs_full_reupload,
607
            size_of::<T::Blob>(),
608
            render_device,
609
        );
610
    }
611

612
    /// Grows the buffer by adding default values so that it's at least the
613
    /// given size.
614
    ///
615
    /// If the buffer is already large enough, this method does nothing.
616
    pub fn grow(&mut self, new_len: u32) {
617
        let old_len = self.values.len() as u32;
618
        if old_len >= new_len {
619
            return;
620
        }
621

622
        self.values.reserve(new_len as usize - old_len as usize);
623
        self.values.resize_with(new_len as usize, T::Blob::default);
624

625
        // This is a bit tricky. We want to set the dirty bits corresponding to
626
        // all pages that we added, if any. First, we compute the index of the
627
        // last page word before the append operation.
628
        let old_final_page = self.index_to_page(old_len);
629
        let old_final_page_word_index = old_final_page / PAGES_PER_DIRTY_WORD;
630
        let old_final_page_in_word = old_final_page % PAGES_PER_DIRTY_WORD;
631

632
        // Next, we set the bits corresponding to every page that we added to
633
        // that final page word. Note that this might set bits corresponding to
634
        // pages past the end of our buffer; that's OK as we ignore them.
635
        if old_final_page_in_word != 0
636
            && let Some(ref mut old_final_atomic_page_word) =
637
                self.dirty_pages.get_mut(old_final_page_word_index as usize)
638
        {
639
            *old_final_atomic_page_word.get_mut() |= !((1u64 << old_final_page_in_word) - 1);
640
        }
641

642
        // Finally, we add any new page words, with all bits set.
643
        let new_page_count = self.index_to_page(new_len);
644
        self.dirty_pages.resize_with(
645
            (new_page_count as usize).div_ceil(PAGES_PER_DIRTY_WORD as usize),
646
            || AtomicU64::new(u64::MAX),
647
        );
648
    }
649

650
    /// Truncates the buffer to the given length.
651
    ///
652
    /// If the buffer is already that length or shorter, this method does
653
    /// nothing.
654
    pub fn truncate(&mut self, len: u32) {
655
        self.values.truncate(len as usize);
656

657
        let page = self.index_to_page(len);
658
        self.dirty_pages
659
            .truncate(page.div_ceil(PAGES_PER_DIRTY_WORD) as usize);
660
    }
661

662
    /// Writes the data to the GPU, either via a sparse upload or a bulk data
663
    /// upload.
664
    pub fn write_buffers(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
665
        if self.values.is_empty() {
666
            return;
667
        }
668

669
        // Round up the size to a good value to balance reallocation frequency
670
        // against memory waste.
671
        let good_size = calculate_allocation_size(self.values.len());
672
        self.reserve(good_size, render_device);
673

674
        if self.should_perform_full_reupload(render_device) {
675
            self.write_entire_buffer(render_queue);
676
        } else {
677
            self.prepare_sparse_upload(render_device, render_queue);
678
        }
679
    }
680

681
    /// Returns true if the sparse buffer should perform a full reupload, either
682
    /// because it was resized or because too much data changed for a sparse
683
    /// update to be worthwhile.
684
    fn should_perform_full_reupload(&self, render_device: &RenderDevice) -> bool {
685
        if self.needs_full_reupload {
686
            return true;
687
        }
688

689
        if render_device.limits().max_storage_buffers_per_shader_stage < 3 {
690
            return true;
691
        }
692

693
        // Calculate the number of changed pages via population count.
694
        let changed_page_count: u32 = self
695
            .dirty_pages
696
            .iter()
697
            .map(|atomic_page_word| atomic_page_word.load(Ordering::Relaxed).count_ones())
698
            .sum();
699

700
        self.staging_buffers
701
            .should_perform_full_reupload(changed_page_count, self.values.len())
702
    }
703

704
    /// Writes the entire buffer in bulk.
705
    ///
706
    /// This is the method used when a sparse update is not used, either because
707
    /// the buffer resized or because too much data changed for a sparse update
708
    /// to be worthwhile.
709
    fn write_entire_buffer(&mut self, render_queue: &RenderQueue) {
710
        let Some(ref mut data_buffer) = self.data_buffer else {
711
            error!("Dirty sparse buffer should have created a data buffer by now");
712
            return;
713
        };
714

715
        // SAFETY: We're just writing atomic data to the GPU. The worst that
716
        // can happen is that we race with somebody, which is unfortunate
717
        // but not memory-unsafe.
718
        unsafe {
719
            render_queue.write_buffer(
720
                data_buffer,
721
                0,
722
                slice::from_raw_parts(
723
                    self.values.as_ptr().cast::<u8>(),
724
                    self.values.len() * size_of::<T::Blob>(),
725
                ),
726
            );
727
        }
728

729
        // Mark all pages as clean.
730
        for atomic_page_word in self.dirty_pages.iter() {
731
            atomic_page_word.store(0, Ordering::Relaxed);
732
        }
733
        self.sparse_update_scheduled = false;
734
    }
735

736
    /// Schedules a sparse upload of only the pages that changed.
737
    fn prepare_sparse_upload(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
738
        // Iterate over all dirty pages.
739
        for (page_word_index, atomic_page_word) in self.dirty_pages.iter().enumerate() {
740
            let page_word = atomic_page_word.load(Ordering::Relaxed);
741
            for page_index_in_word in BitIter::new(page_word) {
742
                let page = page_word_index as u32 * PAGES_PER_DIRTY_WORD + page_index_in_word;
743

744
                // Write the index of the page so the shader will know where to
745
                // scatter the data to.
746
                self.staging_buffers.indices.push(page);
747

748
                // Copy the page to the GPU staging buffer.
749
                let page_size = self.staging_buffers.page_size();
750
                let page_start = page as usize * page_size;
751
                let page_end = page_start + page_size;
752
                for value_index in page_start..page_end {
753
                    match self.values.get(value_index) {
754
                        Some(blob) => {
755
                            let value = T::read_from_blob(blob);
756
                            self.staging_buffers
757
                                .source_data
758
                                .extend(bytemuck::cast_slice(&[value]).iter().copied());
759
                        }
760
                        None => {
761
                            self.staging_buffers.source_data.extend(iter::repeat_n(
762
                                0,
763
                                self.staging_buffers.element_word_size as usize,
764
                            ));
765
                        }
766
                    }
767
                }
768

769
                // Make sure we're aligned up to a full page.
770
                debug_assert_eq!(
771
                    self.staging_buffers.source_data.len()
772
                        % (self.staging_buffers.element_word_size as usize
773
                            * self.staging_buffers.page_size()),
774
                    0
775
                );
776
            }
777

778
            // Mark the page as clean.
779
            atomic_page_word.store(0, Ordering::Relaxed);
780
        }
781

782
        // Schedule a sparse update if there was something to do.
783
        self.sparse_update_scheduled = !self.staging_buffers.source_data.is_empty();
784
        if self.sparse_update_scheduled {
785
            self.staging_buffers.write_buffers(
786
                &mut self.metadata_uniform,
787
                render_device,
788
                render_queue,
789
            );
790
        }
791
    }
792

793
    /// If a sparse update has been scheduled, prepares all GPU resources
794
    /// necessary to perform a sparse buffer update, other than updating the
795
    /// metadata uniform.
796
    pub fn prepare_to_populate_buffers(
797
        &mut self,
798
        render_device: &RenderDevice,
799
        pipeline_cache: &PipelineCache,
800
        sparse_buffer_update_jobs: &mut SparseBufferUpdateJobs,
801
        sparse_buffer_update_bind_groups: &mut SparseBufferUpdateBindGroups,
802
        sparse_buffer_update_pipelines: &SparseBufferUpdatePipelines,
803
    ) {
804
        if self.sparse_update_scheduled {
805
            match (&self.data_buffer, self.metadata_uniform.buffer()) {
806
                (Some(data_buffer), Some(metadata_buffer)) => {
807
                    prepare_to_populate_buffers(
808
                        self.handle.clone(),
809
                        &self.label,
810
                        data_buffer,
811
                        &mut self.staging_buffers,
812
                        metadata_buffer,
813
                        render_device,
814
                        pipeline_cache,
815
                        sparse_buffer_update_jobs,
816
                        sparse_buffer_update_bind_groups,
817
                        sparse_buffer_update_pipelines,
818
                    );
819
                }
820
                _ => {
821
                    error!("Buffers should have been created by now");
822
                }
823
            }
824
        }
825

826
        // Clear out the staging buffers, now that we know the data is already
827
        // on the GPU.
828
        self.staging_buffers.source_data.clear();
829
        self.staging_buffers.indices.clear();
830

831
        // Reset the `needs_full_reupload` and `needs_sparse_update` flags.
832
        self.needs_full_reupload = false;
833
        self.sparse_update_scheduled = false;
834
    }
835
}
836

837
impl FromWorld for SparseBufferUpdateBindGroups {
838
    fn from_world(world: &mut World) -> Self {
839
        world.resource_scope::<SpecializedComputePipelines<SparseBufferUpdatePipelines>, _>(
840
            |world, mut specialized_sparse_buffer_update_pipelines| {
841
                let pipeline_cache = world.resource::<PipelineCache>();
842
                let sparse_buffer_update_pipelines =
843
                    world.resource::<SparseBufferUpdatePipelines>();
844
                let pipeline_id = specialized_sparse_buffer_update_pipelines.specialize(
845
                    pipeline_cache,
846
                    sparse_buffer_update_pipelines,
847
                    (),
848
                );
849

850
                SparseBufferUpdateBindGroups {
851
                    bind_groups: WeakKeyHashMap::default(),
852
                    pipeline_id,
853
                }
854
            },
855
        )
856
    }
857
}
858

859
/// Prepares all GPU resources necessary to perform a sparse buffer update,
860
/// other than updating the metadata uniform.
861
///
862
/// This function creates the [`SparseBufferUpdateJob`] and ensures the bind
863
/// group and pipeline are up to date.
864
fn prepare_to_populate_buffers(
865
    sparse_buffer_handle: SparseBufferHandle,
866
    label: &Arc<str>,
867
    data_buffer: &Buffer,
868
    staging_buffers: &mut SparseBufferStagingBuffers,
869
    metadata_buffer: &Buffer,
870
    render_device: &RenderDevice,
871
    pipeline_cache: &PipelineCache,
872
    sparse_buffer_update_jobs: &mut SparseBufferUpdateJobs,
873
    sparse_buffer_update_bind_groups: &mut SparseBufferUpdateBindGroups,
874
    sparse_buffer_update_pipelines: &SparseBufferUpdatePipelines,
875
) {
876
    let (Some(source_data_staging_buffer), Some(indices_staging_buffer)) = (
877
        staging_buffers.source_data.buffer(),
878
        staging_buffers.indices.buffer(),
879
    ) else {
880
        error!("Staging buffers should have been created by now");
881
        return;
882
    };
883

884
    let Some(bind_group_layout) = &sparse_buffer_update_pipelines.bind_group_layout else {
885
        return;
886
    };
887

888
    // Record the update job.
889
    sparse_buffer_update_jobs.push(SparseBufferUpdateJob {
890
        sparse_buffer_handle: sparse_buffer_handle.clone(),
891
        page_size_log2: staging_buffers.page_size_log2,
892
        updated_page_count: staging_buffers.updated_page_count(),
893
        element_word_size: staging_buffers.element_word_size,
894
        label: (*label).clone(),
895
    });
896

897
    // Create the bind group.
898
    let bind_group = render_device.create_bind_group(
899
        Some(&*format!("{} bind group", label)),
900
        &pipeline_cache.get_bind_group_layout(bind_group_layout),
901
        &BindGroupEntries::sequential((
902
            // @group(0) @binding(0) var<storage, read_write> dest_buffer: array<u32>;
903
            data_buffer.as_entire_binding(),
904
            // @group(0) @binding(1) var<storage> src_buffer: array<u32>;
905
            source_data_staging_buffer.as_entire_binding(),
906
            // @group(0) @binding(2) var<storage> indices: array<u32>;
907
            indices_staging_buffer.as_entire_binding(),
908
            // @group(0) @binding(3) var<uniform> metadata:
909
            // SparseBufferUpdateMetadata;
910
            metadata_buffer.as_entire_binding(),
911
        )),
912
    );
913
    sparse_buffer_update_bind_groups.bind_groups.insert(
914
        sparse_buffer_handle,
915
        SparseBufferUpdateBindGroup { bind_group },
916
    );
917
}
918

919
/// Ensures that the backing buffer for an [`AtomicSparseBufferVec`] is present
920
/// on the GPU.
921
///
922
/// The `capacity`, `data_buffer`, and `needs_full_reupload` fields are updated
923
/// to reflect the new buffer.
924
fn reserve(
925
    new_capacity: usize,
926
    capacity: &mut usize,
927
    label: &str,
928
    data_buffer: &mut Option<Buffer>,
929
    buffer_usages: BufferUsages,
930
    needs_full_reupload: &mut bool,
931
    element_size: usize,
932
    render_device: &RenderDevice,
933
) {
934
    // If the buffer is already big enough, do nothing.
935
    if new_capacity == 0 || new_capacity <= *capacity {
936
        return;
937
    }
938

939
    *capacity = new_capacity;
940
    *data_buffer = Some(render_device.create_buffer(&BufferDescriptor {
941
        label: Some(label),
942
        size: element_size as u64 * new_capacity as u64,
943
        usage: buffer_usages,
944
        mapped_at_creation: false,
945
    }));
946

947
    // Since we resized the buffer, we need to reupload it.
948
    *needs_full_reupload = true;
949
}
950

951
impl GpuSparseBufferUpdateMetadata {
952
    /// Returns a new [`GpuSparseBufferUpdateMetadata`] for the given type and
953
    /// page size.
954
    fn new<T>(page_size_log2: u32) -> GpuSparseBufferUpdateMetadata {
955
        assert_eq!(size_of::<T>() % 4, 0);
956
        GpuSparseBufferUpdateMetadata {
957
            element_size: (size_of::<T>() / 4) as u32,
958
            updated_page_count: 0,
959
            page_size_log2,
960
        }
961
    }
962
}
963

964
/// An iterator over the positions of the set bits of a `u64`, yielded in order
/// from the least significant bit to the most significant bit.
///
/// The wrapped value holds the bits that have not been yielded yet.
struct BitIter(u64);

impl BitIter {
    /// Creates an iterator over the set bits of `bits`.
    fn new(bits: u64) -> BitIter {
        Self(bits)
    }
}

impl Iterator for BitIter {
    type Item = u32;

    /// Returns the index of the lowest remaining set bit and removes it, or
    /// `None` once no set bits are left.
    fn next(&mut self) -> Option<Self::Item> {
        let remaining = self.0;
        if remaining == 0 {
            return None;
        }

        let lowest_set_bit = remaining.trailing_zeros();
        // `x & (x - 1)` clears the lowest set bit of a nonzero `x`.
        self.0 = remaining & (remaining - 1);
        Some(lowest_set_bit)
    }
}
986

987
/// Calculates the size that a buffer should be in order to balance reallocation
988
/// frequency against memory waste.
989
fn calculate_allocation_size(length: usize) -> usize {
990
    let exponent = (length as f64).log(REALLOCATION_FACTOR).ceil();
991
    let size = REALLOCATION_FACTOR.powf(exponent) as usize;
992
    size.next_multiple_of(REALLOCATION_SIZE_MULTIPLE)
993
}
994

995
Product

Resources

Company