Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
bevyengine
GitHub Repository: bevyengine/bevy
Path: blob/main/crates/bevy_render/src/render_resource/sparse_buffer_vec.rs
30636 views
1
//! GPU buffers that support sparse updates if only a small number of elements
2
//! have changed.
3
4
use alloc::sync::{Arc, Weak};
5
use core::{
6
iter, slice,
7
sync::atomic::{AtomicU64, Ordering},
8
};
9
10
use bevy_app::{App, Plugin};
11
use bevy_asset::{embedded_asset, load_embedded_asset, Handle};
12
use bevy_derive::{Deref, DerefMut};
13
use bevy_ecs::{
14
resource::Resource,
15
schedule::IntoScheduleConfigs as _,
16
system::{Res, ResMut},
17
world::{FromWorld, World},
18
};
19
use bevy_log::{error, info};
20
use bevy_material::{
21
bind_group_layout_entries::{
22
binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer},
23
BindGroupLayoutEntries,
24
},
25
descriptor::{BindGroupLayoutDescriptor, CachedComputePipelineId, ComputePipelineDescriptor},
26
};
27
use bevy_shader::Shader;
28
use bytemuck::{Pod, Zeroable};
29
use encase::ShaderType;
30
use weak_table::WeakKeyHashMap;
31
use wgpu::{BufferDescriptor, BufferUsages, ComputePassDescriptor, ShaderStages};
32
33
use crate::{
34
diagnostic::{DiagnosticsRecorder, RecordDiagnostics as _},
35
render_resource::{
36
AtomicPod, BindGroup, BindGroupEntries, Buffer, PipelineCache, RawBufferVec,
37
SpecializedComputePipeline, SpecializedComputePipelines, UniformBuffer,
38
},
39
renderer::{RenderDevice, RenderGraph, RenderGraphSystems, RenderQueue},
40
ExtractSchedule, RenderApp,
41
};
42
43
/// A plugin that allows sparse updates of GPU buffers if only a small number of
44
/// elements have changed.
45
pub struct SparseBufferPlugin;
46
47
impl Plugin for SparseBufferPlugin {
48
fn build(&self, app: &mut App) {
49
embedded_asset!(app, "sparse_buffer_update.wgsl");
50
}
51
52
fn finish(&self, app: &mut App) {
53
let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
54
return;
55
};
56
57
render_app
58
.init_resource::<SparseBufferUpdateJobs>()
59
.init_resource::<SparseBufferUpdatePipelines>()
60
.init_resource::<SpecializedComputePipelines<SparseBufferUpdatePipelines>>()
61
.init_resource::<SparseBufferUpdateBindGroups>()
62
.add_systems(ExtractSchedule, clear_sparse_buffer_jobs)
63
.add_systems(
64
RenderGraph,
65
// We perform sparse buffer updates very early so that sparse
66
// buffers can be used in any render pass.
67
update_sparse_buffers.in_set(RenderGraphSystems::Begin),
68
);
69
}
70
}
71
72
/// A globally-unique ID that identifies a sparse buffer.
///
/// IDs are allocated from the [`NEXT_SPARSE_BUFFER_ID`] counter when an
/// [`AtomicSparseBufferVec`] is created.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Deref, DerefMut)]
pub struct SparseBufferId(pub u64);
75
76
/// An object that allows the sparse buffer ID to be queried and keeps the
/// bind group for that sparse buffer alive.
///
/// Each sparse buffer holds a strong reference to this handle, and the
/// [`SparseBufferUpdateBindGroups`] resource contains a weak map from this
/// handle to the bind group. This setup ensures that, when the sparse buffer is
/// freed, the bind groups for that sparse buffer are freed as well.
pub type SparseBufferHandle = Arc<SparseBufferId>;
84
85
/// The next sparse buffer ID to be assigned.
///
/// Incremented atomically each time a new sparse buffer is created.
static NEXT_SPARSE_BUFFER_ID: AtomicU64 = AtomicU64::new(0);

/// The size of a single workgroup in the sparse buffer shader.
///
/// Each dispatched workgroup is assumed to handle this many 32-bit words;
/// this must match the workgroup size declared in the shader (the shader is
/// not visible from this file — confirm when changing).
const SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE: u32 = 256;

/// The fraction of the buffer that may be changed before we fall back to full
/// reupload.
///
/// This is set to 15% by default. This was obtained experimentally by testing
/// very large scenes and roughly matches the values used by other engines that
/// perform sparse buffer updates.
const SPARSE_UPLOAD_THRESHOLD: f64 = 0.15;

/// The WebGPU limit on the number of workgroups that can be dispatched.
///
/// Sparse updates dispatch along a single dimension only, so this bounds the
/// number of words that a single sparse update can touch.
const MAX_WORKGROUPS: u32 = 65535;

/// We round all allocations up to the nearest power of this.
const REALLOCATION_FACTOR: f64 = 1.5;
/// We round all allocations up to the nearest multiple of this.
///
/// The unit is elements, as this is applied to the element count.
const REALLOCATION_SIZE_MULTIPLE: usize = 256;

/// The number of dirty-page bits packed into each [`AtomicU64`] word.
const PAGES_PER_DIRTY_WORD: u32 = 64;
109
110
/// Pipelines for the sparse buffer update shader.
///
/// This shader is shared among all sparse buffer vectors.
///
/// Both fields are `None` when the render device exposes fewer than three
/// storage buffers per shader stage, in which case sparse updates are
/// disabled.
#[derive(Resource)]
pub struct SparseBufferUpdatePipelines {
    /// The bind group layout.
    ///
    /// We only have one bind group layout shared among all sparse buffer
    /// vectors.
    bind_group_layout: Option<BindGroupLayoutDescriptor>,
    /// The shader that performs the scatter operation.
    shader: Option<Handle<Shader>>,
}
123
124
/// A resource, part of the render world, that stores the bind groups for each
/// sparse buffer.
#[derive(Resource)]
pub struct SparseBufferUpdateBindGroups {
    /// The bind groups for each sparse buffer.
    ///
    /// These are stored in a weak map so that when the sparse buffer goes away,
    /// the bind group for that buffer goes away as well.
    bind_groups: WeakKeyHashMap<Weak<SparseBufferId>, SparseBufferUpdateBindGroup>,
    /// The ID of the update shader pipeline shared among all sparse buffers.
    ///
    /// The pipeline behind this ID may not be available yet; consumers look
    /// it up in the pipeline cache and skip the update when it's missing.
    pipeline_id: CachedComputePipelineId,
}
136
137
/// A single bind group for the sparse buffer update shader.
///
/// It binds the destination buffer, the staging source data, the page
/// indices, and the metadata uniform of one sparse buffer.
pub struct SparseBufferUpdateBindGroup {
    /// The actual bind group.
    bind_group: BindGroup,
}
142
143
/// A resource, part of the render world, that stores all pending sparse updates
/// to buffers.
///
/// The list is emptied by `clear_sparse_buffer_jobs` and consumed by
/// `update_sparse_buffers`.
#[derive(Resource, Default, Deref, DerefMut)]
pub struct SparseBufferUpdateJobs(pub Vec<SparseBufferUpdateJob>);
147
148
/// Describes a sparse update operation for a buffer.
149
pub struct SparseBufferUpdateJob {
150
/// A handle to the buffer to be updated.
151
sparse_buffer_handle: SparseBufferHandle,
152
/// The number of pages to update.
153
updated_page_count: u32,
154
/// The base-2 logarithm of the size of a page for the buffer.
155
///
156
/// The actual page size can be computed as `1 << page_size_log2`.
157
page_size_log2: u32,
158
/// The size of each element in 32-bit words.
159
element_word_size: u32,
160
/// A debugging label for the buffer.
161
label: Arc<str>,
162
}
163
164
impl SparseBufferUpdateJob {
165
/// The number of elements per page.
166
fn page_size(&self) -> u32 {
167
1 << self.page_size_log2
168
}
169
170
/// Calculates the number of words that need to be updated.
171
fn words_to_update(&self) -> u32 {
172
self.updated_page_count * self.page_size() * self.element_word_size
173
}
174
175
/// Calculates the number of workgroups that need to be dispatched.
176
fn workgroup_count(&self) -> u32 {
177
self.words_to_update()
178
.div_ceil(SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE)
179
}
180
}
181
182
/// A GPU type that describes a sparse update that is to be performed.
///
/// This is uploaded as the uniform bound at `@binding(3)` of the sparse
/// buffer update bind group.
#[derive(Clone, Copy, Default, ShaderType, Pod, Zeroable)]
#[repr(C)]
struct GpuSparseBufferUpdateMetadata {
    /// The size of a single element in 32-bit words.
    element_size: u32,
    /// The number of pages that need to be updated.
    ///
    /// This is refreshed each time the staging buffers are written.
    updated_page_count: u32,
    /// The base-2 logarithm of the page size.
    ///
    /// That is, the page size is `1 << page_size_log2`.
    page_size_log2: u32,
}
195
196
/// A system, part of the render graph, that performs sparse buffer updates to
197
/// buffers for which only a small number of elements have changed.
198
///
199
/// This runs as early in the pipeline as possible so that sparse buffers can be
200
/// used for any subsequent pass.
201
fn update_sparse_buffers(
202
sparse_buffer_update_jobs: Res<SparseBufferUpdateJobs>,
203
sparse_buffer_update_bind_groups: Res<SparseBufferUpdateBindGroups>,
204
pipeline_cache: Res<PipelineCache>,
205
mut diagnostics: Option<ResMut<DiagnosticsRecorder>>,
206
render_device: Res<RenderDevice>,
207
render_queue: Res<RenderQueue>,
208
) {
209
// Bail if we have nothing to do.
210
if sparse_buffer_update_jobs.is_empty() {
211
return;
212
}
213
214
// We need to create a command encoder since this pass isn't associated with
215
// a view.
216
let mut command_encoder =
217
render_device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
218
label: Some("sparse buffer update"),
219
});
220
221
let time_span = diagnostics
222
.as_mut()
223
.map(|diagnostics| diagnostics.time_span(&mut command_encoder, "sparse buffer update"));
224
225
command_encoder.push_debug_group("sparse buffer update");
226
227
let Some(compute_pipeline) =
228
pipeline_cache.get_compute_pipeline(sparse_buffer_update_bind_groups.pipeline_id)
229
else {
230
return;
231
};
232
233
// Process each sparse buffer update job.
234
for sparse_buffer_update_job in sparse_buffer_update_jobs.iter() {
235
let Some(sparse_buffer_update_bind_group) = sparse_buffer_update_bind_groups
236
.bind_groups
237
.get(&sparse_buffer_update_job.sparse_buffer_handle)
238
else {
239
continue;
240
};
241
242
let mut sparse_buffer_update_pass =
243
command_encoder.begin_compute_pass(&ComputePassDescriptor {
244
label: Some(&*format!(
245
"sparse buffer update ({})",
246
&sparse_buffer_update_job.label
247
)),
248
timestamp_writes: None,
249
});
250
sparse_buffer_update_pass.set_pipeline(compute_pipeline);
251
sparse_buffer_update_pass.set_bind_group(
252
0,
253
&sparse_buffer_update_bind_group.bind_group,
254
&[],
255
);
256
sparse_buffer_update_pass.dispatch_workgroups(
257
sparse_buffer_update_job.workgroup_count(),
258
1,
259
1,
260
);
261
}
262
263
command_encoder.pop_debug_group();
264
if let Some(time_span) = time_span {
265
time_span.end(&mut command_encoder);
266
}
267
268
render_queue.submit([command_encoder.finish()]);
269
}
270
271
/// A system that clears out the sparse buffer update jobs in preparation for a
272
/// new frame.
273
fn clear_sparse_buffer_jobs(mut sparse_buffer_update_jobs: ResMut<SparseBufferUpdateJobs>) {
274
sparse_buffer_update_jobs.clear();
275
}
276
277
impl FromWorld for SparseBufferUpdatePipelines {
278
fn from_world(world: &mut World) -> Self {
279
let render_device = world.resource::<RenderDevice>();
280
let limit = render_device.limits().max_storage_buffers_per_shader_stage;
281
282
if limit < 3 {
283
info!(
284
"Sparse buffer updates disabled. RenderDevice lacks support: max_storage_buffers_per_shader_stage ({}) < 3.",
285
limit
286
);
287
288
return SparseBufferUpdatePipelines {
289
bind_group_layout: None,
290
shader: None,
291
};
292
}
293
294
let bind_group_layout = BindGroupLayoutDescriptor::new(
295
"sparse buffer update bind group layout",
296
&BindGroupLayoutEntries::sequential(
297
ShaderStages::COMPUTE,
298
(
299
// @group(0) @binding(0) var<storage, read_write> dest_buffer: array<u32>;
300
storage_buffer::<u32>(false),
301
// @group(0) @binding(1) var<storage> src_buffer: array<u32>;
302
storage_buffer_read_only::<u32>(false),
303
// @group(0) @binding(2) var<storage> indices: array<u32>;
304
storage_buffer_read_only::<u32>(false),
305
// @group(0) @binding(3) var<uniform> metadata:
306
// SparseBufferUpdateMetadata;
307
uniform_buffer::<GpuSparseBufferUpdateMetadata>(false),
308
),
309
),
310
);
311
312
SparseBufferUpdatePipelines {
313
bind_group_layout: Some(bind_group_layout),
314
shader: Some(load_embedded_asset!(world, "sparse_buffer_update.wgsl")),
315
}
316
}
317
}
318
319
impl SpecializedComputePipeline for SparseBufferUpdatePipelines {
320
type Key = ();
321
322
fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor {
323
ComputePipelineDescriptor {
324
label: Some("sparse buffer update pipeline".into()),
325
layout: self.bind_group_layout.clone().into_iter().collect(),
326
shader: self.shader.clone().unwrap_or_default(),
327
shader_defs: vec![],
328
..ComputePipelineDescriptor::default()
329
}
330
}
331
}
332
333
/// The buffers that we use to sparsely scatter new data to the GPU.
///
/// There's one such set of buffers per sparse buffer vector.
struct SparseBufferStagingBuffers {
    /// All pages that have changed and need to be updated.
    ///
    /// The data is stored as 32-bit words, one whole page after another.
    source_data: RawBufferVec<u32>,

    /// The index at which we write each page in [`Self::source_data`].
    ///
    /// The length of this buffer is equal to the length of
    /// [`Self::source_data`] divided by the number of words per page, i.e.
    /// [`Self::element_word_size`] times 2^[`Self::page_size_log2`].
    indices: RawBufferVec<u32>,

    /// The size of each element in 32-bit words.
    element_word_size: u32,

    /// The base-2 logarithm of the page size in elements.
    ///
    /// That is, the page size in elements is `1 << page_size_log2`.
    page_size_log2: u32,
}
354
355
impl SparseBufferStagingBuffers {
356
/// The number of elements per page.
357
fn page_size(&self) -> usize {
358
1 << self.page_size_log2
359
}
360
361
/// Creates a new set of staging buffers for a sparse buffer vector.
362
fn new(label: &str, element_word_size: u32, page_size_log2: u32) -> SparseBufferStagingBuffers {
363
let mut source_data_buffer =
364
RawBufferVec::new(BufferUsages::COPY_DST | BufferUsages::STORAGE);
365
source_data_buffer.set_label(Some(&*format!("{} staging buffer", label)));
366
367
let mut indices_buffer = RawBufferVec::new(BufferUsages::COPY_DST | BufferUsages::STORAGE);
368
indices_buffer.set_label(Some(&*format!("{} index buffer", label)));
369
370
SparseBufferStagingBuffers {
371
source_data: source_data_buffer,
372
indices: indices_buffer,
373
element_word_size,
374
page_size_log2,
375
}
376
}
377
378
/// Returns the number of updated pages.
379
fn updated_page_count(&self) -> u32 {
380
// Note that we don't have to round up here because data is always
381
// uploaded in increments of a whole page.
382
let element_count = self.source_data.len() / self.element_word_size as usize;
383
(element_count / self.page_size()) as u32
384
}
385
386
/// Writes the buffers that contain all the data necessary to perform a
387
/// sparse upload to the GPU.
388
///
389
/// This includes the buffer associated with the supplied
390
/// `metadata_uniform`.
391
fn write_buffers(
392
&mut self,
393
metadata_uniform: &mut UniformBuffer<GpuSparseBufferUpdateMetadata>,
394
render_device: &RenderDevice,
395
render_queue: &RenderQueue,
396
) {
397
metadata_uniform.get_mut().updated_page_count = self.updated_page_count();
398
metadata_uniform.write_buffer(render_device, render_queue);
399
400
self.source_data.write_buffer(render_device, render_queue);
401
self.indices.write_buffer(render_device, render_queue);
402
}
403
404
/// Returns true if a sparse buffer update should *not* be performed because
405
/// too many words changed.
406
fn should_perform_full_reupload(&self, changed_page_count: u32, buffer_length: usize) -> bool {
407
// Calculate the number of changed words. If it's greater than the
408
// maximum number of workgroups as defined by `wgpu`, we must perform a
409
// full reupload.
410
let total_changed_word_count =
411
changed_page_count * self.page_size() as u32 * self.element_word_size;
412
if total_changed_word_count > MAX_WORKGROUPS * SPARSE_BUFFER_UPDATE_WORKGROUP_SIZE {
413
return true;
414
}
415
416
// Don't perform a sparse upload if too many words changed, as it'll end
417
// up being slower than just uploading the whole buffer afresh.
418
let sparse_upload_fraction =
419
changed_page_count as f64 / buffer_length.div_ceil(self.page_size()) as f64;
420
sparse_upload_fraction > SPARSE_UPLOAD_THRESHOLD
421
}
422
}
423
424
/// A GPU buffer that can grow, can be updated atomically from multiple threads
/// on the CPU, and is sparsely updated on the GPU if only a small number of
/// elements have changed.
///
/// This type is similar to
/// [`crate::render_resource::buffer_vec::AtomicRawBufferVec`], but instead of
/// reuploading the entire buffer to the GPU when it's changed, it tracks
/// changes on a per-page level and uploads only the pages that changed if the
/// number of such pages is small. It uses a compute shader to scatter the
/// changed pages.
///
/// As the stored data is [`AtomicPod`], multiple threads may update the buffer
/// simultaneously. Note that, like
/// [`crate::render_resource::buffer_vec::AtomicRawBufferVec`], only existing
/// elements may be updated from multiple threads; new data still requires
/// exclusive access.
///
/// `T` must have a size that's a multiple of 4.
pub struct AtomicSparseBufferVec<T>
where
    T: AtomicPod,
{
    /// An ID that uniquely identifies this [`AtomicSparseBufferVec`].
    handle: SparseBufferHandle,
    /// The underlying values.
    ///
    /// These are stored as their blob representation to allow for thread-safe
    /// update.
    values: Vec<T::Blob>,
    /// The GPU buffer, if allocated.
    data_buffer: Option<Buffer>,
    /// The GPU buffers that data is copied into in preparation to be scattered
    /// to the [`Self::data_buffer`].
    staging_buffers: SparseBufferStagingBuffers,
    /// A GPU buffer that stores information such as the element size, the
    /// page size, and the number of updated pages that's needed to perform
    /// sparse updates.
    metadata_uniform: UniformBuffer<GpuSparseBufferUpdateMetadata>,
    /// The capacity of the GPU buffer in elements.
    capacity: usize,
    /// The allowed `wgpu` buffer usages for the GPU buffer.
    buffer_usages: BufferUsages,
    /// A debug label to identify this buffer.
    label: Arc<str>,
    /// A bit set of dirty pages.
    ///
    /// Each word holds the bits of 64 consecutive pages, so the number of
    /// words is the number of pages (the number of elements divided by the
    /// page size, rounded up) divided by 64, rounded up. A 1 in a bit
    /// indicates that the page has changed since the last upload, while a 0
    /// indicates that the page hasn't changed.
    dirty_pages: Vec<AtomicU64>,
    /// True if the entire buffer needs to be reuploaded because it resized.
    needs_full_reupload: bool,
    /// True if a sparse update is to be performed.
    sparse_update_scheduled: bool,
}
478
479
impl<T> AtomicSparseBufferVec<T>
480
where
481
T: AtomicPod,
482
{
483
/// The number of elements per page.
484
fn page_size(&self) -> u32 {
485
1 << self.staging_buffers.page_size_log2
486
}
487
488
/// Creates a new [`AtomicSparseBufferVec`] with the given set of buffer
489
/// usages, page size, and label.
490
///
491
/// `buffer_usages` specifies the set of allowed `wgpu` buffer usages for
492
/// the buffer that [`AtomicSparseBufferVec`] manages.
493
/// `BufferUsages::COPY_DST` is automatically added to this set.
494
///
495
/// The `page_size_log2` parameter is the base-2 logarithm of the page size.
496
/// That is, the page size is `1 << page_size_log2`.
497
pub fn new(buffer_usages: BufferUsages, page_size_log2: u32, label: Arc<str>) -> Self {
498
// Make sure the value is word-aligned.
499
debug_assert_eq!(size_of::<T>() % 4, 0);
500
let element_word_size = size_of::<T>() / 4;
501
502
// Create a unique ID.
503
let id = Arc::new(SparseBufferId(
504
NEXT_SPARSE_BUFFER_ID.fetch_add(1, Ordering::Relaxed),
505
));
506
507
Self {
508
handle: id,
509
values: vec![],
510
data_buffer: None,
511
staging_buffers: SparseBufferStagingBuffers::new(
512
&label,
513
element_word_size as u32,
514
page_size_log2,
515
),
516
metadata_uniform: UniformBuffer::from(GpuSparseBufferUpdateMetadata::new::<T>(
517
page_size_log2,
518
)),
519
capacity: 0,
520
buffer_usages: buffer_usages | BufferUsages::COPY_DST,
521
label,
522
dirty_pages: vec![],
523
needs_full_reupload: false,
524
sparse_update_scheduled: false,
525
}
526
}
527
528
/// Returns the number of elements in the CPU side copy of the buffer.
529
pub fn len(&self) -> u32 {
530
self.values.len() as u32
531
}
532
533
/// Returns true if there are no elements in the CPU side copy of the buffer.
534
pub fn is_empty(&self) -> bool {
535
self.values.is_empty()
536
}
537
538
/// Returns a handle to the buffer, if the data has been uploaded.
539
pub fn buffer(&self) -> Option<&Buffer> {
540
self.data_buffer.as_ref()
541
}
542
543
/// Removes all elements from the buffer.
544
pub fn clear(&mut self) {
545
self.truncate(0);
546
}
547
548
/// Copies a value out of the buffer.
549
pub fn get(&self, index: u32) -> T {
550
T::read_from_blob(&self.values[index as usize])
551
}
552
553
/// Sets the value at the given index.
554
///
555
/// If the index isn't in range of the buffer, this method panics.
556
///
557
/// Internally, the value is converted to its blob representation.
558
///
559
/// Note that this method is thread-safe and doesn't require `&mut self`.
560
/// It's your responsibility, however, to ensure synchronization; though
561
/// this method is memory-safe, it's possible for other threads to observe
562
/// partially-overwritten values if [`Self::get`] or similar methods are
563
/// called while the write operation is occurring.
564
pub fn set(&self, index: u32, value: T) {
565
value.write_to_blob(&self.values[index as usize]);
566
self.note_changed_index(index);
567
}
568
569
/// Adds a new value and returns its index.
570
pub fn push(&mut self, value: T) -> u32 {
571
let index = self.values.len() as u32;
572
self.values.push(T::Blob::default());
573
value.write_to_blob(&self.values[index as usize]);
574
575
let page_word = (self.index_to_page(index) / PAGES_PER_DIRTY_WORD) as usize;
576
while self.dirty_pages.len() < page_word + 1 {
577
self.dirty_pages.push(AtomicU64::default());
578
}
579
self.note_changed_index(index);
580
581
index
582
}
583
584
/// Marks the page corresponding to the given element index as dirty so that
585
/// we know that we need to upload it.
586
fn note_changed_index(&self, index: u32) {
587
let page = self.index_to_page(index);
588
let (page_word, page_in_word) = (page / PAGES_PER_DIRTY_WORD, page % PAGES_PER_DIRTY_WORD);
589
self.dirty_pages[page_word as usize].fetch_or(1 << page_in_word, Ordering::Relaxed);
590
}
591
592
/// Returns the page corresponding to the given element index.
593
fn index_to_page(&self, index: u32) -> u32 {
594
index / self.page_size()
595
}
596
597
/// Ensures that the backing buffer for this buffer vector is present and
598
/// appropriately sized on the GPU.
599
pub fn reserve(&mut self, new_capacity: usize, render_device: &RenderDevice) {
600
reserve(
601
new_capacity,
602
&mut self.capacity,
603
&self.label,
604
&mut self.data_buffer,
605
self.buffer_usages,
606
&mut self.needs_full_reupload,
607
size_of::<T::Blob>(),
608
render_device,
609
);
610
}
611
612
/// Grows the buffer by adding default values so that it's at least the
613
/// given size.
614
///
615
/// If the buffer is already large enough, this method does nothing.
616
pub fn grow(&mut self, new_len: u32) {
617
let old_len = self.values.len() as u32;
618
if old_len >= new_len {
619
return;
620
}
621
622
self.values.reserve(new_len as usize - old_len as usize);
623
self.values.resize_with(new_len as usize, T::Blob::default);
624
625
// This is a bit tricky. We want to set the dirty bits corresponding to
626
// all pages that we added, if any. First, we compute the index of the
627
// last page word before the append operation.
628
let old_final_page = self.index_to_page(old_len);
629
let old_final_page_word_index = old_final_page / PAGES_PER_DIRTY_WORD;
630
let old_final_page_in_word = old_final_page % PAGES_PER_DIRTY_WORD;
631
632
// Next, we set the bits corresponding to every page that we added to
633
// that final page word. Note that this might set bits corresponding to
634
// pages past the end of our buffer; that's OK as we ignore them.
635
if old_final_page_in_word != 0
636
&& let Some(ref mut old_final_atomic_page_word) =
637
self.dirty_pages.get_mut(old_final_page_word_index as usize)
638
{
639
*old_final_atomic_page_word.get_mut() |= !((1u64 << old_final_page_in_word) - 1);
640
}
641
642
// Finally, we add any new page words, with all bits set.
643
let new_page_count = self.index_to_page(new_len);
644
self.dirty_pages.resize_with(
645
(new_page_count as usize).div_ceil(PAGES_PER_DIRTY_WORD as usize),
646
|| AtomicU64::new(u64::MAX),
647
);
648
}
649
650
/// Truncates the buffer to the given length.
651
///
652
/// If the buffer is already that length or shorter, this method does
653
/// nothing.
654
pub fn truncate(&mut self, len: u32) {
655
self.values.truncate(len as usize);
656
657
let page = self.index_to_page(len);
658
self.dirty_pages
659
.truncate(page.div_ceil(PAGES_PER_DIRTY_WORD) as usize);
660
}
661
662
/// Writes the data to the GPU, either via a sparse upload or a bulk data
663
/// upload.
664
pub fn write_buffers(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
665
if self.values.is_empty() {
666
return;
667
}
668
669
// Round up the size to a good value to balance reallocation frequency
670
// against memory waste.
671
let good_size = calculate_allocation_size(self.values.len());
672
self.reserve(good_size, render_device);
673
674
if self.should_perform_full_reupload(render_device) {
675
self.write_entire_buffer(render_queue);
676
} else {
677
self.prepare_sparse_upload(render_device, render_queue);
678
}
679
}
680
681
/// Returns true if the sparse buffer should perform a full reupload, either
682
/// because it was resized or because too much data changed for a sparse
683
/// update to be worthwhile.
684
fn should_perform_full_reupload(&self, render_device: &RenderDevice) -> bool {
685
if self.needs_full_reupload {
686
return true;
687
}
688
689
if render_device.limits().max_storage_buffers_per_shader_stage < 3 {
690
return true;
691
}
692
693
// Calculate the number of changed pages via population count.
694
let changed_page_count: u32 = self
695
.dirty_pages
696
.iter()
697
.map(|atomic_page_word| atomic_page_word.load(Ordering::Relaxed).count_ones())
698
.sum();
699
700
self.staging_buffers
701
.should_perform_full_reupload(changed_page_count, self.values.len())
702
}
703
704
/// Writes the entire buffer in bulk.
705
///
706
/// This is the method used when a sparse update is not used, either because
707
/// the buffer resized or because too much data changed for a sparse update
708
/// to be worthwhile.
709
fn write_entire_buffer(&mut self, render_queue: &RenderQueue) {
710
let Some(ref mut data_buffer) = self.data_buffer else {
711
error!("Dirty sparse buffer should have created a data buffer by now");
712
return;
713
};
714
715
// SAFETY: We're just writing atomic data to the GPU. The worst that
716
// can happen is that we race with somebody, which is unfortunate
717
// but not memory-unsafe.
718
unsafe {
719
render_queue.write_buffer(
720
data_buffer,
721
0,
722
slice::from_raw_parts(
723
self.values.as_ptr().cast::<u8>(),
724
self.values.len() * size_of::<T::Blob>(),
725
),
726
);
727
}
728
729
// Mark all pages as clean.
730
for atomic_page_word in self.dirty_pages.iter() {
731
atomic_page_word.store(0, Ordering::Relaxed);
732
}
733
self.sparse_update_scheduled = false;
734
}
735
736
/// Schedules a sparse upload of only the pages that changed.
737
fn prepare_sparse_upload(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
738
// Iterate over all dirty pages.
739
for (page_word_index, atomic_page_word) in self.dirty_pages.iter().enumerate() {
740
let page_word = atomic_page_word.load(Ordering::Relaxed);
741
for page_index_in_word in BitIter::new(page_word) {
742
let page = page_word_index as u32 * PAGES_PER_DIRTY_WORD + page_index_in_word;
743
744
// Write the index of the page so the shader will know where to
745
// scatter the data to.
746
self.staging_buffers.indices.push(page);
747
748
// Copy the page to the GPU staging buffer.
749
let page_size = self.staging_buffers.page_size();
750
let page_start = page as usize * page_size;
751
let page_end = page_start + page_size;
752
for value_index in page_start..page_end {
753
match self.values.get(value_index) {
754
Some(blob) => {
755
let value = T::read_from_blob(blob);
756
self.staging_buffers
757
.source_data
758
.extend(bytemuck::cast_slice(&[value]).iter().copied());
759
}
760
None => {
761
self.staging_buffers.source_data.extend(iter::repeat_n(
762
0,
763
self.staging_buffers.element_word_size as usize,
764
));
765
}
766
}
767
}
768
769
// Make sure we're aligned up to a full page.
770
debug_assert_eq!(
771
self.staging_buffers.source_data.len()
772
% (self.staging_buffers.element_word_size as usize
773
* self.staging_buffers.page_size()),
774
0
775
);
776
}
777
778
// Mark the page as clean.
779
atomic_page_word.store(0, Ordering::Relaxed);
780
}
781
782
// Schedule a sparse update if there was something to do.
783
self.sparse_update_scheduled = !self.staging_buffers.source_data.is_empty();
784
if self.sparse_update_scheduled {
785
self.staging_buffers.write_buffers(
786
&mut self.metadata_uniform,
787
render_device,
788
render_queue,
789
);
790
}
791
}
792
793
/// If a sparse update has been scheduled, prepares all GPU resources
794
/// necessary to perform a sparse buffer update, other than updating the
795
/// metadata uniform.
796
pub fn prepare_to_populate_buffers(
797
&mut self,
798
render_device: &RenderDevice,
799
pipeline_cache: &PipelineCache,
800
sparse_buffer_update_jobs: &mut SparseBufferUpdateJobs,
801
sparse_buffer_update_bind_groups: &mut SparseBufferUpdateBindGroups,
802
sparse_buffer_update_pipelines: &SparseBufferUpdatePipelines,
803
) {
804
if self.sparse_update_scheduled {
805
match (&self.data_buffer, self.metadata_uniform.buffer()) {
806
(Some(data_buffer), Some(metadata_buffer)) => {
807
prepare_to_populate_buffers(
808
self.handle.clone(),
809
&self.label,
810
data_buffer,
811
&mut self.staging_buffers,
812
metadata_buffer,
813
render_device,
814
pipeline_cache,
815
sparse_buffer_update_jobs,
816
sparse_buffer_update_bind_groups,
817
sparse_buffer_update_pipelines,
818
);
819
}
820
_ => {
821
error!("Buffers should have been created by now");
822
}
823
}
824
}
825
826
// Clear out the staging buffers, now that we know the data is already
827
// on the GPU.
828
self.staging_buffers.source_data.clear();
829
self.staging_buffers.indices.clear();
830
831
// Reset the `needs_full_reupload` and `needs_sparse_update` flags.
832
self.needs_full_reupload = false;
833
self.sparse_update_scheduled = false;
834
}
835
}
836
837
impl FromWorld for SparseBufferUpdateBindGroups {
838
fn from_world(world: &mut World) -> Self {
839
world.resource_scope::<SpecializedComputePipelines<SparseBufferUpdatePipelines>, _>(
840
|world, mut specialized_sparse_buffer_update_pipelines| {
841
let pipeline_cache = world.resource::<PipelineCache>();
842
let sparse_buffer_update_pipelines =
843
world.resource::<SparseBufferUpdatePipelines>();
844
let pipeline_id = specialized_sparse_buffer_update_pipelines.specialize(
845
pipeline_cache,
846
sparse_buffer_update_pipelines,
847
(),
848
);
849
850
SparseBufferUpdateBindGroups {
851
bind_groups: WeakKeyHashMap::default(),
852
pipeline_id,
853
}
854
},
855
)
856
}
857
}
858
859
/// Prepares all GPU resources necessary to perform a sparse buffer update,
860
/// other than updating the metadata uniform.
861
///
862
/// This function creates the [`SparseBufferUpdateJob`] and ensures the bind
863
/// group and pipeline are up to date.
864
fn prepare_to_populate_buffers(
865
sparse_buffer_handle: SparseBufferHandle,
866
label: &Arc<str>,
867
data_buffer: &Buffer,
868
staging_buffers: &mut SparseBufferStagingBuffers,
869
metadata_buffer: &Buffer,
870
render_device: &RenderDevice,
871
pipeline_cache: &PipelineCache,
872
sparse_buffer_update_jobs: &mut SparseBufferUpdateJobs,
873
sparse_buffer_update_bind_groups: &mut SparseBufferUpdateBindGroups,
874
sparse_buffer_update_pipelines: &SparseBufferUpdatePipelines,
875
) {
876
let (Some(source_data_staging_buffer), Some(indices_staging_buffer)) = (
877
staging_buffers.source_data.buffer(),
878
staging_buffers.indices.buffer(),
879
) else {
880
error!("Staging buffers should have been created by now");
881
return;
882
};
883
884
let Some(bind_group_layout) = &sparse_buffer_update_pipelines.bind_group_layout else {
885
return;
886
};
887
888
// Record the update job.
889
sparse_buffer_update_jobs.push(SparseBufferUpdateJob {
890
sparse_buffer_handle: sparse_buffer_handle.clone(),
891
page_size_log2: staging_buffers.page_size_log2,
892
updated_page_count: staging_buffers.updated_page_count(),
893
element_word_size: staging_buffers.element_word_size,
894
label: (*label).clone(),
895
});
896
897
// Create the bind group.
898
let bind_group = render_device.create_bind_group(
899
Some(&*format!("{} bind group", label)),
900
&pipeline_cache.get_bind_group_layout(bind_group_layout),
901
&BindGroupEntries::sequential((
902
// @group(0) @binding(0) var<storage, read_write> dest_buffer: array<u32>;
903
data_buffer.as_entire_binding(),
904
// @group(0) @binding(1) var<storage> src_buffer: array<u32>;
905
source_data_staging_buffer.as_entire_binding(),
906
// @group(0) @binding(2) var<storage> indices: array<u32>;
907
indices_staging_buffer.as_entire_binding(),
908
// @group(0) @binding(3) var<uniform> metadata:
909
// SparseBufferUpdateMetadata;
910
metadata_buffer.as_entire_binding(),
911
)),
912
);
913
sparse_buffer_update_bind_groups.bind_groups.insert(
914
sparse_buffer_handle,
915
SparseBufferUpdateBindGroup { bind_group },
916
);
917
}
918
919
/// Ensures that the backing buffer for an [`AtomicSparseBufferVec`] is present
920
/// on the GPU.
921
///
922
/// The `capacity`, `data_buffer`, and `needs_full_reupload` fields are updated
923
/// to reflect the new buffer.
924
fn reserve(
925
new_capacity: usize,
926
capacity: &mut usize,
927
label: &str,
928
data_buffer: &mut Option<Buffer>,
929
buffer_usages: BufferUsages,
930
needs_full_reupload: &mut bool,
931
element_size: usize,
932
render_device: &RenderDevice,
933
) {
934
// If the buffer is already big enough, do nothing.
935
if new_capacity == 0 || new_capacity <= *capacity {
936
return;
937
}
938
939
*capacity = new_capacity;
940
*data_buffer = Some(render_device.create_buffer(&BufferDescriptor {
941
label: Some(label),
942
size: element_size as u64 * new_capacity as u64,
943
usage: buffer_usages,
944
mapped_at_creation: false,
945
}));
946
947
// Since we resized the buffer, we need to reupload it.
948
*needs_full_reupload = true;
949
}
950
951
impl GpuSparseBufferUpdateMetadata {
952
/// Returns a new [`GpuSparseBufferUpdateMetadata`] for the given type and
953
/// page size.
954
fn new<T>(page_size_log2: u32) -> GpuSparseBufferUpdateMetadata {
955
assert_eq!(size_of::<T>() % 4, 0);
956
GpuSparseBufferUpdateMetadata {
957
element_size: (size_of::<T>() / 4) as u32,
958
updated_page_count: 0,
959
page_size_log2,
960
}
961
}
962
}
963
964
/// An iterator over the positions of the set bits of a `u64`, yielded from the
/// least significant bit to the most significant bit.
struct BitIter(u64);

impl BitIter {
    /// Creates an iterator over the set bits of `bits`.
    fn new(bits: u64) -> BitIter {
        Self(bits)
    }
}

impl Iterator for BitIter {
    type Item = u32;

    /// Yields the position of the lowest remaining set bit, or `None` once all
    /// bits have been consumed.
    fn next(&mut self) -> Option<Self::Item> {
        if self.0 == 0 {
            return None;
        }

        let lowest_set_bit = self.0.trailing_zeros();
        // Subtracting one flips the lowest set bit and everything below it, so
        // the AND clears exactly that bit.
        self.0 &= self.0 - 1;
        Some(lowest_set_bit)
    }
}
986
987
/// Calculates the size that a buffer should be in order to balance reallocation
988
/// frequency against memory waste.
989
fn calculate_allocation_size(length: usize) -> usize {
990
let exponent = (length as f64).log(REALLOCATION_FACTOR).ceil();
991
let size = REALLOCATION_FACTOR.powf(exponent) as usize;
992
size.next_multiple_of(REALLOCATION_SIZE_MULTIPLE)
993
}
994
995