// bevyengine/bevy: crates/bevy_render/src/batching/gpu_preprocessing.rs
//! Batching functionality when GPU preprocessing is in use.

use core::{any::TypeId, marker::PhantomData, mem};

use bevy_app::{App, Plugin};
use bevy_derive::{Deref, DerefMut};
use bevy_ecs::{
    prelude::Entity,
    query::{Has, With},
    resource::Resource,
    schedule::IntoScheduleConfigs as _,
    system::{Query, Res, ResMut, StaticSystemParam},
    world::{FromWorld, World},
};
use bevy_encase_derive::ShaderType;
use bevy_math::UVec4;
use bevy_platform::collections::{hash_map::Entry, HashMap, HashSet};
use bevy_utils::{default, TypeIdMap};
use bytemuck::{Pod, Zeroable};
use encase::{internal::WriteInto, ShaderSize};
use indexmap::IndexMap;
use nonmax::NonMaxU32;
use tracing::{error, info};
use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};

use crate::{
    experimental::occlusion_culling::OcclusionCulling,
    render_phase::{
        BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet,
        BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItem,
        PhaseItemBatchSetKey as _, PhaseItemExtraIndex, RenderBin, SortedPhaseItem,
        SortedRenderPhase, UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases,
        ViewSortedRenderPhases,
    },
    render_resource::{Buffer, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
    renderer::{RenderAdapter, RenderAdapterInfo, RenderDevice, RenderQueue, WgpuWrapper},
    sync_world::MainEntity,
    view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
    Render, RenderApp, RenderDebugFlags, RenderSystems,
};

use super::{BatchMeta, GetBatchData, GetFullBatchData};

#[derive(Default)]
pub struct BatchingPlugin {
    /// Debugging flags that can optionally be set when constructing the renderer.
    pub debug_flags: RenderDebugFlags,
}

impl Plugin for BatchingPlugin {
    fn build(&self, app: &mut App) {
        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
            return;
        };

        render_app
            .insert_resource(IndirectParametersBuffers::new(
                self.debug_flags
                    .contains(RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS),
            ))
            .add_systems(
                Render,
                write_indirect_parameters_buffers.in_set(RenderSystems::PrepareResourcesFlush),
            )
            .add_systems(
                Render,
                clear_indirect_parameters_buffers.in_set(RenderSystems::ManageViews),
            );
    }

    fn finish(&self, app: &mut App) {
        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
            return;
        };

        render_app.init_resource::<GpuPreprocessingSupport>();
    }
}

/// Records whether GPU preprocessing and/or GPU culling are supported on the
/// device.
///
/// No GPU preprocessing is supported on WebGL because of the lack of compute
/// shader support. GPU preprocessing is supported on DirectX 12, but due to [a
/// `wgpu` limitation] GPU culling is not.
///
/// [a `wgpu` limitation]: https://github.com/gfx-rs/wgpu/issues/2471
#[derive(Clone, Copy, PartialEq, Resource)]
pub struct GpuPreprocessingSupport {
    /// The maximum amount of GPU preprocessing available on this platform.
    pub max_supported_mode: GpuPreprocessingMode,
}

impl GpuPreprocessingSupport {
    /// Returns true if this GPU preprocessing support level isn't `None`.
    #[inline]
    pub fn is_available(&self) -> bool {
        self.max_supported_mode != GpuPreprocessingMode::None
    }

    /// Returns the given GPU preprocessing mode, capped to the maximum mode
    /// that this platform supports.
    pub fn min(&self, mode: GpuPreprocessingMode) -> GpuPreprocessingMode {
        match (self.max_supported_mode, mode) {
            (GpuPreprocessingMode::None, _) | (_, GpuPreprocessingMode::None) => {
                GpuPreprocessingMode::None
            }
            (mode, GpuPreprocessingMode::Culling) | (GpuPreprocessingMode::Culling, mode) => mode,
            (GpuPreprocessingMode::PreprocessingOnly, GpuPreprocessingMode::PreprocessingOnly) => {
                GpuPreprocessingMode::PreprocessingOnly
            }
        }
    }

    /// Returns true if GPU culling is supported on this platform.
    pub fn is_culling_supported(&self) -> bool {
        self.max_supported_mode == GpuPreprocessingMode::Culling
    }
}
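
// A minimal sketch (hypothetical helper, not part of this module's API)
// showing how `min` caps a requested preprocessing mode to what the device
// supports.
fn _gpu_preprocessing_mode_for_view(
    support: &GpuPreprocessingSupport,
    wants_culling: bool,
) -> GpuPreprocessingMode {
    let requested = if wants_culling {
        GpuPreprocessingMode::Culling
    } else {
        GpuPreprocessingMode::PreprocessingOnly
    };
    // A `Culling` request on a device that only supports `PreprocessingOnly`
    // is capped down; `None` on either side always wins.
    support.min(requested)
}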

/// The amount of GPU preprocessing (compute and indirect draw) that we do.
#[derive(Clone, Copy, PartialEq)]
pub enum GpuPreprocessingMode {
    /// No GPU preprocessing is in use at all.
    ///
    /// This is used when GPU compute isn't available.
    None,

    /// GPU preprocessing is in use, but GPU culling isn't.
    ///
    /// This is used when the [`NoIndirectDrawing`] component is present on the
    /// camera.
    PreprocessingOnly,

    /// Both GPU preprocessing and GPU culling are in use.
    ///
    /// This is used by default.
    Culling,
}

/// The GPU buffers holding the data needed to render batches.
///
/// For example, in the 3D PBR pipeline this holds the `MeshUniform`s, which
/// serve as the `BD` type parameter in that pipeline.
///
/// We have a separate *buffer data input* type (`BDI`) here, which a compute
/// shader is expected to expand to the full buffer data (`BD`) type. GPU
/// uniform building is generally faster and uses less system-RAM-to-VRAM bus
/// bandwidth, but it's only implemented for some pipelines (for example, not
/// in the 2D pipeline at present) and only when compute shaders are available.
#[derive(Resource)]
pub struct BatchedInstanceBuffers<BD, BDI>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod + Default,
{
    /// The uniform data inputs for the current frame.
    ///
    /// These are uploaded during the extraction phase.
    pub current_input_buffer: InstanceInputUniformBuffer<BDI>,

    /// The uniform data inputs for the previous frame.
    ///
    /// The indices don't generally line up between `current_input_buffer`
    /// and `previous_input_buffer`, because, among other reasons, entities
    /// can spawn or despawn between frames. Instead, each current buffer
    /// data input uniform is expected to contain the index of the
    /// corresponding buffer data input uniform in this list.
    pub previous_input_buffer: InstanceInputUniformBuffer<BDI>,

    /// The data needed to render batches for each phase.
    ///
    /// The keys of this map are the type IDs of each phase: e.g. `Opaque3d`,
    /// `AlphaMask3d`, etc.
    pub phase_instance_buffers: TypeIdMap<UntypedPhaseBatchedInstanceBuffers<BD>>,
}

impl<BD, BDI> Default for BatchedInstanceBuffers<BD, BDI>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod + Sync + Send + Default + 'static,
{
    fn default() -> Self {
        BatchedInstanceBuffers {
            current_input_buffer: InstanceInputUniformBuffer::new(),
            previous_input_buffer: InstanceInputUniformBuffer::new(),
            phase_instance_buffers: HashMap::default(),
        }
    }
}

/// The GPU buffers holding the data needed to render batches for a single
/// phase.
///
/// This is the version of the structure that has a type parameter, which
/// enables Bevy's scheduler to run the batching operations for the different
/// phases in parallel.
///
/// See the documentation for [`BatchedInstanceBuffers`] for more information.
#[derive(Resource)]
pub struct PhaseBatchedInstanceBuffers<PI, BD>
where
    PI: PhaseItem,
    BD: GpuArrayBufferable + Sync + Send + 'static,
{
    /// The buffers for this phase.
    pub buffers: UntypedPhaseBatchedInstanceBuffers<BD>,
    phantom: PhantomData<PI>,
}

impl<PI, BD> Default for PhaseBatchedInstanceBuffers<PI, BD>
where
    PI: PhaseItem,
    BD: GpuArrayBufferable + Sync + Send + 'static,
{
    fn default() -> Self {
        PhaseBatchedInstanceBuffers {
            buffers: UntypedPhaseBatchedInstanceBuffers::default(),
            phantom: PhantomData,
        }
    }
}

/// The GPU buffers holding the data needed to render batches for a single
/// phase, without a type parameter for that phase.
///
/// Since this structure doesn't have a type parameter, it can be placed in
/// [`BatchedInstanceBuffers::phase_instance_buffers`].
pub struct UntypedPhaseBatchedInstanceBuffers<BD>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
{
    /// A storage area for the buffer data that the GPU compute shader is
    /// expected to write to.
    ///
    /// There will be one entry for each index.
    pub data_buffer: UninitBufferVec<BD>,

    /// The index of the buffer data in the current input buffer that
    /// corresponds to each instance.
    ///
    /// This is keyed off each view. Each view has a separate buffer.
    pub work_item_buffers: HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,

    /// A buffer that holds the number of indexed meshes that weren't visible
    /// in the previous frame, when GPU occlusion culling is in use.
    ///
    /// There's one set of [`LatePreprocessWorkItemIndirectParameters`] per
    /// view. Bevy uses this value to determine how many threads to dispatch to
    /// check meshes that weren't visible last frame to see if they became
    /// newly visible this frame.
    pub late_indexed_indirect_parameters_buffer:
        RawBufferVec<LatePreprocessWorkItemIndirectParameters>,

    /// A buffer that holds the number of non-indexed meshes that weren't
    /// visible in the previous frame, when GPU occlusion culling is in use.
    ///
    /// There's one set of [`LatePreprocessWorkItemIndirectParameters`] per
    /// view. Bevy uses this value to determine how many threads to dispatch to
    /// check meshes that weren't visible last frame to see if they became
    /// newly visible this frame.
    pub late_non_indexed_indirect_parameters_buffer:
        RawBufferVec<LatePreprocessWorkItemIndirectParameters>,
}

/// Holds the GPU buffer of instance input data, which is the data about each
/// mesh instance that the CPU provides.
///
/// `BDI` is the *buffer data input* type, which the GPU mesh preprocessing
/// shader is expected to expand to the full *buffer data* type.
pub struct InstanceInputUniformBuffer<BDI>
where
    BDI: Pod + Default,
{
    /// The buffer containing the data that will be uploaded to the GPU.
    buffer: RawBufferVec<BDI>,

    /// Indices of slots that are free within the buffer.
    ///
    /// When adding data, we overwrite these slots before growing the buffer
    /// itself.
    free_uniform_indices: Vec<u32>,
}

impl<BDI> InstanceInputUniformBuffer<BDI>
where
    BDI: Pod + Default,
{
    /// Creates a new, empty buffer.
    pub fn new() -> InstanceInputUniformBuffer<BDI> {
        InstanceInputUniformBuffer {
            buffer: RawBufferVec::new(BufferUsages::STORAGE),
            free_uniform_indices: vec![],
        }
    }

    /// Clears the buffer and the free-slot list out.
    pub fn clear(&mut self) {
        self.buffer.clear();
        self.free_uniform_indices.clear();
    }

    /// Returns the [`RawBufferVec`] corresponding to this input uniform buffer.
    #[inline]
    pub fn buffer(&self) -> &RawBufferVec<BDI> {
        &self.buffer
    }

    /// Adds a new piece of buffered data to the uniform buffer and returns its
    /// index.
    pub fn add(&mut self, element: BDI) -> u32 {
        match self.free_uniform_indices.pop() {
            Some(uniform_index) => {
                self.buffer.values_mut()[uniform_index as usize] = element;
                uniform_index
            }
            None => self.buffer.push(element) as u32,
        }
    }

    /// Removes a piece of buffered data from the uniform buffer.
    ///
    /// This simply marks the data as free.
    pub fn remove(&mut self, uniform_index: u32) {
        self.free_uniform_indices.push(uniform_index);
    }

    /// Returns the piece of buffered data at the given index.
    ///
    /// Returns [`None`] if the index is out of bounds or the data has been
    /// removed.
    pub fn get(&self, uniform_index: u32) -> Option<BDI> {
        if (uniform_index as usize) >= self.buffer.len()
            || self.free_uniform_indices.contains(&uniform_index)
        {
            None
        } else {
            Some(self.get_unchecked(uniform_index))
        }
    }

    /// Returns the piece of buffered data at the given index.
    /// Can return data that has previously been removed.
    ///
    /// # Panics
    ///
    /// Panics if `uniform_index` is not in bounds of [`Self::buffer`].
    pub fn get_unchecked(&self, uniform_index: u32) -> BDI {
        self.buffer.values()[uniform_index as usize]
    }

    /// Stores a piece of buffered data at the given index.
    ///
    /// # Panics
    ///
    /// Panics if `uniform_index` is not in bounds of [`Self::buffer`].
    pub fn set(&mut self, uniform_index: u32, element: BDI) {
        self.buffer.values_mut()[uniform_index as usize] = element;
    }

    /// Ensures that the buffers are nonempty, which the GPU requires before an
    /// upload can take place.
    pub fn ensure_nonempty(&mut self) {
        if self.buffer.is_empty() {
            self.buffer.push(default());
        }
    }

    /// Returns the number of instances in this buffer.
    pub fn len(&self) -> usize {
        self.buffer.len()
    }

    /// Returns true if this buffer contains no instances.
    pub fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Consumes this [`InstanceInputUniformBuffer`] and returns the raw buffer
    /// ready to be uploaded to the GPU.
    pub fn into_buffer(self) -> RawBufferVec<BDI> {
        self.buffer
    }
}

impl<BDI> Default for InstanceInputUniformBuffer<BDI>
where
    BDI: Pod + Default,
{
    fn default() -> Self {
        Self::new()
    }
}
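
// A minimal sketch of the free-list behavior above: `remove` only marks a
// slot as free, and the next `add` reuses that slot instead of growing the
// buffer. (`u32` stands in for a real buffer data input type.)
fn _instance_input_free_list_demo() {
    let mut buffer = InstanceInputUniformBuffer::<u32>::new();
    let first = buffer.add(10); // occupies slot 0
    let second = buffer.add(20); // occupies slot 1
    buffer.remove(first); // slot 0 is now free
    assert_eq!(buffer.get(first), None); // freed data reads back as `None`
    let reused = buffer.add(30); // reuses slot 0 rather than pushing
    assert_eq!(reused, first);
    assert_eq!(buffer.get(second), Some(20));
    assert_eq!(buffer.len(), 2); // the underlying buffer never grew past 2
}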

/// The buffer of GPU preprocessing work items for a single view.
#[cfg_attr(
    not(target_arch = "wasm32"),
    expect(
        clippy::large_enum_variant,
        reason = "See https://github.com/bevyengine/bevy/issues/19220"
    )
)]
pub enum PreprocessWorkItemBuffers {
    /// The work items we use if we aren't using indirect drawing.
    ///
    /// Because we don't have to separate indexed from non-indexed meshes in
    /// direct mode, we only have a single buffer here.
    Direct(RawBufferVec<PreprocessWorkItem>),

    /// The buffer of work items we use if we are using indirect drawing.
    ///
    /// We need to separate out indexed meshes from non-indexed meshes in this
    /// case because the indirect parameters for these two types of meshes have
    /// different sizes.
    Indirect {
        /// The buffer of work items corresponding to indexed meshes.
        indexed: RawBufferVec<PreprocessWorkItem>,
        /// The buffer of work items corresponding to non-indexed meshes.
        non_indexed: RawBufferVec<PreprocessWorkItem>,
        /// The work item buffers we use when GPU occlusion culling is in use.
        gpu_occlusion_culling: Option<GpuOcclusionCullingWorkItemBuffers>,
    },
}

/// The work item buffers we use when GPU occlusion culling is in use.
pub struct GpuOcclusionCullingWorkItemBuffers {
    /// The buffer of work items corresponding to indexed meshes.
    pub late_indexed: UninitBufferVec<PreprocessWorkItem>,
    /// The buffer of work items corresponding to non-indexed meshes.
    pub late_non_indexed: UninitBufferVec<PreprocessWorkItem>,
    /// The offset into the
    /// [`UntypedPhaseBatchedInstanceBuffers::late_indexed_indirect_parameters_buffer`]
    /// where this view's indirect dispatch counts for indexed meshes live.
    pub late_indirect_parameters_indexed_offset: u32,
    /// The offset into the
    /// [`UntypedPhaseBatchedInstanceBuffers::late_non_indexed_indirect_parameters_buffer`]
    /// where this view's indirect dispatch counts for non-indexed meshes live.
    pub late_indirect_parameters_non_indexed_offset: u32,
}

/// A GPU-side data structure that stores the number of workgroups to dispatch
/// for the second phase of GPU occlusion culling.
///
/// The late mesh preprocessing phase checks meshes that weren't visible last
/// frame to determine if they're potentially visible this frame.
#[derive(Clone, Copy, ShaderType, Pod, Zeroable)]
#[repr(C)]
pub struct LatePreprocessWorkItemIndirectParameters {
    /// The number of workgroups to dispatch.
    ///
    /// This will be equal to `work_item_count / 64`, rounded *up*.
    dispatch_x: u32,
    /// The number of workgroups along the abstract Y axis to dispatch: always
    /// 1.
    dispatch_y: u32,
    /// The number of workgroups along the abstract Z axis to dispatch: always
    /// 1.
    dispatch_z: u32,
    /// The actual number of work items.
    ///
    /// The GPU indirect dispatch doesn't read this, but it's used internally
    /// to determine the actual number of work items that exist in the late
    /// preprocessing work item buffer.
    work_item_count: u32,
    /// Padding to 64-byte boundaries for some hardware.
    pad: UVec4,
}

impl Default for LatePreprocessWorkItemIndirectParameters {
    fn default() -> LatePreprocessWorkItemIndirectParameters {
        LatePreprocessWorkItemIndirectParameters {
            dispatch_x: 0,
            dispatch_y: 1,
            dispatch_z: 1,
            work_item_count: 0,
            pad: default(),
        }
    }
}
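
// A sketch (hypothetical helper) of the invariant documented above:
// `dispatch_x` is `work_item_count / 64`, rounded up, while Y and Z stay 1.
fn _late_dispatch_parameters(work_item_count: u32) -> LatePreprocessWorkItemIndirectParameters {
    LatePreprocessWorkItemIndirectParameters {
        // e.g. 65 work items -> 2 workgroups of 64 threads each.
        dispatch_x: work_item_count.div_ceil(64),
        work_item_count,
        // `dispatch_y`/`dispatch_z` stay 1 and `pad` stays zeroed.
        ..Default::default()
    }
}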

/// Returns the set of work item buffers for the given view, first creating it
/// if necessary.
///
/// Bevy uses work item buffers to tell the mesh preprocessing compute shader
/// which meshes are to be drawn.
///
/// You may need to call this function if you're implementing your own custom
/// render phases. See the `specialized_mesh_pipeline` example.
pub fn get_or_create_work_item_buffer<'a, I>(
    work_item_buffers: &'a mut HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,
    view: RetainedViewEntity,
    no_indirect_drawing: bool,
    enable_gpu_occlusion_culling: bool,
) -> &'a mut PreprocessWorkItemBuffers
where
    I: 'static,
{
    let preprocess_work_item_buffers = match work_item_buffers.entry(view) {
        Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
        Entry::Vacant(vacant_entry) => {
            if no_indirect_drawing {
                vacant_entry.insert(PreprocessWorkItemBuffers::Direct(RawBufferVec::new(
                    BufferUsages::STORAGE,
                )))
            } else {
                vacant_entry.insert(PreprocessWorkItemBuffers::Indirect {
                    indexed: RawBufferVec::new(BufferUsages::STORAGE),
                    non_indexed: RawBufferVec::new(BufferUsages::STORAGE),
                    // We fill this in below if `enable_gpu_occlusion_culling`
                    // is set.
                    gpu_occlusion_culling: None,
                })
            }
        }
    };

    // Initialize the GPU occlusion culling buffers if necessary.
    if let PreprocessWorkItemBuffers::Indirect {
        ref mut gpu_occlusion_culling,
        ..
    } = *preprocess_work_item_buffers
    {
        match (
            enable_gpu_occlusion_culling,
            gpu_occlusion_culling.is_some(),
        ) {
            (false, false) | (true, true) => {}
            (false, true) => {
                *gpu_occlusion_culling = None;
            }
            (true, false) => {
                *gpu_occlusion_culling = Some(GpuOcclusionCullingWorkItemBuffers {
                    late_indexed: UninitBufferVec::new(BufferUsages::STORAGE),
                    late_non_indexed: UninitBufferVec::new(BufferUsages::STORAGE),
                    late_indirect_parameters_indexed_offset: 0,
                    late_indirect_parameters_non_indexed_offset: 0,
                });
            }
        }
    }

    preprocess_work_item_buffers
}

/// Initializes work item buffers for a phase in preparation for a new frame.
pub fn init_work_item_buffers(
    work_item_buffers: &mut PreprocessWorkItemBuffers,
    late_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec<
        LatePreprocessWorkItemIndirectParameters,
    >,
    late_non_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec<
        LatePreprocessWorkItemIndirectParameters,
    >,
) {
    // Add the offsets for indirect parameters that the late phase of mesh
    // preprocessing writes to.
    if let PreprocessWorkItemBuffers::Indirect {
        gpu_occlusion_culling:
            Some(GpuOcclusionCullingWorkItemBuffers {
                ref mut late_indirect_parameters_indexed_offset,
                ref mut late_indirect_parameters_non_indexed_offset,
                ..
            }),
        ..
    } = *work_item_buffers
    {
        *late_indirect_parameters_indexed_offset = late_indexed_indirect_parameters_buffer
            .push(LatePreprocessWorkItemIndirectParameters::default())
            as u32;
        *late_indirect_parameters_non_indexed_offset = late_non_indexed_indirect_parameters_buffer
            .push(LatePreprocessWorkItemIndirectParameters::default())
            as u32;
    }
}

impl PreprocessWorkItemBuffers {
    /// Adds a new work item to the appropriate buffer.
    ///
    /// `indexed` specifies whether the work item corresponds to an indexed
    /// mesh.
    pub fn push(&mut self, indexed: bool, preprocess_work_item: PreprocessWorkItem) {
        match *self {
            PreprocessWorkItemBuffers::Direct(ref mut buffer) => {
                buffer.push(preprocess_work_item);
            }
            PreprocessWorkItemBuffers::Indirect {
                indexed: ref mut indexed_buffer,
                non_indexed: ref mut non_indexed_buffer,
                ref mut gpu_occlusion_culling,
            } => {
                if indexed {
                    indexed_buffer.push(preprocess_work_item);
                } else {
                    non_indexed_buffer.push(preprocess_work_item);
                }

                if let Some(ref mut gpu_occlusion_culling) = *gpu_occlusion_culling {
                    if indexed {
                        gpu_occlusion_culling.late_indexed.add();
                    } else {
                        gpu_occlusion_culling.late_non_indexed.add();
                    }
                }
            }
        }
    }

    /// Clears out the GPU work item buffers in preparation for a new frame.
    pub fn clear(&mut self) {
        match *self {
            PreprocessWorkItemBuffers::Direct(ref mut buffer) => {
                buffer.clear();
            }
            PreprocessWorkItemBuffers::Indirect {
                indexed: ref mut indexed_buffer,
                non_indexed: ref mut non_indexed_buffer,
                ref mut gpu_occlusion_culling,
            } => {
                indexed_buffer.clear();
                non_indexed_buffer.clear();

                if let Some(ref mut gpu_occlusion_culling) = *gpu_occlusion_culling {
                    gpu_occlusion_culling.late_indexed.clear();
                    gpu_occlusion_culling.late_non_indexed.clear();
                    gpu_occlusion_culling.late_indirect_parameters_indexed_offset = 0;
                    gpu_occlusion_culling.late_indirect_parameters_non_indexed_offset = 0;
                }
            }
        }
    }
}
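
// A minimal sketch of the push flow above for a custom phase: fetch (or
// create) the view's buffers, then queue one work item. The `()` phase type,
// the flag values, and the indices here are illustrative only.
fn _queue_one_work_item(
    work_item_buffers: &mut HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,
    view: RetainedViewEntity,
    input_index: u32,
    output_index: u32,
) {
    let buffers = get_or_create_work_item_buffer::<()>(
        work_item_buffers,
        view,
        /*no_indirect_drawing=*/ true,
        /*enable_gpu_occlusion_culling=*/ false,
    );
    // In direct mode the second field is the output index; in indirect mode
    // it would instead be an indirect parameters index.
    buffers.push(
        /*indexed=*/ true,
        PreprocessWorkItem {
            input_index,
            output_or_indirect_parameters_index: output_index,
        },
    );
}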

/// One invocation of the preprocessing shader: i.e. one mesh instance in a
/// view.
#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct PreprocessWorkItem {
    /// The index of the batch input data in the input buffer that the shader
    /// reads from.
    pub input_index: u32,

    /// In direct mode, the index of the mesh uniform; in indirect mode, the
    /// index of the [`IndirectParametersGpuMetadata`].
    ///
    /// In indirect mode, this is the index of the
    /// [`IndirectParametersGpuMetadata`] in the
    /// `IndirectParametersBuffers::indexed_metadata` or
    /// `IndirectParametersBuffers::non_indexed_metadata`.
    pub output_or_indirect_parameters_index: u32,
}

/// The `wgpu` indirect parameters structure that specifies a GPU draw command.
///
/// This is the variant for indexed meshes. We generate the instances of this
/// structure in the `build_indirect_params.wgsl` compute shader.
#[derive(Clone, Copy, Debug, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct IndirectParametersIndexed {
    /// The number of indices that this mesh has.
    pub index_count: u32,
    /// The number of instances we are to draw.
    pub instance_count: u32,
    /// The offset of the first index for this mesh in the index buffer slab.
    pub first_index: u32,
    /// The offset of the first vertex for this mesh in the vertex buffer slab.
    pub base_vertex: u32,
    /// The index of the first mesh instance in the `MeshUniform` buffer.
    pub first_instance: u32,
}
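
// A sketch of one indexed indirect draw command with illustrative values:
// draw 2 instances of a 36-index mesh whose `MeshUniform`s start at slot 5.
// (In practice the `build_indirect_params.wgsl` shader writes these on GPU.)
fn _example_indexed_draw_command() -> IndirectParametersIndexed {
    IndirectParametersIndexed {
        index_count: 36,
        instance_count: 2,
        first_index: 0,
        base_vertex: 0,
        first_instance: 5,
    }
}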

/// The `wgpu` indirect parameters structure that specifies a GPU draw command.
///
/// This is the variant for non-indexed meshes. We generate the instances of
/// this structure in the `build_indirect_params.wgsl` compute shader.
#[derive(Clone, Copy, Debug, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct IndirectParametersNonIndexed {
    /// The number of vertices that this mesh has.
    pub vertex_count: u32,
    /// The number of instances we are to draw.
    pub instance_count: u32,
    /// The offset of the first vertex for this mesh in the vertex buffer slab.
    pub base_vertex: u32,
    /// The index of the first mesh instance in the `Mesh` buffer.
    pub first_instance: u32,
}

/// A structure, initialized on CPU and read on GPU, that contains metadata
/// about each batch.
///
/// Each batch will have one instance of this structure.
#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct IndirectParametersCpuMetadata {
    /// The index of the first instance of this mesh in the array of
    /// `MeshUniform`s.
    ///
    /// Note that this is the *first* output index in this batch. Since each
    /// instance of this structure refers to arbitrarily many instances, the
    /// `MeshUniform`s corresponding to this batch span the indices
    /// `base_output_index..(base_output_index + instance_count)`.
    pub base_output_index: u32,

    /// The index of the batch set that this batch belongs to in the
    /// [`IndirectBatchSet`] buffer.
    ///
    /// A *batch set* is a set of meshes that may be multi-drawn together.
    /// Multiple batches (and therefore multiple instances of
    /// [`IndirectParametersGpuMetadata`] structures) can be part of the same
    /// batch set.
    pub batch_set_index: u32,
}
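
// A sketch of the span documented above: a batch with `base_output_index` 40
// and 3 instances covers `MeshUniform`s 40..43. (Hypothetical helper.)
fn _batch_output_range(
    metadata: &IndirectParametersCpuMetadata,
    instance_count: u32,
) -> core::ops::Range<u32> {
    metadata.base_output_index..(metadata.base_output_index + instance_count)
}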

/// A structure, written and read on the GPU, that records how many instances
/// of each mesh are actually to be drawn.
///
/// The GPU mesh preprocessing shader increments the
/// [`Self::early_instance_count`] and [`Self::late_instance_count`] as it
/// determines that meshes are visible. The indirect parameter building shader
/// reads this metadata in order to construct the indirect draw parameters.
///
/// Each batch will have one instance of this structure.
#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct IndirectParametersGpuMetadata {
    /// The index of the first mesh in this batch in the array of
    /// `MeshInputUniform`s.
    pub mesh_index: u32,

    /// The number of instances that were judged visible last frame.
    ///
    /// The CPU sets this value to 0, and the GPU mesh preprocessing shader
    /// increments it as it culls mesh instances.
    pub early_instance_count: u32,

    /// The number of instances that have been judged potentially visible this
    /// frame that weren't in the last frame's potentially visible set.
    ///
    /// The CPU sets this value to 0, and the GPU mesh preprocessing shader
    /// increments it as it culls mesh instances.
    pub late_instance_count: u32,
}

/// A structure, shared between CPU and GPU, that holds the number of on-GPU
/// indirect draw commands for each *batch set*.
///
/// A *batch set* is a set of meshes that may be multi-drawn together.
///
/// If the current hardware and driver support `multi_draw_indirect_count`, the
/// indirect parameters building shader increments
/// [`Self::indirect_parameters_count`] as it generates indirect parameters. The
/// `multi_draw_indirect_count` command reads
/// [`Self::indirect_parameters_count`] in order to determine how many commands
/// belong to each batch set.
#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct IndirectBatchSet {
    /// The number of indirect parameter commands (i.e. batches) in this batch
    /// set.
    ///
    /// The CPU sets this value to 0 before uploading this structure to GPU. The
    /// indirect parameters building shader increments this value as it creates
    /// indirect parameters. Then the `multi_draw_indirect_count` command reads
    /// this value in order to determine how many indirect draw commands to
    /// process.
    pub indirect_parameters_count: u32,

    /// The offset within the `IndirectParametersBuffers::indexed_data` or
    /// `IndirectParametersBuffers::non_indexed_data` of the first indirect draw
    /// command for this batch set.
    ///
    /// The CPU fills out this value.
    pub indirect_parameters_base: u32,
}

/// The buffers containing all the information that indirect draw commands
/// (`multi_draw_indirect`, `multi_draw_indirect_count`) use to draw the scene.
///
/// In addition to the indirect draw buffers themselves, this structure contains
/// the buffers that store [`IndirectParametersGpuMetadata`], which are the
/// structures that culling writes to so that the indirect parameter building
/// pass can determine how many meshes are actually to be drawn.
///
/// These buffers will remain empty if indirect drawing isn't in use.
#[derive(Resource, Deref, DerefMut)]
pub struct IndirectParametersBuffers {
    /// A mapping from a phase type ID to the indirect parameters buffers for
    /// that phase.
    ///
    /// Examples of phase type IDs are `Opaque3d` and `AlphaMask3d`.
    #[deref]
    pub buffers: TypeIdMap<UntypedPhaseIndirectParametersBuffers>,
    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
    /// that they can be read back to CPU.
    ///
    /// This is a debugging feature that may reduce performance. It primarily
    /// exists for the `occlusion_culling` example.
    pub allow_copies_from_indirect_parameter_buffers: bool,
}

impl IndirectParametersBuffers {
    /// Initializes a new [`IndirectParametersBuffers`] resource.
    pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> IndirectParametersBuffers {
        IndirectParametersBuffers {
            buffers: TypeIdMap::default(),
            allow_copies_from_indirect_parameter_buffers,
        }
    }
}

/// The buffers containing all the information that indirect draw commands use
/// to draw the scene, for a single phase.
///
/// This is the version of the structure that has a type parameter, so that the
/// batching for different phases can run in parallel.
///
/// See the [`IndirectParametersBuffers`] documentation for more information.
#[derive(Resource)]
pub struct PhaseIndirectParametersBuffers<PI>
where
    PI: PhaseItem,
{
    /// The indirect draw buffers for the phase.
    pub buffers: UntypedPhaseIndirectParametersBuffers,
    phantom: PhantomData<PI>,
}

impl<PI> PhaseIndirectParametersBuffers<PI>
where
    PI: PhaseItem,
{
    pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> Self {
        PhaseIndirectParametersBuffers {
            buffers: UntypedPhaseIndirectParametersBuffers::new(
                allow_copies_from_indirect_parameter_buffers,
            ),
            phantom: PhantomData,
        }
    }
}

/// The buffers containing all the information that indirect draw commands use
/// to draw the scene, for a single phase.
///
/// This is the version of the structure that doesn't have a type parameter, so
/// that it can be inserted into [`IndirectParametersBuffers::buffers`].
///
/// See the [`IndirectParametersBuffers`] documentation for more information.
pub struct UntypedPhaseIndirectParametersBuffers {
    /// Information that indirect draw commands use to draw indexed meshes in
    /// the scene.
    pub indexed: MeshClassIndirectParametersBuffers<IndirectParametersIndexed>,
    /// Information that indirect draw commands use to draw non-indexed meshes
    /// in the scene.
    pub non_indexed: MeshClassIndirectParametersBuffers<IndirectParametersNonIndexed>,
}

impl UntypedPhaseIndirectParametersBuffers {
    /// Creates the indirect parameters buffers.
    pub fn new(
        allow_copies_from_indirect_parameter_buffers: bool,
    ) -> UntypedPhaseIndirectParametersBuffers {
        UntypedPhaseIndirectParametersBuffers {
            non_indexed: MeshClassIndirectParametersBuffers::new(
                allow_copies_from_indirect_parameter_buffers,
            ),
            indexed: MeshClassIndirectParametersBuffers::new(
                allow_copies_from_indirect_parameter_buffers,
            ),
        }
    }

    /// Reserves space for `count` new batches.
    ///
    /// The `indexed` parameter specifies whether the meshes that these batches
    /// correspond to are indexed or not.
    pub fn allocate(&mut self, indexed: bool, count: u32) -> u32 {
        if indexed {
            self.indexed.allocate(count)
        } else {
            self.non_indexed.allocate(count)
        }
    }

    /// Returns the number of batches currently allocated.
    ///
    /// The `indexed` parameter specifies whether the meshes that these batches
    /// correspond to are indexed or not.
    fn batch_count(&self, indexed: bool) -> usize {
        if indexed {
            self.indexed.batch_count()
        } else {
            self.non_indexed.batch_count()
        }
    }

    /// Returns the number of batch sets currently allocated.
    ///
    /// The `indexed` parameter specifies whether the meshes that these batch
    /// sets correspond to are indexed or not.
    pub fn batch_set_count(&self, indexed: bool) -> usize {
        if indexed {
            self.indexed.batch_sets.len()
        } else {
            self.non_indexed.batch_sets.len()
        }
    }

    /// Adds a new batch set to `indexed.batch_sets` or
    /// `non_indexed.batch_sets` as appropriate.
    ///
    /// `indexed` specifies whether the meshes that these batch sets correspond
    /// to are indexed or not. `indirect_parameters_base` specifies the offset
    /// within `indexed.data` or `non_indexed.data` of the first batch in this
    /// batch set.
    #[inline]
    pub fn add_batch_set(&mut self, indexed: bool, indirect_parameters_base: u32) {
        if indexed {
            self.indexed.batch_sets.push(IndirectBatchSet {
                indirect_parameters_base,
                indirect_parameters_count: 0,
            });
        } else {
            self.non_indexed.batch_sets.push(IndirectBatchSet {
                indirect_parameters_base,
                indirect_parameters_count: 0,
            });
        }
    }

    /// Returns the index that a newly-added batch set will have.
    ///
    /// The `indexed` parameter specifies whether the meshes in such a batch
    /// set are indexed or not.
    pub fn get_next_batch_set_index(&self, indexed: bool) -> Option<NonMaxU32> {
        NonMaxU32::new(self.batch_set_count(indexed) as u32)
    }

    /// Clears out the buffers in preparation for a new frame.
    pub fn clear(&mut self) {
        self.indexed.clear();
        self.non_indexed.clear();
    }
}
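
// A minimal sketch of the allocation flow above: reserve one indexed batch,
// then register a batch set whose commands start at the returned offset.
fn _allocate_one_indexed_batch_set() {
    let mut buffers = UntypedPhaseIndirectParametersBuffers::new(false);
    // Reserves a slot in the indexed `data`, `cpu_metadata`, and
    // `gpu_metadata` buffers, returning the offset of the new slot.
    let base = buffers.allocate(/*indexed=*/ true, 1);
    // The batch set starts at that offset with a zero command count; the
    // indirect parameters building shader increments the count on GPU.
    buffers.add_batch_set(/*indexed=*/ true, base);
    assert_eq!(buffers.batch_set_count(true), 1);
}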

/// The buffers containing all the information that indirect draw commands use
/// to draw the scene, for a single mesh class (indexed or non-indexed), for a
/// single phase.
pub struct MeshClassIndirectParametersBuffers<IP>
where
    IP: Clone + ShaderSize + WriteInto,
{
    /// The GPU buffer that stores the indirect draw parameters for the meshes.
    ///
    /// The indirect parameters building shader writes to this buffer, while the
    /// `multi_draw_indirect` or `multi_draw_indirect_count` commands read from
    /// it to perform the draws.
    data: UninitBufferVec<IP>,

    /// The GPU buffer that holds the data used to construct indirect draw
    /// parameters for meshes.
    ///
    /// The CPU writes to this buffer, and the indirect parameters building
    /// shader reads this buffer to construct the indirect draw parameters.
    cpu_metadata: RawBufferVec<IndirectParametersCpuMetadata>,

    /// The GPU buffer that holds data built by the GPU used to construct
    /// indirect draw parameters for meshes.
    ///
    /// The GPU mesh preprocessing shader writes to this buffer, and the
    /// indirect parameters building shader reads this buffer to construct the
    /// indirect draw parameters.
    gpu_metadata: UninitBufferVec<IndirectParametersGpuMetadata>,

    /// The GPU buffer that holds the number of indirect draw commands for each
    /// phase of each view, for meshes.
    ///
    /// The indirect parameters building shader writes to this buffer, and the
    /// `multi_draw_indirect_count` command reads from it in order to know how
    /// many indirect draw commands to process.
    batch_sets: RawBufferVec<IndirectBatchSet>,
}

impl<IP> MeshClassIndirectParametersBuffers<IP>
where
    IP: Clone + ShaderSize + WriteInto,
{
    fn new(
        allow_copies_from_indirect_parameter_buffers: bool,
    ) -> MeshClassIndirectParametersBuffers<IP> {
        let mut indirect_parameter_buffer_usages = BufferUsages::STORAGE | BufferUsages::INDIRECT;
        if allow_copies_from_indirect_parameter_buffers {
            indirect_parameter_buffer_usages |= BufferUsages::COPY_SRC;
        }

        MeshClassIndirectParametersBuffers {
            data: UninitBufferVec::new(indirect_parameter_buffer_usages),
            cpu_metadata: RawBufferVec::new(BufferUsages::STORAGE),
            gpu_metadata: UninitBufferVec::new(BufferUsages::STORAGE),
            batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages),
        }
    }

    /// Returns the GPU buffer that stores the indirect draw parameters for
    /// the meshes in this mesh class.
    ///
    /// The indirect parameters building shader writes to this buffer, while the
    /// `multi_draw_indirect` or `multi_draw_indirect_count` commands read from
    /// it to perform the draws.
    #[inline]
    pub fn data_buffer(&self) -> Option<&Buffer> {
        self.data.buffer()
    }

    /// Returns the GPU buffer that holds the CPU-constructed data used to
    /// construct indirect draw parameters for meshes.
    ///
    /// The CPU writes to this buffer, and the indirect parameters building
    /// shader reads this buffer to construct the indirect draw parameters.
    #[inline]
    pub fn cpu_metadata_buffer(&self) -> Option<&Buffer> {
        self.cpu_metadata.buffer()
    }

    /// Returns the GPU buffer that holds the GPU-constructed data used to
    /// construct indirect draw parameters for meshes.
    ///
    /// The GPU mesh preprocessing shader writes to this buffer, and the
    /// indirect parameters building shader reads this buffer to construct the
    /// indirect draw parameters.
    #[inline]
    pub fn gpu_metadata_buffer(&self) -> Option<&Buffer> {
        self.gpu_metadata.buffer()
    }

    /// Returns the GPU buffer that holds the number of indirect draw commands
    /// for each phase of each view.
    ///
    /// The indirect parameters building shader writes to this buffer, and the
    /// `multi_draw_indirect_count` command reads from it in order to know how
    /// many indirect draw commands to process.
    #[inline]
    pub fn batch_sets_buffer(&self) -> Option<&Buffer> {
        self.batch_sets.buffer()
    }

    /// Reserves space for `count` new batches.
    ///
    /// This allocates in the [`Self::cpu_metadata`], [`Self::gpu_metadata`],
    /// and [`Self::data`] buffers.
    fn allocate(&mut self, count: u32) -> u32 {
        let length = self.data.len();
        self.cpu_metadata.reserve_internal(count as usize);
        self.gpu_metadata.add_multiple(count as usize);
        for _ in 0..count {
            self.data.add();
            self.cpu_metadata
                .push(IndirectParametersCpuMetadata::default());
        }
        length as u32
    }

    /// Sets the [`IndirectParametersCpuMetadata`] for the mesh at the given
    /// index.
    pub fn set(&mut self, index: u32, value: IndirectParametersCpuMetadata) {
        self.cpu_metadata.set(index, value);
    }

    /// Returns the number of batches corresponding to meshes that are currently
    /// allocated.
    #[inline]
    pub fn batch_count(&self) -> usize {
        self.data.len()
    }

    /// Clears out all the buffers in preparation for a new frame.
    pub fn clear(&mut self) {
        self.data.clear();
        self.cpu_metadata.clear();
        self.gpu_metadata.clear();
        self.batch_sets.clear();
    }
}

impl Default for IndirectParametersBuffers {
    fn default() -> Self {
        // By default, we don't allow GPU indirect parameter mapping, since
        // that's a debugging option.
        Self::new(false)
    }
}

impl FromWorld for GpuPreprocessingSupport {
    fn from_world(world: &mut World) -> Self {
        let adapter = world.resource::<RenderAdapter>();
        let device = world.resource::<RenderDevice>();

        // Filter Android drivers that are incompatible with GPU preprocessing:
        // - We filter out Adreno 730 and earlier GPUs (except 720, as it's newer
        //   than 730).
        // - We filter out Mali GPUs with driver versions lower than 48.
        fn is_non_supported_android_device(adapter_info: &RenderAdapterInfo) -> bool {
            crate::get_adreno_model(adapter_info).is_some_and(|model| model != 720 && model <= 730)
                || crate::get_mali_driver_version(adapter_info).is_some_and(|version| version < 48)
        }

        let culling_feature_support = device.features().contains(
            Features::INDIRECT_FIRST_INSTANCE
                | Features::MULTI_DRAW_INDIRECT
                | Features::PUSH_CONSTANTS,
        );
        // Depth downsampling for occlusion culling requires 12 storage textures.
        let limit_support = device.limits().max_storage_textures_per_shader_stage >= 12 &&
            // Even if the adapter supports compute, we might be simulating a lack of
            // compute via device limits (see `WgpuSettingsPriority::WebGL2` and
            // `wgpu::Limits::downlevel_webgl2_defaults()`). This will have set all the
            // `max_compute_*` limits to zero, so we arbitrarily pick one as a canary.
            device.limits().max_compute_workgroup_storage_size != 0;

        let downlevel_support = adapter
            .get_downlevel_capabilities()
            .flags
            .contains(DownlevelFlags::COMPUTE_SHADERS);

        let adapter_info = RenderAdapterInfo(WgpuWrapper::new(adapter.get_info()));

        let max_supported_mode = if device.limits().max_compute_workgroup_size_x == 0
            || is_non_supported_android_device(&adapter_info)
            || adapter_info.backend == wgpu::Backend::Gl
        {
            info!(
                "GPU preprocessing is not supported on this device. \
                Falling back to CPU preprocessing.",
            );
            GpuPreprocessingMode::None
        } else if !(culling_feature_support && limit_support && downlevel_support) {
            info!("Some GPU preprocessing features are limited on this device.");
            GpuPreprocessingMode::PreprocessingOnly
        } else {
            info!("GPU preprocessing is fully supported on this device.");
            GpuPreprocessingMode::Culling
        };

        GpuPreprocessingSupport { max_supported_mode }
    }
}

impl<BD, BDI> BatchedInstanceBuffers<BD, BDI>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod + Sync + Send + Default + 'static,
{
    /// Creates new buffers.
    pub fn new() -> Self {
        Self::default()
    }

    /// Clears out the buffers in preparation for a new frame.
    pub fn clear(&mut self) {
        for phase_instance_buffer in self.phase_instance_buffers.values_mut() {
            phase_instance_buffer.clear();
        }
    }
}

impl<BD> UntypedPhaseBatchedInstanceBuffers<BD>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
{
    pub fn new() -> Self {
        UntypedPhaseBatchedInstanceBuffers {
            data_buffer: UninitBufferVec::new(BufferUsages::STORAGE),
            work_item_buffers: HashMap::default(),
            late_indexed_indirect_parameters_buffer: RawBufferVec::new(
                BufferUsages::STORAGE | BufferUsages::INDIRECT,
            ),
            late_non_indexed_indirect_parameters_buffer: RawBufferVec::new(
                BufferUsages::STORAGE | BufferUsages::INDIRECT,
            ),
        }
    }

    /// Returns the binding of the buffer that contains the per-instance data.
    ///
    /// This buffer needs to be filled in via a compute shader.
    pub fn instance_data_binding(&self) -> Option<BindingResource<'_>> {
        self.data_buffer
            .buffer()
            .map(|buffer| buffer.as_entire_binding())
    }

    /// Clears out the buffers in preparation for a new frame.
    pub fn clear(&mut self) {
        self.data_buffer.clear();
        self.late_indexed_indirect_parameters_buffer.clear();
        self.late_non_indexed_indirect_parameters_buffer.clear();

        // Clear each individual set of buffers, but don't depopulate the hash
        // table. We want to avoid reallocating these vectors every frame.
        for view_work_item_buffers in self.work_item_buffers.values_mut() {
            view_work_item_buffers.clear();
        }
    }
}

impl<BD> Default for UntypedPhaseBatchedInstanceBuffers<BD>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
{
    fn default() -> Self {
        Self::new()
    }
}

/// Information about a render batch that we're building up during a sorted
/// render phase.
struct SortedRenderBatch<F>
where
    F: GetBatchData,
{
    /// The index of the first phase item in this batch in the list of phase
    /// items.
    phase_item_start_index: u32,

    /// The index of the first instance in this batch in the instance buffer.
    instance_start_index: u32,

    /// True if the mesh in question has an index buffer; false otherwise.
    indexed: bool,

    /// The index of the indirect parameters for this batch in the
    /// [`IndirectParametersBuffers`].
    ///
    /// If CPU culling is being used, then this will be `None`.
    indirect_parameters_index: Option<NonMaxU32>,

    /// Metadata that can be used to determine whether an instance can be placed
    /// into this batch.
    ///
    /// If `None`, the item inside is unbatchable.
    meta: Option<BatchMeta<F::CompareData>>,
}

impl<F> SortedRenderBatch<F>
where
    F: GetBatchData,
{
    /// Finalizes this batch and updates the [`SortedRenderPhase`] with the
    /// appropriate indices.
    ///
    /// `instance_end_index` is the index of the last instance in this batch
    /// plus one.
    fn flush<I>(
        self,
        instance_end_index: u32,
        phase: &mut SortedRenderPhase<I>,
        phase_indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
    ) where
        I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
    {
        let (batch_range, batch_extra_index) =
            phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut();
        *batch_range = self.instance_start_index..instance_end_index;
        *batch_extra_index = match self.indirect_parameters_index {
            Some(indirect_parameters_index) => PhaseItemExtraIndex::IndirectParametersIndex {
                range: u32::from(indirect_parameters_index)
                    ..(u32::from(indirect_parameters_index) + 1),
                batch_set_index: None,
            },
            None => PhaseItemExtraIndex::None,
        };
        if let Some(indirect_parameters_index) = self.indirect_parameters_index {
            phase_indirect_parameters_buffers
                .add_batch_set(self.indexed, indirect_parameters_index.into());
        }
    }
}

/// A system that runs early in extraction and clears out all the
/// [`BatchedInstanceBuffers`] for the frame.
///
/// We have to run this during extraction because, if GPU preprocessing is in
/// use, the extraction phase will write to the mesh input uniform buffers
/// directly, so the buffers need to be cleared before then.
pub fn clear_batched_gpu_instance_buffers<GFBD>(
    gpu_batched_instance_buffers: Option<
        ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
    >,
) where
    GFBD: GetFullBatchData,
{
    // Don't clear the entire table, because that would delete the buffers, and
    // we want to reuse those allocations.
    if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
        gpu_batched_instance_buffers.clear();
    }
}

/// A system that removes GPU preprocessing work item buffers that correspond to
/// deleted [`ExtractedView`]s.
///
/// This is a separate system from [`clear_batched_gpu_instance_buffers`]
/// because [`ExtractedView`]s aren't created until after the extraction phase
/// is completed.
pub fn delete_old_work_item_buffers<GFBD>(
    mut gpu_batched_instance_buffers: ResMut<
        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
    >,
    extracted_views: Query<&ExtractedView>,
) where
    GFBD: GetFullBatchData,
{
    let retained_view_entities: HashSet<_> = extracted_views
        .iter()
        .map(|extracted_view| extracted_view.retained_view_entity)
        .collect();
    for phase_instance_buffers in gpu_batched_instance_buffers
        .phase_instance_buffers
        .values_mut()
    {
        phase_instance_buffers
            .work_item_buffers
            .retain(|retained_view_entity, _| {
                retained_view_entities.contains(retained_view_entity)
            });
    }
}

/// Batch the items in a sorted render phase, when GPU instance buffer building
/// is in use. This means comparing metadata needed to draw each phase item and
/// trying to combine the draws into a batch.
pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<I, GFBD::BufferData>>,
    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<I>>,
    mut sorted_render_phases: ResMut<ViewSortedRenderPhases<I>>,
    mut views: Query<(
        &ExtractedView,
        Has<NoIndirectDrawing>,
        Has<OcclusionCulling>,
    )>,
    system_param_item: StaticSystemParam<GFBD::Param>,
) where
    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
    GFBD: GetFullBatchData,
{
    // We only process GPU-built batch data in this function.
    let UntypedPhaseBatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
        ref mut late_indexed_indirect_parameters_buffer,
        ref mut late_non_indexed_indirect_parameters_buffer,
    } = phase_batched_instance_buffers.buffers;

    for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
        let Some(phase) = sorted_render_phases.get_mut(&extracted_view.retained_view_entity) else {
            continue;
        };

        // Create the work item buffer if necessary.
        let work_item_buffer = get_or_create_work_item_buffer::<I>(
            work_item_buffers,
            extracted_view.retained_view_entity,
            no_indirect_drawing,
            gpu_occlusion_culling,
        );

        // Initialize those work item buffers in preparation for this new frame.
        init_work_item_buffers(
            work_item_buffer,
            late_indexed_indirect_parameters_buffer,
            late_non_indexed_indirect_parameters_buffer,
        );

        // Walk through the list of phase items, building up batches as we go.
        let mut batch: Option<SortedRenderBatch<GFBD>> = None;

        for current_index in 0..phase.items.len() {
            // Get the index of the input data, and comparison metadata, for
            // this entity.
            let item = &phase.items[current_index];
            let entity = item.main_entity();
            let item_is_indexed = item.indexed();
            let current_batch_input_index =
                GFBD::get_index_and_compare_data(&system_param_item, entity);

            // Unpack that index and metadata. Note that it's possible for the
            // index and/or metadata to be absent, which signifies that this
            // entity is unbatchable. In that case, we break the batch here.
            // If the index isn't present, the item isn't part of this
            // pipeline, so we skip it.
            let Some((current_input_index, current_meta)) = current_batch_input_index else {
                // Break a batch if we need to.
                if let Some(batch) = batch.take() {
                    batch.flush(
                        data_buffer.len() as u32,
                        phase,
                        &mut phase_indirect_parameters_buffers.buffers,
                    );
                }

                continue;
            };
            let current_meta =
                current_meta.map(|meta| BatchMeta::new(&phase.items[current_index], meta));

            // Determine if this entity can be included in the batch we're
            // building up.
            let can_batch = batch.as_ref().is_some_and(|batch| {
                // `None` for metadata indicates that the items are unbatchable.
                match (&current_meta, &batch.meta) {
                    (Some(current_meta), Some(batch_meta)) => current_meta == batch_meta,
                    (_, _) => false,
                }
            });

            // Make space in the data buffer for this instance.
            let output_index = data_buffer.add() as u32;

            // If we can't batch, break the existing batch and make a new one.
            if !can_batch {
                // Break a batch if we need to.
                if let Some(batch) = batch.take() {
                    batch.flush(
                        output_index,
                        phase,
                        &mut phase_indirect_parameters_buffers.buffers,
                    );
                }

                let indirect_parameters_index = if no_indirect_drawing {
                    None
                } else if item_is_indexed {
                    Some(
                        phase_indirect_parameters_buffers
                            .buffers
                            .indexed
                            .allocate(1),
                    )
                } else {
                    Some(
                        phase_indirect_parameters_buffers
                            .buffers
                            .non_indexed
                            .allocate(1),
                    )
                };

                // Start a new batch.
                if let Some(indirect_parameters_index) = indirect_parameters_index {
                    GFBD::write_batch_indirect_parameters_metadata(
                        item_is_indexed,
                        output_index,
                        None,
                        &mut phase_indirect_parameters_buffers.buffers,
                        indirect_parameters_index,
                    );
                };

                batch = Some(SortedRenderBatch {
                    phase_item_start_index: current_index as u32,
                    instance_start_index: output_index,
                    indexed: item_is_indexed,
                    indirect_parameters_index: indirect_parameters_index.and_then(NonMaxU32::new),
                    meta: current_meta,
                });
            }

            // Add a new preprocessing work item so that the preprocessing
            // shader will copy the per-instance data over.
            if let Some(batch) = batch.as_ref() {
                work_item_buffer.push(
                    item_is_indexed,
                    PreprocessWorkItem {
                        input_index: current_input_index.into(),
                        output_or_indirect_parameters_index: match (
                            no_indirect_drawing,
                            batch.indirect_parameters_index,
                        ) {
                            (true, _) => output_index,
                            (false, Some(indirect_parameters_index)) => {
                                indirect_parameters_index.into()
                            }
                            (false, None) => 0,
                        },
                    },
                );
            }
        }

        // Flush the final batch if necessary.
        if let Some(batch) = batch.take() {
            batch.flush(
                data_buffer.len() as u32,
                phase,
                &mut phase_indirect_parameters_buffers.buffers,
            );
        }
    }
}
1504
1505
/// Creates batches for a render phase that uses bins.
1506
pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
1507
mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>,
1508
phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<BPI>>,
1509
mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
1510
mut views: Query<
1511
(
1512
&ExtractedView,
1513
Has<NoIndirectDrawing>,
1514
Has<OcclusionCulling>,
1515
),
1516
With<ExtractedView>,
1517
>,
1518
param: StaticSystemParam<GFBD::Param>,
1519
) where
1520
BPI: BinnedPhaseItem,
1521
GFBD: GetFullBatchData,
1522
{
    let system_param_item = param.into_inner();

    let phase_indirect_parameters_buffers = phase_indirect_parameters_buffers.into_inner();

    let UntypedPhaseBatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
        ref mut late_indexed_indirect_parameters_buffer,
        ref mut late_non_indexed_indirect_parameters_buffer,
    } = phase_batched_instance_buffers.buffers;

    for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
        let Some(phase) = binned_render_phases.get_mut(&extracted_view.retained_view_entity) else {
            continue;
        };

        // Create the work item buffer if necessary; otherwise, just mark it as
        // used this frame.
        let work_item_buffer = get_or_create_work_item_buffer::<BPI>(
            work_item_buffers,
            extracted_view.retained_view_entity,
            no_indirect_drawing,
            gpu_occlusion_culling,
        );

        // Initialize those work item buffers in preparation for this new frame.
        init_work_item_buffers(
            work_item_buffer,
            late_indexed_indirect_parameters_buffer,
            late_non_indexed_indirect_parameters_buffer,
        );

        // Prepare multidrawables.

        if let (
            &mut BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets),
            &mut PreprocessWorkItemBuffers::Indirect {
                indexed: ref mut indexed_work_item_buffer,
                non_indexed: ref mut non_indexed_work_item_buffer,
                gpu_occlusion_culling: ref mut gpu_occlusion_culling_buffers,
            },
        ) = (&mut phase.batch_sets, &mut *work_item_buffer)
        {
            let mut output_index = data_buffer.len() as u32;

            // Initialize the state for both indexed and non-indexed meshes.
            let mut indexed_preparer: MultidrawableBatchSetPreparer<BPI, GFBD> =
                MultidrawableBatchSetPreparer::new(
                    phase_indirect_parameters_buffers.buffers.batch_count(true) as u32,
                    phase_indirect_parameters_buffers
                        .buffers
                        .indexed
                        .batch_sets
                        .len() as u32,
                );
            let mut non_indexed_preparer: MultidrawableBatchSetPreparer<BPI, GFBD> =
                MultidrawableBatchSetPreparer::new(
                    phase_indirect_parameters_buffers.buffers.batch_count(false) as u32,
                    phase_indirect_parameters_buffers
                        .buffers
                        .non_indexed
                        .batch_sets
                        .len() as u32,
                );

            // Prepare each batch set.
            for (batch_set_key, bins) in &phase.multidrawable_meshes {
                if batch_set_key.indexed() {
                    indexed_preparer.prepare_multidrawable_binned_batch_set(
                        bins,
                        &mut output_index,
                        data_buffer,
                        indexed_work_item_buffer,
                        &mut phase_indirect_parameters_buffers.buffers.indexed,
                        batch_sets,
                    );
                } else {
                    non_indexed_preparer.prepare_multidrawable_binned_batch_set(
                        bins,
                        &mut output_index,
                        data_buffer,
                        non_indexed_work_item_buffer,
                        &mut phase_indirect_parameters_buffers.buffers.non_indexed,
                        batch_sets,
                    );
                }
            }

            // Reserve space in the occlusion culling buffers, if necessary.
            if let Some(gpu_occlusion_culling_buffers) = gpu_occlusion_culling_buffers {
                gpu_occlusion_culling_buffers
                    .late_indexed
                    .add_multiple(indexed_preparer.work_item_count);
                gpu_occlusion_culling_buffers
                    .late_non_indexed
                    .add_multiple(non_indexed_preparer.work_item_count);
            }
        }

        // Prepare batchables.

        for (key, bin) in &phase.batchable_meshes {
            let mut batch: Option<BinnedRenderPhaseBatch> = None;
            for (&main_entity, &input_index) in bin.entities() {
                let output_index = data_buffer.add() as u32;

                match batch {
                    Some(ref mut batch) => {
                        batch.instance_range.end = output_index + 1;

                        // Append to the current batch.
                        //
                        // If we're in indirect mode, then we write the first
                        // output index of this batch, so that we have a
                        // tightly-packed buffer if GPU culling discards some of
                        // the instances. Otherwise, we can just write the
                        // output index directly.
                        work_item_buffer.push(
                            key.0.indexed(),
                            PreprocessWorkItem {
                                input_index: *input_index,
                                output_or_indirect_parameters_index: match (
                                    no_indirect_drawing,
                                    &batch.extra_index,
                                ) {
                                    (true, _) => output_index,
                                    (
                                        false,
                                        PhaseItemExtraIndex::IndirectParametersIndex {
                                            range: indirect_parameters_range,
                                            ..
                                        },
                                    ) => indirect_parameters_range.start,
                                    (false, &PhaseItemExtraIndex::DynamicOffset(_))
                                    | (false, &PhaseItemExtraIndex::None) => 0,
                                },
                            },
                        );
                    }

                    None if !no_indirect_drawing => {
                        // Start a new batch, in indirect mode.
                        let indirect_parameters_index = phase_indirect_parameters_buffers
                            .buffers
                            .allocate(key.0.indexed(), 1);
                        let batch_set_index = phase_indirect_parameters_buffers
                            .buffers
                            .get_next_batch_set_index(key.0.indexed());

                        GFBD::write_batch_indirect_parameters_metadata(
                            key.0.indexed(),
                            output_index,
                            batch_set_index,
                            &mut phase_indirect_parameters_buffers.buffers,
                            indirect_parameters_index,
                        );
                        work_item_buffer.push(
                            key.0.indexed(),
                            PreprocessWorkItem {
                                input_index: *input_index,
                                output_or_indirect_parameters_index: indirect_parameters_index,
                            },
                        );
                        batch = Some(BinnedRenderPhaseBatch {
                            representative_entity: (Entity::PLACEHOLDER, main_entity),
                            instance_range: output_index..output_index + 1,
                            extra_index: PhaseItemExtraIndex::IndirectParametersIndex {
                                range: indirect_parameters_index..(indirect_parameters_index + 1),
                                batch_set_index: None,
                            },
                        });
                    }

                    None => {
                        // Start a new batch, in direct mode.
                        work_item_buffer.push(
                            key.0.indexed(),
                            PreprocessWorkItem {
                                input_index: *input_index,
                                output_or_indirect_parameters_index: output_index,
                            },
                        );
                        batch = Some(BinnedRenderPhaseBatch {
                            representative_entity: (Entity::PLACEHOLDER, main_entity),
                            instance_range: output_index..output_index + 1,
                            extra_index: PhaseItemExtraIndex::None,
                        });
                    }
                }
            }

            if let Some(batch) = batch {
                match phase.batch_sets {
                    BinnedRenderPhaseBatchSets::DynamicUniforms(_) => {
                        error!("Dynamic uniform batch sets shouldn't be used here");
                    }
                    BinnedRenderPhaseBatchSets::Direct(ref mut vec) => {
                        vec.push(batch);
                    }
                    BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut vec) => {
                        // The Bevy renderer will never mark a mesh as batchable
                        // but not multidrawable if multidraw is in use.
                        // However, custom render pipelines might do so, such as
                        // the `specialized_mesh_pipeline` example.
                        vec.push(BinnedRenderPhaseBatchSet {
                            first_batch: batch,
                            batch_count: 1,
                            bin_key: key.1.clone(),
                            index: phase_indirect_parameters_buffers
                                .buffers
                                .batch_set_count(key.0.indexed())
                                as u32,
                        });
                    }
                }
            }
        }

        // Prepare unbatchables.
        for (key, unbatchables) in &mut phase.unbatchable_meshes {
            // Allocate the indirect parameters if necessary.
            let mut indirect_parameters_offset = if no_indirect_drawing {
                None
            } else if key.0.indexed() {
                Some(
                    phase_indirect_parameters_buffers
                        .buffers
                        .indexed
                        .allocate(unbatchables.entities.len() as u32),
                )
            } else {
                Some(
                    phase_indirect_parameters_buffers
                        .buffers
                        .non_indexed
                        .allocate(unbatchables.entities.len() as u32),
                )
            };

            for main_entity in unbatchables.entities.keys() {
                let Some(input_index) = GFBD::get_binned_index(&system_param_item, *main_entity)
                else {
                    continue;
                };
                let output_index = data_buffer.add() as u32;

                if let Some(ref mut indirect_parameters_index) = indirect_parameters_offset {
                    // We're in indirect mode, so add an indirect parameters
                    // index.
                    GFBD::write_batch_indirect_parameters_metadata(
                        key.0.indexed(),
                        output_index,
                        None,
                        &mut phase_indirect_parameters_buffers.buffers,
                        *indirect_parameters_index,
                    );
                    work_item_buffer.push(
                        key.0.indexed(),
                        PreprocessWorkItem {
                            input_index: input_index.into(),
                            output_or_indirect_parameters_index: *indirect_parameters_index,
                        },
                    );
                    unbatchables
                        .buffer_indices
                        .add(UnbatchableBinnedEntityIndices {
                            instance_index: *indirect_parameters_index,
                            extra_index: PhaseItemExtraIndex::IndirectParametersIndex {
                                range: *indirect_parameters_index..(*indirect_parameters_index + 1),
                                batch_set_index: None,
                            },
                        });
                    phase_indirect_parameters_buffers
                        .buffers
                        .add_batch_set(key.0.indexed(), *indirect_parameters_index);
                    *indirect_parameters_index += 1;
                } else {
                    work_item_buffer.push(
                        key.0.indexed(),
                        PreprocessWorkItem {
                            input_index: input_index.into(),
                            output_or_indirect_parameters_index: output_index,
                        },
                    );
                    unbatchables
                        .buffer_indices
                        .add(UnbatchableBinnedEntityIndices {
                            instance_index: output_index,
                            extra_index: PhaseItemExtraIndex::None,
                        });
                }
            }
        }
    }
}

/// The state that [`batch_and_prepare_binned_render_phase`] uses to construct
/// multidrawable batch sets.
///
/// The [`batch_and_prepare_binned_render_phase`] system maintains two of these:
/// one for indexed meshes and one for non-indexed meshes.
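///
/// Keeping the two mesh classes separate lets each class's indirect parameters
/// be packed contiguously into its own buffer.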
struct MultidrawableBatchSetPreparer<BPI, GFBD>
where
    BPI: BinnedPhaseItem,
    GFBD: GetFullBatchData,
{
    /// The offset in the indirect parameters buffer at which the next indirect
    /// parameters will be written.
    indirect_parameters_index: u32,
    /// The number of batch sets we've built so far for this mesh class.
    batch_set_index: u32,
    /// The number of work items we've emitted so far for this mesh class.
    work_item_count: usize,
    phantom: PhantomData<(BPI, GFBD)>,
}

impl<BPI, GFBD> MultidrawableBatchSetPreparer<BPI, GFBD>
where
    BPI: BinnedPhaseItem,
    GFBD: GetFullBatchData,
{
    /// Creates a new [`MultidrawableBatchSetPreparer`] that will start writing
    /// indirect parameters and batch sets at the given indices.
    #[inline]
    fn new(initial_indirect_parameters_index: u32, initial_batch_set_index: u32) -> Self {
        MultidrawableBatchSetPreparer {
            indirect_parameters_index: initial_indirect_parameters_index,
            batch_set_index: initial_batch_set_index,
            work_item_count: 0,
            phantom: PhantomData,
        }
    }

    /// Creates batch sets and writes the GPU data needed to draw all visible
    /// entities of one mesh class in the given batch set.
    ///
    /// The *mesh class* represents whether the mesh has indices or not.
    #[inline]
    fn prepare_multidrawable_binned_batch_set<IP>(
        &mut self,
        bins: &IndexMap<BPI::BinKey, RenderBin>,
        output_index: &mut u32,
        data_buffer: &mut UninitBufferVec<GFBD::BufferData>,
        indexed_work_item_buffer: &mut RawBufferVec<PreprocessWorkItem>,
        mesh_class_buffers: &mut MeshClassIndirectParametersBuffers<IP>,
        batch_sets: &mut Vec<BinnedRenderPhaseBatchSet<BPI::BinKey>>,
    ) where
        IP: Clone + ShaderSize + WriteInto,
    {
        let current_indexed_batch_set_index = self.batch_set_index;
        let current_output_index = *output_index;

        let indirect_parameters_base = self.indirect_parameters_index;

        // We're going to write the first entity into the batch set. Do this
        // here so that we can preload the bin into cache as a side effect.
        let Some((first_bin_key, first_bin)) = bins.iter().next() else {
            return;
        };
        let first_bin_len = first_bin.entities().len();
        let first_bin_entity = first_bin
            .entities()
            .keys()
            .next()
            .copied()
            .unwrap_or(MainEntity::from(Entity::PLACEHOLDER));

        // Traverse the batch set, processing each bin.
        for bin in bins.values() {
            // Record the first output index for this batch, as well as its own
            // index.
            mesh_class_buffers
                .cpu_metadata
                .push(IndirectParametersCpuMetadata {
                    base_output_index: *output_index,
                    batch_set_index: self.batch_set_index,
                });

            // Traverse the bin, pushing `PreprocessWorkItem`s for each entity
            // within it. This is a hot loop, so make it as fast as possible.
            for &input_index in bin.entities().values() {
                indexed_work_item_buffer.push(PreprocessWorkItem {
                    input_index: *input_index,
                    output_or_indirect_parameters_index: self.indirect_parameters_index,
                });
            }

            // Reserve space for the appropriate number of entities in the data
            // buffer. Also, advance the output index and work item count.
            let bin_entity_count = bin.entities().len();
            data_buffer.add_multiple(bin_entity_count);
            *output_index += bin_entity_count as u32;
            self.work_item_count += bin_entity_count;

            self.indirect_parameters_index += 1;
        }

        // Reserve space for the bins in this batch set in the GPU buffers.
        let bin_count = bins.len();
        mesh_class_buffers.gpu_metadata.add_multiple(bin_count);
        mesh_class_buffers.data.add_multiple(bin_count);

        // Write the information the GPU will need about this batch set.
        mesh_class_buffers.batch_sets.push(IndirectBatchSet {
            indirect_parameters_base,
            indirect_parameters_count: 0,
        });
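        // (`indirect_parameters_count` starts out zero here; the GPU fills it
        // in during preprocessing, once culling determines how many batches in
        // this set actually draw.)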

        self.batch_set_index += 1;

        // Record the batch set. The render node later processes this record to
        // render the batches.
        batch_sets.push(BinnedRenderPhaseBatchSet {
            first_batch: BinnedRenderPhaseBatch {
                representative_entity: (Entity::PLACEHOLDER, first_bin_entity),
                instance_range: current_output_index..(current_output_index + first_bin_len as u32),
                extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(NonMaxU32::new(
                    indirect_parameters_base,
                )),
            },
            bin_key: (*first_bin_key).clone(),
            batch_count: self.indirect_parameters_index - indirect_parameters_base,
            index: current_indexed_batch_set_index,
        });
    }
}

/// A system that gathers up the per-phase GPU buffers and inserts them into the
/// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] tables.
///
/// This runs after the [`batch_and_prepare_binned_render_phase`] or
/// [`batch_and_prepare_sorted_render_phase`] systems. It takes the per-phase
/// [`PhaseBatchedInstanceBuffers`] and [`PhaseIndirectParametersBuffers`]
/// resources and inserts them into the global [`BatchedInstanceBuffers`] and
/// [`IndirectParametersBuffers`] tables.
///
/// This system exists so that the [`batch_and_prepare_binned_render_phase`] and
/// [`batch_and_prepare_sorted_render_phase`] systems can run in parallel with
/// one another. If those two systems manipulated [`BatchedInstanceBuffers`] and
/// [`IndirectParametersBuffers`] directly, then they wouldn't be able to run in
/// parallel.
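///
/// The allocation reuse both tables rely on is the standard
/// `mem::take`-and-reclaim idiom. A minimal sketch with plain `std` types (the
/// names here are illustrative, not part of this module):
///
/// ```
/// use core::mem;
/// use std::collections::HashMap;
///
/// fn publish(per_phase: &mut Vec<u32>, global: &mut HashMap<&'static str, Vec<u32>>) {
///     // Move this phase's buffer into the global table, leaving an empty
///     // one behind...
///     let buffer = mem::take(per_phase);
///     // ...and reclaim last frame's buffer, cleared, so that its allocation
///     // can be reused next frame.
///     if let Some(mut old) = global.insert("phase", buffer) {
///         old.clear();
///         *per_phase = old;
///     }
/// }
/// ```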
pub fn collect_buffers_for_phase<PI, GFBD>(
    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<PI, GFBD::BufferData>>,
    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<PI>>,
    mut batched_instance_buffers: ResMut<
        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
    >,
    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
) where
    PI: PhaseItem,
    GFBD: GetFullBatchData + Send + Sync + 'static,
{
    // Insert the `PhaseBatchedInstanceBuffers` into the global table. Replace
    // the contents of the per-phase resource with the old batched instance
    // buffers in order to reuse allocations.
    let untyped_phase_batched_instance_buffers =
        mem::take(&mut phase_batched_instance_buffers.buffers);
    if let Some(mut old_untyped_phase_batched_instance_buffers) = batched_instance_buffers
        .phase_instance_buffers
        .insert(TypeId::of::<PI>(), untyped_phase_batched_instance_buffers)
    {
        old_untyped_phase_batched_instance_buffers.clear();
        phase_batched_instance_buffers.buffers = old_untyped_phase_batched_instance_buffers;
    }

    // Insert the `PhaseIndirectParametersBuffers` into the global table.
    // Replace the contents of the per-phase resource with the old indirect
    // parameters buffers in order to reuse allocations.
    let untyped_phase_indirect_parameters_buffers = mem::replace(
        &mut phase_indirect_parameters_buffers.buffers,
        UntypedPhaseIndirectParametersBuffers::new(
            indirect_parameters_buffers.allow_copies_from_indirect_parameter_buffers,
        ),
    );
    if let Some(mut old_untyped_phase_indirect_parameters_buffers) = indirect_parameters_buffers
        .insert(
            TypeId::of::<PI>(),
            untyped_phase_indirect_parameters_buffers,
        )
    {
        old_untyped_phase_indirect_parameters_buffers.clear();
        phase_indirect_parameters_buffers.buffers = old_untyped_phase_indirect_parameters_buffers;
    }
}

/// A system that writes all instance buffers to the GPU.
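///
/// This covers the current and previous frames' instance input buffers, as
/// well as each phase's data buffer, work item buffers, and late indirect
/// parameters buffers.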
pub fn write_batched_instance_buffers<GFBD>(
    render_device: Res<RenderDevice>,
    render_queue: Res<RenderQueue>,
    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
) where
    GFBD: GetFullBatchData,
{
    let BatchedInstanceBuffers {
        current_input_buffer,
        previous_input_buffer,
        phase_instance_buffers,
    } = gpu_array_buffer.into_inner();

    current_input_buffer
        .buffer
        .write_buffer(&render_device, &render_queue);
    previous_input_buffer
        .buffer
        .write_buffer(&render_device, &render_queue);

    for phase_instance_buffers in phase_instance_buffers.values_mut() {
        let UntypedPhaseBatchedInstanceBuffers {
            ref mut data_buffer,
            ref mut work_item_buffers,
            ref mut late_indexed_indirect_parameters_buffer,
            ref mut late_non_indexed_indirect_parameters_buffer,
        } = *phase_instance_buffers;

        data_buffer.write_buffer(&render_device);
        late_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
        late_non_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);

        for phase_work_item_buffers in work_item_buffers.values_mut() {
            match *phase_work_item_buffers {
                PreprocessWorkItemBuffers::Direct(ref mut buffer_vec) => {
                    buffer_vec.write_buffer(&render_device, &render_queue);
                }
                PreprocessWorkItemBuffers::Indirect {
                    ref mut indexed,
                    ref mut non_indexed,
                    ref mut gpu_occlusion_culling,
                } => {
                    indexed.write_buffer(&render_device, &render_queue);
                    non_indexed.write_buffer(&render_device, &render_queue);

                    if let Some(GpuOcclusionCullingWorkItemBuffers {
                        ref mut late_indexed,
                        ref mut late_non_indexed,
                        late_indirect_parameters_indexed_offset: _,
                        late_indirect_parameters_non_indexed_offset: _,
                    }) = *gpu_occlusion_culling
                    {
                        if !late_indexed.is_empty() {
                            late_indexed.write_buffer(&render_device);
                        }
                        if !late_non_indexed.is_empty() {
                            late_non_indexed.write_buffer(&render_device);
                        }
                    }
                }
            }
        }
    }
}

/// A system that clears out the per-phase indirect parameters buffers at the
/// start of each frame.
pub fn clear_indirect_parameters_buffers(
    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
) {
    for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
        phase_indirect_parameters_buffers.clear();
    }
}

/// A system that writes each phase's indirect parameters buffers to the GPU.
pub fn write_indirect_parameters_buffers(
    render_device: Res<RenderDevice>,
    render_queue: Res<RenderQueue>,
    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
) {
    for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
        phase_indirect_parameters_buffers
            .indexed
            .data
            .write_buffer(&render_device);
        phase_indirect_parameters_buffers
            .non_indexed
            .data
            .write_buffer(&render_device);

        phase_indirect_parameters_buffers
            .indexed
            .cpu_metadata
            .write_buffer(&render_device, &render_queue);
        phase_indirect_parameters_buffers
            .non_indexed
            .cpu_metadata
            .write_buffer(&render_device, &render_queue);

        phase_indirect_parameters_buffers
            .non_indexed
            .gpu_metadata
            .write_buffer(&render_device);
        phase_indirect_parameters_buffers
            .indexed
            .gpu_metadata
            .write_buffer(&render_device);

        phase_indirect_parameters_buffers
            .indexed
            .batch_sets
            .write_buffer(&render_device, &render_queue);
        phase_indirect_parameters_buffers
            .non_indexed
            .batch_sets
            .write_buffer(&render_device, &render_queue);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn instance_buffer_correct_behavior() {
        let mut instance_buffer = InstanceInputUniformBuffer::new();

        let index = instance_buffer.add(2);
        instance_buffer.remove(index);
        // Removal frees the slot without zeroing the underlying storage:
        // `get_unchecked` still sees the stale value, while `get` reports the
        // slot as vacant.
        assert_eq!(instance_buffer.get_unchecked(index), 2);
        assert_eq!(instance_buffer.get(index), None);

        // Adding a new element reuses the freed slot, so the buffer doesn't
        // grow.
        instance_buffer.add(5);
        assert_eq!(instance_buffer.buffer().len(), 1);
    }
}