// Source: GitHub repository bevyengine/bevy
// Path: blob/main/crates/bevy_pbr/src/meshlet/meshlet_cull_shared.wgsl
#define_import_path bevy_pbr::meshlet_cull_shared

#import bevy_pbr::meshlet_bindings::{
    MeshletAabb,
    DispatchIndirectArgs,
    InstancedOffset,
    depth_pyramid,
    view,
    previous_view,
    meshlet_instance_uniforms,
}
#import bevy_render::maths::affine3_to_square

// Decides whether the simplification error of a LOD projects to less than one pixel on screen,
// in which case the LOD is considered visually indistinguishable from a more detailed one.
//
// - `lod_sphere`: bounding sphere in instance-local space (`xyz` = center, `w` = radius).
// - `simplification_error`: simplification error in instance-local units.
// - `instance_id`: index into `meshlet_instance_uniforms`, used to fetch the instance transform.
//
// Returns `true` when the projected error, scaled by `view.viewport.w` (presumably the viewport
// height in pixels - confirm against the `View` definition), is below 1.
//
// https://github.com/zeux/meshoptimizer/blob/1e48e96c7e8059321de492865165e9ef071bffba/demo/nanite.cpp#L115
fn lod_error_is_imperceptible(lod_sphere: vec4<f32>, simplification_error: f32, instance_id: u32) -> bool {
    let world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].world_from_local);
    // Conservative uniform scale factor: the longest of the three transformed basis vectors.
    let world_scale = max(length(world_from_local[0]), max(length(world_from_local[1]), length(world_from_local[2])));
    // The error is stored in local units, but it is compared against world-space sizes and
    // distances below, so it must be brought into world space for BOTH projection types.
    let world_error = simplification_error * world_scale;

    let projection = view.clip_from_view;
    if projection[3][3] == 1.0 {
        // Orthographic: the projected size does not depend on distance, only on the view height.
        let height = 2.0 / projection[1][1];
        let norm_error = world_error / height;
        return norm_error * view.viewport.w < 1.0;
    } else {
        // Perspective: the projected size shrinks with the distance to the closest point of the sphere.
        let near = projection[3][2];
        let world_sphere_center = (world_from_local * vec4<f32>(lod_sphere.xyz, 1.0)).xyz;
        let world_sphere_radius = lod_sphere.w * world_scale;
        let d = distance(world_sphere_center, view.world_position) - world_sphere_radius;
        // Clamp to the near plane so a camera inside (or touching) the sphere does not divide by
        // a zero or negative distance.
        let norm_error = world_error / max(d, near) * projection[1][1] * 0.5;
        return norm_error * view.viewport.w < 1.0;
    }
}

// Rescales a plane equation `(normal.xyz, w)` so that its normal has unit length.
// All four components are divided by the same factor, so the plane itself is unchanged.
fn normalize_plane(p: vec4<f32>) -> vec4<f32> {
    let normal_length = length(p.xyz);
    return p / normal_length;
}

// Tests an instance-local AABB against five planes extracted from the instance's
// clip-from-local matrix. Returns `false` as soon as the box lies entirely on the
// outside of any one plane, `true` otherwise.
//
// https://fgiesen.wordpress.com/2012/08/31/frustum-planes-from-the-projection-matrix/
// https://fgiesen.wordpress.com/2010/10/17/view-frustum-culling/
fn aabb_in_frustum(aabb: MeshletAabb, instance_id: u32) -> bool {
    let world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].world_from_local);
    // After the transpose, `rows[k]` holds the coefficients that produce clip-space component k.
    let rows = transpose(view.clip_from_world * world_from_local);
    let frustum_planes = array(
        rows[3] + rows[0], // w + x >= 0
        rows[3] - rows[0], // w - x >= 0
        rows[3] + rows[1], // w + y >= 0
        rows[3] - rows[1], // w - y >= 0
        rows[2],           // z >= 0
    );

    for (var plane_index = 0; plane_index < 5; plane_index++) {
        let plane = normalize_plane(frustum_planes[plane_index]);
        // Pick the box corner that lies furthest along the plane normal; if even that
        // corner is not on the inside, the whole box is outside.
        let furthest_corner = aabb.center + aabb.half_extent * sign(plane.xyz);
        let is_outside = dot(furthest_corner, plane.xyz) <= -plane.w;
        if is_outside {
            return false;
        }
    }
    return true;
}

// Screen-space bounding box, as written by `project_aabb`.
struct ScreenAabb {
    // xy: smallest texture coordinates of the box; z: smallest post-divide depth of its corners.
    min: vec3<f32>,
    // xy: largest texture coordinates of the box; z: largest post-divide depth of its corners.
    max: vec3<f32>,
}

// Component-wise minimum of eight `vec3<f32>` values.
fn min8(a: vec3<f32>, b: vec3<f32>, c: vec3<f32>, d: vec3<f32>, e: vec3<f32>, f: vec3<f32>, g: vec3<f32>, h: vec3<f32>) -> vec3<f32> {
    let ab = min(a, b);
    let cd = min(c, d);
    let ef = min(e, f);
    let gh = min(g, h);
    return min(min(ab, cd), min(ef, gh));
}

// Component-wise maximum of eight `vec3<f32>` values.
fn max8(a: vec3<f32>, b: vec3<f32>, c: vec3<f32>, d: vec3<f32>, e: vec3<f32>, f: vec3<f32>, g: vec3<f32>, h: vec3<f32>) -> vec3<f32> {
    let ab = max(a, b);
    let cd = max(c, d);
    let ef = max(e, f);
    let gh = max(g, h);
    return max(max(ab, cd), max(ef, gh));
}

// Component-wise minimum of eight `vec4<f32>` values.
fn min8_4(a: vec4<f32>, b: vec4<f32>, c: vec4<f32>, d: vec4<f32>, e: vec4<f32>, f: vec4<f32>, g: vec4<f32>, h: vec4<f32>) -> vec4<f32> {
    let ab = min(a, b);
    let cd = min(c, d);
    let ef = min(e, f);
    let gh = min(g, h);
    return min(min(ab, cd), min(ef, gh));
}

// Projects all eight corners of a local-space AABB with `clip_from_local` and writes the
// enclosing screen-space box to `out`.
//
// - `clip_from_local`: matrix taking instance-local positions to clip space.
// - `near`: threshold compared against the smallest clip-space `w` of the corners.
// - `aabb`: box given by `center` and `half_extent`.
// - `out`: on success, `min.xy` / `max.xy` are texture coordinates (Y flipped relative to NDC)
//   and `min.z` / `max.z` are the smallest / largest post-divide depth.
//
// Returns `false` (leaving `out` untouched) when any corner has `w < near`, `true` otherwise.
//
// https://zeux.io/2023/01/12/approximate-projected-bounds/
fn project_aabb(clip_from_local: mat4x4<f32>, near: f32, aabb: MeshletAabb, out: ptr<function, ScreenAabb>) -> bool {
    // Clip-space images of the three box edges. With w = 0 the translation part of the
    // matrix does not apply, so these are pure direction vectors that can be added to corners.
    let full_extent = aabb.half_extent * 2.0;
    let edge_x = clip_from_local * vec4<f32>(full_extent.x, 0.0, 0.0, 0.0);
    let edge_y = clip_from_local * vec4<f32>(0.0, full_extent.y, 0.0, 0.0);
    let edge_z = clip_from_local * vec4<f32>(0.0, 0.0, full_extent.z, 0.0);

    // Transform one corner fully, then derive the other seven by adding edges (suffix = x, y, z bit).
    let c000 = clip_from_local * vec4<f32>(aabb.center - aabb.half_extent, 1.0);
    let c001 = c000 + edge_z;
    let c010 = c000 + edge_y;
    let c011 = c010 + edge_z;
    let c100 = c000 + edge_x;
    let c101 = c100 + edge_z;
    let c110 = c100 + edge_y;
    let c111 = c110 + edge_z;

    // do not occlusion cull if we are inside the aabb
    let smallest_w = min8_4(c000, c001, c010, c011, c100, c101, c110, c111).w;
    if smallest_w < near {
        return false;
    }

    // Perspective divide of every corner.
    let n000 = c000.xyz / c000.w;
    let n001 = c001.xyz / c001.w;
    let n010 = c010.xyz / c010.w;
    let n011 = c011.xyz / c011.w;
    let n100 = c100.xyz / c100.w;
    let n101 = c101.xyz / c101.w;
    let n110 = c110.xyz / c110.w;
    let n111 = c111.xyz / c111.w;
    let ndc_min = min8(n000, n001, n010, n011, n100, n101, n110, n111);
    let ndc_max = max8(n000, n001, n010, n011, n100, n101, n110, n111);

    // convert ndc to texture coordinates by rescaling and flipping Y
    // (the flip is why max.y feeds the low corner and min.y the high corner)
    let ndc_rect = vec4<f32>(ndc_min.x, ndc_max.y, ndc_max.x, ndc_min.y);
    let uv_rect = ndc_rect * vec4<f32>(0.5, -0.5, 0.5, -0.5) + 0.5;
    (*out).min = vec3<f32>(uv_rect.xy, ndc_min.z);
    (*out).max = vec3<f32>(uv_rect.zw, ndc_max.z);
    return true;
}

// Returns the minimum depth-pyramid value over a block of up to 4x4 texels at mip `mip`.
// The block starts at `smin`; coordinates past `smax` are clamped to `smax`, so a smaller
// rectangle simply re-reads its last row / column.
fn sample_hzb(smin: vec2<u32>, smax: vec2<u32>, mip: i32) -> f32 {
    let offsets = vec4<u32>(0, 1, 2, 3);
    let columns = min(smin.x + offsets, smax.xxxx);
    let rows = min(smin.y + offsets, smax.yyyy);
    // TODO: switch to min samplers when wgpu has them
    // sampling 16 times a finer mip is worth the extra cost for better culling
    let row0 = sample_hzb_row(columns, rows.x, mip);
    let row1 = sample_hzb_row(columns, rows.y, mip);
    let row2 = sample_hzb_row(columns, rows.z, mip);
    let row3 = sample_hzb_row(columns, rows.w, mip);
    return min(min(row0, row1), min(row2, row3));
}

// Returns the minimum of four depth-pyramid texels on row `sy` at the x coordinates in `sx`,
// read from mip level `mip`.
fn sample_hzb_row(sx: vec4<u32>, sy: u32, mip: i32) -> f32 {
    let depth0 = textureLoad(depth_pyramid, vec2<u32>(sx.x, sy), mip).x;
    let depth1 = textureLoad(depth_pyramid, vec2<u32>(sx.y, sy), mip).x;
    let depth2 = textureLoad(depth_pyramid, vec2<u32>(sx.z, sy), mip).x;
    let depth3 = textureLoad(depth_pyramid, vec2<u32>(sx.w, sy), mip).x;
    return min(min(depth0, depth1), min(depth2, depth3));
}

// Compares a projected box against the depth pyramid.
//
// - `aabb`: box in texture coordinates (`xy`) and depth (`z`), as produced by `project_aabb`.
// - `screen`: size that the pyramid's base level is derived from (`ceil(screen * 0.5)`).
//
// Returns `true` when `aabb.max.z` is less than or equal to the minimum pyramid depth sampled
// over the box's footprint, i.e. when the box should be treated as occluded.
//
// TODO: We should probably be using a POT HZB texture?
fn occlusion_cull_screen_aabb(aabb: ScreenAabb, screen: vec2<f32>) -> bool {
    let hzb_size = ceil(screen * 0.5);

    // Footprint of the box in mip-0 texels, clamped to the texture.
    let first_texel = vec2<u32>(max(aabb.min.xy * hzb_size, vec2<f32>(0.0)));
    let last_texel = vec2<u32>(min(aabb.max.xy * hzb_size, hzb_size - 1.0));
    let span = last_texel - first_texel;
    let longest_span = max(span.x, span.y);

    // note: the `+ 1u` relies on unsigned wrap-around on purpose: `firstLeadingBit(0u)` is
    // 0xFFFFFFFF, and adding 1 wraps it to 0, so an empty span selects mip 0.
    // TODO: we actually sample a 4x4 block, so ideally this would be `max(..., 3u) - 3u`.
    // However, since our HZB is not a power of two, we need to be extra-conservative to not over-cull, so we go up a mip.
    var mip = max(firstLeadingBit(longest_span) + 1u, 2u) - 2u;

    // If the footprint still covers more than 4 texels on either axis at this mip,
    // the 4x4 sample block would miss part of it, so step up one more level.
    let coarse_first = first_texel >> vec2<u32>(mip);
    let coarse_last = last_texel >> vec2<u32>(mip);
    if any(coarse_last > coarse_first + 3) {
        mip += 1u;
    }

    let smin = first_texel >> vec2<u32>(mip);
    let smax = last_texel >> vec2<u32>(mip);

    let farthest_occluder_depth = sample_hzb(smin, smax, i32(mip));
    return aabb.max.z <= farthest_occluder_depth;
}

// Returns the matrix that `should_occlusion_cull_aabb` inspects to classify the projection
// and derive the near value: the current view's `clip_from_world` when `FIRST_CULLING_PASS`
// is defined, the previous view's otherwise.
//
// NOTE(review): this pass/view pairing is the opposite of `occlusion_cull_clip_from_local`,
// which uses `previous_view` when `FIRST_CULLING_PASS` is defined - confirm this is intended.
// NOTE(review): the caller reads `[3][3]` / `[3][2]` / `[2][2]` of this `clip_from_world`
// matrix, whereas `lod_error_is_imperceptible` reads them from `clip_from_view` - verify the
// two are interchangeable for those elements.
fn occlusion_cull_projection() -> mat4x4<f32> {
#ifdef FIRST_CULLING_PASS
    return view.clip_from_world;
#else
    return previous_view.clip_from_world;
#endif
}

// Builds the clip-from-local matrix used to project an instance's bounds for occlusion culling.
//
// When `FIRST_CULLING_PASS` is defined, both the view and the instance transform come from
// the previous-frame data (`previous_view`, `previous_world_from_local`); otherwise the
// current `view` and `world_from_local` are used.
// (Presumably this matches whichever frame's depth pyramid is bound in each pass - confirm.)
fn occlusion_cull_clip_from_local(instance_id: u32) -> mat4x4<f32> {
#ifdef FIRST_CULLING_PASS
    let prev_world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].previous_world_from_local);
    return previous_view.clip_from_world * prev_world_from_local;
#else
    let world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].world_from_local);
    return view.clip_from_world * world_from_local;
#endif
}

// Entry point for occlusion culling of an instance-local AABB.
//
// Projects `aabb` to the screen with the matrix for the current culling pass and tests the
// result against the depth pyramid. Returns `true` when the box should be culled, and `false`
// when it is potentially visible or could not be projected (see `project_aabb`).
fn should_occlusion_cull_aabb(aabb: MeshletAabb, instance_id: u32) -> bool {
    let projection = occlusion_cull_projection();

    // Start from the perspective form of the near value; a matrix whose [3][3] element is 1
    // is treated as orthographic and needs the extra division.
    var near = projection[3][2];
    if projection[3][3] == 1.0 {
        near = projection[3][2] / projection[2][2];
    }

    var screen_aabb = ScreenAabb(vec3<f32>(0.0), vec3<f32>(0.0));
    let clip_from_local = occlusion_cull_clip_from_local(instance_id);
    if !project_aabb(clip_from_local, near, aabb, &screen_aabb) {
        // No usable screen-space bounds: never cull.
        return false;
    }
    return occlusion_cull_screen_aabb(screen_aabb, view.viewport.zw);
}