/* Source: src/amd/vulkan/radv_acceleration_structure.c (21.2-virgl branch) */
/*1* Copyright © 2021 Bas Nieuwenhuizen2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*/22#include "radv_private.h"2324#include "util/half_float.h"25#include "nir_builder.h"26#include "radv_cs.h"27#include "radv_meta.h"2829struct radv_accel_struct_header {30uint32_t root_node_offset;31uint32_t reserved;32float aabb[2][3];33uint64_t compacted_size;34uint64_t serialization_size;35};3637struct radv_bvh_triangle_node {38float coords[3][3];39uint32_t reserved[3];40uint32_t triangle_id;41/* flags in upper 4 bits */42uint32_t geometry_id_and_flags;43uint32_t reserved2;44uint32_t id;45};4647struct radv_bvh_aabb_node {48float aabb[2][3];49uint32_t primitive_id;50/* flags in upper 4 bits */51uint32_t geometry_id_and_flags;52uint32_t reserved[8];53};5455struct radv_bvh_instance_node {56uint64_t base_ptr;57/* lower 24 bits are the custom instance index, upper 8 bits are the visibility mask */58uint32_t custom_instance_and_mask;59/* lower 24 
bits are the sbt offset, upper 8 bits are VkGeometryInstanceFlagsKHR */60uint32_t sbt_offset_and_flags;6162/* The translation component is actually a pre-translation instead of a post-translation. If you63* want to get a proper matrix out of it you need to apply the directional component of the64* matrix to it. The pre-translation of the world->object matrix is the same as the65* post-translation of the object->world matrix so this way we can share data between both66* matrices. */67float wto_matrix[12];68float aabb[2][3];69uint32_t instance_id;70uint32_t reserved[9];71};7273struct radv_bvh_box16_node {74uint32_t children[4];75uint32_t coords[4][3];76};7778struct radv_bvh_box32_node {79uint32_t children[4];80float coords[4][2][3];81uint32_t reserved[4];82};8384void85radv_GetAccelerationStructureBuildSizesKHR(86VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType,87const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,88const uint32_t *pMaxPrimitiveCounts, VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo)89{90uint64_t triangles = 0, boxes = 0, instances = 0;9192for (uint32_t i = 0; i < pBuildInfo->geometryCount; ++i) {93const VkAccelerationStructureGeometryKHR *geometry;94if (pBuildInfo->pGeometries)95geometry = &pBuildInfo->pGeometries[i];96else97geometry = pBuildInfo->ppGeometries[i];9899switch (geometry->geometryType) {100case VK_GEOMETRY_TYPE_TRIANGLES_KHR:101triangles += pMaxPrimitiveCounts[i];102break;103case VK_GEOMETRY_TYPE_AABBS_KHR:104boxes += pMaxPrimitiveCounts[i];105break;106case VK_GEOMETRY_TYPE_INSTANCES_KHR:107instances += pMaxPrimitiveCounts[i];108break;109case VK_GEOMETRY_TYPE_MAX_ENUM_KHR:110unreachable("VK_GEOMETRY_TYPE_MAX_ENUM_KHR unhandled");111}112}113114uint64_t children = boxes + instances + triangles;115uint64_t internal_nodes = 0;116while (children > 1) {117children = DIV_ROUND_UP(children, 4);118internal_nodes += children;119}120121/* The stray 128 is to ensure we have space for a header122* which we'd want to use 
for some metadata (like the123* total AABB of the BVH) */124uint64_t size = boxes * 128 + instances * 128 + triangles * 64 + internal_nodes * 128 + 192;125126pSizeInfo->accelerationStructureSize = size;127128/* 2x the max number of nodes in a BVH layer (one uint32_t each) */129pSizeInfo->updateScratchSize = pSizeInfo->buildScratchSize =130MAX2(4096, 2 * (boxes + instances + triangles) * sizeof(uint32_t));131}132133VkResult134radv_CreateAccelerationStructureKHR(VkDevice _device,135const VkAccelerationStructureCreateInfoKHR *pCreateInfo,136const VkAllocationCallbacks *pAllocator,137VkAccelerationStructureKHR *pAccelerationStructure)138{139RADV_FROM_HANDLE(radv_device, device, _device);140RADV_FROM_HANDLE(radv_buffer, buffer, pCreateInfo->buffer);141struct radv_acceleration_structure *accel;142143accel = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8,144VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);145if (accel == NULL)146return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);147148vk_object_base_init(&device->vk, &accel->base, VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR);149150accel->mem_offset = buffer->offset + pCreateInfo->offset;151accel->size = pCreateInfo->size;152accel->bo = buffer->bo;153154*pAccelerationStructure = radv_acceleration_structure_to_handle(accel);155return VK_SUCCESS;156}157158void159radv_DestroyAccelerationStructureKHR(VkDevice _device,160VkAccelerationStructureKHR accelerationStructure,161const VkAllocationCallbacks *pAllocator)162{163RADV_FROM_HANDLE(radv_device, device, _device);164RADV_FROM_HANDLE(radv_acceleration_structure, accel, accelerationStructure);165166if (!accel)167return;168169vk_object_base_finish(&accel->base);170vk_free2(&device->vk.alloc, pAllocator, accel);171}172173VkDeviceAddress174radv_GetAccelerationStructureDeviceAddressKHR(175VkDevice _device, const VkAccelerationStructureDeviceAddressInfoKHR *pInfo)176{177RADV_FROM_HANDLE(radv_acceleration_structure, accel, pInfo->accelerationStructure);178return 
radv_accel_struct_get_va(accel);179}180181VkResult182radv_WriteAccelerationStructuresPropertiesKHR(183VkDevice _device, uint32_t accelerationStructureCount,184const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType,185size_t dataSize, void *pData, size_t stride)186{187RADV_FROM_HANDLE(radv_device, device, _device);188char *data_out = (char*)pData;189190for (uint32_t i = 0; i < accelerationStructureCount; ++i) {191RADV_FROM_HANDLE(radv_acceleration_structure, accel, pAccelerationStructures[i]);192const char *base_ptr = (const char *)device->ws->buffer_map(accel->bo);193if (!base_ptr)194return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);195196const struct radv_accel_struct_header *header = (const void*)(base_ptr + accel->mem_offset);197if (stride * i + sizeof(VkDeviceSize) <= dataSize) {198uint64_t value;199switch (queryType) {200case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:201value = header->compacted_size;202break;203case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:204value = header->serialization_size;205break;206default:207unreachable("Unhandled acceleration structure query");208}209*(VkDeviceSize *)(data_out + stride * i) = value;210}211device->ws->buffer_unmap(accel->bo);212}213return VK_SUCCESS;214}215216struct radv_bvh_build_ctx {217uint32_t *write_scratch;218char *base;219char *curr_ptr;220};221222static void223build_triangles(struct radv_bvh_build_ctx *ctx, const VkAccelerationStructureGeometryKHR *geom,224const VkAccelerationStructureBuildRangeInfoKHR *range, unsigned geometry_id)225{226const VkAccelerationStructureGeometryTrianglesDataKHR *tri_data = &geom->geometry.triangles;227VkTransformMatrixKHR matrix;228const char *index_data = (const char *)tri_data->indexData.hostAddress + range->primitiveOffset;229230if (tri_data->transformData.hostAddress) {231matrix = *(const VkTransformMatrixKHR *)((const char *)tri_data->transformData.hostAddress +232range->transformOffset);233} else 
{234matrix = (VkTransformMatrixKHR){235.matrix = {{1.0, 0.0, 0.0, 0.0}, {0.0, 1.0, 0.0, 0.0}, {0.0, 0.0, 1.0, 0.0}}};236}237238for (uint32_t p = 0; p < range->primitiveCount; ++p, ctx->curr_ptr += 64) {239struct radv_bvh_triangle_node *node = (void*)ctx->curr_ptr;240uint32_t node_offset = ctx->curr_ptr - ctx->base;241uint32_t node_id = node_offset >> 3;242*ctx->write_scratch++ = node_id;243244for (unsigned v = 0; v < 3; ++v) {245uint32_t v_index = range->firstVertex;246switch (tri_data->indexType) {247case VK_INDEX_TYPE_NONE_KHR:248v_index += p * 3 + v;249break;250case VK_INDEX_TYPE_UINT8_EXT:251v_index += *(const uint8_t *)index_data;252index_data += 1;253break;254case VK_INDEX_TYPE_UINT16:255v_index += *(const uint16_t *)index_data;256index_data += 2;257break;258case VK_INDEX_TYPE_UINT32:259v_index += *(const uint32_t *)index_data;260index_data += 4;261break;262case VK_INDEX_TYPE_MAX_ENUM:263unreachable("Unhandled VK_INDEX_TYPE_MAX_ENUM");264break;265}266267const char *v_data = (const char *)tri_data->vertexData.hostAddress + v_index * tri_data->vertexStride;268float coords[4];269switch (tri_data->vertexFormat) {270case VK_FORMAT_R32G32B32_SFLOAT:271coords[0] = *(const float *)(v_data + 0);272coords[1] = *(const float *)(v_data + 4);273coords[2] = *(const float *)(v_data + 8);274coords[3] = 1.0f;275break;276case VK_FORMAT_R32G32B32A32_SFLOAT:277coords[0] = *(const float *)(v_data + 0);278coords[1] = *(const float *)(v_data + 4);279coords[2] = *(const float *)(v_data + 8);280coords[3] = *(const float *)(v_data + 12);281break;282case VK_FORMAT_R16G16B16_SFLOAT:283coords[0] = _mesa_half_to_float(*(const uint16_t *)(v_data + 0));284coords[1] = _mesa_half_to_float(*(const uint16_t *)(v_data + 2));285coords[2] = _mesa_half_to_float(*(const uint16_t *)(v_data + 4));286coords[3] = 1.0f;287break;288case VK_FORMAT_R16G16B16A16_SFLOAT:289coords[0] = _mesa_half_to_float(*(const uint16_t *)(v_data + 0));290coords[1] = _mesa_half_to_float(*(const uint16_t *)(v_data + 
2));291coords[2] = _mesa_half_to_float(*(const uint16_t *)(v_data + 4));292coords[3] = _mesa_half_to_float(*(const uint16_t *)(v_data + 6));293break;294default:295unreachable("Unhandled vertex format in BVH build");296}297298for (unsigned j = 0; j < 3; ++j) {299float r = 0;300for (unsigned k = 0; k < 4; ++k)301r += matrix.matrix[j][k] * coords[k];302node->coords[v][j] = r;303}304305node->triangle_id = p;306node->geometry_id_and_flags = geometry_id | (geom->flags << 28);307308/* Seems to be needed for IJ, otherwise I = J = ? */309node->id = 9;310}311}312}313314static VkResult315build_instances(struct radv_device *device, struct radv_bvh_build_ctx *ctx,316const VkAccelerationStructureGeometryKHR *geom,317const VkAccelerationStructureBuildRangeInfoKHR *range)318{319const VkAccelerationStructureGeometryInstancesDataKHR *inst_data = &geom->geometry.instances;320321for (uint32_t p = 0; p < range->primitiveCount; ++p, ctx->curr_ptr += 128) {322const VkAccelerationStructureInstanceKHR *instance =323inst_data->arrayOfPointers324? 
(((const VkAccelerationStructureInstanceKHR *const *)inst_data->data.hostAddress)[p])325: &((const VkAccelerationStructureInstanceKHR *)inst_data->data.hostAddress)[p];326if (!instance->accelerationStructureReference) {327continue;328}329330struct radv_bvh_instance_node *node = (void*)ctx->curr_ptr;331uint32_t node_offset = ctx->curr_ptr - ctx->base;332uint32_t node_id = (node_offset >> 3) | 6;333*ctx->write_scratch++ = node_id;334335float transform[16], inv_transform[16];336memcpy(transform, &instance->transform.matrix, sizeof(instance->transform.matrix));337transform[12] = transform[13] = transform[14] = 0.0f;338transform[15] = 1.0f;339340util_invert_mat4x4(inv_transform, transform);341memcpy(node->wto_matrix, inv_transform, sizeof(node->wto_matrix));342node->wto_matrix[3] = transform[3];343node->wto_matrix[7] = transform[7];344node->wto_matrix[11] = transform[11];345node->custom_instance_and_mask = instance->instanceCustomIndex | (instance->mask << 24);346node->sbt_offset_and_flags =347instance->instanceShaderBindingTableRecordOffset | (instance->flags << 24);348node->instance_id = p;349350RADV_FROM_HANDLE(radv_acceleration_structure, src_accel_struct,351(VkAccelerationStructureKHR)instance->accelerationStructureReference);352const void *src_base = device->ws->buffer_map(src_accel_struct->bo);353if (!src_base)354return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);355356src_base = (const char *)src_base + src_accel_struct->mem_offset;357const struct radv_accel_struct_header *src_header = src_base;358node->base_ptr = radv_accel_struct_get_va(src_accel_struct) | src_header->root_node_offset;359360for (unsigned j = 0; j < 3; ++j) {361node->aabb[0][j] = instance->transform.matrix[j][3];362node->aabb[1][j] = instance->transform.matrix[j][3];363for (unsigned k = 0; k < 3; ++k) {364node->aabb[0][j] += MIN2(instance->transform.matrix[j][k] * src_header->aabb[0][k],365instance->transform.matrix[j][k] * src_header->aabb[1][k]);366node->aabb[1][j] += 
MAX2(instance->transform.matrix[j][k] * src_header->aabb[0][k],367instance->transform.matrix[j][k] * src_header->aabb[1][k]);368}369}370device->ws->buffer_unmap(src_accel_struct->bo);371}372return VK_SUCCESS;373}374375static void376build_aabbs(struct radv_bvh_build_ctx *ctx, const VkAccelerationStructureGeometryKHR *geom,377const VkAccelerationStructureBuildRangeInfoKHR *range, unsigned geometry_id)378{379const VkAccelerationStructureGeometryAabbsDataKHR *aabb_data = &geom->geometry.aabbs;380381for (uint32_t p = 0; p < range->primitiveCount; ++p, ctx->curr_ptr += 64) {382struct radv_bvh_aabb_node *node = (void*)ctx->curr_ptr;383uint32_t node_offset = ctx->curr_ptr - ctx->base;384uint32_t node_id = (node_offset >> 3) | 6;385*ctx->write_scratch++ = node_id;386387const VkAabbPositionsKHR *aabb =388(const VkAabbPositionsKHR *)((const char *)aabb_data->data.hostAddress +389p * aabb_data->stride);390391node->aabb[0][0] = aabb->minX;392node->aabb[0][1] = aabb->minY;393node->aabb[0][2] = aabb->minZ;394node->aabb[1][0] = aabb->maxX;395node->aabb[1][1] = aabb->maxY;396node->aabb[1][2] = aabb->maxZ;397node->primitive_id = p;398node->geometry_id_and_flags = geometry_id;399}400}401402static uint32_t403leaf_node_count(const VkAccelerationStructureBuildGeometryInfoKHR *info,404const VkAccelerationStructureBuildRangeInfoKHR *ranges)405{406uint32_t count = 0;407for (uint32_t i = 0; i < info->geometryCount; ++i) {408count += ranges[i].primitiveCount;409}410return count;411}412413static void414compute_bounds(const char *base_ptr, uint32_t node_id, float *bounds)415{416for (unsigned i = 0; i < 3; ++i)417bounds[i] = INFINITY;418for (unsigned i = 0; i < 3; ++i)419bounds[3 + i] = -INFINITY;420421switch (node_id & 7) {422case 0: {423const struct radv_bvh_triangle_node *node = (const void*)(base_ptr + (node_id / 8 * 64));424for (unsigned v = 0; v < 3; ++v) {425for (unsigned j = 0; j < 3; ++j) {426bounds[j] = MIN2(bounds[j], node->coords[v][j]);427bounds[3 + j] = MAX2(bounds[3 + j], 
node->coords[v][j]);428}429}430break;431}432case 5: {433const struct radv_bvh_box32_node *node = (const void*)(base_ptr + (node_id / 8 * 64));434for (unsigned c2 = 0; c2 < 4; ++c2) {435if (isnan(node->coords[c2][0][0]))436continue;437for (unsigned j = 0; j < 3; ++j) {438bounds[j] = MIN2(bounds[j], node->coords[c2][0][j]);439bounds[3 + j] = MAX2(bounds[3 + j], node->coords[c2][1][j]);440}441}442break;443}444case 6: {445const struct radv_bvh_instance_node *node = (const void*)(base_ptr + (node_id / 8 * 64));446for (unsigned j = 0; j < 3; ++j) {447bounds[j] = MIN2(bounds[j], node->aabb[0][j]);448bounds[3 + j] = MAX2(bounds[3 + j], node->aabb[1][j]);449}450break;451}452case 7: {453const struct radv_bvh_aabb_node *node = (const void*)(base_ptr + (node_id / 8 * 64));454for (unsigned j = 0; j < 3; ++j) {455bounds[j] = MIN2(bounds[j], node->aabb[0][j]);456bounds[3 + j] = MAX2(bounds[3 + j], node->aabb[1][j]);457}458break;459}460}461}462463static VkResult464build_bvh(struct radv_device *device, const VkAccelerationStructureBuildGeometryInfoKHR *info,465const VkAccelerationStructureBuildRangeInfoKHR *ranges)466{467RADV_FROM_HANDLE(radv_acceleration_structure, accel, info->dstAccelerationStructure);468VkResult result = VK_SUCCESS;469470uint32_t *scratch[2];471scratch[0] = info->scratchData.hostAddress;472scratch[1] = scratch[0] + leaf_node_count(info, ranges);473474char *base_ptr = (char*)device->ws->buffer_map(accel->bo);475if (!base_ptr)476return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);477478base_ptr = base_ptr + accel->mem_offset;479struct radv_accel_struct_header *header = (void*)base_ptr;480void *first_node_ptr = (char *)base_ptr + ALIGN(sizeof(*header), 64);481482struct radv_bvh_build_ctx ctx = {.write_scratch = scratch[0],483.base = base_ptr,484.curr_ptr = (char *)first_node_ptr + 128};485486/* This initializes the leaf nodes of the BVH all at the same level. 
*/487for (uint32_t i = 0; i < info->geometryCount; ++i) {488const VkAccelerationStructureGeometryKHR *geom =489info->pGeometries ? &info->pGeometries[i] : info->ppGeometries[i];490491switch (geom->geometryType) {492case VK_GEOMETRY_TYPE_TRIANGLES_KHR:493build_triangles(&ctx, geom, ranges + i, i);494break;495case VK_GEOMETRY_TYPE_AABBS_KHR:496build_aabbs(&ctx, geom, ranges + i, i);497break;498case VK_GEOMETRY_TYPE_INSTANCES_KHR: {499result = build_instances(device, &ctx, geom, ranges + i);500if (result != VK_SUCCESS)501goto fail;502break;503}504case VK_GEOMETRY_TYPE_MAX_ENUM_KHR:505unreachable("VK_GEOMETRY_TYPE_MAX_ENUM_KHR unhandled");506}507}508509uint32_t node_counts[2] = {ctx.write_scratch - scratch[0], 0};510unsigned d;511512/*513* This is the most naive BVH building algorithm I could think of:514* just iteratively builds each level from bottom to top with515* the children of each node being in-order and tightly packed.516*517* Is probably terrible for traversal but should be easy to build an518* equivalent GPU version.519*/520for (d = 0; node_counts[d & 1] > 1 || d == 0; ++d) {521uint32_t child_count = node_counts[d & 1];522const uint32_t *children = scratch[d & 1];523uint32_t *dst_ids = scratch[(d & 1) ^ 1];524unsigned dst_count;525unsigned child_idx = 0;526for (dst_count = 0; child_idx < MAX2(1, child_count); ++dst_count, child_idx += 4) {527unsigned local_child_count = MIN2(4, child_count - child_idx);528uint32_t child_ids[4];529float bounds[4][6];530531for (unsigned c = 0; c < local_child_count; ++c) {532uint32_t id = children[child_idx + c];533child_ids[c] = id;534535compute_bounds(base_ptr, id, bounds[c]);536}537538struct radv_bvh_box32_node *node;539540/* Put the root node at base_ptr so the id = 0, which allows some541* traversal optimizations. 
*/542if (child_idx == 0 && local_child_count == child_count) {543node = first_node_ptr;544header->root_node_offset = ((char *)first_node_ptr - (char *)base_ptr) / 64 * 8 + 5;545} else {546uint32_t dst_id = (ctx.curr_ptr - base_ptr) / 64;547dst_ids[dst_count] = dst_id * 8 + 5;548549node = (void*)ctx.curr_ptr;550ctx.curr_ptr += 128;551}552553for (unsigned c = 0; c < local_child_count; ++c) {554node->children[c] = child_ids[c];555for (unsigned i = 0; i < 2; ++i)556for (unsigned j = 0; j < 3; ++j)557node->coords[c][i][j] = bounds[c][i * 3 + j];558}559for (unsigned c = local_child_count; c < 4; ++c) {560for (unsigned i = 0; i < 2; ++i)561for (unsigned j = 0; j < 3; ++j)562node->coords[c][i][j] = NAN;563}564}565566node_counts[(d & 1) ^ 1] = dst_count;567}568569compute_bounds(base_ptr, header->root_node_offset, &header->aabb[0][0]);570571/* TODO init sizes and figure out what is needed for serialization. */572573fail:574device->ws->buffer_unmap(accel->bo);575return result;576}577578VkResult579radv_BuildAccelerationStructuresKHR(580VkDevice _device, VkDeferredOperationKHR deferredOperation, uint32_t infoCount,581const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,582const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)583{584RADV_FROM_HANDLE(radv_device, device, _device);585VkResult result = VK_SUCCESS;586587for (uint32_t i = 0; i < infoCount; ++i) {588result = build_bvh(device, pInfos + i, ppBuildRangeInfos[i]);589if (result != VK_SUCCESS)590break;591}592return result;593}594595static nir_ssa_def *596get_indices(nir_builder *b, nir_ssa_def *addr, nir_ssa_def *type, nir_ssa_def *id)597{598const struct glsl_type *uvec3_type = glsl_vector_type(GLSL_TYPE_UINT, 3);599nir_variable *result =600nir_variable_create(b->shader, nir_var_shader_temp, uvec3_type, "indices");601602nir_push_if(b, nir_ult(b, type, nir_imm_int(b, 2)));603nir_push_if(b, nir_ieq(b, type, nir_imm_int(b, VK_INDEX_TYPE_UINT16)));604{605nir_ssa_def *index_id = nir_umul24(b, id, 
nir_imm_int(b, 6));606nir_ssa_def *indices[3];607for (unsigned i = 0; i < 3; ++i) {608indices[i] = nir_build_load_global(609b, 1, 16, nir_iadd(b, addr, nir_u2u64(b, nir_iadd(b, index_id, nir_imm_int(b, 2 * i)))),610.align_mul = 2, .align_offset = 0);611}612nir_store_var(b, result, nir_u2u32(b, nir_vec(b, indices, 3)), 7);613}614nir_push_else(b, NULL);615{616nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 12));617nir_ssa_def *indices = nir_build_load_global(618b, 3, 32, nir_iadd(b, addr, nir_u2u64(b, index_id)), .align_mul = 4, .align_offset = 0);619nir_store_var(b, result, indices, 7);620}621nir_pop_if(b, NULL);622nir_push_else(b, NULL);623{624nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 3));625nir_ssa_def *indices[] = {626index_id,627nir_iadd(b, index_id, nir_imm_int(b, 1)),628nir_iadd(b, index_id, nir_imm_int(b, 2)),629};630631nir_push_if(b, nir_ieq(b, type, nir_imm_int(b, VK_INDEX_TYPE_NONE_KHR)));632{633nir_store_var(b, result, nir_vec(b, indices, 3), 7);634}635nir_push_else(b, NULL);636{637for (unsigned i = 0; i < 3; ++i) {638indices[i] = nir_build_load_global(b, 1, 8, nir_iadd(b, addr, nir_u2u64(b, indices[i])),639.align_mul = 1, .align_offset = 0);640}641nir_store_var(b, result, nir_u2u32(b, nir_vec(b, indices, 3)), 7);642}643nir_pop_if(b, NULL);644}645nir_pop_if(b, NULL);646return nir_load_var(b, result);647}648649static void650get_vertices(nir_builder *b, nir_ssa_def *addresses, nir_ssa_def *format, nir_ssa_def *positions[3])651{652const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);653nir_variable *results[3] = {654nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex0"),655nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex1"),656nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex2")};657658VkFormat formats[] = {659VK_FORMAT_R32G32B32_SFLOAT,660VK_FORMAT_R32G32B32A32_SFLOAT,661VK_FORMAT_R16G16B16_SFLOAT,662VK_FORMAT_R16G16B16A16_SFLOAT,663};664665for 
(unsigned f = 0; f < ARRAY_SIZE(formats); ++f) {666if (f + 1 < ARRAY_SIZE(formats))667nir_push_if(b, nir_ieq(b, format, nir_imm_int(b, formats[f])));668669for (unsigned i = 0; i < 3; ++i) {670switch (formats[f]) {671case VK_FORMAT_R32G32B32_SFLOAT:672case VK_FORMAT_R32G32B32A32_SFLOAT:673nir_store_var(b, results[i],674nir_build_load_global(b, 3, 32, nir_channel(b, addresses, i),675.align_mul = 4, .align_offset = 0),6767);677break;678case VK_FORMAT_R16G16B16_SFLOAT:679case VK_FORMAT_R16G16B16A16_SFLOAT: {680nir_ssa_def *values[3];681nir_ssa_def *addr = nir_channel(b, addresses, i);682for (unsigned j = 0; j < 3; ++j)683values[j] =684nir_build_load_global(b, 1, 16, nir_iadd(b, addr, nir_imm_int64(b, j * 2)),685.align_mul = 2, .align_offset = 0);686nir_store_var(b, results[i], nir_f2f32(b, nir_vec(b, values, 3)), 7);687break;688}689default:690unreachable("Unhandled format");691}692}693if (f + 1 < ARRAY_SIZE(formats))694nir_push_else(b, NULL);695}696for (unsigned f = 1; f < ARRAY_SIZE(formats); ++f) {697nir_pop_if(b, NULL);698}699700for (unsigned i = 0; i < 3; ++i)701positions[i] = nir_load_var(b, results[i]);702}703704struct build_primitive_constants {705uint64_t node_dst_addr;706uint64_t scratch_addr;707uint32_t dst_offset;708uint32_t dst_scratch_offset;709uint32_t geometry_type;710uint32_t geometry_id;711712union {713struct {714uint64_t vertex_addr;715uint64_t index_addr;716uint64_t transform_addr;717uint32_t vertex_stride;718uint32_t vertex_format;719uint32_t index_format;720};721struct {722uint64_t instance_data;723};724struct {725uint64_t aabb_addr;726uint32_t aabb_stride;727};728};729};730731struct build_internal_constants {732uint64_t node_dst_addr;733uint64_t scratch_addr;734uint32_t dst_offset;735uint32_t dst_scratch_offset;736uint32_t src_scratch_offset;737uint32_t fill_header;738};739740/* This inverts a 3x3 matrix using cofactors, as in e.g.741* https://www.mathsisfun.com/algebra/matrix-inverse-minors-cofactors-adjugate.html */742static 
void743nir_invert_3x3(nir_builder *b, nir_ssa_def *in[3][3], nir_ssa_def *out[3][3])744{745nir_ssa_def *cofactors[3][3];746for (unsigned i = 0; i < 3; ++i) {747for (unsigned j = 0; j < 3; ++j) {748cofactors[i][j] =749nir_fsub(b, nir_fmul(b, in[(i + 1) % 3][(j + 1) % 3], in[(i + 2) % 3][(j + 2) % 3]),750nir_fmul(b, in[(i + 1) % 3][(j + 2) % 3], in[(i + 2) % 3][(j + 1) % 3]));751}752}753754nir_ssa_def *det = NULL;755for (unsigned i = 0; i < 3; ++i) {756nir_ssa_def *det_part = nir_fmul(b, in[0][i], cofactors[0][i]);757det = det ? nir_fadd(b, det, det_part) : det_part;758}759760nir_ssa_def *det_inv = nir_frcp(b, det);761for (unsigned i = 0; i < 3; ++i) {762for (unsigned j = 0; j < 3; ++j) {763out[i][j] = nir_fmul(b, cofactors[j][i], det_inv);764}765}766}767768static nir_shader *769build_leaf_shader(struct radv_device *dev)770{771const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);772nir_builder b =773nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_build_leaf_shader");774775b.shader->info.workgroup_size[0] = 64;776b.shader->info.workgroup_size[1] = 1;777b.shader->info.workgroup_size[2] = 1;778779nir_ssa_def *pconst0 =780nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);781nir_ssa_def *pconst1 =782nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);783nir_ssa_def *pconst2 =784nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 32, .range = 16);785nir_ssa_def *pconst3 =786nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 48, .range = 16);787nir_ssa_def *pconst4 =788nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 64, .range = 4);789790nir_ssa_def *geom_type = nir_channel(&b, pconst1, 2);791nir_ssa_def *node_dst_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3));792nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 12));793nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);794nir_ssa_def *scratch_offset = 
nir_channel(&b, pconst1, 1);795nir_ssa_def *geometry_id = nir_channel(&b, pconst1, 3);796797nir_ssa_def *global_id =798nir_iadd(&b,799nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1),800nir_imm_int(&b, b.shader->info.workgroup_size[0])),801nir_channels(&b, nir_load_local_invocation_id(&b), 1));802scratch_addr = nir_iadd(803&b, scratch_addr,804nir_u2u64(&b, nir_iadd(&b, scratch_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 4)))));805806nir_push_if(&b, nir_ieq(&b, geom_type, nir_imm_int(&b, VK_GEOMETRY_TYPE_TRIANGLES_KHR)));807{ /* Triangles */808nir_ssa_def *vertex_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3));809nir_ssa_def *index_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 12));810nir_ssa_def *transform_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst3, 3));811nir_ssa_def *vertex_stride = nir_channel(&b, pconst3, 2);812nir_ssa_def *vertex_format = nir_channel(&b, pconst3, 3);813nir_ssa_def *index_format = nir_channel(&b, pconst4, 0);814unsigned repl_swizzle[4] = {0, 0, 0, 0};815816nir_ssa_def *node_offset =817nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 64)));818nir_ssa_def *triangle_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));819820nir_ssa_def *indices = get_indices(&b, index_addr, index_format, global_id);821nir_ssa_def *vertex_addresses = nir_iadd(822&b, nir_u2u64(&b, nir_imul(&b, indices, nir_swizzle(&b, vertex_stride, repl_swizzle, 3))),823nir_swizzle(&b, vertex_addr, repl_swizzle, 3));824nir_ssa_def *positions[3];825get_vertices(&b, vertex_addresses, vertex_format, positions);826827nir_ssa_def *node_data[16];828memset(node_data, 0, sizeof(node_data));829830nir_variable *transform[] = {831nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform0"),832nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform1"),833nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), 
"transform2"),834};835nir_store_var(&b, transform[0], nir_imm_vec4(&b, 1.0, 0.0, 0.0, 0.0), 0xf);836nir_store_var(&b, transform[1], nir_imm_vec4(&b, 0.0, 1.0, 0.0, 0.0), 0xf);837nir_store_var(&b, transform[2], nir_imm_vec4(&b, 0.0, 0.0, 1.0, 0.0), 0xf);838839nir_push_if(&b, nir_ine(&b, transform_addr, nir_imm_int64(&b, 0)));840nir_store_var(841&b, transform[0],842nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 0)),843.align_mul = 4, .align_offset = 0),8440xf);845nir_store_var(846&b, transform[1],847nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 16)),848.align_mul = 4, .align_offset = 0),8490xf);850nir_store_var(851&b, transform[2],852nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 32)),853.align_mul = 4, .align_offset = 0),8540xf);855nir_pop_if(&b, NULL);856857for (unsigned i = 0; i < 3; ++i)858for (unsigned j = 0; j < 3; ++j)859node_data[i * 3 + j] = nir_fdph(&b, positions[i], nir_load_var(&b, transform[j]));860861node_data[12] = global_id;862node_data[13] = geometry_id;863node_data[15] = nir_imm_int(&b, 9);864for (unsigned i = 0; i < ARRAY_SIZE(node_data); ++i)865if (!node_data[i])866node_data[i] = nir_imm_int(&b, 0);867868for (unsigned i = 0; i < 4; ++i) {869nir_build_store_global(&b, nir_vec(&b, node_data + i * 4, 4),870nir_iadd(&b, triangle_node_dst_addr, nir_imm_int64(&b, i * 16)),871.write_mask = 15, .align_mul = 16, .align_offset = 0);872}873874nir_ssa_def *node_id = nir_ushr(&b, node_offset, nir_imm_int(&b, 3));875nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,876.align_offset = 0);877}878nir_push_else(&b, NULL);879nir_push_if(&b, nir_ieq(&b, geom_type, nir_imm_int(&b, VK_GEOMETRY_TYPE_AABBS_KHR)));880{ /* AABBs */881nir_ssa_def *aabb_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3));882nir_ssa_def *aabb_stride = nir_channel(&b, pconst2, 2);883884nir_ssa_def *node_offset =885nir_iadd(&b, node_dst_offset, nir_umul24(&b, 
global_id, nir_imm_int(&b, 64)));886nir_ssa_def *aabb_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));887nir_ssa_def *node_id =888nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 7));889nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,890.align_offset = 0);891892aabb_addr = nir_iadd(&b, aabb_addr, nir_u2u64(&b, nir_imul(&b, aabb_stride, global_id)));893894nir_ssa_def *min_bound =895nir_build_load_global(&b, 3, 32, nir_iadd(&b, aabb_addr, nir_imm_int64(&b, 0)),896.align_mul = 4, .align_offset = 0);897nir_ssa_def *max_bound =898nir_build_load_global(&b, 3, 32, nir_iadd(&b, aabb_addr, nir_imm_int64(&b, 12)),899.align_mul = 4, .align_offset = 0);900901nir_ssa_def *values[] = {nir_channel(&b, min_bound, 0),902nir_channel(&b, min_bound, 1),903nir_channel(&b, min_bound, 2),904nir_channel(&b, max_bound, 0),905nir_channel(&b, max_bound, 1),906nir_channel(&b, max_bound, 2),907global_id,908geometry_id};909910nir_build_store_global(&b, nir_vec(&b, values + 0, 4),911nir_iadd(&b, aabb_node_dst_addr, nir_imm_int64(&b, 0)),912.write_mask = 15, .align_mul = 16, .align_offset = 0);913nir_build_store_global(&b, nir_vec(&b, values + 4, 4),914nir_iadd(&b, aabb_node_dst_addr, nir_imm_int64(&b, 16)),915.write_mask = 15, .align_mul = 16, .align_offset = 0);916}917nir_push_else(&b, NULL);918{ /* Instances */919920nir_ssa_def *instance_addr =921nir_iadd(&b, nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3)),922nir_u2u64(&b, nir_imul(&b, global_id, nir_imm_int(&b, 64))));923nir_ssa_def *inst_transform[] = {924nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 0)),925.align_mul = 4, .align_offset = 0),926nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 16)),927.align_mul = 4, .align_offset = 0),928nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 32)),929.align_mul = 4, .align_offset = 0)};930nir_ssa_def *inst3 
=931nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 48)),932.align_mul = 4, .align_offset = 0);933934nir_ssa_def *node_offset =935nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 128)));936node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));937nir_ssa_def *node_id =938nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 6));939nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,940.align_offset = 0);941942nir_variable *bounds[2] = {943nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),944nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),945};946947nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);948nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);949950nir_ssa_def *header_addr = nir_pack_64_2x32(&b, nir_channels(&b, inst3, 12));951nir_push_if(&b, nir_ine(&b, header_addr, nir_imm_int64(&b, 0)));952nir_ssa_def *header_root_offset =953nir_build_load_global(&b, 1, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 0)),954.align_mul = 4, .align_offset = 0);955nir_ssa_def *header_min =956nir_build_load_global(&b, 3, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 8)),957.align_mul = 4, .align_offset = 0);958nir_ssa_def *header_max =959nir_build_load_global(&b, 3, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 20)),960.align_mul = 4, .align_offset = 0);961962nir_ssa_def *bound_defs[2][3];963for (unsigned i = 0; i < 3; ++i) {964bound_defs[0][i] = bound_defs[1][i] = nir_channel(&b, inst_transform[i], 3);965966nir_ssa_def *mul_a = nir_fmul(&b, nir_channels(&b, inst_transform[i], 7), header_min);967nir_ssa_def *mul_b = nir_fmul(&b, nir_channels(&b, inst_transform[i], 7), header_max);968nir_ssa_def *mi = nir_fmin(&b, mul_a, mul_b);969nir_ssa_def *ma = nir_fmax(&b, mul_a, mul_b);970for (unsigned j = 0; j < 3; ++j) 
{971bound_defs[0][i] = nir_fadd(&b, bound_defs[0][i], nir_channel(&b, mi, j));972bound_defs[1][i] = nir_fadd(&b, bound_defs[1][i], nir_channel(&b, ma, j));973}974}975976nir_store_var(&b, bounds[0], nir_vec(&b, bound_defs[0], 3), 7);977nir_store_var(&b, bounds[1], nir_vec(&b, bound_defs[1], 3), 7);978979nir_ssa_def *m_in[3][3], *m_out[3][3], *m_vec[3][4];980for (unsigned i = 0; i < 3; ++i)981for (unsigned j = 0; j < 3; ++j)982m_in[i][j] = nir_channel(&b, inst_transform[i], j);983nir_invert_3x3(&b, m_in, m_out);984for (unsigned i = 0; i < 3; ++i) {985for (unsigned j = 0; j < 3; ++j)986m_vec[i][j] = m_out[i][j];987m_vec[i][3] = nir_channel(&b, inst_transform[i], 3);988}989990for (unsigned i = 0; i < 3; ++i) {991nir_build_store_global(&b, nir_vec(&b, m_vec[i], 4),992nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 16 + 16 * i)),993.write_mask = 0xf, .align_mul = 4, .align_offset = 0);994}995996nir_ssa_def *out0[4] = {997nir_ior(&b, nir_channel(&b, nir_unpack_64_2x32(&b, header_addr), 0), header_root_offset),998nir_channel(&b, nir_unpack_64_2x32(&b, header_addr), 1), nir_channel(&b, inst3, 0),999nir_channel(&b, inst3, 1)};1000nir_build_store_global(&b, nir_vec(&b, out0, 4),1001nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 0)), .write_mask = 0xf,1002.align_mul = 4, .align_offset = 0);1003nir_build_store_global(&b, global_id, nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 88)),1004.write_mask = 0x1, .align_mul = 4, .align_offset = 0);1005nir_pop_if(&b, NULL);1006nir_build_store_global(&b, nir_load_var(&b, bounds[0]),1007nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 64)), .write_mask = 0x7,1008.align_mul = 4, .align_offset = 0);1009nir_build_store_global(&b, nir_load_var(&b, bounds[1]),1010nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 76)), .write_mask = 0x7,1011.align_mul = 4, .align_offset = 0);1012}1013nir_pop_if(&b, NULL);1014nir_pop_if(&b, NULL);10151016return b.shader;1017}10181019static void1020determine_bounds(nir_builder *b, nir_ssa_def *node_addr, nir_ssa_def 
*node_id,1021nir_variable *bounds_vars[2])1022{1023nir_ssa_def *node_type = nir_iand(b, node_id, nir_imm_int(b, 7));1024node_addr = nir_iadd(1025b, node_addr,1026nir_u2u64(b, nir_ishl(b, nir_iand(b, node_id, nir_imm_int(b, ~7u)), nir_imm_int(b, 3))));10271028nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 0)));1029{1030nir_ssa_def *positions[3];1031for (unsigned i = 0; i < 3; ++i)1032positions[i] =1033nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, i * 12)),1034.align_mul = 4, .align_offset = 0);1035nir_ssa_def *bounds[] = {positions[0], positions[0]};1036for (unsigned i = 1; i < 3; ++i) {1037bounds[0] = nir_fmin(b, bounds[0], positions[i]);1038bounds[1] = nir_fmax(b, bounds[1], positions[i]);1039}1040nir_store_var(b, bounds_vars[0], bounds[0], 7);1041nir_store_var(b, bounds_vars[1], bounds[1], 7);1042}1043nir_push_else(b, NULL);1044nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 5)));1045{1046nir_ssa_def *input_bounds[4][2];1047for (unsigned i = 0; i < 4; ++i)1048for (unsigned j = 0; j < 2; ++j)1049input_bounds[i][j] = nir_build_load_global(1050b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 16 + i * 24 + j * 12)),1051.align_mul = 4, .align_offset = 0);1052nir_ssa_def *bounds[] = {input_bounds[0][0], input_bounds[0][1]};1053for (unsigned i = 1; i < 4; ++i) {1054bounds[0] = nir_fmin(b, bounds[0], input_bounds[i][0]);1055bounds[1] = nir_fmax(b, bounds[1], input_bounds[i][1]);1056}10571058nir_store_var(b, bounds_vars[0], bounds[0], 7);1059nir_store_var(b, bounds_vars[1], bounds[1], 7);1060}1061nir_push_else(b, NULL);1062nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 6)));1063{ /* Instances */1064nir_ssa_def *bounds[2];1065for (unsigned i = 0; i < 2; ++i)1066bounds[i] =1067nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 64 + i * 12)),1068.align_mul = 4, .align_offset = 0);1069nir_store_var(b, bounds_vars[0], bounds[0], 7);1070nir_store_var(b, bounds_vars[1], bounds[1], 7);1071}1072nir_push_else(b, 
NULL);1073{ /* AABBs */1074nir_ssa_def *bounds[2];1075for (unsigned i = 0; i < 2; ++i)1076bounds[i] =1077nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, i * 12)),1078.align_mul = 4, .align_offset = 0);1079nir_store_var(b, bounds_vars[0], bounds[0], 7);1080nir_store_var(b, bounds_vars[1], bounds[1], 7);1081}1082nir_pop_if(b, NULL);1083nir_pop_if(b, NULL);1084nir_pop_if(b, NULL);1085}10861087static nir_shader *1088build_internal_shader(struct radv_device *dev)1089{1090const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);1091nir_builder b =1092nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_build_internal_shader");10931094b.shader->info.workgroup_size[0] = 64;1095b.shader->info.workgroup_size[1] = 1;1096b.shader->info.workgroup_size[2] = 1;10971098/*1099* push constants:1100* i32 x 2: node dst address1101* i32 x 2: scratch address1102* i32: dst offset1103* i32: dst scratch offset1104* i32: src scratch offset1105* i32: src_node_count | (fill_header << 31)1106*/1107nir_ssa_def *pconst0 =1108nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);1109nir_ssa_def *pconst1 =1110nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);11111112nir_ssa_def *node_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3));1113nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 12));1114nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);1115nir_ssa_def *dst_scratch_offset = nir_channel(&b, pconst1, 1);1116nir_ssa_def *src_scratch_offset = nir_channel(&b, pconst1, 2);1117nir_ssa_def *src_node_count =1118nir_iand(&b, nir_channel(&b, pconst1, 3), nir_imm_int(&b, 0x7FFFFFFFU));1119nir_ssa_def *fill_header =1120nir_ine(&b, nir_iand(&b, nir_channel(&b, pconst1, 3), nir_imm_int(&b, 0x80000000U)),1121nir_imm_int(&b, 0));11221123nir_ssa_def *global_id =1124nir_iadd(&b,1125nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 
1),1126nir_imm_int(&b, b.shader->info.workgroup_size[0])),1127nir_channels(&b, nir_load_local_invocation_id(&b), 1));1128nir_ssa_def *src_idx = nir_imul(&b, global_id, nir_imm_int(&b, 4));1129nir_ssa_def *src_count = nir_umin(&b, nir_imm_int(&b, 4), nir_isub(&b, src_node_count, src_idx));11301131nir_ssa_def *node_offset =1132nir_iadd(&b, node_dst_offset, nir_ishl(&b, global_id, nir_imm_int(&b, 7)));1133nir_ssa_def *node_dst_addr = nir_iadd(&b, node_addr, nir_u2u64(&b, node_offset));1134nir_ssa_def *src_nodes = nir_build_load_global(1135&b, 4, 32,1136nir_iadd(&b, scratch_addr,1137nir_u2u64(&b, nir_iadd(&b, src_scratch_offset,1138nir_ishl(&b, global_id, nir_imm_int(&b, 4))))),1139.align_mul = 4, .align_offset = 0);11401141nir_build_store_global(&b, src_nodes, nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 0)),1142.write_mask = 0xf, .align_mul = 4, .align_offset = 0);11431144nir_ssa_def *total_bounds[2] = {1145nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),1146nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),1147};11481149for (unsigned i = 0; i < 4; ++i) {1150nir_variable *bounds[2] = {1151nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),1152nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),1153};1154nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);1155nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);11561157nir_push_if(&b, nir_ilt(&b, nir_imm_int(&b, i), src_count));1158determine_bounds(&b, node_addr, nir_channel(&b, src_nodes, i), bounds);1159nir_pop_if(&b, NULL);1160nir_build_store_global(&b, nir_load_var(&b, bounds[0]),1161nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 16 + 24 * i)),1162.write_mask = 0x7, .align_mul = 4, .align_offset = 0);1163nir_build_store_global(&b, nir_load_var(&b, bounds[1]),1164nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 28 + 24 * i)),1165.write_mask = 0x7, .align_mul = 4, .align_offset = 
0);1166total_bounds[0] = nir_fmin(&b, total_bounds[0], nir_load_var(&b, bounds[0]));1167total_bounds[1] = nir_fmax(&b, total_bounds[1], nir_load_var(&b, bounds[1]));1168}11691170nir_ssa_def *node_id =1171nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 5));1172nir_ssa_def *dst_scratch_addr = nir_iadd(1173&b, scratch_addr,1174nir_u2u64(&b, nir_iadd(&b, dst_scratch_offset, nir_ishl(&b, global_id, nir_imm_int(&b, 2)))));1175nir_build_store_global(&b, node_id, dst_scratch_addr, .write_mask = 1, .align_mul = 4,1176.align_offset = 0);11771178nir_push_if(&b, fill_header);1179nir_build_store_global(&b, node_id, node_addr, .write_mask = 1, .align_mul = 4,1180.align_offset = 0);1181nir_build_store_global(&b, total_bounds[0], nir_iadd(&b, node_addr, nir_imm_int64(&b, 8)),1182.write_mask = 7, .align_mul = 4, .align_offset = 0);1183nir_build_store_global(&b, total_bounds[1], nir_iadd(&b, node_addr, nir_imm_int64(&b, 20)),1184.write_mask = 7, .align_mul = 4, .align_offset = 0);1185nir_pop_if(&b, NULL);1186return b.shader;1187}11881189void1190radv_device_finish_accel_struct_build_state(struct radv_device *device)1191{1192struct radv_meta_state *state = &device->meta_state;1193radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.internal_pipeline,1194&state->alloc);1195radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.leaf_pipeline,1196&state->alloc);1197radv_DestroyPipelineLayout(radv_device_to_handle(device),1198state->accel_struct_build.internal_p_layout, &state->alloc);1199radv_DestroyPipelineLayout(radv_device_to_handle(device),1200state->accel_struct_build.leaf_p_layout, &state->alloc);1201}12021203VkResult1204radv_device_init_accel_struct_build_state(struct radv_device *device)1205{1206VkResult result;1207nir_shader *leaf_cs = build_leaf_shader(device);1208nir_shader *internal_cs = build_internal_shader(device);12091210const VkPipelineLayoutCreateInfo leaf_pl_create_info = {1211.sType = 
VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,1212.setLayoutCount = 0,1213.pushConstantRangeCount = 1,1214.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,1215sizeof(struct build_primitive_constants)},1216};12171218result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,1219&device->meta_state.alloc,1220&device->meta_state.accel_struct_build.leaf_p_layout);1221if (result != VK_SUCCESS)1222goto fail;12231224VkPipelineShaderStageCreateInfo leaf_shader_stage = {1225.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,1226.stage = VK_SHADER_STAGE_COMPUTE_BIT,1227.module = vk_shader_module_handle_from_nir(leaf_cs),1228.pName = "main",1229.pSpecializationInfo = NULL,1230};12311232VkComputePipelineCreateInfo leaf_pipeline_info = {1233.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,1234.stage = leaf_shader_stage,1235.flags = 0,1236.layout = device->meta_state.accel_struct_build.leaf_p_layout,1237};12381239result = radv_CreateComputePipelines(1240radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,1241&leaf_pipeline_info, NULL, &device->meta_state.accel_struct_build.leaf_pipeline);1242if (result != VK_SUCCESS)1243goto fail;12441245const VkPipelineLayoutCreateInfo internal_pl_create_info = {1246.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,1247.setLayoutCount = 0,1248.pushConstantRangeCount = 1,1249.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,1250sizeof(struct build_internal_constants)},1251};12521253result = radv_CreatePipelineLayout(radv_device_to_handle(device), &internal_pl_create_info,1254&device->meta_state.alloc,1255&device->meta_state.accel_struct_build.internal_p_layout);1256if (result != VK_SUCCESS)1257goto fail;12581259VkPipelineShaderStageCreateInfo internal_shader_stage = {1260.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,1261.stage = VK_SHADER_STAGE_COMPUTE_BIT,1262.module = 
vk_shader_module_handle_from_nir(internal_cs),1263.pName = "main",1264.pSpecializationInfo = NULL,1265};12661267VkComputePipelineCreateInfo internal_pipeline_info = {1268.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,1269.stage = internal_shader_stage,1270.flags = 0,1271.layout = device->meta_state.accel_struct_build.internal_p_layout,1272};12731274result = radv_CreateComputePipelines(1275radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,1276&internal_pipeline_info, NULL, &device->meta_state.accel_struct_build.internal_pipeline);1277if (result != VK_SUCCESS)1278goto fail;12791280return VK_SUCCESS;12811282fail:1283radv_device_finish_accel_struct_build_state(device);1284ralloc_free(internal_cs);1285ralloc_free(leaf_cs);1286return result;1287}12881289struct bvh_state {1290uint32_t node_offset;1291uint32_t node_count;1292uint32_t scratch_offset;1293};12941295void1296radv_CmdBuildAccelerationStructuresKHR(1297VkCommandBuffer commandBuffer, uint32_t infoCount,1298const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,1299const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)1300{1301RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);1302struct radv_meta_saved_state saved_state;13031304radv_meta_save(1305&saved_state, cmd_buffer,1306RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);1307struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state));13081309radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,1310cmd_buffer->device->meta_state.accel_struct_build.leaf_pipeline);13111312for (uint32_t i = 0; i < infoCount; ++i) {1313RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,1314pInfos[i].dstAccelerationStructure);13151316struct build_primitive_constants prim_consts = {1317.node_dst_addr = radv_accel_struct_get_va(accel_struct),1318.scratch_addr = 
pInfos[i].scratchData.deviceAddress,1319.dst_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64) + 128,1320.dst_scratch_offset = 0,1321};13221323for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) {1324const VkAccelerationStructureGeometryKHR *geom =1325pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j];13261327prim_consts.geometry_type = geom->geometryType;1328prim_consts.geometry_id = j | (geom->flags << 28);1329unsigned prim_size;1330switch (geom->geometryType) {1331case VK_GEOMETRY_TYPE_TRIANGLES_KHR:1332prim_consts.vertex_addr =1333geom->geometry.triangles.vertexData.deviceAddress +1334ppBuildRangeInfos[i][j].firstVertex * geom->geometry.triangles.vertexStride +1335(geom->geometry.triangles.indexType != VK_INDEX_TYPE_NONE_KHR1336? ppBuildRangeInfos[i][j].primitiveOffset1337: 0);1338prim_consts.index_addr = geom->geometry.triangles.indexData.deviceAddress +1339ppBuildRangeInfos[i][j].primitiveOffset;1340prim_consts.transform_addr = geom->geometry.triangles.transformData.deviceAddress +1341ppBuildRangeInfos[i][j].transformOffset;1342prim_consts.vertex_stride = geom->geometry.triangles.vertexStride;1343prim_consts.vertex_format = geom->geometry.triangles.vertexFormat;1344prim_consts.index_format = geom->geometry.triangles.indexType;1345prim_size = 64;1346break;1347case VK_GEOMETRY_TYPE_AABBS_KHR:1348prim_consts.aabb_addr =1349geom->geometry.aabbs.data.deviceAddress + ppBuildRangeInfos[i][j].primitiveOffset;1350prim_consts.aabb_stride = geom->geometry.aabbs.stride;1351prim_size = 64;1352break;1353case VK_GEOMETRY_TYPE_INSTANCES_KHR:1354prim_consts.instance_data = geom->geometry.instances.data.deviceAddress;1355prim_size = 128;1356break;1357default:1358unreachable("Unknown geometryType");1359}13601361radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),1362cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout,1363VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), 
&prim_consts);1364radv_unaligned_dispatch(cmd_buffer, ppBuildRangeInfos[i][j].primitiveCount, 1, 1);1365prim_consts.dst_offset += prim_size * ppBuildRangeInfos[i][j].primitiveCount;1366prim_consts.dst_scratch_offset += 4 * ppBuildRangeInfos[i][j].primitiveCount;1367}1368bvh_states[i].node_offset = prim_consts.dst_offset;1369bvh_states[i].node_count = prim_consts.dst_scratch_offset / 4;1370}13711372radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,1373cmd_buffer->device->meta_state.accel_struct_build.internal_pipeline);1374bool progress = true;1375for (unsigned iter = 0; progress; ++iter) {1376progress = false;1377for (uint32_t i = 0; i < infoCount; ++i) {1378RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,1379pInfos[i].dstAccelerationStructure);13801381if (iter && bvh_states[i].node_count == 1)1382continue;13831384if (!progress) {1385cmd_buffer->state.flush_bits |=1386RADV_CMD_FLAG_CS_PARTIAL_FLUSH |1387radv_src_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, NULL) |1388radv_dst_access_flush(cmd_buffer,1389VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, NULL);1390}1391progress = true;1392uint32_t dst_node_count = MAX2(1, DIV_ROUND_UP(bvh_states[i].node_count, 4));1393bool final_iter = dst_node_count == 1;1394uint32_t src_scratch_offset = bvh_states[i].scratch_offset;1395uint32_t dst_scratch_offset = src_scratch_offset ? 0 : bvh_states[i].node_count * 4;1396uint32_t dst_node_offset = bvh_states[i].node_offset;1397if (final_iter)1398dst_node_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64);13991400const struct build_internal_constants consts = {1401.node_dst_addr = radv_accel_struct_get_va(accel_struct),1402.scratch_addr = pInfos[i].scratchData.deviceAddress,1403.dst_offset = dst_node_offset,1404.dst_scratch_offset = dst_scratch_offset,1405.src_scratch_offset = src_scratch_offset,1406.fill_header = bvh_states[i].node_count | (final_iter ? 
0x80000000U : 0),1407};14081409radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),1410cmd_buffer->device->meta_state.accel_struct_build.internal_p_layout,1411VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);1412radv_unaligned_dispatch(cmd_buffer, dst_node_count, 1, 1);1413bvh_states[i].node_offset += dst_node_count * 128;1414bvh_states[i].node_count = dst_node_count;1415bvh_states[i].scratch_offset = dst_scratch_offset;1416}1417}1418free(bvh_states);1419radv_meta_restore(&saved_state, cmd_buffer);1420}14211422