/* Source: src/amd/vulkan/radv_acceleration_structure.c (21.2-virgl branch) */
/*1* Copyright © 2021 Bas Nieuwenhuizen2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*/22#include "radv_private.h"2324#include "util/half_float.h"25#include "nir_builder.h"26#include "radv_cs.h"27#include "radv_meta.h"2829struct radv_accel_struct_header {30uint32_t root_node_offset;31uint32_t reserved;32float aabb[2][3];33uint64_t compacted_size;34uint64_t serialization_size;35};3637struct radv_bvh_triangle_node {38float coords[3][3];39uint32_t reserved[3];40uint32_t triangle_id;41/* flags in upper 4 bits */42uint32_t geometry_id_and_flags;43uint32_t reserved2;44uint32_t id;45};4647struct radv_bvh_aabb_node {48float aabb[2][3];49uint32_t primitive_id;50/* flags in upper 4 bits */51uint32_t geometry_id_and_flags;52uint32_t reserved[8];53};5455struct radv_bvh_instance_node {56uint64_t base_ptr;57/* lower 24 bits are the custom instance index, upper 8 bits are the visibility mask */58uint32_t custom_instance_and_mask;59/* lower 24 
bits are the sbt offset, upper 8 bits are VkGeometryInstanceFlagsKHR */60uint32_t sbt_offset_and_flags;6162/* The translation component is actually a pre-translation instead of a post-translation. If you63* want to get a proper matrix out of it you need to apply the directional component of the64* matrix to it. The pre-translation of the world->object matrix is the same as the65* post-translation of the object->world matrix so this way we can share data between both66* matrices. */67float wto_matrix[12];68float aabb[2][3];69uint32_t instance_id;70uint32_t reserved[9];71};7273struct radv_bvh_box16_node {74uint32_t children[4];75uint32_t coords[4][3];76};7778struct radv_bvh_box32_node {79uint32_t children[4];80float coords[4][2][3];81uint32_t reserved[4];82};8384void85radv_GetAccelerationStructureBuildSizesKHR(86VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType,87const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo,88const uint32_t *pMaxPrimitiveCounts, VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo)89{90uint64_t triangles = 0, boxes = 0, instances = 0;9192for (uint32_t i = 0; i < pBuildInfo->geometryCount; ++i) {93const VkAccelerationStructureGeometryKHR *geometry;94if (pBuildInfo->pGeometries)95geometry = &pBuildInfo->pGeometries[i];96else97geometry = pBuildInfo->ppGeometries[i];9899switch (geometry->geometryType) {100case VK_GEOMETRY_TYPE_TRIANGLES_KHR:101triangles += pMaxPrimitiveCounts[i];102break;103case VK_GEOMETRY_TYPE_AABBS_KHR:104boxes += pMaxPrimitiveCounts[i];105break;106case VK_GEOMETRY_TYPE_INSTANCES_KHR:107instances += pMaxPrimitiveCounts[i];108break;109case VK_GEOMETRY_TYPE_MAX_ENUM_KHR:110unreachable("VK_GEOMETRY_TYPE_MAX_ENUM_KHR unhandled");111}112}113114uint64_t children = boxes + instances + triangles;115uint64_t internal_nodes = 0;116while (children > 1) {117children = DIV_ROUND_UP(children, 4);118internal_nodes += children;119}120121/* The stray 128 is to ensure we have space for a header122* which we'd want to use 
for some metadata (like the123* total AABB of the BVH) */124uint64_t size = boxes * 128 + instances * 128 + triangles * 64 + internal_nodes * 128 + 192;125126pSizeInfo->accelerationStructureSize = size;127128/* 2x the max number of nodes in a BVH layer (one uint32_t each) */129pSizeInfo->updateScratchSize = pSizeInfo->buildScratchSize =130MAX2(4096, 2 * (boxes + instances + triangles) * sizeof(uint32_t));131}132133VkResult134radv_CreateAccelerationStructureKHR(VkDevice _device,135const VkAccelerationStructureCreateInfoKHR *pCreateInfo,136const VkAllocationCallbacks *pAllocator,137VkAccelerationStructureKHR *pAccelerationStructure)138{139RADV_FROM_HANDLE(radv_device, device, _device);140RADV_FROM_HANDLE(radv_buffer, buffer, pCreateInfo->buffer);141struct radv_acceleration_structure *accel;142143accel = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8,144VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);145if (accel == NULL)146return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);147148vk_object_base_init(&device->vk, &accel->base, VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR);149150accel->mem_offset = buffer->offset + pCreateInfo->offset;151accel->size = pCreateInfo->size;152accel->bo = buffer->bo;153154*pAccelerationStructure = radv_acceleration_structure_to_handle(accel);155return VK_SUCCESS;156}157158void159radv_DestroyAccelerationStructureKHR(VkDevice _device,160VkAccelerationStructureKHR accelerationStructure,161const VkAllocationCallbacks *pAllocator)162{163RADV_FROM_HANDLE(radv_device, device, _device);164RADV_FROM_HANDLE(radv_acceleration_structure, accel, accelerationStructure);165166if (!accel)167return;168169vk_object_base_finish(&accel->base);170vk_free2(&device->vk.alloc, pAllocator, accel);171}172173VkDeviceAddress174radv_GetAccelerationStructureDeviceAddressKHR(175VkDevice _device, const VkAccelerationStructureDeviceAddressInfoKHR *pInfo)176{177RADV_FROM_HANDLE(radv_acceleration_structure, accel, pInfo->accelerationStructure);178return 
radv_accel_struct_get_va(accel);179}180181VkResult182radv_WriteAccelerationStructuresPropertiesKHR(183VkDevice _device, uint32_t accelerationStructureCount,184const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType,185size_t dataSize, void *pData, size_t stride)186{187RADV_FROM_HANDLE(radv_device, device, _device);188char *data_out = (char*)pData;189190for (uint32_t i = 0; i < accelerationStructureCount; ++i) {191RADV_FROM_HANDLE(radv_acceleration_structure, accel, pAccelerationStructures[i]);192const char *base_ptr = (const char *)device->ws->buffer_map(accel->bo);193if (!base_ptr)194return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);195196const struct radv_accel_struct_header *header = (const void*)(base_ptr + accel->mem_offset);197if (stride * i + sizeof(VkDeviceSize) <= dataSize) {198uint64_t value;199switch (queryType) {200case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:201value = header->compacted_size;202break;203case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:204value = header->serialization_size;205break;206default:207unreachable("Unhandled acceleration structure query");208}209*(VkDeviceSize *)(data_out + stride * i) = value;210}211device->ws->buffer_unmap(accel->bo);212}213return VK_SUCCESS;214}215216struct radv_bvh_build_ctx {217uint32_t *write_scratch;218char *base;219char *curr_ptr;220};221222static void223build_triangles(struct radv_bvh_build_ctx *ctx, const VkAccelerationStructureGeometryKHR *geom,224const VkAccelerationStructureBuildRangeInfoKHR *range, unsigned geometry_id)225{226const VkAccelerationStructureGeometryTrianglesDataKHR *tri_data = &geom->geometry.triangles;227VkTransformMatrixKHR matrix;228const char *index_data = (const char *)tri_data->indexData.hostAddress + range->primitiveOffset;229230if (tri_data->transformData.hostAddress) {231matrix = *(const VkTransformMatrixKHR *)((const char *)tri_data->transformData.hostAddress +232range->transformOffset);233} else 
{234matrix = (VkTransformMatrixKHR){235.matrix = {{1.0, 0.0, 0.0, 0.0}, {0.0, 1.0, 0.0, 0.0}, {0.0, 0.0, 1.0, 0.0}}};236}237238for (uint32_t p = 0; p < range->primitiveCount; ++p, ctx->curr_ptr += 64) {239struct radv_bvh_triangle_node *node = (void*)ctx->curr_ptr;240uint32_t node_offset = ctx->curr_ptr - ctx->base;241uint32_t node_id = node_offset >> 3;242*ctx->write_scratch++ = node_id;243244for (unsigned v = 0; v < 3; ++v) {245uint32_t v_index = range->firstVertex;246switch (tri_data->indexType) {247case VK_INDEX_TYPE_NONE_KHR:248v_index += p * 3 + v;249break;250case VK_INDEX_TYPE_UINT8_EXT:251v_index += *(const uint8_t *)index_data;252index_data += 1;253break;254case VK_INDEX_TYPE_UINT16:255v_index += *(const uint16_t *)index_data;256index_data += 2;257break;258case VK_INDEX_TYPE_UINT32:259v_index += *(const uint32_t *)index_data;260index_data += 4;261break;262case VK_INDEX_TYPE_MAX_ENUM:263unreachable("Unhandled VK_INDEX_TYPE_MAX_ENUM");264break;265}266267const char *v_data = (const char *)tri_data->vertexData.hostAddress + v_index * tri_data->vertexStride;268float coords[4];269switch (tri_data->vertexFormat) {270case VK_FORMAT_R32G32B32_SFLOAT:271coords[0] = *(const float *)(v_data + 0);272coords[1] = *(const float *)(v_data + 4);273coords[2] = *(const float *)(v_data + 8);274coords[3] = 1.0f;275break;276case VK_FORMAT_R32G32B32A32_SFLOAT:277coords[0] = *(const float *)(v_data + 0);278coords[1] = *(const float *)(v_data + 4);279coords[2] = *(const float *)(v_data + 8);280coords[3] = *(const float *)(v_data + 12);281break;282case VK_FORMAT_R16G16B16_SFLOAT:283coords[0] = _mesa_half_to_float(*(const uint16_t *)(v_data + 0));284coords[1] = _mesa_half_to_float(*(const uint16_t *)(v_data + 2));285coords[2] = _mesa_half_to_float(*(const uint16_t *)(v_data + 4));286coords[3] = 1.0f;287break;288case VK_FORMAT_R16G16B16A16_SFLOAT:289coords[0] = _mesa_half_to_float(*(const uint16_t *)(v_data + 0));290coords[1] = _mesa_half_to_float(*(const uint16_t *)(v_data + 
2));291coords[2] = _mesa_half_to_float(*(const uint16_t *)(v_data + 4));292coords[3] = _mesa_half_to_float(*(const uint16_t *)(v_data + 6));293break;294default:295unreachable("Unhandled vertex format in BVH build");296}297298for (unsigned j = 0; j < 3; ++j) {299float r = 0;300for (unsigned k = 0; k < 4; ++k)301r += matrix.matrix[j][k] * coords[k];302node->coords[v][j] = r;303}304305node->triangle_id = p;306node->geometry_id_and_flags = geometry_id | (geom->flags << 28);307308/* Seems to be needed for IJ, otherwise I = J = ? */309node->id = 9;310}311}312}313314static VkResult315build_instances(struct radv_device *device, struct radv_bvh_build_ctx *ctx,316const VkAccelerationStructureGeometryKHR *geom,317const VkAccelerationStructureBuildRangeInfoKHR *range)318{319const VkAccelerationStructureGeometryInstancesDataKHR *inst_data = &geom->geometry.instances;320321for (uint32_t p = 0; p < range->primitiveCount; ++p, ctx->curr_ptr += 128) {322const VkAccelerationStructureInstanceKHR *instance =323inst_data->arrayOfPointers324? 
(((const VkAccelerationStructureInstanceKHR *const *)inst_data->data.hostAddress)[p])325: &((const VkAccelerationStructureInstanceKHR *)inst_data->data.hostAddress)[p];326if (!instance->accelerationStructureReference) {327continue;328}329330struct radv_bvh_instance_node *node = (void*)ctx->curr_ptr;331uint32_t node_offset = ctx->curr_ptr - ctx->base;332uint32_t node_id = (node_offset >> 3) | 6;333*ctx->write_scratch++ = node_id;334335float transform[16], inv_transform[16];336memcpy(transform, &instance->transform.matrix, sizeof(instance->transform.matrix));337transform[12] = transform[13] = transform[14] = 0.0f;338transform[15] = 1.0f;339340util_invert_mat4x4(inv_transform, transform);341memcpy(node->wto_matrix, inv_transform, sizeof(node->wto_matrix));342node->wto_matrix[3] = transform[3];343node->wto_matrix[7] = transform[7];344node->wto_matrix[11] = transform[11];345node->custom_instance_and_mask = instance->instanceCustomIndex | (instance->mask << 24);346node->sbt_offset_and_flags =347instance->instanceShaderBindingTableRecordOffset | (instance->flags << 24);348node->instance_id = p;349350RADV_FROM_HANDLE(radv_acceleration_structure, src_accel_struct,351(VkAccelerationStructureKHR)instance->accelerationStructureReference);352const void *src_base = device->ws->buffer_map(src_accel_struct->bo);353if (!src_base)354return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);355356src_base = (const char *)src_base + src_accel_struct->mem_offset;357const struct radv_accel_struct_header *src_header = src_base;358node->base_ptr = radv_accel_struct_get_va(src_accel_struct) | src_header->root_node_offset;359360for (unsigned j = 0; j < 3; ++j) {361node->aabb[0][j] = instance->transform.matrix[j][3];362node->aabb[1][j] = instance->transform.matrix[j][3];363for (unsigned k = 0; k < 3; ++k) {364node->aabb[0][j] += MIN2(instance->transform.matrix[j][k] * src_header->aabb[0][k],365instance->transform.matrix[j][k] * src_header->aabb[1][k]);366node->aabb[1][j] += 
MAX2(instance->transform.matrix[j][k] * src_header->aabb[0][k],367instance->transform.matrix[j][k] * src_header->aabb[1][k]);368}369}370device->ws->buffer_unmap(src_accel_struct->bo);371}372return VK_SUCCESS;373}374375static void376build_aabbs(struct radv_bvh_build_ctx *ctx, const VkAccelerationStructureGeometryKHR *geom,377const VkAccelerationStructureBuildRangeInfoKHR *range, unsigned geometry_id)378{379const VkAccelerationStructureGeometryAabbsDataKHR *aabb_data = &geom->geometry.aabbs;380381for (uint32_t p = 0; p < range->primitiveCount; ++p, ctx->curr_ptr += 64) {382struct radv_bvh_aabb_node *node = (void*)ctx->curr_ptr;383uint32_t node_offset = ctx->curr_ptr - ctx->base;384uint32_t node_id = (node_offset >> 3) | 6;385*ctx->write_scratch++ = node_id;386387const VkAabbPositionsKHR *aabb =388(const VkAabbPositionsKHR *)((const char *)aabb_data->data.hostAddress +389p * aabb_data->stride);390391node->aabb[0][0] = aabb->minX;392node->aabb[0][1] = aabb->minY;393node->aabb[0][2] = aabb->minZ;394node->aabb[1][0] = aabb->maxX;395node->aabb[1][1] = aabb->maxY;396node->aabb[1][2] = aabb->maxZ;397node->primitive_id = p;398node->geometry_id_and_flags = geometry_id;399}400}401402static uint32_t403leaf_node_count(const VkAccelerationStructureBuildGeometryInfoKHR *info,404const VkAccelerationStructureBuildRangeInfoKHR *ranges)405{406uint32_t count = 0;407for (uint32_t i = 0; i < info->geometryCount; ++i) {408count += ranges[i].primitiveCount;409}410return count;411}412413static void414compute_bounds(const char *base_ptr, uint32_t node_id, float *bounds)415{416for (unsigned i = 0; i < 3; ++i)417bounds[i] = INFINITY;418for (unsigned i = 0; i < 3; ++i)419bounds[3 + i] = -INFINITY;420421switch (node_id & 7) {422case 0: {423const struct radv_bvh_triangle_node *node = (const void*)(base_ptr + (node_id / 8 * 64));424for (unsigned v = 0; v < 3; ++v) {425for (unsigned j = 0; j < 3; ++j) {426bounds[j] = MIN2(bounds[j], node->coords[v][j]);427bounds[3 + j] = MAX2(bounds[3 + j], 
node->coords[v][j]);428}429}430break;431}432case 5: {433const struct radv_bvh_box32_node *node = (const void*)(base_ptr + (node_id / 8 * 64));434for (unsigned c2 = 0; c2 < 4; ++c2) {435if (isnan(node->coords[c2][0][0]))436continue;437for (unsigned j = 0; j < 3; ++j) {438bounds[j] = MIN2(bounds[j], node->coords[c2][0][j]);439bounds[3 + j] = MAX2(bounds[3 + j], node->coords[c2][1][j]);440}441}442break;443}444case 6: {445const struct radv_bvh_instance_node *node = (const void*)(base_ptr + (node_id / 8 * 64));446for (unsigned j = 0; j < 3; ++j) {447bounds[j] = MIN2(bounds[j], node->aabb[0][j]);448bounds[3 + j] = MAX2(bounds[3 + j], node->aabb[1][j]);449}450break;451}452case 7: {453const struct radv_bvh_aabb_node *node = (const void*)(base_ptr + (node_id / 8 * 64));454for (unsigned j = 0; j < 3; ++j) {455bounds[j] = MIN2(bounds[j], node->aabb[0][j]);456bounds[3 + j] = MAX2(bounds[3 + j], node->aabb[1][j]);457}458break;459}460}461}462463static VkResult464build_bvh(struct radv_device *device, const VkAccelerationStructureBuildGeometryInfoKHR *info,465const VkAccelerationStructureBuildRangeInfoKHR *ranges)466{467RADV_FROM_HANDLE(radv_acceleration_structure, accel, info->dstAccelerationStructure);468VkResult result = VK_SUCCESS;469470uint32_t *scratch[2];471scratch[0] = info->scratchData.hostAddress;472scratch[1] = scratch[0] + leaf_node_count(info, ranges);473474char *base_ptr = (char*)device->ws->buffer_map(accel->bo);475if (!base_ptr)476return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);477478base_ptr = base_ptr + accel->mem_offset;479struct radv_accel_struct_header *header = (void*)base_ptr;480void *first_node_ptr = (char *)base_ptr + ALIGN(sizeof(*header), 64);481482struct radv_bvh_build_ctx ctx = {.write_scratch = scratch[0],483.base = base_ptr,484.curr_ptr = (char *)first_node_ptr + 128};485486/* This initializes the leaf nodes of the BVH all at the same level. 
*/487for (uint32_t i = 0; i < info->geometryCount; ++i) {488const VkAccelerationStructureGeometryKHR *geom =489info->pGeometries ? &info->pGeometries[i] : info->ppGeometries[i];490491switch (geom->geometryType) {492case VK_GEOMETRY_TYPE_TRIANGLES_KHR:493build_triangles(&ctx, geom, ranges + i, i);494break;495case VK_GEOMETRY_TYPE_AABBS_KHR:496build_aabbs(&ctx, geom, ranges + i, i);497break;498case VK_GEOMETRY_TYPE_INSTANCES_KHR: {499result = build_instances(device, &ctx, geom, ranges + i);500if (result != VK_SUCCESS)501goto fail;502break;503}504case VK_GEOMETRY_TYPE_MAX_ENUM_KHR:505unreachable("VK_GEOMETRY_TYPE_MAX_ENUM_KHR unhandled");506}507}508509uint32_t node_counts[2] = {ctx.write_scratch - scratch[0], 0};510unsigned d;511512/*513* This is the most naive BVH building algorithm I could think of:514* just iteratively builds each level from bottom to top with515* the children of each node being in-order and tightly packed.516*517* Is probably terrible for traversal but should be easy to build an518* equivalent GPU version.519*/520for (d = 0; node_counts[d & 1] > 1 || d == 0; ++d) {521uint32_t child_count = node_counts[d & 1];522const uint32_t *children = scratch[d & 1];523uint32_t *dst_ids = scratch[(d & 1) ^ 1];524unsigned dst_count;525unsigned child_idx = 0;526for (dst_count = 0; child_idx < MAX2(1, child_count); ++dst_count, child_idx += 4) {527unsigned local_child_count = MIN2(4, child_count - child_idx);528uint32_t child_ids[4];529float bounds[4][6];530531for (unsigned c = 0; c < local_child_count; ++c) {532uint32_t id = children[child_idx + c];533child_ids[c] = id;534535compute_bounds(base_ptr, id, bounds[c]);536}537538struct radv_bvh_box32_node *node;539540/* Put the root node at base_ptr so the id = 0, which allows some541* traversal optimizations. 
*/542if (child_idx == 0 && local_child_count == child_count) {543node = first_node_ptr;544header->root_node_offset = ((char *)first_node_ptr - (char *)base_ptr) / 64 * 8 + 5;545} else {546uint32_t dst_id = (ctx.curr_ptr - base_ptr) / 64;547dst_ids[dst_count] = dst_id * 8 + 5;548549node = (void*)ctx.curr_ptr;550ctx.curr_ptr += 128;551}552553for (unsigned c = 0; c < local_child_count; ++c) {554node->children[c] = child_ids[c];555for (unsigned i = 0; i < 2; ++i)556for (unsigned j = 0; j < 3; ++j)557node->coords[c][i][j] = bounds[c][i * 3 + j];558}559for (unsigned c = local_child_count; c < 4; ++c) {560for (unsigned i = 0; i < 2; ++i)561for (unsigned j = 0; j < 3; ++j)562node->coords[c][i][j] = NAN;563}564}565566node_counts[(d & 1) ^ 1] = dst_count;567}568569compute_bounds(base_ptr, header->root_node_offset, &header->aabb[0][0]);570571/* TODO init sizes and figure out what is needed for serialization. */572573fail:574device->ws->buffer_unmap(accel->bo);575return result;576}577578VkResult579radv_BuildAccelerationStructuresKHR(580VkDevice _device, VkDeferredOperationKHR deferredOperation, uint32_t infoCount,581const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,582const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)583{584RADV_FROM_HANDLE(radv_device, device, _device);585VkResult result = VK_SUCCESS;586587for (uint32_t i = 0; i < infoCount; ++i) {588result = build_bvh(device, pInfos + i, ppBuildRangeInfos[i]);589if (result != VK_SUCCESS)590break;591}592return result;593}594595static nir_ssa_def *596get_indices(nir_builder *b, nir_ssa_def *addr, nir_ssa_def *type, nir_ssa_def *id)597{598const struct glsl_type *uvec3_type = glsl_vector_type(GLSL_TYPE_UINT, 3);599nir_variable *result =600nir_variable_create(b->shader, nir_var_shader_temp, uvec3_type, "indices");601602nir_push_if(b, nir_ult(b, type, nir_imm_int(b, 2)));603nir_push_if(b, nir_ieq(b, type, nir_imm_int(b, VK_INDEX_TYPE_UINT16)));604{605nir_ssa_def *index_id = nir_umul24(b, id, 
nir_imm_int(b, 6));606nir_ssa_def *indices[3];607for (unsigned i = 0; i < 3; ++i) {608indices[i] = nir_build_load_global(609b, 1, 16, nir_iadd(b, addr, nir_u2u64(b, nir_iadd(b, index_id, nir_imm_int(b, 2 * i)))),610.align_mul = 2, .align_offset = 0);611}612nir_store_var(b, result, nir_u2u32(b, nir_vec(b, indices, 3)), 7);613}614nir_push_else(b, NULL);615{616nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 12));617nir_ssa_def *indices = nir_build_load_global(618b, 3, 32, nir_iadd(b, addr, nir_u2u64(b, index_id)), .align_mul = 4, .align_offset = 0);619nir_store_var(b, result, indices, 7);620}621nir_pop_if(b, NULL);622nir_push_else(b, NULL);623{624nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 3));625nir_ssa_def *indices[] = {626index_id,627nir_iadd(b, index_id, nir_imm_int(b, 1)),628nir_iadd(b, index_id, nir_imm_int(b, 2)),629};630631nir_push_if(b, nir_ieq(b, type, nir_imm_int(b, VK_INDEX_TYPE_NONE_KHR)));632{633nir_store_var(b, result, nir_vec(b, indices, 3), 7);634}635nir_push_else(b, NULL);636{637for (unsigned i = 0; i < 3; ++i) {638indices[i] = nir_build_load_global(b, 1, 8, nir_iadd(b, addr, nir_u2u64(b, indices[i])),639.align_mul = 1, .align_offset = 0);640}641nir_store_var(b, result, nir_u2u32(b, nir_vec(b, indices, 3)), 7);642}643nir_pop_if(b, NULL);644}645nir_pop_if(b, NULL);646return nir_load_var(b, result);647}648649static void650get_vertices(nir_builder *b, nir_ssa_def *addresses, nir_ssa_def *format, nir_ssa_def *positions[3])651{652const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);653nir_variable *results[3] = {654nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex0"),655nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex1"),656nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex2")};657658VkFormat formats[] = {659VK_FORMAT_R32G32B32_SFLOAT,660VK_FORMAT_R32G32B32A32_SFLOAT,661VK_FORMAT_R16G16B16_SFLOAT,662VK_FORMAT_R16G16B16A16_SFLOAT,663};664665for 
(unsigned f = 0; f < ARRAY_SIZE(formats); ++f) {666if (f + 1 < ARRAY_SIZE(formats))667nir_push_if(b, nir_ieq(b, format, nir_imm_int(b, formats[f])));668669for (unsigned i = 0; i < 3; ++i) {670switch (formats[f]) {671case VK_FORMAT_R32G32B32_SFLOAT:672case VK_FORMAT_R32G32B32A32_SFLOAT:673nir_store_var(b, results[i],674nir_build_load_global(b, 3, 32, nir_channel(b, addresses, i),675.align_mul = 4, .align_offset = 0),6767);677break;678case VK_FORMAT_R16G16B16_SFLOAT:679case VK_FORMAT_R16G16B16A16_SFLOAT: {680nir_ssa_def *values[3];681nir_ssa_def *addr = nir_channel(b, addresses, i);682for (unsigned j = 0; j < 3; ++j)683values[j] =684nir_build_load_global(b, 1, 16, nir_iadd(b, addr, nir_imm_int64(b, j * 2)),685.align_mul = 2, .align_offset = 0);686nir_store_var(b, results[i], nir_f2f32(b, nir_vec(b, values, 3)), 7);687break;688}689default:690unreachable("Unhandled format");691}692}693if (f + 1 < ARRAY_SIZE(formats))694nir_push_else(b, NULL);695}696for (unsigned f = 1; f < ARRAY_SIZE(formats); ++f) {697nir_pop_if(b, NULL);698}699700for (unsigned i = 0; i < 3; ++i)701positions[i] = nir_load_var(b, results[i]);702}703704struct build_primitive_constants {705uint64_t node_dst_addr;706uint64_t scratch_addr;707uint32_t dst_offset;708uint32_t dst_scratch_offset;709uint32_t geometry_type;710uint32_t geometry_id;711712union {713struct {714uint64_t vertex_addr;715uint64_t index_addr;716uint64_t transform_addr;717uint32_t vertex_stride;718uint32_t vertex_format;719uint32_t index_format;720};721struct {722uint64_t instance_data;723};724struct {725uint64_t aabb_addr;726uint32_t aabb_stride;727};728};729};730731struct build_internal_constants {732uint64_t node_dst_addr;733uint64_t scratch_addr;734uint32_t dst_offset;735uint32_t dst_scratch_offset;736uint32_t src_scratch_offset;737uint32_t fill_header;738};739740/* This inverts a 3x3 matrix using cofactors, as in e.g.741* https://www.mathsisfun.com/algebra/matrix-inverse-minors-cofactors-adjugate.html */742static 
void743nir_invert_3x3(nir_builder *b, nir_ssa_def *in[3][3], nir_ssa_def *out[3][3])744{745nir_ssa_def *cofactors[3][3];746for (unsigned i = 0; i < 3; ++i) {747for (unsigned j = 0; j < 3; ++j) {748cofactors[i][j] =749nir_fsub(b, nir_fmul(b, in[(i + 1) % 3][(j + 1) % 3], in[(i + 2) % 3][(j + 2) % 3]),750nir_fmul(b, in[(i + 1) % 3][(j + 2) % 3], in[(i + 2) % 3][(j + 1) % 3]));751}752}753754nir_ssa_def *det = NULL;755for (unsigned i = 0; i < 3; ++i) {756nir_ssa_def *det_part = nir_fmul(b, in[0][i], cofactors[0][i]);757det = det ? nir_fadd(b, det, det_part) : det_part;758}759760nir_ssa_def *det_inv = nir_frcp(b, det);761for (unsigned i = 0; i < 3; ++i) {762for (unsigned j = 0; j < 3; ++j) {763out[i][j] = nir_fmul(b, cofactors[j][i], det_inv);764}765}766}767768static nir_shader *769build_leaf_shader(struct radv_device *dev)770{771const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);772nir_builder b =773nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_build_leaf_shader");774775b.shader->info.workgroup_size[0] = 64;776b.shader->info.workgroup_size[1] = 1;777b.shader->info.workgroup_size[2] = 1;778779nir_ssa_def *pconst0 =780nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);781nir_ssa_def *pconst1 =782nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);783nir_ssa_def *pconst2 =784nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 32, .range = 16);785nir_ssa_def *pconst3 =786nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 48, .range = 16);787nir_ssa_def *pconst4 =788nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 64, .range = 4);789790nir_ssa_def *geom_type = nir_channel(&b, pconst1, 2);791nir_ssa_def *node_dst_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3));792nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 12));793nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);794nir_ssa_def *scratch_offset = 
nir_channel(&b, pconst1, 1);795nir_ssa_def *geometry_id = nir_channel(&b, pconst1, 3);796797nir_ssa_def *global_id =798nir_iadd(&b,799nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1),800nir_imm_int(&b, b.shader->info.workgroup_size[0])),801nir_channels(&b, nir_load_local_invocation_id(&b), 1));802scratch_addr = nir_iadd(803&b, scratch_addr,804nir_u2u64(&b, nir_iadd(&b, scratch_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 4)))));805806nir_push_if(&b, nir_ieq(&b, geom_type, nir_imm_int(&b, VK_GEOMETRY_TYPE_TRIANGLES_KHR)));807{ /* Triangles */808nir_ssa_def *vertex_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3));809nir_ssa_def *index_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 12));810nir_ssa_def *transform_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst3, 3));811nir_ssa_def *vertex_stride = nir_channel(&b, pconst3, 2);812nir_ssa_def *vertex_format = nir_channel(&b, pconst3, 3);813nir_ssa_def *index_format = nir_channel(&b, pconst4, 0);814unsigned repl_swizzle[4] = {0, 0, 0, 0};815816nir_ssa_def *node_offset =817nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 64)));818nir_ssa_def *triangle_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));819820nir_ssa_def *indices = get_indices(&b, index_addr, index_format, global_id);821nir_ssa_def *vertex_addresses = nir_iadd(822&b, nir_u2u64(&b, nir_imul(&b, indices, nir_swizzle(&b, vertex_stride, repl_swizzle, 3))),823nir_swizzle(&b, vertex_addr, repl_swizzle, 3));824nir_ssa_def *positions[3];825get_vertices(&b, vertex_addresses, vertex_format, positions);826827nir_ssa_def *node_data[16];828memset(node_data, 0, sizeof(node_data));829830nir_variable *transform[] = {831nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform0"),832nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform1"),833nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), 
"transform2"),834};835nir_store_var(&b, transform[0], nir_imm_vec4(&b, 1.0, 0.0, 0.0, 0.0), 0xf);836nir_store_var(&b, transform[1], nir_imm_vec4(&b, 0.0, 1.0, 0.0, 0.0), 0xf);837nir_store_var(&b, transform[2], nir_imm_vec4(&b, 0.0, 0.0, 1.0, 0.0), 0xf);838839nir_push_if(&b, nir_ine(&b, transform_addr, nir_imm_int64(&b, 0)));840nir_store_var(841&b, transform[0],842nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 0)),843.align_mul = 4, .align_offset = 0),8440xf);845nir_store_var(846&b, transform[1],847nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 16)),848.align_mul = 4, .align_offset = 0),8490xf);850nir_store_var(851&b, transform[2],852nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 32)),853.align_mul = 4, .align_offset = 0),8540xf);855nir_pop_if(&b, NULL);856857for (unsigned i = 0; i < 3; ++i)858for (unsigned j = 0; j < 3; ++j)859node_data[i * 3 + j] = nir_fdph(&b, positions[i], nir_load_var(&b, transform[j]));860861node_data[12] = global_id;862node_data[13] = geometry_id;863node_data[15] = nir_imm_int(&b, 9);864for (unsigned i = 0; i < ARRAY_SIZE(node_data); ++i)865if (!node_data[i])866node_data[i] = nir_imm_int(&b, 0);867868for (unsigned i = 0; i < 4; ++i) {869nir_build_store_global(&b, nir_vec(&b, node_data + i * 4, 4),870nir_iadd(&b, triangle_node_dst_addr, nir_imm_int64(&b, i * 16)),871.write_mask = 15, .align_mul = 16, .align_offset = 0);872}873874nir_ssa_def *node_id = nir_ushr(&b, node_offset, nir_imm_int(&b, 3));875nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,876.align_offset = 0);877}878nir_push_else(&b, NULL);879nir_push_if(&b, nir_ieq(&b, geom_type, nir_imm_int(&b, VK_GEOMETRY_TYPE_AABBS_KHR)));880{ /* AABBs */881nir_ssa_def *aabb_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3));882nir_ssa_def *aabb_stride = nir_channel(&b, pconst2, 2);883884nir_ssa_def *node_offset =885nir_iadd(&b, node_dst_offset, nir_umul24(&b, 
global_id, nir_imm_int(&b, 64)));886nir_ssa_def *aabb_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));887nir_ssa_def *node_id =888nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 7));889nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,890.align_offset = 0);891892aabb_addr = nir_iadd(&b, aabb_addr, nir_u2u64(&b, nir_imul(&b, aabb_stride, global_id)));893894nir_ssa_def *min_bound =895nir_build_load_global(&b, 3, 32, nir_iadd(&b, aabb_addr, nir_imm_int64(&b, 0)),896.align_mul = 4, .align_offset = 0);897nir_ssa_def *max_bound =898nir_build_load_global(&b, 3, 32, nir_iadd(&b, aabb_addr, nir_imm_int64(&b, 12)),899.align_mul = 4, .align_offset = 0);900901nir_ssa_def *values[] = {nir_channel(&b, min_bound, 0),902nir_channel(&b, min_bound, 1),903nir_channel(&b, min_bound, 2),904nir_channel(&b, max_bound, 0),905nir_channel(&b, max_bound, 1),906nir_channel(&b, max_bound, 2),907global_id,908geometry_id};909910nir_build_store_global(&b, nir_vec(&b, values + 0, 4),911nir_iadd(&b, aabb_node_dst_addr, nir_imm_int64(&b, 0)),912.write_mask = 15, .align_mul = 16, .align_offset = 0);913nir_build_store_global(&b, nir_vec(&b, values + 4, 4),914nir_iadd(&b, aabb_node_dst_addr, nir_imm_int64(&b, 16)),915.write_mask = 15, .align_mul = 16, .align_offset = 0);916}917nir_push_else(&b, NULL);918{ /* Instances */919920nir_ssa_def *instance_addr =921nir_iadd(&b, nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3)),922nir_u2u64(&b, nir_imul(&b, global_id, nir_imm_int(&b, 64))));923nir_ssa_def *inst_transform[] = {924nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 0)),925.align_mul = 4, .align_offset = 0),926nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 16)),927.align_mul = 4, .align_offset = 0),928nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 32)),929.align_mul = 4, .align_offset = 0)};930nir_ssa_def *inst3 
=931nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 48)),932.align_mul = 4, .align_offset = 0);933934nir_ssa_def *node_offset =935nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 128)));936node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));937nir_ssa_def *node_id =938nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 6));939nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,940.align_offset = 0);941942nir_variable *bounds[2] = {943nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),944nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),945};946947nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);948nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);949950nir_ssa_def *header_addr = nir_pack_64_2x32(&b, nir_channels(&b, inst3, 12));951nir_push_if(&b, nir_ine(&b, header_addr, nir_imm_int64(&b, 0)));952nir_ssa_def *header_root_offset =953nir_build_load_global(&b, 1, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 0)),954.align_mul = 4, .align_offset = 0);955nir_ssa_def *header_min =956nir_build_load_global(&b, 3, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 8)),957.align_mul = 4, .align_offset = 0);958nir_ssa_def *header_max =959nir_build_load_global(&b, 3, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 20)),960.align_mul = 4, .align_offset = 0);961962nir_ssa_def *bound_defs[2][3];963for (unsigned i = 0; i < 3; ++i) {964bound_defs[0][i] = bound_defs[1][i] = nir_channel(&b, inst_transform[i], 3);965966nir_ssa_def *mul_a = nir_fmul(&b, nir_channels(&b, inst_transform[i], 7), header_min);967nir_ssa_def *mul_b = nir_fmul(&b, nir_channels(&b, inst_transform[i], 7), header_max);968nir_ssa_def *mi = nir_fmin(&b, mul_a, mul_b);969nir_ssa_def *ma = nir_fmax(&b, mul_a, mul_b);970for (unsigned j = 0; j < 3; ++j) 
{971bound_defs[0][i] = nir_fadd(&b, bound_defs[0][i], nir_channel(&b, mi, j));972bound_defs[1][i] = nir_fadd(&b, bound_defs[1][i], nir_channel(&b, ma, j));973}974}975976nir_store_var(&b, bounds[0], nir_vec(&b, bound_defs[0], 3), 7);977nir_store_var(&b, bounds[1], nir_vec(&b, bound_defs[1], 3), 7);978979nir_ssa_def *m_in[3][3], *m_out[3][3], *m_vec[3][4];980for (unsigned i = 0; i < 3; ++i)981for (unsigned j = 0; j < 3; ++j)982m_in[i][j] = nir_channel(&b, inst_transform[i], j);983nir_invert_3x3(&b, m_in, m_out);984for (unsigned i = 0; i < 3; ++i) {985for (unsigned j = 0; j < 3; ++j)986m_vec[i][j] = m_out[i][j];987m_vec[i][3] = nir_channel(&b, inst_transform[i], 3);988}989990for (unsigned i = 0; i < 3; ++i) {991nir_build_store_global(&b, nir_vec(&b, m_vec[i], 4),992nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 16 + 16 * i)),993.write_mask = 0xf, .align_mul = 4, .align_offset = 0);994}995996nir_ssa_def *out0[4] = {997nir_ior(&b, nir_channel(&b, nir_unpack_64_2x32(&b, header_addr), 0), header_root_offset),998nir_channel(&b, nir_unpack_64_2x32(&b, header_addr), 1), nir_channel(&b, inst3, 0),999nir_channel(&b, inst3, 1)};1000nir_build_store_global(&b, nir_vec(&b, out0, 4),1001nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 0)), .write_mask = 0xf,1002.align_mul = 4, .align_offset = 0);1003nir_build_store_global(&b, global_id, nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 88)),1004.write_mask = 0x1, .align_mul = 4, .align_offset = 0);1005nir_pop_if(&b, NULL);1006nir_build_store_global(&b, nir_load_var(&b, bounds[0]),1007nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 64)), .write_mask = 0x7,1008.align_mul = 4, .align_offset = 0);1009nir_build_store_global(&b, nir_load_var(&b, bounds[1]),1010nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 76)), .write_mask = 0x7,1011.align_mul = 4, .align_offset = 0);1012}1013nir_pop_if(&b, NULL);1014nir_pop_if(&b, NULL);10151016return b.shader;1017}10181019static void1020determine_bounds(nir_builder *b, nir_ssa_def *node_addr, nir_ssa_def 
*node_id,1021nir_variable *bounds_vars[2])1022{1023nir_ssa_def *node_type = nir_iand(b, node_id, nir_imm_int(b, 7));1024node_addr = nir_iadd(1025b, node_addr,1026nir_u2u64(b, nir_ishl(b, nir_iand(b, node_id, nir_imm_int(b, ~7u)), nir_imm_int(b, 3))));10271028nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 0)));1029{1030nir_ssa_def *positions[3];1031for (unsigned i = 0; i < 3; ++i)1032positions[i] =1033nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, i * 12)),1034.align_mul = 4, .align_offset = 0);1035nir_ssa_def *bounds[] = {positions[0], positions[0]};1036for (unsigned i = 1; i < 3; ++i) {1037bounds[0] = nir_fmin(b, bounds[0], positions[i]);1038bounds[1] = nir_fmax(b, bounds[1], positions[i]);1039}1040nir_store_var(b, bounds_vars[0], bounds[0], 7);1041nir_store_var(b, bounds_vars[1], bounds[1], 7);1042}1043nir_push_else(b, NULL);1044nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 5)));1045{1046nir_ssa_def *input_bounds[4][2];1047for (unsigned i = 0; i < 4; ++i)1048for (unsigned j = 0; j < 2; ++j)1049input_bounds[i][j] = nir_build_load_global(1050b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 16 + i * 24 + j * 12)),1051.align_mul = 4, .align_offset = 0);1052nir_ssa_def *bounds[] = {input_bounds[0][0], input_bounds[0][1]};1053for (unsigned i = 1; i < 4; ++i) {1054bounds[0] = nir_fmin(b, bounds[0], input_bounds[i][0]);1055bounds[1] = nir_fmax(b, bounds[1], input_bounds[i][1]);1056}10571058nir_store_var(b, bounds_vars[0], bounds[0], 7);1059nir_store_var(b, bounds_vars[1], bounds[1], 7);1060}1061nir_push_else(b, NULL);1062nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 6)));1063{ /* Instances */1064nir_ssa_def *bounds[2];1065for (unsigned i = 0; i < 2; ++i)1066bounds[i] =1067nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 64 + i * 12)),1068.align_mul = 4, .align_offset = 0);1069nir_store_var(b, bounds_vars[0], bounds[0], 7);1070nir_store_var(b, bounds_vars[1], bounds[1], 7);1071}1072nir_push_else(b, 
NULL);1073{ /* AABBs */1074nir_ssa_def *bounds[2];1075for (unsigned i = 0; i < 2; ++i)1076bounds[i] =1077nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, i * 12)),1078.align_mul = 4, .align_offset = 0);1079nir_store_var(b, bounds_vars[0], bounds[0], 7);1080nir_store_var(b, bounds_vars[1], bounds[1], 7);1081}1082nir_pop_if(b, NULL);1083nir_pop_if(b, NULL);1084nir_pop_if(b, NULL);1085}10861087static nir_shader *1088build_internal_shader(struct radv_device *dev)1089{1090const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);1091nir_builder b =1092nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_build_internal_shader");10931094b.shader->info.workgroup_size[0] = 64;1095b.shader->info.workgroup_size[1] = 1;1096b.shader->info.workgroup_size[2] = 1;10971098/*1099* push constants:1100* i32 x 2: node dst address1101* i32 x 2: scratch address1102* i32: dst offset1103* i32: dst scratch offset1104* i32: src scratch offset1105* i32: src_node_count | (fill_header << 31)1106*/1107nir_ssa_def *pconst0 =1108nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);1109nir_ssa_def *pconst1 =1110nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);11111112nir_ssa_def *node_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3));1113nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 12));1114nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);1115nir_ssa_def *dst_scratch_offset = nir_channel(&b, pconst1, 1);1116nir_ssa_def *src_scratch_offset = nir_channel(&b, pconst1, 2);1117nir_ssa_def *src_node_count =1118nir_iand(&b, nir_channel(&b, pconst1, 3), nir_imm_int(&b, 0x7FFFFFFFU));1119nir_ssa_def *fill_header =1120nir_ine(&b, nir_iand(&b, nir_channel(&b, pconst1, 3), nir_imm_int(&b, 0x80000000U)),1121nir_imm_int(&b, 0));11221123nir_ssa_def *global_id =1124nir_iadd(&b,1125nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 
1),1126nir_imm_int(&b, b.shader->info.workgroup_size[0])),1127nir_channels(&b, nir_load_local_invocation_id(&b), 1));1128nir_ssa_def *src_idx = nir_imul(&b, global_id, nir_imm_int(&b, 4));1129nir_ssa_def *src_count = nir_umin(&b, nir_imm_int(&b, 4), nir_isub(&b, src_node_count, src_idx));11301131nir_ssa_def *node_offset =1132nir_iadd(&b, node_dst_offset, nir_ishl(&b, global_id, nir_imm_int(&b, 7)));1133nir_ssa_def *node_dst_addr = nir_iadd(&b, node_addr, nir_u2u64(&b, node_offset));1134nir_ssa_def *src_nodes = nir_build_load_global(1135&b, 4, 32,1136nir_iadd(&b, scratch_addr,1137nir_u2u64(&b, nir_iadd(&b, src_scratch_offset,1138nir_ishl(&b, global_id, nir_imm_int(&b, 4))))),1139.align_mul = 4, .align_offset = 0);11401141nir_build_store_global(&b, src_nodes, nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 0)),1142.write_mask = 0xf, .align_mul = 4, .align_offset = 0);11431144nir_ssa_def *total_bounds[2] = {1145nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),1146nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),1147};11481149for (unsigned i = 0; i < 4; ++i) {1150nir_variable *bounds[2] = {1151nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),1152nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),1153};1154nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);1155nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);11561157nir_push_if(&b, nir_ilt(&b, nir_imm_int(&b, i), src_count));1158determine_bounds(&b, node_addr, nir_channel(&b, src_nodes, i), bounds);1159nir_pop_if(&b, NULL);1160nir_build_store_global(&b, nir_load_var(&b, bounds[0]),1161nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 16 + 24 * i)),1162.write_mask = 0x7, .align_mul = 4, .align_offset = 0);1163nir_build_store_global(&b, nir_load_var(&b, bounds[1]),1164nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 28 + 24 * i)),1165.write_mask = 0x7, .align_mul = 4, .align_offset = 
0);1166total_bounds[0] = nir_fmin(&b, total_bounds[0], nir_load_var(&b, bounds[0]));1167total_bounds[1] = nir_fmax(&b, total_bounds[1], nir_load_var(&b, bounds[1]));1168}11691170nir_ssa_def *node_id =1171nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 5));1172nir_ssa_def *dst_scratch_addr = nir_iadd(1173&b, scratch_addr,1174nir_u2u64(&b, nir_iadd(&b, dst_scratch_offset, nir_ishl(&b, global_id, nir_imm_int(&b, 2)))));1175nir_build_store_global(&b, node_id, dst_scratch_addr, .write_mask = 1, .align_mul = 4,1176.align_offset = 0);11771178nir_push_if(&b, fill_header);1179nir_build_store_global(&b, node_id, node_addr, .write_mask = 1, .align_mul = 4,1180.align_offset = 0);1181nir_build_store_global(&b, total_bounds[0], nir_iadd(&b, node_addr, nir_imm_int64(&b, 8)),1182.write_mask = 7, .align_mul = 4, .align_offset = 0);1183nir_build_store_global(&b, total_bounds[1], nir_iadd(&b, node_addr, nir_imm_int64(&b, 20)),1184.write_mask = 7, .align_mul = 4, .align_offset = 0);1185nir_pop_if(&b, NULL);1186return b.shader;1187}11881189void1190radv_device_finish_accel_struct_build_state(struct radv_device *device)1191{1192struct radv_meta_state *state = &device->meta_state;1193radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.internal_pipeline,1194&state->alloc);1195radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.leaf_pipeline,1196&state->alloc);1197radv_DestroyPipelineLayout(radv_device_to_handle(device),1198state->accel_struct_build.internal_p_layout, &state->alloc);1199radv_DestroyPipelineLayout(radv_device_to_handle(device),1200state->accel_struct_build.leaf_p_layout, &state->alloc);1201}12021203VkResult1204radv_device_init_accel_struct_build_state(struct radv_device *device)1205{1206VkResult result;1207nir_shader *leaf_cs = build_leaf_shader(device);1208nir_shader *internal_cs = build_internal_shader(device);12091210const VkPipelineLayoutCreateInfo leaf_pl_create_info = {1211.sType = 
VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,1212.setLayoutCount = 0,1213.pushConstantRangeCount = 1,1214.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,1215sizeof(struct build_primitive_constants)},1216};12171218result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,1219&device->meta_state.alloc,1220&device->meta_state.accel_struct_build.leaf_p_layout);1221if (result != VK_SUCCESS)1222goto fail;12231224VkPipelineShaderStageCreateInfo leaf_shader_stage = {1225.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,1226.stage = VK_SHADER_STAGE_COMPUTE_BIT,1227.module = vk_shader_module_handle_from_nir(leaf_cs),1228.pName = "main",1229.pSpecializationInfo = NULL,1230};12311232VkComputePipelineCreateInfo leaf_pipeline_info = {1233.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,1234.stage = leaf_shader_stage,1235.flags = 0,1236.layout = device->meta_state.accel_struct_build.leaf_p_layout,1237};12381239result = radv_CreateComputePipelines(1240radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,1241&leaf_pipeline_info, NULL, &device->meta_state.accel_struct_build.leaf_pipeline);1242if (result != VK_SUCCESS)1243goto fail;12441245const VkPipelineLayoutCreateInfo internal_pl_create_info = {1246.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,1247.setLayoutCount = 0,1248.pushConstantRangeCount = 1,1249.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,1250sizeof(struct build_internal_constants)},1251};12521253result = radv_CreatePipelineLayout(radv_device_to_handle(device), &internal_pl_create_info,1254&device->meta_state.alloc,1255&device->meta_state.accel_struct_build.internal_p_layout);1256if (result != VK_SUCCESS)1257goto fail;12581259VkPipelineShaderStageCreateInfo internal_shader_stage = {1260.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,1261.stage = VK_SHADER_STAGE_COMPUTE_BIT,1262.module = 
vk_shader_module_handle_from_nir(internal_cs),1263.pName = "main",1264.pSpecializationInfo = NULL,1265};12661267VkComputePipelineCreateInfo internal_pipeline_info = {1268.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,1269.stage = internal_shader_stage,1270.flags = 0,1271.layout = device->meta_state.accel_struct_build.internal_p_layout,1272};12731274result = radv_CreateComputePipelines(1275radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,1276&internal_pipeline_info, NULL, &device->meta_state.accel_struct_build.internal_pipeline);1277if (result != VK_SUCCESS)1278goto fail;12791280return VK_SUCCESS;12811282fail:1283radv_device_finish_accel_struct_build_state(device);1284ralloc_free(internal_cs);1285ralloc_free(leaf_cs);1286return result;1287}12881289struct bvh_state {1290uint32_t node_offset;1291uint32_t node_count;1292uint32_t scratch_offset;1293};12941295void1296radv_CmdBuildAccelerationStructuresKHR(1297VkCommandBuffer commandBuffer, uint32_t infoCount,1298const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,1299const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)1300{1301RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);1302struct radv_meta_saved_state saved_state;13031304radv_meta_save(1305&saved_state, cmd_buffer,1306RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);1307struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state));13081309radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,1310cmd_buffer->device->meta_state.accel_struct_build.leaf_pipeline);13111312for (uint32_t i = 0; i < infoCount; ++i) {1313RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,1314pInfos[i].dstAccelerationStructure);13151316struct build_primitive_constants prim_consts = {1317.node_dst_addr = radv_accel_struct_get_va(accel_struct),1318.scratch_addr = 
pInfos[i].scratchData.deviceAddress,1319.dst_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64) + 128,1320.dst_scratch_offset = 0,1321};13221323for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) {1324const VkAccelerationStructureGeometryKHR *geom =1325pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j];13261327prim_consts.geometry_type = geom->geometryType;1328prim_consts.geometry_id = j | (geom->flags << 28);1329unsigned prim_size;1330switch (geom->geometryType) {1331case VK_GEOMETRY_TYPE_TRIANGLES_KHR:1332prim_consts.vertex_addr =1333geom->geometry.triangles.vertexData.deviceAddress +1334ppBuildRangeInfos[i][j].firstVertex * geom->geometry.triangles.vertexStride +1335(geom->geometry.triangles.indexType != VK_INDEX_TYPE_NONE_KHR1336? ppBuildRangeInfos[i][j].primitiveOffset1337: 0);1338prim_consts.index_addr = geom->geometry.triangles.indexData.deviceAddress +1339ppBuildRangeInfos[i][j].primitiveOffset;1340prim_consts.transform_addr = geom->geometry.triangles.transformData.deviceAddress +1341ppBuildRangeInfos[i][j].transformOffset;1342prim_consts.vertex_stride = geom->geometry.triangles.vertexStride;1343prim_consts.vertex_format = geom->geometry.triangles.vertexFormat;1344prim_consts.index_format = geom->geometry.triangles.indexType;1345prim_size = 64;1346break;1347case VK_GEOMETRY_TYPE_AABBS_KHR:1348prim_consts.aabb_addr =1349geom->geometry.aabbs.data.deviceAddress + ppBuildRangeInfos[i][j].primitiveOffset;1350prim_consts.aabb_stride = geom->geometry.aabbs.stride;1351prim_size = 64;1352break;1353case VK_GEOMETRY_TYPE_INSTANCES_KHR:1354prim_consts.instance_data = geom->geometry.instances.data.deviceAddress;1355prim_size = 128;1356break;1357default:1358unreachable("Unknown geometryType");1359}13601361radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),1362cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout,1363VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), 
&prim_consts);1364radv_unaligned_dispatch(cmd_buffer, ppBuildRangeInfos[i][j].primitiveCount, 1, 1);1365prim_consts.dst_offset += prim_size * ppBuildRangeInfos[i][j].primitiveCount;1366prim_consts.dst_scratch_offset += 4 * ppBuildRangeInfos[i][j].primitiveCount;1367}1368bvh_states[i].node_offset = prim_consts.dst_offset;1369bvh_states[i].node_count = prim_consts.dst_scratch_offset / 4;1370}13711372radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,1373cmd_buffer->device->meta_state.accel_struct_build.internal_pipeline);1374bool progress = true;1375for (unsigned iter = 0; progress; ++iter) {1376progress = false;1377for (uint32_t i = 0; i < infoCount; ++i) {1378RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,1379pInfos[i].dstAccelerationStructure);13801381if (iter && bvh_states[i].node_count == 1)1382continue;13831384if (!progress) {1385cmd_buffer->state.flush_bits |=1386RADV_CMD_FLAG_CS_PARTIAL_FLUSH |1387radv_src_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, NULL) |1388radv_dst_access_flush(cmd_buffer,1389VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, NULL);1390}1391progress = true;1392uint32_t dst_node_count = MAX2(1, DIV_ROUND_UP(bvh_states[i].node_count, 4));1393bool final_iter = dst_node_count == 1;1394uint32_t src_scratch_offset = bvh_states[i].scratch_offset;1395uint32_t dst_scratch_offset = src_scratch_offset ? 0 : bvh_states[i].node_count * 4;1396uint32_t dst_node_offset = bvh_states[i].node_offset;1397if (final_iter)1398dst_node_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64);13991400const struct build_internal_constants consts = {1401.node_dst_addr = radv_accel_struct_get_va(accel_struct),1402.scratch_addr = pInfos[i].scratchData.deviceAddress,1403.dst_offset = dst_node_offset,1404.dst_scratch_offset = dst_scratch_offset,1405.src_scratch_offset = src_scratch_offset,1406.fill_header = bvh_states[i].node_count | (final_iter ? 
0x80000000U : 0),1407};14081409radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),1410cmd_buffer->device->meta_state.accel_struct_build.internal_p_layout,1411VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);1412radv_unaligned_dispatch(cmd_buffer, dst_node_count, 1, 1);1413bvh_states[i].node_offset += dst_node_count * 128;1414bvh_states[i].node_count = dst_node_count;1415bvh_states[i].scratch_offset = dst_scratch_offset;1416}1417}1418free(bvh_states);1419radv_meta_restore(&saved_state, cmd_buffer);1420}14211422