Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/kernels/builders/bvh_builder_morton.h
9912 views
1
// Copyright 2009-2021 Intel Corporation
2
// SPDX-License-Identifier: Apache-2.0
3
4
#pragma once
5
6
#include "../common/builder.h"
7
#include "../../common/algorithms/parallel_reduce.h"
8
#include "../../common/algorithms/parallel_sort.h"
9
10
namespace embree
11
{
12
namespace isa
13
{
14
struct BVHBuilderMorton
15
{
16
static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor
17
static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth
18
19
/*! settings for morton builder */
20
struct Settings
21
{
22
/*! default settings */
23
Settings ()
24
: branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
25
26
/*! initialize settings from API settings */
27
Settings (const RTCBuildArguments& settings)
28
: branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
29
{
30
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
31
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
32
if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
33
if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
34
35
minLeafSize = min(minLeafSize,maxLeafSize);
36
}
37
38
Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
39
: branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
40
{
41
minLeafSize = min(minLeafSize,maxLeafSize);
42
}
43
44
public:
45
size_t branchingFactor; //!< branching factor of BVH to build
46
size_t maxDepth; //!< maximum depth of BVH to build
47
size_t minLeafSize; //!< minimum size of a leaf
48
size_t maxLeafSize; //!< maximum size of a leaf
49
size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
50
};
51
52
/*! Build primitive consisting of morton code and primitive ID. */
53
struct __aligned(8) BuildPrim
54
{
55
union {
56
struct {
57
unsigned int code; //!< morton code
58
unsigned int index; //!< i'th primitive
59
};
60
uint64_t t;
61
};
62
63
/*! interface for radix sort */
64
__forceinline operator unsigned() const { return code; }
65
66
/*! interface for standard sort */
67
__forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
68
};
69
70
/*! maps bounding box to morton code */
71
struct MortonCodeMapping
72
{
73
static const size_t LATTICE_BITS_PER_DIM = 10;
74
static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
75
76
vfloat4 base;
77
vfloat4 scale;
78
79
__forceinline MortonCodeMapping(const BBox3fa& bounds)
80
{
81
base = (vfloat4)bounds.lower;
82
const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
83
scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
84
}
85
86
__forceinline const vint4 bin (const BBox3fa& box) const
87
{
88
const vfloat4 lower = (vfloat4)box.lower;
89
const vfloat4 upper = (vfloat4)box.upper;
90
const vfloat4 centroid = lower+upper;
91
return vint4((centroid-base)*scale);
92
}
93
94
__forceinline unsigned int code (const BBox3fa& box) const
95
{
96
const vint4 binID = bin(box);
97
const unsigned int x = extract<0>(binID);
98
const unsigned int y = extract<1>(binID);
99
const unsigned int z = extract<2>(binID);
100
const unsigned int xyz = bitInterleave(x,y,z);
101
return xyz;
102
}
103
};
104
105
#if defined (__AVX2__) || defined(__SYCL_DEVICE_ONLY__)
106
107
/*! for AVX2 there is a fast scalar bitInterleave */
108
struct MortonCodeGenerator
109
{
110
__forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
111
: mapping(mapping), dest(dest) {}
112
113
__forceinline void operator() (const BBox3fa& b, const unsigned index)
114
{
115
dest->index = index;
116
dest->code = mapping.code(b);
117
dest++;
118
}
119
120
public:
121
const MortonCodeMapping mapping;
122
BuildPrim* dest;
123
size_t currentID;
124
};
125
126
#else
127
128
/*! before AVX2 is it better to use the SSE version of bitInterleave */
129
struct MortonCodeGenerator
130
{
131
__forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
132
: mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
133
134
__forceinline ~MortonCodeGenerator()
135
{
136
if (slots != 0)
137
{
138
const vint4 code = bitInterleave(ax,ay,az);
139
for (size_t i=0; i<slots; i++) {
140
dest[currentID-slots+i].index = ai[i];
141
dest[currentID-slots+i].code = code[i];
142
}
143
}
144
}
145
146
__forceinline void operator() (const BBox3fa& b, const unsigned index)
147
{
148
const vint4 binID = mapping.bin(b);
149
ax[slots] = extract<0>(binID);
150
ay[slots] = extract<1>(binID);
151
az[slots] = extract<2>(binID);
152
ai[slots] = index;
153
slots++;
154
currentID++;
155
156
if (slots == 4)
157
{
158
const vint4 code = bitInterleave(ax,ay,az);
159
vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
160
vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
161
slots = 0;
162
}
163
}
164
165
public:
166
const MortonCodeMapping mapping;
167
BuildPrim* dest;
168
size_t currentID;
169
size_t slots;
170
vint4 ax, ay, az, ai;
171
};
172
173
#endif
174
175
template<
176
typename ReductionTy,
177
typename Allocator,
178
typename CreateAllocator,
179
typename CreateNodeFunc,
180
typename SetNodeBoundsFunc,
181
typename CreateLeafFunc,
182
typename CalculateBounds,
183
typename ProgressMonitor>
184
185
class BuilderT : private Settings
186
{
187
ALIGNED_CLASS_(16);
188
189
public:
190
191
BuilderT (CreateAllocator& createAllocator,
192
CreateNodeFunc& createNode,
193
SetNodeBoundsFunc& setBounds,
194
CreateLeafFunc& createLeaf,
195
CalculateBounds& calculateBounds,
196
ProgressMonitor& progressMonitor,
197
const Settings& settings)
198
199
: Settings(settings),
200
createAllocator(createAllocator),
201
createNode(createNode),
202
setBounds(setBounds),
203
createLeaf(createLeaf),
204
calculateBounds(calculateBounds),
205
progressMonitor(progressMonitor),
206
morton(nullptr) {}
207
208
ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
209
{
210
/* this should never occur but is a fatal error */
211
if (depth > maxDepth)
212
throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
213
214
/* create leaf for few primitives */
215
if (current.size() <= maxLeafSize)
216
return createLeaf(current,alloc);
217
218
/* fill all children by always splitting the largest one */
219
range<unsigned> children[MAX_BRANCHING_FACTOR];
220
size_t numChildren = 1;
221
children[0] = current;
222
223
do {
224
225
/* find best child with largest number of primitives */
226
size_t bestChild = -1;
227
size_t bestSize = 0;
228
for (size_t i=0; i<numChildren; i++)
229
{
230
/* ignore leaves as they cannot get split */
231
if (children[i].size() <= maxLeafSize)
232
continue;
233
234
/* remember child with largest size */
235
if (children[i].size() > bestSize) {
236
bestSize = children[i].size();
237
bestChild = i;
238
}
239
}
240
if (bestChild == size_t(-1)) break;
241
242
/*! split best child into left and right child */
243
auto split = children[bestChild].split();
244
245
/* add new children left and right */
246
children[bestChild] = children[numChildren-1];
247
children[numChildren-1] = split.first;
248
children[numChildren+0] = split.second;
249
numChildren++;
250
251
} while (numChildren < branchingFactor);
252
253
/* create node */
254
auto node = createNode(alloc,numChildren);
255
256
/* recurse into each child */
257
ReductionTy bounds[MAX_BRANCHING_FACTOR];
258
for (size_t i=0; i<numChildren; i++)
259
bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
260
261
return setBounds(node,bounds,numChildren);
262
}
263
264
/*! recreates morton codes when reaching a region where all codes are identical */
265
__noinline void recreateMortonCodes(const range<unsigned>& current) const
266
{
267
/* fast path for small ranges */
268
if (likely(current.size() < 1024))
269
{
270
/*! recalculate centroid bounds */
271
BBox3fa centBounds(empty);
272
for (size_t i=current.begin(); i<current.end(); i++)
273
centBounds.extend(center2(calculateBounds(morton[i])));
274
275
/* recalculate morton codes */
276
MortonCodeMapping mapping(centBounds);
277
for (size_t i=current.begin(); i<current.end(); i++)
278
morton[i].code = mapping.code(calculateBounds(morton[i]));
279
280
/* sort morton codes */
281
std::sort(morton+current.begin(),morton+current.end());
282
}
283
else
284
{
285
/*! recalculate centroid bounds */
286
auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
287
BBox3fa centBounds = empty;
288
for (size_t i=r.begin(); i<r.end(); i++)
289
centBounds.extend(center2(calculateBounds(morton[i])));
290
return centBounds;
291
};
292
const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
293
BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
294
295
/* recalculate morton codes */
296
MortonCodeMapping mapping(centBounds);
297
parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
298
for (size_t i=r.begin(); i<r.end(); i++) {
299
morton[i].code = mapping.code(calculateBounds(morton[i]));
300
}
301
});
302
303
/*! sort morton codes */
304
#if defined(TASKING_TBB)
305
tbb::parallel_sort(morton+current.begin(),morton+current.end());
306
#else
307
radixsort32(morton+current.begin(),current.size());
308
#endif
309
}
310
}
311
312
__forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
313
{
314
const unsigned int code_start = morton[current.begin()].code;
315
const unsigned int code_end = morton[current.end()-1].code;
316
unsigned int bitpos = lzcnt(code_start^code_end);
317
318
/* if all items mapped to same morton code, then re-create new morton codes for the items */
319
if (unlikely(bitpos == 32))
320
{
321
recreateMortonCodes(current);
322
const unsigned int code_start = morton[current.begin()].code;
323
const unsigned int code_end = morton[current.end()-1].code;
324
bitpos = lzcnt(code_start^code_end);
325
326
/* if the morton code is still the same, goto fall back split */
327
if (unlikely(bitpos == 32)) {
328
current.split(left,right);
329
return;
330
}
331
}
332
333
/* split the items at the topmost different morton code bit */
334
const unsigned int bitpos_diff = 31-bitpos;
335
const unsigned int bitmask = 1 << bitpos_diff;
336
337
/* find location where bit differs using binary search */
338
unsigned begin = current.begin();
339
unsigned end = current.end();
340
while (begin + 1 != end) {
341
const unsigned mid = (begin+end)/2;
342
const unsigned bit = morton[mid].code & bitmask;
343
if (bit == 0) begin = mid; else end = mid;
344
}
345
unsigned center = end;
346
#if defined(DEBUG)
347
for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0);
348
for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask);
349
#endif
350
351
left = make_range(current.begin(),center);
352
right = make_range(center,current.end());
353
}
354
355
ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
356
{
357
/* get thread local allocator */
358
if (!alloc)
359
alloc = createAllocator();
360
361
/* call memory monitor function to signal progress */
362
if (toplevel && current.size() <= singleThreadThreshold)
363
progressMonitor(current.size());
364
365
/* create leaf node */
366
if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
367
return createLargeLeaf(depth,current,alloc);
368
369
/* fill all children by always splitting the one with the largest surface area */
370
range<unsigned> children[MAX_BRANCHING_FACTOR];
371
split(current,children[0],children[1]);
372
size_t numChildren = 2;
373
374
while (numChildren < branchingFactor)
375
{
376
/* find best child with largest number of primitives */
377
int bestChild = -1;
378
unsigned bestItems = 0;
379
for (unsigned int i=0; i<numChildren; i++)
380
{
381
/* ignore leaves as they cannot get split */
382
if (children[i].size() <= minLeafSize)
383
continue;
384
385
/* remember child with largest area */
386
if (children[i].size() > bestItems) {
387
bestItems = children[i].size();
388
bestChild = i;
389
}
390
}
391
if (bestChild == -1) break;
392
393
/*! split best child into left and right child */
394
range<unsigned> left, right;
395
split(children[bestChild],left,right);
396
397
/* add new children left and right */
398
children[bestChild] = children[numChildren-1];
399
children[numChildren-1] = left;
400
children[numChildren+0] = right;
401
numChildren++;
402
}
403
404
/* create leaf node if no split is possible */
405
if (unlikely(numChildren == 1))
406
return createLeaf(current,alloc);
407
408
/* allocate node */
409
auto node = createNode(alloc,numChildren);
410
411
/* process top parts of tree parallel */
412
ReductionTy bounds[MAX_BRANCHING_FACTOR];
413
if (current.size() > singleThreadThreshold)
414
{
415
/*! parallel_for is faster than spawning sub-tasks */
416
parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
417
for (size_t i=r.begin(); i<r.end(); i++) {
418
bounds[i] = recurse(depth+1,children[i],nullptr,true);
419
_mm_mfence(); // to allow non-temporal stores during build
420
}
421
});
422
}
423
424
/* finish tree sequentially */
425
else
426
{
427
for (size_t i=0; i<numChildren; i++)
428
bounds[i] = recurse(depth+1,children[i],alloc,false);
429
}
430
431
return setBounds(node,bounds,numChildren);
432
}
433
434
/* build function */
435
ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
436
{
437
/* sort morton codes */
438
morton = src;
439
radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
440
441
/* build BVH */
442
const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
443
_mm_mfence(); // to allow non-temporal stores during build
444
return root;
445
}
446
447
public:
448
CreateAllocator& createAllocator;
449
CreateNodeFunc& createNode;
450
SetNodeBoundsFunc& setBounds;
451
CreateLeafFunc& createLeaf;
452
CalculateBounds& calculateBounds;
453
ProgressMonitor& progressMonitor;
454
455
public:
456
BuildPrim* morton;
457
};
458
459
460
template<
461
typename ReductionTy,
462
typename CreateAllocFunc,
463
typename CreateNodeFunc,
464
typename SetBoundsFunc,
465
typename CreateLeafFunc,
466
typename CalculateBoundsFunc,
467
typename ProgressMonitor>
468
469
static ReductionTy build(CreateAllocFunc createAllocator,
470
CreateNodeFunc createNode,
471
SetBoundsFunc setBounds,
472
CreateLeafFunc createLeaf,
473
CalculateBoundsFunc calculateBounds,
474
ProgressMonitor progressMonitor,
475
BuildPrim* src,
476
BuildPrim* tmp,
477
size_t numPrimitives,
478
const Settings& settings)
479
{
480
typedef BuilderT<
481
ReductionTy,
482
decltype(createAllocator()),
483
CreateAllocFunc,
484
CreateNodeFunc,
485
SetBoundsFunc,
486
CreateLeafFunc,
487
CalculateBoundsFunc,
488
ProgressMonitor> Builder;
489
490
Builder builder(createAllocator,
491
createNode,
492
setBounds,
493
createLeaf,
494
calculateBounds,
495
progressMonitor,
496
settings);
497
498
return builder.build(src,tmp,numPrimitives);
499
}
500
};
501
}
502
}
503
504