Path: blob/master/thirdparty/embree/kernels/builders/bvh_builder_morton.h
9912 views
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45#include "../common/builder.h"6#include "../../common/algorithms/parallel_reduce.h"7#include "../../common/algorithms/parallel_sort.h"89namespace embree10{11namespace isa12{13struct BVHBuilderMorton14{15static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor16static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth1718/*! settings for morton builder */19struct Settings20{21/*! default settings */22Settings ()23: branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}2425/*! initialize settings from API settings */26Settings (const RTCBuildArguments& settings)27: branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)28{29if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;30if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;31if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;32if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;3334minLeafSize = min(minLeafSize,maxLeafSize);35}3637Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)38: branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)39{40minLeafSize = min(minLeafSize,maxLeafSize);41}4243public:44size_t branchingFactor; //!< branching factor of BVH to build45size_t maxDepth; //!< maximum depth of BVH to build46size_t minLeafSize; //!< minimum size of a leaf47size_t maxLeafSize; //!< maximum size of a leaf48size_t singleThreadThreshold; //!< threshold when we switch to single threaded build49};5051/*! Build primitive consisting of morton code and primitive ID. */52struct __aligned(8) BuildPrim53{54union {55struct {56unsigned int code; //!< morton code57unsigned int index; //!< i'th primitive58};59uint64_t t;60};6162/*! interface for radix sort */63__forceinline operator unsigned() const { return code; }6465/*! interface for standard sort */66__forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }67};6869/*! maps bounding box to morton code */70struct MortonCodeMapping71{72static const size_t LATTICE_BITS_PER_DIM = 10;73static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;7475vfloat4 base;76vfloat4 scale;7778__forceinline MortonCodeMapping(const BBox3fa& bounds)79{80base = (vfloat4)bounds.lower;81const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower;82scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));83}8485__forceinline const vint4 bin (const BBox3fa& box) const86{87const vfloat4 lower = (vfloat4)box.lower;88const vfloat4 upper = (vfloat4)box.upper;89const vfloat4 centroid = lower+upper;90return vint4((centroid-base)*scale);91}9293__forceinline unsigned int code (const BBox3fa& box) const94{95const vint4 binID = bin(box);96const unsigned int x = extract<0>(binID);97const unsigned int y = extract<1>(binID);98const unsigned int z = extract<2>(binID);99const unsigned int xyz = bitInterleave(x,y,z);100return xyz;101}102};103104#if defined (__AVX2__) || defined(__SYCL_DEVICE_ONLY__)105106/*! for AVX2 there is a fast scalar bitInterleave */107struct MortonCodeGenerator108{109__forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)110: mapping(mapping), dest(dest) {}111112__forceinline void operator() (const BBox3fa& b, const unsigned index)113{114dest->index = index;115dest->code = mapping.code(b);116dest++;117}118119public:120const MortonCodeMapping mapping;121BuildPrim* dest;122size_t currentID;123};124125#else126127/*! before AVX2 is it better to use the SSE version of bitInterleave */128struct MortonCodeGenerator129{130__forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)131: mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}132133__forceinline ~MortonCodeGenerator()134{135if (slots != 0)136{137const vint4 code = bitInterleave(ax,ay,az);138for (size_t i=0; i<slots; i++) {139dest[currentID-slots+i].index = ai[i];140dest[currentID-slots+i].code = code[i];141}142}143}144145__forceinline void operator() (const BBox3fa& b, const unsigned index)146{147const vint4 binID = mapping.bin(b);148ax[slots] = extract<0>(binID);149ay[slots] = extract<1>(binID);150az[slots] = extract<2>(binID);151ai[slots] = index;152slots++;153currentID++;154155if (slots == 4)156{157const vint4 code = bitInterleave(ax,ay,az);158vint4::storeu(&dest[currentID-4],unpacklo(code,ai));159vint4::storeu(&dest[currentID-2],unpackhi(code,ai));160slots = 0;161}162}163164public:165const MortonCodeMapping mapping;166BuildPrim* dest;167size_t currentID;168size_t slots;169vint4 ax, ay, az, ai;170};171172#endif173174template<175typename ReductionTy,176typename Allocator,177typename CreateAllocator,178typename CreateNodeFunc,179typename SetNodeBoundsFunc,180typename CreateLeafFunc,181typename CalculateBounds,182typename ProgressMonitor>183184class BuilderT : private Settings185{186ALIGNED_CLASS_(16);187188public:189190BuilderT (CreateAllocator& createAllocator,191CreateNodeFunc& createNode,192SetNodeBoundsFunc& setBounds,193CreateLeafFunc& createLeaf,194CalculateBounds& calculateBounds,195ProgressMonitor& progressMonitor,196const Settings& settings)197198: Settings(settings),199createAllocator(createAllocator),200createNode(createNode),201setBounds(setBounds),202createLeaf(createLeaf),203calculateBounds(calculateBounds),204progressMonitor(progressMonitor),205morton(nullptr) {}206207ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)208{209/* this should never occur but is a fatal error */210if (depth > maxDepth)211throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");212213/* create leaf for few primitives */214if (current.size() <= maxLeafSize)215return createLeaf(current,alloc);216217/* fill all children by always splitting the largest one */218range<unsigned> children[MAX_BRANCHING_FACTOR];219size_t numChildren = 1;220children[0] = current;221222do {223224/* find best child with largest number of primitives */225size_t bestChild = -1;226size_t bestSize = 0;227for (size_t i=0; i<numChildren; i++)228{229/* ignore leaves as they cannot get split */230if (children[i].size() <= maxLeafSize)231continue;232233/* remember child with largest size */234if (children[i].size() > bestSize) {235bestSize = children[i].size();236bestChild = i;237}238}239if (bestChild == size_t(-1)) break;240241/*! split best child into left and right child */242auto split = children[bestChild].split();243244/* add new children left and right */245children[bestChild] = children[numChildren-1];246children[numChildren-1] = split.first;247children[numChildren+0] = split.second;248numChildren++;249250} while (numChildren < branchingFactor);251252/* create node */253auto node = createNode(alloc,numChildren);254255/* recurse into each child */256ReductionTy bounds[MAX_BRANCHING_FACTOR];257for (size_t i=0; i<numChildren; i++)258bounds[i] = createLargeLeaf(depth+1,children[i],alloc);259260return setBounds(node,bounds,numChildren);261}262263/*! recreates morton codes when reaching a region where all codes are identical */264__noinline void recreateMortonCodes(const range<unsigned>& current) const265{266/* fast path for small ranges */267if (likely(current.size() < 1024))268{269/*! recalculate centroid bounds */270BBox3fa centBounds(empty);271for (size_t i=current.begin(); i<current.end(); i++)272centBounds.extend(center2(calculateBounds(morton[i])));273274/* recalculate morton codes */275MortonCodeMapping mapping(centBounds);276for (size_t i=current.begin(); i<current.end(); i++)277morton[i].code = mapping.code(calculateBounds(morton[i]));278279/* sort morton codes */280std::sort(morton+current.begin(),morton+current.end());281}282else283{284/*! recalculate centroid bounds */285auto calculateCentBounds = [&] ( const range<unsigned>& r ) {286BBox3fa centBounds = empty;287for (size_t i=r.begin(); i<r.end(); i++)288centBounds.extend(center2(calculateBounds(morton[i])));289return centBounds;290};291const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),292BBox3fa(empty), calculateCentBounds, BBox3fa::merge);293294/* recalculate morton codes */295MortonCodeMapping mapping(centBounds);296parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {297for (size_t i=r.begin(); i<r.end(); i++) {298morton[i].code = mapping.code(calculateBounds(morton[i]));299}300});301302/*! sort morton codes */303#if defined(TASKING_TBB)304tbb::parallel_sort(morton+current.begin(),morton+current.end());305#else306radixsort32(morton+current.begin(),current.size());307#endif308}309}310311__forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const312{313const unsigned int code_start = morton[current.begin()].code;314const unsigned int code_end = morton[current.end()-1].code;315unsigned int bitpos = lzcnt(code_start^code_end);316317/* if all items mapped to same morton code, then re-create new morton codes for the items */318if (unlikely(bitpos == 32))319{320recreateMortonCodes(current);321const unsigned int code_start = morton[current.begin()].code;322const unsigned int code_end = morton[current.end()-1].code;323bitpos = lzcnt(code_start^code_end);324325/* if the morton code is still the same, goto fall back split */326if (unlikely(bitpos == 32)) {327current.split(left,right);328return;329}330}331332/* split the items at the topmost different morton code bit */333const unsigned int bitpos_diff = 31-bitpos;334const unsigned int bitmask = 1 << bitpos_diff;335336/* find location where bit differs using binary search */337unsigned begin = current.begin();338unsigned end = current.end();339while (begin + 1 != end) {340const unsigned mid = (begin+end)/2;341const unsigned bit = morton[mid].code & bitmask;342if (bit == 0) begin = mid; else end = mid;343}344unsigned center = end;345#if defined(DEBUG)346for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0);347for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask);348#endif349350left = make_range(current.begin(),center);351right = make_range(center,current.end());352}353354ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)355{356/* get thread local allocator */357if (!alloc)358alloc = createAllocator();359360/* call memory monitor function to signal progress */361if (toplevel && current.size() <= singleThreadThreshold)362progressMonitor(current.size());363364/* create leaf node */365if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))366return createLargeLeaf(depth,current,alloc);367368/* fill all children by always splitting the one with the largest surface area */369range<unsigned> children[MAX_BRANCHING_FACTOR];370split(current,children[0],children[1]);371size_t numChildren = 2;372373while (numChildren < branchingFactor)374{375/* find best child with largest number of primitives */376int bestChild = -1;377unsigned bestItems = 0;378for (unsigned int i=0; i<numChildren; i++)379{380/* ignore leaves as they cannot get split */381if (children[i].size() <= minLeafSize)382continue;383384/* remember child with largest area */385if (children[i].size() > bestItems) {386bestItems = children[i].size();387bestChild = i;388}389}390if (bestChild == -1) break;391392/*! split best child into left and right child */393range<unsigned> left, right;394split(children[bestChild],left,right);395396/* add new children left and right */397children[bestChild] = children[numChildren-1];398children[numChildren-1] = left;399children[numChildren+0] = right;400numChildren++;401}402403/* create leaf node if no split is possible */404if (unlikely(numChildren == 1))405return createLeaf(current,alloc);406407/* allocate node */408auto node = createNode(alloc,numChildren);409410/* process top parts of tree parallel */411ReductionTy bounds[MAX_BRANCHING_FACTOR];412if (current.size() > singleThreadThreshold)413{414/*! parallel_for is faster than spawning sub-tasks */415parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {416for (size_t i=r.begin(); i<r.end(); i++) {417bounds[i] = recurse(depth+1,children[i],nullptr,true);418_mm_mfence(); // to allow non-temporal stores during build419}420});421}422423/* finish tree sequentially */424else425{426for (size_t i=0; i<numChildren; i++)427bounds[i] = recurse(depth+1,children[i],alloc,false);428}429430return setBounds(node,bounds,numChildren);431}432433/* build function */434ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)435{436/* sort morton codes */437morton = src;438radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);439440/* build BVH */441const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);442_mm_mfence(); // to allow non-temporal stores during build443return root;444}445446public:447CreateAllocator& createAllocator;448CreateNodeFunc& createNode;449SetNodeBoundsFunc& setBounds;450CreateLeafFunc& createLeaf;451CalculateBounds& calculateBounds;452ProgressMonitor& progressMonitor;453454public:455BuildPrim* morton;456};457458459template<460typename ReductionTy,461typename CreateAllocFunc,462typename CreateNodeFunc,463typename SetBoundsFunc,464typename CreateLeafFunc,465typename CalculateBoundsFunc,466typename ProgressMonitor>467468static ReductionTy build(CreateAllocFunc createAllocator,469CreateNodeFunc createNode,470SetBoundsFunc setBounds,471CreateLeafFunc createLeaf,472CalculateBoundsFunc calculateBounds,473ProgressMonitor progressMonitor,474BuildPrim* src,475BuildPrim* tmp,476size_t numPrimitives,477const Settings& settings)478{479typedef BuilderT<480ReductionTy,481decltype(createAllocator()),482CreateAllocFunc,483CreateNodeFunc,484SetBoundsFunc,485CreateLeafFunc,486CalculateBoundsFunc,487ProgressMonitor> Builder;488489Builder builder(createAllocator,490createNode,491setBounds,492createLeaf,493calculateBounds,494progressMonitor,495settings);496497return builder.build(src,tmp,numPrimitives);498}499};500}501}502503504