Path: blob/master/thirdparty/embree/kernels/bvh/bvh_traverser1.h
9906 views
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45#include "bvh.h"6#include "node_intersector1.h"7#include "../common/stack_item.h"89#define NEW_SORTING_CODE 11011namespace embree12{13namespace isa14{15/*! BVH regular node traversal for single rays. */16template<int N, int types>17class BVHNNodeTraverser1Hit;1819#if defined(__AVX512VL__) // SKX2021template<int N>22__forceinline void isort_update(vint<N> &dist, const vint<N> &d)23{24const vint<N> dist_shift = align_shift_right<N-1>(dist,dist);25const vboolf<N> m_geq = d >= dist;26const vboolf<N> m_geq_shift = m_geq << 1;27dist = select(m_geq,d,dist);28dist = select(m_geq_shift,dist_shift,dist);29}3031template<int N>32__forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) {33dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero)));34}3536__forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) {37return toScalar(permutex2var((__m256i)index,n0,n1));38}3940__forceinline float permuteExtract(const vint8& index, const vfloat8& n) {41return toScalar(permute(n,index));42}4344#endif4546/* Specialization for BVH4. */47template<int types>48class BVHNNodeTraverser1Hit<4, types>49{50typedef BVH4 BVH;51typedef BVH4::NodeRef NodeRef;52typedef BVH4::BaseNode BaseNode;535455public:56/* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */57static __forceinline void traverseClosestHit(NodeRef& cur,58size_t mask,59const vfloat4& tNear,60StackItemT<NodeRef>*& stackPtr,61StackItemT<NodeRef>* stackEnd)62{63assert(mask != 0);64const BaseNode* node = cur.baseNode();6566/*! one child is hit, continue with that child */67size_t r = bscf(mask);68cur = node->child(r);69BVH::prefetch(cur,types);70if (likely(mask == 0)) {71assert(cur != BVH::emptyNode);72return;73}7475/*! two children are hit, push far child, and continue with closer child */76NodeRef c0 = cur;77const unsigned int d0 = ((unsigned int*)&tNear)[r];78r = bscf(mask);79NodeRef c1 = node->child(r);80BVH::prefetch(c1,types);81const unsigned int d1 = ((unsigned int*)&tNear)[r];82assert(c0 != BVH::emptyNode);83assert(c1 != BVH::emptyNode);84if (likely(mask == 0)) {85assert(stackPtr < stackEnd);86if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }87else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }88}8990#if NEW_SORTING_CODE == 191vint4 s0((size_t)c0,(size_t)d0);92vint4 s1((size_t)c1,(size_t)d1);93r = bscf(mask);94NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r];95vint4 s2((size_t)c2,(size_t)d2);96/* 3 hits */97if (likely(mask == 0)) {98StackItemT<NodeRef>::sort3(s0,s1,s2);99*(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;100cur = toSizeT(s2);101stackPtr+=2;102return;103}104r = bscf(mask);105NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r];106vint4 s3((size_t)c3,(size_t)d3);107/* 4 hits */108StackItemT<NodeRef>::sort4(s0,s1,s2,s3);109*(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;110cur = toSizeT(s3);111stackPtr+=3;112#else113/*! Here starts the slow path for 3 or 4 hit children. We push114* all nodes onto the stack to sort them there. */115assert(stackPtr < stackEnd);116stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;117assert(stackPtr < stackEnd);118stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;119120/*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */121assert(stackPtr < stackEnd);122r = bscf(mask);123NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;124assert(c != BVH::emptyNode);125if (likely(mask == 0)) {126sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);127cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;128return;129}130131/*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */132assert(stackPtr < stackEnd);133r = bscf(mask);134c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;135assert(c != BVH::emptyNode);136sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);137cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;138#endif139}140141/* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */142static __forceinline void traverseAnyHit(NodeRef& cur,143size_t mask,144const vfloat4& tNear,145NodeRef*& stackPtr,146NodeRef* stackEnd)147{148const BaseNode* node = cur.baseNode();149150/*! one child is hit, continue with that child */151size_t r = bscf(mask);152cur = node->child(r);153BVH::prefetch(cur,types);154155/* simpler in sequence traversal order */156assert(cur != BVH::emptyNode);157if (likely(mask == 0)) return;158assert(stackPtr < stackEnd);159*stackPtr = cur; stackPtr++;160161for (; ;)162{163r = bscf(mask);164cur = node->child(r); BVH::prefetch(cur,types);165assert(cur != BVH::emptyNode);166if (likely(mask == 0)) return;167assert(stackPtr < stackEnd);168*stackPtr = cur; stackPtr++;169}170}171};172173/* Specialization for BVH8. */174template<int types>175class BVHNNodeTraverser1Hit<8, types>176{177typedef BVH8 BVH;178typedef BVH8::NodeRef NodeRef;179typedef BVH8::BaseNode BaseNode;180181#if defined(__AVX512VL__)182template<class NodeRef, class BaseNode>183static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur,184size_t mask,185const vfloat8& tNear,186StackItemT<NodeRef>*& stackPtr,187StackItemT<NodeRef>* stackEnd)188{189assert(mask != 0);190const BaseNode* node = cur.baseNode();191const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]);192const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]);193vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step);194distance_i = vint8::compact((int)mask,distance_i,distance_i);195cur = permuteExtract(distance_i,n0,n1);196BVH::prefetch(cur,types);197198mask &= mask-1;199if (likely(mask == 0)) return;200201/* 2 hits: order A0 B0 */202const vint8 d0(distance_i);203const vint8 d1(shuffle<1>(distance_i));204cur = permuteExtract(d1,n0,n1);205BVH::prefetch(cur,types);206207const vint8 dist_A0 = min(d0, d1);208const vint8 dist_B0 = max(d0, d1);209assert(dist_A0[0] < dist_B0[0]);210211mask &= mask-1;212if (likely(mask == 0)) {213cur = permuteExtract(dist_A0,n0,n1);214stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1);215*(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear);216stackPtr++;217return;218}219220/* 3 hits: order A1 B1 C1 */221222const vint8 d2(shuffle<2>(distance_i));223cur = permuteExtract(d2,n0,n1);224BVH::prefetch(cur,types);225226const vint8 dist_A1 = min(dist_A0,d2);227const vint8 dist_tmp_B1 = max(dist_A0,d2);228const vint8 dist_B1 = min(dist_B0,dist_tmp_B1);229const vint8 dist_C1 = max(dist_B0,dist_tmp_B1);230assert(dist_A1[0] < dist_B1[0]);231assert(dist_B1[0] < dist_C1[0]);232233mask &= mask-1;234if (likely(mask == 0)) {235cur = permuteExtract(dist_A1,n0,n1);236stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1);237*(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear);238stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1);239*(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear);240stackPtr+=2;241return;242}243244/* 4 hits: order A2 B2 C2 D2 */245246const vint8 d3(shuffle<3>(distance_i));247cur = permuteExtract(d3,n0,n1);248BVH::prefetch(cur,types);249250const vint8 dist_A2 = min(dist_A1,d3);251const vint8 dist_tmp_B2 = max(dist_A1,d3);252const vint8 dist_B2 = min(dist_B1,dist_tmp_B2);253const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2);254const vint8 dist_C2 = min(dist_C1,dist_tmp_C2);255const vint8 dist_D2 = max(dist_C1,dist_tmp_C2);256assert(dist_A2[0] < dist_B2[0]);257assert(dist_B2[0] < dist_C2[0]);258assert(dist_C2[0] < dist_D2[0]);259260mask &= mask-1;261if (likely(mask == 0)) {262cur = permuteExtract(dist_A2,n0,n1);263stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1);264*(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear);265stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1);266*(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear);267stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1);268*(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear);269stackPtr+=3;270return;271}272273/* >=5 hits: reverse to descending order for writing to stack */274275distance_i = align_shift_right<3>(distance_i,distance_i);276const size_t hits = 4 + popcnt(mask);277vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert278279isort_quick_update<8>(dist,dist_A2);280isort_quick_update<8>(dist,dist_B2);281isort_quick_update<8>(dist,dist_C2);282isort_quick_update<8>(dist,dist_D2);283284do {285286distance_i = align_shift_right<1>(distance_i,distance_i);287cur = permuteExtract(distance_i,n0,n1);288BVH::prefetch(cur,types);289const vint8 new_dist(permute(distance_i,vint8(zero)));290mask &= mask-1;291isort_update<8>(dist,new_dist);292293} while(mask);294295for (size_t i=0; i<7; i++)296assert(dist[i+0]>=dist[i+1]);297298for (size_t i=0;i<hits-1;i++)299{300stackPtr->ptr = permuteExtract(dist,n0,n1);301*(float*)&stackPtr->dist = permuteExtract(dist,tNear);302dist = align_shift_right<1>(dist,dist);303stackPtr++;304}305cur = permuteExtract(dist,n0,n1);306}307#endif308309public:310static __forceinline void traverseClosestHit(NodeRef& cur,311size_t mask,312const vfloat8& tNear,313StackItemT<NodeRef>*& stackPtr,314StackItemT<NodeRef>* stackEnd)315{316assert(mask != 0);317#if defined(__AVX512VL__)318traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);319#else320321const BaseNode* node = cur.baseNode();322323/*! one child is hit, continue with that child */324size_t r = bscf(mask);325cur = node->child(r);326BVH::prefetch(cur,types);327if (likely(mask == 0)) {328assert(cur != BVH::emptyNode);329return;330}331332/*! two children are hit, push far child, and continue with closer child */333NodeRef c0 = cur;334const unsigned int d0 = ((unsigned int*)&tNear)[r];335r = bscf(mask);336NodeRef c1 = node->child(r);337BVH::prefetch(c1,types);338const unsigned int d1 = ((unsigned int*)&tNear)[r];339340assert(c0 != BVH::emptyNode);341assert(c1 != BVH::emptyNode);342if (likely(mask == 0)) {343assert(stackPtr < stackEnd);344if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }345else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }346}347#if NEW_SORTING_CODE == 1348vint4 s0((size_t)c0,(size_t)d0);349vint4 s1((size_t)c1,(size_t)d1);350351r = bscf(mask);352NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r];353vint4 s2((size_t)c2,(size_t)d2);354/* 3 hits */355if (likely(mask == 0)) {356StackItemT<NodeRef>::sort3(s0,s1,s2);357*(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;358cur = toSizeT(s2);359stackPtr+=2;360return;361}362r = bscf(mask);363NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r];364vint4 s3((size_t)c3,(size_t)d3);365/* 4 hits */366if (likely(mask == 0)) {367StackItemT<NodeRef>::sort4(s0,s1,s2,s3);368*(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;369cur = toSizeT(s3);370stackPtr+=3;371return;372}373*(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3;374/*! fallback case if more than 4 children are hit */375StackItemT<NodeRef>* stackFirst = stackPtr;376stackPtr+=4;377while (1)378{379assert(stackPtr < stackEnd);380r = bscf(mask);381NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r];382const vint4 s((size_t)c,(size_t)d);383*(vint4*)stackPtr++ = s;384assert(c != BVH::emptyNode);385if (unlikely(mask == 0)) break;386}387sort(stackFirst,stackPtr);388cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;389#else390/*! Here starts the slow path for 3 or 4 hit children. We push391* all nodes onto the stack to sort them there. */392assert(stackPtr < stackEnd);393stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;394assert(stackPtr < stackEnd);395stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;396397/*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */398assert(stackPtr < stackEnd);399r = bscf(mask);400NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;401assert(c != BVH::emptyNode);402if (likely(mask == 0)) {403sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);404cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;405return;406}407408/*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */409assert(stackPtr < stackEnd);410r = bscf(mask);411c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;412assert(c != BVH::emptyNode);413if (likely(mask == 0)) {414sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);415cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;416return;417}418/*! fallback case if more than 4 children are hit */419StackItemT<NodeRef>* stackFirst = stackPtr-4;420while (1)421{422assert(stackPtr < stackEnd);423r = bscf(mask);424c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;425assert(c != BVH::emptyNode);426if (unlikely(mask == 0)) break;427}428sort(stackFirst,stackPtr);429cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;430#endif431#endif432}433434static __forceinline void traverseAnyHit(NodeRef& cur,435size_t mask,436const vfloat8& tNear,437NodeRef*& stackPtr,438NodeRef* stackEnd)439{440const BaseNode* node = cur.baseNode();441442/*! one child is hit, continue with that child */443size_t r = bscf(mask);444cur = node->child(r);445BVH::prefetch(cur,types);446447/* simpler in sequence traversal order */448assert(cur != BVH::emptyNode);449if (likely(mask == 0)) return;450assert(stackPtr < stackEnd);451*stackPtr = cur; stackPtr++;452453for (; ;)454{455r = bscf(mask);456cur = node->child(r); BVH::prefetch(cur,types);457assert(cur != BVH::emptyNode);458if (likely(mask == 0)) return;459assert(stackPtr < stackEnd);460*stackPtr = cur; stackPtr++;461}462}463};464}465}466467468