// Path: blob/master/thirdparty/embree/common/simd/vfloat16_avx512.h
// (web viewer residue: 9912 views)
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45#define vboolf vboolf_impl6#define vboold vboold_impl7#define vint vint_impl8#define vuint vuint_impl9#define vllong vllong_impl10#define vfloat vfloat_impl11#define vdouble vdouble_impl1213namespace embree14{15/* 16-wide AVX-512 float type */16template<>17struct vfloat<16>18{19ALIGNED_STRUCT_(64);2021typedef vboolf16 Bool;22typedef vint16 Int;23typedef vfloat16 Float;2425enum { size = 16 }; // number of SIMD elements26union { // data27__m512 v;28float f[16];29int i[16];30};3132////////////////////////////////////////////////////////////////////////////////33/// Constructors, Assignment & Cast Operators34////////////////////////////////////////////////////////////////////////////////3536__forceinline vfloat() {}37__forceinline vfloat(const vfloat16& t) { v = t; }38__forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; }3940__forceinline vfloat(const __m512& t) { v = t; }41__forceinline operator __m512() const { return v; }42__forceinline operator __m256() const { return _mm512_castps512_ps256(v); }43__forceinline operator __m128() const { return _mm512_castps512_ps128(v); }4445__forceinline vfloat(float f) {46v = _mm512_set1_ps(f);47}4849__forceinline vfloat(float a, float b, float c, float d) {50v = _mm512_set4_ps(a, b, c, d);51}5253__forceinline vfloat(const vfloat4& i) {54v = _mm512_broadcast_f32x4(i);55}5657__forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) {58v = _mm512_castps128_ps512(a);59v = _mm512_insertf32x4(v, b, 1);60v = _mm512_insertf32x4(v, c, 2);61v = _mm512_insertf32x4(v, d, 3);62}6364__forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) {65v = _mm512_broadcast_f32x4(a);66v = _mm512_mask_broadcast_f32x4(v,mask,b);67}6869__forceinline vfloat(const vfloat8& i) {70v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i)));71}7273__forceinline vfloat(const 
vfloat8& a, const vfloat8& b) {74v = _mm512_castps256_ps512(a);75#if defined(__AVX512DQ__)76v = _mm512_insertf32x8(v, b, 1);77#else78v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1));79#endif80}8182/* WARNING: due to f64x4 the mask is considered as an 8bit mask */83/*__forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) {84__m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a));85aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b));86v = _mm512_castpd_ps(aa);87}*/8889__forceinline explicit vfloat(const vint16& a) {90v = _mm512_cvtepi32_ps(a);91}9293__forceinline explicit vfloat(const vuint16& a) {94v = _mm512_cvtepu32_ps(a);95}9697////////////////////////////////////////////////////////////////////////////////98/// Constants99////////////////////////////////////////////////////////////////////////////////100101__forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {}102__forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {}103__forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {}104__forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {}105__forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}106__forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {}107__forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {}108109////////////////////////////////////////////////////////////////////////////////110/// Loads and Stores111////////////////////////////////////////////////////////////////////////////////112113static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); }114static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); }115116static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); }117static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return 
_mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); }118119static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); }120static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); }121122static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); }123static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); }124125static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) {126_mm512_stream_ps((float*)ptr,a);127}128129static __forceinline vfloat16 broadcast(const float* f) {130return _mm512_set1_ps(*f);131}132133template<int scale = 4>134static __forceinline vfloat16 gather(const float* ptr, const vint16& index) {135return _mm512_i32gather_ps(index, ptr, scale);136}137138template<int scale = 4>139static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) {140vfloat16 r = zero;141return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale);142}143144template<int scale = 4>145static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) {146_mm512_i32scatter_ps(ptr, index, v, scale);147}148149template<int scale = 4>150static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) {151_mm512_mask_i32scatter_ps(ptr, mask, index, v, scale);152}153154////////////////////////////////////////////////////////////////////////////////155/// Array Access156////////////////////////////////////////////////////////////////////////////////157158__forceinline float& operator [](size_t index) { assert(index < 16); return f[index]; }159__forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; }160};161162////////////////////////////////////////////////////////////////////////////////163/// Unary 
Operators164////////////////////////////////////////////////////////////////////////////////165166__forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); }167__forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); }168__forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); }169170__forceinline vint16 toInt (const vfloat16& a) { return vint16(a); }171__forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); }172173__forceinline vfloat16 operator +(const vfloat16& a) { return a; }174__forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); }175176__forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); }177__forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); }178179__forceinline vfloat16 rcp(const vfloat16& a)180{181const vfloat16 r = _mm512_rcp14_ps(a);182return _mm512_fmadd_ps(r, _mm512_fnmadd_ps(a, r, vfloat16(1.0)), r); // computes r + r * (1 - a*r)183}184185__forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); }186__forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); }187188__forceinline vfloat16 rsqrt(const vfloat16& a)189{190const vfloat16 r = _mm512_rsqrt14_ps(a);191return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r,192_mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r)));193}194195////////////////////////////////////////////////////////////////////////////////196/// Binary Operators197////////////////////////////////////////////////////////////////////////////////198199__forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); }200__forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); 
}201__forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; }202203__forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); }204__forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); }205__forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; }206207__forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); }208__forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); }209__forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; }210211__forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); }212__forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); }213__forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; }214215__forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); }216__forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); }217__forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {218return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));219}220221__forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a,b); }222__forceinline vfloat16 min(const vfloat16& a, float b) { return _mm512_min_ps(a,vfloat16(b)); }223__forceinline vfloat16 min(const float& a, const vfloat16& b) { return _mm512_min_ps(vfloat16(a),b); }224225__forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a,b); }226__forceinline vfloat16 max(const vfloat16& a, float b) { return _mm512_max_ps(a,vfloat16(b)); }227__forceinline vfloat16 max(const float& a, const vfloat16& b) { return _mm512_max_ps(vfloat16(a),b); }228229__forceinline vfloat16 
mini(const vfloat16& a, const vfloat16& b) {230const vint16 ai = _mm512_castps_si512(a);231const vint16 bi = _mm512_castps_si512(b);232const vint16 ci = _mm512_min_epi32(ai,bi);233return _mm512_castsi512_ps(ci);234}235236__forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) {237const vint16 ai = _mm512_castps_si512(a);238const vint16 bi = _mm512_castps_si512(b);239const vint16 ci = _mm512_max_epi32(ai,bi);240return _mm512_castsi512_ps(ci);241}242243////////////////////////////////////////////////////////////////////////////////244/// Ternary Operators245////////////////////////////////////////////////////////////////////////////////246247__forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); }248__forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); }249__forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); }250__forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); }251252////////////////////////////////////////////////////////////////////////////////253/// Assignment Operators254////////////////////////////////////////////////////////////////////////////////255256__forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; }257__forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; }258259__forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; }260__forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; }261262__forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; }263__forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; }264265__forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; }266__forceinline vfloat16& 
operator /=(vfloat16& a, float b) { return a = a / b; }267268////////////////////////////////////////////////////////////////////////////////269/// Comparison Operators + Select270////////////////////////////////////////////////////////////////////////////////271272__forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }273__forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); }274__forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; }275276__forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }277__forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); }278__forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; }279280__forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }281__forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); }282__forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; }283284__forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }285__forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); }286__forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; }287288__forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }289__forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); }290__forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; }291292__forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }293__forceinline vboolf16 operator 
<=(const vfloat16& a, float b) { return a <= vfloat16(b); }294__forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; }295296__forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }297__forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }298__forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }299__forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }300__forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }301__forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }302303__forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); }304__forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); }305__forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); }306__forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); }307__forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); }308__forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); }309310__forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) {311return _mm512_mask_blend_ps(s, f, t);312}313314__forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) {315return 
madd(t,b-a,a);316}317318__forceinline bool isvalid (const vfloat16& v) {319return all((v > vfloat16(-FLT_LARGE)) & (v < vfloat16(+FLT_LARGE)));320}321322__forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b)323{324vfloat16 c = a;325a = select(m,b,a);326b = select(m,c,b);327}328329////////////////////////////////////////////////////////////////////////////////330/// Rounding Functions331////////////////////////////////////////////////////////////////////////////////332333__forceinline vfloat16 floor(const vfloat16& a) {334return _mm512_floor_ps(a);335}336__forceinline vfloat16 ceil (const vfloat16& a) {337return _mm512_ceil_ps(a);338}339__forceinline vfloat16 round (const vfloat16& a) {340return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);341}342__forceinline vint16 floori (const vfloat16& a) {343return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);344}345346////////////////////////////////////////////////////////////////////////////////347/// Movement/Shifting/Shuffling Functions348////////////////////////////////////////////////////////////////////////////////349350__forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); }351__forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); }352353template<int i>354__forceinline vfloat16 shuffle(const vfloat16& v) {355return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i));356}357358template<int i0, int i1, int i2, int i3>359__forceinline vfloat16 shuffle(const vfloat16& v) {360return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));361}362363template<int i>364__forceinline vfloat16 shuffle4(const vfloat16& v) {365return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i));366}367368template<int i0, int i1, int i2, int i3>369__forceinline vfloat16 shuffle4(const vfloat16& v) {370return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0));371}372373__forceinline vfloat16 
interleave4_even(const vfloat16& a, const vfloat16& b) {374return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e));375}376377__forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) {378return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e));379}380381__forceinline vfloat16 permute(vfloat16 v, __m512i index) {382return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v)));383}384385__forceinline vfloat16 reverse(const vfloat16& v) {386return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0));387}388389template<int i>390__forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) {391return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i));392};393394template<int i>395__forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) {396return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i));397};398399__forceinline vfloat16 shift_left_1(const vfloat16& a) {400vfloat16 z = zero;401return mask_align_shift_right<15>(0xfffe,z,a,a);402}403404__forceinline vfloat16 shift_right_1(const vfloat16& x) {405return align_shift_right<1>(zero,x);406}407408__forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); }409410411template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); }412413template<int N, int i>414vfloat<N> extractN(const vfloat16& v);415416template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); }417template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); }418template<> 
__forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); }419template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); }420421template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); }422template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); }423424template<int i> __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); }425template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); }426427template<int i> __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); }428template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); }429430////////////////////////////////////////////////////////////////////////////////431/// Transpose432////////////////////////////////////////////////////////////////////////////////433434__forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,435vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)436{437vfloat16 a0a2_b0b2 = unpacklo(r0, r2);438vfloat16 c0c2_d0d2 = unpackhi(r0, r2);439vfloat16 a1a3_b1b3 = unpacklo(r1, r3);440vfloat16 c1c3_d1d3 = unpackhi(r1, r3);441442c0 = unpacklo(a0a2_b0b2, a1a3_b1b3);443c1 = unpackhi(a0a2_b0b2, a1a3_b1b3);444c2 = unpacklo(c0c2_d0d2, c1c3_d1d3);445c3 = unpackhi(c0c2_d0d2, c1c3_d1d3);446}447448__forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3,449const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,450const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11,451const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15,452vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)453{454return transpose(vfloat16(r0, r4, r8, 
r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15),455c0, c1, c2, c3);456}457458__forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,459const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7,460vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,461vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)462{463vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3;464transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3);465466vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7;467transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7);468469c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7);470c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);471c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);472c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);473c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7);474c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);475c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);476c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);477}478479__forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3,480const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7,481const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11,482const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15,483vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,484vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)485{486return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11),487vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15),488c0, c1, c2, c3, c4, c5, c6, 
c7);489}490491////////////////////////////////////////////////////////////////////////////////492/// Reductions493////////////////////////////////////////////////////////////////////////////////494495__forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); }496__forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }497__forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); }498__forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); }499500__forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); }501__forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }502__forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }503__forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }504505__forceinline vfloat16 vreduce_max2(vfloat16 x) { return max(x, shuffle<1,0,3,2>(x)); }506__forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }507__forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }508__forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }509510__forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); }511__forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); }512__forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); }513514__forceinline size_t select_min(const vfloat16& v) {515return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ)));516}517518__forceinline size_t select_max(const vfloat16& v) {519return 
bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ)));520}521522__forceinline size_t select_min(const vboolf16& valid, const vfloat16& v)523{524const vfloat16 a = select(valid,v,vfloat16(pos_inf));525const vbool16 valid_min = valid & (a == vreduce_min(a));526return bsf(movemask(any(valid_min) ? valid_min : valid));527}528529__forceinline size_t select_max(const vboolf16& valid, const vfloat16& v)530{531const vfloat16 a = select(valid,v,vfloat16(neg_inf));532const vbool16 valid_max = valid & (a == vreduce_max(a));533return bsf(movemask(any(valid_max) ? valid_max : valid));534}535536__forceinline vfloat16 prefix_sum(const vfloat16& a)537{538const vfloat16 z(zero);539vfloat16 v = a;540v = v + align_shift_right<16-1>(v,z);541v = v + align_shift_right<16-2>(v,z);542v = v + align_shift_right<16-4>(v,z);543v = v + align_shift_right<16-8>(v,z);544return v;545}546547__forceinline vfloat16 reverse_prefix_sum(const vfloat16& a)548{549const vfloat16 z(zero);550vfloat16 v = a;551v = v + align_shift_right<1>(z,v);552v = v + align_shift_right<2>(z,v);553v = v + align_shift_right<4>(z,v);554v = v + align_shift_right<8>(z,v);555return v;556}557558__forceinline vfloat16 prefix_min(const vfloat16& a)559{560const vfloat16 z(pos_inf);561vfloat16 v = a;562v = min(v,align_shift_right<16-1>(v,z));563v = min(v,align_shift_right<16-2>(v,z));564v = min(v,align_shift_right<16-4>(v,z));565v = min(v,align_shift_right<16-8>(v,z));566return v;567}568569__forceinline vfloat16 prefix_max(const vfloat16& a)570{571const vfloat16 z(neg_inf);572vfloat16 v = a;573v = max(v,align_shift_right<16-1>(v,z));574v = max(v,align_shift_right<16-2>(v,z));575v = max(v,align_shift_right<16-4>(v,z));576v = max(v,align_shift_right<16-8>(v,z));577return v;578}579580581__forceinline vfloat16 reverse_prefix_min(const vfloat16& a)582{583const vfloat16 z(pos_inf);584vfloat16 v = a;585v = min(v,align_shift_right<1>(z,v));586v = 
min(v,align_shift_right<2>(z,v));587v = min(v,align_shift_right<4>(z,v));588v = min(v,align_shift_right<8>(z,v));589return v;590}591592__forceinline vfloat16 reverse_prefix_max(const vfloat16& a)593{594const vfloat16 z(neg_inf);595vfloat16 v = a;596v = max(v,align_shift_right<1>(z,v));597v = max(v,align_shift_right<2>(z,v));598v = max(v,align_shift_right<4>(z,v));599v = max(v,align_shift_right<8>(z,v));600return v;601}602603__forceinline vfloat16 rcp_safe(const vfloat16& a) {604return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input)));605}606607////////////////////////////////////////////////////////////////////////////////608/// Output Operators609////////////////////////////////////////////////////////////////////////////////610611__forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v)612{613cout << "<" << v[0];614for (int i=1; i<16; i++) cout << ", " << v[i];615cout << ">";616return cout;617}618}619620#undef vboolf621#undef vboold622#undef vint623#undef vuint624#undef vllong625#undef vfloat626#undef vdouble627628629