// Path: blob/master/thirdparty/embree/common/math/vec3fa.h
// 9912 views
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45#include "../sys/alloc.h"6#include "emath.h"78#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)9# include "vec3fa_sycl.h"10#else1112#include "../simd/sse.h"1314namespace embree15{16////////////////////////////////////////////////////////////////////////////////17/// SSE Vec3fa Type18////////////////////////////////////////////////////////////////////////////////1920struct __aligned(16) Vec3fa21{22ALIGNED_STRUCT_(16);2324typedef float Scalar;25enum { N = 3 };26union {27__m128 m128;28struct { float x,y,z; };29};3031////////////////////////////////////////////////////////////////////////////////32/// Constructors, Assignment & Cast Operators33////////////////////////////////////////////////////////////////////////////////3435__forceinline Vec3fa( ) {}36__forceinline Vec3fa( const __m128 a ) : m128(a) {}3738__forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }39//__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }4041__forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; }42__forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }4344__forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}45__forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}4647__forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}4849__forceinline explicit operator const vfloat4() const { return vfloat4(m128); }50__forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }51__forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }52__forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }5354//__forceinline operator const __m128&() const { return 
m128; }55//__forceinline operator __m128&() { return m128; }5657////////////////////////////////////////////////////////////////////////////////58/// Loads and Stores59////////////////////////////////////////////////////////////////////////////////6061static __forceinline Vec3fa load( const void* const a ) {62#if defined(__aarch64__)63__m128 t = _mm_load_ps((float*)a);64t[3] = 0.0f;65return Vec3fa(t);66#else67return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));68#endif69}7071static __forceinline Vec3fa loadu( const void* const a ) {72return Vec3fa(_mm_loadu_ps((float*)a));73}7475static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {76_mm_storeu_ps((float*)ptr,v.m128);77}7879////////////////////////////////////////////////////////////////////////////////80/// Constants81////////////////////////////////////////////////////////////////////////////////8283__forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {}84__forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}85__forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}86__forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}8788////////////////////////////////////////////////////////////////////////////////89/// Array Access90////////////////////////////////////////////////////////////////////////////////9192__forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }93__forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }94};9596////////////////////////////////////////////////////////////////////////////////97/// Unary Operators98////////////////////////////////////////////////////////////////////////////////99100__forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }101__forceinline Vec3fa operator -( const Vec3fa& a ) {102#if defined(__aarch64__)103return vnegq_f32(a.m128);104#else105const __m128 mask = 
_mm_castsi128_ps(_mm_set1_epi32(0x80000000));106return _mm_xor_ps(a.m128, mask);107#endif108}109__forceinline Vec3fa abs ( const Vec3fa& a ) {110#if defined(__aarch64__)111return _mm_abs_ps(a.m128);112#else113const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));114return _mm_and_ps(a.m128, mask);115#endif116}117__forceinline Vec3fa sign ( const Vec3fa& a ) {118return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));119}120121__forceinline Vec3fa rcp ( const Vec3fa& a )122{123#if defined(__aarch64__)124return vdivq_f32(vdupq_n_f32(1.0f),a.m128);125#else126127#if defined(__AVX512VL__)128const Vec3fa r = _mm_rcp14_ps(a.m128);129#else130const Vec3fa r = _mm_rcp_ps(a.m128);131#endif132133#if defined(__AVX2__)134const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)135const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n136#else137const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0)138const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n139#endif140141return res;142#endif //defined(__aarch64__)143}144145__forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }146__forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }147148__forceinline Vec3fa rsqrt( const Vec3fa& a )149{150#if defined(__aarch64__)151__m128 r = _mm_rsqrt_ps(a.m128);152r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));153r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));154return r;155#else156157#if defined(__AVX512VL__)158__m128 r = _mm_rsqrt14_ps(a.m128);159#else160__m128 r = _mm_rsqrt_ps(a.m128);161#endif162return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));163#endif164}165166__forceinline 
Vec3fa zero_fix(const Vec3fa& a) {167return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));168}169__forceinline Vec3fa rcp_safe(const Vec3fa& a) {170return rcp(zero_fix(a));171}172__forceinline Vec3fa log ( const Vec3fa& a ) {173return Vec3fa(logf(a.x),logf(a.y),logf(a.z));174}175176__forceinline Vec3fa exp ( const Vec3fa& a ) {177return Vec3fa(expf(a.x),expf(a.y),expf(a.z));178}179180////////////////////////////////////////////////////////////////////////////////181/// Binary Operators182////////////////////////////////////////////////////////////////////////////////183184__forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }185__forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }186__forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }187__forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }188__forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }189__forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }190__forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }191__forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }192193__forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }194__forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }195196#if defined(__aarch64__) || defined(__SSE4_1__)197__forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {198const vint4 ai = _mm_castps_si128(a.m128);199const vint4 bi = _mm_castps_si128(b.m128);200const vint4 ci = _mm_min_epi32(ai,bi);201return _mm_castsi128_ps(ci);202}203#endif204205#if 
defined(__aarch64__) || defined(__SSE4_1__)206__forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {207const vint4 ai = _mm_castps_si128(a.m128);208const vint4 bi = _mm_castps_si128(b.m128);209const vint4 ci = _mm_max_epi32(ai,bi);210return _mm_castsi128_ps(ci);211}212#endif213214__forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {215return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));216}217218////////////////////////////////////////////////////////////////////////////////219/// Ternary Operators220////////////////////////////////////////////////////////////////////////////////221222#if defined(__AVX2__) || defined(__ARM_NEON)223__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }224__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }225__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }226__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }227#else228__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }229__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}230__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }231__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }232#endif233234__forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }235__forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }236__forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }237__forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& 
c) { return nmsub(Vec3fa(a),b,c); }238239////////////////////////////////////////////////////////////////////////////////240/// Assignment Operators241////////////////////////////////////////////////////////////////////////////////242243__forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }244__forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }245__forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }246__forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; }247__forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }248__forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; }249250////////////////////////////////////////////////////////////////////////////////251/// Reductions252////////////////////////////////////////////////////////////////////////////////253#if defined(__aarch64__)254__forceinline float reduce_add(const Vec3fa& v) {255float32x4_t t = v.m128;256t[3] = 0.0f;257return vaddvq_f32(t);258}259260__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }261__forceinline float reduce_min(const Vec3fa& v) {262float32x4_t t = v.m128;263t[3] = t[2];264return vminvq_f32(t);265}266__forceinline float reduce_max(const Vec3fa& v) {267float32x4_t t = v.m128;268t[3] = t[2];269return vmaxvq_f32(t);270}271#else272__forceinline float reduce_add(const Vec3fa& v) {273const vfloat4 a(v.m128);274const vfloat4 b = shuffle<1>(a);275const vfloat4 c = shuffle<2>(a);276return _mm_cvtss_f32(a+b+c);277}278279__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }280__forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }281__forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }282#endif283284////////////////////////////////////////////////////////////////////////////////285/// Comparison 
Operators286////////////////////////////////////////////////////////////////////////////////287288__forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }289__forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }290291__forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }292__forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }293__forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }294__forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }295#if defined(__aarch64__)296__forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }297__forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }298#else299__forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }300__forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }301#endif302303__forceinline bool isvalid ( const Vec3fa& v ) {304return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));305}306307__forceinline bool is_finite ( const Vec3fa& a ) {308return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));309}310311__forceinline bool isvalid4 ( const Vec3fa& v ) {312return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));313}314315__forceinline bool is_finite4 ( const Vec3fa& a ) {316return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));317}318319////////////////////////////////////////////////////////////////////////////////320/// Euclidean Space 
Operators321////////////////////////////////////////////////////////////////////////////////322323#if defined(__SSE4_1__)324__forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {325return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));326}327#else328__forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {329return reduce_add(a*b);330}331#endif332333__forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )334{335vfloat4 a0 = vfloat4(a.m128);336vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));337vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));338vfloat4 b1 = vfloat4(b.m128);339return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));340}341342__forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }343__forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); }344__forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); }345__forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); }346__forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); }347__forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }348__forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }349__forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); }350351__forceinline Vec3fa normalize_safe( const Vec3fa& a ) {352const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);353}354355/*! differentiated normalization */356__forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)357{358const float pp = dot(p,p);359const float pdp = dot(p,dp);360return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);361}362363////////////////////////////////////////////////////////////////////////////////364/// Select365////////////////////////////////////////////////////////////////////////////////366367__forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {368__m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();369return blendv_ps(f.m128, t.m128, mask);370}371372__forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {373return blendv_ps(f.m128, t.m128, s);374}375376__forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {377return madd(1.0f-t,v0,t*v1);378}379380__forceinline int maxDim ( const Vec3fa& a )381{382const Vec3fa b = abs(a);383if (b.x > b.y) {384if (b.x > b.z) return 0; else return 2;385} else {386if (b.y > b.z) return 1; else return 2;387}388}389390////////////////////////////////////////////////////////////////////////////////391/// Rounding Functions392////////////////////////////////////////////////////////////////////////////////393394#if defined(__aarch64__)395__forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }396__forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }397__forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }398#elif defined (__SSE4_1__)399__forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }400__forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }401__forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }402#else403__forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }404__forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }405__forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }406#endif407408////////////////////////////////////////////////////////////////////////////////409/// Output Operators410////////////////////////////////////////////////////////////////////////////////411412__forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) 
{413return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";414}415416typedef Vec3fa Vec3fa_t;417418419////////////////////////////////////////////////////////////////////////////////420/// SSE Vec3fx Type421////////////////////////////////////////////////////////////////////////////////422423struct __aligned(16) Vec3fx424{425ALIGNED_STRUCT_(16);426427typedef float Scalar;428enum { N = 3 };429union {430__m128 m128;431struct { float x,y,z; union { int a; unsigned u; float w; }; };432};433434////////////////////////////////////////////////////////////////////////////////435/// Constructors, Assignment & Cast Operators436////////////////////////////////////////////////////////////////////////////////437438__forceinline Vec3fx( ) {}439__forceinline Vec3fx( const __m128 a ) : m128(a) {}440441__forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}442__forceinline operator Vec3fa () const { return Vec3fa(m128); }443444__forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }445//__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }446447__forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; }448__forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }449450__forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}451__forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}452453__forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }454__forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }455__forceinline Vec3fx( const Vec3fa& other, const float w1) {456#if defined (__aarch64__)457m128 = other.m128; m128[3] = w1;458#elif defined (__SSE4_1__)459m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);460#else461const vint4 mask(-1,-1,-1,0);462m128 = 
select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));463#endif464}465//__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly!466//__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!467__forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}468469//__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}470471__forceinline explicit operator const vfloat4() const { return vfloat4(m128); }472__forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }473__forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }474__forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }475476//__forceinline operator const __m128&() const { return m128; }477//__forceinline operator __m128&() { return m128; }478479////////////////////////////////////////////////////////////////////////////////480/// Loads and Stores481////////////////////////////////////////////////////////////////////////////////482483static __forceinline Vec3fx load( const void* const a ) {484return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));485}486487static __forceinline Vec3fx loadu( const void* const a ) {488return Vec3fx(_mm_loadu_ps((float*)a));489}490491static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {492_mm_storeu_ps((float*)ptr,v.m128);493}494495////////////////////////////////////////////////////////////////////////////////496/// Constants497////////////////////////////////////////////////////////////////////////////////498499__forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {}500__forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {}501__forceinline Vec3fx( PosInfTy ) : 
m128(_mm_set1_ps(pos_inf)) {}502__forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}503504////////////////////////////////////////////////////////////////////////////////505/// Array Access506////////////////////////////////////////////////////////////////////////////////507508__forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }509__forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }510};511512////////////////////////////////////////////////////////////////////////////////513/// Unary Operators514////////////////////////////////////////////////////////////////////////////////515516__forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }517__forceinline Vec3fx operator -( const Vec3fx& a ) {518const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));519return _mm_xor_ps(a.m128, mask);520}521__forceinline Vec3fx abs ( const Vec3fx& a ) {522const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));523return _mm_and_ps(a.m128, mask);524}525__forceinline Vec3fx sign ( const Vec3fx& a ) {526return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));527}528529__forceinline Vec3fx rcp ( const Vec3fx& a )530{531#if defined(__AVX512VL__)532const Vec3fx r = _mm_rcp14_ps(a.m128);533#else534const Vec3fx r = _mm_rcp_ps(a.m128);535#endif536537#if defined(__AVX2__)538const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));539#else540const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));541//return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));542#endif543544return res;545}546547__forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); }548__forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); }549550__forceinline Vec3fx rsqrt( const Vec3fx& a )551{552#if defined(__AVX512VL__)553__m128 r 
= _mm_rsqrt14_ps(a.m128);554#else555__m128 r = _mm_rsqrt_ps(a.m128);556#endif557return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));558}559560__forceinline Vec3fx zero_fix(const Vec3fx& a) {561return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));562}563__forceinline Vec3fx rcp_safe(const Vec3fx& a) {564return rcp(zero_fix(a));565}566__forceinline Vec3fx log ( const Vec3fx& a ) {567return Vec3fx(logf(a.x),logf(a.y),logf(a.z));568}569570__forceinline Vec3fx exp ( const Vec3fx& a ) {571return Vec3fx(expf(a.x),expf(a.y),expf(a.z));572}573574////////////////////////////////////////////////////////////////////////////////575/// Binary Operators576////////////////////////////////////////////////////////////////////////////////577578__forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }579__forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }580__forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }581__forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }582__forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }583__forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }584__forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }585__forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }586587__forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }588__forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }589590#if defined(__SSE4_1__) || defined(__aarch64__)591__forceinline 
Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {592const vint4 ai = _mm_castps_si128(a.m128);593const vint4 bi = _mm_castps_si128(b.m128);594const vint4 ci = _mm_min_epi32(ai,bi);595return _mm_castsi128_ps(ci);596}597#endif598599#if defined(__SSE4_1__) || defined(__aarch64__)600__forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {601const vint4 ai = _mm_castps_si128(a.m128);602const vint4 bi = _mm_castps_si128(b.m128);603const vint4 ci = _mm_max_epi32(ai,bi);604return _mm_castsi128_ps(ci);605}606#endif607608__forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {609return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));610}611612////////////////////////////////////////////////////////////////////////////////613/// Ternary Operators614////////////////////////////////////////////////////////////////////////////////615616#if defined(__AVX2__)617__forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }618__forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }619__forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }620__forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }621#else622__forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }623__forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }624__forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}625__forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }626#endif627628__forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }629__forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return 
msub(Vec3fx(a),b,c); }630__forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }631__forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }632633////////////////////////////////////////////////////////////////////////////////634/// Assignment Operators635////////////////////////////////////////////////////////////////////////////////636637__forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }638__forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }639__forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }640__forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; }641__forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }642__forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; }643644////////////////////////////////////////////////////////////////////////////////645/// Reductions646////////////////////////////////////////////////////////////////////////////////647648__forceinline float reduce_add(const Vec3fx& v) {649const vfloat4 a(v.m128);650const vfloat4 b = shuffle<1>(a);651const vfloat4 c = shuffle<2>(a);652return _mm_cvtss_f32(a+b+c);653}654655__forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }656__forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }657__forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }658659////////////////////////////////////////////////////////////////////////////////660/// Comparison Operators661////////////////////////////////////////////////////////////////////////////////662663__forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }664__forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return 
(_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }665666__forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }667__forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }668__forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); }669__forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); }670__forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }671__forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }672673__forceinline bool isvalid ( const Vec3fx& v ) {674return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));675}676677__forceinline bool is_finite ( const Vec3fx& a ) {678return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));679}680681__forceinline bool isvalid4 ( const Vec3fx& v ) {682return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));683}684685__forceinline bool is_finite4 ( const Vec3fx& a ) {686return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));687}688689////////////////////////////////////////////////////////////////////////////////690/// Euclidean Space Operators691////////////////////////////////////////////////////////////////////////////////692693#if defined(__SSE4_1__)694__forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {695return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));696}697#else698__forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {699return reduce_add(a*b);700}701#endif702703__forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )704{705vfloat4 a0 = vfloat4(a.m128);706vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));707vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));708vfloat4 b1 = 
vfloat4(b.m128);709return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));710}711712__forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); }713__forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); }714__forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); }715__forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); }716__forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); }717__forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }718__forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }719__forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); }720721__forceinline Vec3fx normalize_safe( const Vec3fx& a ) {722const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);723}724725/*! differentiated normalization */726__forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)727{728const float pp = dot(p,p);729const float pdp = dot(p,dp);730return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);731}732733////////////////////////////////////////////////////////////////////////////////734/// Select735////////////////////////////////////////////////////////////////////////////////736737__forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {738__m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();739return blendv_ps(f.m128, t.m128, mask);740}741742__forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {743return blendv_ps(f.m128, t.m128, s);744}745746__forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {747return madd(1.0f-t,v0,t*v1);748}749750__forceinline int maxDim ( const Vec3fx& a )751{752const Vec3fx b = abs(a);753if (b.x > b.y) {754if (b.x > b.z) return 0; else return 2;755} else {756if (b.y > b.z) return 1; else return 2;757}758}759760////////////////////////////////////////////////////////////////////////////////761/// Rounding Functions762////////////////////////////////////////////////////////////////////////////////763764#if defined(__aarch64__)765__forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); }766__forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); }767__forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); }768#elif defined (__SSE4_1__)769__forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }770__forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }771__forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }772#else773__forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }774__forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }775__forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }776#endif777778////////////////////////////////////////////////////////////////////////////////779/// Output Operators780////////////////////////////////////////////////////////////////////////////////781782__forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) 
{783return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";784}785786787typedef Vec3fx Vec3ff;788}789790#endif791792793