// thirdparty/embree/common/math/vec2fa.h
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45#include "../sys/alloc.h"6#include "emath.h"78#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)9# include "vec2fa_sycl.h"10#else1112#include "../simd/sse.h"1314namespace embree15{16////////////////////////////////////////////////////////////////////////////////17/// SSE Vec2fa Type18////////////////////////////////////////////////////////////////////////////////1920struct __aligned(16) Vec2fa21{22ALIGNED_STRUCT_(16);2324typedef float Scalar;25enum { N = 2 };26union {27__m128 m128;28struct { float x,y,az,aw; };29};3031////////////////////////////////////////////////////////////////////////////////32/// Constructors, Assignment & Cast Operators33////////////////////////////////////////////////////////////////////////////////3435__forceinline Vec2fa( ) {}36__forceinline Vec2fa( const __m128 a ) : m128(a) {}3738__forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; }39__forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }4041__forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; }42__forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }4344__forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}45__forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}4647__forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}4849__forceinline operator const __m128&() const { return m128; }50__forceinline operator __m128&() { return m128; }5152////////////////////////////////////////////////////////////////////////////////53/// Loads and Stores54////////////////////////////////////////////////////////////////////////////////5556static __forceinline Vec2fa load( const void* const a ) {57return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, 
-1))));58}5960static __forceinline Vec2fa loadu( const void* const a ) {61return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));62}6364static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {65_mm_storeu_ps((float*)ptr,v);66}6768////////////////////////////////////////////////////////////////////////////////69/// Constants70////////////////////////////////////////////////////////////////////////////////7172__forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {}73__forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}74__forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}75__forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}7677////////////////////////////////////////////////////////////////////////////////78/// Array Access79////////////////////////////////////////////////////////////////////////////////8081__forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }82__forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; }83};8485////////////////////////////////////////////////////////////////////////////////86/// Unary Operators87////////////////////////////////////////////////////////////////////////////////8889__forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }90__forceinline Vec2fa operator -( const Vec2fa& a ) {91const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));92return _mm_xor_ps(a.m128, mask);93}94__forceinline Vec2fa abs ( const Vec2fa& a ) {95const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));96return _mm_and_ps(a.m128, mask);97}98__forceinline Vec2fa sign ( const Vec2fa& a ) {99return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));100}101102__forceinline Vec2fa rcp ( const Vec2fa& a )103{104#if defined(__aarch64__)105__m128 reciprocal = _mm_rcp_ps(a.m128);106reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), 
reciprocal);107reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);108return (const Vec2fa)reciprocal;109#else110#if defined(__AVX512VL__)111const Vec2fa r = _mm_rcp14_ps(a.m128);112#else113const Vec2fa r = _mm_rcp_ps(a.m128);114#endif115116#if defined(__AVX2__)117const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)118const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n119#else120const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0)121const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n122#endif123124return res;125#endif //defined(__aarch64__)126}127128__forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }129__forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }130131__forceinline Vec2fa rsqrt( const Vec2fa& a )132{133#if defined(__aarch64__)134__m128 r = _mm_rsqrt_ps(a.m128);135r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));136r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));137return r;138#else139140#if defined(__AVX512VL__)141__m128 r = _mm_rsqrt14_ps(a.m128);142#else143__m128 r = _mm_rsqrt_ps(a.m128);144#endif145return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));146147#endif148}149150__forceinline Vec2fa zero_fix(const Vec2fa& a) {151return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));152}153__forceinline Vec2fa rcp_safe(const Vec2fa& a) {154return rcp(zero_fix(a));155}156__forceinline Vec2fa log ( const Vec2fa& a ) {157return Vec2fa(logf(a.x),logf(a.y));158}159160__forceinline Vec2fa exp ( const Vec2fa& a ) {161return Vec2fa(expf(a.x),expf(a.y));162}163164////////////////////////////////////////////////////////////////////////////////165/// Binary 
Operators166////////////////////////////////////////////////////////////////////////////////167168__forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }169__forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }170__forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }171__forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }172__forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }173__forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }174__forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }175__forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }176177__forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }178__forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }179180#if defined(__aarch64__) || defined(__SSE4_1__)181__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {182const vint4 ai = _mm_castps_si128(a);183const vint4 bi = _mm_castps_si128(b);184const vint4 ci = _mm_min_epi32(ai,bi);185return _mm_castsi128_ps(ci);186}187#endif188189#if defined(__aarch64__) || defined(__SSE4_1__)190__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {191const vint4 ai = _mm_castps_si128(a);192const vint4 bi = _mm_castps_si128(b);193const vint4 ci = _mm_max_epi32(ai,bi);194return _mm_castsi128_ps(ci);195}196#endif197198__forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {199return Vec2fa(powf(a.x,b),powf(a.y,b));200}201202////////////////////////////////////////////////////////////////////////////////203/// Ternary 
Operators204////////////////////////////////////////////////////////////////////////////////205206#if defined(__AVX2__)207__forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }208__forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }209__forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }210__forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }211#else212__forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }213__forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }214__forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}215__forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }216#endif217218__forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }219__forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }220__forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }221__forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }222223////////////////////////////////////////////////////////////////////////////////224/// Assignment Operators225////////////////////////////////////////////////////////////////////////////////226227__forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }228__forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }229__forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }230__forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; 
}231__forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }232__forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; }233234////////////////////////////////////////////////////////////////////////////////235/// Reductions236////////////////////////////////////////////////////////////////////////////////237238__forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }239__forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }240__forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }241__forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }242243////////////////////////////////////////////////////////////////////////////////244/// Comparison Operators245////////////////////////////////////////////////////////////////////////////////246247__forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }248__forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }249250////////////////////////////////////////////////////////////////////////////////251/// Euclidean Space Operators252////////////////////////////////////////////////////////////////////////////////253254#if defined(__SSE4_1__)255__forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {256return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));257}258#else259__forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {260return reduce_add(a*b);261}262#endif263264__forceinline Vec2fa cross ( const Vec2fa& a ) {265return Vec2fa(-a.y,a.x);266}267268__forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); }269__forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); }270__forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); }271__forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); 
}272__forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); }273__forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }274275////////////////////////////////////////////////////////////////////////////////276/// Select277////////////////////////////////////////////////////////////////////////////////278279__forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {280__m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();281return blendv_ps(f, t, mask);282}283284__forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {285return madd(1.0f-t,v0,t*v1);286}287288__forceinline int maxDim ( const Vec2fa& a )289{290const Vec2fa b = abs(a);291if (b.x > b.y) return 0;292else return 1;293}294295////////////////////////////////////////////////////////////////////////////////296/// Rounding Functions297////////////////////////////////////////////////////////////////////////////////298299#if defined(__aarch64__)300//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }301__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }302__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }303#elif defined (__SSE4_1__)304//__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }305__forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }306__forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }307#else308//__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }309__forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }310__forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); 
}311#endif312313////////////////////////////////////////////////////////////////////////////////314/// Output Operators315////////////////////////////////////////////////////////////////////////////////316317__forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {318return cout << "(" << a.x << ", " << a.y << ")";319}320321typedef Vec2fa Vec2fa_t;322}323324#endif325326327