Path: blob/master/thirdparty/embree/common/math/emath.h
9912 views
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45#include "../sys/platform.h"6#include "../sys/intrinsics.h"7#include "constants.h"8#include <cmath>910#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)11# include "math_sycl.h"12#else1314#if defined(__ARM_NEON)15#include "../simd/arm/emulation.h"16#else17#include <emmintrin.h>18#include <xmmintrin.h>19#include <immintrin.h>20#endif2122#if defined(__WIN32__)23#if defined(_MSC_VER) && (_MSC_VER <= 1700)24namespace std25{26__forceinline bool isinf ( const float x ) { return _finite(x) == 0; }27__forceinline bool isnan ( const float x ) { return _isnan(x) != 0; }28__forceinline bool isfinite (const float x) { return _finite(x) != 0; }29}30#endif31#endif3233namespace embree34{35__forceinline bool isvalid ( const float& v ) {36return (v > -FLT_LARGE) & (v < +FLT_LARGE);37}3839__forceinline int cast_f2i(float f) {40union { float f; int i; } v; v.f = f; return v.i;41}4243__forceinline float cast_i2f(int i) {44union { float f; int i; } v; v.i = i; return v.f;45}4647__forceinline int toInt (const float& a) { return int(a); }48__forceinline float toFloat(const int& a) { return float(a); }4950__forceinline int asInt (const float& a) { return *((int*)&a); }51__forceinline float asFloat(const int& a) { return *((float*)&a); }5253#if defined(__WIN32__)54__forceinline bool finite ( const float x ) { return _finite(x) != 0; }55#endif5657__forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; }58__forceinline float sqr ( const float x ) { return x*x; }5960__forceinline float rcp ( const float x )61{62#if defined(__aarch64__)63// Move scalar to vector register and do rcp.64__m128 a;65a[0] = x;66float32x4_t reciprocal = vrecpeq_f32(a);67reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);68reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);69return reciprocal[0];70#else7172const __m128 a = _mm_set_ss(x);7374#if defined(__AVX512VL__)75const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a);76#else77const __m128 r = _mm_rcp_ss(a);78#endif7980#if defined(__AVX2__)81return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));82#else83return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));84#endif8586#endif //defined(__aarch64__)87}8889__forceinline float signmsk ( const float x ) {90#if defined(__aarch64__)91// FP and Neon shares same vector register in arm6492__m128 a;93__m128i b;94a[0] = x;95b[0] = 0x80000000;96a = _mm_and_ps(a, vreinterpretq_f32_s32(b));97return a[0];98#else99return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));100#endif101}102__forceinline float xorf( const float x, const float y ) {103#if defined(__aarch64__)104// FP and Neon shares same vector register in arm64105__m128 a;106__m128 b;107a[0] = x;108b[0] = y;109a = _mm_xor_ps(a, b);110return a[0];111#else112return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));113#endif114}115__forceinline float andf( const float x, const unsigned y ) {116#if defined(__aarch64__)117// FP and Neon shares same vector register in arm64118__m128 a;119__m128i b;120a[0] = x;121b[0] = y;122a = _mm_and_ps(a, vreinterpretq_f32_s32(b));123return a[0];124#else125return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));126#endif127}128__forceinline float rsqrt( const float x )129{130#if defined(__aarch64__)131// FP and Neon shares same vector register in arm64132__m128 a;133a[0] = x;134__m128 value = _mm_rsqrt_ps(a);135value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));136value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));137return value[0];138#else139140const __m128 a = _mm_set_ss(x);141#if defined(__AVX512VL__)142__m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);143#else144__m128 r = _mm_rsqrt_ss(a);145#endif146const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),147_mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));148return _mm_cvtss_f32(c);149#endif150}151152#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)153__forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }154__forceinline double nextafter(double x, double y) { return _nextafter(x, y); }155__forceinline int roundf(float f) { return (int)(f + 0.5f); }156#else157__forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); }158__forceinline double nextafter(double x, double y) { return ::nextafter(x, y); }159#endif160161__forceinline float abs ( const float x ) { return ::fabsf(x); }162__forceinline float acos ( const float x ) { return ::acosf (x); }163__forceinline float asin ( const float x ) { return ::asinf (x); }164__forceinline float atan ( const float x ) { return ::atanf (x); }165__forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); }166__forceinline float cos ( const float x ) { return ::cosf (x); }167__forceinline float cosh ( const float x ) { return ::coshf (x); }168__forceinline float exp ( const float x ) { return ::expf (x); }169__forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); }170__forceinline float log ( const float x ) { return ::logf (x); }171__forceinline float log10( const float x ) { return ::log10f(x); }172__forceinline float pow ( const float x, const float y ) { return ::powf (x, y); }173__forceinline float sin ( const float x ) { return ::sinf (x); }174__forceinline float sinh ( const float x ) { return ::sinhf (x); }175__forceinline float sqrt ( const float x ) { return ::sqrtf (x); }176__forceinline float tan ( const float x ) { return ::tanf (x); }177__forceinline float tanh ( const float x ) { return ::tanhf (x); }178__forceinline float floor( const float x ) { return ::floorf (x); }179__forceinline float ceil ( const float x ) { return ::ceilf (x); }180__forceinline float frac ( const float x ) { return x-floor(x); }181182__forceinline double abs ( const double x ) { return ::fabs(x); }183__forceinline double sign ( const double x ) { return x<0?-1.0:1.0; }184__forceinline double acos ( const double x ) { return ::acos (x); }185__forceinline double asin ( const double x ) { return ::asin (x); }186__forceinline double atan ( const double x ) { return ::atan (x); }187__forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }188__forceinline double cos ( const double x ) { return ::cos (x); }189__forceinline double cosh ( const double x ) { return ::cosh (x); }190__forceinline double exp ( const double x ) { return ::exp (x); }191__forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }192__forceinline double log ( const double x ) { return ::log (x); }193__forceinline double log10( const double x ) { return ::log10(x); }194__forceinline double pow ( const double x, const double y ) { return ::pow (x, y); }195__forceinline double rcp ( const double x ) { return 1.0/x; }196__forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); }197__forceinline double sin ( const double x ) { return ::sin (x); }198__forceinline double sinh ( const double x ) { return ::sinh (x); }199__forceinline double sqr ( const double x ) { return x*x; }200__forceinline double sqrt ( const double x ) { return ::sqrt (x); }201__forceinline double tan ( const double x ) { return ::tan (x); }202__forceinline double tanh ( const double x ) { return ::tanh (x); }203__forceinline double floor( const double x ) { return ::floor (x); }204__forceinline double ceil ( const double x ) { return ::ceil (x); }205206#if defined(__aarch64__)207__forceinline float mini(float a, float b) {208// FP and Neon shares same vector register in arm64209__m128 x;210__m128 y;211x[0] = a;212y[0] = b;213x = _mm_min_ps(x, y);214return x[0];215}216#elif defined(__SSE4_1__)217__forceinline float mini(float a, float b) {218const __m128i ai = _mm_castps_si128(_mm_set_ss(a));219const __m128i bi = _mm_castps_si128(_mm_set_ss(b));220const __m128i ci = _mm_min_epi32(ai,bi);221return _mm_cvtss_f32(_mm_castsi128_ps(ci));222}223#endif224225#if defined(__aarch64__)226__forceinline float maxi(float a, float b) {227// FP and Neon shares same vector register in arm64228__m128 x;229__m128 y;230x[0] = a;231y[0] = b;232x = _mm_max_ps(x, y);233return x[0];234}235#elif defined(__SSE4_1__)236__forceinline float maxi(float a, float b) {237const __m128i ai = _mm_castps_si128(_mm_set_ss(a));238const __m128i bi = _mm_castps_si128(_mm_set_ss(b));239const __m128i ci = _mm_max_epi32(ai,bi);240return _mm_cvtss_f32(_mm_castsi128_ps(ci));241}242#endif243244template<typename T>245__forceinline T twice(const T& a) { return a+a; }246247__forceinline int min(int a, int b) { return a<b ? a:b; }248__forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; }249__forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }250__forceinline float min(float a, float b) { return a<b ? a:b; }251__forceinline double min(double a, double b) { return a<b ? a:b; }252#if defined(__64BIT__) || defined(__EMSCRIPTEN__)253__forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }254#endif255#if defined(__EMSCRIPTEN__)256__forceinline long min(long a, long b) { return a<b ? a:b; }257#endif258259template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }260template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }261template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }262263template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); }264template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }265template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }266267__forceinline int max(int a, int b) { return a<b ? b:a; }268__forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; }269__forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }270__forceinline float max(float a, float b) { return a<b ? b:a; }271__forceinline double max(double a, double b) { return a<b ? b:a; }272#if defined(__64BIT__) || defined(__EMSCRIPTEN__)273__forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }274#endif275#if defined(__EMSCRIPTEN__)276__forceinline long max(long a, long b) { return a<b ? b:a; }277#endif278279template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }280template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }281template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }282283template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); }284template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }285template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }286287#if defined(__MACOSX__)288__forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; }289__forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; }290#endif291292#if defined(__MACOSX__) && !defined(__INTEL_COMPILER)293__forceinline void sincosf(float x, float *sin, float *cos) {294__sincosf(x,sin,cos);295}296#endif297298#if defined(__WIN32__) || defined(__FreeBSD__)299__forceinline void sincosf(float x, float *s, float *c) {300*s = sinf(x); *c = cosf(x);301}302#endif303304template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); }305template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); }306307template<typename T> __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); }308template<typename T> __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); }309template<typename T> __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); }310template<typename T> __forceinline T cos2sin ( const T& x ) { return sin2cos(x); }311312#if defined(__AVX2__)313__forceinline float madd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }314__forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }315__forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }316__forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }317318#elif defined (__aarch64__) && defined(__clang__)319#pragma clang fp contract(fast)320__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; }321__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; }322__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }323__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }324#pragma clang fp contract(on)325326#else327__forceinline float madd ( const float a, const float b, const float c) { return a*b+c; }328__forceinline float msub ( const float a, const float b, const float c) { return a*b-c; }329__forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;}330__forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; }331#endif332333/*! random functions */334template<typename T> T random() { return T(0); }335#if defined(_WIN32)336template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); }337template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); }338#else339template<> __forceinline int random() { return int(rand()); }340template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }341#endif342template<> __forceinline float random() { return rand()/float(RAND_MAX); }343template<> __forceinline double random() { return rand()/double(RAND_MAX); }344345#if _WIN32346__forceinline double drand48() {347return double(rand())/double(RAND_MAX);348}349350__forceinline void srand48(long seed) {351return srand(seed);352}353#endif354355/*! selects */356__forceinline bool select(bool s, bool t , bool f) { return s ? t : f; }357__forceinline int select(bool s, int t, int f) { return s ? t : f; }358__forceinline float select(bool s, float t, float f) { return s ? t : f; }359360__forceinline bool none(bool s) { return !s; }361__forceinline bool all (bool s) { return s; }362__forceinline bool any (bool s) { return s; }363364__forceinline unsigned movemask (bool s) { return (unsigned)s; }365366__forceinline float lerp(const float v0, const float v1, const float t) {367return madd(1.0f-t,v0,t*v1);368}369370template<typename T>371__forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) {372return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3)));373}374375/*! exchange */376template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }377378/* load/store */379template<typename Ty> struct mem;380381template<> struct mem<float> {382static __forceinline float load (bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; }383static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; }384385static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; }386static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; }387};388389/*! bit reverse operation */390template<class T>391__forceinline T bitReverse(const T& vin)392{393T v = vin;394v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);395v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);396v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);397v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);398v = ( v >> 16 ) | ( v << 16);399return v;400}401402/*! bit interleave operation */403template<class T>404__forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)405{406T x = xin, y = yin, z = zin;407x = (x | (x << 16)) & 0x030000FF;408x = (x | (x << 8)) & 0x0300F00F;409x = (x | (x << 4)) & 0x030C30C3;410x = (x | (x << 2)) & 0x09249249;411412y = (y | (y << 16)) & 0x030000FF;413y = (y | (y << 8)) & 0x0300F00F;414y = (y | (y << 4)) & 0x030C30C3;415y = (y | (y << 2)) & 0x09249249;416417z = (z | (z << 16)) & 0x030000FF;418z = (z | (z << 8)) & 0x0300F00F;419z = (z | (z << 4)) & 0x030C30C3;420z = (z | (z << 2)) & 0x09249249;421422return x | (y << 1) | (z << 2);423}424425#if defined(__AVX2__) && !defined(__aarch64__)426427template<>428__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)429{430const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ );431const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */);432const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */);433return xx | yy | zz;434}435436#endif437438/*! bit interleave operation for 64bit data types*/439template<class T>440__forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){441T x = xin & 0x1fffff;442T y = yin & 0x1fffff;443T z = zin & 0x1fffff;444445x = (x | x << 32) & 0x1f00000000ffff;446x = (x | x << 16) & 0x1f0000ff0000ff;447x = (x | x << 8) & 0x100f00f00f00f00f;448x = (x | x << 4) & 0x10c30c30c30c30c3;449x = (x | x << 2) & 0x1249249249249249;450451y = (y | y << 32) & 0x1f00000000ffff;452y = (y | y << 16) & 0x1f0000ff0000ff;453y = (y | y << 8) & 0x100f00f00f00f00f;454y = (y | y << 4) & 0x10c30c30c30c30c3;455y = (y | y << 2) & 0x1249249249249249;456457z = (z | z << 32) & 0x1f00000000ffff;458z = (z | z << 16) & 0x1f0000ff0000ff;459z = (z | z << 8) & 0x100f00f00f00f00f;460z = (z | z << 4) & 0x10c30c30c30c30c3;461z = (z | z << 2) & 0x1249249249249249;462463return x | (y << 1) | (z << 2);464}465}466467#endif468469470