Path: blob/master/thirdparty/embree/common/math/transcendental.h
9912 views
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45// Transcendental functions from "ispc": https://github.com/ispc/ispc/6// Most of the transcendental implementations in ispc code come from7// Solomon Boulos's "syrah": https://github.com/boulos/syrah/89#include "../simd/simd.h"1011namespace embree12{1314namespace fastapprox15{1617template <typename T>18__forceinline T sin(const T &v)19{20static const float piOverTwoVec = 1.57079637050628662109375;21static const float twoOverPiVec = 0.636619746685028076171875;22auto scaled = v * twoOverPiVec;23auto kReal = floor(scaled);24auto k = toInt(kReal);2526// Reduced range version of x27auto x = v - kReal * piOverTwoVec;28auto kMod4 = k & 3;29auto sinUseCos = (kMod4 == 1) | (kMod4 == 3);30auto flipSign = (kMod4 > 1);3132// These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,33// 4, 6, 8, 10|], [|single...|], [0;Pi/2]);34static const float sinC2 = -0.16666667163372039794921875;35static const float sinC4 = +8.333347737789154052734375e-3;36static const float sinC6 = -1.9842604524455964565277099609375e-4;37static const float sinC8 = +2.760012648650445044040679931640625e-6;38static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;3940static const float cosC2 = -0.5;41static const float cosC4 = +4.166664183139801025390625e-2;42static const float cosC6 = -1.388833043165504932403564453125e-3;43static const float cosC8 = +2.47562347794882953166961669921875e-5;44static const float cosC10 = -2.59630184018533327616751194000244140625e-7;4546auto outside = select(sinUseCos, 1., x);47auto c2 = select(sinUseCos, T(cosC2), T(sinC2));48auto c4 = select(sinUseCos, T(cosC4), T(sinC4));49auto c6 = select(sinUseCos, T(cosC6), T(sinC6));50auto c8 = select(sinUseCos, T(cosC8), T(sinC8));51auto c10 = select(sinUseCos, T(cosC10), T(sinC10));5253auto x2 = x * x;54auto formula = x2 * c10 + c8;55formula = x2 * formula + c6;56formula = x2 * formula + c4;57formula = x2 * formula + c2;58formula = x2 * formula + 1.;59formula *= outside;6061formula = select(flipSign, -formula, formula);62return formula;63}6465template <typename T>66__forceinline T cos(const T &v)67{68static const float piOverTwoVec = 1.57079637050628662109375;69static const float twoOverPiVec = 0.636619746685028076171875;70auto scaled = v * twoOverPiVec;71auto kReal = floor(scaled);72auto k = toInt(kReal);7374// Reduced range version of x75auto x = v - kReal * piOverTwoVec;7677auto kMod4 = k & 3;78auto cosUseCos = (kMod4 == 0) | (kMod4 == 2);79auto flipSign = (kMod4 == 1) | (kMod4 == 2);8081const float sinC2 = -0.16666667163372039794921875;82const float sinC4 = +8.333347737789154052734375e-3;83const float sinC6 = -1.9842604524455964565277099609375e-4;84const float sinC8 = +2.760012648650445044040679931640625e-6;85const float sinC10 = -2.50293279435709337121807038784027099609375e-8;8687const float cosC2 = -0.5;88const float cosC4 = +4.166664183139801025390625e-2;89const float cosC6 = -1.388833043165504932403564453125e-3;90const float cosC8 = +2.47562347794882953166961669921875e-5;91const float cosC10 = -2.59630184018533327616751194000244140625e-7;9293auto outside = select(cosUseCos, 1., x);94auto c2 = select(cosUseCos, T(cosC2), T(sinC2));95auto c4 = select(cosUseCos, T(cosC4), T(sinC4));96auto c6 = select(cosUseCos, T(cosC6), T(sinC6));97auto c8 = select(cosUseCos, T(cosC8), T(sinC8));98auto c10 = select(cosUseCos, T(cosC10), T(sinC10));99100auto x2 = x * x;101auto formula = x2 * c10 + c8;102formula = x2 * formula + c6;103formula = x2 * formula + c4;104formula = x2 * formula + c2;105formula = x2 * formula + 1.;106formula *= outside;107108formula = select(flipSign, -formula, formula);109return formula;110}111112template <typename T>113__forceinline void sincos(const T &v, T &sinResult, T &cosResult)114{115const float piOverTwoVec = 1.57079637050628662109375;116const float twoOverPiVec = 0.636619746685028076171875;117auto scaled = v * twoOverPiVec;118auto kReal = floor(scaled);119auto k = toInt(kReal);120121// Reduced range version of x122auto x = v - kReal * piOverTwoVec;123auto kMod4 = k & 3;124auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));125auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));126auto sinFlipSign = (kMod4 > 1);127auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));128129const float oneVec = +1.;130const float sinC2 = -0.16666667163372039794921875;131const float sinC4 = +8.333347737789154052734375e-3;132const float sinC6 = -1.9842604524455964565277099609375e-4;133const float sinC8 = +2.760012648650445044040679931640625e-6;134const float sinC10 = -2.50293279435709337121807038784027099609375e-8;135136const float cosC2 = -0.5;137const float cosC4 = +4.166664183139801025390625e-2;138const float cosC6 = -1.388833043165504932403564453125e-3;139const float cosC8 = +2.47562347794882953166961669921875e-5;140const float cosC10 = -2.59630184018533327616751194000244140625e-7;141142auto x2 = x * x;143144auto sinFormula = x2 * sinC10 + sinC8;145auto cosFormula = x2 * cosC10 + cosC8;146sinFormula = x2 * sinFormula + sinC6;147cosFormula = x2 * cosFormula + cosC6;148149sinFormula = x2 * sinFormula + sinC4;150cosFormula = x2 * cosFormula + cosC4;151152sinFormula = x2 * sinFormula + sinC2;153cosFormula = x2 * cosFormula + cosC2;154155sinFormula = x2 * sinFormula + oneVec;156cosFormula = x2 * cosFormula + oneVec;157158sinFormula *= x;159160sinResult = select(sinUseCos, cosFormula, sinFormula);161cosResult = select(cosUseCos, cosFormula, sinFormula);162163sinResult = select(sinFlipSign, -sinResult, sinResult);164cosResult = select(cosFlipSign, -cosResult, cosResult);165}166167template <typename T>168__forceinline T tan(const T &v)169{170const float piOverFourVec = 0.785398185253143310546875;171const float fourOverPiVec = 1.27323949337005615234375;172173auto xLt0 = v < 0.;174auto y = select(xLt0, -v, v);175auto scaled = y * fourOverPiVec;176177auto kReal = floor(scaled);178auto k = toInt(kReal);179180auto x = y - kReal * piOverFourVec;181182// If k & 1, x -= Pi/4183auto needOffset = (k & 1) != 0;184x = select(needOffset, x - piOverFourVec, x);185186// If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...187auto kMod4 = k & 3;188auto useCotan = (kMod4 == 1) | (kMod4 == 2);189190const float oneVec = 1.0;191192const float tanC2 = +0.33333075046539306640625;193const float tanC4 = +0.13339905440807342529296875;194const float tanC6 = +5.3348250687122344970703125e-2;195const float tanC8 = +2.46033705770969390869140625e-2;196const float tanC10 = +2.892402000725269317626953125e-3;197const float tanC12 = +9.500005282461643218994140625e-3;198199const float cotC2 = -0.3333333432674407958984375;200const float cotC4 = -2.222204394638538360595703125e-2;201const float cotC6 = -2.11752182804048061370849609375e-3;202const float cotC8 = -2.0846328698098659515380859375e-4;203const float cotC10 = -2.548247357481159269809722900390625e-5;204const float cotC12 = -3.5257363606433500535786151885986328125e-7;205206auto x2 = x * x;207T z;208if (any(useCotan))209{210auto cotVal = x2 * cotC12 + cotC10;211cotVal = x2 * cotVal + cotC8;212cotVal = x2 * cotVal + cotC6;213cotVal = x2 * cotVal + cotC4;214cotVal = x2 * cotVal + cotC2;215cotVal = x2 * cotVal + oneVec;216// The equation is for x * cot(x) but we need -x * cot(x) for the tan part.217cotVal /= -x;218z = cotVal;219}220auto useTan = !useCotan;221if (any(useTan))222{223auto tanVal = x2 * tanC12 + tanC10;224tanVal = x2 * tanVal + tanC8;225tanVal = x2 * tanVal + tanC6;226tanVal = x2 * tanVal + tanC4;227tanVal = x2 * tanVal + tanC2;228tanVal = x2 * tanVal + oneVec;229// Equation was for tan(x)/x230tanVal *= x;231z = select(useTan, tanVal, z);232}233return select(xLt0, -z, z);234}235236template <typename T>237__forceinline T asin(const T &x0)238{239auto isneg = (x0 < 0.f);240auto x = abs(x0);241auto isnan = (x > 1.f);242243// sollya244// fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],245// [1e-20;.9999999999999999]);246// avg error: 1.1105439e-06, max error 1.3187528e-06247auto v = 1.57079517841339111328125f +248x * (-0.21450997889041900634765625f +249x * (8.78556668758392333984375e-2f +250x * (-4.489909112453460693359375e-2f +251x * (1.928029954433441162109375e-2f +252x * (-4.3095736764371395111083984375e-3f)))));253254v *= -sqrt(1.f - x);255v = v + 1.57079637050628662109375f;256257v = select(v < 0.f, T(0.f), v);258v = select(isneg, -v, v);259v = select(isnan, T(cast_i2f(0x7fc00000)), v);260261return v;262}263264template <typename T>265__forceinline T acos(const T &v)266{267return 1.57079637050628662109375f - asin(v);268}269270template <typename T>271__forceinline T atan(const T &v)272{273const float piOverTwoVec = 1.57079637050628662109375;274// atan(-x) = -atan(x) (so flip from negative to positive first)275// If x > 1 -> atan(x) = Pi/2 - atan(1/x)276auto xNeg = v < 0.f;277auto xFlipped = select(xNeg, -v, v);278279auto xGt1 = xFlipped > 1.;280auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);281282// These coefficients approximate atan(x)/x283const float atanC0 = +0.99999988079071044921875;284const float atanC2 = -0.3333191573619842529296875;285const float atanC4 = +0.199689209461212158203125;286const float atanC6 = -0.14015688002109527587890625;287const float atanC8 = +9.905083477497100830078125e-2;288const float atanC10 = -5.93664981424808502197265625e-2;289const float atanC12 = +2.417283318936824798583984375e-2;290const float atanC14 = -4.6721356920897960662841796875e-3;291292auto x2 = x * x;293auto result = x2 * atanC14 + atanC12;294result = x2 * result + atanC10;295result = x2 * result + atanC8;296result = x2 * result + atanC6;297result = x2 * result + atanC4;298result = x2 * result + atanC2;299result = x2 * result + atanC0;300result *= x;301302result = select(xGt1, piOverTwoVec - result, result);303result = select(xNeg, -result, result);304return result;305}306307template <typename T>308__forceinline T atan2(const T &y, const T &x)309{310const float piVec = 3.1415926536;311// atan2(y, x) =312//313// atan2(y > 0, x = +-0) -> Pi/2314// atan2(y < 0, x = +-0) -> -Pi/2315// atan2(y = +-0, x < +0) -> +-Pi316// atan2(y = +-0, x >= +0) -> +-0317//318// atan2(y >= 0, x < 0) -> Pi + atan(y/x)319// atan2(y < 0, x < 0) -> -Pi + atan(y/x)320// atan2(y, x > 0) -> atan(y/x)321//322// and then a bunch of code for dealing with infinities.323auto yOverX = y * rcpSafe(x);324auto atanArg = atan(yOverX);325auto xLt0 = x < 0.f;326auto yLt0 = y < 0.f;327auto offset = select(xLt0,328select(yLt0, T(-piVec), T(piVec)), 0.f);329return offset + atanArg;330}331332template <typename T>333__forceinline T exp(const T &v)334{335const float ln2Part1 = 0.6931457519;336const float ln2Part2 = 1.4286067653e-6;337const float oneOverLn2 = 1.44269502162933349609375;338339auto scaled = v * oneOverLn2;340auto kReal = floor(scaled);341auto k = toInt(kReal);342343// Reduced range version of x344auto x = v - kReal * ln2Part1;345x -= kReal * ln2Part2;346347// These coefficients are for e^x in [0, ln(2)]348const float one = 1.;349const float c2 = 0.4999999105930328369140625;350const float c3 = 0.166668415069580078125;351const float c4 = 4.16539050638675689697265625e-2;352const float c5 = 8.378830738365650177001953125e-3;353const float c6 = 1.304379315115511417388916015625e-3;354const float c7 = 2.7555381529964506626129150390625e-4;355356auto result = x * c7 + c6;357result = x * result + c5;358result = x * result + c4;359result = x * result + c3;360result = x * result + c2;361result = x * result + one;362result = x * result + one;363364// Compute 2^k (should differ for float and double, but I'll avoid365// it for now and just do floats)366const int fpbias = 127;367auto biasedN = k + fpbias;368auto overflow = kReal > fpbias;369// Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)370// we've got underflow. -127 * ln(2) -> -88.02. So the most371// negative float input that doesn't result in zero is like -88.372auto underflow = kReal <= -fpbias;373const int infBits = 0x7f800000;374biasedN <<= 23;375// Reinterpret this thing as float376auto twoToTheN = asFloat(biasedN);377// Handle both doubles and floats (hopefully eliding the copy for float)378auto elemtype2n = twoToTheN;379result *= elemtype2n;380result = select(overflow, cast_i2f(infBits), result);381result = select(underflow, 0., result);382return result;383}384385// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n386// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).387template <typename T, typename R>388__forceinline void __rangeReduceLog(const T &input,389T &reduced,390R &exponent)391{392auto intVersion = asInt(input);393// single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM394// exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000395// 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0396// non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111397// = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF398399//const int exponentMask(0x7F800000)400static const int nonexponentMask = 0x807FFFFF;401402// We want the reduced version to have an exponent of -1 which is403// -1 + 127 after biasing or 126404static const int exponentNeg1 = (126l << 23);405// NOTE(boulos): We don't need to mask anything out since we know406// the sign bit has to be 0. If it's 1, we need to return infinity/nan407// anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).408auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]409410auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2411exponent = offsetExponent - 127; // get the real value412413// Blend the offset_exponent with the original input (do this in414// int for now, until I decide if float can have & and ¬)415auto blended = (intVersion & nonexponentMask) | (exponentNeg1);416reduced = asFloat(blended);417}418419template <typename T> struct ExponentType { };420template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; };421template <> struct ExponentType<float> { typedef int Ty; };422423template <typename T>424__forceinline T log(const T &v)425{426T reduced;427typename ExponentType<T>::Ty exponent;428429const int nanBits = 0x7fc00000;430const int negInfBits = 0xFF800000;431const float nan = cast_i2f(nanBits);432const float negInf = cast_i2f(negInfBits);433auto useNan = v < 0.;434auto useInf = v == 0.;435auto exceptional = useNan | useInf;436const float one = 1.0;437438auto patched = select(exceptional, one, v);439__rangeReduceLog(patched, reduced, exponent);440441const float ln2 = 0.693147182464599609375;442443auto x1 = one - reduced;444const float c1 = +0.50000095367431640625;445const float c2 = +0.33326041698455810546875;446const float c3 = +0.2519190013408660888671875;447const float c4 = +0.17541764676570892333984375;448const float c5 = +0.3424419462680816650390625;449const float c6 = -0.599632322788238525390625;450const float c7 = +1.98442304134368896484375;451const float c8 = -2.4899270534515380859375;452const float c9 = +1.7491014003753662109375;453454auto result = x1 * c9 + c8;455result = x1 * result + c7;456result = x1 * result + c6;457result = x1 * result + c5;458result = x1 * result + c4;459result = x1 * result + c3;460result = x1 * result + c2;461result = x1 * result + c1;462result = x1 * result + one;463464// Equation was for -(ln(red)/(1-red))465result *= -x1;466result += toFloat(exponent) * ln2;467468return select(exceptional,469select(useNan, T(nan), T(negInf)),470result);471}472473template <typename T>474__forceinline T pow(const T &x, const T &y)475{476auto x1 = abs(x);477auto z = exp(y * log(x1));478479// Handle special cases480const float twoOver23 = 8388608.0f;481auto yInt = y == round(y);482auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit483484// x == 0485z = select(x == 0.0f,486select(y < 0.0f, T(inf) | signmsk(x),487select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);488489// x < 0490auto xNegative = x < 0.0f;491if (any(xNegative))492{493auto z1 = z | asFloat(yOddInt);494z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());495z = select(xNegative, z1, z);496}497498auto xFinite = isfinite(x);499auto yFinite = isfinite(y);500if (all(xFinite & yFinite))501return z;502503// x finite and y infinite504z = select(andn(xFinite, yFinite),505select(x1 == 1.0f, 1.0f,506select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);507508// x infinite509z = select(xFinite, z,510select(y == 0.0f, 1.0f,511select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));512513return z;514}515516template <typename T>517__forceinline T pow(const T &x, float y)518{519return pow(x, T(y));520}521522} // namespace fastapprox523524} // namespace embree525526527