Path: blob/main/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h
48375 views
/*1* Double-precision x^y function.2*3* Copyright (c) 2018-2024, Arm Limited.4* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception5*/67#include "math_config.h"89/* Scalar version of pow used for fallbacks in vector implementations. */1011/* Data is defined in v_pow_log_data.c. */12#define N_LOG (1 << V_POW_LOG_TABLE_BITS)13#define Off 0x3fe695550000000014#define As __v_pow_log_data.poly1516/* Data is defined in v_pow_exp_data.c. */17#define N_EXP (1 << V_POW_EXP_TABLE_BITS)18#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)19#define SmallExp 0x3c9 /* top12(0x1p-54). */20#define BigExp 0x408 /* top12(512.0). */21#define ThresExp 0x03f /* BigExp - SmallExp. */22#define InvLn2N __v_pow_exp_data.n_over_ln223#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi24#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo25#define SBits __v_pow_exp_data.sbits26#define Cs __v_pow_exp_data.poly2728/* Constants associated with pow. */29#define SmallPowX 0x001 /* top12(0x1p-126). */30#define BigPowX 0x7ff /* top12(INFINITY). */31#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */32#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */33#define BigPowY 0x43e /* top12(0x1.749p62). */34#define ThresPowY 0x080 /* BigPowY - SmallPowY. */3536/* Top 12 bits of a double (sign and exponent bits). */37static inline uint32_t38top12 (double x)39{40return asuint64 (x) >> 52;41}4243/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about44additional 15 bits precision. IX is the bit representation of x, but45normalized in the subnormal range using the sign bit for the exponent. */46static inline double47log_inline (uint64_t ix, double *tail)48{49/* x = 2^k z; where z is in range [Off,2*Off) and exact.50The range is split into N subintervals.51The ith subinterval contains z and c is near its center. */52uint64_t tmp = ix - Off;53int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1);54int k = (int64_t) tmp >> 52; /* arithmetic shift. */55uint64_t iz = ix - (tmp & 0xfffULL << 52);56double z = asdouble (iz);57double kd = (double) k;5859/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */60double invc = __v_pow_log_data.invc[i];61double logc = __v_pow_log_data.logc[i];62double logctail = __v_pow_log_data.logctail[i];6364/* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and65|z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */66double r = fma (z, invc, -1.0);6768/* k*Ln2 + log(c) + r. */69double t1 = kd * __v_pow_log_data.ln2_hi + logc;70double t2 = t1 + r;71double lo1 = kd * __v_pow_log_data.ln2_lo + logctail;72double lo2 = t1 - t2 + r;7374/* Evaluation is optimized assuming superscalar pipelined execution. */75double ar = As[0] * r;76double ar2 = r * ar;77double ar3 = r * ar2;78/* k*Ln2 + log(c) + r + A[0]*r*r. */79double hi = t2 + ar2;80double lo3 = fma (ar, r, -ar2);81double lo4 = t2 - hi + ar2;82/* p = log1p(r) - r - A[0]*r*r. */83double p = (ar384* (As[1] + r * As[2]85+ ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6]))));86double lo = lo1 + lo2 + lo3 + lo4 + p;87double y = hi + lo;88*tail = hi - y + lo;89return y;90}9192/* Handle cases that may overflow or underflow when computing the result that93is scale*(1+TMP) without intermediate rounding. The bit representation of94scale is in SBITS, however it has a computed exponent that may have95overflown into the sign bit so that needs to be adjusted before using it as96a double. (int32_t)KI is the k used in the argument reduction and exponent97adjustment of scale, positive k here means the result may overflow and98negative k means the result may underflow. */99static inline double100special_case (double tmp, uint64_t sbits, uint64_t ki)101{102double scale, y;103104if ((ki & 0x80000000) == 0)105{106/* k > 0, the exponent of scale might have overflowed by <= 460. */107sbits -= 1009ull << 52;108scale = asdouble (sbits);109y = 0x1p1009 * (scale + scale * tmp);110return y;111}112/* k < 0, need special care in the subnormal range. */113sbits += 1022ull << 52;114/* Note: sbits is signed scale. */115scale = asdouble (sbits);116y = scale + scale * tmp;117#if WANT_SIMD_EXCEPT118if (fabs (y) < 1.0)119{120/* Round y to the right precision before scaling it into the subnormal121range to avoid double rounding that can cause 0.5+E/2 ulp error where122E is the worst-case ulp error outside the subnormal range. So this123is only useful if the goal is better than 1 ulp worst-case error. */124double hi, lo, one = 1.0;125if (y < 0.0)126one = -1.0;127lo = scale - y + scale * tmp;128hi = one + y;129lo = one - hi + y + lo;130y = (hi + lo) - one;131/* Fix the sign of 0. */132if (y == 0.0)133y = asdouble (sbits & 0x8000000000000000);134/* The underflow exception needs to be signaled explicitly. */135force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);136}137#endif138y = 0x1p-1022 * y;139return y;140}141142/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.143The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */144static inline double145exp_inline (double x, double xtail, uint32_t sign_bias)146{147uint32_t abstop = top12 (x) & 0x7ff;148if (unlikely (abstop - SmallExp >= ThresExp))149{150if (abstop - SmallExp >= 0x80000000)151{152/* Avoid spurious underflow for tiny x. */153/* Note: 0 is common input. */154return sign_bias ? -1.0 : 1.0;155}156if (abstop >= top12 (1024.0))157{158/* Note: inf and nan are already handled. */159/* Skip errno handling. */160#if WANT_SIMD_EXCEPT161return asuint64 (x) >> 63 ? __math_uflow (sign_bias)162: __math_oflow (sign_bias);163#else164double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY;165return sign_bias ? -res_uoflow : res_uoflow;166#endif167}168/* Large x is special cased below. */169abstop = 0;170}171172/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */173/* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */174double z = InvLn2N * x;175double kd = round (z);176uint64_t ki = lround (z);177double r = x - kd * Ln2HiN - kd * Ln2LoN;178/* The code assumes 2^-200 < |xtail| < 2^-8/N. */179r += xtail;180/* 2^(k/N) ~= scale. */181uint64_t idx = ki & (N_EXP - 1);182uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS);183/* This is only a valid scale when -1023*N < k < 1024*N. */184uint64_t sbits = SBits[idx] + top;185/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */186/* Evaluation is optimized assuming superscalar pipelined execution. */187double r2 = r * r;188double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]);189if (unlikely (abstop == 0))190return special_case (tmp, sbits, ki);191double scale = asdouble (sbits);192/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there193is no spurious underflow here even without fma. */194return scale + scale * tmp;195}196197/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.198A version of exp_inline that is not inlined and for which sign_bias is199equal to 0. */200static double NOINLINE201exp_nosignbias (double x, double xtail)202{203uint32_t abstop = top12 (x) & 0x7ff;204if (unlikely (abstop - SmallExp >= ThresExp))205{206/* Avoid spurious underflow for tiny x. */207if (abstop - SmallExp >= 0x80000000)208return 1.0;209/* Note: inf and nan are already handled. */210if (abstop >= top12 (1024.0))211#if WANT_SIMD_EXCEPT212return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0);213#else214return asuint64 (x) >> 63 ? 0.0 : INFINITY;215#endif216/* Large x is special cased below. */217abstop = 0;218}219220/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */221/* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */222double z = InvLn2N * x;223double kd = round (z);224uint64_t ki = lround (z);225double r = x - kd * Ln2HiN - kd * Ln2LoN;226/* The code assumes 2^-200 < |xtail| < 2^-8/N. */227r += xtail;228/* 2^(k/N) ~= scale. */229uint64_t idx = ki & (N_EXP - 1);230uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS);231/* This is only a valid scale when -1023*N < k < 1024*N. */232uint64_t sbits = SBits[idx] + top;233/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */234double r2 = r * r;235double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]);236if (unlikely (abstop == 0))237return special_case (tmp, sbits, ki);238double scale = asdouble (sbits);239/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there240is no spurious underflow here even without fma. */241return scale + scale * tmp;242}243244/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is245the bit representation of a non-zero finite floating-point value. */246static inline int247checkint (uint64_t iy)248{249int e = iy >> 52 & 0x7ff;250if (e < 0x3ff)251return 0;252if (e > 0x3ff + 52)253return 2;254if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))255return 0;256if (iy & (1ULL << (0x3ff + 52 - e)))257return 1;258return 2;259}260261/* Returns 1 if input is the bit representation of 0, infinity or nan. */262static inline int263zeroinfnan (uint64_t i)264{265return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;266}267268static double NOINLINE269pow_scalar_special_case (double x, double y)270{271uint32_t sign_bias = 0;272uint64_t ix, iy;273uint32_t topx, topy;274275ix = asuint64 (x);276iy = asuint64 (y);277topx = top12 (x);278topy = top12 (y);279if (unlikely (topx - SmallPowX >= ThresPowX280|| (topy & 0x7ff) - SmallPowY >= ThresPowY))281{282/* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0283and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */284/* Special cases: (x < 0x1p-126 or inf or nan) or285(|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */286if (unlikely (zeroinfnan (iy)))287{288if (2 * iy == 0)289return issignaling_inline (x) ? x + y : 1.0;290if (ix == asuint64 (1.0))291return issignaling_inline (y) ? x + y : 1.0;292if (2 * ix > 2 * asuint64 (INFINITY)293|| 2 * iy > 2 * asuint64 (INFINITY))294return x + y;295if (2 * ix == 2 * asuint64 (1.0))296return 1.0;297if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))298return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */299return y * y;300}301if (unlikely (zeroinfnan (ix)))302{303double x2 = x * x;304if (ix >> 63 && checkint (iy) == 1)305{306x2 = -x2;307sign_bias = 1;308}309#if WANT_SIMD_EXCEPT310if (2 * ix == 0 && iy >> 63)311return __math_divzero (sign_bias);312#endif313return iy >> 63 ? 1 / x2 : x2;314}315/* Here x and y are non-zero finite. */316if (ix >> 63)317{318/* Finite x < 0. */319int yint = checkint (iy);320if (yint == 0)321#if WANT_SIMD_EXCEPT322return __math_invalid (x);323#else324return __builtin_nan ("");325#endif326if (yint == 1)327sign_bias = SignBias;328ix &= 0x7fffffffffffffff;329topx &= 0x7ff;330}331if ((topy & 0x7ff) - SmallPowY >= ThresPowY)332{333/* Note: sign_bias == 0 here because y is not odd. */334if (ix == asuint64 (1.0))335return 1.0;336/* |y| < 2^-65, x^y ~= 1 + y*log(x). */337if ((topy & 0x7ff) < SmallPowY)338return 1.0;339#if WANT_SIMD_EXCEPT340return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0)341: __math_uflow (0);342#else343return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0;344#endif345}346if (topx == 0)347{348/* Normalize subnormal x so exponent becomes negative. */349ix = asuint64 (x * 0x1p52);350ix &= 0x7fffffffffffffff;351ix -= 52ULL << 52;352}353}354355double lo;356double hi = log_inline (ix, &lo);357double ehi = y * hi;358double elo = y * lo + fma (y, hi, -ehi);359return exp_inline (ehi, elo, sign_bias);360}361362363