/* Source: contrib/arm-optimized-routines/math/aarch64/sve/exp2.c */
/*1* Double-precision SVE 2^x function.2*3* Copyright (c) 2023-2025, Arm Limited.4* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception5*/67#include "sv_math.h"8#include "test_sig.h"9#include "test_defs.h"1011#define N (1 << V_EXP_TABLE_BITS)1213#define BigBound 102214#define UOFlowBound 12801516static const struct data17{18double c0, c2;19double c1, c3;20double shift, big_bound, uoflow_bound;21} data = {22/* Coefficients are computed using Remez algorithm with23minimisation of the absolute error. */24.c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,25.c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,26.shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,27.big_bound = BigBound,28};2930#define SpecialOffset 0x6000000000000000 /* 0x1p513. */31/* SpecialBias1 + SpecialBias1 = asuint(1.0). */32#define SpecialBias1 0x7000000000000000 /* 0x1p769. */33#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */3435/* Update of both special and non-special cases, if any special case is36detected. */37static inline svfloat64_t38special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,39const struct data *d)40{41/* s=2^n may overflow, break it up into s=s1*s2,42such that exp = s + s*y can be computed as s1*(s2+s2*y)43and s1*s1 overflows only if n>0. */4445/* If n<=0 then set b to 0x6, 0 otherwise. */46svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */47svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);4849/* Set s1 to generate overflow depending on sign of exponent n. */50svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));51/* Offset s to avoid overflow in final result if n is below threshold. */52svfloat64_t s2 = svreinterpret_f64 (53svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));5455/* |n| > 1280 => 2^(n) overflows. 
*/56svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);5758svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);59svfloat64_t r2 = svmla_x (pg, s2, s2, y);60svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);6162return svsel (p_cmp, r1, r0);63}6465/* Fast vector implementation of exp2.66Maximum measured error is 1.65 ulp.67_ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-168want 0x1.f8db0d4df721dp-1. */69svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)70{71const struct data *d = ptr_barrier (&data);72svbool_t no_big_scale = svacle (pg, x, d->big_bound);73svbool_t special = svnot_z (pg, no_big_scale);7475/* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */76svfloat64_t shift = sv_f64 (d->shift);77svfloat64_t kd = svadd_x (pg, x, shift);78svuint64_t ki = svreinterpret_u64 (kd);79/* kd = k/N. */80kd = svsub_x (pg, kd, shift);81svfloat64_t r = svsub_x (pg, x, kd);8283/* scale ~= 2^(k/N). */84svuint64_t idx = svand_x (pg, ki, N - 1);85svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx);86/* This is only a valid scale when -1023*N < k < 1024*N. */87svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);88svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));8990svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);91/* Approximate exp2(r) using polynomial. */92/* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */93svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);94svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);95svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);96svfloat64_t p = svmla_x (pg, p01, p23, r2);97svfloat64_t y = svmul_x (svptrue_b64 (), r, p);98/* Assemble exp2(x) = exp2(r) * scale. 
*/99if (unlikely (svptest_any (pg, special)))100return special_case (pg, scale, y, kd, d);101return svmla_x (pg, scale, scale, y);102}103104TEST_SIG (SV, D, 1, exp2, -9.9, 9.9)105TEST_ULP (SV_NAME_D1 (exp2), 1.15)106TEST_DISABLE_FENV (SV_NAME_D1 (exp2))107TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000)108TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000)109TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000)110CLOSE_SVE_ATTR111112113