GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/erfinv_25u.c
/*
 * Double-precision inverse error function (AdvSIMD variant).
 *
 * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
#include "v_math.h"
#include "test_defs.h"
#include "mathlib.h"
#include "math_config.h"
#include "test_sig.h"
#include "v_poly_f64.h"
#define V_LOG_INLINE_POLY_ORDER 4
#include "v_log_inline.h"

const static struct data
{
  /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
     coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
     of the denominator. P is interleaved P_17 and P_37, similar for Q. P17
     and Q17 are provided as homogeneous vectors as well for when the shortcut
     can be taken. */
  double P[8][2], Q[7][2];
  float64x2_t tailshift;
  uint8_t idx[16];
  struct v_log_inline_data log_tbl;
  float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
} data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
                  { -0x1.6b23cc5c6c6d7p+6, 0x1.60b8fe375999ep-2 },
                  { 0x1.74e5f6ceb3548p+7, -0x1.779bb9bef7c0fp+1 },
                  { -0x1.5200bb15cc6bbp+7, 0x1.786ea384470a2p+3 },
                  { 0x1.05d193233a849p+6, -0x1.6a7c1453c85d3p+4 },
                  { -0x1.148c5474ee5e1p+3, 0x1.31f0fc5613142p+4 },
                  { 0x1.689181bbafd0cp-3, -0x1.5ea6c007d4dbbp+2 },
                  { 0, 0x1.e66f265ce9e5p-3 } },
           .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 },
                  { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 },
                  { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 },
                  { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 },
                  { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 },
                  { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 },
                  { 0x1p+0, -0x1.4075c56404eecp+3 } },
           .P_57 = { V2 (0x1.b874f9516f7f1p-14), V2 (0x1.5921f2916c1c4p-7),
                     V2 (0x1.145ae7d5b8fa4p-2), V2 (0x1.29d6dcc3b2fb7p+1),
                     V2 (0x1.cabe2209a7985p+2), V2 (0x1.11859f0745c4p+3),
                     V2 (0x1.b7ec7bc6a2ce5p+2), V2 (0x1.d0419e0bb42aep+1),
                     V2 (0x1.c5aa03eef7258p-1) },
           .Q_57 = { V2 (0x1.b8747e12691f1p-14), V2 (0x1.59240d8ed1e0ap-7),
                     V2 (0x1.14aef2b181e2p-2), V2 (0x1.2cd181bcea52p+1),
                     V2 (0x1.e6e63e0b7aa4cp+2), V2 (0x1.65cf8da94aa3ap+3),
                     V2 (0x1.7e5c787b10a36p+3), V2 (0x1.0626d68b6cea3p+3),
                     V2 (0x1.065c5f193abf6p+2), V2 (0x1p+0) },
           .P_17 = { V2 (0x1.007ce8f01b2e8p+4), V2 (-0x1.6b23cc5c6c6d7p+6),
                     V2 (0x1.74e5f6ceb3548p+7), V2 (-0x1.5200bb15cc6bbp+7),
                     V2 (0x1.05d193233a849p+6), V2 (-0x1.148c5474ee5e1p+3),
                     V2 (0x1.689181bbafd0cp-3) },
           .Q_17 = { V2 (0x1.d8fb0f913bd7bp+3), V2 (-0x1.6d7f25a3f1c24p+6),
                     V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
                     V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
           .tailshift = V2 (-0.87890625),
           .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
           .log_tbl = V_LOG_CONSTANTS };
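
/* Illustrative sketch (not part of the build): each entry of P and Q packs
   the i-th coefficient for both rational approximations, so a single vector
   load plus a byte-table lookup can pick the right coefficient per lane.  In
   scalar terms,

     P[i][0] == P_17[i]   -- used when |x| <= 0.75 ('normal' region)
     P[i][1] == P_37[i]   -- used when 0.75 < |x| <= 0.9375 (tail region)

   and the per-lane selection in the main routine below amounts to

     double c = is_tail[lane] ? P[i][1] : P[i][0];

   'lane' and 'c' are names used only for exposition here.  */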

static inline float64x2_t
special (float64x2_t x, const struct data *d)
{
  /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf.
     By using log here, instead of log1p, we return finite values for both
     these inputs, and values outside [-1, 1]. This is non-compliant, but is an
     acceptable optimisation at Ofast. To get correct behaviour for all finite
     values use the log1p_inline helper on -abs(x) - note that erfinv(inf)
     will still be finite. */
  float64x2_t t = vnegq_f64 (
      v_log_inline (vsubq_f64 (v_f64 (1), vabsq_f64 (x)), &d->log_tbl));
  t = vdivq_f64 (v_f64 (1), vsqrtq_f64 (t));
  float64x2_t ts = vbslq_f64 (v_u64 (0x7fffffffffffffff), t, x);
  return vdivq_f64 (v_horner_8_f64 (t, d->P_57),
                    vmulq_f64 (ts, v_horner_9_f64 (t, d->Q_57)));
}
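
/* Illustrative sketch (not part of the build): in scalar terms the extreme
   tail evaluates, per lane,

     t  = 1.0 / sqrt (-log (1.0 - fabs (x)));
     ts = copysign (t, x);
     return P_57 (t) / (ts * Q_57 (t));

   where P_57 and Q_57 stand for Horner evaluations of the polynomials whose
   coefficients are stored above, i.e. a rational approximation on the
   transformed variable t.  */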

static inline float64x2_t
lookup (const double *c, uint8x16_t idx)
{
  float64x2_t x = vld1q_f64 (c);
  return vreinterpretq_f64_u8 (vqtbl1q_u8 (vreinterpretq_u8_f64 (x), idx));
}
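
/* Illustrative sketch (not part of the build): vqtbl1q_u8 treats the loaded
   coefficient pair as a 16-byte table and builds each output byte from the
   byte its index selects, so with the idx table above

     lane indices 0..7        -> lane reads bytes 0-7  (first double, P_17)
     lane indices 0..7 + 8    -> lane reads bytes 8-15 (second double, P_37)

   i.e. adding 8 to a lane's indices switches that lane from the first to the
   second coefficient of the pair.  */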

static inline float64x2_t VPCS_ATTR
notails (float64x2_t x, const struct data *d)
{
  /* Shortcut when no input is in a tail region - no need to gather shift or
     coefficients. */
  float64x2_t t = vfmaq_f64 (v_f64 (-0.5625), x, x);
  float64x2_t p = vmulq_f64 (v_horner_6_f64 (t, d->P_17), x);
  float64x2_t q = vaddq_f64 (d->Q_17[5], t);
  for (int i = 4; i >= 0; i--)
    q = vfmaq_f64 (d->Q_17[i], q, t);
  return vdivq_f64 (p, q);
}
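
/* Illustrative sketch (not part of the build): for |x| <= 0.75 the routine
   reduces, per lane, to the scalar form

     t = x * x - 0.5625;            // shifted input, 0.5625 = 0.75^2
     p = horner (t, P_17) * x;      // degree-6 numerator, scaled by x
     q = horner (t, Q_17);          // denominator with implicit leading 1
     return p / q;

   The implicit leading 1 on the denominator is why the loop above starts
   from q = t + Q_17[5] instead of loading a seventh coefficient.  */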

/* Vector implementation of Blair et al's rational approximation to the
   inverse error function in double-precision. Largest observed error is
   24.75 ULP:
   _ZGVnN2v_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0
                                        want 0x1.ea0547268660cp+0. */
float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
{
  const struct data *d = ptr_barrier (&data);
  /* Calculate inverse error using algorithm described in
     J. M. Blair, C. A. Edwards, and J. H. Johnson,
     "Rational Chebyshev approximations for the inverse of the error function",
     Math. Comp. 30, pp. 827--830 (1976).
     https://doi.org/10.1090/S0025-5718-1976-0421040-7.

     Algorithm has 3 intervals:
     - 'Normal' region [-0.75, 0.75]
     - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
     - Extreme tail [-1, -0.9375] U [0.9375, 1]
     Normal and tail are both rational approximations of similar order on
     shifted input - these are typically performed in parallel using gather
     loads to obtain correct coefficients depending on interval. */
  uint64x2_t is_tail = vcagtq_f64 (x, v_f64 (0.75));

  if (unlikely (!v_any_u64 (is_tail)))
    /* If input is uniformly distributed in [-1, 1] then likelihood of this is
       0.75^2 ~= 0.56. */
    return notails (x, d);

  uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375));

  uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8));
  uint8x16_t idx = vaddq_u8 (vld1q_u8 (d->idx), off);

  float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625));
  t = vfmaq_f64 (t, x, x);

  float64x2_t p = lookup (&d->P[7][0], idx);
  /* Last coeff of q is either 0 or 1 - use mask instead of load. */
  float64x2_t q = vreinterpretq_f64_u64 (
      vandq_u64 (is_tail, vreinterpretq_u64_f64 (v_f64 (1))));
  for (int i = 6; i >= 0; i--)
    {
      p = vfmaq_f64 (lookup (&d->P[i][0], idx), p, t);
      q = vfmaq_f64 (lookup (&d->Q[i][0], idx), q, t);
    }
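
  /* Illustrative sketch (not part of the build): after this loop each lane
     holds a Horner evaluation of its own interval's numerator and
     denominator, e.g. for the numerator

       p = (((...(p7 * t + p6) * t + ...) * t) + p0)

     with each p_i taken from P_17 or P_37 depending on the lane.  The t^7
     coefficient of the denominator is 0 for the normal interval and 1 for
     the tail interval, which is why q starts as a per-lane mask of 1.0
     rather than another gathered load.  */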
  p = vmulq_f64 (p, x);

  if (unlikely (v_any_u64 (extreme_tail)))
    return vbslq_f64 (extreme_tail, special (x, d), vdivq_f64 (p, q));

  return vdivq_f64 (p, q);
}
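
/* Illustrative sketch (not part of the build): per lane, and ignoring the
   vector masking/selection details, the routine above computes

     double erfinv_lane (double x)
     {
       if (fabs (x) <= 0.75)                            // 'normal' region
         return rational_17 (x * x - 0.5625) * x;
       if (fabs (x) <= 0.9375)                          // tail region
         return rational_37 (x * x - 0.87890625) * x;
       double t = 1.0 / sqrt (-log (1.0 - fabs (x)));   // extreme tail
       return rational_57 (t) / copysign (t, x);
     }

   where rational_17/37/57 name the P/Q ratios built from the coefficient
   tables above; erfinv_lane and those helper names are only for
   exposition.  */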

#if USE_MPFR
# warning Not generating tests for _ZGVnN2v_erfinv, as MPFR has no suitable reference
#else
TEST_SIG (V, D, 1, erfinv, -0.99, 0.99)
TEST_ULP (V_NAME_D1 (erfinv), 24.8)
TEST_DISABLE_FENV (V_NAME_D1 (erfinv))
TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
/* Test with control lane in each interval. */
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.5)
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.8)
TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.95)
#endif