GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/erfinvf_5u.c
/*
 * Single-precision inverse error function (AdvSIMD variant).
 *
 * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
#include "v_math.h"
#include "test_sig.h"
#include "test_defs.h"
#include "v_poly_f32.h"
#include "v_logf_inline.h"

const static struct data
{
  /* We use P_N and Q_N to refer to arrays of coefficients, where P_N holds
     the coefficients of the numerator in table N of Blair et al and Q_N those
     of the denominator: P_10/Q_10 serve the normal interval, P_29/Q_29 the
     tail and P_50/Q_50 the extreme tail. Coefficients are stored in various
     interleaved formats to allow for table-based (vector-to-vector) lookup.

     Plo is the first two coefficients of P_10 and P_29 interleaved.
     PQ is the third coefficient of P_10 and the first of Q_29 interleaved.
     Qhi is the second and third coefficients of Q_29 interleaved.
     P29_3 is a homogeneous vector with the fourth coefficient of P_29.

     P_10 and Q_10 are also stored in homogeneous vectors to allow better
     memory access when no lanes are in a tail region.  */
  float Plo[4], PQ[4], Qhi[4];
  float32x4_t P29_3, tailshift;
  float32x4_t P_50[6], Q_50[2];
  float32x4_t P_10[3], Q_10[3];
  uint8_t idxhi[16], idxlo[16];
  struct v_logf_data logf_tbl;
} data = {
  .idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 },
  .idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 },
  .P29_3 = V4 (0x1.b13626p-2),
  .tailshift = V4 (-0.87890625),
  .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 },
  .PQ = { -0x1.293ff6p+3, -0x1.f59ee2p+0, -0x1.8265eep+3, -0x1.69952p-4 },
  .Qhi = { 0x1.ef5eaep+4, 0x1.c7b7d2p-1, -0x1.12665p+4, -0x1.167d7p+1 },
  .P_50 = { V4 (0x1.3d8948p-3), V4 (0x1.61f9eap+0), V4 (0x1.61c6bcp-1),
            V4 (-0x1.20c9f2p+0), V4 (0x1.5c704cp-1), V4 (-0x1.50c6bep-3) },
  .Q_50 = { V4 (0x1.3d7dacp-3), V4 (0x1.629e5p+0) },
  .P_10 = { V4 (-0x1.a31268p+3), V4 (0x1.ac9048p+4), V4 (-0x1.293ff6p+3) },
  .Q_10 = { V4 (-0x1.8265eep+3), V4 (0x1.ef5eaep+4), V4 (-0x1.12665p+4) },
  .logf_tbl = V_LOGF_CONSTANTS
};

static inline float32x4_t
special (float32x4_t x, const struct data *d)
{
  /* Note erfinvf(inf) should return NaN, and erfinvf(1) should return Inf.
     By using log here, instead of log1p, we return finite values for both
     these inputs, and for values outside [-1, 1]. This is non-compliant, but
     is an acceptable optimisation at Ofast. To get correct behaviour for all
     finite values, use the log1pf_inline helper on -abs(x) - note that
     erfinvf(inf) will still be finite.  */
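  /* Extreme tail: let t = 1 / sqrt (-log (1 - |x|)). The result is evaluated
     as P_50 (t) / (t * (t^2 + Q_50[1] * t + Q_50[0])), with the sign of x
     folded back in via ts (the magnitude of t with the sign of x).  */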
  float32x4_t t = vdivq_f32 (
      v_f32 (1), vsqrtq_f32 (vnegq_f32 (v_logf_inline (
                     vsubq_f32 (v_f32 (1), vabsq_f32 (x)), &d->logf_tbl))));
  float32x4_t ts = vbslq_f32 (v_u32 (0x7fffffff), t, x);
  float32x4_t q = vfmaq_f32 (d->Q_50[0], vaddq_f32 (t, d->Q_50[1]), t);
  return vdivq_f32 (v_horner_5_f32 (t, d->P_50), vmulq_f32 (ts, q));
}

static inline float32x4_t
notails (float32x4_t x, const struct data *d)
{
  /* Shortcut when no input is in a tail region - no need to gather shift or
     coefficients.  */
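  /* Here t = x * x - 0.5625. The numerator is x * (P_10[0] + P_10[1] * t
     + P_10[2] * t^2) and the denominator the monic cubic
     t^3 + Q_10[2] * t^2 + Q_10[1] * t + Q_10[0], both expanded by Horner.  */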
  float32x4_t t = vfmaq_f32 (v_f32 (-0.5625), x, x);
  float32x4_t q = vaddq_f32 (t, d->Q_10[2]);
  q = vfmaq_f32 (d->Q_10[1], t, q);
  q = vfmaq_f32 (d->Q_10[0], t, q);

  return vdivq_f32 (vmulq_f32 (x, v_horner_2_f32 (t, d->P_10)), q);
}

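/* Permute the bytes of tbl according to idx: each 32-bit output lane receives
   one 32-bit lane of tbl, selected per lane by the indices built in the main
   routine below.  */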
static inline float32x4_t
lookup (float32x4_t tbl, uint8x16_t idx)
{
  return vreinterpretq_f32_u8 (vqtbl1q_u8 (vreinterpretq_u8_f32 (tbl), idx));
}

/* Vector implementation of Blair et al's rational approximation to inverse
   error function in single-precision. Worst-case error is 4.98 ULP, in the
   tail region:
   _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0
                                  want 0x1.b4793ap+0.  */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erfinv) (float32x4_t x)
{
  const struct data *d = ptr_barrier (&data);

  /* Calculate the inverse error function using the algorithm described in
     J. M. Blair, C. A. Edwards, and J. H. Johnson,
     "Rational Chebyshev approximations for the inverse of the error
     function", Math. Comp. 30, pp. 827--830 (1976).
     https://doi.org/10.1090/S0025-5718-1976-0421040-7.

     The algorithm has 3 intervals:
     - 'Normal' region [-0.75, 0.75]
     - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
     - Extreme tail [-1, -0.9375] U [0.9375, 1]
     Normal and tail are both rational approximations of similar order on
     shifted input - these are typically performed in parallel using gather
     loads to obtain the correct coefficients depending on interval.  */
  uint32x4_t is_tail = vcageq_f32 (x, v_f32 (0.75));
  uint32x4_t extreme_tail = vcageq_f32 (x, v_f32 (0.9375));

  if (unlikely (!v_any_u32 (is_tail)))
    /* Shortcut for when all lanes are in [-0.75, 0.75] - can avoid having to
       gather coefficients. If input is uniform in [-1, 1] then the likelihood
       of this is 0.75^4 ~= 0.31.  */
    return notails (x, d);

  /* Select the requisite shift depending on interval: the polynomial is
     evaluated on x * x - shift.
     Normal shift = 0.5625
     Tail shift = 0.87890625.  */
  float32x4_t t
      = vfmaq_f32 (vbslq_f32 (is_tail, d->tailshift, v_f32 (-0.5625)), x, x);

  /* Calculate indexes for tbl: tbl is byte-wise, so:
     [0, 1, 2, 3, 4, 5, 6, ....] copies the vector
     Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores
     two pairs of coeffs, so we need two idx vectors - one for each pair.  */
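  /* In tail lanes is_tail is all-ones, so off is 4 in every byte and the
     lookup selects the second (tail) coefficient of each interleaved pair; in
     normal lanes off is 0 and the first coefficient is selected.  */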
  uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4));
  uint8x16_t idx_lo = vaddq_u8 (vld1q_u8 (d->idxlo), off);
  uint8x16_t idx_hi = vaddq_u8 (vld1q_u8 (d->idxhi), off);

  /* Load the tables.  */
  float32x4_t plo = vld1q_f32 (d->Plo);
  float32x4_t pq = vld1q_f32 (d->PQ);
  float32x4_t qhi = vld1q_f32 (d->Qhi);

  /* Do the lookup (and calculate p3 by masking non-tail lanes).  */
  float32x4_t p3 = vreinterpretq_f32_u32 (
      vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3)));
  float32x4_t p0 = lookup (plo, idx_lo), p1 = lookup (plo, idx_hi),
              p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi),
              q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi);

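  /* Numerator: x * (p0 + p1 * t + p2 * t^2 + p3 * t^3) by Horner; in normal
     lanes p3 is zero, so the cubic term vanishes there. Denominator: the
     monic cubic t^3 + q2 * t^2 + q1 * t + q0.  */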
  float32x4_t p = vfmaq_f32 (p2, p3, t);
  p = vfmaq_f32 (p1, p, t);
  p = vfmaq_f32 (p0, p, t);
  p = vmulq_f32 (x, p);

  float32x4_t q = vfmaq_f32 (q1, vaddq_f32 (q2, t), t);
  q = vfmaq_f32 (q0, q, t);

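  /* special is evaluated for the whole vector, but only blended into the
     lanes that lie in the extreme tail; all other lanes keep p / q.  */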
  if (unlikely (v_any_u32 (extreme_tail)))
    /* At least one lane is in the extreme tail - if input is uniform in
       [-1, 1] the likelihood of this is ~0.23.  */
    return vbslq_f32 (extreme_tail, special (x, d), vdivq_f32 (p, q));

  return vdivq_f32 (p, q);
}

HALF_WIDTH_ALIAS_F1 (erfinv)

#if USE_MPFR
# warning Not generating tests for _ZGVnN4v_erfinvf, as MPFR has no suitable reference
#else
TEST_SIG (V, F, 1, erfinv, -0.99, 0.99)
TEST_DISABLE_FENV (V_NAME_F1 (erfinv))
TEST_ULP (V_NAME_F1 (erfinv), 4.49)
TEST_SYM_INTERVAL (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000)
/* Test with a control lane in each interval.  */
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.5)
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.8)
TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.95)
#endif