/* Source: contrib/arm-optimized-routines/math/aarch64/sve/powf.c */
/*
 * Single-precision SVE powf function.
 *
 * Copyright (c) 2023-2025, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "sv_math.h"
#include "test_sig.h"
#include "test_defs.h"

/* The following data is used in the SVE pow core computation
   and special case detection.  */
#define Tinvc __v_powf_data.invc
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
#define Norm 0x1p23f /* 0x4b000000.  */

/* Overall ULP error bound for pow is 2.6 ulp
   ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
static const struct data
{
  double log_poly[4];
  double exp_poly[3];
  float uflow_bound, oflow_bound, small_bound;
  uint32_t sign_bias, subnormal_bias, off;
} data = {
  /* rel err: 1.5 * 2^-30.  Each coefficient is multiplied by the value of
     V_POWF_EXP2_N.  */
  .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
		-0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
  /* rel err: 1.69 * 2^-34.  */
  .exp_poly = {
    0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3.  */
    0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2.  */
    0x1.62e42ff0c52d6p-6,  /* A2 / V_POWF_EXP2_N.  */
  },
  .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N.  */
  .oflow_bound = 0x1p+12f,     /* 128.0 * V_POWF_EXP2_N.  */
  .small_bound = 0x1p-126f,
  .off = 0x3f35d000,
  .sign_bias = SignBias,
  .subnormal_bias = 0x0b800000, /* 23 << 23.  */
};

/* Shorthand accessors for the log2 and exp2 polynomial coefficients above;
   'd' must be the struct data pointer in scope at the point of use.  */
#define A(i) sv_f64 (d->log_poly[i])
#define C(i) sv_f64 (d->exp_poly[i])

/* Check if x is an integer.  */
static inline svbool_t
svisint (svbool_t pg, svfloat32_t x)
{
  return svcmpeq (pg, svrintz_z (pg, x), x);
}

/* Check if x is real not integer valued.  */
static inline svbool_t
svisnotint (svbool_t pg, svfloat32_t x)
{
  return svcmpne (pg, svrintz_z (pg, x), x);
}

/* Check if x is an odd integer.  An odd integer halved is not an integer,
   hence x/2 failing the integer test identifies odd x.  */
static inline svbool_t
svisodd (svbool_t pg, svfloat32_t x)
{
  svfloat32_t y = svmul_x (pg, x, 0.5f);
  return svisnotint (pg, y);
}

/* Check if zero, inf or nan.  The doubled bit pattern minus 1 maps all of
   these (and only these) above 2 * 0x7f800000 - 1, folding both signs.  */
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint32_t i)
{
  return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
		  2u * 0x7f800000 - 1);
}

/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
   the bit representation of a non-zero finite floating-point value.  */
static inline int
checkint (uint32_t iy)
{
  int e = iy >> 23 & 0xff;
  if (e < 0x7f)
    return 0; /* |y| < 1: cannot be a non-zero integer.  */
  if (e > 0x7f + 23)
    return 2; /* All bits of the mantissa are integral: even.  */
  if (iy & ((1 << (0x7f + 23 - e)) - 1))
    return 0; /* Non-zero fraction bits: not an integer.  */
  if (iy & (1 << (0x7f + 23 - e)))
    return 1; /* Lowest integral bit set: odd.  */
  return 2;
}

/* Check if zero, inf or nan (scalar counterpart of sv_zeroinfnan).  */
static inline int
zeroinfnan (uint32_t ix)
{
  return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
}

/* A scalar subroutine used to fix main power special cases.  Similar to the
   preamble of scalar powf except that we do not update ix and sign_bias.  This
   is done in the preamble of the SVE powf.  */
static inline float
powf_specialcase (float x, float y, float z)
{
  uint32_t ix = asuint (x);
  uint32_t iy = asuint (y);
  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
  if (unlikely (zeroinfnan (iy)))
    {
      if (2 * iy == 0)
	return issignalingf_inline (x) ? x + y : 1.0f;
      if (ix == 0x3f800000)
	return issignalingf_inline (y) ? x + y : 1.0f;
      if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
	return x + y;
      if (2 * ix == 2 * 0x3f800000)
	return 1.0f;
      if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
	return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
      return y * y;
    }
  if (unlikely (zeroinfnan (ix)))
    {
      float_t x2 = x * x;
      if (ix & 0x80000000 && checkint (iy) == 1)
	x2 = -x2;
      return iy & 0x80000000 ? 1 / x2 : x2;
    }
  /* We need a return here in case x<0 and y is integer, but all other tests
     need to be run.  */
  return z;
}

/* Scalar fallback for special case routines with custom signature.
   Iterates over the lanes flagged special and patches each with the
   scalar powf_specialcase result.  */
static svfloat32_t NOINLINE
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
{
  /* Special cases of x or y: zero, inf and nan.  */
  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);

  /* Walk the set predicate lanes one at a time.  */
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      float sx1 = svclastb (p, 0, x1);
      float sx2 = svclastb (p, 0, x2);
      float elem = svclastb (p, 0, y);
      elem = powf_specialcase (sx1, sx2, elem);
      svfloat32_t y2 = sv_f32 (elem);
      y = svsel (p, y2, y);
      p = svpnext_b32 (cmp, p);
    }
  return y;
}

/* Compute core for half of the lanes in double precision.
   On return *pylogx holds y * log2(x) (scaled by V_POWF_EXP2_N), which the
   caller uses for underflow/overflow detection.  */
static inline svfloat64_t
sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
		  svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
		  const struct data *d)
{
  svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
  svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);

  /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k.  */
  svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
  svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));

  /* Polynomial to approximate log1p(r)/ln2.  */
  svfloat64_t logx = A (0);
  logx = svmad_x (pg, r, logx, A (1));
  logx = svmad_x (pg, r, logx, A (2));
  logx = svmad_x (pg, r, logx, A (3));
  logx = svmad_x (pg, r, logx, y0);
  *pylogx = svmul_x (pg, y, logx);

  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
  svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
  svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));

  r = svsub_x (pg, *pylogx, kd);

  /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).  */
  svuint64_t t = svld1_gather_index (
      svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
  svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
  t = svadd_x (svptrue_b64 (), t,
	       svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
  svfloat64_t s = svreinterpret_f64 (t);

  svfloat64_t p = C (0);
  p = svmla_x (pg, C (1), p, r);
  p = svmla_x (pg, C (2), p, r);
  p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));

  return p;
}

/* Widen vector to double precision and compute core on both halves of the
   vector.  Lower cost of promotion by considering all lanes active.  */
static inline svfloat32_t
sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
	      svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
	      const struct data *d)
{
  const svbool_t ptrue = svptrue_b64 ();

  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
     in order to perform core computation in double precision.  */
  const svbool_t pg_lo = svunpklo (pg);
  const svbool_t pg_hi = svunpkhi (pg);
  svfloat64_t y_lo
      = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
  svfloat64_t y_hi
      = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
  svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
  svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
  svuint64_t i_lo = svunpklo (i);
  svuint64_t i_hi = svunpkhi (i);
  svint64_t k_lo = svunpklo (k);
  svint64_t k_hi = svunpkhi (k);
  svuint64_t sign_bias_lo = svunpklo (sign_bias);
  svuint64_t sign_bias_hi = svunpkhi (sign_bias);

  /* Compute each part in double precision.  */
  svfloat64_t ylogx_lo, ylogx_hi;
  svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
				     sign_bias_lo, &ylogx_lo, d);
  svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
				     sign_bias_hi, &ylogx_hi, d);

  /* Convert back to single-precision and interleave.  */
  svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
  svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
  *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
  svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
  svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
  return svuzp1 (lo_32, hi_32);
}

/* Implementation of SVE powf.
   Provides the same accuracy as AdvSIMD powf, since it relies on the same
   algorithm.  The theoretical maximum error is under 2.60 ULPs.
   Maximum measured error is 2.57 ULPs:
   SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
						    want 0x1.fff862p+127.  */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  svuint32_t vix0 = svreinterpret_u32 (x);
  svuint32_t viy0 = svreinterpret_u32 (y);

  /* Negative x cases.  */
  svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));

  /* Set sign_bias and ix depending on sign of x and nature of y.  */
  svbool_t yint_or_xpos = pg;
  svuint32_t sign_bias = sv_u32 (0);
  svuint32_t vix = vix0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y.  */
      yint_or_xpos = svisint (xisneg, y);
      svbool_t yisodd_xisneg = svisodd (xisneg, y);
      /* ix set to abs(ix) if y is integer.  */
      vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
      /* Set to SignBias if x is negative and y is odd.  */
      sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
    }

  /* Special cases of x or y: zero, inf and nan.  */
  svbool_t xspecial = sv_zeroinfnan (pg, vix0);
  svbool_t yspecial = sv_zeroinfnan (pg, viy0);
  svbool_t cmp = svorr_z (pg, xspecial, yspecial);

  /* Small cases of x: |x| < 0x1p-126.  */
  svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
  if (unlikely (svptest_any (yint_or_xpos, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative.  */
      svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
      vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
      vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
      vix = svsel (xsmall, vix_norm, vix);
    }
  /* Part of core computation carried in working precision: split the biased
     exponent/mantissa into table index i, scaled mantissa iz and exponent
     k for the log2 approximation.  */
  svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
  svuint32_t i = svand_x (
      yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
      V_POWF_LOG2_N - 1);
  svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
  svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
  svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
			 (23 - V_POWF_EXP2_TABLE_BITS));

  /* Compute core in extended precision and return intermediate ylogx results
     to handle cases of underflow and overflow in exp.  */
  svfloat32_t ylogx;
  svfloat32_t ret
      = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);

  /* Handle exp special cases of underflow and overflow.  */
  svuint32_t sign
      = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
  svfloat32_t ret_oflow
      = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
  svfloat32_t ret_uflow = svreinterpret_f32 (sign);
  ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
  ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);

  /* Cases of finite y and finite negative x.  */
  ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));

  if (unlikely (svptest_any (cmp, cmp)))
    return sv_call_powf_sc (x, y, ret);

  return ret;
}

TEST_SIG (SV, F, 2, pow)
TEST_ULP (SV_NAME_F2 (pow), 2.08)
TEST_DISABLE_FENV (SV_NAME_F2 (pow))
/* Wide intervals spanning the whole domain but shared between x and y.  */
#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n)                              \
  TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n)                    \
  TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n)                  \
  TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n)                  \
  TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
/* x~1 or y~1.  */
SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* around estimated argmaxs of ULP error.  */
SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer.  */
TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer.  */
SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y.  */
SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y.  */
TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
CLOSE_SVE_ATTR