Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/basis_universal/encoder/cppspmd_math.h
9902 views
1
// Do not include this header directly.
2
//
3
// Copyright 2020-2024 Binomial LLC
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License");
6
// you may not use this file except in compliance with the License.
7
// You may obtain a copy of the License at
8
//
9
// http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
17
// The general goal of these vectorized estimated math functions is scalability/performance.
18
// There are explictly no checks NaN's/Inf's on the input arguments. There are no assertions either.
19
// These are fast estimate functions - if you need more than that, use stdlib. Please do a proper
20
// engineering analysis before relying on them.
21
// I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors.
22
// I compared each to the ones in DirectXMath and stdlib's for accuracy/performance.
23
24
CPPSPMD_FORCE_INLINE vfloat fmod_inv(const vfloat& a, const vfloat& b, const vfloat& b_inv)
25
{
26
vfloat c = frac(abs(a * b_inv)) * abs(b);
27
return spmd_ternaryf(a < 0, -c, c);
28
}
29
30
CPPSPMD_FORCE_INLINE vfloat fmod_inv_p(const vfloat& a, const vfloat& b, const vfloat& b_inv)
31
{
32
return frac(a * b_inv) * b;
33
}
34
35
// Avoids dividing by zero or very small values.
36
CPPSPMD_FORCE_INLINE vfloat safe_div(vfloat a, vfloat b, float fDivThresh = 1e-7f)
37
{
38
return a / spmd_ternaryf( abs(b) > fDivThresh, b, spmd_ternaryf(b < 0.0f, -fDivThresh, fDivThresh) );
39
}
40
41
/*
42
clang 9.0.0 for win /fp:precise release
43
f range: 0.0000000000001250 10000000000.0000000000000000, vals: 1073741824
44
45
log2_est():
46
max abs err: 0.0000023076808731
47
max rel err: 0.0000000756678881
48
avg abs err: 0.0000007535452724
49
avg rel err: 0.0000000235117843
50
51
XMVectorLog2():
52
max abs err: 0.0000023329709933
53
max rel err: 0.0000000826961046
54
avg abs err: 0.0000007564889684
55
avg rel err: 0.0000000236051899
56
57
std::log2f():
58
max abs err: 0.0000020265979401
59
max rel err: 0.0000000626647654
60
avg abs err: 0.0000007494445227
61
avg rel err: 0.0000000233800985
62
*/
63
64
// See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/
65
inline vfloat spmd_kernel::log2_est(vfloat v)
66
{
67
vfloat signif, fexp;
68
69
// Just clamp to a very small value, instead of checking for invalid inputs.
70
vfloat x = max(v, 2.2e-38f);
71
72
/*
73
* Assume IEEE representation, which is sgn(1):exp(8):frac(23)
74
* representing (1+frac)*2^(exp-127). Call 1+frac the significand
75
*/
76
77
// get exponent
78
vint ux1_i = cast_vfloat_to_vint(x);
79
80
vint exp = VUINT_SHIFT_RIGHT(ux1_i & 0x7F800000, 23);
81
82
// actual exponent is exp-127, will subtract 127 later
83
84
vint ux2_i;
85
vfloat ux2_f;
86
87
vint greater = ux1_i & 0x00400000; // true if signif > 1.5
88
SPMD_SIF(greater != 0)
89
{
90
// signif >= 1.5 so need to divide by 2. Accomplish this by stuffing exp = 126 which corresponds to an exponent of -1
91
store_all(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f000000);
92
93
store_all(ux2_f, cast_vint_to_vfloat(ux2_i));
94
95
// 126 instead of 127 compensates for division by 2
96
store_all(fexp, vfloat(exp - 126));
97
}
98
SPMD_SELSE(greater != 0)
99
{
100
// get signif by stuffing exp = 127 which corresponds to an exponent of 0
101
store(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f800000);
102
103
store(ux2_f, cast_vint_to_vfloat(ux2_i));
104
105
store(fexp, vfloat(exp - 127));
106
}
107
SPMD_SENDIF
108
109
store_all(signif, ux2_f);
110
store_all(signif, signif - 1.0f);
111
112
const float a = 0.1501692f, b = 3.4226132f, c = 5.0225057f, d = 4.1130283f, e = 3.4813372f;
113
114
vfloat xm1 = signif;
115
vfloat xm1sqr = xm1 * xm1;
116
117
return fexp + ((a * (xm1sqr * xm1) + b * xm1sqr + c * xm1) / (xm1sqr + d * xm1 + e));
118
119
// fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?)
120
//return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e)));
121
}
122
123
// Uses log2_est(), so this function must be <= the precision of that.
124
inline vfloat spmd_kernel::log_est(vfloat v)
125
{
126
return log2_est(v) * 0.693147181f;
127
}
128
129
CPPSPMD_FORCE_INLINE void spmd_kernel::reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment)
130
{
131
// Assume we're using equation (2)
132
store_all(adjustment, 0);
133
134
// integer part of the input argument
135
vint int_arg = (vint)arg;
136
137
// if frac(arg) is in [0.5, 1.0]...
138
SPMD_SIF((arg - int_arg) > 0.5f)
139
{
140
store(adjustment, 1);
141
142
// then change it to [0.0, 0.5]
143
store(arg, arg - 0.5f);
144
}
145
SPMD_SENDIF
146
147
// arg == just the fractional part
148
store_all(arg, arg - (vfloat)int_arg);
149
150
// Now compute 2** (int) arg.
151
store_all(int_arg, min(int_arg + 127, 254));
152
153
store_all(two_int_a, cast_vint_to_vfloat(VINT_SHIFT_LEFT(int_arg, 23)));
154
}
155
156
/*
157
clang 9.0.0 for win /fp:precise release
158
f range : -50.0000000000000000 49.9999940395355225, vals : 16777216
159
160
exp2_est():
161
Total passed near - zero check : 16777216
162
Total sign diffs : 0
163
max abs err: 1668910609.7500000000000000
164
max rel err: 0.0000015642030031
165
avg abs err: 10793794.4007573910057545
166
avg rel err: 0.0000003890893282
167
168
XMVectorExp2():
169
Total passed near-zero check: 16777216
170
Total sign diffs: 0
171
max abs err: 1665552836.8750000000000000
172
max rel err: 0.0000114674862370
173
avg abs err: 10771868.2627860084176064
174
avg rel err: 0.0000011218880770
175
176
std::exp2f():
177
Total passed near-zero check: 16777216
178
Total sign diffs: 0
179
max abs err: 1591636585.6250000000000000
180
max rel err: 0.0000014849731018
181
avg abs err: 10775800.3204844966530800
182
avg rel err: 0.0000003851496422
183
*/
184
185
// http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm
186
inline vfloat spmd_kernel::exp2_est(vfloat arg)
187
{
188
SPMD_BEGIN_CALL
189
190
const vfloat P00 = +7.2152891521493f;
191
const vfloat P01 = +0.0576900723731f;
192
const vfloat Q00 = +20.8189237930062f;
193
const vfloat Q01 = +1.0f;
194
const vfloat sqrt2 = 1.4142135623730950488f; // sqrt(2) for scaling
195
196
vfloat result = 0.0f;
197
198
// Return 0 if arg is too large.
199
// We're not introducing inf/nan's into calculations, or risk doing so by returning huge default values.
200
SPMD_IF(abs(arg) > 126.0f)
201
{
202
spmd_return();
203
}
204
SPMD_END_IF
205
206
// 2**(int(a))
207
vfloat two_int_a;
208
209
// set to 1 by reduce_expb
210
vint adjustment;
211
212
// 0 if arg is +; 1 if negative
213
vint negative = 0;
214
215
// If the input is negative, invert it. At the end we'll take the reciprocal, since n**(-1) = 1/(n**x).
216
SPMD_SIF(arg < 0.0f)
217
{
218
store(arg, -arg);
219
store(negative, 1);
220
}
221
SPMD_SENDIF
222
223
store_all(arg, min(arg, 126.0f));
224
225
// reduce to [0.0, 0.5]
226
reduce_expb(arg, two_int_a, adjustment);
227
228
// The format of the polynomial is:
229
// answer=(Q(x**2) + x*P(x**2))/(Q(x**2) - x*P(x**2))
230
//
231
// The following computes the polynomial in several steps:
232
233
// Q(x**2)
234
vfloat Q = vfma(Q01, (arg * arg), Q00);
235
236
// x*P(x**2)
237
vfloat x_P = arg * (vfma(P01, arg * arg, P00));
238
239
vfloat answer = (Q + x_P) / (Q - x_P);
240
241
// Now correct for the scaling factor of 2**(int(a))
242
store_all(answer, answer * two_int_a);
243
244
// If the result had a fractional part > 0.5, correct for that
245
store_all(answer, spmd_ternaryf(adjustment != 0, answer * sqrt2, answer));
246
247
// Correct for a negative input
248
SPMD_SIF(negative != 0)
249
{
250
store(answer, 1.0f / answer);
251
}
252
SPMD_SENDIF
253
254
store(result, answer);
255
256
return result;
257
}
258
259
inline vfloat spmd_kernel::exp_est(vfloat arg)
260
{
261
// e^x = exp2(x / log_base_e(2))
262
// constant is 1.0/(log(2)/log(e)) or 1/log(2)
263
return exp2_est(arg * 1.44269504f);
264
}
265
266
inline vfloat spmd_kernel::pow_est(vfloat arg1, vfloat arg2)
267
{
268
return exp_est(log_est(arg1) * arg2);
269
}
270
271
/*
272
clang 9.0.0 for win /fp:precise release
273
Total near-zero: 144, output above near-zero tresh: 30
274
Total near-zero avg: 0.0000067941016621 max: 0.0000134706497192
275
Total near-zero sign diffs: 5
276
Total passed near-zero check: 16777072
277
Total sign diffs: 5
278
max abs err: 0.0000031375306036
279
max rel err: 0.1140846017075028
280
avg abs err: 0.0000003026226621
281
avg rel err: 0.0000033564977623
282
*/
283
284
// Math from this web page: http://developer.download.nvidia.com/cg/sin.html
285
// This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est().
286
inline vfloat spmd_kernel::sincos_est_a(vfloat a, bool sin_flag)
287
{
288
const float c0_x = 0.0f, c0_y = 0.5f, c0_z = 1.0f;
289
const float c1_x = 0.25f, c1_y = -9.0f, c1_z = 0.75f, c1_w = 0.159154943091f;
290
const float c2_x = 24.9808039603f, c2_y = -24.9808039603f, c2_z = -60.1458091736f, c2_w = 60.1458091736f;
291
const float c3_x = 85.4537887573f, c3_y = -85.4537887573f, c3_z = -64.9393539429f, c3_w = 64.9393539429f;
292
const float c4_x = 19.7392082214f, c4_y = -19.7392082214f, c4_z = -1.0f, c4_w = 1.0f;
293
294
vfloat r0_x, r0_y, r0_z, r1_x, r1_y, r1_z, r2_x, r2_y, r2_z;
295
296
store_all(r1_x, sin_flag ? vfms(c1_w, a, c1_x) : c1_w * a);
297
298
store_all(r1_y, frac(r1_x));
299
300
store_all(r2_x, (vfloat)(r1_y < c1_x));
301
302
store_all(r2_y, (vfloat)(r1_y >= c1_y));
303
store_all(r2_z, (vfloat)(r1_y >= c1_z));
304
305
store_all(r2_y, vfma(r2_x, c4_z, vfma(r2_y, c4_w, r2_z * c4_z)));
306
307
store_all(r0_x, c0_x - r1_y);
308
store_all(r0_y, c0_y - r1_y);
309
store_all(r0_z, c0_z - r1_y);
310
311
store_all(r0_x, r0_x * r0_x);
312
store_all(r0_y, r0_y * r0_y);
313
store_all(r0_z, r0_z * r0_z);
314
315
store_all(r1_x, vfma(c2_x, r0_x, c2_z));
316
store_all(r1_y, vfma(c2_y, r0_y, c2_w));
317
store_all(r1_z, vfma(c2_x, r0_z, c2_z));
318
319
store_all(r1_x, vfma(r1_x, r0_x, c3_x));
320
store_all(r1_y, vfma(r1_y, r0_y, c3_y));
321
store_all(r1_z, vfma(r1_z, r0_z, c3_x));
322
323
store_all(r1_x, vfma(r1_x, r0_x, c3_z));
324
store_all(r1_y, vfma(r1_y, r0_y, c3_w));
325
store_all(r1_z, vfma(r1_z, r0_z, c3_z));
326
327
store_all(r1_x, vfma(r1_x, r0_x, c4_x));
328
store_all(r1_y, vfma(r1_y, r0_y, c4_y));
329
store_all(r1_z, vfma(r1_z, r0_z, c4_x));
330
331
store_all(r1_x, vfma(r1_x, r0_x, c4_z));
332
store_all(r1_y, vfma(r1_y, r0_y, c4_w));
333
store_all(r1_z, vfma(r1_z, r0_z, c4_z));
334
335
store_all(r0_x, vfnma(r1_x, r2_x, vfnma(r1_y, r2_y, r1_z * -r2_z)));
336
337
return r0_x;
338
}
339
340
// positive values only
341
CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1(const vfloat& q)
342
{
343
//const int mag = 0x7EF312AC; // 2 NR iters, 3 is 0x7EEEEBB3
344
const int mag = 0x7EF311C3;
345
const float fMinThresh = .0000125f;
346
347
vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));
348
349
vint x_l = vint(mag) - cast_vfloat_to_vint(l);
350
351
vfloat rcp_l = cast_vint_to_vfloat(x_l);
352
353
return rcp_l * vfnma(rcp_l, q, 2.0f);
354
}
355
356
CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1_pn(const vfloat& t)
357
{
358
//const int mag = 0x7EF312AC; // 2 NR iters, 3 is 0x7EEEEBB3
359
const int mag = 0x7EF311C3;
360
const float fMinThresh = .0000125f;
361
362
vfloat s = sign(t);
363
vfloat q = abs(t);
364
365
vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));
366
367
vint x_l = vint(mag) - cast_vfloat_to_vint(l);
368
369
vfloat rcp_l = cast_vint_to_vfloat(x_l);
370
371
return rcp_l * vfnma(rcp_l, q, 2.0f) * s;
372
}
373
374
// https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
375
// https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt
376
CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est1(vfloat x0)
377
{
378
vfloat xhalf = 0.5f * x0;
379
vfloat x = cast_vint_to_vfloat(vint(0x5F375A82) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
380
return x * vfnma(xhalf * x, x, 1.5008909f);
381
}
382
383
CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est2(vfloat x0)
384
{
385
vfloat xhalf = 0.5f * x0;
386
vfloat x = cast_vint_to_vfloat(vint(0x5F37599E) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
387
vfloat x1 = x * vfnma(xhalf * x, x, 1.5);
388
vfloat x2 = x1 * vfnma(xhalf * x1, x1, 1.5);
389
return x2;
390
}
391
392
// Math from: http://developer.download.nvidia.com/cg/atan2.html
393
// TODO: Needs more validation, parameter checking.
394
CPPSPMD_FORCE_INLINE vfloat spmd_kernel::atan2_est(vfloat y, vfloat x)
395
{
396
vfloat t1 = abs(y);
397
vfloat t3 = abs(x);
398
399
vfloat t0 = max(t3, t1);
400
store_all(t1, min(t3, t1));
401
402
store_all(t3, t1 / t0);
403
404
vfloat t4 = t3 * t3;
405
store_all(t0, vfma(-0.013480470f, t4, 0.057477314f));
406
store_all(t0, vfms(t0, t4, 0.121239071f));
407
store_all(t0, vfma(t0, t4, 0.195635925f));
408
store_all(t0, vfms(t0, t4, 0.332994597f));
409
store_all(t0, vfma(t0, t4, 0.999995630f));
410
store_all(t3, t0 * t3);
411
412
store_all(t3, spmd_ternaryf(abs(y) > abs(x), vfloat(1.570796327f) - t3, t3));
413
414
store_all(t3, spmd_ternaryf(x < 0.0f, vfloat(3.141592654f) - t3, t3));
415
store_all(t3, spmd_ternaryf(y < 0.0f, -t3, t3));
416
417
return t3;
418
}
419
420
/*
421
clang 9.0.0 for win /fp:precise release
422
Tested range: -25.1327412287183449 25.1327382326621169, vals : 16777216
423
Skipped angles near 90/270 within +- .001 radians.
424
Near-zero threshold: .0000125f
425
Near-zero output above check threshold: 1e-6f
426
427
Total near-zero: 144, output above near-zero tresh: 20
428
Total near-zero avg: 0.0000067510751968 max: 0.0000133514404297
429
Total near-zero sign diffs: 5
430
Total passed near-zero check: 16766400
431
Total sign diffs: 5
432
max abs err: 1.4982600811139264
433
max rel err: 0.1459155900188041
434
avg rel err: 0.0000054659502568
435
436
XMVectorTan() precise:
437
Total near-zero: 144, output above near-zero tresh: 18
438
Total near-zero avg: 0.0000067641216186 max: 0.0000133524126795
439
Total near-zero sign diffs: 0
440
Total passed near-zero check: 16766400
441
Total sign diffs: 0
442
max abs err: 1.9883573246424930
443
max rel err: 0.1459724171926864
444
avg rel err: 0.0000054965766843
445
446
std::tanf():
447
Total near-zero: 144, output above near-zero tresh: 0
448
Total near-zero avg: 0.0000067116930779 max: 0.0000127713074107
449
Total near-zero sign diffs: 11
450
Total passed near-zero check: 16766400
451
Total sign diffs: 11
452
max abs err: 0.8989131818294709
453
max rel err: 0.0573181403173166
454
avg rel err: 0.0000030791301203
455
456
Originally from:
457
http://www.ganssle.com/approx.htm
458
*/
459
460
CPPSPMD_FORCE_INLINE vfloat spmd_kernel::tan82(vfloat x)
461
{
462
// Original double version was 8.2 digits
463
//double c1 = 211.849369664121f, c2 = -12.5288887278448f, c3 = 269.7350131214121f, c4 = -71.4145309347748f;
464
// Tuned float constants for lower avg rel error (without using FMA3):
465
const float c1 = 211.849350f, c2 = -12.5288887f, c3 = 269.734985f, c4 = -71.4145203f;
466
vfloat x2 = x * x;
467
return (x * (vfma(c2, x2, c1)) / (vfma(x2, (c4 + x2), c3)));
468
}
469
470
// Don't call this for angles close to 90/270!.
471
inline vfloat spmd_kernel::tan_est(vfloat x)
472
{
473
const float fPi = 3.141592653589793f, fOneOverPi = 0.3183098861837907f;
474
CPPSPMD_DECL(const uint8_t, s_table0[16]) = { 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4 };
475
476
vint table = init_lookup4(s_table0); // a load
477
vint sgn = cast_vfloat_to_vint(x) & 0x80000000;
478
479
store_all(x, abs(x));
480
vfloat orig_x = x;
481
482
vfloat q = x * fOneOverPi;
483
store_all(x, q - floor(q));
484
485
vfloat x4 = x * 4.0f;
486
vint octant = (vint)(x4);
487
488
vfloat x0 = spmd_ternaryf((octant & 1) != 0, -x4, x4);
489
490
vint k = table_lookup4_8(octant, table) & 0xFF; // a shuffle
491
492
vfloat bias = (vfloat)k + -128.0f;
493
vfloat y = x0 + bias;
494
495
vfloat z = tan82(y);
496
497
vfloat r;
498
499
vbool octant_one_or_two = (octant == 1) || (octant == 2);
500
501
// SPMD optimization - skip costly divide if we can
502
if (spmd_any(octant_one_or_two))
503
{
504
const float fDivThresh = .4371e-7f;
505
vfloat one_over_z = 1.0f / spmd_ternaryf(abs(z) > fDivThresh, z, spmd_ternaryf(z < 0.0f, -fDivThresh, fDivThresh));
506
507
vfloat b = spmd_ternaryf(octant_one_or_two, one_over_z, z);
508
store_all(r, spmd_ternaryf((octant & 2) != 0, -b, b));
509
}
510
else
511
{
512
store_all(r, spmd_ternaryf(octant == 0, z, -z));
513
}
514
515
// Small angle approximation, to decrease the max rel error near Pi.
516
SPMD_SIF(x >= (1.0f - .0003125f*4.0f))
517
{
518
store(r, vfnma(floor(q) + 1.0f, fPi, orig_x));
519
}
520
SPMD_SENDIF
521
522
return cast_vint_to_vfloat(cast_vfloat_to_vint(r) ^ sgn);
523
}
524
525
inline void spmd_kernel::seed_rand(rand_context& x, vint seed)
526
{
527
store(x.a, 0xf1ea5eed);
528
store(x.b, seed ^ 0xd8487b1f);
529
store(x.c, seed ^ 0xdbadef9a);
530
store(x.d, seed);
531
for (int i = 0; i < 20; ++i)
532
(void)get_randu(x);
533
}
534
535
// https://burtleburtle.net/bob/rand/smallprng.html
536
// Returns 32-bit unsigned random numbers.
537
inline vint spmd_kernel::get_randu(rand_context& x)
538
{
539
vint e = x.a - VINT_ROT(x.b, 27);
540
store(x.a, x.b ^ VINT_ROT(x.c, 17));
541
store(x.b, x.c + x.d);
542
store(x.c, x.d + e);
543
store(x.d, e + x.a);
544
return x.d;
545
}
546
547
// Returns random numbers between [low, high), or low if low >= high
548
inline vint spmd_kernel::get_randi(rand_context& x, vint low, vint high)
549
{
550
vint rnd = get_randu(x);
551
552
vint range = high - low;
553
554
vint rnd_range = mulhiu(rnd, range);
555
556
return spmd_ternaryi(low < high, low + rnd_range, low);
557
}
558
559
// Returns random numbers between [low, high), or low if low >= high
560
inline vfloat spmd_kernel::get_randf(rand_context& x, vfloat low, vfloat high)
561
{
562
vint rndi = get_randu(x) & 0x7fffff;
563
564
vfloat rnd = (vfloat)(rndi) * (1.0f / 8388608.0f);
565
566
return spmd_ternaryf(low < high, vfma(high - low, rnd, low), low);
567
}
568
569
CPPSPMD_FORCE_INLINE void spmd_kernel::init_reverse_bits(vint& tab1, vint& tab2)
570
{
571
const uint8_t tab1_bytes[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
572
const uint8_t tab2_bytes[16] = { 0, 8 << 4, 4 << 4, 12 << 4, 2 << 4, 10 << 4, 6 << 4, 14 << 4, 1 << 4, 9 << 4, 5 << 4, 13 << 4, 3 << 4, 11 << 4, 7 << 4, 15 << 4 };
573
store_all(tab1, init_lookup4(tab1_bytes));
574
store_all(tab2, init_lookup4(tab2_bytes));
575
}
576
577
CPPSPMD_FORCE_INLINE vint spmd_kernel::reverse_bits(vint k, vint tab1, vint tab2)
578
{
579
vint r0 = table_lookup4_8(k & 0x7F7F7F7F, tab2);
580
vint r1 = table_lookup4_8(VUINT_SHIFT_RIGHT(k, 4) & 0x7F7F7F7F, tab1);
581
vint r3 = r0 | r1;
582
return byteswap(r3);
583
}
584
585
CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros(vint x)
586
{
587
CPPSPMD_DECL(const uint8_t, s_tab[16]) = { 0, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
588
589
vint tab = init_lookup4(s_tab);
590
591
//x <= 0x0000ffff
592
vbool c0 = (x & 0xFFFF0000) == 0;
593
vint n0 = spmd_ternaryi(c0, 16, 0);
594
vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);
595
596
//x <= 0x00ffffff
597
vbool c1 = (x0 & 0xFF000000) == 0;
598
vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
599
vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);
600
601
//x <= 0x0fffffff
602
vbool c2 = (x1 & 0xF0000000) == 0;
603
vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
604
vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);
605
606
return table_lookup4_8(VUINT_SHIFT_RIGHT(x2, 28), tab) + n2;
607
}
608
609
CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros_alt(vint x)
610
{
611
//x <= 0x0000ffff
612
vbool c0 = (x & 0xFFFF0000) == 0;
613
vint n0 = spmd_ternaryi(c0, 16, 0);
614
vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);
615
616
//x <= 0x00ffffff
617
vbool c1 = (x0 & 0xFF000000) == 0;
618
vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
619
vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);
620
621
//x <= 0x0fffffff
622
vbool c2 = (x1 & 0xF0000000) == 0;
623
vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
624
vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);
625
626
// x <= 0x3fffffff
627
vbool c3 = (x2 & 0xC0000000) == 0;
628
vint n3 = spmd_ternaryi(c3, n2 + 2, n2);
629
vint x3 = spmd_ternaryi(c3, VINT_SHIFT_LEFT(x2, 2), x2);
630
631
// x <= 0x7fffffff
632
vbool c4 = (x3 & 0x80000000) == 0;
633
return spmd_ternaryi(c4, n3 + 1, n3);
634
}
635
636
CPPSPMD_FORCE_INLINE vint spmd_kernel::count_trailing_zeros(vint x)
637
{
638
// cast the least significant bit in v to a float
639
vfloat f = (vfloat)(x & -x);
640
641
// extract exponent and adjust
642
return VUINT_SHIFT_RIGHT(cast_vfloat_to_vint(f), 23) - 0x7F;
643
}
644
645
CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x)
646
{
647
vint v = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555);
648
vint v1 = (v & 0x33333333) + (VUINT_SHIFT_RIGHT(v, 2) & 0x33333333);
649
return VUINT_SHIFT_RIGHT(((v1 + (VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F)) * 0x1010101), 24);
650
}
651
652
CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b)
653
{
654
return cmpeq_epi16(subs_epu16(a, b), vint(0));
655
}
656
657
CPPSPMD_FORCE_INLINE vint cmpge_epu16(const vint &a, const vint &b)
658
{
659
return cmple_epu16(b, a);
660
}
661
662
CPPSPMD_FORCE_INLINE vint cmpgt_epu16(const vint &a, const vint &b)
663
{
664
return andnot(cmpeq_epi16(a, b), cmple_epu16(b, a));
665
}
666
667
CPPSPMD_FORCE_INLINE vint cmplt_epu16(const vint &a, const vint &b)
668
{
669
return cmpgt_epu16(b, a);
670
}
671
672
CPPSPMD_FORCE_INLINE vint cmpge_epi16(const vint &a, const vint &b)
673
{
674
return cmpeq_epi16(a, b) | cmpgt_epi16(a, b);
675
}
676
677
CPPSPMD_FORCE_INLINE vint cmple_epi16(const vint &a, const vint &b)
678
{
679
return cmpge_epi16(b, a);
680
}
681
682
void spmd_kernel::print_vint(vint v)
683
{
684
for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
685
printf("%i ", extract(v, i));
686
printf("\n");
687
}
688
689
void spmd_kernel::print_vbool(vbool v)
690
{
691
for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
692
printf("%i ", extract(v, i) ? 1 : 0);
693
printf("\n");
694
}
695
696
void spmd_kernel::print_vint_hex(vint v)
697
{
698
for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
699
printf("0x%X ", extract(v, i));
700
printf("\n");
701
}
702
703
void spmd_kernel::print_active_lanes(const char *pPrefix)
704
{
705
CPPSPMD_DECL(int, flags[PROGRAM_COUNT]);
706
memset(flags, 0, sizeof(flags));
707
storeu_linear(flags, vint(1));
708
709
if (pPrefix)
710
printf("%s", pPrefix);
711
712
for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
713
{
714
if (flags[i])
715
printf("%u ", i);
716
}
717
printf("\n");
718
}
719
720
void spmd_kernel::print_vfloat(vfloat v)
721
{
722
for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
723
printf("%f ", extract(v, i));
724
printf("\n");
725
}
726
727