GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib.h
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/*
 * This module implements vector support for floats, ints, and vector lane
 * control masks. It provides access to both explicit vector width types, and
 * flexible N-wide types where N can be determined at compile time.
 *
 * The design of this module encourages use of vector length agnostic code, via
 * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
 * width that is available at compile time. The current vector width is
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
 *
 * Explicit scalar types are accessible via the vint1, vfloat1, and vmask1
 * types. These are provided primarily for prototyping and algorithm debugging
 * of VLA implementations.
 *
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
 * types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type, backed by a reference C++ fallback for use on
 * platforms without SIMD intrinsics.
 *
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
 * used as a fixed-width type in normal code. No reference C++ implementation
 * is provided on platforms without underlying SIMD intrinsics.
 *
 * With the current implementation ISA support is provided for:
 *
 *     * 1-wide for scalar reference
 *     * 4-wide for Armv8-A NEON
 *     * 4-wide for x86-64 SSE2
 *     * 4-wide for x86-64 SSE4.1
 *     * 8-wide for Armv8-A SVE
 *     * 8-wide for x86-64 AVX2
 */

#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED

#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
#include <immintrin.h>
#endif

#if ASTCENC_SVE != 0
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>
#endif

#if ASTCENC_NEON != 0
#include <arm_neon.h>
#endif

#if !defined(__clang__) && defined(_MSC_VER)
#define ASTCENC_SIMD_INLINE __forceinline
#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif

template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);

#if ASTCENC_AVX >= 2
// If we have AVX2 expose 8-wide VLA.
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_avx2_8.h"

#define ASTCENC_SIMD_WIDTH 8

using vfloat = vfloat8;

#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif

using vint = vint8;
using vmask = vmask8;

using vtable_16x8 = vtable8_16x8;
using vtable_32x8 = vtable8_32x8;
using vtable_64x8 = vtable8_64x8;

constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
constexpr auto vint_from_size = vint8_from_size;

#elif ASTCENC_SSE >= 20
// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"

#define ASTCENC_SIMD_WIDTH 4

using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;

using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;

constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;

#elif ASTCENC_SVE == 8
// Check the compiler is configured with fixed-length 256-bit SVE.
#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
#endif

// If we have SVE configured as 8-wide, expose 8-wide VLA.
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_sve_8.h"

#define ASTCENC_SIMD_WIDTH 8

using vfloat = vfloat8;

#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif

using vint = vint8;
using vmask = vmask8;

using vtable_16x8 = vtable8_16x8;
using vtable_32x8 = vtable8_32x8;
using vtable_64x8 = vtable8_64x8;

constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
constexpr auto vint_from_size = vint8_from_size;

#elif ASTCENC_NEON > 0
// If we have NEON expose 4-wide VLA.
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"

#define ASTCENC_SIMD_WIDTH 4

using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;

using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;

constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;

#else
// If we have nothing expose 4-wide VLA, and 4-wide fixed width.

// Note: We no longer expose the 1-wide scalar fallback because it is not
// invariant with the 4-wide path due to algorithms that use horizontal
// operations that accumulate a local vector sum before accumulating into
// a running sum.
//
// For 4 items adding into an accumulator using 1-wide vectors the sum is:
//
//     result = ((((sum + l0) + l1) + l2) + l3)
//
// ... whereas the accumulator for a 4-wide vector sum is:
//
//     result = sum + ((l0 + l2) + (l1 + l3))
//
// In "normal maths" this is the same, but the floating point reassociation
// differences mean that these will not produce the same result. A small
// sketch of this appears just after this preprocessor block.

#include "astcenc_vecmathlib_none_4.h"
#include "astcenc_vecmathlib_common_4.h"

#define ASTCENC_SIMD_WIDTH 4

using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;

using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;

constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;
#endif
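
/*
 * A small sketch (illustrative only) of the invariance issue noted in the
 * scalar fallback case above. In exact arithmetic both expressions below
 * compute the same value, but each floating-point add rounds its partial
 * sum, so the two association orders need not be bit-identical:
 *
 *     // 1-wide accumulation order
 *     float r1 = ((((sum + l0) + l1) + l2) + l3);
 *
 *     // 4-wide accumulation order: pairwise lane sums, then one final add
 *     float r4 = sum + ((l0 + l2) + (l1 + l3));
 */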

/**
 * @brief Round a count down to the nearest multiple of the SIMD width.
 *
 * Assumes that the vector width is a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
{
    return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
}

/**
 * @brief Round a count up to the nearest multiple of the SIMD width.
 *
 * Assumes that the vector width is a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
{
    size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
    return multiples * ASTCENC_SIMD_WIDTH;
}
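
/*
 * A minimal vector length agnostic usage sketch (illustrative only),
 * assuming @c data is a float array of @c count elements, suitably
 * aligned for loada():
 *
 *     size_t full = round_down_to_simd_multiple_vla(count);
 *     vfloat vsum = vfloat::zero();
 *     for (size_t i = 0; i < full; i += ASTCENC_SIMD_WIDTH)
 *     {
 *         vsum = vsum + loada(data + i);
 *     }
 *     // ... then handle the [full, count) tail with scalar code or masks.
 */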

/**
 * @brief Return @c a with lanes negated if the @c b lane is negative.
 */
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
    vint ia = float_as_int(a);
    vint ib = float_as_int(b);
    vint sign_mask(static_cast<int>(0x80000000));
    vint r = ia ^ (ib & sign_mask);
    return int_as_float(r);
}
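
/*
 * For example (illustrative): change_sign(vfloat(2.0f), vfloat(-0.5f))
 * yields -2.0f in every lane, while a non-negative @c b leaves @c a
 * unchanged. Only the sign bit of @c b is read, so b == -0.0f also
 * negates the lane.
 */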

/**
 * @brief Return fast, but approximate, vector atan(x).
 *
 * Max error of this implementation is 0.004883.
 */
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
    vmask c = abs(x) > vfloat(1.0f);
    vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
    vfloat y = select(x, vfloat(1.0f) / x, c);
    y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
    return select(y, z - y, c);
}
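
/*
 * Written out (illustrative), the select() folding above computes:
 *
 *     atan(x) ~= x / (0.28 * x * x + 1)                       for |x| <= 1
 *     atan(x) ~= sign(x) * PI/2 - (1/x) / (0.28/(x*x) + 1)    for |x| > 1
 */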

/**
 * @brief Return fast, but approximate, vector atan2(y, x).
 */
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
    vfloat z = atan(abs(y / x));
    vmask xmask = x < vfloat::zero();
    return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
}

/**
 * @brief Factory that returns a unit length 4 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
    return vfloat4(0.5f);
}

/**
 * @brief Factory that returns a unit length 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
    float val = 0.577350258827209473f;
    return vfloat4(val, val, val, 0.0f);
}

/**
 * @brief Factory that returns a unit length 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
    float val = 0.707106769084930420f;
    return vfloat4(val, val, 0.0f, 0.0f);
}

/**
 * @brief Factory that returns a 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
    return vfloat4(a, b, c, 0.0f);
}

/**
 * @brief Factory that returns a 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
    return vfloat4(a, b, 0.0f, 0.0f);
}

/**
 * @brief Normalize a non-zero length vector to unit length.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
    vfloat4 length = dot(a, a);
    return a / sqrt(length);
}

/**
 * @brief Normalize a vector, returning @c safe if the length is zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
    vfloat4 length = dot(a, a);
    if (length.lane<0>() != 0.0f)
    {
        return a / sqrt(length);
    }

    return safe;
}
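
/*
 * Usage sketch (illustrative; x, y, z stand for arbitrary floats):
 *
 *     vfloat4 n = normalize_safe(vfloat3(x, y, z), unit3());
 *
 * A zero-length input returns unit3() rather than a NaN vector.
 */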

#define POLY0(x, c0) ( c0)
#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
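
/*
 * These macros expand to Horner-form polynomial evaluation; for example:
 *
 *     POLY2(x, c0, c1, c2) == ((c2 * x + c1) * x) + c0
 *
 * which evaluates c0 + c1*x + c2*x^2 with one multiply and one add per
 * coefficient, a form the compiler can contract into fused multiply-adds.
 */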

/**
 * @brief Compute an approximate exp2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
    x = clamp(-126.99999f, 129.0f, x);

    vint4 ipart = float_to_int(x - 0.5f);
    vfloat4 fpart = x - int_to_float(ipart);

    // Integer contrib, using 1 << ipart
    vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));

    // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
    vfloat4 fexp = POLY5(fpart,
                         9.9999994e-1f,
                         6.9315308e-1f,
                         2.4015361e-1f,
                         5.5826318e-2f,
                         8.9893397e-3f,
                         1.8775767e-3f);

    return iexp * fexp;
}
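
/*
 * Worked example (illustrative): for x = 3.75f the split gives
 * ipart = float_to_int(3.25f) = 3 and fpart = 0.75f. The integer part
 * becomes iexp = 2^3 = 8.0f by building the IEEE-754 bit pattern
 * (3 + 127) << 23 directly, the polynomial gives fexp ~= 2^0.75 ~= 1.682,
 * and the product is ~13.45 ~= 2^3.75.
 */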

/**
 * @brief Compute an approximate log2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
    vint4 exp(0x7F800000);
    vint4 mant(0x007FFFFF);
    vint4 one(0x3F800000);

    vint4 i = float_as_int(x);

    vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);

    vfloat4 m = int_as_float((i & mant) | one);

    // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
    vfloat4 p = POLY4(m,
                      2.8882704548164776201f,
                      -2.52074962577807006663f,
                      1.48116647521213171641f,
                      -0.465725644288844778798f,
                      0.0596515482674574969533f);

    // Increases the polynomial degree, but ensures that log2(1) == 0
    p = p * (m - 1.0f);

    return p + e;
}
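
/*
 * Worked example (illustrative): x = 6.0f stores a biased exponent of 129
 * and a mantissa of 1.5, so e = 2.0f and m = 1.5f. The polynomial times
 * (m - 1.0f) gives p ~= log2(1.5f) ~= 0.585f, and the result is
 * ~2.585f ~= log2(6.0f).
 */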

/**
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
 *
 * Power function based on the exp2(log2(x) * y) transform.
 */
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
    vmask4 zero_mask = y == vfloat4(0.0f);
    vfloat4 estimate = exp2(log2(x) * y);

    // Guarantee that y == 0 returns exactly 1.0f
    return select(estimate, vfloat4(1.0f), zero_mask);
}
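
/*
 * For example (illustrative): pow(vfloat4(2.0f), vfloat4(0.5f)) computes
 * exp2(log2(2.0f) * 0.5f) = exp2(0.5f) ~= 1.4142f. The transform assumes
 * x > 0; only the y == 0 case is forced to an exact 1.0f.
 */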

/**
 * @brief Count the leading zeros for each lane in @c a.
 *
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
 */
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
    // This function is a horrible abuse of floating point exponents to convert
    // the original integer value into a 2^N encoding we can recover easily.

    // Convert to float without risk of rounding up by keeping only top 8 bits.
    // This trick is guaranteed to keep the top 8 bits and clear the 9th.
    a = (~lsr<8>(a)) & a;
    a = float_as_int(int_to_float(a));

    // Extract and unbias exponent
    a = vint4(127 + 31) - lsr<23>(a);

    // Clamp result to a valid 32-bit range
    return clamp(0, 32, a);
}
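
/*
 * Worked example (illustrative): for a = 1 the masked value converts to
 * 1.0f, whose biased exponent is 127, so the result is
 * (127 + 31) - 127 = 31. For a = 0 the conversion gives 0.0f with a zero
 * exponent field, and the clamp returns 32.
 */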

/**
 * @brief Return lanewise 2^a for each lane in @c a.
 *
 * Use of signed int means that this is only valid for values in range [0, 31].
 */
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
    // 2^30 is the largest signed number that can be represented
    assert(all(a < vint4(31)));

    // This function is a horrible abuse of floating point to use the exponent
    // and float conversion to generate a 2^N multiple.

    // Bias the exponent
    vint4 exp = a + 127;
    exp = lsl<23>(exp);

    // Reinterpret the bits as a float, and then convert to an int
    vfloat4 f = int_as_float(exp);
    return float_to_int(f);
}
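
/*
 * Worked example (illustrative): for a = 5 the biased exponent is
 * 5 + 127 = 132, so the constructed bit pattern (132 << 23) reads back as
 * 32.0f, and the final conversion returns 32 == 2^5.
 */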

/**
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
 */
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
    vint4 fp16_one = vint4(0x3C00);
    vint4 fp16_small = lsl<8>(p);

    vmask4 is_one = p == vint4(0xFFFF);
    vmask4 is_small = p < vint4(4);

    // Manually inline clz() on Visual Studio to avoid release build codegen bug
    // see https://github.com/ARM-software/astc-encoder/issues/259
#if !defined(__clang__) && defined(_MSC_VER)
    vint4 a = (~lsr<8>(p)) & p;
    a = float_as_int(int_to_float(a));
    a = vint4(127 + 31) - lsr<23>(a);
    vint4 lz = clamp(0, 32, a) - 16;
#else
    vint4 lz = clz(p) - 16;
#endif

    p = p * two_to_the_n(lz + 1);
    p = p & vint4(0xFFFF);

    p = lsr<6>(p);

    p = p | lsl<10>(vint4(14) - lz);

    vint4 r = select(p, fp16_one, is_one);
    r = select(r, fp16_small, is_small);
    return r;
}
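
/*
 * For example (illustrative): p = 0xFFFF takes the is_one path and maps to
 * 0x3C00 (1.0 in FP16), while p = 0 maps to 0x0000. Tiny inputs use the
 * shifted fp16_small path, e.g. p = 2 maps to the FP16 denormal
 * 0x0200 ~= 2^-15, close to 2.0f / 65535.0f.
 */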

/**
 * @brief Convert 16-bit LNS to float16.
 */
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
    vint4 mc = p & 0x7FF;
    vint4 ec = lsr<11>(p);

    vint4 mc_512 = mc * 3;
    vmask4 mask_512 = mc < vint4(512);

    vint4 mc_1536 = mc * 4 - 512;
    vmask4 mask_1536 = mc < vint4(1536);

    vint4 mc_else = mc * 5 - 2048;

    vint4 mt = mc_else;
    mt = select(mt, mc_1536, mask_1536);
    mt = select(mt, mc_512, mask_512);

    vint4 res = lsl<10>(ec) | lsr<3>(mt);
    return min(res, vint4(0x7BFF));
}

/**
 * @brief Extract the mantissa and exponent of a float value.
 *
 * @param      a   The input value.
 * @param[out] exp The output exponent.
 *
 * @return The mantissa.
 */
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
    // Interpret the bits as an integer
    vint4 ai = float_as_int(a);

    // Extract and unbias the exponent
    exp = (lsr<23>(ai) & 0xFF) - 126;

    // Extract and unbias the mantissa
    vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
    return int_as_float(manti);
}
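
/*
 * Worked example (illustrative): frexp(vfloat4(6.0f), exp) returns 0.75f
 * with exp = 3, since 0.75 * 2^3 == 6.0. Unlike std::frexp, this
 * bit-twiddling form assumes normal, finite inputs.
 */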

/**
 * @brief Convert float to 16-bit LNS.
 */
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
    vint4 exp;
    vfloat4 mant = frexp(a, exp);

    // Do these early before we start messing about ...
    vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
    vmask4 mask_infinity = a >= vfloat4(65536.0f);

    // If input is smaller than 2^-14, multiply by 2^25 and don't bias.
    vmask4 exp_lt_m13 = exp < vint4(-13);

    vfloat4 a1a = a * 33554432.0f;
    vint4 expa = vint4::zero();

    vfloat4 a1b = (mant - 0.5f) * 4096;
    vint4 expb = exp + 14;

    a = select(a1b, a1a, exp_lt_m13);
    exp = select(expb, expa, exp_lt_m13);

    vmask4 a_lt_384 = a < vfloat4(384.0f);
    vmask4 a_lt_1408 = a <= vfloat4(1408.0f);

    vfloat4 a2a = a * (4.0f / 3.0f);
    vfloat4 a2b = a + 128.0f;
    vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);

    a = a2c;
    a = select(a, a2b, a_lt_1408);
    a = select(a, a2a, a_lt_384);

    a = a + (int_to_float(exp) * 2048.0f) + 1.0f;

    a = select(a, vfloat4(65535.0f), mask_infinity);
    a = select(a, vfloat4::zero(), mask_underflow_nan);

    return a;
}

namespace astc
{

static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
    return pow(vfloat4(x), vfloat4(y)).lane<0>();
}

}

#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED