GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for
 * Armv8-A NEON.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */

#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
    #error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>
#include <cstring>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vfloat4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with vectors if data is aligned
     * to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
    {
        m = vld1q_f32(p);
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a)
    {
        m = vdupq_n_f32(a);
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
    {
        float v[4] { a, b, c, d };
        m = vld1q_f32(v);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
    {
        m = a;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE float lane() const
    {
        return vgetq_lane_f32(m, l);
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
    {
        m = vsetq_lane_f32(a, m, l);
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vfloat4 zero()
    {
        return vfloat4(0.0f);
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
    {
        return vfloat4(vld1q_dup_f32(p));
    }

    /**
     * @brief Factory that returns a vector loaded from 16B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
    {
        return vfloat4(vld1q_f32(p));
    }

    /**
     * @brief Return a swizzled float 2.
     */
    template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
    }

    /**
     * @brief Return a swizzled float 3.
     */
    template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
    }

    /**
     * @brief Return a swizzled float 4.
     */
    template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
    }

    /**
     * @brief The vector ...
     */
    float32x4_t m;
};
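
// Illustrative usage sketch, not part of the upstream astcenc header: how the
// lane accessors and compile-time swizzles declared above are expected to
// combine.
//
//   vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//   float x = v.lane<0>();           // 1.0f; lane 0 is the LSB lane
//   vfloat4 zyx = v.swz<2, 1, 0>();  // (3.0f, 2.0f, 1.0f, 0.0f)
//   v.set_lane<3>(9.0f);             // v is now (1.0f, 2.0f, 3.0f, 9.0f)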

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vint4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with vectors if data is aligned
     * to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const int *p)
    {
        m = vld1q_s32(p);
    }

    /**
     * @brief Construct from 4 uint8_t loaded from an unaligned address.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
    {
#if ASTCENC_SVE == 0
        // Cast is safe - NEON loads are allowed to be unaligned
        uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
        uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
        m = vreinterpretq_s32_u32(vmovl_u16(t16));
#else
        svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
        m = svget_neonq(data);
#endif
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a)
    {
        m = vdupq_n_s32(a);
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
    {
        int v[4] { a, b, c, d };
        m = vld1q_s32(v);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
    {
        m = a;
    }

    /**
     * @brief Get the scalar from a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE int lane() const
    {
        return vgetq_lane_s32(m, l);
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
    {
        m = vsetq_lane_s32(a, m, l);
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vint4 zero()
    {
        return vint4(0);
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
    {
        return vint4(*p);
    }

    /**
     * @brief Factory that returns a vector loaded from unaligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
    {
        vint4 data;
        std::memcpy(&data.m, p, 4 * sizeof(int));
        return data;
    }

    /**
     * @brief Factory that returns a vector loaded from 16B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
    {
        return vint4(p);
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vint4 lane_id()
    {
        alignas(16) static const int data[4] { 0, 1, 2, 3 };
        return vint4(vld1q_s32(data));
    }

    /**
     * @brief The vector ...
     */
    int32x4_t m;
};
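
// Illustrative usage sketch, not part of the upstream astcenc header: the
// uint8_t constructor widens four packed bytes into one 32-bit lane each, and
// lane_id() provides the per-lane index.
//
//   uint8_t bytes[4] { 10, 20, 30, 40 };
//   vint4 widened(bytes);           // (10, 20, 30, 40)
//   vint4 ids = vint4::lane_id();   // (0, 1, 2, 3)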

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
    {
        m = a;
    }

#if !defined(_MSC_VER)
    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
    {
        m = vreinterpretq_u32_s32(a);
    }
#endif

    /**
     * @brief Construct from 1 scalar value.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a)
    {
        m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
    {
        int v[4] {
            a == true ? -1 : 0,
            b == true ? -1 : 0,
            c == true ? -1 : 0,
            d == true ? -1 : 0
        };

        int32x4_t ms = vld1q_s32(v);
        m = vreinterpretq_u32_s32(ms);
    }

    /**
     * @brief Get the scalar from a single lane.
     */
    template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
    {
        return vgetq_lane_u32(m, l) != 0;
    }

    /**
     * @brief The vector ...
     */
    uint32x4_t m;
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
    return vmask4(vorrq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
    return vmask4(vandq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
    return vmask4(veorq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
    return vmask4(vmvnq_u32(a.m));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
    static const int shifta[4] { 0, 1, 2, 3 };
    static const int32x4_t shift = vld1q_s32(shifta);

    uint32x4_t tmp = vshrq_n_u32(a.m, 31);
    return vaddvq_u32(vshlq_u32(tmp, shift));
}
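
// Illustrative sketch, not part of the upstream astcenc header: mask() packs
// the per-lane predicate into the low four bits of an integer, assuming the
// vint4 comparison operators defined later in this file.
//
//   vint4 a(1, 5, 3, 7);
//   vint4 b(4, 4, 4, 4);
//   unsigned int m = mask(a > b);   // lanes 1 and 3 pass, so m == 0b1010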

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask4 a)
{
    return vmaxvq_u32(a.m) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask4 a)
{
    return vminvq_u32(a.m) != 0;
}

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
    return vint4(vaddq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
    return vint4(vsubq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
    return vint4(vmulq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
    return vint4(vmvnq_s32(a.m));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
    return vint4(vorrq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
    return vint4(vandq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
    return vint4(veorq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
    return vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
    return ~vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
    return vmask4(vcltq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
    return vmask4(vcgtq_s32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
    return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
    uint32x4_t ua = vreinterpretq_u32_s32(a.m);
    ua = vshlq_u32(ua, vdupq_n_s32(-s));
    return vint4(vreinterpretq_s32_u32(ua));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
    return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
}
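
// Worked example, not part of the upstream astcenc header: lsr() shifts in
// zero bits while asr() replicates the sign bit, so they only differ for
// negative lanes.
//
//   vint4 v(-8, 8, -1, 1);
//   vint4 l = lsr<1>(v);   // (0x7FFFFFFC, 4, 0x7FFFFFFF, 0)
//   vint4 s = asr<1>(v);   // (-4, 4, -1, 0)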

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
    return vint4(vminq_s32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
    return vint4(vmaxq_s32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
    return vint4(vminvq_s32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
    return vint4(vmaxvq_s32(a.m));
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
    vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
    vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
    std::memcpy(p, &a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
    vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
 * @brief Pack and store low 8 bits of each vector lane.
 */
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data)
{
    alignas(16) uint8_t shuf[16] {
        0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    uint8x16_t idx = vld1q_u8(shuf);
    int8x16_t av = vreinterpretq_s8_s32(a.m);
    a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
    store_nbytes(a, data);
}
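
// Illustrative sketch, not part of the upstream astcenc header: the shuffle
// above keeps byte 0 of each 32-bit lane, so four lanes become four
// contiguous bytes.
//
//   vint4 texel(0x11, 0x22, 0x33, 0x44);
//   uint8_t out[4];
//   pack_and_store_low_bytes(texel, out);   // out == { 0x11, 0x22, 0x33, 0x44 }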

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
    return vint4(vbslq_s32(cond.m, b.m, a.m));
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
    return vfloat4(vaddq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
    return vfloat4(vsubq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
    return vfloat4(vmulq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
    return vfloat4(vdivq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
    return vmask4(vceqq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
    return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
    return vmask4(vcltq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
    return vmask4(vcgtq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
    return vmask4(vcleq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
    return vmask4(vcgeq_f32(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
    // Do not reorder - second operand will return if either is NaN
    return vfloat4(vminnmq_f32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
    // Do not reorder - second operand will return if either is NaN
    return vfloat4(vmaxnmq_f32(a.m, b.m));
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
    float32x4_t zero = vdupq_n_f32(0.0f);
    float32x4_t inv = vsubq_f32(zero, a.m);
    return vfloat4(vmaxq_f32(a.m, inv));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
    return vfloat4(vrndnq_f32(a.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
    return vfloat4(vminvq_f32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
    return vfloat4(vmaxvq_f32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
    // Perform halving add to ensure invariance; we cannot use vaddvq as this
    // does (0 + 1 + 2 + 3) which is not invariant with x86 (0 + 2) + (1 + 3).
    float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
    return vget_lane_f32(vpadd_f32(t, t), 0);
}
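
// Worked example, not part of the upstream astcenc header: for lanes
// (a0, a1, a2, a3) the pairwise reduction above evaluates
// (a0 + a2) + (a1 + a3), matching the x86 implementations, whereas a direct
// vaddvq_f32() would sum in lane order and could round differently.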

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
    return vfloat4(vsqrtq_f32(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
    return vfloat4(vbslq_f32(cond.m, b.m, a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
#if ASTCENC_SVE == 0
    alignas(16) int idx[4];
    storea(indices, idx);
    alignas(16) float vals[4];
    vals[0] = base[idx[0]];
    vals[1] = base[idx[1]];
    vals[2] = base[idx[2]];
    vals[3] = base[idx[3]];
    return vfloat4(vals);
#else
    svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m);
    svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
    return vfloat4(svget_neonq_f32(data));
#endif
}

/**
 * @brief Load a vector of gathered results from an array using byte indices from memory.
 */
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
#if ASTCENC_SVE == 0
    alignas(16) float vals[4];
    vals[0] = base[indices[0]];
    vals[1] = base[indices[1]];
    vals[2] = base[indices[2]];
    vals[3] = base[indices[3]];
    return vfloat4(vals);
#else
    svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
    svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
    return vfloat4(svget_neonq_f32(data));
#endif
}
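
// Illustrative sketch, not part of the upstream astcenc header, assuming a
// lookup table of at least seven floats:
//
//   const float table[8] { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
//   vfloat4 g = gatherf(table, vint4(6, 0, 2, 4));   // (6.0f, 0.0f, 2.0f, 4.0f)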

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
    vst1q_f32(p, a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
    vst1q_f32(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
    return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
    a = a + vfloat4(0.5f);
    return vint4(vcvtq_s32_f32(a.m));
}
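
// Worked example, not part of the upstream astcenc header: truncation versus
// the add-0.5-then-truncate rounding used by float_to_int_rtn(), shown for
// non-negative inputs.
//
//   vfloat4 f(1.2f, 1.5f, 1.8f, 2.0f);
//   vint4 t = float_to_int(f);       // (1, 1, 1, 2)
//   vint4 r = float_to_int_rtn(f);   // (1, 2, 2, 2)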

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
    return vfloat4(vcvtq_f32_s32(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
    // Generate float16 value
    float16x4_t f16 = vcvt_f16_f32(a.m);

    // Convert each 16-bit float pattern to a 32-bit pattern
    uint16x4_t u16 = vreinterpret_u16_f16(f16);
    uint32x4_t u32 = vmovl_u16(u16);
    return vint4(vreinterpretq_s32_u32(u32));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
    vfloat4 av(a);
    return static_cast<uint16_t>(float_to_float16(av).lane<0>());
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
    // Convert each 32-bit float pattern to a 16-bit pattern
    uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
    uint16x4_t u16 = vmovn_u32(u32);
    float16x4_t f16 = vreinterpret_f16_u16(u16);

    // Generate float value
    return vfloat4(vcvt_f32_f16(f16));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
    vint4 av(a);
    return float16_to_float(av).lane<0>();
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
    return vint4(vreinterpretq_s32_f32(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
    return vfloat4(vreinterpretq_f32_s32(v.m));
}
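
// Illustrative sketch, not part of the upstream astcenc header: the classic
// bit-pattern trick of clearing the IEEE 754 sign bit to get an absolute
// value without a floating-point instruction.
//
//   vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//   vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);
//   vfloat4 absv = int_as_float(bits);   // (1.0f, 2.0f, 3.0f, 4.0f)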

/*
 * Table structure for a 16x 8-bit entry table.
 */
struct vtable4_16x8 {
    uint8x16_t t0;
};

/*
 * Table structure for a 32x 8-bit entry table.
 */
struct vtable4_32x8 {
    uint8x16x2_t t01;
};

/*
 * Table structure for a 64x 8-bit entry table.
 */
struct vtable4_64x8 {
    uint8x16x4_t t0123;
};

/**
 * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vtable4_16x8& table,
    const uint8_t* data
) {
    table.t0 = vld1q_u8(data);
}

/**
 * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vtable4_32x8& table,
    const uint8_t* data
) {
    table.t01 = uint8x16x2_t {
        vld1q_u8(data),
        vld1q_u8(data + 16)
    };
}

/**
 * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vtable4_64x8& table,
    const uint8_t* data
) {
    table.t0123 = uint8x16x4_t {
        vld1q_u8(data),
        vld1q_u8(data + 16),
        vld1q_u8(data + 32),
        vld1q_u8(data + 48)
    };
}

/**
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
    const vtable4_16x8& tbl,
    vint4 idx
) {
    // Set index byte above max index for unused bytes so table lookup returns zero
    int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
    uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

    return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes)));
}

/**
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
    const vtable4_32x8& tbl,
    vint4 idx
) {
    // Set index byte above max index for unused bytes so table lookup returns zero
    int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
    uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

    return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes)));
}

/**
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
    const vtable4_64x8& tbl,
    vint4 idx
) {
    // Set index byte above max index for unused bytes so table lookup returns zero
    int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
    uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

    return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes)));
}
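
// Illustrative sketch, not part of the upstream astcenc header: prepare a
// 16-entry table once, then look up four 32-bit indices per call. The OR with
// 0xFFFFFF00 above pushes the unused high bytes of each lane out of range, so
// TBL returns zero for them and each result lane holds just the looked-up byte.
//
//   alignas(16) const uint8_t lut[16] {
//       0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150
//   };
//   vtable4_16x8 table;
//   vtable_prepare(table, lut);
//   vint4 result = vtable_lookup_32bit(table, vint4(3, 0, 15, 7));   // (30, 0, 150, 70)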

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
    return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
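
// Illustrative sketch, not part of the upstream astcenc header: each input
// vector holds one 8-bit channel per lane, and each output lane packs a
// little-endian RGBA texel.
//
//   vint4 r(0x10), g(0x20), b(0x30), a(0xFF);
//   vint4 texels = interleave_rgba8(r, g, b, a);   // each lane == 0xFF302010
//   uint8_t out[16];
//   store(texels, out);   // bytes repeat 0x10, 0x20, 0x30, 0xFF per texel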

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
    std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
    if (mask.lane<3>())
    {
        store(data, base);
    }
    else if (mask.lane<2>())
    {
        store_lane(base + 0, data.lane<0>());
        store_lane(base + 4, data.lane<1>());
        store_lane(base + 8, data.lane<2>());
    }
    else if (mask.lane<1>())
    {
        store_lane(base + 0, data.lane<0>());
        store_lane(base + 4, data.lane<1>());
    }
    else if (mask.lane<0>())
    {
        store_lane(base + 0, data.lane<0>());
    }
}

#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
    return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
}

#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED