GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// Define convenience intrinsics that are missing on older compilers
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)

// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
	{
		m = _mm256_loadu_ps(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
	{
		m = _mm256_set1_ps(a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 zero()
	{
		return vfloat8(_mm256_setzero_ps());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
	{
		return vfloat8(_mm256_broadcast_ss(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
	{
		return vfloat8(_mm256_load_ps(p));
	}

	/**
	 * @brief The vector ...
	 */
	__m256 m;
};
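
// Usage sketch (editor's illustrative example, not part of the upstream API
// beyond the members shown above). It assumes a caller that includes
// astcenc_vecmathlib.h and owns a 32B-aligned float buffer:
//
//     alignas(32) float buf[8] { 0, 1, 2, 3, 4, 5, 6, 7 };
//     vfloat8 va = vfloat8::loada(buf);   // aligned 8-wide load
//     vfloat8 vb(1.5f);                   // broadcast scalar to all lanes
//     vfloat8 vz = vfloat8::zero();       // all-zero vector
//     storea(va + vb, buf);               // element-wise add, aligned store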

// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
	{
		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
	}

	/**
	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
	{
		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(int a)
	{
		m = _mm256_set1_epi32(a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
	{
		m = a;
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint8 zero()
	{
		return vint8(_mm256_setzero_si256());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
	{
		__m128i a = _mm_set1_epi32(*p);
		return vint8(_mm256_broadcastd_epi32(a));
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
	{
		return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
	{
		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint8 lane_id()
	{
		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector ...
	 */
	__m256i m;
};
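
// Usage sketch (editor's illustrative example). lane_id() is handy for
// building per-lane offsets, and the uint8_t constructor zero-extends 8
// bytes into 8x 32-bit lanes:
//
//     const uint8_t weights[8] { 0, 1, 2, 3, 4, 5, 6, 7 };
//     vint8 w(weights);                         // widen bytes to 32-bit lanes
//     vint8 idx = vint8::lane_id() * vint8(2);  // 0, 2, 4, ..., 14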

// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
	{
		m = _mm256_castsi256_ps(a);
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
	{
		vint8 mask(a == false ? 0 : -1);
		m = _mm256_castsi256_ps(mask.m);
	}

	/**
	 * @brief The vector ...
	 */
	__m256 m;
};

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
{
	return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
	return mask(a) == 0xFF;
}
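
// Usage sketch (editor's illustrative example) showing how comparisons,
// mask() bit codes, and any()/all() compose:
//
//     vint8 a = vint8::lane_id();           // 0 .. 7
//     vmask8 m = a > vint8(3);              // lanes 4..7 enabled
//     unsigned int bits = mask(m);          // 0xF0, bit0 = lane 0
//     bool some = any(m);                   // true
//     bool every = all(m);                  // false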

// ============================================================================
// vint8 operators and functions
// ============================================================================
/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
	return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
	return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
	return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
	return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
	return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
	return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
{
	return vint8(_mm256_slli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
	return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
	return vint8(_mm256_srli_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
	return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
	return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
	// Build min within groups of 2, then 4, then 8
	__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
	m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
	m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

	vint8 vmin(m);
	return vmin;
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
{
	return _mm256_cvtsi256_si32(hmin(a).m);
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
	// Build max within groups of 2, then 4, then 8
	__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
	m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
	m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

	vint8 vmax(m);
	return vmax;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
{
	return _mm256_cvtsi256_si32(hmax(a).m);
}
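
// Usage sketch (editor's illustrative example) for the reductions above:
// hmin()/hmax() splat the reduced value across all lanes so it can feed
// further vector work, while hmin_s()/hmax_s() extract it as a scalar.
// The int[8] source here is hypothetical:
//
//     vint8 v(some_ints);                   // hypothetical int[8] source
//     int lo = hmin_s(v);                   // smallest of the 8 lanes
//     int hi = hmax_s(v);                   // largest of the 8 lanes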

/**
 * @brief Generate a vint8 from a size_t.
 */
ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
{
	assert(a <= std::numeric_limits<int>::max());
	return vint8(static_cast<int>(a));
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
{
	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
	                               0, 0, 0, 0, 28, 24, 20, 16,
	                               0, 0, 0, 0, 0, 0, 0, 0,
	                               0, 0, 0, 0, 12, 8, 4, 0);
	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
	__m128i a0 = _mm256_extracti128_si256(a, 0);
	__m128i a1 = _mm256_extracti128_si256(a, 1);
	__m128i b = _mm_unpacklo_epi32(a0, a1);

	__m256i r = astcenc_mm256_set_m128i(b, b);

	store_nbytes(vint8(r), p);
}
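
// Usage sketch (editor's illustrative example): pack_and_store_low_bytes()
// narrows the low byte of each of the 8 lanes and writes 8 bytes, which is
// how 8-bit texel data is typically emitted from 32-bit lane math:
//
//     uint8_t out[8];
//     vint8 texels(42);                         // values in 0..255 per lane
//     pack_and_store_low_bytes(texels, out);    // out[i] = low byte of lane i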

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
	__m256i condi = _mm256_castps_si256(cond.m);
	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}

// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
{
	return min(a, vfloat8(b));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
{
	return max(a, vfloat8(b));
}

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
	// Do not reorder - second operand will return if either is NaN
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
	return a;
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
	return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat8(_mm256_round_ps(a.m, flags));
}
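
// Usage sketch (editor's illustrative example) of the NaN-flushing clamp
// described above: because the max() is applied first and min()/max() return
// the second operand when either input is NaN, a NaN lane collapses to the
// lower bound rather than propagating:
//
//     vfloat8 x(std::numeric_limits<float>::quiet_NaN());
//     vfloat8 y = clamp(0.0f, 1.0f, x);     // every lane becomes 0.0f
//     vfloat8 r = round(vfloat8(2.5f));     // round-to-nearest-even -> 2.0f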

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vlow = _mm_min_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 mins = _mm_min_ps(vlow, shuf);
	shuf = _mm_movehl_ps(shuf, mins);
	mins = _mm_min_ss(mins, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256i r = _mm256_set_m128(m, m)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
	return _mm256_cvtss_f32(hmin(a).m);
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vhigh = _mm_max_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 maxs = _mm_max_ps(vhigh, shuf);
	shuf = _mm_movehl_ps(shuf, maxs);
	maxs = _mm_max_ss(maxs, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256i r = _mm256_set_m128(m, m)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
	return _mm256_cvtss_f32(hmax(a).m);
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
	// Two sequential 4-wide adds gives invariance with 4-wide code
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	return hadd_s(lo) + hadd_s(hi);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	haccumulate(accum, lo);

	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
	accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}
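
// Usage sketch (editor's illustrative example) of the invariance notes above:
// folding into a vfloat4 accumulator uses two sequential 4-wide adds, so the
// result is bit-identical to what a 4-wide (SSE/NEON) build would produce,
// whereas accumulating into a vfloat8 keeps 8 independent partial sums. The
// some_vfloat8 value here is hypothetical:
//
//     vfloat4 acc4 = vfloat4::zero();
//     haccumulate(acc4, some_vfloat8);      // invariant with 4-wide builds
//     vfloat8 acc8 = vfloat8::zero();
//     haccumulate(acc8, some_vfloat8);      // faster, but not invariant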

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
	return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Load a vector of gathered results from an array using byte indices from memory.
 */
template<>
ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
{
#if ASTCENC_X86_GATHERS == 0
	// Perform manual gather using scalar loads in two separate dependency chains,
	// then merge late. MSVC translates this 1:1, which is OK. Clang turns it
	// into a bunch of memory-operand inserts on 128-bit halves then merges late,
	// which performs significantly worse in tests.
	__m256 m0 = _mm256_broadcast_ss(base + indices[0]);
	__m256 m1 = _mm256_broadcast_ss(base + indices[1]);
	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);

	return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
#else
	vint8 inds(indices);
	return gatherf(base, inds);
#endif
}
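
// Usage sketch (editor's illustrative example): the ASTCENC_X86_GATHERS
// compile-time switch checked above selects between the hardware gather
// intrinsic and the manual scalar gather; callers look the same either way.
// The table and index values here are hypothetical:
//
//     const float table[256] = { /* ... */ };
//     const uint8_t idx[8] { 3, 1, 4, 1, 5, 9, 2, 6 };
//     vfloat8 g = gatherf_byte_inds<vfloat8>(table, idx);  // g[i] = table[idx[i]]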

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
	_mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
	_mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
	a = a + vfloat8(0.5f);
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
	return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
	return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
	return vfloat8(_mm256_castsi256_ps(a.m));
}
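
// Usage sketch (editor's illustrative example) of the bit-pattern round trip
// described above, extracting the IEEE 754 exponent field without leaving
// the vector unit:
//
//     vfloat8 x(6.0f);
//     vint8 bits = float_as_int(x);                  // reinterpret, no convert
//     vint8 expo = lsr<23>(bits) & vint8(0xFF);      // biased exponent = 129
//     vfloat8 back = int_as_float(bits);             // still 6.0f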

/*
 * Table structure for a 16x 8-bit entry table.
 */
struct vtable8_16x8 {
	vint8 t0;
};

/*
 * Table structure for a 32x 8-bit entry table.
 */
struct vtable8_32x8 {
	vint8 t0;
	vint8 t1;
};

/*
 * Table structure for a 64x 8-bit entry table.
 */
struct vtable8_64x8 {
	vint8 t0;
	vint8 t1;
	vint8 t2;
	vint8 t3;
};

/**
 * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable8_16x8& table,
	const uint8_t* data
) {
	// AVX2 tables duplicate table entries in each 128-bit half-register
	vint4 d0 = vint4::load(data);

	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
}

/**
 * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable8_32x8& table,
	const uint8_t* data
) {
	// AVX2 tables duplicate table entries in each 128-bit half-register
	vint4 d0 = vint4::load(data);
	vint4 d1 = vint4::load(data + 16);

	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
	table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));

	// XOR chain the high rows to allow table emulation
	table.t1 = table.t1 ^ table.t0;
}

/**
 * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable8_64x8& table,
	const uint8_t* data
) {
	// AVX2 tables duplicate table entries in each 128-bit half-register
	vint4 d0 = vint4::load(data);
	vint4 d1 = vint4::load(data + 16);
	vint4 d2 = vint4::load(data + 32);
	vint4 d3 = vint4::load(data + 48);

	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
	table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
	table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m));
	table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m));

	// XOR chain the high rows to allow table emulation
	table.t3 = table.t3 ^ table.t2;
	table.t2 = table.t2 ^ table.t1;
	table.t1 = table.t1 ^ table.t0;
}

/**
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
	const vtable8_16x8& tbl,
	vint8 idx
) {
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
	return vint8(result);
}

/**
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
	const vtable8_32x8& tbl,
	vint8 idx
) {
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	return vint8(result);
}

/**
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
	const vtable8_64x8& tbl,
	vint8 idx
) {
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
	result = _mm256_xor_si256(result, result2);

	return vint8(result);
}
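
// Usage sketch (editor's illustrative example) of the table emulation above.
// vtable_prepare() duplicates each 16-byte row into both AVX2 half-registers
// and XOR-chains the rows, so each _mm256_shuffle_epi8 pass either adds a
// row's contribution or cancels a lower row back out, letting one in-register
// shuffle per row emulate a wider lookup. The table contents are hypothetical:
//
//     alignas(32) const uint8_t table[32] = { /* 32 table entries */ };
//     vtable8_32x8 tbl;
//     vtable_prepare(tbl, table);
//     vint8 result = vtable_lookup_32bit(tbl, vint8::lane_id());  // entries 0..7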

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
{
	_mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
}
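
// Usage sketch (editor's illustrative example): interleave_rgba8() packs one
// RGBA8 texel per 32-bit lane, and store_lanes_masked() then writes only the
// enabled leading lanes, so a partial run of texels can be emitted without
// writing past the output buffer. The r/g/b/a vectors and dst pointer here
// are hypothetical:
//
//     vint8 texels = interleave_rgba8(r, g, b, a);   // r/g/b/a: 0..255 per lane
//     vmask8 keep = vint8::lane_id() < vint8(5);     // keep first 5 texels
//     store_lanes_masked(dst, texels, keep);         // dst: uint8_t* with room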

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);
	printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);

	unsigned int uv[8];
	std::memcpy(uv, v, sizeof(int) * 8);

	printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
	       uv[0], uv[1], uv[2], uv[3], uv[4], uv[5], uv[6], uv[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
	alignas(32) float v[8];
	storea(a, v);
	printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
	       static_cast<double>(v[2]), static_cast<double>(v[3]),
	       static_cast<double>(v[4]), static_cast<double>(v[5]),
	       static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
	print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED