CoCalc -- astcenc_vecmathlib_avx2

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h
⁹⁸⁹⁸ views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2019-2025 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17

18
/**
19
 * @brief 8x32-bit vectors, implemented using AVX2.
20
 *
21
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
22
 * AVX2.
23
 *
24
 * There is a baseline level of functionality provided by all vector widths and
25
 * implementations. This is implemented using identical function signatures,
26
 * modulo data type, so we can use them as substitutable implementations in VLA
27
 * code.
28
 */
29

30
#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
31
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
32

33
#ifndef ASTCENC_SIMD_INLINE
34
	#error "Include astcenc_vecmathlib.h, do not include directly"
35
#endif
36

37
#include <cstdio>
38

39
// Define convenience intrinsics that are missing on older compilers
40
// Builds a 256-bit integer vector from two 128-bit halves: n is cast into the
// low 128 bits and m is inserted into the high 128 bits (half index 1).
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)
41

42
// ============================================================================
43
// vfloat8 data type
44
// ============================================================================
45

46
/**
47
 * @brief Data type for 8-wide floats.
48
 */
49
struct vfloat8
50
{
51
	/**
52
	 * @brief Construct from zero-initialized value.
53
	 */
54
	ASTCENC_SIMD_INLINE vfloat8() = default;
55

56
	/**
57
	 * @brief Construct from 8 values loaded from an unaligned address.
58
	 *
59
	 * Consider using loada() which is better with vectors if data is aligned
60
	 * to vector length.
61
	 */
62
	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
63
	{
64
		m = _mm256_loadu_ps(p);
65
	}
66

67
	/**
68
	 * @brief Construct from 1 scalar value replicated across all lanes.
69
	 *
70
	 * Consider using zero() for constexpr zeros.
71
	 */
72
	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
73
	{
74
		m = _mm256_set1_ps(a);
75
	}
76

77
	/**
78
	 * @brief Construct from an existing SIMD register.
79
	 */
80
	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
81
	{
82
		m = a;
83
	}
84

85
	/**
86
	 * @brief Factory that returns a vector of zeros.
87
	 */
88
	static ASTCENC_SIMD_INLINE vfloat8 zero()
89
	{
90
		return vfloat8(_mm256_setzero_ps());
91
	}
92

93
	/**
94
	 * @brief Factory that returns a replicated scalar loaded from memory.
95
	 */
96
	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
97
	{
98
		return vfloat8(_mm256_broadcast_ss(p));
99
	}
100

101
	/**
102
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
103
	 */
104
	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
105
	{
106
		return vfloat8(_mm256_load_ps(p));
107
	}
108

109
	/**
110
	 * @brief The vector ...
111
	 */
112
	__m256 m;
113
};
114

115
// ============================================================================
116
// vint8 data type
117
// ============================================================================
118

119
/**
120
 * @brief Data type for 8-wide ints.
121
 */
122
struct vint8
123
{
124
	/**
125
	 * @brief Construct from zero-initialized value.
126
	 */
127
	ASTCENC_SIMD_INLINE vint8() = default;
128

129
	/**
130
	 * @brief Construct from 8 values loaded from an unaligned address.
131
	 *
132
	 * Consider using loada() which is better with vectors if data is aligned
133
	 * to vector length.
134
	 */
135
	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
136
	{
137
		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
138
	}
139

140
	/**
141
	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
142
	 */
143
	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
144
	{
145
		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
146
		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
147
	}
148

149
	/**
150
	 * @brief Construct from 1 scalar value replicated across all lanes.
151
	 *
152
	 * Consider using zero() for constexpr zeros.
153
	 */
154
	ASTCENC_SIMD_INLINE explicit vint8(int a)
155
	{
156
		m = _mm256_set1_epi32(a);
157
	}
158

159
	/**
160
	 * @brief Construct from an existing SIMD register.
161
	 */
162
	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
163
	{
164
		m = a;
165
	}
166

167
	/**
168
	 * @brief Factory that returns a vector of zeros.
169
	 */
170
	static ASTCENC_SIMD_INLINE vint8 zero()
171
	{
172
		return vint8(_mm256_setzero_si256());
173
	}
174

175
	/**
176
	 * @brief Factory that returns a replicated scalar loaded from memory.
177
	 */
178
	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
179
	{
180
		__m128i a = _mm_set1_epi32(*p);
181
		return vint8(_mm256_broadcastd_epi32(a));
182
	}
183

184
	/**
185
	 * @brief Factory that returns a vector loaded from unaligned memory.
186
	 */
187
	static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
188
	{
189
		return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
190
	}
191

192
	/**
193
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
194
	 */
195
	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
196
	{
197
		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
198
	}
199

200
	/**
201
	 * @brief Factory that returns a vector containing the lane IDs.
202
	 */
203
	static ASTCENC_SIMD_INLINE vint8 lane_id()
204
	{
205
		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
206
	}
207

208
	/**
209
	 * @brief The vector ...
210
	 */
211
	__m256i m;
212
};
213

214
// ============================================================================
215
// vmask8 data type
216
// ============================================================================
217

218
/**
219
 * @brief Data type for 8-wide control plane masks.
220
 */
221
struct vmask8
222
{
223
	/**
224
	 * @brief Construct from an existing SIMD register.
225
	 */
226
	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
227
	{
228
		m = a;
229
	}
230

231
	/**
232
	 * @brief Construct from an existing SIMD register.
233
	 */
234
	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
235
	{
236
		m = _mm256_castsi256_ps(a);
237
	}
238

239
	/**
240
	 * @brief Construct from 1 scalar value.
241
	 */
242
	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
243
	{
244
		vint8 mask(a == false ? 0 : -1);
245
		m = _mm256_castsi256_ps(mask.m);
246
	}
247

248
	/**
249
	 * @brief The vector ...
250
	 */
251
	__m256 m;
252
};
253

254
// ============================================================================
255
// vmask8 operators and functions
256
// ============================================================================
257

258
/**
259
 * @brief Overload: mask union (or).
260
 */
261
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
262
{
263
	return vmask8(_mm256_or_ps(a.m, b.m));
264
}
265

266
/**
267
 * @brief Overload: mask intersect (and).
268
 */
269
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
270
{
271
	return vmask8(_mm256_and_ps(a.m, b.m));
272
}
273

274
/**
275
 * @brief Overload: mask difference (xor).
276
 */
277
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
278
{
279
	return vmask8(_mm256_xor_ps(a.m, b.m));
280
}
281

282
/**
283
 * @brief Overload: mask invert (not).
284
 */
285
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
286
{
287
	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
288
}
289

290
/**
291
 * @brief Return a 8-bit mask code indicating mask status.
292
 *
293
 * bit0 = lane 0
294
 */
295
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
296
{
297
	return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
298
}
299

300
/**
301
 * @brief True if any lanes are enabled, false otherwise.
302
 */
303
ASTCENC_SIMD_INLINE bool any(vmask8 a)
304
{
305
	return mask(a) != 0;
306
}
307

308
/**
309
 * @brief True if all lanes are enabled, false otherwise.
310
 */
311
ASTCENC_SIMD_INLINE bool all(vmask8 a)
312
{
313
	return mask(a) == 0xFF;
314
}
315

316
// ============================================================================
317
// vint8 operators and functions
318
// ============================================================================
319
/**
320
 * @brief Overload: vector by vector addition.
321
 */
322
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
323
{
324
	return vint8(_mm256_add_epi32(a.m, b.m));
325
}
326

327
/**
328
 * @brief Overload: vector by vector incremental addition.
329
 */
330
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
331
{
332
	a = a + b;
333
	return a;
334
}
335

336
/**
337
 * @brief Overload: vector by vector subtraction.
338
 */
339
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
340
{
341
	return vint8(_mm256_sub_epi32(a.m, b.m));
342
}
343

344
/**
345
 * @brief Overload: vector by vector multiplication.
346
 */
347
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
348
{
349
	return vint8(_mm256_mullo_epi32(a.m, b.m));
350
}
351

352
/**
353
 * @brief Overload: vector bit invert.
354
 */
355
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
356
{
357
	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
358
}
359

360
/**
361
 * @brief Overload: vector by vector bitwise or.
362
 */
363
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
364
{
365
	return vint8(_mm256_or_si256(a.m, b.m));
366
}
367

368
/**
369
 * @brief Overload: vector by vector bitwise and.
370
 */
371
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
372
{
373
	return vint8(_mm256_and_si256(a.m, b.m));
374
}
375

376
/**
377
 * @brief Overload: vector by vector bitwise xor.
378
 */
379
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
380
{
381
	return vint8(_mm256_xor_si256(a.m, b.m));
382
}
383

384
/**
385
 * @brief Overload: vector by vector equality.
386
 */
387
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
388
{
389
	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
390
}
391

392
/**
393
 * @brief Overload: vector by vector inequality.
394
 */
395
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
396
{
397
	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
398
}
399

400
/**
401
 * @brief Overload: vector by vector less than.
402
 */
403
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
404
{
405
	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
406
}
407

408
/**
409
 * @brief Overload: vector by vector greater than.
410
 */
411
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
412
{
413
	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
414
}
415

416
/**
417
 * @brief Logical shift left.
418
 */
419
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
420
{
421
	return vint8(_mm256_slli_epi32(a.m, s));
422
}
423

424
/**
425
 * @brief Arithmetic shift right.
426
 */
427
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
428
{
429
	return vint8(_mm256_srai_epi32(a.m, s));
430
}
431

432
/**
433
 * @brief Logical shift right.
434
 */
435
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
436
{
437
	return vint8(_mm256_srli_epi32(a.m, s));
438
}
439

440
/**
441
 * @brief Return the min vector of two vectors.
442
 */
443
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
444
{
445
	return vint8(_mm256_min_epi32(a.m, b.m));
446
}
447

448
/**
449
 * @brief Return the max vector of two vectors.
450
 */
451
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
452
{
453
	return vint8(_mm256_max_epi32(a.m, b.m));
454
}
455

456
/**
457
 * @brief Return the horizontal minimum of a vector.
458
 */
459
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
460
{
461
	// Build min within groups of 2, then 4, then 8
462
	__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
463
	m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
464
	m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));
465

466
	vint8 vmin(m);
467
	return vmin;
468
}
469

470
/**
471
 * @brief Return the horizontal minimum of a vector.
472
 */
473
ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
474
{
475
	return _mm256_cvtsi256_si32(hmin(a).m);
476
}
477

478
/**
479
 * @brief Return the horizontal maximum of a vector.
480
 */
481
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
482
{
483
	// Build max within groups of 2, then 4, then 8
484
	__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
485
	m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
486
	m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));
487

488
	vint8 vmax(m);
489
	return vmax;
490
}
491

492
/**
493
 * @brief Return the horizontal maximum of a vector.
494
 */
495
ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
496
{
497
	return _mm256_cvtsi256_si32(hmax(a).m);
498
}
499

500
/**
501
 * @brief Generate a vint8 from a size_t.
502
 */
503
 ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
504
 {
505
	assert(a <= std::numeric_limits<int>::max());
506
	return vint8(static_cast<int>(a));
507
 }
508

509
/**
510
 * @brief Store a vector to a 16B aligned memory address.
511
 */
512
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
513
{
514
	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
515
}
516

517
/**
518
 * @brief Store a vector to an unaligned memory address.
519
 */
520
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
521
{
522
	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
523
}
524

525
/**
526
 * @brief Store lowest N (vector width) bytes into an unaligned address.
527
 */
528
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
529
{
530
	// This is the most logical implementation, but the convenience intrinsic
531
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
532
	// _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
533
	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
534
}
535

536
/**
537
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
538
 */
539
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
540
{
541
	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0,  0,  0,  0,  0,
542
	                               0, 0, 0, 0, 28, 24, 20, 16,
543
	                               0, 0, 0, 0,  0,  0,  0,  0,
544
	                               0, 0, 0, 0, 12,  8,  4,  0);
545
	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
546
	__m128i a0 = _mm256_extracti128_si256(a, 0);
547
	__m128i a1 = _mm256_extracti128_si256(a, 1);
548
	__m128i b = _mm_unpacklo_epi32(a0, a1);
549

550
	__m256i r = astcenc_mm256_set_m128i(b, b);
551

552
	store_nbytes(vint8(r), p);
553
}
554

555
/**
556
 * @brief Return lanes from @c b if @c cond is set, else @c a.
557
 */
558
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
559
{
560
	__m256i condi = _mm256_castps_si256(cond.m);
561
	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
562
}
563

564
// ============================================================================
565
// vfloat8 operators and functions
566
// ============================================================================
567

568
/**
569
 * @brief Overload: vector by vector addition.
570
 */
571
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
572
{
573
	return vfloat8(_mm256_add_ps(a.m, b.m));
574
}
575

576
/**
577
 * @brief Overload: vector by vector incremental addition.
578
 */
579
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
580
{
581
	a = a + b;
582
	return a;
583
}
584

585
/**
586
 * @brief Overload: vector by vector subtraction.
587
 */
588
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
589
{
590
	return vfloat8(_mm256_sub_ps(a.m, b.m));
591
}
592

593
/**
594
 * @brief Overload: vector by vector multiplication.
595
 */
596
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
597
{
598
	return vfloat8(_mm256_mul_ps(a.m, b.m));
599
}
600

601
/**
602
 * @brief Overload: vector by scalar multiplication.
603
 */
604
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
605
{
606
	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
607
}
608

609
/**
610
 * @brief Overload: scalar by vector multiplication.
611
 */
612
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
613
{
614
	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
615
}
616

617
/**
618
 * @brief Overload: vector by vector division.
619
 */
620
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
621
{
622
	return vfloat8(_mm256_div_ps(a.m, b.m));
623
}
624

625
/**
626
 * @brief Overload: vector by scalar division.
627
 */
628
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
629
{
630
	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
631
}
632

633
/**
634
 * @brief Overload: scalar by vector division.
635
 */
636
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
637
{
638
	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
639
}
640

641
/**
642
 * @brief Overload: vector by vector equality.
643
 */
644
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
645
{
646
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
647
}
648

649
/**
650
 * @brief Overload: vector by vector inequality.
651
 */
652
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
653
{
654
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
655
}
656

657
/**
658
 * @brief Overload: vector by vector less than.
659
 */
660
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
661
{
662
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
663
}
664

665
/**
666
 * @brief Overload: vector by vector greater than.
667
 */
668
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
669
{
670
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
671
}
672

673
/**
674
 * @brief Overload: vector by vector less than or equal.
675
 */
676
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
677
{
678
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
679
}
680

681
/**
682
 * @brief Overload: vector by vector greater than or equal.
683
 */
684
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
685
{
686
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
687
}
688

689
/**
690
 * @brief Return the min vector of two vectors.
691
 *
692
 * If either lane value is NaN, @c b will be returned for that lane.
693
 */
694
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
695
{
696
	return vfloat8(_mm256_min_ps(a.m, b.m));
697
}
698

699
/**
700
 * @brief Return the min vector of a vector and a scalar.
701
 *
702
 * If either lane value is NaN, @c b will be returned for that lane.
703
 */
704
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
705
{
706
	return min(a, vfloat8(b));
707
}
708

709
/**
710
 * @brief Return the max vector of two vectors.
711
 *
712
 * If either lane value is NaN, @c b will be returned for that lane.
713
 */
714
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
715
{
716
	return vfloat8(_mm256_max_ps(a.m, b.m));
717
}
718

719
/**
720
 * @brief Return the max vector of a vector and a scalar.
721
 *
722
 * If either lane value is NaN, @c b will be returned for that lane.
723
 */
724
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
725
{
726
	return max(a, vfloat8(b));
727
}
728

729
/**
730
 * @brief Return the clamped value between min and max.
731
 *
732
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
733
 * then @c min will be returned for that lane.
734
 */
735
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
736
{
737
	// Do not reorder - second operand will return if either is NaN
738
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
739
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
740
	return a;
741
}
742

743
/**
744
 * @brief Return a clamped value between 0.0f and 1.0f.
745
 *
746
 * If @c a is NaN then zero will be returned for that lane.
747
 */
748
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
749
{
750
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
751
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
752
	return a;
753
}
754

755
/**
756
 * @brief Return the absolute value of the float vector.
757
 */
758
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
759
{
760
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
761
	return vfloat8(_mm256_and_ps(a.m, msk));
762
}
763

764
/**
765
 * @brief Return a float rounded to the nearest integer value.
766
 */
767
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
768
{
769
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
770
	return vfloat8(_mm256_round_ps(a.m, flags));
771
}
772

773
/**
774
 * @brief Return the horizontal minimum of a vector.
775
 */
776
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
777
{
778
	__m128 vlow = _mm256_castps256_ps128(a.m);
779
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
780
	vlow = _mm_min_ps(vlow, vhigh);
781

782
	// First do an horizontal reduction.
783
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
784
	__m128 mins = _mm_min_ps(vlow, shuf);
785
	shuf = _mm_movehl_ps(shuf, mins);
786
	mins = _mm_min_ss(mins, shuf);
787

788
	// This is the most logical implementation, but the convenience intrinsic
789
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
790
	//__m256i r = _mm256_set_m128(m, m)
791
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);
792

793
	return vfloat8(_mm256_permute_ps(r, 0));
794
}
795

796
/**
797
 * @brief Return the horizontal minimum of a vector.
798
 */
799
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
800
{
801
	return _mm256_cvtss_f32(hmin(a).m);
802
}
803

804
/**
805
 * @brief Return the horizontal maximum of a vector.
806
 */
807
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
808
{
809
	__m128 vlow = _mm256_castps256_ps128(a.m);
810
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
811
	vhigh = _mm_max_ps(vlow, vhigh);
812

813
	// First do an horizontal reduction.
814
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
815
	__m128 maxs = _mm_max_ps(vhigh, shuf);
816
	shuf = _mm_movehl_ps(shuf,maxs);
817
	maxs = _mm_max_ss(maxs, shuf);
818

819
	// This is the most logical implementation, but the convenience intrinsic
820
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
821
	//__m256i r = _mm256_set_m128(m, m)
822
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
823
	return vfloat8(_mm256_permute_ps(r, 0));
824
}
825

826
/**
827
 * @brief Return the horizontal maximum of a vector.
828
 */
829
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
830
{
831
	return _mm256_cvtss_f32(hmax(a).m);
832
}
833

834
/**
835
 * @brief Return the horizontal sum of a vector.
836
 */
837
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
838
{
839
	// Two sequential 4-wide adds gives invariance with 4-wide code
840
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
841
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
842
	return hadd_s(lo) + hadd_s(hi);
843
}
844

845
/**
846
 * @brief Return lanes from @c b if @c cond is set, else @c a.
847
 */
848
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
849
{
850
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
851
}
852

853
/**
854
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
855
 *
856
 * This is invariant with 4-wide implementations.
857
 */
858
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
859
{
860
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
861
	haccumulate(accum, lo);
862

863
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
864
	haccumulate(accum, hi);
865
}
866

867
/**
868
 * @brief Accumulate lane-wise sums for a vector.
869
 *
870
 * This is NOT invariant with 4-wide implementations.
871
 */
872
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
873
{
874
	accum += a;
875
}
876

877
/**
878
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
879
 *
880
 * This is invariant with 4-wide implementations.
881
 */
882
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
883
{
884
	a = select(vfloat8::zero(), a, m);
885
	haccumulate(accum, a);
886
}
887

888
/**
889
 * @brief Accumulate masked lane-wise sums for a vector.
890
 *
891
 * This is NOT invariant with 4-wide implementations.
892
 */
893
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
894
{
895
	a = select(vfloat8::zero(), a, m);
896
	haccumulate(accum, a);
897
}
898

899
/**
900
 * @brief Return the sqrt of the lanes in the vector.
901
 */
902
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
903
{
904
	return vfloat8(_mm256_sqrt_ps(a.m));
905
}
906

907
/**
908
 * @brief Load a vector of gathered results from an array;
909
 */
910
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
911
{
912
	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
913
}
914

915
/**
916
 * @brief Load a vector of gathered results from an array using byte indices from memory
917
 */
918
template<>
919
ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
920
{
921
#if ASTCENC_X86_GATHERS == 0
922
	// Perform manual gather using scalar loads in two separate dependency chains,
923
	// then merge late. MSVC translates this 1:1, which is OK. Clang turns it
924
	// into a bunch of memory-operand inserts on 128-bit halves then merges late,
925
	// which performs significantly worse in tests.
926
	__m256 m0 = _mm256_broadcast_ss(base + indices[0]);
927
	__m256 m1 = _mm256_broadcast_ss(base + indices[1]);
928
	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
929
	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
930
	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
931
	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
932
	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
933
	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);
934

935
	return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
936
#else
937
	vint8 inds(indices);
938
	return gatherf(base, inds);
939
#endif
940
}
941

942
/**
943
 * @brief Store a vector to an unaligned memory address.
944
 */
945
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
946
{
947
	_mm256_storeu_ps(p, a.m);
948
}
949

950
/**
951
 * @brief Store a vector to a 32B aligned memory address.
952
 */
953
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
954
{
955
	_mm256_store_ps(p, a.m);
956
}
957

958
/**
959
 * @brief Return a integer value for a float vector, using truncation.
960
 */
961
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
962
{
963
	return vint8(_mm256_cvttps_epi32(a.m));
964
}
965

966
/**
967
 * @brief Return a integer value for a float vector, using round-to-nearest.
968
 */
969
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
970
{
971
	a = a + vfloat8(0.5f);
972
	return vint8(_mm256_cvttps_epi32(a.m));
973
}
974

975

976
/**
977
 * @brief Return a float value for an integer vector.
978
 */
979
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
980
{
981
	return vfloat8(_mm256_cvtepi32_ps(a.m));
982
}
983

984
/**
985
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
986
 *
987
 * It is a common trick to convert floats into integer bit patterns, perform
988
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
989
 * convert them back again. This is the first half of that flip.
990
 */
991
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
992
{
993
	return vint8(_mm256_castps_si256(a.m));
994
}
995

996
/**
997
 * @brief Return a integer value as a float bit pattern (i.e. no conversion).
998
 *
999
 * It is a common trick to convert floats into integer bit patterns, perform
1000
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
1001
 * convert them back again. This is the second half of that flip.
1002
 */
1003
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
1004
{
1005
	return vfloat8(_mm256_castsi256_ps(a.m));
1006
}
1007

1008
/*
1009
 * Table structure for a 16x 8-bit entry table.
1010
 */
1011
struct vtable8_16x8 {
1012
	vint8 t0;
1013
};
1014

1015
/*
1016
 * Table structure for a 32x 8-bit entry table.
1017
 */
1018
struct vtable8_32x8 {
1019
	vint8 t0;
1020
	vint8 t1;
1021
};
1022

1023
/*
1024
 * Table structure for a 64x 8-bit entry table.
1025
 */
1026
struct vtable8_64x8 {
1027
	vint8 t0;
1028
	vint8 t1;
1029
	vint8 t2;
1030
	vint8 t3;
1031
};
1032

1033
/**
1034
 * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
1035
 */
1036
ASTCENC_SIMD_INLINE void vtable_prepare(
1037
	vtable8_16x8& table,
1038
	const uint8_t* data
1039
) {
1040
	// AVX2 tables duplicate table entries in each 128-bit half-register
1041
	vint4 d0 = vint4::load(data);
1042

1043
	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
1044
}
1045

1046
/**
1047
 * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
1048
 */
1049
ASTCENC_SIMD_INLINE void vtable_prepare(
1050
	vtable8_32x8& table,
1051
	const uint8_t* data
1052
) {
1053
	// AVX2 tables duplicate table entries in each 128-bit half-register
1054
	vint4 d0 = vint4::load(data);
1055
	vint4 d1 = vint4::load(data + 16);
1056

1057
	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
1058
	table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
1059

1060
	// XOR chain the high rows to allow table emulation
1061
	table.t1 = table.t1 ^ table.t0;
1062
}
1063

1064
/**
1065
 * @brief Prepare a vtable lookup table 64x 8-bit entry table.
1066
 */
1067
ASTCENC_SIMD_INLINE void vtable_prepare(
1068
	vtable8_64x8& table,
1069
	const uint8_t* data
1070
) {
1071
	// AVX2 tables duplicate table entries in each 128-bit half-register
1072
	vint4 d0 = vint4::load(data);
1073
	vint4 d1 = vint4::load(data + 16);
1074
	vint4 d2 = vint4::load(data + 32);
1075
	vint4 d3 = vint4::load(data + 48);
1076

1077
	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
1078
	table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
1079
	table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m));
1080
	table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m));
1081

1082
	// XOR chain the high rows to allow table emulation
1083
	table.t3 = table.t3 ^ table.t2;
1084
	table.t2 = table.t2 ^ table.t1;
1085
	table.t1 = table.t1 ^ table.t0;
1086
}
1087

1088
/**
1089
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
1090
 */
1091
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
1092
	const vtable8_16x8& tbl,
1093
	vint8 idx
1094
) {
1095
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
1096
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
1097

1098
	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
1099
	return vint8(result);
1100
}
1101

1102
/**
1103
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
1104
 */
1105
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
1106
	const vtable8_32x8& tbl,
1107
	vint8 idx
1108
) {
1109
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
1110
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
1111

1112
	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
1113
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
1114

1115
	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
1116
	result = _mm256_xor_si256(result, result2);
1117
	return vint8(result);
1118
}
1119

1120
/**
1121
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
1122
 */
1123
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
1124
	const vtable8_64x8& tbl,
1125
	vint8 idx
1126
) {
1127
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
1128
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
1129

1130
	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
1131
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
1132

1133
	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
1134
	result = _mm256_xor_si256(result, result2);
1135
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
1136

1137
	result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
1138
	result = _mm256_xor_si256(result, result2);
1139
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
1140

1141
	result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
1142
	result = _mm256_xor_si256(result, result2);
1143

1144
	return vint8(result);
1145
}
1146

1147
/**
1148
 * @brief Return a vector of interleaved RGBA data.
1149
 *
1150
 * Input vectors have the value stored in the bottom 8 bits of each lane,
1151
 * with high bits set to zero.
1152
 *
1153
 * Output vector stores a single RGBA texel packed in each lane.
1154
 */
1155
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
1156
{
1157
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
1158
}
1159

1160
/**
1161
 * @brief Store a vector, skipping masked lanes.
1162
 *
1163
 * All masked lanes must be at the end of vector, after all non-masked lanes.
1164
 */
1165
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
1166
{
1167
	_mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
1168
}
1169

1170
/**
1171
 * @brief Debug function to print a vector of ints.
1172
 */
1173
ASTCENC_SIMD_INLINE void print(vint8 a)
1174
{
1175
	alignas(32) int v[8];
1176
	storea(a, v);
1177
	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
1178
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
1179
}
1180

1181
/**
1182
 * @brief Debug function to print a vector of ints.
1183
 */
1184
ASTCENC_SIMD_INLINE void printx(vint8 a)
1185
{
1186
	alignas(32) int v[8];
1187
	storea(a, v);
1188

1189
	unsigned int uv[8];
1190
	std::memcpy(uv, v, sizeof(int) * 8);
1191

1192
	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
1193
		uv[0], uv[1], uv[2], uv[3], uv[4], uv[5], uv[6], uv[7]);
1194
}
1195

1196
/**
1197
 * @brief Debug function to print a vector of floats.
1198
 */
1199
ASTCENC_SIMD_INLINE void print(vfloat8 a)
1200
{
1201
	alignas(32) float v[8];
1202
	storea(a, v);
1203
	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
1204
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
1205
	       static_cast<double>(v[2]), static_cast<double>(v[3]),
1206
	       static_cast<double>(v[4]), static_cast<double>(v[5]),
1207
	       static_cast<double>(v[6]), static_cast<double>(v[7]));
1208
}
1209

1210
/**
1211
 * @brief Debug function to print a vector of masks.
1212
 */
1213
ASTCENC_SIMD_INLINE void print(vmask8 a)
1214
{
1215
	print(select(vint8(0), vint8(1), a));
1216
}
1217

1218
#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
1219

1220
Product

Resources

Company