CoCalc -- astcenc_vecmathlib_neon

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h
⁹⁸⁹⁸ views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2019-2024 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17

18
/**
19
 * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
20
 *
21
 * This module implements 4-wide 32-bit float, int, and mask vectors for
22
 * Armv8-A NEON.
23
 *
24
 * There is a baseline level of functionality provided by all vector widths and
25
 * implementations. This is implemented using identical function signatures,
26
 * modulo data type, so we can use them as substitutable implementations in VLA
27
 * code.
28
 *
29
 * The 4-wide vectors are also used as a fixed-width type, and significantly
30
 * extend the functionality above that available to VLA code.
31
 */
32

33
#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
34
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED
35

36
#ifndef ASTCENC_SIMD_INLINE
37
	#error "Include astcenc_vecmathlib.h, do not include directly"
38
#endif
39

40
#include <cstdio>
41
#include <cstring>
42

43
// ============================================================================
44
// vfloat4 data type
45
// ============================================================================
46

47
/**
48
 * @brief Data type for 4-wide floats.
49
 */
50
struct vfloat4
51
{
52
	/**
53
	 * @brief Construct from zero-initialized value.
54
	 */
55
	ASTCENC_SIMD_INLINE vfloat4() = default;
56

57
	/**
58
	 * @brief Construct from 4 values loaded from an unaligned address.
59
	 *
60
	 * Consider using loada() which is better with vectors if data is aligned
61
	 * to vector length.
62
	 */
63
	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
64
	{
65
		m = vld1q_f32(p);
66
	}
67

68
	/**
69
	 * @brief Construct from 1 scalar value replicated across all lanes.
70
	 *
71
	 * Consider using zero() for constexpr zeros.
72
	 */
73
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
74
	{
75
		m = vdupq_n_f32(a);
76
	}
77

78
	/**
79
	 * @brief Construct from 4 scalar values.
80
	 *
81
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
82
	 */
83
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
84
	{
85
		float v[4] { a, b, c, d };
86
		m = vld1q_f32(v);
87
	}
88

89
	/**
90
	 * @brief Construct from an existing SIMD register.
91
	 */
92
	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
93
	{
94
		m = a;
95
	}
96

97
	/**
98
	 * @brief Get the scalar value of a single lane.
99
	 */
100
	template <int l> ASTCENC_SIMD_INLINE float lane() const
101
	{
102
		return vgetq_lane_f32(m, l);
103
	}
104

105
	/**
106
	 * @brief Set the scalar value of a single lane.
107
	 */
108
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
109
	{
110
		m = vsetq_lane_f32(a, m, l);
111
	}
112

113
	/**
114
	 * @brief Factory that returns a vector of zeros.
115
	 */
116
	static ASTCENC_SIMD_INLINE vfloat4 zero()
117
	{
118
		return vfloat4(0.0f);
119
	}
120

121
	/**
122
	 * @brief Factory that returns a replicated scalar loaded from memory.
123
	 */
124
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
125
	{
126
		return vfloat4(vld1q_dup_f32(p));
127
	}
128

129
	/**
130
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
131
	 */
132
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
133
	{
134
		return vfloat4(vld1q_f32(p));
135
	}
136

137
	/**
138
	 * @brief Return a swizzled float 2.
139
	 */
140
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
141
	{
142
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
143
	}
144

145
	/**
146
	 * @brief Return a swizzled float 3.
147
	 */
148
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
149
	{
150
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
151
	}
152

153
	/**
154
	 * @brief Return a swizzled float 4.
155
	 */
156
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
157
	{
158
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
159
	}
160

161
	/**
162
	 * @brief The vector ...
163
	 */
164
	float32x4_t m;
165
};
166

167
// ============================================================================
168
// vint4 data type
169
// ============================================================================
170

171
/**
172
 * @brief Data type for 4-wide ints.
173
 */
174
struct vint4
175
{
176
	/**
177
	 * @brief Construct from zero-initialized value.
178
	 */
179
	ASTCENC_SIMD_INLINE vint4() = default;
180

181
	/**
182
	 * @brief Construct from 4 values loaded from an unaligned address.
183
	 *
184
	 * Consider using loada() which is better with vectors if data is aligned
185
	 * to vector length.
186
	 */
187
	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
188
	{
189
		m = vld1q_s32(p);
190
	}
191

192
	/**
193
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
194
	 */
195
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
196
	{
197
#if ASTCENC_SVE == 0
198
	// Cast is safe - NEON loads are allowed to be unaligned
199
	uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
200
	uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
201
	m = vreinterpretq_s32_u32(vmovl_u16(t16));
202
#else
203
	svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
204
	m = svget_neonq(data);
205
#endif
206
	}
207

208
	/**
209
	 * @brief Construct from 1 scalar value replicated across all lanes.
210
	 *
211
	 * Consider using zero() for constexpr zeros.
212
	 */
213
	ASTCENC_SIMD_INLINE explicit vint4(int a)
214
	{
215
		m = vdupq_n_s32(a);
216
	}
217

218
	/**
219
	 * @brief Construct from 4 scalar values.
220
	 *
221
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
222
	 */
223
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
224
	{
225
		int v[4] { a, b, c, d };
226
		m = vld1q_s32(v);
227
	}
228

229
	/**
230
	 * @brief Construct from an existing SIMD register.
231
	 */
232
	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
233
	{
234
		m = a;
235
	}
236

237
	/**
238
	 * @brief Get the scalar from a single lane.
239
	 */
240
	template <int l> ASTCENC_SIMD_INLINE int lane() const
241
	{
242
		return vgetq_lane_s32(m, l);
243
	}
244

245
	/**
246
	 * @brief Set the scalar value of a single lane.
247
	 */
248
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
249
	{
250
		m = vsetq_lane_s32(a, m, l);
251
	}
252

253
	/**
254
	 * @brief Factory that returns a vector of zeros.
255
	 */
256
	static ASTCENC_SIMD_INLINE vint4 zero()
257
	{
258
		return vint4(0);
259
	}
260

261
	/**
262
	 * @brief Factory that returns a replicated scalar loaded from memory.
263
	 */
264
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
265
	{
266
		return vint4(*p);
267
	}
268

269
	/**
270
	 * @brief Factory that returns a vector loaded from unaligned memory.
271
	 */
272
	static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
273
	{
274
		vint4 data;
275
		std::memcpy(&data.m, p, 4 * sizeof(int));
276
		return data;
277
	}
278

279
	/**
280
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
281
	 */
282
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
283
	{
284
		return vint4(p);
285
	}
286

287
	/**
288
	 * @brief Factory that returns a vector containing the lane IDs.
289
	 */
290
	static ASTCENC_SIMD_INLINE vint4 lane_id()
291
	{
292
		alignas(16) static const int data[4] { 0, 1, 2, 3 };
293
		return vint4(vld1q_s32(data));
294
	}
295

296
	/**
297
	 * @brief The vector ...
298
	 */
299
	int32x4_t m;
300
};
301

302
// ============================================================================
303
// vmask4 data type
304
// ============================================================================
305

306
/**
307
 * @brief Data type for 4-wide control plane masks.
308
 */
309
struct vmask4
310
{
311
	/**
312
	 * @brief Construct from an existing SIMD register.
313
	 */
314
	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
315
	{
316
		m = a;
317
	}
318

319
#if !defined(_MSC_VER)
320
	/**
321
	 * @brief Construct from an existing SIMD register.
322
	 */
323
	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
324
	{
325
		m = vreinterpretq_u32_s32(a);
326
	}
327
#endif
328

329
	/**
330
	 * @brief Construct from 1 scalar value.
331
	 */
332
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
333
	{
334
		m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
335
	}
336

337
	/**
338
	 * @brief Construct from 4 scalar values.
339
	 *
340
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
341
	 */
342
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
343
	{
344
		int v[4] {
345
			a == true ? -1 : 0,
346
			b == true ? -1 : 0,
347
			c == true ? -1 : 0,
348
			d == true ? -1 : 0
349
		};
350

351
		int32x4_t ms = vld1q_s32(v);
352
		m = vreinterpretq_u32_s32(ms);
353
	}
354

355
	/**
356
	 * @brief Get the scalar from a single lane.
357
	 */
358
	template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
359
	{
360
		return vgetq_lane_u32(m, l) != 0;
361
	}
362

363
	/**
364
	 * @brief The vector ...
365
	 */
366
	uint32x4_t m;
367
};
368

369
// ============================================================================
370
// vmask4 operators and functions
371
// ============================================================================
372

373
/**
374
 * @brief Overload: mask union (or).
375
 */
376
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
377
{
378
	return vmask4(vorrq_u32(a.m, b.m));
379
}
380

381
/**
382
 * @brief Overload: mask intersect (and).
383
 */
384
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
385
{
386
	return vmask4(vandq_u32(a.m, b.m));
387
}
388

389
/**
390
 * @brief Overload: mask difference (xor).
391
 */
392
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
393
{
394
	return vmask4(veorq_u32(a.m, b.m));
395
}
396

397
/**
398
 * @brief Overload: mask invert (not).
399
 */
400
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
401
{
402
	return vmask4(vmvnq_u32(a.m));
403
}
404

405
/**
406
 * @brief Return a 4-bit mask code indicating mask status.
407
 *
408
 * bit0 = lane 0
409
 */
410
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
411
{
412
	static const int shifta[4] { 0, 1, 2, 3 };
413
	static const int32x4_t shift = vld1q_s32(shifta);
414

415
	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
416
	return vaddvq_u32(vshlq_u32(tmp, shift));
417
}
418

419
/**
420
 * @brief True if any lanes are enabled, false otherwise.
421
 */
422
ASTCENC_SIMD_INLINE bool any(vmask4 a)
423
{
424
	return vmaxvq_u32(a.m) != 0;
425
}
426

427
/**
428
 * @brief True if all lanes are enabled, false otherwise.
429
 */
430
ASTCENC_SIMD_INLINE bool all(vmask4 a)
431
{
432
	return vminvq_u32(a.m) != 0;
433
}
434

435
// ============================================================================
436
// vint4 operators and functions
437
// ============================================================================
438

439
/**
440
 * @brief Overload: vector by vector addition.
441
 */
442
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
443
{
444
	return vint4(vaddq_s32(a.m, b.m));
445
}
446

447
/**
448
 * @brief Overload: vector by vector subtraction.
449
 */
450
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
451
{
452
	return vint4(vsubq_s32(a.m, b.m));
453
}
454

455
/**
456
 * @brief Overload: vector by vector multiplication.
457
 */
458
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
459
{
460
	return vint4(vmulq_s32(a.m, b.m));
461
}
462

463
/**
464
 * @brief Overload: vector bit invert.
465
 */
466
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
467
{
468
	return vint4(vmvnq_s32(a.m));
469
}
470

471
/**
472
 * @brief Overload: vector by vector bitwise or.
473
 */
474
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
475
{
476
	return vint4(vorrq_s32(a.m, b.m));
477
}
478

479
/**
480
 * @brief Overload: vector by vector bitwise and.
481
 */
482
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
483
{
484
	return vint4(vandq_s32(a.m, b.m));
485
}
486

487
/**
488
 * @brief Overload: vector by vector bitwise xor.
489
 */
490
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
491
{
492
	return vint4(veorq_s32(a.m, b.m));
493
}
494

495
/**
496
 * @brief Overload: vector by vector equality.
497
 */
498
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
499
{
500
	return vmask4(vceqq_s32(a.m, b.m));
501
}
502

503
/**
504
 * @brief Overload: vector by vector inequality.
505
 */
506
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
507
{
508
	return ~vmask4(vceqq_s32(a.m, b.m));
509
}
510

511
/**
512
 * @brief Overload: vector by vector less than.
513
 */
514
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
515
{
516
	return vmask4(vcltq_s32(a.m, b.m));
517
}
518

519
/**
520
 * @brief Overload: vector by vector greater than.
521
 */
522
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
523
{
524
	return vmask4(vcgtq_s32(a.m, b.m));
525
}
526

527
/**
528
 * @brief Logical shift left.
529
 */
530
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
531
{
532
	return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
533
}
534

535
/**
536
 * @brief Logical shift right.
537
 */
538
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
539
{
540
	uint32x4_t ua = vreinterpretq_u32_s32(a.m);
541
	ua = vshlq_u32(ua, vdupq_n_s32(-s));
542
	return vint4(vreinterpretq_s32_u32(ua));
543
}
544

545
/**
546
 * @brief Arithmetic shift right.
547
 */
548
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
549
{
550
	return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
551
}
552

553
/**
554
 * @brief Return the min vector of two vectors.
555
 */
556
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
557
{
558
	return vint4(vminq_s32(a.m, b.m));
559
}
560

561
/**
562
 * @brief Return the max vector of two vectors.
563
 */
564
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
565
{
566
	return vint4(vmaxq_s32(a.m, b.m));
567
}
568

569
/**
570
 * @brief Return the horizontal minimum of a vector.
571
 */
572
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
573
{
574
	return vint4(vminvq_s32(a.m));
575
}
576

577
/**
578
 * @brief Return the horizontal maximum of a vector.
579
 */
580
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
581
{
582
	return vint4(vmaxvq_s32(a.m));
583
}
584

585
/**
586
 * @brief Store a vector to a 16B aligned memory address.
587
 */
588
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
589
{
590
	vst1q_s32(p, a.m);
591
}
592

593
/**
594
 * @brief Store a vector to an unaligned memory address.
595
 */
596
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
597
{
598
	vst1q_s32(p, a.m);
599
}
600

601
/**
602
 * @brief Store a vector to an unaligned memory address.
603
 */
604
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
605
{
606
	std::memcpy(p, &a.m, sizeof(int) * 4);
607
}
608

609
/**
610
 * @brief Store lowest N (vector width) bytes into an unaligned address.
611
 */
612
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
613
{
614
	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
615
}
616

617
/**
618
 * @brief Pack and store low 8 bits of each vector lane.
619
 */
620
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data)
621
{
622
	alignas(16) uint8_t shuf[16] {
623
		0, 4, 8, 12,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0
624
	};
625
	uint8x16_t idx = vld1q_u8(shuf);
626
	int8x16_t av = vreinterpretq_s8_s32(a.m);
627
	a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
628
	store_nbytes(a, data);
629
}
630

631
/**
632
 * @brief Return lanes from @c b if @c cond is set, else @c a.
633
 */
634
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
635
{
636
	return vint4(vbslq_s32(cond.m, b.m, a.m));
637
}
638

639
// ============================================================================
640
// vfloat4 operators and functions
641
// ============================================================================
642

643
/**
644
 * @brief Overload: vector by vector addition.
645
 */
646
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
647
{
648
	return vfloat4(vaddq_f32(a.m, b.m));
649
}
650

651
/**
652
 * @brief Overload: vector by vector subtraction.
653
 */
654
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
655
{
656
	return vfloat4(vsubq_f32(a.m, b.m));
657
}
658

659
/**
660
 * @brief Overload: vector by vector multiplication.
661
 */
662
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
663
{
664
	return vfloat4(vmulq_f32(a.m, b.m));
665
}
666

667
/**
668
 * @brief Overload: vector by vector division.
669
 */
670
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
671
{
672
	return vfloat4(vdivq_f32(a.m, b.m));
673
}
674

675
/**
676
 * @brief Overload: vector by vector equality.
677
 */
678
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
679
{
680
	return vmask4(vceqq_f32(a.m, b.m));
681
}
682

683
/**
684
 * @brief Overload: vector by vector inequality.
685
 */
686
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
687
{
688
	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
689
}
690

691
/**
692
 * @brief Overload: vector by vector less than.
693
 */
694
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
695
{
696
	return vmask4(vcltq_f32(a.m, b.m));
697
}
698

699
/**
700
 * @brief Overload: vector by vector greater than.
701
 */
702
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
703
{
704
	return vmask4(vcgtq_f32(a.m, b.m));
705
}
706

707
/**
708
 * @brief Overload: vector by vector less than or equal.
709
 */
710
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
711
{
712
	return vmask4(vcleq_f32(a.m, b.m));
713
}
714

715
/**
716
 * @brief Overload: vector by vector greater than or equal.
717
 */
718
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
719
{
720
	return vmask4(vcgeq_f32(a.m, b.m));
721
}
722

723
/**
724
 * @brief Return the min vector of two vectors.
725
 *
726
 * If either lane value is NaN, @c b will be returned for that lane.
727
 */
728
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
729
{
730
	// Do not reorder - second operand will return if either is NaN
731
	return vfloat4(vminnmq_f32(a.m, b.m));
732
}
733

734
/**
735
 * @brief Return the max vector of two vectors.
736
 *
737
 * If either lane value is NaN, @c b will be returned for that lane.
738
 */
739
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
740
{
741
	// Do not reorder - second operand will return if either is NaN
742
	return vfloat4(vmaxnmq_f32(a.m, b.m));
743
}
744

745
/**
746
 * @brief Return the absolute value of the float vector.
747
 */
748
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
749
{
750
	float32x4_t zero = vdupq_n_f32(0.0f);
751
	float32x4_t inv = vsubq_f32(zero, a.m);
752
	return vfloat4(vmaxq_f32(a.m, inv));
753
}
754

755
/**
756
 * @brief Return a float rounded to the nearest integer value.
757
 */
758
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
759
{
760
	return vfloat4(vrndnq_f32(a.m));
761
}
762

763
/**
764
 * @brief Return the horizontal minimum of a vector.
765
 */
766
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
767
{
768
	return vfloat4(vminvq_f32(a.m));
769
}
770

771
/**
772
 * @brief Return the horizontal maximum of a vector.
773
 */
774
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
775
{
776
	return vfloat4(vmaxvq_f32(a.m));
777
}
778

779
/**
780
 * @brief Return the horizontal sum of a vector.
781
 */
782
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
783
{
784
	// Perform halving add to ensure invariance; we cannot use vaddqv as this
785
	// does (0 + 1 + 2 + 3) which is not invariant with x86 (0 + 2) + (1 + 3).
786
	float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
787
	return vget_lane_f32(vpadd_f32(t, t), 0);
788
}
789

790
/**
791
 * @brief Return the sqrt of the lanes in the vector.
792
 */
793
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
794
{
795
	return vfloat4(vsqrtq_f32(a.m));
796
}
797

798
/**
799
 * @brief Return lanes from @c b if @c cond is set, else @c a.
800
 */
801
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
802
{
803
	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
804
}
805

806
/**
807
 * @brief Load a vector of gathered results from an array;
808
 */
809
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
810
{
811
#if ASTCENC_SVE == 0
812
	alignas(16) int idx[4];
813
	storea(indices, idx);
814
	alignas(16) float vals[4];
815
	vals[0] = base[idx[0]];
816
	vals[1] = base[idx[1]];
817
	vals[2] = base[idx[2]];
818
	vals[3] = base[idx[3]];
819
	return vfloat4(vals);
820
#else
821
	svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m);
822
	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
823
	return vfloat4(svget_neonq_f32(data));
824
#endif
825
}
826

827
/**
828
 * @brief Load a vector of gathered results from an array using byte indices from memory
829
 */
830
template<>
831
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
832
{
833
#if ASTCENC_SVE == 0
834
	alignas(16) float vals[4];
835
	vals[0] = base[indices[0]];
836
	vals[1] = base[indices[1]];
837
	vals[2] = base[indices[2]];
838
	vals[3] = base[indices[3]];
839
	return vfloat4(vals);
840
#else
841
	svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
842
	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
843
	return vfloat4(svget_neonq_f32(data));
844
#endif
845
}
846
/**
847
 * @brief Store a vector to an unaligned memory address.
848
 */
849
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
850
{
851
	vst1q_f32(p, a.m);
852
}
853

854
/**
855
 * @brief Store a vector to a 16B aligned memory address.
856
 */
857
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
858
{
859
	vst1q_f32(p, a.m);
860
}
861

862
/**
863
 * @brief Return a integer value for a float vector, using truncation.
864
 */
865
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
866
{
867
	return vint4(vcvtq_s32_f32(a.m));
868
}
869

870
/**
871
 * @brief Return a integer value for a float vector, using round-to-nearest.
872
 */
873
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
874
{
875
	a = a + vfloat4(0.5f);
876
	return vint4(vcvtq_s32_f32(a.m));
877
}
878

879
/**
880
 * @brief Return a float value for an integer vector.
881
 */
882
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
883
{
884
	return vfloat4(vcvtq_f32_s32(a.m));
885
}
886

887
/**
888
 * @brief Return a float16 value for a float vector, using round-to-nearest.
889
 */
890
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
891
{
892
	// Generate float16 value
893
	float16x4_t f16 = vcvt_f16_f32(a.m);
894

895
	// Convert each 16-bit float pattern to a 32-bit pattern
896
	uint16x4_t u16 = vreinterpret_u16_f16(f16);
897
	uint32x4_t u32 = vmovl_u16(u16);
898
	return vint4(vreinterpretq_s32_u32(u32));
899
}
900

901
/**
902
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
903
 */
904
static inline uint16_t float_to_float16(float a)
905
{
906
	vfloat4 av(a);
907
	return static_cast<uint16_t>(float_to_float16(av).lane<0>());
908
}
909

910
/**
911
 * @brief Return a float value for a float16 vector.
912
 */
913
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
914
{
915
	// Convert each 32-bit float pattern to a 16-bit pattern
916
	uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
917
	uint16x4_t u16 = vmovn_u32(u32);
918
	float16x4_t f16 = vreinterpret_f16_u16(u16);
919

920
	// Generate float16 value
921
	return vfloat4(vcvt_f32_f16(f16));
922
}
923

924
/**
925
 * @brief Return a float value for a float16 scalar.
926
 */
927
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
928
{
929
	vint4 av(a);
930
	return float16_to_float(av).lane<0>();
931
}
932

933
/**
934
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
935
 *
936
 * It is a common trick to convert floats into integer bit patterns, perform
937
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
938
 * convert them back again. This is the first half of that flip.
939
 */
940
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
941
{
942
	return vint4(vreinterpretq_s32_f32(a.m));
943
}
944

945
/**
946
 * @brief Return a integer value as a float bit pattern (i.e. no conversion).
947
 *
948
 * It is a common trick to convert floats into integer bit patterns, perform
949
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
950
 * convert them back again. This is the second half of that flip.
951
 */
952
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
953
{
954
	return vfloat4(vreinterpretq_f32_s32(v.m));
955
}
956

957
/*
958
 * Table structure for a 16x 8-bit entry table.
959
 */
960
struct vtable4_16x8 {
961
	uint8x16_t t0;
962
};
963

964
/*
965
 * Table structure for a 32x 8-bit entry table.
966
 */
967
struct vtable4_32x8 {
968
	uint8x16x2_t t01;
969
};
970

971
/*
972
 * Table structure for a 64x 8-bit entry table.
973
 */
974
struct vtable4_64x8 {
975
	uint8x16x4_t t0123;
976
};
977

978
/**
979
 * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
980
 */
981
ASTCENC_SIMD_INLINE void vtable_prepare(
982
	vtable4_16x8& table,
983
	const uint8_t* data
984
) {
985
	table.t0 = vld1q_u8(data);
986
}
987

988
/**
989
 * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
990
 */
991
ASTCENC_SIMD_INLINE void vtable_prepare(
992
	vtable4_32x8& table,
993
	const uint8_t* data
994
) {
995
	table.t01 = uint8x16x2_t {
996
		vld1q_u8(data),
997
		vld1q_u8(data + 16)
998
	};
999
}
1000

1001
/**
1002
 * @brief Prepare a vtable lookup table 64x 8-bit entry table.
1003
 */
1004
ASTCENC_SIMD_INLINE void vtable_prepare(
1005
	vtable4_64x8& table,
1006
	const uint8_t* data
1007
) {
1008
	table.t0123 = uint8x16x4_t {
1009
		vld1q_u8(data),
1010
		vld1q_u8(data + 16),
1011
		vld1q_u8(data + 32),
1012
		vld1q_u8(data + 48)
1013
	};
1014
}
1015

1016
/**
1017
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
1018
 */
1019
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
1020
	const vtable4_16x8& tbl,
1021
	vint4 idx
1022
) {
1023
	// Set index byte above max index for unused bytes so table lookup returns zero
1024
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
1025
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
1026

1027
	return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes)));
1028
}
1029

1030
/**
1031
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
1032
 */
1033
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
1034
	const vtable4_32x8& tbl,
1035
	vint4 idx
1036
) {
1037
	// Set index byte above max index for unused bytes so table lookup returns zero
1038
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
1039
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
1040

1041
	return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes)));
1042
}
1043

1044
/**
1045
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
1046
 */
1047
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
1048
	const vtable4_64x8& tbl,
1049
	vint4 idx
1050
) {
1051
	// Set index byte above max index for unused bytes so table lookup returns zero
1052
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
1053
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
1054

1055
	return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes)));
1056
}
1057

1058
/**
1059
 * @brief Return a vector of interleaved RGBA data.
1060
 *
1061
 * Input vectors have the value stored in the bottom 8 bits of each lane,
1062
 * with high  bits set to zero.
1063
 *
1064
 * Output vector stores a single RGBA texel packed in each lane.
1065
 */
1066
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
1067
{
1068
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
1069
}
1070

1071
/**
1072
 * @brief Store a single vector lane to an unaligned address.
1073
 */
1074
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
1075
{
1076
	std::memcpy(base, &data, sizeof(int));
1077
}
1078

1079
/**
1080
 * @brief Store a vector, skipping masked lanes.
1081
 *
1082
 * All masked lanes must be at the end of vector, after all non-masked lanes.
1083
 */
1084
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
1085
{
1086
	if (mask.lane<3>())
1087
	{
1088
		store(data, base);
1089
	}
1090
	else if (mask.lane<2>() != 0.0f)
1091
	{
1092
		store_lane(base + 0, data.lane<0>());
1093
		store_lane(base + 4, data.lane<1>());
1094
		store_lane(base + 8, data.lane<2>());
1095
	}
1096
	else if (mask.lane<1>() != 0.0f)
1097
	{
1098
		store_lane(base + 0, data.lane<0>());
1099
		store_lane(base + 4, data.lane<1>());
1100
	}
1101
	else if (mask.lane<0>() != 0.0f)
1102
	{
1103
		store_lane(base + 0, data.lane<0>());
1104
	}
1105
}
1106

1107
#define ASTCENC_USE_NATIVE_POPCOUNT 1
1108

1109
/**
1110
 * @brief Population bit count.
1111
 *
1112
 * @param v   The value to population count.
1113
 *
1114
 * @return The number of 1 bits.
1115
 */
1116
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
1117
{
1118
	return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
1119
}
1120

1121
#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
1122

1123
Product

Resources

Company