// CrossSIMD
//
// This file contains cross-instruction-set SIMD instruction wrappers.
//
// This specific file (and a future CrossSIMD.cpp) is under public domain or MIT, unlike most of the rest of the emulator.

#pragma once

#include <cstring>
#include "Common/Math/SIMDHeaders.h"

#define TEST_FALLBACK 0

#if PPSSPP_ARCH(SSE2) && !TEST_FALLBACK

// The point of this, as opposed to a float4 array, is to almost force the compiler
// to keep the matrix in registers, rather than loading on every access.
struct Mat4F32 {
	Mat4F32() {}
	Mat4F32(const float *matrix) {
		col0 = _mm_loadu_ps(matrix);
		col1 = _mm_loadu_ps(matrix + 4);
		col2 = _mm_loadu_ps(matrix + 8);
		col3 = _mm_loadu_ps(matrix + 12);
	}
	void Store(float *m) {
		_mm_storeu_ps(m, col0);
		_mm_storeu_ps(m + 4, col1);
		_mm_storeu_ps(m + 8, col2);
		_mm_storeu_ps(m + 12, col3);
	}

	// Unlike the old one, this one is careful about not loading out-of-range data.
	// The last two loads overlap.
	static Mat4F32 Load4x3(const float *m) {
		Mat4F32 result;
		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
		__m128 mask1110 = _mm_loadu_ps((const float *)mask);
		result.col0 = _mm_and_ps(_mm_loadu_ps(m), mask1110);
		result.col1 = _mm_and_ps(_mm_loadu_ps(m + 3), mask1110);
		result.col2 = _mm_and_ps(_mm_loadu_ps(m + 6), mask1110);
		__m128 lastCol = _mm_loadu_ps(m + 8);
		result.col3 = _mm_or_ps(_mm_and_ps(_mm_shuffle_ps(lastCol, lastCol, _MM_SHUFFLE(3, 3, 2, 1)), mask1110), _mm_load_ps(onelane3));
		return result;
	}

	__m128 col0;
	__m128 col1;
	__m128 col2;
	__m128 col3;
};

// The columns are spread out between the data*. This is just intermediate storage for multiplication.
struct Mat4x3F32 {
	Mat4x3F32(const float *matrix) {
		data0 = _mm_loadu_ps(matrix);
		data1 = _mm_loadu_ps(matrix + 4);
		data2 = _mm_loadu_ps(matrix + 8);
	}

	__m128 data0;
	__m128 data1;
	__m128 data2;
};

inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
	Mat4F32 result;

	__m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col0, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col0, 1)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col0, 2)));
	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col0, 3)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col1, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col1, 1)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col1, 2)));
	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col1, 3)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col2, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col2, 1)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col2, 2)));
	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col2, 3)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col3, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col3, 1)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col3, 2)));
	result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col3, 3)));

	return result;
}

inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
	Mat4F32 result;

	__m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data0, 1)));
	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data0, 2)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 3));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 0)));
	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data1, 1)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data1, 2));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 3)));
	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 0)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data2, 1));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data2, 2)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 3)));

	// The last entry has an implied 1.0f.
	result.col3 = _mm_add_ps(r_col, b.col3);
	return result;
}
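
// Example usage (illustrative only, not part of this header; the variable names are made up):
//
//   Mat4x3F32 world(worldMatrixData);        // 12 consecutive floats, the columns of a 4x3 matrix
//   Mat4F32 viewProj(viewProjMatrixData);    // 16 consecutive floats, a full 4x4 matrix
//   Mat4F32 worldViewProj = Mul4x3By4x4(world, viewProj);
//   float out[16];
//   worldViewProj.Store(out);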

struct Vec4S32 {
	__m128i v;

	static Vec4S32 Zero() { return Vec4S32{ _mm_setzero_si128() }; }
	static Vec4S32 Splat(int lane) { return Vec4S32{ _mm_set1_epi32(lane) }; }

	static Vec4S32 Load(const int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; }
	static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; }
	void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); }
	void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v); }

	Vec4S32 SignBits32ToMask() {
		return Vec4S32{
			_mm_srai_epi32(v, 31)
		};
	}

	// Reads 16 bits from both operands, produces a 32-bit result per lane.
	// On SSE2, much faster than _mm_mullo_epi32_SSE2.
	// On NEON though, it'll read the full 32 bits, so beware.
	// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
	Vec4S32 Mul16(Vec4S32 other) const {
		// Note that we only need to mask one of the inputs: multiplying by zero is zero,
		// so it doesn't matter what the upper halfword of each 32-bit word is in the
		// other register.
		return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
	}

	Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; }
	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; }
	Vec4S32 FixupAfterMinMax() const { return SignExtend16(); }
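
	// Example (illustrative): clamping values that are already known to fit in signed
	// 16 bits, then restoring proper 32-bit lanes afterwards:
	//
	//   Vec4S32 clamped = x.Max16(lo).Min16(hi).FixupAfterMinMax();
	//
	// Here x, lo and hi are all assumed to be within the signed 16-bit range.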

	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ _mm_and_si128(v, other.v) }; }
	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ _mm_xor_si128(v, other.v) }; }
	// TODO: andnot
	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
	void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); }
	void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); }
	void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); }

	// NOTE: _mm_andnot_si128 inverts its first parameter, then performs the AND.
	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; }
	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }

	template<int imm>
	Vec4S32 Shl() const { return Vec4S32{ imm == 0 ? v : _mm_slli_epi32(v, imm) }; }

	// NOTE: May be slow.
	int operator[](size_t index) const { return ((int *)&v)[index]; }

	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)

	Vec4S32 CompareEq(Vec4S32 other) const { return Vec4S32{ _mm_cmpeq_epi32(v, other.v) }; }
	Vec4S32 CompareLt(Vec4S32 other) const { return Vec4S32{ _mm_cmplt_epi32(v, other.v) }; }
	Vec4S32 CompareGt(Vec4S32 other) const { return Vec4S32{ _mm_cmpgt_epi32(v, other.v) }; }
};

inline bool AnyZeroSignBit(Vec4S32 value) {
	return _mm_movemask_ps(_mm_castsi128_ps(value.v)) != 0xF;
}
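
// Explanation (added): _mm_movemask_ps gathers the four sign bits into a 4-bit mask.
// A mask of 0xF means every lane is negative; anything else means at least one lane
// has a clear sign bit (i.e. the lane is non-negative), which is what this reports.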

struct Vec4F32 {
	__m128 v;

	static Vec4F32 Zero() { return Vec4F32{ _mm_setzero_ps() }; }
	static Vec4F32 Splat(float lane) { return Vec4F32{ _mm_set1_ps(lane) }; }

	static Vec4F32 Load(const float *src) { return Vec4F32{ _mm_loadu_ps(src) }; }
	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; }
	static Vec4F32 LoadS8Norm(const int8_t *src) {
		__m128i value = _mm_set1_epi32(*((uint32_t *)src));
		__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, value), value);
		// Sign extension. A bit ugly without SSE4.
		value32 = _mm_srai_epi32(value32, 24);
		return Vec4F32 { _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_set1_ps(1.0f / 128.0f)) };
	}
	static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f
		__m128i bits = _mm_loadl_epi64((const __m128i *)src);
		// Sign extension. A bit ugly without SSE4.
		bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16);
		return Vec4F32 { _mm_mul_ps(_mm_cvtepi32_ps(bits), _mm_set1_ps(1.0f / 32768.0f)) };
	}

	static Vec4F32 LoadConvertS16(const int16_t *src) { // Note: will load 8 bytes
		__m128i value = _mm_loadl_epi64((const __m128i *)src);
		// 16-bit to 32-bit: unpack against itself, then arithmetic shift right to sign extend.
		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value, value), 16)) };
	}

	static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes
		__m128i value = _mm_loadl_epi64((const __m128i *)src);
		__m128i value16 = _mm_unpacklo_epi8(value, value);
		// 16-bit to 32-bit: unpack against itself, then arithmetic shift right to sign extend.
		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 24)) };
	}

	// NOTE: Does not normalize - the values stay in the 0..255 range.
	static Vec4F32 LoadConvertU8(const uint8_t *src) { // Note: will load 8 bytes
		__m128i value = _mm_loadl_epi64((const __m128i *)src);
		__m128i zero = _mm_setzero_si128();
		__m128i value16 = _mm_unpacklo_epi8(value, zero);
		// 8-bit to 32-bit: zero extend by unpacking against zero twice.
		return Vec4F32{ _mm_cvtepi32_ps(_mm_unpacklo_epi16(value16, zero)) };
	}

	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };

		__m128 value = _mm_castsi128_ps(_mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8));
		return Vec4F32{ _mm_or_ps(_mm_and_ps(value, _mm_load_ps((const float *)mask)), _mm_load_ps(onelane3)) };
	}

	void Store(float *dst) { _mm_storeu_ps(dst, v); }
	void Store2(float *dst) { _mm_storel_epi64((__m128i *)dst, _mm_castps_si128(v)); }
	void StoreAligned(float *dst) { _mm_store_ps(dst, v); }
	void Store3(float *dst) {
		// This seems to be the best way with SSE2.
		_mm_storel_pd((double *)dst, _mm_castps_pd(v));
		_mm_store_ss(dst + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
	}
	void StoreConvertToU8(uint8_t *dst) {
		__m128i zero = _mm_setzero_si128();
		__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvttps_epi32(v), zero), zero);
		int32_t lo = _mm_cvtsi128_si32(ivalue);
		memcpy(dst, &lo, 4);
	}
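
	// Added note on the sequence above: _mm_cvttps_epi32 truncates float -> int32,
	// _mm_packs_epi32 saturates int32 -> int16, and _mm_packus_epi16 saturates
	// int16 -> uint8, so the values are clamped to 0..255 before the 4 bytes are written.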

	static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }

	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; }
	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; }
	Vec4F32 Min(Vec4F32 other) const { return Vec4F32{ _mm_min_ps(v, other.v) }; }
	Vec4F32 Max(Vec4F32 other) const { return Vec4F32{ _mm_max_ps(v, other.v) }; }
	void operator +=(Vec4F32 other) { v = _mm_add_ps(v, other.v); }
	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
	void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); }
	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
	void operator *=(float f) { v = _mm_mul_ps(v, _mm_set1_ps(f)); }
	// NOTE: May be slow.
	float operator[](size_t index) const { return ((float *)&v)[index]; }

	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
	Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; }
	Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; }

	Vec4F32 Clamp(float lower, float higher) const {
		return Vec4F32{
			_mm_min_ps(_mm_max_ps(v, _mm_set1_ps(lower)), _mm_set1_ps(higher))
		};
	}

	Vec4F32 WithLane3Zero() const {
		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
		return Vec4F32{ _mm_and_ps(v, _mm_load_ps((const float *)mask)) };
	}

	Vec4F32 WithLane3One() const {
		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
		return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
	}

	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
		return Vec4F32{ _mm_add_ps(
			_mm_add_ps(
				_mm_mul_ps(m.col0, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0))),
				_mm_mul_ps(m.col1, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)))
			),
			_mm_add_ps(
				_mm_mul_ps(m.col2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))),
				m.col3)
			)
		};
	}
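
	// Added note: treating this vector as (x, y, z, _), the result above is
	//   x * col0 + y * col1 + z * col2 + col3
	// i.e. a position transform with an implied w = 1.0f.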

	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
	}

	// This is here because ARM64 can do this very efficiently.
	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
		col0.v = _mm_loadu_ps(src);
		col1.v = _mm_loadu_ps(src + 4);
		col2.v = _mm_loadu_ps(src + 8);
		col3.v = _mm_loadu_ps(src + 12);
		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
	}

	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; }
	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; }
	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; }

	template<int i> float GetLane() const {
		return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)));
	}
};

inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; }

inline bool AnyZeroSignBit(Vec4F32 value) {
	return _mm_movemask_ps(value.v) != 0xF;
}

// Make sure the W component of scale is 1.0f.
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
	m.col0 = _mm_mul_ps(m.col0, scale.v);
	m.col1 = _mm_mul_ps(m.col1, scale.v);
	m.col2 = _mm_mul_ps(m.col2, scale.v);
	m.col3 = _mm_mul_ps(m.col3, scale.v);
}

inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) {
	m.col0 = _mm_add_ps(_mm_mul_ps(m.col0, scale.v), _mm_mul_ps(translate.v, _mm_shuffle_ps(m.col0, m.col0, _MM_SHUFFLE(3, 3, 3, 3))));
	m.col1 = _mm_add_ps(_mm_mul_ps(m.col1, scale.v), _mm_mul_ps(translate.v, _mm_shuffle_ps(m.col1, m.col1, _MM_SHUFFLE(3, 3, 3, 3))));
	m.col2 = _mm_add_ps(_mm_mul_ps(m.col2, scale.v), _mm_mul_ps(translate.v, _mm_shuffle_ps(m.col2, m.col2, _MM_SHUFFLE(3, 3, 3, 3))));
	m.col3 = _mm_add_ps(_mm_mul_ps(m.col3, scale.v), _mm_mul_ps(translate.v, _mm_shuffle_ps(m.col3, m.col3, _MM_SHUFFLE(3, 3, 3, 3))));
}
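
// Added note: each column above is transformed as col' = col * scale + translate * col.w.
// As the comment on the NEON version below spells out, scale.w should be 1.0f and
// translate.w should be 0.0f so that the w row of the matrix is left intact.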

struct Vec4U16 {
	__m128i v;  // we only use the lower 64 bits.

	static Vec4U16 Zero() { return Vec4U16{ _mm_setzero_si128() }; }
	// static Vec4U16 AllOnes() { return Vec4U16{ _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()) }; }

	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; }
	void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); }

	// NOTE: 16-bit signed saturation! Will work for a lot of things, but not all.
	static Vec4U16 FromVec4S32(Vec4S32 v) {
		return Vec4U16{ _mm_packu_epi32_SSE2(v.v) };
	}
	static Vec4U16 FromVec4F32(Vec4F32 v) {
		return Vec4U16{ _mm_packu_epi32_SSE2(_mm_cvtps_epi32(v.v)) };
	}

	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ _mm_or_si128(v, other.v) }; }
	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ _mm_and_si128(v, other.v) }; }
	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ _mm_xor_si128(v, other.v) }; }

	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; }
	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }

	inline Vec4U16 AndNot(Vec4U16 inverted) {
		return Vec4U16{
			_mm_andnot_si128(inverted.v, v)  // NOTE: _mm_andnot_si128 inverts its first parameter, then performs the AND.
		};
	}
};

struct Vec8U16 {
	__m128i v;

	static Vec8U16 Zero() { return Vec8U16{ _mm_setzero_si128() }; }
	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ _mm_set1_epi16((int16_t)value) }; }

	static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ _mm_loadu_si128((__m128i *)mem) }; }
	void Store(uint16_t *mem) { _mm_storeu_si128((__m128i *)mem, v); }
};

inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
	__m128i temp = _mm_srai_epi32(v.v, 31);
	return Vec4U16 {
		_mm_packs_epi32(temp, temp)
	};
}

#elif PPSSPP_ARCH(ARM_NEON) && !TEST_FALLBACK

struct Mat4F32 {
	Mat4F32() {}
	Mat4F32(const float *matrix) {
		col0 = vld1q_f32(matrix);
		col1 = vld1q_f32(matrix + 4);
		col2 = vld1q_f32(matrix + 8);
		col3 = vld1q_f32(matrix + 12);
	}
	void Store(float *m) {
		vst1q_f32(m, col0);
		vst1q_f32(m + 4, col1);
		vst1q_f32(m + 8, col2);
		vst1q_f32(m + 12, col3);
	}

	// NOTE: Unlike the SSE2 version, this still reads 4 floats starting at m + 9,
	// i.e. one float past the end of a 4x3 matrix (see the TODO below).
	static Mat4F32 Load4x3(const float *m) {
		Mat4F32 result;
		result.col0 = vsetq_lane_f32(0.0f, vld1q_f32(m), 3);
		result.col1 = vsetq_lane_f32(0.0f, vld1q_f32(m + 3), 3);
		result.col2 = vsetq_lane_f32(0.0f, vld1q_f32(m + 6), 3);
		result.col3 = vsetq_lane_f32(1.0f, vld1q_f32(m + 9), 3);  // TODO: Fix this out of bounds read
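		// A bounds-safe alternative (untested sketch, not the shipped code): load m[9..10]
		// as a pair, then m[11] into the low lane of a vector whose high lane is 1.0f:
		//   float32x2_t lo = vld1_f32(m + 9);
		//   float32x2_t hi = vld1_lane_f32(m + 11, vdup_n_f32(1.0f), 0);
		//   result.col3 = vcombine_f32(lo, hi);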
		return result;
	}

	float32x4_t col0;
	float32x4_t col1;
	float32x4_t col2;
	float32x4_t col3;
};

// The columns are spread out between the data*. This is just intermediate storage for multiplication.
struct Mat4x3F32 {
	Mat4x3F32(const float *matrix) {
		data0 = vld1q_f32(matrix);
		data1 = vld1q_f32(matrix + 4);
		data2 = vld1q_f32(matrix + 8);
	}

	float32x4_t data0;
	float32x4_t data1;
	float32x4_t data2;
};

inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
	Mat4F32 result;

	float32x4_t r_col = vmulq_laneq_f32(b.col0, a.col0, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col0, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col0, 2);
	result.col0 = vfmaq_laneq_f32(r_col, b.col3, a.col0, 3);

	r_col = vmulq_laneq_f32(b.col0, a.col1, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col1, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col1, 2);
	result.col1 = vfmaq_laneq_f32(r_col, b.col3, a.col1, 3);

	r_col = vmulq_laneq_f32(b.col0, a.col2, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col2, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col2, 2);
	result.col2 = vfmaq_laneq_f32(r_col, b.col3, a.col2, 3);

	r_col = vmulq_laneq_f32(b.col0, a.col3, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col3, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col3, 2);
	result.col3 = vfmaq_laneq_f32(r_col, b.col3, a.col3, 3);

	return result;
}

inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
	Mat4F32 result;

	float32x4_t r_col = vmulq_laneq_f32(b.col0, a.data0, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data0, 1);
	result.col0 = vfmaq_laneq_f32(r_col, b.col2, a.data0, 2);

	r_col = vmulq_laneq_f32(b.col0, a.data0, 3);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 0);
	result.col1 = vfmaq_laneq_f32(r_col, b.col2, a.data1, 1);

	r_col = vmulq_laneq_f32(b.col0, a.data1, 2);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 3);
	result.col2 = vfmaq_laneq_f32(r_col, b.col2, a.data2, 0);

	r_col = vmulq_laneq_f32(b.col0, a.data2, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data2, 2);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.data2, 3);

	// The last entry has an implied 1.0f.
	result.col3 = vaddq_f32(r_col, b.col3);
	return result;
}

struct Vec4S32 {
	int32x4_t v;

	static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; }
	static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; }

	static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; }
	static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; }
	void Store(int *dst) { vst1q_s32(dst, v); }
	void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); }
	void StoreAligned(int *dst) { vst1q_s32(dst, v); }

	// Warning: Unlike on x86, this is a full 32-bit multiplication.
	Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }

	Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; }
	Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; }

	// NOTE: May be slow.
	int operator[](size_t index) const { return ((int *)&v)[index]; }

	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v)) }; }
	Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
	void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); }

	template<int imm>
	Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }

	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }

	Vec4S32 CompareEq(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_s32(v, other.v)) }; }
	Vec4S32 CompareLt(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_s32(v, other.v)) }; }
	Vec4S32 CompareGt(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_s32(v, other.v)) }; }
	Vec4S32 CompareGtZero() const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_s32(v, vdupq_n_s32(0))) }; }
};

struct Vec4F32 {
	float32x4_t v;

	static Vec4F32 Zero() { return Vec4F32{ vdupq_n_f32(0.0f) }; }
	static Vec4F32 Splat(float lane) { return Vec4F32{ vdupq_n_f32(lane) }; }

	static Vec4F32 Load(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
	static Vec4F32 LoadS8Norm(const int8_t *src) {
		const int8x8_t value = (int8x8_t)vdup_n_u32(*((uint32_t *)src));
		const int16x8_t value16 = vmovl_s8(value);
		return Vec4F32 { vcvtq_n_f32_s32(vmovl_s16(vget_low_s16(value16)), 7) };
	}
	static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f
		return Vec4F32 { vcvtq_n_f32_s32(vmovl_s16(vld1_s16(src)), 15) };
	}
	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; }

	static Vec4F32 LoadConvertS16(const int16_t *src) {
		int16x4_t value = vld1_s16(src);
		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) };
	}

	static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes, not 4. Only the first 4 bytes will be used.
		int8x8_t value = vld1_s8(src);
		int16x4_t value16 = vget_low_s16(vmovl_s8(value));
		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value16)) };
	}

	static Vec4F32 LoadConvertU8(const uint8_t *src) { // Note: will load 8 bytes, not 4. Only the first 4 bytes will be used.
		uint8x8_t value = vld1_u8(src);
		uint16x4_t value16 = vget_low_u16(vmovl_u8(value));
		return Vec4F32{ vcvtq_f32_u32(vmovl_u16(value16)) };
	}

	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
		return Vec4F32{ vsetq_lane_f32(1.0f, vreinterpretq_f32_u32(vshlq_n_u32(vld1q_u32(src), 8)), 3) };
	}

	static Vec4F32 FromVec4S32(Vec4S32 other) {
		return Vec4F32{ vcvtq_f32_s32(other.v) };
	}

	void Store(float *dst) { vst1q_f32(dst, v); }
	void Store2(float *dst) { vst1_f32(dst, vget_low_f32(v)); }
	void StoreAligned(float *dst) { vst1q_f32(dst, v); }
	void Store3(float *dst) {
		// TODO: There might be better ways. Try to avoid this when possible.
		vst1_f32(dst, vget_low_f32(v));
#if PPSSPP_ARCH(ARM64_NEON)
		vst1q_lane_f32(dst + 2, v, 2);
#else
		dst[2] = vgetq_lane_f32(v, 2);
#endif
	}
	void StoreConvertToU8(uint8_t *dest) {
		uint32x4_t ivalue32 = vcvtq_u32_f32(v);
		uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
		uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
		uint32_t value = vget_lane_u32(vreinterpret_u32_u8(ivalue8), 0);
		memcpy(dest, &value, sizeof(uint32_t));
	}

	// NOTE: May be slow.
	float operator[](size_t index) const { return ((float *)&v)[index]; }

	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
	Vec4F32 Min(Vec4F32 other) const { return Vec4F32{ vminq_f32(v, other.v) }; }
	Vec4F32 Max(Vec4F32 other) const { return Vec4F32{ vmaxq_f32(v, other.v) }; }
	void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); }
	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
#if PPSSPP_ARCH(ARM64_NEON)
	void operator /=(Vec4F32 other) { v = vdivq_f32(v, other.v); }
#else
	// ARM32 doesn't have vdivq.
	void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
#endif
	void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); }
	Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
	void operator *=(float f) { v = vmulq_f32(v, vdupq_n_f32(f)); }

	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

	Vec4F32 Recip() const {
		float32x4_t recip = vrecpeq_f32(v);
		// Use a couple Newton-Raphson steps to refine the estimate.
		// To save one iteration at the expense of accuracy, use RecipApprox().
		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
		return Vec4F32{ recip };
	}
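
	// Added note: vrecpsq_f32(v, x) computes (2.0f - v * x), so each line above is one
	// Newton-Raphson step x' = x * (2 - v * x) for approximating 1/v.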

	Vec4F32 RecipApprox() const {
		float32x4_t recip = vrecpeq_f32(v);
		// To approximately match the precision of x86-64's rcpps, do a single iteration.
		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
		return Vec4F32{ recip };
	}

	Vec4F32 Clamp(float lower, float higher) const {
		return Vec4F32{
			vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
		};
	}

	Vec4F32 WithLane3Zero() const {
		return Vec4F32{ vsetq_lane_f32(0.0f, v, 3) };
	}

	Vec4F32 WithLane3One() const {
		return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
	}

	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
	Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; }
	Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; }

	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
#if PPSSPP_ARCH(ARM64_NEON)
		// Only works on ARM64
		float32x4_t temp0 = vzip1q_f32(col0.v, col2.v);
		float32x4_t temp1 = vzip2q_f32(col0.v, col2.v);
		float32x4_t temp2 = vzip1q_f32(col1.v, col3.v);
		float32x4_t temp3 = vzip2q_f32(col1.v, col3.v);
		col0.v = vzip1q_f32(temp0, temp2);
		col1.v = vzip2q_f32(temp0, temp2);
		col2.v = vzip1q_f32(temp1, temp3);
		col3.v = vzip2q_f32(temp1, temp3);
#else
		float32x4x2_t col01 = vtrnq_f32(col0.v, col1.v);
		float32x4x2_t col23 = vtrnq_f32(col2.v, col3.v);
		col0.v = vcombine_f32(vget_low_f32(col01.val[0]), vget_low_f32(col23.val[0]));
		col1.v = vcombine_f32(vget_low_f32(col01.val[1]), vget_low_f32(col23.val[1]));
		col2.v = vcombine_f32(vget_high_f32(col01.val[0]), vget_high_f32(col23.val[0]));
		col3.v = vcombine_f32(vget_high_f32(col01.val[1]), vget_high_f32(col23.val[1]));
#endif
	}

	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
		// The optimizer hopefully gets rid of the copies below.
		float32x4x4_t r = vld4q_f32(src);
		col0.v = r.val[0];
		col1.v = r.val[1];
		col2.v = r.val[2];
		col3.v = r.val[3];
	}

	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
#if PPSSPP_ARCH(ARM64_NEON)
		float32x4_t sum = vaddq_f32(
			vaddq_f32(vmulq_laneq_f32(m.col0, v, 0), vmulq_laneq_f32(m.col1, v, 1)),
			vaddq_f32(vmulq_laneq_f32(m.col2, v, 2), m.col3));
#else
		float32x4_t sum = vaddq_f32(
			vaddq_f32(vmulq_lane_f32(m.col0, vget_low_f32(v), 0), vmulq_lane_f32(m.col1, vget_low_f32(v), 1)),
			vaddq_f32(vmulq_lane_f32(m.col2, vget_high_f32(v), 0), m.col3));
#endif
		return Vec4F32{ sum };
	}

	template<int i> float GetLane() const {
		return vgetq_lane_f32(v, i);
	}
};

inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; }
inline Vec4F32 Vec4F32FromS32(Vec4S32 s) { return Vec4F32{ vcvtq_f32_s32(s.v) }; }

// Make sure the W component of scale is 1.0f.
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
	m.col0 = vmulq_f32(m.col0, scale.v);
	m.col1 = vmulq_f32(m.col1, scale.v);
	m.col2 = vmulq_f32(m.col2, scale.v);
	m.col3 = vmulq_f32(m.col3, scale.v);
}

// Make sure the W component of scale is 1.0f, and the W component of translate should be 0.
inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) {
	m.col0 = vaddq_f32(vmulq_f32(m.col0, scale.v), vmulq_laneq_f32(translate.v, m.col0, 3));
	m.col1 = vaddq_f32(vmulq_f32(m.col1, scale.v), vmulq_laneq_f32(translate.v, m.col1, 3));
	m.col2 = vaddq_f32(vmulq_f32(m.col2, scale.v), vmulq_laneq_f32(translate.v, m.col2, 3));
	m.col3 = vaddq_f32(vmulq_f32(m.col3, scale.v), vmulq_laneq_f32(translate.v, m.col3, 3));
}

inline bool AnyZeroSignBit(Vec4S32 value) {
#if PPSSPP_ARCH(ARM64_NEON)
	// Shortcut on arm64
	return vmaxvq_s32(value.v) >= 0;
#else
	// Very suboptimal, let's optimize later.
	int32x2_t prod = vand_s32(vget_low_s32(value.v), vget_high_s32(value.v));
	int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);
	return (mask & 0x80000000) == 0;
#endif
}
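
// Added note: a lane's sign bit is clear exactly when the lane is >= 0, so "any zero
// sign bit" is the same as "the maximum lane is >= 0", which is what the vmaxvq_s32
// shortcut checks. The ARM32 path ANDs all four lanes and tests the resulting sign bit.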

inline bool AnyZeroSignBit(Vec4F32 value) {
	int32x4_t ival = vreinterpretq_s32_f32(value.v);
#if PPSSPP_ARCH(ARM64_NEON)
	// Shortcut on arm64
	return vmaxvq_s32(ival) >= 0;
#else
	int32x2_t prod = vand_s32(vget_low_s32(ival), vget_high_s32(ival));
	int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);
	return (mask & 0x80000000) == 0;
#endif
}

struct Vec4U16 {
	uint16x4_t v;  // 64 bits.

	static Vec4U16 Zero() { return Vec4U16{ vdup_n_u16(0) }; }
	static Vec4U16 Splat(uint16_t value) { return Vec4U16{ vdup_n_u16(value) }; }

	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; }
	void Store(uint16_t *mem) { vst1_u16(mem, v); }

	static Vec4U16 FromVec4S32(Vec4S32 v) {
		return Vec4U16{ vmovn_u32(vreinterpretq_u32_s32(v.v)) };
	}
	static Vec4U16 FromVec4F32(Vec4F32 v) {
		return Vec4U16{ vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(v.v))) };
	}

	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ vorr_u16(v, other.v) }; }
	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ vand_u16(v, other.v) }; }
	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ veor_u16(v, other.v) }; }

	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }

	Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; }
};

inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
	int32x4_t sign_mask = vshrq_n_s32(v.v, 31);
	uint16x4_t result = vreinterpret_u16_s16(vmovn_s32(sign_mask));
	return Vec4U16{ result };
}

struct Vec8U16 {
	uint16x8_t v;

	static Vec8U16 Zero() { return Vec8U16{ vdupq_n_u16(0) }; }
	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ vdupq_n_u16(value) }; }

	static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ vld1q_u16(mem) }; }
	void Store(uint16_t *mem) { vst1q_u16(mem, v); }
};

#else

#define CROSSSIMD_SLOW 1

// Fake SIMD by using scalar.

struct Mat4F32 {
	Mat4F32() {}
	Mat4F32(const float *src) {
		memcpy(m, src, sizeof(m));
	}
	void Store(float *dest) {
		memcpy(dest, m, sizeof(m));
	}
	static Mat4F32 Load4x3(const float *src) {
		Mat4F32 mat;
		mat.m[0] = src[0];
		mat.m[1] = src[1];
		mat.m[2] = src[2];
		mat.m[3] = 0.0f;
		mat.m[4] = src[3];
		mat.m[5] = src[4];
		mat.m[6] = src[5];
		mat.m[7] = 0.0f;
		mat.m[8] = src[6];
		mat.m[9] = src[7];
		mat.m[10] = src[8];
		mat.m[11] = 0.0f;
		mat.m[12] = src[9];
		mat.m[13] = src[10];
		mat.m[14] = src[11];
		mat.m[15] = 1.0f;
		return mat;
	}

	// cols are consecutive
	float m[16];
};

// The columns are consecutive but missing the last row (implied 0,0,0,1).
// This is just intermediate storage for multiplication.
struct Mat4x3F32 {
	Mat4x3F32(const float *matrix) {
		memcpy(m, matrix, 12 * sizeof(float));
	}
	float m[12];
};

struct Vec4S32 {
	int32_t v[4];

	static Vec4S32 Zero() { return Vec4S32{}; }
	static Vec4S32 Splat(int lane) { return Vec4S32{ { lane, lane, lane, lane } }; }

	static Vec4S32 Load(const int *src) { return Vec4S32{ { src[0], src[1], src[2], src[3] } }; }
	static Vec4S32 LoadAligned(const int *src) { return Load(src); }
	void Store(int *dst) { memcpy(dst, v, sizeof(v)); }
	void Store2(int *dst) { memcpy(dst, v, sizeof(v[0]) * 2); }
	void StoreAligned(int *dst) { memcpy(dst, v, sizeof(v)); }

	// Warning: Unlike on x86 SSE2, this is a full 32-bit multiplication.
	Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3] } }; }

	Vec4S32 SignExtend16() const {
		Vec4S32 tmp;
		for (int i = 0; i < 4; i++) {
			tmp.v[i] = (int32_t)(int16_t)v[i];
		}
		return tmp;
	}
	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
	Vec4S32 Min16(Vec4S32 other) const {
		Vec4S32 tmp;
		for (int i = 0; i < 4; i++) {
			tmp.v[i] = other.v[i] < v[i] ? other.v[i] : v[i];
		}
		return tmp;
	}
	Vec4S32 Max16(Vec4S32 other) const {
		Vec4S32 tmp;
		for (int i = 0; i < 4; i++) {
			tmp.v[i] = other.v[i] > v[i] ? other.v[i] : v[i];
		}
		return tmp;
	}
	Vec4S32 FixupAfterMinMax() const { return *this; }

	int operator[](size_t index) const { return v[index]; }

	Vec4S32 operator +(Vec4S32 other) const {
		return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } };
	}
	Vec4S32 operator -(Vec4S32 other) const {
		return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } };
	}
	Vec4S32 operator *(Vec4S32 other) const {
		return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } };
	}
	// TODO: Can optimize the bitwise ones with 64-bit operations.
	Vec4S32 operator |(Vec4S32 other) const {
		return Vec4S32{ { v[0] | other.v[0], v[1] | other.v[1], v[2] | other.v[2], v[3] | other.v[3], } };
	}
	Vec4S32 operator &(Vec4S32 other) const {
		return Vec4S32{ { v[0] & other.v[0], v[1] & other.v[1], v[2] & other.v[2], v[3] & other.v[3], } };
	}
	Vec4S32 operator ^(Vec4S32 other) const {
		return Vec4S32{ { v[0] ^ other.v[0], v[1] ^ other.v[1], v[2] ^ other.v[2], v[3] ^ other.v[3], } };
	}
	Vec4S32 AndNot(Vec4S32 other) const {
		return Vec4S32{ { v[0] & ~other.v[0], v[1] & ~other.v[1], v[2] & ~other.v[2], v[3] & ~other.v[3], } };
	}
	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }

	void operator &=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] &= other.v[i]; }
	void operator +=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] += other.v[i]; }
	void operator -=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] -= other.v[i]; }

	template<int imm>
	Vec4S32 Shl() const { return Vec4S32{ { v[0] << imm, v[1] << imm, v[2] << imm, v[3] << imm } }; }

	Vec4S32 CompareEq(Vec4S32 other) const {
		Vec4S32 out;
		for (int i = 0; i < 4; i++) {
			out.v[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0;
		}
		return out;
	}
	Vec4S32 CompareLt(Vec4S32 other) const {
		Vec4S32 out;
		for (int i = 0; i < 4; i++) {
			out.v[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0;
		}
		return out;
	}
	Vec4S32 CompareGt(Vec4S32 other) const {
		Vec4S32 out;
		for (int i = 0; i < 4; i++) {
			out.v[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0;
		}
		return out;
	}
	Vec4S32 CompareGtZero() const {
		Vec4S32 out;
		for (int i = 0; i < 4; i++) {
			out.v[i] = v[i] > 0 ? 0xFFFFFFFF : 0;
		}
		return out;
	}
};

struct Vec4F32 {
	float v[4];

	static Vec4F32 Zero() { return Vec4F32{}; }
	static Vec4F32 Splat(float lane) { return Vec4F32{ { lane, lane, lane, lane } }; }

	static Vec4F32 Load(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; }
	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; }
	static Vec4F32 LoadS8Norm(const int8_t *src) {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i] * (1.0f / 128.0f);
		}
		return temp;
	}
	static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i] * (1.0f / 32768.0f);
		}
		return temp;
	}
	void Store(float *dst) { memcpy(dst, v, sizeof(v)); }
	void Store2(float *dst) { memcpy(dst, v, sizeof(v[0]) * 2); }
	void StoreAligned(float *dst) { memcpy(dst, v, sizeof(v)); }
	void Store3(float *dst) {
		memcpy(dst, v, sizeof(v[0]) * 3);
	}

	static Vec4F32 LoadConvertS16(const int16_t *src) {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i];
		}
		return temp;
	}

	static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes, not 4. Only the first 4 bytes will be used.
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i];
		}
		return temp;
	}

	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
		uint32_t shifted[3] = { src[0] << 8, src[1] << 8, src[2] << 8 };
		Vec4F32 temp;
		memcpy(temp.v, shifted, sizeof(shifted));
		temp.v[3] = 1.0f;  // Lane 3 is 1.0f, matching the SSE2/NEON versions.
		return temp;
	}

	static Vec4F32 FromVec4S32(Vec4S32 src) {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i];
		}
		return temp;
	}

	float operator[](size_t index) const { return v[index]; }

	Vec4F32 operator +(Vec4F32 other) const {
		return Vec4F32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } };
	}
	Vec4F32 operator -(Vec4F32 other) const {
		return Vec4F32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } };
	}
	Vec4F32 operator *(Vec4F32 other) const {
		return Vec4F32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } };
	}
	Vec4F32 Min(Vec4F32 other) const {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] < other.v[i] ? v[i] : other.v[i];
		}
		return temp;
	}
	Vec4F32 Max(Vec4F32 other) const {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] > other.v[i] ? v[i] : other.v[i];
		}
		return temp;
	}
	void operator +=(Vec4F32 other) {
		for (int i = 0; i < 4; i++) {
			v[i] += other.v[i];
		}
	}
	void operator -=(Vec4F32 other) {
		for (int i = 0; i < 4; i++) {
			v[i] -= other.v[i];
		}
	}
	void operator *=(Vec4F32 other) {
		for (int i = 0; i < 4; i++) {
			v[i] *= other.v[i];
		}
	}
	void operator /=(Vec4F32 other) {
		for (int i = 0; i < 4; i++) {
			v[i] /= other.v[i];
		}
	}
	void operator &=(Vec4S32 other) {
		// TODO: This can be done simpler, although with some ugly casts.
		for (int i = 0; i < 4; i++) {
			uint32_t val;
			memcpy(&val, &v[i], 4);
			val &= other.v[i];
			memcpy(&v[i], &val, 4);
		}
	}
	Vec4F32 operator *(float f) const {
		return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } };
	}

	Vec4F32 Mul(float f) const {
		return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } };
	}

	Vec4F32 Recip() const {
		return Vec4F32{ { 1.0f / v[0], 1.0f / v[1], 1.0f / v[2], 1.0f / v[3] } };
	}

	Vec4F32 RecipApprox() const {
		return Vec4F32{ { 1.0f / v[0], 1.0f / v[1], 1.0f / v[2], 1.0f / v[3] } };
	}

	Vec4F32 Clamp(float lower, float higher) const {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			if (v[i] > higher) {
				temp.v[i] = higher;
			} else if (v[i] < lower) {
				temp.v[i] = lower;
			} else {
				temp.v[i] = v[i];
			}
		}
		return temp;
	}

	Vec4F32 WithLane3Zero() const {
		return Vec4F32{ { v[0], v[1], v[2], 0.0f } };
	}

	Vec4F32 WithLane3One() const {
		return Vec4F32{ { v[0], v[1], v[2], 1.0f } };
	}

	Vec4S32 CompareEq(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}
	Vec4S32 CompareLt(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}
	Vec4S32 CompareGt(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}
	Vec4S32 CompareLe(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] <= other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}
	Vec4S32 CompareGe(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] >= other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}

	// In-place transpose.
	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
		float m[16];
		for (int i = 0; i < 4; i++) {
			m[0 + i] = col0.v[i];
			m[4 + i] = col1.v[i];
			m[8 + i] = col2.v[i];
			m[12 + i] = col3.v[i];
		}
		for (int i = 0; i < 4; i++) {
			col0.v[i] = m[i * 4 + 0];
			col1.v[i] = m[i * 4 + 1];
			col2.v[i] = m[i * 4 + 2];
			col3.v[i] = m[i * 4 + 3];
		}
	}

	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
		float x = m.m[0] * v[0] + m.m[4] * v[1] + m.m[8] * v[2] + m.m[12];
		float y = m.m[1] * v[0] + m.m[5] * v[1] + m.m[9] * v[2] + m.m[13];
		float z = m.m[2] * v[0] + m.m[6] * v[1] + m.m[10] * v[2] + m.m[14];

		return Vec4F32{ { x, y, z, 1.0f } };
	}

	template<int i> float GetLane() const {
		return v[i];
	}
};

inline bool AnyZeroSignBit(Vec4S32 value) {
	for (int i = 0; i < 4; i++) {
		if (value.v[i] >= 0) {
			return true;
		}
	}
	return false;
}

inline bool AnyZeroSignBit(Vec4F32 value) {
	for (int i = 0; i < 4; i++) {
		if (value.v[i] >= 0.0f) {
			return true;
		}
	}
	return false;
}
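
// Added note: unlike the SSE2/NEON versions, which test the raw sign bit, the float
// overload above compares with ">= 0.0f", so -0.0f (sign bit set) still counts here.
// The difference rarely matters, but the result is not bit-exact with the SIMD paths.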

struct Vec4U16 {
	uint16_t v[4];  // 64 bits.

	static Vec4U16 Zero() { return Vec4U16{}; }
	static Vec4U16 Splat(uint16_t lane) { return Vec4U16{ { lane, lane, lane, lane } }; }

	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ { mem[0], mem[1], mem[2], mem[3] } }; }
	void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); }

	static Vec4U16 FromVec4S32(Vec4S32 v) {
		return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] } };
	}
	static Vec4U16 FromVec4F32(Vec4F32 v) {
		return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] } };
	}

	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] | other.v[0]), (uint16_t)(v[1] | other.v[1]), (uint16_t)(v[2] | other.v[2]), (uint16_t)(v[3] | other.v[3]), } }; }
	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] & other.v[0]), (uint16_t)(v[1] & other.v[1]), (uint16_t)(v[2] & other.v[2]), (uint16_t)(v[3] & other.v[3]), } }; }
	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] ^ other.v[0]), (uint16_t)(v[1] ^ other.v[1]), (uint16_t)(v[2] ^ other.v[2]), (uint16_t)(v[3] ^ other.v[3]), } }; }

	Vec4U16 Max(Vec4U16 other) const {
		Vec4U16 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] > other.v[i] ? v[i] : other.v[i];
		}
		return temp;
	}
	Vec4U16 Min(Vec4U16 other) const {
		Vec4U16 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] < other.v[i] ? v[i] : other.v[i];
		}
		return temp;
	}
	Vec4U16 CompareLT(Vec4U16 other) const {
		Vec4U16 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] < other.v[i] ? 0xFFFF : 0;
		}
		return temp;
	}
	Vec4U16 AndNot(Vec4U16 other) const {
		Vec4U16 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] & ~other.v[i];
		}
		return temp;
	}
};

struct Vec8U16 {
	uint16_t v[8];

	static Vec8U16 Zero() { return Vec8U16{}; }
	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ {
		value, value, value, value, value, value, value, value,
	} }; }

	static Vec8U16 Load(const uint16_t *mem) { Vec8U16 tmp; memcpy(tmp.v, mem, sizeof(v)); return tmp; }
	void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); }
};

inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
	return Vec4U16{ { (uint16_t)(v.v[0] >> 31), (uint16_t)(v.v[1] >> 31), (uint16_t)(v.v[2] >> 31), (uint16_t)(v.v[3] >> 31), } };
}

inline Vec4S32 Vec4S32FromF32(Vec4F32 f) {
	return Vec4S32{ { (int32_t)f.v[0], (int32_t)f.v[1], (int32_t)f.v[2], (int32_t)f.v[3] } };
}

inline Vec4F32 Vec4F32FromS32(Vec4S32 f) {
	return Vec4F32{ { (float)f.v[0], (float)f.v[1], (float)f.v[2], (float)f.v[3] } };
}

// Make sure the W component of scale is 1.0f.
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
	for (int i = 0; i < 4; i++) {
		m.m[i * 4 + 0] *= scale.v[0];
		m.m[i * 4 + 1] *= scale.v[1];
		m.m[i * 4 + 2] *= scale.v[2];
		m.m[i * 4 + 3] *= scale.v[3];
	}
}

inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) {
	for (int i = 0; i < 4; i++) {
		m.m[i * 4 + 0] = m.m[i * 4 + 0] * scale.v[0] + translate.v[0] * m.m[i * 4 + 3];
		m.m[i * 4 + 1] = m.m[i * 4 + 1] * scale.v[1] + translate.v[1] * m.m[i * 4 + 3];
		m.m[i * 4 + 2] = m.m[i * 4 + 2] * scale.v[2] + translate.v[2] * m.m[i * 4 + 3];
		m.m[i * 4 + 3] = m.m[i * 4 + 3] * scale.v[3] + translate.v[3] * m.m[i * 4 + 3];
	}
}

inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
	Mat4F32 result;
	for (int j = 0; j < 4; j++) {
		for (int i = 0; i < 4; i++) {
			float sum = 0.0f;
			for (int k = 0; k < 4; k++) {
				sum += b.m[k * 4 + i] * a.m[j * 4 + k];
			}
			result.m[j * 4 + i] = sum;
		}
	}
	return result;
}

inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
	Mat4F32 result;

	for (int j = 0; j < 4; j++) {
		for (int i = 0; i < 4; i++) {
			float sum = 0.0f;
			for (int k = 0; k < 3; k++) {
				sum += b.m[k * 4 + i] * a.m[j * 3 + k];
			}
			if (j == 3) {
				sum += b.m[12 + i];
			}
			result.m[j * 4 + i] = sum;
		}
	}
	return result;
}
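
// Added note: the "if (j == 3)" term accounts for the implied fourth row (0, 0, 0, 1)
// of the 4x3 matrix - the final column of the product picks up b's last column
// unscaled, matching the SIMD implementations above.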

#endif